Commit a792a27c authored by Andrew Morton, committed by Jens Axboe

[PATCH] faster copy_*_user for bad alignments on intel ia32

This patch speeds up copy_*_user for some Intel ia32 processors.  It is
based on work by Mala Anand.

It is a good win: around 30% for all src/dest alignments except 32/32.

In this test a fully-cached one gigabyte file was read into an
8192-byte userspace buffer using read(fd, buf, 8192).  The alignment of
the user-side buffer was altered between runs.  This is a PIII.  Times
are in seconds.

User buffer	2.5.41		2.5.41+patch

0x804c000	4.373		4.343
0x804c001	10.024		6.401
0x804c002	10.002		6.347
0x804c003	10.013		6.328
0x804c004	10.105		6.273
0x804c005	10.184		6.323
0x804c006	10.179		6.322
0x804c007	10.185		6.319
0x804c008	9.725		6.347
0x804c009	9.780		6.275
0x804c00a	9.779		6.355
0x804c00b	9.778		6.350
0x804c00c	9.723		6.351
0x804c00d	9.790		6.307
0x804c00e	9.790		6.289
0x804c00f	9.785		6.294
0x804c010	9.727		6.277
0x804c011	9.779		6.251
0x804c012	9.783		6.246
0x804c013	9.786		6.245
0x804c014	9.772		6.063
0x804c015	9.919		6.237
0x804c016	9.920		6.234
0x804c017	9.918		6.237
0x804c018	9.846		6.372
0x804c019	10.060		6.294
0x804c01a	10.049		6.328
0x804c01b	10.041		6.337
0x804c01c	9.931		6.347
0x804c01d	10.013		6.273
0x804c01e	10.020		6.346
0x804c01f	10.016		6.356
0x804c020	4.442		4.366

So `rep;movsl' is slower at all non-cache-aligned offsets.

The PII uses the PIII alignment.  I don't have a PII any more, but I do
recall that it demonstrated the same behaviour as the PIII.

The patch contains an enhancement (based on careful testing) from
Hirokazu Takahashi <taka@valinux.co.jp>: in cases where source and dest
have the same alignment, but that alignment is poor, we do a short copy
of a few bytes to bring the two pointers onto a favourable boundary and
then do the big copy.

It also contains a bugfix from Hirokazu Takahashi.

As an added bonus, this patch shrinks the kernel image by 28 kbytes:
22k of that is in .text and the rest in __ex_table.  I'm not really
sure why .text shrank so much.

These copy routines have no special case for constant-sized copies, so
a lot of uaccess.h becomes dead code with this patch.  The next patch,
which uninlines the copy_*_user functions, cleans all that up and saves
an additional 5k.
parent 43c8cc21
@@ -6,6 +6,7 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
#include "cpu.h"
@@ -13,6 +14,11 @@ static int disable_x86_serial_nr __initdata = 1;
static int disable_P4_HT __initdata = 0;
extern int trap_init_f00f_bug(void);
#ifdef INTEL_MOVSL
struct movsl_mask movsl_mask; /* alignment at which movsl is preferred for
bulk memory copies */
#endif
/*
* Early probe support logic for ppro memory erratum #50
*
@@ -348,6 +354,25 @@ static void __init init_intel(struct cpuinfo_x86 *c)
/* Work around errata */
Intel_errata_workarounds(c);
#ifdef INTEL_MOVSL
/*
* Set up the preferred alignment for movsl bulk memory moves
*/
switch (c->x86) {
case 4: /* 486: untested */
break;
case 5: /* Old Pentia: untested */
break;
case 6: /* PII/PIII only like movsl with 8-byte alignment */
movsl_mask.mask = 7;
break;
case 15: /* P4 is OK down to 8-byte alignment */
movsl_mask.mask = 7;
break;
}
#endif
}
@@ -119,6 +119,11 @@ EXPORT_SYMBOL(__clear_user);
EXPORT_SYMBOL(__generic_copy_from_user);
EXPORT_SYMBOL(__generic_copy_to_user);
EXPORT_SYMBOL(strnlen_user);
#ifdef INTEL_MOVSL
EXPORT_SYMBOL(movsl_mask);
EXPORT_SYMBOL(__copy_user_int);
EXPORT_SYMBOL(__copy_user_zeroing_int);
#endif
EXPORT_SYMBOL(pci_alloc_consistent);
EXPORT_SYMBOL(pci_free_consistent);
@@ -45,8 +45,12 @@ unsigned long
__generic_copy_to_user(void *to, const void *from, unsigned long n)
{
prefetch(from);
-if (access_ok(VERIFY_WRITE, to, n))
-__copy_user(to,from,n);
+if (access_ok(VERIFY_WRITE, to, n)) {
+if (movsl_is_ok(to, from, n))
+__copy_user(to, from, n);
+else
+n = __copy_user_int(to, from, n);
+}
return n;
}
@@ -54,10 +58,14 @@ unsigned long
__generic_copy_from_user(void *to, const void *from, unsigned long n)
{
prefetchw(to);
-if (access_ok(VERIFY_READ, from, n))
-__copy_user_zeroing(to,from,n);
-else
+if (access_ok(VERIFY_READ, from, n)) {
+if (movsl_is_ok(to, from, n))
+__copy_user_zeroing(to,from,n);
+else
+n = __copy_user_zeroing_int(to, from, n);
+} else {
memset(to, 0, n);
+}
return n;
}
@@ -188,3 +196,191 @@ long strnlen_user(const char *s, long n)
:"cc");
return res & mask;
}
#ifdef INTEL_MOVSL
/*
* Copy To/From Userspace
*/
/* Generic arbitrary sized copy. */
unsigned long __copy_user_int(void *to, const void *from,unsigned long size)
{
int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
" cmpl $67, %0\n"
" jbe 1f\n"
" movl 64(%4), %%eax\n"
" .align 2,0x90\n"
"1: movl 0(%4), %%eax\n"
" movl 4(%4), %%edx\n"
"2: movl %%eax, 0(%3)\n"
"21: movl %%edx, 4(%3)\n"
" movl 8(%4), %%eax\n"
" movl 12(%4),%%edx\n"
"3: movl %%eax, 8(%3)\n"
"31: movl %%edx, 12(%3)\n"
" movl 16(%4), %%eax\n"
" movl 20(%4), %%edx\n"
"4: movl %%eax, 16(%3)\n"
"41: movl %%edx, 20(%3)\n"
" movl 24(%4), %%eax\n"
" movl 28(%4), %%edx\n"
"10: movl %%eax, 24(%3)\n"
"51: movl %%edx, 28(%3)\n"
" movl 32(%4), %%eax\n"
" movl 36(%4), %%edx\n"
"11: movl %%eax, 32(%3)\n"
"61: movl %%edx, 36(%3)\n"
" movl 40(%4), %%eax\n"
" movl 44(%4), %%edx\n"
"12: movl %%eax, 40(%3)\n"
"71: movl %%edx, 44(%3)\n"
" movl 48(%4), %%eax\n"
" movl 52(%4), %%edx\n"
"13: movl %%eax, 48(%3)\n"
"81: movl %%edx, 52(%3)\n"
" movl 56(%4), %%eax\n"
" movl 60(%4), %%edx\n"
"14: movl %%eax, 56(%3)\n"
"91: movl %%edx, 60(%3)\n"
" addl $-64, %0\n"
" addl $64, %4\n"
" addl $64, %3\n"
" cmpl $63, %0\n"
" ja 0b\n"
"5: movl %0, %%eax\n"
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
"6: rep; movsl\n"
" movl %%eax, %0\n"
"7: rep; movsb\n"
"8:\n"
".section .fixup,\"ax\"\n"
"9: lea 0(%%eax,%0,4),%0\n"
" jmp 8b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 2b,8b\n"
" .long 21b,8b\n"
" .long 3b,8b\n"
" .long 31b,8b\n"
" .long 4b,8b\n"
" .long 41b,8b\n"
" .long 10b,8b\n"
" .long 51b,8b\n"
" .long 11b,8b\n"
" .long 61b,8b\n"
" .long 12b,8b\n"
" .long 71b,8b\n"
" .long 13b,8b\n"
" .long 81b,8b\n"
" .long 14b,8b\n"
" .long 91b,8b\n"
" .long 6b,9b\n"
" .long 7b,8b\n"
".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
return size;
}
unsigned long
__copy_user_zeroing_int(void *to, const void *from, unsigned long size)
{
int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
" cmpl $67, %0\n"
" jbe 2f\n"
"1: movl 64(%4), %%eax\n"
" .align 2,0x90\n"
"2: movl 0(%4), %%eax\n"
"21: movl 4(%4), %%edx\n"
" movl %%eax, 0(%3)\n"
" movl %%edx, 4(%3)\n"
"3: movl 8(%4), %%eax\n"
"31: movl 12(%4),%%edx\n"
" movl %%eax, 8(%3)\n"
" movl %%edx, 12(%3)\n"
"4: movl 16(%4), %%eax\n"
"41: movl 20(%4), %%edx\n"
" movl %%eax, 16(%3)\n"
" movl %%edx, 20(%3)\n"
"10: movl 24(%4), %%eax\n"
"51: movl 28(%4), %%edx\n"
" movl %%eax, 24(%3)\n"
" movl %%edx, 28(%3)\n"
"11: movl 32(%4), %%eax\n"
"61: movl 36(%4), %%edx\n"
" movl %%eax, 32(%3)\n"
" movl %%edx, 36(%3)\n"
"12: movl 40(%4), %%eax\n"
"71: movl 44(%4), %%edx\n"
" movl %%eax, 40(%3)\n"
" movl %%edx, 44(%3)\n"
"13: movl 48(%4), %%eax\n"
"81: movl 52(%4), %%edx\n"
" movl %%eax, 48(%3)\n"
" movl %%edx, 52(%3)\n"
"14: movl 56(%4), %%eax\n"
"91: movl 60(%4), %%edx\n"
" movl %%eax, 56(%3)\n"
" movl %%edx, 60(%3)\n"
" addl $-64, %0\n"
" addl $64, %4\n"
" addl $64, %3\n"
" cmpl $63, %0\n"
" ja 0b\n"
"5: movl %0, %%eax\n"
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
"6: rep; movsl\n"
" movl %%eax,%0\n"
"7: rep; movsb\n"
"8:\n"
".section .fixup,\"ax\"\n"
"9: lea 0(%%eax,%0,4),%0\n"
"16: pushl %0\n"
" pushl %%eax\n"
" xorl %%eax,%%eax\n"
" rep; stosb\n"
" popl %%eax\n"
" popl %0\n"
" jmp 8b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 0b,16b\n"
" .long 1b,16b\n"
" .long 2b,16b\n"
" .long 21b,16b\n"
" .long 3b,16b\n"
" .long 31b,16b\n"
" .long 4b,16b\n"
" .long 41b,16b\n"
" .long 10b,16b\n"
" .long 51b,16b\n"
" .long 11b,16b\n"
" .long 61b,16b\n"
" .long 12b,16b\n"
" .long 71b,16b\n"
" .long 13b,16b\n"
" .long 81b,16b\n"
" .long 14b,16b\n"
" .long 91b,16b\n"
" .long 6b,9b\n"
" .long 7b,16b\n"
".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
return size;
}
#endif /* INTEL_MOVSL */
@@ -33,7 +33,39 @@
#define segment_eq(a,b) ((a).seg == (b).seg)
extern int __verify_write(const void *, unsigned long);
/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
#if defined(CONFIG_M586MMX) || defined(CONFIG_M686) || \
defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUM4)
#define INTEL_MOVSL
#endif
#ifdef INTEL_MOVSL
extern struct movsl_mask {
int mask;
} ____cacheline_aligned_in_smp movsl_mask;
static inline int movsl_is_ok(const void *a1, const void *a2, unsigned long n)
{
if (n < 64)
return 1;
if ((((const long)a1 ^ (const long)a2) & movsl_mask.mask) == 0)
return 1;
return 0;
}
#else
static inline int movsl_is_ok(const void *a1, const void *a2, unsigned long n)
{
return 1;
}
#endif
/* These are undefined on !INTEL_MOVSL. And they should be unreferenced. */
unsigned long __copy_user_int(void *, const void *, unsigned long);
unsigned long __copy_user_zeroing_int(void *, const void *, unsigned long);
int __verify_write(const void *, unsigned long);
#define __addr_ok(addr) ((unsigned long)(addr) < (current_thread_info()->addr_limit.seg))
@@ -255,37 +287,64 @@ do { \
/* Generic arbitrary sized copy. */
#define __copy_user(to,from,size) \
do { \
-int __d0, __d1; \
+int __d0, __d1, __d2; \
__asm__ __volatile__( \
" cmp $7,%0\n" \
" jbe 1f\n" \
" movl %1,%0\n" \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
"4: rep; movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"5: addl %3,%0\n" \
" jmp 2b\n" \
"3: lea 0(%3,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 4b,5b\n" \
" .long 0b,3b\n" \
" .long 1b,2b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1) \
: "r"(size & 3), "0"(size / 4), "1"(to), "2"(from) \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
} while (0)
#define __copy_user_zeroing(to,from,size) \
do { \
-int __d0, __d1; \
+int __d0, __d1, __d2; \
__asm__ __volatile__( \
" cmp $7,%0\n" \
" jbe 1f\n" \
" movl %1,%0\n" \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
"4: rep; movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"5: addl %3,%0\n" \
" jmp 6f\n" \
"3: lea 0(%3,%0,4),%0\n" \
"4: pushl %0\n" \
"6: pushl %0\n" \
" pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" rep; stosb\n" \
@@ -295,28 +354,37 @@ do { \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 4b,5b\n" \
" .long 0b,3b\n" \
" .long 1b,4b\n" \
" .long 1b,6b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1) \
: "r"(size & 3), "0"(size / 4), "1"(to), "2"(from) \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
} while (0)
/* We let the __ versions of copy_from/to_user inline, because they're often
* used in fast paths and have only a small space overhead.
*/
static inline unsigned long
__generic_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
{
-__copy_user_zeroing(to,from,n);
+if (movsl_is_ok(to, from, n))
+__copy_user_zeroing(to, from, n);
+else
+n = __copy_user_zeroing_int(to, from, n);
return n;
}
static inline unsigned long
__generic_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
{
-__copy_user(to,from,n);
+if (movsl_is_ok(to, from, n))
+__copy_user(to, from, n);
+else
+n = __copy_user_int(to, from, n);
return n;
}
@@ -578,24 +646,16 @@ __constant_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
}
#define copy_to_user(to,from,n) \
-(__builtin_constant_p(n) ? \
-__constant_copy_to_user((to),(from),(n)) : \
-__generic_copy_to_user((to),(from),(n)))
+__generic_copy_to_user((to),(from),(n))
#define copy_from_user(to,from,n) \
-(__builtin_constant_p(n) ? \
-__constant_copy_from_user((to),(from),(n)) : \
-__generic_copy_from_user((to),(from),(n)))
+__generic_copy_from_user((to),(from),(n))
#define __copy_to_user(to,from,n) \
-(__builtin_constant_p(n) ? \
-__constant_copy_to_user_nocheck((to),(from),(n)) : \
-__generic_copy_to_user_nocheck((to),(from),(n)))
+__generic_copy_to_user_nocheck((to),(from),(n))
#define __copy_from_user(to,from,n) \
-(__builtin_constant_p(n) ? \
-__constant_copy_from_user_nocheck((to),(from),(n)) : \
-__generic_copy_from_user_nocheck((to),(from),(n)))
+__generic_copy_from_user_nocheck((to),(from),(n))
long strncpy_from_user(char *dst, const char *src, long count);
long __strncpy_from_user(char *dst, const char *src, long count);