Commit 427fda2c authored by Linus Torvalds

x86: improve on the non-rep 'copy_user' function

The old 'copy_user_generic_unrolled' function was oddly implemented for
largely historical reasons: it had been based on the uncached copy case,
which has some other concerns.

For example, the __copy_user_nocache() function uses 'movnti' for the
destination stores, and those want the destination to be aligned.  In
contrast, the regular copy function doesn't really care, and trying to
align things only complicates matters.
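
(For illustration only, not part of this patch: a minimal user-space sketch
of a non-temporal copy built on the SSE2 'movnti' intrinsic. The helper name
nt_copy and the "length is a multiple of 8" assumption are made up for the
example; it just shows why the uncached path keeps an alignment prologue
while the regular copy does not need one.)

    #include <immintrin.h>
    #include <stddef.h>

    /* Hypothetical sketch: stream 64-bit stores past the cache with movnti
     * (_mm_stream_si64).  Write-combining of these stores works best when
     * the destination is 8-byte aligned, which is what ALIGN_DESTINATION
     * arranges for the real __copy_user_nocache().  Assumes dst is 8-byte
     * aligned and n is a multiple of 8. */
    static void nt_copy(void *dst, const void *src, size_t n)
    {
            long long *d = dst;
            const long long *s = src;

            for (size_t i = 0; i < n / 8; i++)
                    _mm_stream_si64(&d[i], s[i]);
            _mm_sfence();   /* order the non-temporal stores */
    }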

Also, like the clear_user function, the copy function had some odd
handling of the repeat counts, complicating the exception handling for
no really good reason.  So as with clear_user, just write it to keep all
the byte counts in the %rcx register, exactly like the 'rep movs'
functionality that this replaces.

Unlike a real 'rep movs', we do allow this to trash a few temporary
registers, so that we don't have to unnecessarily save/restore registers
on the stack.
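
(As an aside, not part of the patch: the register convention being matched
is easy to show in a small user-space sketch. The wrapper name rep_movs_copy
is hypothetical; it only illustrates the 'rep movs' contract that the
replacement function follows.)

    #include <stddef.h>

    /* 'rep movs' convention mirrored by rep_movs_alternative: destination in
     * %rdi, source in %rsi, byte count in %rcx; on return %rcx holds the
     * bytes NOT copied (0 on success) and %rdi/%rsi have advanced. */
    static size_t rep_movs_copy(void *to, const void *from, size_t len)
    {
            asm volatile("rep movsb"
                         : "+D" (to), "+S" (from), "+c" (len)
                         : : "memory");
            return len;     /* leftover byte count, like the kernel helper */
    }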

And like the clearing case, rename this to what it now clearly is:
'rep_movs_alternative', and make it one coherent function, so that it
shows up as such in profiles (instead of the odd split between
"copy_user_generic_unrolled" and "copy_user_short_string", the latter of
which was not about strings at all, and which was shared with the
uncached case).
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 8c9b6a88
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -18,7 +18,7 @@
 
 /* Handles exceptions in both to and from, but doesn't do access_ok */
 __must_check unsigned long
-copy_user_generic_unrolled(void *to, const void *from, unsigned len);
+rep_movs_alternative(void *to, const void *from, unsigned len);
 
 static __always_inline __must_check unsigned long
 copy_user_generic(void *to, const void *from, unsigned long len)
@@ -26,16 +26,16 @@ copy_user_generic(void *to, const void *from, unsigned long len)
 	stac();
 	/*
 	 * If CPU has FSRM feature, use 'rep movs'.
-	 * Otherwise, use copy_user_generic_unrolled.
+	 * Otherwise, use rep_movs_alternative.
 	 */
 	asm volatile(
 		"1:\n\t"
 		ALTERNATIVE("rep movsb",
-			    "call copy_user_generic_unrolled", ALT_NOT(X86_FEATURE_FSRM))
+			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
 		"2:\n"
 		_ASM_EXTABLE_UA(1b, 2b)
 		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
-		: : "memory", "rax", "rdx", "r8", "r9", "r10", "r11");
+		: : "memory", "rax", "r8", "r9", "r10", "r11");
 	clac();
 	return len;
 }
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -17,30 +17,9 @@
 #include <asm/export.h>
 #include <asm/trapnr.h>
 
-.macro ALIGN_DESTINATION
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f				/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-
-	_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
-	_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
-.endm
-
 /*
- * copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro
- * code for rep movsq
+ * rep_movs_alternative - memory copy with exception handling.
+ * This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
  *
  * Input:
  * rdi destination
@@ -52,156 +31,119 @@
  *
  * NOTE! The calling convention is very intentionally the same as
  * for 'rep movs', so that we can rewrite the function call with
- * just a plain 'rep movs' on machines that have FSRM.
- *
- * HOWEVER! This function ends up having a lot of the code common
- * with __copy_user_nocache(), which is a normal C function, and
- * has a similar calling convention, but gets the 'count' in %rdx,
- * and returns the result in %rax.
- *
- * To share as much code as possible, we end up returning the
- * result in *both* %rcx/%rax, and we also move the initial count
- * into %rdx.
- *
- * We can clobber rdx/rsi/rdi and r8-r11
+ * just a plain 'rep movs' on machines that have FSRM. But to make
+ * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
  */
-SYM_FUNC_START(copy_user_generic_unrolled)
-	movl %ecx,%edx
-	cmpl $8,%ecx
-	jb .Lcopy_user_short_string_bytes
-	ALIGN_DESTINATION
-	movl %edx,%ecx
-	andl $63,%edx
-	shrl $6,%ecx
-	jz copy_user_short_string
-1:	movq (%rsi),%r8
-2:	movq 1*8(%rsi),%r9
-3:	movq 2*8(%rsi),%r10
-4:	movq 3*8(%rsi),%r11
-5:	movq %r8,(%rdi)
-6:	movq %r9,1*8(%rdi)
-7:	movq %r10,2*8(%rdi)
-8:	movq %r11,3*8(%rdi)
-9:	movq 4*8(%rsi),%r8
-10:	movq 5*8(%rsi),%r9
-11:	movq 6*8(%rsi),%r10
-12:	movq 7*8(%rsi),%r11
-13:	movq %r8,4*8(%rdi)
-14:	movq %r9,5*8(%rdi)
-15:	movq %r10,6*8(%rdi)
-16:	movq %r11,7*8(%rdi)
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	decl %ecx
-	jnz 1b
-	jmp copy_user_short_string
-
-30:	shll $6,%ecx
-	addl %ecx,%edx
-	jmp .Lcopy_user_handle_tail
-
-	_ASM_EXTABLE_CPY(1b, 30b)
-	_ASM_EXTABLE_CPY(2b, 30b)
-	_ASM_EXTABLE_CPY(3b, 30b)
-	_ASM_EXTABLE_CPY(4b, 30b)
-	_ASM_EXTABLE_CPY(5b, 30b)
-	_ASM_EXTABLE_CPY(6b, 30b)
-	_ASM_EXTABLE_CPY(7b, 30b)
-	_ASM_EXTABLE_CPY(8b, 30b)
-	_ASM_EXTABLE_CPY(9b, 30b)
-	_ASM_EXTABLE_CPY(10b, 30b)
-	_ASM_EXTABLE_CPY(11b, 30b)
-	_ASM_EXTABLE_CPY(12b, 30b)
-	_ASM_EXTABLE_CPY(13b, 30b)
-	_ASM_EXTABLE_CPY(14b, 30b)
-	_ASM_EXTABLE_CPY(15b, 30b)
-	_ASM_EXTABLE_CPY(16b, 30b)
-SYM_FUNC_END(copy_user_generic_unrolled)
-EXPORT_SYMBOL(copy_user_generic_unrolled)
-
-/*
- * Try to copy last bytes and clear the rest if needed.
- * Since protection fault in copy_from/to_user is not a normal situation,
- * it is not necessary to optimize tail handling.
- * Don't try to copy the tail if machine check happened
- *
- * Input:
- * eax trap number written by ex_handler_copy()
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
-	cmp $X86_TRAP_MC,%eax
-	je 3f
-
-	movl %edx,%ecx
-1:	rep movsb
-2:	mov %ecx,%eax
-	RET
-
-3:
-	movl %edx,%eax
-	movl %edx,%ecx
-	RET
-
-	_ASM_EXTABLE_CPY(1b, 2b)
-
-.Lcopy_user_handle_align:
-	addl %ecx,%edx		/* ecx is zerorest also */
-	jmp .Lcopy_user_handle_tail
-SYM_CODE_END(.Lcopy_user_handle_tail)
-
-/*
- * Finish memcpy of less than 64 bytes.  #AC should already be set.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count (< 64)
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_CODE_START_LOCAL(copy_user_short_string)
-	movl %edx,%ecx
-	andl $7,%edx
-	shrl $3,%ecx
-	jz .Lcopy_user_short_string_bytes
-18:	movq (%rsi),%r8
-19:	movq %r8,(%rdi)
-	leaq 8(%rsi),%rsi
-	leaq 8(%rdi),%rdi
-	decl %ecx
-	jnz 18b
-.Lcopy_user_short_string_bytes:
-	andl %edx,%edx
-	jz 23f
-	movl %edx,%ecx
-21:	movb (%rsi),%al
-22:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 21b
-23:	xor %eax,%eax
-	xor %ecx,%ecx
-	RET
-
-40:	leal (%rdx,%rcx,8),%edx
-	jmp 60f
-50:	movl %ecx,%edx		/* ecx is zerorest also */
-60:	jmp .Lcopy_user_handle_tail
-
-	_ASM_EXTABLE_CPY(18b, 40b)
-	_ASM_EXTABLE_CPY(19b, 40b)
-	_ASM_EXTABLE_CPY(21b, 50b)
-	_ASM_EXTABLE_CPY(22b, 50b)
-SYM_CODE_END(copy_user_short_string)
+SYM_FUNC_START(rep_movs_alternative)
+	cmpq $64,%rcx
+	jae .Lunrolled
+
+	cmp $8,%ecx
+	jae .Lword
+
+	testl %ecx,%ecx
+	je .Lexit
+
+.Lcopy_user_tail:
+0:	movb (%rsi),%al
+1:	movb %al,(%rdi)
+	inc %rdi
+	inc %rsi
+	dec %rcx
+	jne .Lcopy_user_tail
+.Lexit:
+	RET
+
+	_ASM_EXTABLE_UA( 0b, .Lexit)
+	_ASM_EXTABLE_UA( 1b, .Lexit)
+
+	.p2align 4
+.Lword:
+2:	movq (%rsi),%rax
+3:	movq %rax,(%rdi)
+	addq $8,%rsi
+	addq $8,%rdi
+	sub $8,%ecx
+	je .Lexit
+	cmp $8,%ecx
+	jae .Lword
+	jmp .Lcopy_user_tail
+
+	_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
+
+	.p2align 4
+.Lunrolled:
+10:	movq (%rsi),%r8
+11:	movq 8(%rsi),%r9
+12:	movq 16(%rsi),%r10
+13:	movq 24(%rsi),%r11
+14:	movq %r8,(%rdi)
+15:	movq %r9,8(%rdi)
+16:	movq %r10,16(%rdi)
+17:	movq %r11,24(%rdi)
+20:	movq 32(%rsi),%r8
+21:	movq 40(%rsi),%r9
+22:	movq 48(%rsi),%r10
+23:	movq 56(%rsi),%r11
+24:	movq %r8,32(%rdi)
+25:	movq %r9,40(%rdi)
+26:	movq %r10,48(%rdi)
+27:	movq %r11,56(%rdi)
+	addq $64,%rsi
+	addq $64,%rdi
+	subq $64,%rcx
+	cmpq $64,%rcx
+	jae .Lunrolled
+	cmpl $8,%ecx
+	jae .Lword
+	testl %ecx,%ecx
+	jne .Lcopy_user_tail
+	RET
+
+	_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
+SYM_FUNC_END(rep_movs_alternative)
+EXPORT_SYMBOL(rep_movs_alternative)
+
+/*
+ * The uncached copy needs to align the destination for
+ * movnti and friends.
+ */
+.macro ALIGN_DESTINATION
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+100:	movb (%rsi),%al
+101:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 100b
+102:
+
+	_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
+	_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
+.endm
 
 /*
  * copy_user_nocache - Uncached memory copy with exception handling
  * This will force destination out of cache for more performance.
@@ -346,5 +288,40 @@ SYM_FUNC_START(__copy_user_nocache)
 	_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
 	_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
 	_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
+
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if machine check happened
+ *
+ * Input:
+ * eax trap number written by ex_handler_copy()
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+.Lcopy_user_handle_tail:
+	cmp $X86_TRAP_MC,%eax
+	je 3f
+
+	movl %edx,%ecx
+1:	rep movsb
+2:	mov %ecx,%eax
+	RET
+
+3:
+	movl %edx,%eax
+	RET
+
+	_ASM_EXTABLE_CPY(1b, 2b)
+
+.Lcopy_user_handle_align:
+	addl %ecx,%edx		/* ecx is zerorest also */
+	jmp .Lcopy_user_handle_tail
+
 SYM_FUNC_END(__copy_user_nocache)
 EXPORT_SYMBOL(__copy_user_nocache)
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1285,7 +1285,7 @@ static const char *uaccess_safe_builtin[] = {
 	"copy_mc_enhanced_fast_string",
 	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
 	"rep_stos_alternative",
-	"copy_user_generic_unrolled",
+	"rep_movs_alternative",
 	"__copy_user_nocache",
 	NULL
 };