Commit a5624566 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-rep-insns': x86 user copy clarifications

Merge my x86 user copy updates branch.

This cleans up a lot of our x86 memory copy code, particularly for user
accesses.  I've been pushing for microarchitectural support for good
memory copying and clearing for a long while, and it's been visible in
how the kernel has aggressively used 'rep movs' and 'rep stos' whenever
possible.

And that micro-architectural support has been improving over the years,
to the point where on modern CPUs the best option for a memory copy
that would become a function call (as opposed to being something that
can just be turned into individual 'mov' instructions) is now to inline
the string instruction sequence instead.

However, that only makes sense when we have the modern markers for this:
the x86 FSRM and FSRS capabilities ("Fast Short REP MOVS/STOS").

So this cleans up a lot of our historical code, gets rid of the legacy
marker use ("REP_GOOD" and "ERMS") from the memcpy/memset cases, and
replaces it with that modern reality.  Note that REP_GOOD and ERMS end
up still being used by the known large cases (ie page copying and
clearing).

The reason much of this ends up being about user memory accesses is that
the normal in-kernel cases are done by the compiler (__builtin_memcpy()
and __builtin_memset()) and getting to the point where we can use our
instruction rewriting to inline those to be string instructions will
need some compiler support.

In contrast, the user accessor functions are all entirely controlled by
the kernel code, so we can change those arbitrarily.

Thanks to Borislav Petkov for feedback on the series, and Jens testing
some of this on micro-architectures I didn't personally have access to.

* x86-rep-insns:
  x86: rewrite '__copy_user_nocache' function
  x86: remove 'zerorest' argument from __copy_user_nocache()
  x86: set FSRS automatically on AMD CPUs that have FSRM
  x86: improve on the non-rep 'copy_user' function
  x86: improve on the non-rep 'clear_user' function
  x86: inline the 'rep movs' in user copies for the FSRM case
  x86: move stac/clac from user copy routines into callers
  x86: don't use REP_GOOD or ERMS for user memory clearing
  x86: don't use REP_GOOD or ERMS for user memory copies
  x86: don't use REP_GOOD or ERMS for small memory clearing
  x86: don't use REP_GOOD or ERMS for small memory copies
parents 487c20b0 034ff37d
...@@ -18,32 +18,26 @@ ...@@ -18,32 +18,26 @@
/* Handles exceptions in both to and from, but doesn't do access_ok */ /* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long __must_check unsigned long
copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); rep_movs_alternative(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_unrolled(void *to, const void *from, unsigned len);
static __always_inline __must_check unsigned long static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len) copy_user_generic(void *to, const void *from, unsigned long len)
{ {
unsigned ret; stac();
/* /*
* If CPU has ERMS feature, use copy_user_enhanced_fast_string. * If CPU has FSRM feature, use 'rep movs'.
* Otherwise, if CPU has rep_good feature, use copy_user_generic_string. * Otherwise, use rep_movs_alternative.
* Otherwise, use copy_user_generic_unrolled.
*/ */
alternative_call_2(copy_user_generic_unrolled, asm volatile(
copy_user_generic_string, "1:\n\t"
X86_FEATURE_REP_GOOD, ALTERNATIVE("rep movsb",
copy_user_enhanced_fast_string, "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
X86_FEATURE_ERMS, "2:\n"
ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), _ASM_EXTABLE_UA(1b, 2b)
"=d" (len)), :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
"1" (to), "2" (from), "3" (len) : : "memory", "rax", "r8", "r9", "r10", "r11");
: "memory", "rcx", "r8", "r9", "r10", "r11"); clac();
return ret; return len;
} }
static __always_inline __must_check unsigned long static __always_inline __must_check unsigned long
...@@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) ...@@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
return copy_user_generic((__force void *)dst, src, size); return copy_user_generic((__force void *)dst, src, size);
} }
extern long __copy_user_nocache(void *dst, const void __user *src, extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
unsigned size, int zerorest);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len); size_t len);
...@@ -69,8 +61,12 @@ static inline int ...@@ -69,8 +61,12 @@ static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src, __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
unsigned size) unsigned size)
{ {
long ret;
kasan_check_write(dst, size); kasan_check_write(dst, size);
return __copy_user_nocache(dst, src, size, 0); stac();
ret = __copy_user_nocache(dst, src, size);
clac();
return ret;
} }
static inline int static inline int
...@@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) ...@@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
*/ */
__must_check unsigned long __must_check unsigned long
clear_user_original(void __user *addr, unsigned long len); rep_stos_alternative(void __user *addr, unsigned long len);
__must_check unsigned long
clear_user_rep_good(void __user *addr, unsigned long len);
__must_check unsigned long
clear_user_erms(void __user *addr, unsigned long len);
static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{ {
...@@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr ...@@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
*/ */
asm volatile( asm volatile(
"1:\n\t" "1:\n\t"
ALTERNATIVE_3("rep stosb", ALTERNATIVE("rep stosb",
"call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM), "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
"call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
"call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
"2:\n" "2:\n"
_ASM_EXTABLE_UA(1b, 2b) _ASM_EXTABLE_UA(1b, 2b)
: "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
: "a" (0) : "a" (0));
/* rep_good clobbers %rdx */
: "rdx");
clac(); clac();
......
...@@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c) ...@@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 >= 0x10) if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD); set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* AMD FSRM also implies FSRS */
if (cpu_has(c, X86_FEATURE_FSRM))
set_cpu_cap(c, X86_FEATURE_FSRS);
/* get apicid instead of initial apic id from cpuid */ /* get apicid instead of initial apic id from cpuid */
c->apicid = hard_smp_processor_id(); c->apicid = hard_smp_processor_id();
......
...@@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y) ...@@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
endif endif
lib-y += clear_page_64.o copy_page_64.o lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o lib-y += copy_user_64.o copy_user_uncached_64.o
lib-y += cmpxchg16b_emu.o lib-y += cmpxchg16b_emu.o
endif endif
...@@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms) ...@@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
* Input: * Input:
* rdi destination * rdi destination
* rcx count * rcx count
* rax is zero
* *
* Output: * Output:
* rcx: uncleared bytes or 0 if successful. * rcx: uncleared bytes or 0 if successful.
*/ */
SYM_FUNC_START(clear_user_original) SYM_FUNC_START(rep_stos_alternative)
/* cmpq $64,%rcx
* Copy only the lower 32 bits of size as that is enough to handle the rest bytes, jae .Lunrolled
* i.e., no need for a 'q' suffix and thus a REX prefix.
*/
mov %ecx,%eax
shr $3,%rcx
jz .Lrest_bytes
# do the qwords first cmp $8,%ecx
.p2align 4 jae .Lword
.Lqwords:
movq $0,(%rdi)
lea 8(%rdi),%rdi
dec %rcx
jnz .Lqwords
.Lrest_bytes: testl %ecx,%ecx
and $7, %eax je .Lexit
jz .Lexit
# now do the rest bytes .Lclear_user_tail:
.Lbytes: 0: movb %al,(%rdi)
movb $0,(%rdi)
inc %rdi inc %rdi
dec %eax dec %rcx
jnz .Lbytes jnz .Lclear_user_tail
.Lexit: .Lexit:
/*
* %rax still needs to be cleared in the exception case because this function is called
* from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
* in case it might reuse it somewhere.
*/
xor %eax,%eax
RET
.Lqwords_exception:
# convert remaining qwords back into bytes to return to caller
shl $3, %rcx
and $7, %eax
add %rax,%rcx
jmp .Lexit
.Lbytes_exception:
mov %eax,%ecx
jmp .Lexit
_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
SYM_FUNC_END(clear_user_original)
EXPORT_SYMBOL(clear_user_original)
/*
* Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
* present.
* Input:
* rdi destination
* rcx count
*
* Output:
* rcx: uncleared bytes or 0 if successful.
*/
SYM_FUNC_START(clear_user_rep_good)
# call the original thing for less than a cacheline
cmp $64, %rcx
jb clear_user_original
.Lprep:
# copy lower 32-bits for rest bytes
mov %ecx, %edx
shr $3, %rcx
jz .Lrep_good_rest_bytes
.Lrep_good_qwords:
rep stosq
.Lrep_good_rest_bytes:
and $7, %edx
jz .Lrep_good_exit
.Lrep_good_bytes:
mov %edx, %ecx
rep stosb
.Lrep_good_exit:
# see .Lexit comment above
xor %eax, %eax
RET RET
.Lrep_good_qwords_exception: _ASM_EXTABLE_UA( 0b, .Lexit)
# convert remaining qwords back into bytes to return to caller
shl $3, %rcx
and $7, %edx
add %rdx, %rcx
jmp .Lrep_good_exit
_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception) .Lword:
_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit) 1: movq %rax,(%rdi)
SYM_FUNC_END(clear_user_rep_good) addq $8,%rdi
EXPORT_SYMBOL(clear_user_rep_good) sub $8,%ecx
je .Lexit
cmp $8,%ecx
jae .Lword
jmp .Lclear_user_tail
/* .p2align 4
* Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present. .Lunrolled:
* Input: 10: movq %rax,(%rdi)
* rdi destination 11: movq %rax,8(%rdi)
* rcx count 12: movq %rax,16(%rdi)
* 13: movq %rax,24(%rdi)
* Output: 14: movq %rax,32(%rdi)
* rcx: uncleared bytes or 0 if successful. 15: movq %rax,40(%rdi)
* 16: movq %rax,48(%rdi)
*/ 17: movq %rax,56(%rdi)
SYM_FUNC_START(clear_user_erms) addq $64,%rdi
# call the original thing for less than a cacheline subq $64,%rcx
cmp $64, %rcx cmpq $64,%rcx
jb clear_user_original jae .Lunrolled
cmpl $8,%ecx
.Lerms_bytes: jae .Lword
rep stosb testl %ecx,%ecx
jne .Lclear_user_tail
.Lerms_exit:
xorl %eax,%eax
RET RET
_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit) /*
SYM_FUNC_END(clear_user_erms) * If we take an exception on any of the
EXPORT_SYMBOL(clear_user_erms) * word stores, we know that %rcx isn't zero,
* so we can just go to the tail clearing to
* get the exact count.
*
* The unrolled case might end up clearing
* some bytes twice. Don't care.
*
* We could use the value in %rdi to avoid
* a second fault on the exact count case,
* but do we really care? No.
*
* Finally, we could try to align %rdi at the
* top of the unrolling. But unaligned stores
* just aren't that common or expensive.
*/
_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
SYM_FUNC_END(rep_stos_alternative)
EXPORT_SYMBOL(rep_stos_alternative)
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
*/
#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm/export.h>
/*
* __copy_user_nocache - Uncached memory copy with exception handling
*
* This copies from user space into kernel space, but the kernel
* space accesses can take a machine check exception, so they too
* need exception handling.
*
* Note: only 32-bit and 64-bit stores have non-temporal versions,
* and we only use aligned versions. Any unaligned parts at the
* start or end of the copy will be done using normal cached stores.
*
* This routine contains no stac/clac of its own; the caller is
* expected to enable and disable user access around the call.
*
* Input:
* rdi destination
* rsi source
* edx count
*
* Output:
* rax uncopied bytes or 0 if successful (the value left in edx
*     when the routine exits).
*
* Modifies rax, rdx, rsi, rdi and r8-r11.
*/
SYM_FUNC_START(__copy_user_nocache)
/* If destination is not 8-byte aligned, we'll have to align it */
testb $7,%dil
jne .Lalign
.Lis_aligned:
cmp $64,%edx
jb .Lquadwords
.p2align 4,0x90
/*
* Main loop, 64 bytes per iteration: two batches of four
* 8-byte loads, each followed by four non-temporal stores.
*/
.Lunrolled:
10: movq (%rsi),%r8
11: movq 8(%rsi),%r9
12: movq 16(%rsi),%r10
13: movq 24(%rsi),%r11
20: movnti %r8,(%rdi)
21: movnti %r9,8(%rdi)
22: movnti %r10,16(%rdi)
23: movnti %r11,24(%rdi)
30: movq 32(%rsi),%r8
31: movq 40(%rsi),%r9
32: movq 48(%rsi),%r10
33: movq 56(%rsi),%r11
40: movnti %r8,32(%rdi)
41: movnti %r9,40(%rdi)
42: movnti %r10,48(%rdi)
43: movnti %r11,56(%rdi)
addq $64,%rsi
addq $64,%rdi
sub $64,%edx
cmp $64,%edx
jae .Lunrolled
/*
* First set of user mode loads have been done
* without any stores, so if they fail, we can
* just try the non-unrolled loop.
*/
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)
/*
* The second set of user mode loads have been
* done with 32 bytes stored to the destination,
* so we need to take that into account before
* falling back to the non-unrolled loop.
*/
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)
/*
* An exception on a write means that we're
* done, but we need to update the count
* depending on where in the unrolled loop
* we were.
*/
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)
/*
* Non-unrolled loop: one 8-byte load and one non-temporal
* store per iteration while at least 8 bytes remain.
*/
.Lquadwords:
cmp $8,%edx
jb .Llong
50: movq (%rsi),%rax
51: movnti %rax,(%rdi)
addq $8,%rsi
addq $8,%rdi
sub $8,%edx
jmp .Lquadwords
/*
* If we fail on the last full quadword, we will
* not try to do any byte-wise cached accesses.
* We will try to do one more 4-byte uncached
* one, though.
*/
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)
/*
* Fewer than 8 bytes remain here: bits 2, 1 and 0 of the
* count select a 4-byte, 2-byte and 1-byte tail copy.
*/
.Llong:
test $4,%dl
je .Lword
60: movl (%rsi),%eax
61: movnti %eax,(%rdi)
addq $4,%rsi
addq $4,%rdi
sub $4,%edx
.Lword:
/* Only regular (cached) stores follow from this point on */
sfence
test $2,%dl
je .Lbyte
70: movw (%rsi),%ax
71: movw %ax,(%rdi)
addq $2,%rsi
addq $2,%rdi
sub $2,%edx
.Lbyte:
test $1,%dl
je .Ldone
80: movb (%rsi),%al
81: movb %al,(%rdi)
dec %edx
/* Common exit: return whatever count is left in edx */
.Ldone:
mov %edx,%eax
RET
/*
* If we fail on the last four bytes, we won't
* bother with any fixups. It's dead, Jim. Note
* that there's no need for 'sfence' for any
* of this, since the exception will have been
* serializing.
*/
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)
/*
* This is the "head needs aligning" case when
* the destination isn't 8-byte aligned. The
* 4-byte case can be done uncached, but any
* smaller alignment is done with regular stores.
*
* Each step first checks that enough bytes remain;
* if not, it branches into the tail code above.
*/
.Lalign:
test $1,%dil
je .Lalign_word
test %edx,%edx
je .Ldone
90: movb (%rsi),%al
91: movb %al,(%rdi)
inc %rsi
inc %rdi
dec %edx
.Lalign_word:
test $2,%dil
je .Lalign_long
cmp $2,%edx
jb .Lbyte
92: movw (%rsi),%ax
93: movw %ax,(%rdi)
addq $2,%rsi
addq $2,%rdi
sub $2,%edx
.Lalign_long:
test $4,%dil
je .Lis_aligned
cmp $4,%edx
jb .Lword
94: movl (%rsi),%eax
95: movnti %eax,(%rdi)
addq $4,%rsi
addq $4,%rdi
sub $4,%edx
jmp .Lis_aligned
/*
* If we fail on the initial alignment accesses,
* we're all done. Again, no point in trying to
* do byte-by-byte probing if the 4-byte load
* fails - we're not doing any uncached accesses
* any more.
*/
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)
/*
* Exception table fixups for faults in the middle
*
* The .LdoneN labels fall through into each other, so
* entering at .LdoneN subtracts N bytes from the count
* (8 for every store that preceded the faulting one in
* the current unrolled iteration) before returning it.
*/
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
mov %edx,%eax
RET
/*
* A load in the second half of the unrolled loop faulted:
* step past the 32 bytes already stored and continue with
* the non-unrolled loop.
*/
.Lfixup32:
addq $32,%rsi
addq $32,%rdi
sub $32,%edx
jmp .Lquadwords
/*
* The 8-byte load at 50 faulted: try one final 4-byte
* load and non-temporal store, then return the count.
*/
.Llast4:
52: movl (%rsi),%eax
53: movnti %eax,(%rdi)
sfence
sub $4,%edx
mov %edx,%eax
RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)
SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
...@@ -10,13 +10,6 @@ ...@@ -10,13 +10,6 @@
.section .noinstr.text, "ax" .section .noinstr.text, "ax"
/*
* We build a jump to memcpy_orig by default which gets NOPped out on
* the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
* have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
* to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
*/
/* /*
* memcpy - Copy a memory block. * memcpy - Copy a memory block.
* *
...@@ -27,17 +20,21 @@ ...@@ -27,17 +20,21 @@
* *
* Output: * Output:
* rax original destination * rax original destination
*
* The FSRM alternative should be done inline (avoiding the call and
* the disgusting return handling), but that would require some help
* from the compiler for better calling conventions.
*
* The 'rep movsb' itself is small enough to replace the call, but the
* two register moves blow up the code. And one of them is "needed"
* only for the return value that is the same as the destination input,
* which the compiler could/should do much better anyway.
*/ */
SYM_TYPED_FUNC_START(__memcpy) SYM_TYPED_FUNC_START(__memcpy)
ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
"jmp memcpy_erms", X86_FEATURE_ERMS
movq %rdi, %rax movq %rdi, %rax
movq %rdx, %rcx movq %rdx, %rcx
shrq $3, %rcx
andl $7, %edx
rep movsq
movl %edx, %ecx
rep movsb rep movsb
RET RET
SYM_FUNC_END(__memcpy) SYM_FUNC_END(__memcpy)
...@@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy) ...@@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS(memcpy, __memcpy) SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy) EXPORT_SYMBOL(memcpy)
/*
* memcpy_erms() - enhanced fast string memcpy. This is faster and
* simpler than memcpy. Use memcpy_erms when possible.
*/
SYM_FUNC_START_LOCAL(memcpy_erms)
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
RET
SYM_FUNC_END(memcpy_erms)
SYM_FUNC_START_LOCAL(memcpy_orig) SYM_FUNC_START_LOCAL(memcpy_orig)
movq %rdi, %rax movq %rdi, %rax
......
...@@ -18,27 +18,22 @@ ...@@ -18,27 +18,22 @@
* rdx count (bytes) * rdx count (bytes)
* *
* rax original destination * rax original destination
*
* The FSRS alternative should be done inline (avoiding the call and
* the disgusting return handling), but that would require some help
* from the compiler for better calling conventions.
*
* The 'rep stosb' itself is small enough to replace the call, but all
* the register moves blow up the code. And two of them are "needed"
* only for the return value that is the same as the source input,
* which the compiler could/should do much better anyway.
*/ */
SYM_FUNC_START(__memset) SYM_FUNC_START(__memset)
/* ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
* to use it when possible. If not available, use fast string instructions.
*
* Otherwise, use original memset function.
*/
ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS
movq %rdi,%r9 movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx movq %rdx,%rcx
andl $7,%edx
shrq $3,%rcx
/* expand byte value */
movzbl %sil,%esi
movabs $0x0101010101010101,%rax
imulq %rsi,%rax
rep stosq
movl %edx,%ecx
rep stosb rep stosb
movq %r9,%rax movq %r9,%rax
RET RET
...@@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset) ...@@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset)
SYM_FUNC_ALIAS(memset, __memset) SYM_FUNC_ALIAS(memset, __memset)
EXPORT_SYMBOL(memset) EXPORT_SYMBOL(memset)
/*
* ISO C memset - set a memory block to a byte value. This function uses
* enhanced rep stosb to override the fast string function.
* The code is simpler and shorter than the fast string function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
SYM_FUNC_START_LOCAL(memset_erms)
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
RET
SYM_FUNC_END(memset_erms)
SYM_FUNC_START_LOCAL(memset_orig) SYM_FUNC_START_LOCAL(memset_orig)
movq %rdi,%r10 movq %rdi,%r10
......
...@@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); ...@@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
{ {
unsigned long flushed, dest = (unsigned long) dst; unsigned long flushed, dest = (unsigned long) dst;
long rc = __copy_user_nocache(dst, src, size, 0); long rc;
stac();
rc = __copy_user_nocache(dst, src, size);
clac();
/* /*
* __copy_user_nocache() uses non-temporal stores for the bulk * __copy_user_nocache() uses non-temporal stores for the bulk
......
...@@ -97,7 +97,7 @@ static void cacheless_memcpy(void *dst, void *src, size_t n) ...@@ -97,7 +97,7 @@ static void cacheless_memcpy(void *dst, void *src, size_t n)
* there are no security issues. The extra fault recovery machinery * there are no security issues. The extra fault recovery machinery
* is not invoked. * is not invoked.
*/ */
__copy_user_nocache(dst, (void __user *)src, n, 0); __copy_user_nocache(dst, (void __user *)src, n);
} }
void rvt_wss_exit(struct rvt_dev_info *rdi) void rvt_wss_exit(struct rvt_dev_info *rdi)
......
...@@ -1284,9 +1284,9 @@ static const char *uaccess_safe_builtin[] = { ...@@ -1284,9 +1284,9 @@ static const char *uaccess_safe_builtin[] = {
"copy_mc_fragile_handle_tail", "copy_mc_fragile_handle_tail",
"copy_mc_enhanced_fast_string", "copy_mc_enhanced_fast_string",
"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
"clear_user_erms", "rep_stos_alternative",
"clear_user_rep_good", "rep_movs_alternative",
"clear_user_original", "__copy_user_nocache",
NULL NULL
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment