tools headers: Update the copy of x86's mem{cpy,set}_64.S used in 'perf bench'

This is to get the changes from:

  68674f94 ("x86: don't use REP_GOOD or ERMS for small memory copies")
  20f3337d ("x86: don't use REP_GOOD or ERMS for small memory clearing")

This also makes the 'perf bench mem' files stop referring to the erms
versions, which went away with the above patches.

That addresses these perf tools build warnings:

  Warning: Kernel ABI header at 'tools/arch/x86/lib/memcpy_64.S' differs from latest version at 'arch/x86/lib/memcpy_64.S'
  diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S
  Warning: Kernel ABI header at 'tools/arch/x86/lib/memset_64.S' differs from latest version at 'arch/x86/lib/memset_64.S'
  diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S
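After this sync the copies match again; a quick way to double-check is to
rerun the same diff invocations the warning prints, which should now
produce no output:

  $ diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S
  $ diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S
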
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
--- a/tools/arch/x86/lib/memcpy_64.S
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -9,13 +9,6 @@
 
 .section .noinstr.text, "ax"
 
-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
 /*
  * memcpy - Copy a memory block.
  *
@@ -26,17 +19,21 @@
  *
  * Output:
  * rax original destination
+ *
+ * The FSRM alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep movsb' itself is small enough to replace the call, but the
+ * two register moves blow up the code. And one of them is "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
  */
 SYM_TYPED_FUNC_START(__memcpy)
-	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memcpy_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
 
 	movq %rdi, %rax
 	movq %rdx, %rcx
-	shrq $3, %rcx
-	andl $7, %edx
-	rep movsq
-	movl %edx, %ecx
 	rep movsb
 	RET
 SYM_FUNC_END(__memcpy)
@@ -45,17 +42,6 @@ EXPORT_SYMBOL(__memcpy)
 SYM_FUNC_ALIAS(memcpy, __memcpy)
 EXPORT_SYMBOL(memcpy)
 
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-SYM_FUNC_START_LOCAL(memcpy_erms)
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	rep movsb
-	RET
-SYM_FUNC_END(memcpy_erms)
-
 SYM_FUNC_START_LOCAL(memcpy_orig)
 	movq %rdi, %rax
 
--- a/tools/arch/x86/lib/memset_64.S
+++ b/tools/arch/x86/lib/memset_64.S
@@ -18,27 +18,22 @@
  * rdx count (bytes)
  *
  * rax original destination
+ *
+ * The FSRS alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep stosb' itself is small enough to replace the call, but all
+ * the register moves blow up the code. And two of them are "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
  */
 SYM_FUNC_START(__memset)
-	/*
-	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
-	 * to use it when possible. If not available, use fast string instructions.
-	 *
-	 * Otherwise, use original memset function.
-	 */
-	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memset_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
 
 	movq %rdi,%r9
+	movb %sil,%al
 	movq %rdx,%rcx
-	andl $7,%edx
-	shrq $3,%rcx
-	/* expand byte value  */
-	movzbl %sil,%esi
-	movabs $0x0101010101010101,%rax
-	imulq %rsi,%rax
-	rep stosq
-	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
 	RET
@@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset)
 SYM_FUNC_ALIAS(memset, __memset)
 EXPORT_SYMBOL(memset)
 
-/*
- * ISO C memset - set a memory block to a byte value. This function uses
- * enhanced rep stosb to override the fast string function.
- * The code is simpler and shorter than the fast string function as well.
- *
- * rdi destination
- * rsi value (char)
- * rdx count (bytes)
- *
- * rax original destination
- */
-SYM_FUNC_START_LOCAL(memset_erms)
-	movq %rdi,%r9
-	movb %sil,%al
-	movq %rdx,%rcx
-	rep stosb
-	movq %r9,%rax
-	RET
-SYM_FUNC_END(memset_erms)
-
 SYM_FUNC_START_LOCAL(memset_orig)
 	movq %rdi,%r10
 
--- a/tools/include/asm/alternative.h
+++ b/tools/include/asm/alternative.h
@@ -4,7 +4,6 @@
 
 /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
 
-#define altinstruction_entry #
-#define ALTERNATIVE_2 #
+#define ALTERNATIVE #
 
 #endif
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -7,7 +7,3 @@ MEMCPY_FN(memcpy_orig,
 MEMCPY_FN(__memcpy,
 	"x86-64-movsq",
 	"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
-
-MEMCPY_FN(memcpy_erms,
-	"x86-64-movsb",
-	"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -2,7 +2,7 @@
 
 /* Various wrappers to make the kernel .S file build in user-space: */
 
-// memcpy_orig and memcpy_erms are being defined as SYM_L_LOCAL but we need it
+// memcpy_orig is being defined as SYM_L_LOCAL but we need it
 #define SYM_FUNC_START_LOCAL(name) \
 	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -7,7 +7,3 @@ MEMSET_FN(memset_orig,
 MEMSET_FN(__memset,
 	"x86-64-stosq",
 	"movsq-based memset() in arch/x86/lib/memset_64.S")
-
-MEMSET_FN(memset_erms,
-	"x86-64-stosb",
-	"movsb-based memset() in arch/x86/lib/memset_64.S")
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-// memset_orig and memset_erms are being defined as SYM_L_LOCAL but we need it
+// memset_orig is being defined as SYM_L_LOCAL but we need it
 #define SYM_FUNC_START_LOCAL(name) \
 	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
 #define memset MEMSET /* don't hide glibc's memset() */
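With the memcpy_erms/memset_erms entries dropped from the *-def.h tables
above, 'perf bench mem' no longer offers the "x86-64-movsb"/"x86-64-stosb"
variants; the remaining kernel-derived variants can still be exercised
explicitly, along these lines (assuming the usual -f/--function and
-s/--size options of 'perf bench mem'):

  $ perf bench mem memcpy -f x86-64-movsq -s 1MB
  $ perf bench mem memset -f x86-64-stosq -s 1MB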