Commit f8e92fb4 authored by Ingo Molnar

Merge tag 'alternatives_padding' of...

Merge tag 'alternatives_padding' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp into x86/asm

Pull alternative instructions framework improvements from Borislav Petkov:

 "A more involved rework of the alternatives framework to be able to
  pad instructions and thus make using the alternatives macros more
  straightforward and without having to figure out old and new instruction
  sizes but have the toolchain figure that out for us.

  Furthermore, it optimizes JMPs used so that fetch and decode can be
  relieved with smaller versions of the JMPs, where possible.

  Some stats:

    x86_64 defconfig:

    Alternatives sites total:               2478
    Total padding added (in Bytes):         6051

  The padding is currently done for:

    X86_FEATURE_ALWAYS
    X86_FEATURE_ERMS
    X86_FEATURE_LFENCE_RDTSC
    X86_FEATURE_MFENCE_RDTSC
    X86_FEATURE_SMAP

  This is with the latest version of the patchset. Of course, on each
  machine the alternatives sites actually being patched are a proper
  subset of the total number."
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parents d2c032e3 dfecb95c
...@@ -18,12 +18,53 @@ ...@@ -18,12 +18,53 @@
.endm .endm
#endif #endif
.macro altinstruction_entry orig alt feature orig_len alt_len .macro altinstruction_entry orig alt feature orig_len alt_len pad_len
.long \orig - . .long \orig - .
.long \alt - . .long \alt - .
.word \feature .word \feature
.byte \orig_len .byte \orig_len
.byte \alt_len .byte \alt_len
.byte \pad_len
.endm
.macro ALTERNATIVE oldinstr, newinstr, feature
140:
\oldinstr
141:
.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
142:
.pushsection .altinstructions,"a"
altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
.popsection
.pushsection .altinstr_replacement,"ax"
143:
\newinstr
144:
.popsection
.endm
.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
140:
\oldinstr
141:
.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
.skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
142:
.pushsection .altinstructions,"a"
altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
.popsection
.pushsection .altinstr_replacement,"ax"
143:
\newinstr1
144:
\newinstr2
145:
.popsection
.endm .endm
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
......
...@@ -48,8 +48,9 @@ struct alt_instr { ...@@ -48,8 +48,9 @@ struct alt_instr {
s32 repl_offset; /* offset to replacement instruction */ s32 repl_offset; /* offset to replacement instruction */
u16 cpuid; /* cpuid bit set for replacement */ u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */ u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction, <= instrlen */ u8 replacementlen; /* length of new instruction */
}; u8 padlen; /* length of build-time padding */
} __packed;
extern void alternative_instructions(void); extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
...@@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end) ...@@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end)
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" #define b_replacement(num) "664"#num
#define e_replacement(num) "665"#num
#define b_replacement(number) "663"#number #define alt_end_marker "663"
#define e_replacement(number) "664"#number #define alt_slen "662b-661b"
#define alt_pad_len alt_end_marker"b-662b"
#define alt_total_slen alt_end_marker"b-661b"
#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
#define alt_slen "662b-661b" #define __OLDINSTR(oldinstr, num) \
#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" "661:\n\t" oldinstr "\n662:\n" \
".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
"((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
#define ALTINSTR_ENTRY(feature, number) \ #define OLDINSTR(oldinstr, num) \
__OLDINSTR(oldinstr, num) \
alt_end_marker ":\n"
/*
* Pad the second replacement alternative with additional NOPs if it is
* additionally longer than the first replacement alternative.
*/
#define OLDINSTR_2(oldinstr, num1, num2) \
__OLDINSTR(oldinstr, num1) \
".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
"((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \
alt_end_marker ":\n"
#define ALTINSTR_ENTRY(feature, num) \
" .long 661b - .\n" /* label */ \ " .long 661b - .\n" /* label */ \
" .long " b_replacement(number)"f - .\n" /* new instruction */ \ " .long " b_replacement(num)"f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \ " .word " __stringify(feature) "\n" /* feature bit */ \
" .byte " alt_slen "\n" /* source len */ \ " .byte " alt_total_slen "\n" /* source len */ \
" .byte " alt_rlen(number) "\n" /* replacement len */ " .byte " alt_rlen(num) "\n" /* replacement len */ \
" .byte " alt_pad_len "\n" /* pad len */
#define DISCARD_ENTRY(number) /* rlen <= slen */ \ #define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
" .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \
b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
/* alternative assembly primitive: */ /* alternative assembly primitive: */
#define ALTERNATIVE(oldinstr, newinstr, feature) \ #define ALTERNATIVE(oldinstr, newinstr, feature) \
OLDINSTR(oldinstr) \ OLDINSTR(oldinstr, 1) \
".pushsection .altinstructions,\"a\"\n" \ ".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(feature, 1) \ ALTINSTR_ENTRY(feature, 1) \
".popsection\n" \ ".popsection\n" \
".pushsection .discard,\"aw\",@progbits\n" \
DISCARD_ENTRY(1) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
".popsection" ".popsection"
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
OLDINSTR(oldinstr) \ OLDINSTR_2(oldinstr, 1, 2) \
".pushsection .altinstructions,\"a\"\n" \ ".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature1, 1) \
ALTINSTR_ENTRY(feature2, 2) \ ALTINSTR_ENTRY(feature2, 2) \
".popsection\n" \ ".popsection\n" \
".pushsection .discard,\"aw\",@progbits\n" \
DISCARD_ENTRY(1) \
DISCARD_ENTRY(2) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
...@@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end) ...@@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative(oldinstr, newinstr, feature) \ #define alternative(oldinstr, newinstr, feature) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
/* /*
* Alternative inline assembly with input. * Alternative inline assembly with input.
* *
......
...@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) ...@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{ {
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr))); ASM_OUTPUT2("0" (v), "m" (*addr)));
} }
......
...@@ -95,13 +95,11 @@ do { \ ...@@ -95,13 +95,11 @@ do { \
* Stop RDTSC speculation. This is needed when you need to use RDTSC * Stop RDTSC speculation. This is needed when you need to use RDTSC
* (or get_cycles or vread that possibly accesses the TSC) in a defined * (or get_cycles or vread that possibly accesses the TSC) in a defined
* code region. * code region.
*
* (Could use an alternative three way for this if there was one.)
*/ */
static __always_inline void rdtsc_barrier(void) static __always_inline void rdtsc_barrier(void)
{ {
alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); "lfence", X86_FEATURE_LFENCE_RDTSC);
} }
#endif /* _ASM_X86_BARRIER_H */ #endif /* _ASM_X86_BARRIER_H */
...@@ -419,6 +419,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ...@@ -419,6 +419,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P0\n" /* 1: do replace */ " .word %P0\n" /* 1: do replace */
" .byte 2b - 1b\n" /* source len */ " .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */ " .byte 0\n" /* replacement len */
" .byte 0\n" /* pad len */
".previous\n" ".previous\n"
/* skipping size check since replacement size = 0 */ /* skipping size check since replacement size = 0 */
: : "i" (X86_FEATURE_ALWAYS) : : t_warn); : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
...@@ -433,6 +434,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ...@@ -433,6 +434,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P0\n" /* feature bit */ " .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */ " .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */ " .byte 0\n" /* replacement len */
" .byte 0\n" /* pad len */
".previous\n" ".previous\n"
/* skipping size check since replacement size = 0 */ /* skipping size check since replacement size = 0 */
: : "i" (bit) : : t_no); : : "i" (bit) : : t_no);
...@@ -458,6 +460,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ...@@ -458,6 +460,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P1\n" /* feature bit */ " .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */ " .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */ " .byte 4f - 3f\n" /* replacement len */
" .byte 0\n" /* pad len */
".previous\n" ".previous\n"
".section .discard,\"aw\",@progbits\n" ".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
...@@ -484,31 +487,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ...@@ -484,31 +487,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
static __always_inline __pure bool _static_cpu_has_safe(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
{ {
#ifdef CC_HAVE_ASM_GOTO #ifdef CC_HAVE_ASM_GOTO
/* asm_volatile_goto("1: jmp %l[t_dynamic]\n"
* We need to spell the jumps to the compiler because, depending on the offset,
* the replacement jump can be bigger than the original jump, and this we cannot
* have. Thus, we force the jump to the widest, 4-byte, signed relative
* offset even though the last would often fit in less bytes.
*/
asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
"2:\n" "2:\n"
".skip -(((5f-4f) - (2b-1b)) > 0) * "
"((5f-4f) - (2b-1b)),0x90\n"
"3:\n"
".section .altinstructions,\"a\"\n" ".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */ " .long 1b - .\n" /* src offset */
" .long 3f - .\n" /* repl offset */ " .long 4f - .\n" /* repl offset */
" .word %P1\n" /* always replace */ " .word %P1\n" /* always replace */
" .byte 2b - 1b\n" /* src len */ " .byte 3b - 1b\n" /* src len */
" .byte 4f - 3f\n" /* repl len */ " .byte 5f - 4f\n" /* repl len */
" .byte 3b - 2b\n" /* pad len */
".previous\n" ".previous\n"
".section .altinstr_replacement,\"ax\"\n" ".section .altinstr_replacement,\"ax\"\n"
"3: .byte 0xe9\n .long %l[t_no] - 2b\n" "4: jmp %l[t_no]\n"
"4:\n" "5:\n"
".previous\n" ".previous\n"
".section .altinstructions,\"a\"\n" ".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */ " .long 1b - .\n" /* src offset */
" .long 0\n" /* no replacement */ " .long 0\n" /* no replacement */
" .word %P0\n" /* feature bit */ " .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* src len */ " .byte 3b - 1b\n" /* src len */
" .byte 0\n" /* repl len */ " .byte 0\n" /* repl len */
" .byte 0\n" /* pad len */
".previous\n" ".previous\n"
: : "i" (bit), "i" (X86_FEATURE_ALWAYS) : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
: : t_dynamic, t_no); : : t_dynamic, t_no);
...@@ -528,6 +530,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) ...@@ -528,6 +530,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
" .word %P2\n" /* always replace */ " .word %P2\n" /* always replace */
" .byte 2b - 1b\n" /* source len */ " .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */ " .byte 4f - 3f\n" /* replacement len */
" .byte 0\n" /* pad len */
".previous\n" ".previous\n"
".section .discard,\"aw\",@progbits\n" ".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
...@@ -542,6 +545,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) ...@@ -542,6 +545,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
" .word %P1\n" /* feature bit */ " .word %P1\n" /* feature bit */
" .byte 4b - 3b\n" /* src len */ " .byte 4b - 3b\n" /* src len */
" .byte 6f - 5f\n" /* repl len */ " .byte 6f - 5f\n" /* repl len */
" .byte 0\n" /* pad len */
".previous\n" ".previous\n"
".section .discard,\"aw\",@progbits\n" ".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
......
...@@ -761,10 +761,10 @@ extern char ignore_fpu_irq; ...@@ -761,10 +761,10 @@ extern char ignore_fpu_irq;
#define ARCH_HAS_SPINLOCK_PREFETCH #define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
# define BASE_PREFETCH ASM_NOP4 # define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH # define ARCH_HAS_PREFETCH
#else #else
# define BASE_PREFETCH "prefetcht0 (%1)" # define BASE_PREFETCH "prefetcht0 %P1"
#endif #endif
/* /*
...@@ -775,10 +775,9 @@ extern char ignore_fpu_irq; ...@@ -775,10 +775,9 @@ extern char ignore_fpu_irq;
*/ */
static inline void prefetch(const void *x) static inline void prefetch(const void *x)
{ {
alternative_input(BASE_PREFETCH, alternative_input(BASE_PREFETCH, "prefetchnta %P1",
"prefetchnta (%1)",
X86_FEATURE_XMM, X86_FEATURE_XMM,
"r" (x)); "m" (*(const char *)x));
} }
/* /*
...@@ -788,10 +787,9 @@ static inline void prefetch(const void *x) ...@@ -788,10 +787,9 @@ static inline void prefetch(const void *x)
*/ */
static inline void prefetchw(const void *x) static inline void prefetchw(const void *x)
{ {
alternative_input(BASE_PREFETCH, alternative_input(BASE_PREFETCH, "prefetchw %P1",
"prefetchw (%1)", X86_FEATURE_3DNOWPREFETCH,
X86_FEATURE_3DNOW, "m" (*(const char *)x));
"r" (x));
} }
static inline void spin_lock_prefetch(const void *x) static inline void spin_lock_prefetch(const void *x)
......
...@@ -27,23 +27,11 @@ ...@@ -27,23 +27,11 @@
#ifdef CONFIG_X86_SMAP #ifdef CONFIG_X86_SMAP
#define ASM_CLAC \ #define ASM_CLAC \
661: ASM_NOP3 ; \ ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
.pushsection .altinstr_replacement, "ax" ; \
662: __ASM_CLAC ; \ #define ASM_STAC \
.popsection ; \ ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
.pushsection .altinstructions, "a" ; \
altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
.popsection
#define ASM_STAC \
661: ASM_NOP3 ; \
.pushsection .altinstr_replacement, "ax" ; \
662: __ASM_STAC ; \
.popsection ; \
.pushsection .altinstructions, "a" ; \
altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
.popsection
#else /* CONFIG_X86_SMAP */ #else /* CONFIG_X86_SMAP */
...@@ -61,20 +49,20 @@ ...@@ -61,20 +49,20 @@
static __always_inline void clac(void) static __always_inline void clac(void)
{ {
/* Note: a barrier is implicit in alternative() */ /* Note: a barrier is implicit in alternative() */
alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
} }
static __always_inline void stac(void) static __always_inline void stac(void)
{ {
/* Note: a barrier is implicit in alternative() */ /* Note: a barrier is implicit in alternative() */
alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
} }
/* These macros can be used in asm() statements */ /* These macros can be used in asm() statements */
#define ASM_CLAC \ #define ASM_CLAC \
ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
#define ASM_STAC \ #define ASM_STAC \
ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
#else /* CONFIG_X86_SMAP */ #else /* CONFIG_X86_SMAP */
......
...@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str) ...@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
__setup("noreplace-paravirt", setup_noreplace_paravirt); __setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif #endif
#define DPRINTK(fmt, ...) \ #define DPRINTK(fmt, args...) \
do { \ do { \
if (debug_alternative) \ if (debug_alternative) \
printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
} while (0)
#define DUMP_BYTES(buf, len, fmt, args...) \
do { \
if (unlikely(debug_alternative)) { \
int j; \
\
if (!(len)) \
break; \
\
printk(KERN_DEBUG fmt, ##args); \
for (j = 0; j < (len) - 1; j++) \
printk(KERN_CONT "%02hhx ", buf[j]); \
printk(KERN_CONT "%02hhx\n", buf[j]); \
} \
} while (0) } while (0)
/* /*
...@@ -243,12 +258,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; ...@@ -243,12 +258,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[]; extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len); void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type. /*
This runs before SMP is initialized to avoid SMP problems with * Are we looking at a near JMP with a 1 or 4-byte displacement.
self modifying code. This implies that asymmetric systems where */
APs have less capabilities than the boot processor are not handled. static inline bool is_jmp(const u8 opcode)
Tough. Make sure you disable such features by hand. */ {
return opcode == 0xeb || opcode == 0xe9;
}
static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
{
u8 *next_rip, *tgt_rip;
s32 n_dspl, o_dspl;
int repl_len;
if (a->replacementlen != 5)
return;
o_dspl = *(s32 *)(insnbuf + 1);
/* next_rip of the replacement JMP */
next_rip = repl_insn + a->replacementlen;
/* target rip of the replacement JMP */
tgt_rip = next_rip + o_dspl;
n_dspl = tgt_rip - orig_insn;
DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
if (tgt_rip - orig_insn >= 0) {
if (n_dspl - 2 <= 127)
goto two_byte_jmp;
else
goto five_byte_jmp;
/* negative offset */
} else {
if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
goto two_byte_jmp;
else
goto five_byte_jmp;
}
two_byte_jmp:
n_dspl -= 2;
insnbuf[0] = 0xeb;
insnbuf[1] = (s8)n_dspl;
add_nops(insnbuf + 2, 3);
repl_len = 2;
goto done;
five_byte_jmp:
n_dspl -= 5;
insnbuf[0] = 0xe9;
*(s32 *)&insnbuf[1] = n_dspl;
repl_len = 5;
done:
DPRINTK("final displ: 0x%08x, JMP 0x%lx",
n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}
static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
{
add_nops(instr + (a->instrlen - a->padlen), a->padlen);
DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
instr, a->instrlen - a->padlen, a->padlen);
}
/*
* Replace instructions with better alternatives for this CPU type. This runs
* before SMP is initialized to avoid SMP problems with self modifying code.
* This implies that asymmetric systems where APs have less capabilities than
* the boot processor are not handled. Tough. Make sure you disable such
* features by hand.
*/
void __init_or_module apply_alternatives(struct alt_instr *start, void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end) struct alt_instr *end)
{ {
...@@ -256,10 +345,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, ...@@ -256,10 +345,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 *instr, *replacement; u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN]; u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); DPRINTK("alt table %p -> %p", start, end);
/* /*
* The scan order should be from start to end. A later scanned * The scan order should be from start to end. A later scanned
* alternative code can overwrite a previous scanned alternative code. * alternative code can overwrite previously scanned alternative code.
* Some kernel functions (e.g. memcpy, memset, etc) use this order to * Some kernel functions (e.g. memcpy, memset, etc) use this order to
* patch code. * patch code.
* *
...@@ -267,29 +356,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start, ...@@ -267,29 +356,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
* order. * order.
*/ */
for (a = start; a < end; a++) { for (a = start; a < end; a++) {
int insnbuf_sz = 0;
instr = (u8 *)&a->instr_offset + a->instr_offset; instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
if (!boot_cpu_has(a->cpuid)) if (!boot_cpu_has(a->cpuid)) {
if (a->padlen > 1)
optimize_nops(a, instr);
continue; continue;
}
DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
a->cpuid >> 5,
a->cpuid & 0x1f,
instr, a->instrlen,
replacement, a->replacementlen);
DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
memcpy(insnbuf, replacement, a->replacementlen); memcpy(insnbuf, replacement, a->replacementlen);
insnbuf_sz = a->replacementlen;
/* 0xe8 is a relative jump; fix the offset. */ /* 0xe8 is a relative jump; fix the offset. */
if (*insnbuf == 0xe8 && a->replacementlen == 5) if (*insnbuf == 0xe8 && a->replacementlen == 5) {
*(s32 *)(insnbuf + 1) += replacement - instr; *(s32 *)(insnbuf + 1) += replacement - instr;
DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
*(s32 *)(insnbuf + 1),
(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
}
if (a->replacementlen && is_jmp(replacement[0]))
recompute_jump(a, instr, replacement, insnbuf);
add_nops(insnbuf + a->replacementlen, if (a->instrlen > a->replacementlen) {
a->instrlen - a->replacementlen); add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);
insnbuf_sz += a->instrlen - a->replacementlen;
}
DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
text_poke_early(instr, insnbuf, a->instrlen); text_poke_early(instr, insnbuf, insnbuf_sz);
} }
} }
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end, static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end) u8 *text, u8 *text_end)
{ {
...@@ -371,8 +485,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, ...@@ -371,8 +485,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp->locks_end = locks_end; smp->locks_end = locks_end;
smp->text = text; smp->text = text;
smp->text_end = text_end; smp->text_end = text_end;
DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
__func__, smp->locks, smp->locks_end, smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name); smp->text, smp->text_end, smp->name);
list_add_tail(&smp->next, &smp_alt_modules); list_add_tail(&smp->next, &smp_alt_modules);
...@@ -440,7 +554,7 @@ int alternatives_text_reserved(void *start, void *end) ...@@ -440,7 +554,7 @@ int alternatives_text_reserved(void *start, void *end)
return 0; return 0;
} }
#endif #endif /* CONFIG_SMP */
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start, void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
......
...@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c) ...@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
/* 3DNow or LM implies PREFETCHW */
if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
} }
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
......
...@@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error) ...@@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error)
pushl_cfi $0 pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG #ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
661: pushl_cfi $do_general_protection ALTERNATIVE "pushl_cfi $do_general_protection", \
662: "pushl $do_simd_coprocessor_error", \
.section .altinstructions,"a" X86_FEATURE_XMM
altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
.previous
.section .altinstr_replacement,"ax"
663: pushl $do_simd_coprocessor_error
664:
.previous
#else #else
pushl_cfi $do_simd_coprocessor_error pushl_cfi $do_simd_coprocessor_error
#endif #endif
......
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h> #include <asm/alternative-asm.h>
/* /*
* Zero a page. * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
* rdi page * recommended to use this when possible and we do use them by default.
*/ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
ENTRY(clear_page_c) * Otherwise, use original.
*/
/*
* Zero a page.
* %rdi - page
*/
ENTRY(clear_page)
CFI_STARTPROC CFI_STARTPROC
ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
"jmp clear_page_c_e", X86_FEATURE_ERMS
movl $4096/8,%ecx movl $4096/8,%ecx
xorl %eax,%eax xorl %eax,%eax
rep stosq rep stosq
ret ret
CFI_ENDPROC CFI_ENDPROC
ENDPROC(clear_page_c) ENDPROC(clear_page)
ENTRY(clear_page_c_e) ENTRY(clear_page_orig)
CFI_STARTPROC CFI_STARTPROC
movl $4096,%ecx
xorl %eax,%eax
rep stosb
ret
CFI_ENDPROC
ENDPROC(clear_page_c_e)
ENTRY(clear_page)
CFI_STARTPROC
xorl %eax,%eax xorl %eax,%eax
movl $4096/64,%ecx movl $4096/64,%ecx
.p2align 4 .p2align 4
...@@ -45,29 +49,13 @@ ENTRY(clear_page) ...@@ -45,29 +49,13 @@ ENTRY(clear_page)
nop nop
ret ret
CFI_ENDPROC CFI_ENDPROC
.Lclear_page_end: ENDPROC(clear_page_orig)
ENDPROC(clear_page)
/*
* Some CPUs support enhanced REP MOVSB/STOSB instructions.
* It is recommended to use this when possible.
* If enhanced REP MOVSB/STOSB is not available, try to use fast string.
* Otherwise, use original function.
*
*/
#include <asm/cpufeature.h> ENTRY(clear_page_c_e)
CFI_STARTPROC
.section .altinstr_replacement,"ax" movl $4096,%ecx
1: .byte 0xeb /* jmp <disp8> */ xorl %eax,%eax
.byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ rep stosb
2: .byte 0xeb /* jmp <disp8> */ ret
.byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ CFI_ENDPROC
3: ENDPROC(clear_page_c_e)
.previous
.section .altinstructions,"a"
altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
.Lclear_page_end-clear_page, 2b-1b
altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
.Lclear_page_end-clear_page,3b-2b
.previous
...@@ -2,23 +2,26 @@ ...@@ -2,23 +2,26 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h> #include <asm/alternative-asm.h>
/*
* Some CPUs run faster using the string copy instructions (sane microcode).
* It is also a lot simpler. Use this when possible. But, don't use streaming
* copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
* prefetch distance based on SMP/UP.
*/
ALIGN ALIGN
copy_page_rep: ENTRY(copy_page)
CFI_STARTPROC CFI_STARTPROC
ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx movl $4096/8, %ecx
rep movsq rep movsq
ret ret
CFI_ENDPROC CFI_ENDPROC
ENDPROC(copy_page_rep) ENDPROC(copy_page)
/*
* Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
* Could vary the prefetch distance based on SMP/UP.
*/
ENTRY(copy_page) ENTRY(copy_page_regs)
CFI_STARTPROC CFI_STARTPROC
subq $2*8, %rsp subq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET 2*8 CFI_ADJUST_CFA_OFFSET 2*8
...@@ -90,21 +93,5 @@ ENTRY(copy_page) ...@@ -90,21 +93,5 @@ ENTRY(copy_page)
addq $2*8, %rsp addq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET -2*8 CFI_ADJUST_CFA_OFFSET -2*8
ret ret
.Lcopy_page_end:
CFI_ENDPROC CFI_ENDPROC
ENDPROC(copy_page) ENDPROC(copy_page_regs)
/* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
2:
.previous
.section .altinstructions,"a"
altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
.Lcopy_page_end-copy_page, 2b-1b
.previous
...@@ -8,9 +8,6 @@ ...@@ -8,9 +8,6 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#define FIX_ALIGNMENT 1
#include <asm/current.h> #include <asm/current.h>
#include <asm/asm-offsets.h> #include <asm/asm-offsets.h>
#include <asm/thread_info.h> #include <asm/thread_info.h>
...@@ -19,33 +16,7 @@ ...@@ -19,33 +16,7 @@
#include <asm/asm.h> #include <asm/asm.h>
#include <asm/smap.h> #include <asm/smap.h>
/*
* By placing feature2 after feature1 in altinstructions section, we logically
* implement:
* If CPU has feature2, jmp to alt2 is used
* else if CPU has feature1, jmp to alt1 is used
* else jmp to orig is used.
*/
.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
.byte 0xe9 /* 32bit jump */
.long \orig-1f /* by default jump to orig */
1:
.section .altinstr_replacement,"ax"
2: .byte 0xe9 /* near jump with 32bit immediate */
.long \alt1-1b /* offset */ /* or alternatively to alt1 */
3: .byte 0xe9 /* near jump with 32bit immediate */
.long \alt2-1b /* offset */ /* or alternatively to alt2 */
.previous
.section .altinstructions,"a"
altinstruction_entry 0b,2b,\feature1,5,5
altinstruction_entry 0b,3b,\feature2,5,5
.previous
.endm
.macro ALIGN_DESTINATION .macro ALIGN_DESTINATION
#ifdef FIX_ALIGNMENT
/* check for bad alignment of destination */ /* check for bad alignment of destination */
movl %edi,%ecx movl %edi,%ecx
andl $7,%ecx andl $7,%ecx
...@@ -67,7 +38,6 @@ ...@@ -67,7 +38,6 @@
_ASM_EXTABLE(100b,103b) _ASM_EXTABLE(100b,103b)
_ASM_EXTABLE(101b,103b) _ASM_EXTABLE(101b,103b)
#endif
.endm .endm
/* Standard copy_to_user with segment limit checking */ /* Standard copy_to_user with segment limit checking */
...@@ -79,9 +49,11 @@ ENTRY(_copy_to_user) ...@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
jc bad_to_user jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx cmpq TI_addr_limit(%rax),%rcx
ja bad_to_user ja bad_to_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
copy_user_generic_unrolled,copy_user_generic_string, \ "jmp copy_user_generic_string", \
copy_user_enhanced_fast_string X86_FEATURE_REP_GOOD, \
"jmp copy_user_enhanced_fast_string", \
X86_FEATURE_ERMS
CFI_ENDPROC CFI_ENDPROC
ENDPROC(_copy_to_user) ENDPROC(_copy_to_user)
...@@ -94,9 +66,11 @@ ENTRY(_copy_from_user) ...@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
jc bad_from_user jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx cmpq TI_addr_limit(%rax),%rcx
ja bad_from_user ja bad_from_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
copy_user_generic_unrolled,copy_user_generic_string, \ "jmp copy_user_generic_string", \
copy_user_enhanced_fast_string X86_FEATURE_REP_GOOD, \
"jmp copy_user_enhanced_fast_string", \
X86_FEATURE_ERMS
CFI_ENDPROC CFI_ENDPROC
ENDPROC(_copy_from_user) ENDPROC(_copy_from_user)
......
/* Copyright 2002 Andi Kleen */ /* Copyright 2002 Andi Kleen */
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#include <asm/alternative-asm.h> #include <asm/alternative-asm.h>
/*
* We build a jump to memcpy_orig by default which gets NOPped out on
* the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
* have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
* to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
*/
.weak memcpy
/* /*
* memcpy - Copy a memory block. * memcpy - Copy a memory block.
* *
...@@ -17,15 +25,11 @@ ...@@ -17,15 +25,11 @@
* Output: * Output:
* rax original destination * rax original destination
*/ */
ENTRY(__memcpy)
ENTRY(memcpy)
ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memcpy_erms", X86_FEATURE_ERMS
/*
* memcpy_c() - fast string ops (REP MOVSQ) based variant.
*
* This gets patched over the unrolled variant (below) via the
* alternative instructions framework:
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
movq %rdi, %rax movq %rdi, %rax
movq %rdx, %rcx movq %rdx, %rcx
shrq $3, %rcx shrq $3, %rcx
...@@ -34,29 +38,21 @@ ...@@ -34,29 +38,21 @@
movl %edx, %ecx movl %edx, %ecx
rep movsb rep movsb
ret ret
.Lmemcpy_e: ENDPROC(memcpy)
.previous ENDPROC(__memcpy)
/* /*
* memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than * memcpy_erms() - enhanced fast string memcpy. This is faster and
* memcpy_c. Use memcpy_c_e when possible. * simpler than memcpy. Use memcpy_erms when possible.
*
* This gets patched over the unrolled variant (below) via the
* alternative instructions framework:
*/ */
.section .altinstr_replacement, "ax", @progbits ENTRY(memcpy_erms)
.Lmemcpy_c_e:
movq %rdi, %rax movq %rdi, %rax
movq %rdx, %rcx movq %rdx, %rcx
rep movsb rep movsb
ret ret
.Lmemcpy_e_e: ENDPROC(memcpy_erms)
.previous
.weak memcpy
ENTRY(__memcpy) ENTRY(memcpy_orig)
ENTRY(memcpy)
CFI_STARTPROC CFI_STARTPROC
movq %rdi, %rax movq %rdi, %rax
...@@ -183,26 +179,4 @@ ENTRY(memcpy) ...@@ -183,26 +179,4 @@ ENTRY(memcpy)
.Lend: .Lend:
retq retq
CFI_ENDPROC CFI_ENDPROC
ENDPROC(memcpy) ENDPROC(memcpy_orig)
ENDPROC(__memcpy)
/*
* Some CPUs are adding enhanced REP MOVSB/STOSB feature
* If the feature is supported, memcpy_c_e() is the first choice.
* If enhanced rep movsb copy is not available, use fast string copy
* memcpy_c() when possible. This is faster and code is simpler than
* original memcpy().
* Otherwise, original memcpy() is used.
* In .altinstructions section, ERMS feature is placed after REG_GOOD
* feature to implement the right patch order.
*
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
.section .altinstructions, "a"
altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
.Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
.Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
...@@ -5,7 +5,6 @@ ...@@ -5,7 +5,6 @@
* This assembly file is re-written from memmove_64.c file. * This assembly file is re-written from memmove_64.c file.
* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
*/ */
#define _STRING_C
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
...@@ -44,6 +43,8 @@ ENTRY(__memmove) ...@@ -44,6 +43,8 @@ ENTRY(__memmove)
jg 2f jg 2f
.Lmemmove_begin_forward: .Lmemmove_begin_forward:
ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
/* /*
* movsq instruction have many startup latency * movsq instruction have many startup latency
* so we handle small size by general register. * so we handle small size by general register.
...@@ -207,21 +208,5 @@ ENTRY(__memmove) ...@@ -207,21 +208,5 @@ ENTRY(__memmove)
13: 13:
retq retq
CFI_ENDPROC CFI_ENDPROC
.section .altinstr_replacement,"ax"
.Lmemmove_begin_forward_efs:
/* Forward moving data. */
movq %rdx, %rcx
rep movsb
retq
.Lmemmove_end_forward_efs:
.previous
.section .altinstructions,"a"
altinstruction_entry .Lmemmove_begin_forward, \
.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
.Lmemmove_end_forward-.Lmemmove_begin_forward, \
.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
.previous
ENDPROC(__memmove) ENDPROC(__memmove)
ENDPROC(memmove) ENDPROC(memmove)
...@@ -5,19 +5,30 @@ ...@@ -5,19 +5,30 @@
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/alternative-asm.h> #include <asm/alternative-asm.h>
.weak memset
/* /*
* ISO C memset - set a memory block to a byte value. This function uses fast * ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is * string to get better performance than the original function. The code is
* simpler and shorter than the orignal function as well. * simpler and shorter than the orignal function as well.
* *
* rdi destination * rdi destination
* rsi value (char) * rsi value (char)
* rdx count (bytes) * rdx count (bytes)
* *
* rax original destination * rax original destination
*/ */
.section .altinstr_replacement, "ax", @progbits ENTRY(memset)
.Lmemset_c: ENTRY(__memset)
/*
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
* to use it when possible. If not available, use fast string instructions.
*
* Otherwise, use original memset function.
*/
ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS
movq %rdi,%r9 movq %rdi,%r9
movq %rdx,%rcx movq %rdx,%rcx
andl $7,%edx andl $7,%edx
...@@ -31,8 +42,8 @@ ...@@ -31,8 +42,8 @@
rep stosb rep stosb
movq %r9,%rax movq %r9,%rax
ret ret
.Lmemset_e: ENDPROC(memset)
.previous ENDPROC(__memset)
/* /*
* ISO C memset - set a memory block to a byte value. This function uses * ISO C memset - set a memory block to a byte value. This function uses
...@@ -45,21 +56,16 @@ ...@@ -45,21 +56,16 @@
* *
* rax original destination * rax original destination
*/ */
.section .altinstr_replacement, "ax", @progbits ENTRY(memset_erms)
.Lmemset_c_e:
movq %rdi,%r9 movq %rdi,%r9
movb %sil,%al movb %sil,%al
movq %rdx,%rcx movq %rdx,%rcx
rep stosb rep stosb
movq %r9,%rax movq %r9,%rax
ret ret
.Lmemset_e_e: ENDPROC(memset_erms)
.previous
.weak memset
ENTRY(memset) ENTRY(memset_orig)
ENTRY(__memset)
CFI_STARTPROC CFI_STARTPROC
movq %rdi,%r10 movq %rdi,%r10
...@@ -134,23 +140,4 @@ ENTRY(__memset) ...@@ -134,23 +140,4 @@ ENTRY(__memset)
jmp .Lafter_bad_alignment jmp .Lafter_bad_alignment
.Lfinal: .Lfinal:
CFI_ENDPROC CFI_ENDPROC
ENDPROC(memset) ENDPROC(memset_orig)
ENDPROC(__memset)
/* Some CPUs support enhanced REP MOVSB/STOSB feature.
* It is recommended to use this when possible.
*
* If enhanced REP MOVSB/STOSB feature is not available, use fast string
* instructions.
*
* Otherwise, use original memset function.
*
* In .altinstructions section, ERMS feature is placed after REG_GOOD
* feature to implement the right patch order.
*/
.section .altinstructions,"a"
altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
.Lfinal-__memset,.Lmemset_e-.Lmemset_c
altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
.Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
.previous
...@@ -64,8 +64,8 @@ ...@@ -64,8 +64,8 @@
*/ */
static inline void rdtsc_barrier(void) static inline void rdtsc_barrier(void)
{ {
alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); "lfence", X86_FEATURE_LFENCE_RDTSC);
} }
#endif #endif
MEMCPY_FN(__memcpy, MEMCPY_FN(memcpy_orig,
"x86-64-unrolled", "x86-64-unrolled",
"unrolled memcpy() in arch/x86/lib/memcpy_64.S") "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
MEMCPY_FN(memcpy_c, MEMCPY_FN(__memcpy,
"x86-64-movsq", "x86-64-movsq",
"movsq-based memcpy() in arch/x86/lib/memcpy_64.S") "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
MEMCPY_FN(memcpy_c_e, MEMCPY_FN(memcpy_erms,
"x86-64-movsb", "x86-64-movsb",
"movsb-based memcpy() in arch/x86/lib/memcpy_64.S") "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
#define memcpy MEMCPY /* don't hide glibc's memcpy() */ #define memcpy MEMCPY /* don't hide glibc's memcpy() */
#define altinstr_replacement text #define altinstr_replacement text
#define globl p2align 4; .globl #define globl p2align 4; .globl
#define Lmemcpy_c globl memcpy_c; memcpy_c
#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
#include "../../../arch/x86/lib/memcpy_64.S" #include "../../../arch/x86/lib/memcpy_64.S"
/* /*
* We need to provide note.GNU-stack section, saying that we want * We need to provide note.GNU-stack section, saying that we want
......
...@@ -36,7 +36,7 @@ static const struct option options[] = { ...@@ -36,7 +36,7 @@ static const struct option options[] = {
"Specify length of memory to copy. " "Specify length of memory to copy. "
"Available units: B, KB, MB, GB and TB (upper and lower)"), "Available units: B, KB, MB, GB and TB (upper and lower)"),
OPT_STRING('r', "routine", &routine, "default", OPT_STRING('r', "routine", &routine, "default",
"Specify routine to copy"), "Specify routine to copy, \"all\" runs all available routines"),
OPT_INTEGER('i', "iterations", &iterations, OPT_INTEGER('i', "iterations", &iterations,
"repeat memcpy() invocation this number of times"), "repeat memcpy() invocation this number of times"),
OPT_BOOLEAN('c', "cycle", &use_cycle, OPT_BOOLEAN('c', "cycle", &use_cycle,
...@@ -135,55 +135,16 @@ struct bench_mem_info { ...@@ -135,55 +135,16 @@ struct bench_mem_info {
const char *const *usage; const char *const *usage;
}; };
static int bench_mem_common(int argc, const char **argv, static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
const char *prefix __maybe_unused,
struct bench_mem_info *info)
{ {
int i; const struct routine *r = &info->routines[r_idx];
size_t len;
double totallen;
double result_bps[2]; double result_bps[2];
u64 result_cycle[2]; u64 result_cycle[2];
argc = parse_options(argc, argv, options,
info->usage, 0);
if (no_prefault && only_prefault) {
fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
return 1;
}
if (use_cycle)
init_cycle();
len = (size_t)perf_atoll((char *)length_str);
totallen = (double)len * iterations;
result_cycle[0] = result_cycle[1] = 0ULL; result_cycle[0] = result_cycle[1] = 0ULL;
result_bps[0] = result_bps[1] = 0.0; result_bps[0] = result_bps[1] = 0.0;
if ((s64)len <= 0) { printf("Routine %s (%s)\n", r->name, r->desc);
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}
/* same to without specifying either of prefault and no-prefault */
if (only_prefault && no_prefault)
only_prefault = no_prefault = false;
for (i = 0; info->routines[i].name; i++) {
if (!strcmp(info->routines[i].name, routine))
break;
}
if (!info->routines[i].name) {
printf("Unknown routine:%s\n", routine);
printf("Available routines...\n");
for (i = 0; info->routines[i].name; i++) {
printf("\t%s ... %s\n",
info->routines[i].name, info->routines[i].desc);
}
return 1;
}
if (bench_format == BENCH_FORMAT_DEFAULT) if (bench_format == BENCH_FORMAT_DEFAULT)
printf("# Copying %s Bytes ...\n\n", length_str); printf("# Copying %s Bytes ...\n\n", length_str);
...@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv, ...@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv,
if (!only_prefault && !no_prefault) { if (!only_prefault && !no_prefault) {
/* show both of results */ /* show both of results */
if (use_cycle) { if (use_cycle) {
result_cycle[0] = result_cycle[0] = info->do_cycle(r, len, false);
info->do_cycle(&info->routines[i], len, false); result_cycle[1] = info->do_cycle(r, len, true);
result_cycle[1] =
info->do_cycle(&info->routines[i], len, true);
} else { } else {
result_bps[0] = result_bps[0] = info->do_gettimeofday(r, len, false);
info->do_gettimeofday(&info->routines[i], result_bps[1] = info->do_gettimeofday(r, len, true);
len, false);
result_bps[1] =
info->do_gettimeofday(&info->routines[i],
len, true);
} }
} else { } else {
if (use_cycle) { if (use_cycle)
result_cycle[pf] = result_cycle[pf] = info->do_cycle(r, len, only_prefault);
info->do_cycle(&info->routines[i], else
len, only_prefault); result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
} else {
result_bps[pf] =
info->do_gettimeofday(&info->routines[i],
len, only_prefault);
}
} }
switch (bench_format) { switch (bench_format) {
...@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv, ...@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv,
die("unknown format: %d\n", bench_format); die("unknown format: %d\n", bench_format);
break; break;
} }
}
static int bench_mem_common(int argc, const char **argv,
const char *prefix __maybe_unused,
struct bench_mem_info *info)
{
int i;
size_t len;
double totallen;
argc = parse_options(argc, argv, options,
info->usage, 0);
if (no_prefault && only_prefault) {
fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
return 1;
}
if (use_cycle)
init_cycle();
len = (size_t)perf_atoll((char *)length_str);
totallen = (double)len * iterations;
if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}
/* same to without specifying either of prefault and no-prefault */
if (only_prefault && no_prefault)
only_prefault = no_prefault = false;
if (!strncmp(routine, "all", 3)) {
for (i = 0; info->routines[i].name; i++)
__bench_mem_routine(info, i, len, totallen);
return 0;
}
for (i = 0; info->routines[i].name; i++) {
if (!strcmp(info->routines[i].name, routine))
break;
}
if (!info->routines[i].name) {
printf("Unknown routine:%s\n", routine);
printf("Available routines...\n");
for (i = 0; info->routines[i].name; i++) {
printf("\t%s ... %s\n",
info->routines[i].name, info->routines[i].desc);
}
return 1;
}
__bench_mem_routine(info, i, len, totallen);
return 0; return 0;
} }
......
MEMSET_FN(__memset, MEMSET_FN(memset_orig,
"x86-64-unrolled", "x86-64-unrolled",
"unrolled memset() in arch/x86/lib/memset_64.S") "unrolled memset() in arch/x86/lib/memset_64.S")
MEMSET_FN(memset_c, MEMSET_FN(__memset,
"x86-64-stosq", "x86-64-stosq",
"movsq-based memset() in arch/x86/lib/memset_64.S") "movsq-based memset() in arch/x86/lib/memset_64.S")
MEMSET_FN(memset_c_e, MEMSET_FN(memset_erms,
"x86-64-stosb", "x86-64-stosb",
"movsb-based memset() in arch/x86/lib/memset_64.S") "movsb-based memset() in arch/x86/lib/memset_64.S")
#define memset MEMSET /* don't hide glibc's memset() */ #define memset MEMSET /* don't hide glibc's memset() */
#define altinstr_replacement text #define altinstr_replacement text
#define globl p2align 4; .globl #define globl p2align 4; .globl
#define Lmemset_c globl memset_c; memset_c
#define Lmemset_c_e globl memset_c_e; memset_c_e
#include "../../../arch/x86/lib/memset_64.S" #include "../../../arch/x86/lib/memset_64.S"
/* /*
......
...@@ -4,5 +4,6 @@ ...@@ -4,5 +4,6 @@
/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
#define altinstruction_entry # #define altinstruction_entry #
#define ALTERNATIVE_2 #
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment