Commit 5780e39e authored by Linus Torvalds

Merge tag 'x86-asm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 assembly code updates from Ingo Molnar:

 - Micro-optimize the x86 bitops code

 - Define target-specific {raw,this}_cpu_try_cmpxchg{64,128}() to
   improve code generation (an illustrative sketch of the try_cmpxchg
   pattern follows the commit list)

 - Define and use raw_cpu_try_cmpxchg() in preempt_count_set()

 - Do not clobber %rsi in percpu_{try_,}cmpxchg{64,128}_op

 - Remove the unused __sw_hweight64() implementation on x86-32

 - Misc fixes and cleanups

* tag 'x86-asm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/lib: Address kernel-doc warnings
  x86/entry: Fix typos in comments
  x86/entry: Remove unused argument %rsi passed to exc_nmi()
  x86/bitops: Remove unused __sw_hweight64() assembly implementation on x86-32
  x86/percpu: Do not clobber %rsi in percpu_{try_,}cmpxchg{64,128}_op
  x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()
  x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()
  x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
  x86/asm/bitops: Use __builtin_clz{l|ll} to evaluate constant expressions
parents 2b95bb05 8ae292c6
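
The code-generation argument behind the try_cmpxchg conversions can be seen in a plain-C sketch. This is illustrative only: "demo_stat" is a hypothetical per-CPU counter, not a variable touched by this merge, and a normal optimizing build is assumed. A cmpxchg() loop has to compare the returned value against the expected one and re-read on failure, while the try_cmpxchg() form consumes the CPU's ZF directly and refreshes the expected value in place:

#include <linux/percpu.h>

/* Hypothetical per-CPU counter, for illustration only. */
static DEFINE_PER_CPU(unsigned int, demo_stat);

/* cmpxchg form: the caller compares the returned old value itself. */
static void demo_add_cmpxchg(unsigned int val)
{
	unsigned int old, new;

	do {
		old = this_cpu_read(demo_stat);
		new = old + val;
	} while (this_cpu_cmpxchg(demo_stat, old, new) != old);
}

/* try_cmpxchg form: success comes straight from ZF, and 'old' is
 * updated in place on failure, so there is no re-read or extra CMP. */
static void demo_add_try_cmpxchg(unsigned int val)
{
	unsigned int old = this_cpu_read(demo_stat);
	unsigned int new;

	do {
		new = old + val;
	} while (!this_cpu_try_cmpxchg(demo_stat, &old, new));
}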
@@ -1163,8 +1163,8 @@ SYM_CODE_START(asm_exc_nmi)
	 * anyway.
	 *
	 * To handle this case we do the following:
-	 * Check the a special location on the stack that contains
-	 * a variable that is set when NMIs are executing.
+	 * Check a special location on the stack that contains a
+	 * variable that is set when NMIs are executing.
	 * The interrupted task's stack is also checked to see if it
	 * is an NMI stack.
	 * If the variable is not set and the stack is not the NMI
@@ -1237,7 +1237,6 @@ SYM_CODE_START(asm_exc_nmi)
	 */
	movq	%rsp, %rdi
-	movq	$-1, %rsi
	call	exc_nmi

	/*
@@ -1295,8 +1294,8 @@ SYM_CODE_START(asm_exc_nmi)
	 * end_repeat_nmi, then we are a nested NMI. We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI. That's okay; the outer NMI handler is
-	 * about to about to call exc_nmi() anyway, so we can just
-	 * resume the outer NMI.
+	 * about to call exc_nmi() anyway, so we can just resume
+	 * the outer NMI.
	 */
	movq	$repeat_nmi, %rdx
@@ -1451,7 +1450,6 @@ end_repeat_nmi:
	UNWIND_HINT_REGS
	movq	%rsp, %rdi
-	movq	$-1, %rsi
	call	exc_nmi

	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
......
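For context on the two "movq $-1, %rsi" removals above: the C-level NMI handler only consumes the pt_regs pointer passed in %rdi, so the value loaded into %rsi was never read. A simplified sketch of the C side (the real handler is generated through the IDTENTRY machinery, DEFINE_IDTENTRY_RAW(exc_nmi), rather than declared as a bare prototype):

/* Simplified: the IDTENTRY-generated handler takes only the register
 * frame, so there is no second parameter for %rsi to carry. */
void exc_nmi(struct pt_regs *regs);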
@@ -293,6 +293,9 @@ static __always_inline unsigned long variable_ffz(unsigned long word)
  */
 static __always_inline unsigned long __fls(unsigned long word)
 {
+	if (__builtin_constant_p(word))
+		return BITS_PER_LONG - 1 - __builtin_clzl(word);
+
 	asm("bsr %1,%0"
	    : "=r" (word)
	    : "rm" (word));
@@ -360,6 +363,9 @@ static __always_inline int fls(unsigned int x)
 {
 	int r;

+	if (__builtin_constant_p(x))
+		return x ? 32 - __builtin_clz(x) : 0;
+
 #ifdef CONFIG_X86_64
 	/*
	 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
@@ -401,6 +407,9 @@ static __always_inline int fls(unsigned int x)
 static __always_inline int fls64(__u64 x)
 {
 	int bitpos = -1;

+	if (__builtin_constant_p(x))
+		return x ? 64 - __builtin_clzll(x) : 0;
+
 	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
......
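The point of the __builtin_constant_p() fast paths added above is that BSR hidden inside inline asm is opaque to the compiler, so fls()/__fls()/fls64() of a compile-time constant could never fold; routing constants through __builtin_clz{,l,ll} lets the whole call evaluate at build time. An illustrative, hypothetical check, assuming a normal optimizing kernel build:

#include <linux/bitops.h>
#include <linux/build_bug.h>

static inline void demo_const_fls(void)
{
	/* With the constant fast paths these calls fold at compile time,
	 * so they can sit in BUILD_BUG_ON() instead of emitting a BSR. */
	BUILD_BUG_ON(fls(0x100) != 9);		/* 32 - __builtin_clz(0x100)        */
	BUILD_BUG_ON(fls64(1ULL << 40) != 41);	/* 64 - __builtin_clzll(1ULL << 40) */
	BUILD_BUG_ON(__fls(0x100UL) != 8);	/* BITS_PER_LONG - 1 - clzl(0x100)  */
}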
@@ -210,6 +210,25 @@ do { \
 	(typeof(_var))(unsigned long) pco_old__;			\
 })

+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval)		\
+({									\
+	bool success;							\
+	__pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+	__pcpu_type_##size pco_old__ = *pco_oval__;			\
+	__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval);	\
+	asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]",		\
+				    __percpu_arg([var]))		\
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [oval] "+a" (pco_old__),				\
+		    [var] "+m" (_var)					\
+		  : [nval] __pcpu_reg_##size(, pco_new__)		\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*pco_oval__ = pco_old__;				\
+	likely(success);						\
+})
+
 #if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
 #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval)		\
 ({									\
@@ -223,26 +242,63 @@ do { \
 	old__.var = _oval;						\
 	new__.var = _nval;						\
									\
-	asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu",		\
		  "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8)	\
		  : [var] "+m" (_var),					\
		    "+a" (old__.low),					\
		    "+d" (old__.high)					\
		  : "b" (new__.low),					\
-		    "c" (new__.high)					\
-		  : "memory", "esi");					\
+		    "c" (new__.high),					\
+		    "S" (&(_var))					\
+		  : "memory");						\
									\
 	old__.var;							\
 })

 #define raw_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg64_op(8, , pcp, oval, nval)
 #define this_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)

+#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval)	\
+({									\
+	bool success;							\
+	u64 *_oval = (u64 *)(_ovalp);					\
+	union {								\
+		u64 var;						\
+		struct {						\
+			u32 low, high;					\
+		};							\
+	} old__, new__;							\
+									\
+	old__.var = *_oval;						\
+	new__.var = _nval;						\
+									\
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu",		\
+		  "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8)	\
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [var] "+m" (_var),					\
+		    "+a" (old__.low),					\
+		    "+d" (old__.high)					\
+		  : "b" (new__.low),					\
+		    "c" (new__.high),					\
+		    "S" (&(_var))					\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*_oval = old__.var;					\
+	likely(success);						\
+})
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)		percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)	percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
 #endif

 #ifdef CONFIG_X86_64
 #define raw_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg_op(8, , pcp, oval, nval);
 #define this_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)		percpu_try_cmpxchg_op(8, , pcp, ovalp, nval);
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);

 #define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval)		\
 ({									\
 	union {								\
@@ -255,20 +311,54 @@ do { \
 	old__.var = _oval;						\
 	new__.var = _nval;						\
									\
-	asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",		\
		  "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16)	\
		  : [var] "+m" (_var),					\
		    "+a" (old__.low),					\
		    "+d" (old__.high)					\
		  : "b" (new__.low),					\
-		    "c" (new__.high)					\
-		  : "memory", "rsi");					\
+		    "c" (new__.high),					\
+		    "S" (&(_var))					\
+		  : "memory");						\
									\
 	old__.var;							\
 })

 #define raw_cpu_cmpxchg128(pcp, oval, nval)	percpu_cmpxchg128_op(16, , pcp, oval, nval)
 #define this_cpu_cmpxchg128(pcp, oval, nval)	percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)

+#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval)	\
+({									\
+	bool success;							\
+	u128 *_oval = (u128 *)(_ovalp);					\
+	union {								\
+		u128 var;						\
+		struct {						\
+			u64 low, high;					\
+		};							\
+	} old__, new__;							\
+									\
+	old__.var = *_oval;						\
+	new__.var = _nval;						\
+									\
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",		\
+		  "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16)	\
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [var] "+m" (_var),					\
+		    "+a" (old__.low),					\
+		    "+d" (old__.high)					\
+		  : "b" (new__.low),					\
+		    "c" (new__.high),					\
+		    "S" (&(_var))					\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*_oval = old__.var;					\
+	likely(success);						\
+})
+
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval)	percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval)	percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
 #endif

 /*
@@ -343,6 +433,9 @@ do { \
 #define raw_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)

 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, volatile, pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, volatile, pcp, val)
@@ -350,6 +443,9 @@ do { \
 #define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)

 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +460,7 @@ do { \
 #define raw_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, , pcp, val)
 #define raw_cpu_xchg_8(pcp, nval)		raw_percpu_xchg_op(pcp, nval)
 #define raw_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)

 #define this_cpu_read_8(pcp)			percpu_from_op(8, volatile, "mov", pcp)
 #define this_cpu_write_8(pcp, val)		percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +470,7 @@ do { \
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
 #define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif

 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
......
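In plain C, and ignoring the per-CPU segment addressing, the atomicity, and the CC_SET()/CC_OUT() flag plumbing that the macros above provide, the contract implemented by percpu_try_cmpxchg_op() and its 64/128-bit variants is roughly the following sketch (not the kernel's generic fallback verbatim):

#include <linux/types.h>

/* try-cmpxchg contract: compare *ptr with *oldp; on match store 'new' and
 * report success, otherwise write the value actually found back into *oldp.
 * The x86 macros do all of this with one CMPXCHG and return ZF as 'success'. */
static bool try_cmpxchg_sketch(unsigned long *ptr, unsigned long *oldp,
			       unsigned long new)
{
	unsigned long cur = *ptr;

	if (cur == *oldp) {
		*ptr = new;
		return true;
	}
	*oldp = cur;
	return false;
}

This is why a caller such as preempt_count_set() below no longer needs to re-read the variable or compare the return value itself.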
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
 {
 	int old, new;

-	do {
-		old = raw_cpu_read_4(pcpu_hot.preempt_count);
+	old = raw_cpu_read_4(pcpu_hot.preempt_count);
+	do {
 		new = (old & PREEMPT_NEED_RESCHED) |
			(pc & ~PREEMPT_NEED_RESCHED);
-	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
 }

 /*
......
@@ -14,8 +14,6 @@
  * @src: source address (user space)
  * @dst: destination address
  * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad source address.
  *
  * Returns an 32bit unfolded checksum of the buffer.
  * src and dst are best aligned to 64bits.
@@ -38,8 +36,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len)
  * @src: source address
  * @dst: destination address (user space)
  * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad destination address.
  *
  * Returns an 32bit unfolded checksum of the buffer.
  * src and dst are best aligned to 64bits.
@@ -62,7 +58,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len)
  * @src: source address
  * @dst: destination address
  * @len: number of bytes to be copied.
- * @sum: initial sum that is added into the result (32bit unfolded)
  *
  * Returns an 32bit unfolded checksum of the buffer.
  */
......
@@ -36,8 +36,12 @@ SYM_FUNC_START(__sw_hweight32)
 SYM_FUNC_END(__sw_hweight32)
 EXPORT_SYMBOL(__sw_hweight32)

-SYM_FUNC_START(__sw_hweight64)
+/*
+ * No 32-bit variant, because it's implemented as an inline wrapper
+ * on top of __arch_hweight32():
+ */
 #ifdef CONFIG_X86_64
+SYM_FUNC_START(__sw_hweight64)
	pushq	%rdi
	pushq	%rdx
@@ -66,18 +70,6 @@ SYM_FUNC_START(__sw_hweight64)
	popq	%rdx
	popq	%rdi
	RET
-#else /* CONFIG_X86_32 */
-	/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
-	pushl	%ecx
-
-	call	__sw_hweight32
-	movl	%eax, %ecx			# stash away result
-	movl	%edx, %eax			# second part of input
-	call	__sw_hweight32
-	addl	%ecx, %eax			# result
-
-	popl	%ecx
-	RET
-#endif
 SYM_FUNC_END(__sw_hweight64)
 EXPORT_SYMBOL(__sw_hweight64)
+#endif
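
The comment added above refers to the 32-bit inline path: on x86-32, the 64-bit population count is built in the arch_hweight header as a wrapper that sums two 32-bit counts, so the out-of-line assembly removed here was unreachable. Paraphrasing that wrapper (a sketch, not the exact kernel source):

/* Sketch: hweight64 on 32-bit as two 32-bit population counts. */
static inline unsigned long hweight64_sketch(unsigned long long w)
{
	return __arch_hweight32((unsigned int)w) +
	       __arch_hweight32((unsigned int)(w >> 32));
}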