Commit 8dd5032d authored by Denys Vlasenko's avatar Denys Vlasenko Committed by Ingo Molnar

x86/asm/bitops: Force inlining of test_and_set_bit and friends

Sometimes GCC mysteriously doesn't inline very small functions
we expect to be inlined, see:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

Arguably, GCC should do better, but GCC people aren't willing
to invest time into it and are asking to use __always_inline
instead.

With this .config:

  http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

here's an example of functions getting deinlined many times:

  test_and_set_bit (166 copies, ~1260 calls)
         55                      push   %rbp
         48 89 e5                mov    %rsp,%rbp
         f0 48 0f ab 3e          lock bts %rdi,(%rsi)
         72 04                   jb     <test_and_set_bit+0xf>
         31 c0                   xor    %eax,%eax
         eb 05                   jmp    <test_and_set_bit+0x14>
         b8 01 00 00 00          mov    $0x1,%eax
         5d                      pop    %rbp
         c3                      retq

  test_and_clear_bit (124 copies, ~1000 calls)
         55                      push   %rbp
         48 89 e5                mov    %rsp,%rbp
         f0 48 0f b3 3e          lock btr %rdi,(%rsi)
         72 04                   jb     <test_and_clear_bit+0xf>
         31 c0                   xor    %eax,%eax
         eb 05                   jmp    <test_and_clear_bit+0x14>
         b8 01 00 00 00          mov    $0x1,%eax
         5d                      pop    %rbp
         c3                      retq

  change_bit (3 copies, 8 calls)
         55                      push   %rbp
         48 89 e5                mov    %rsp,%rbp
         f0 48 0f bb 3e          lock btc %rdi,(%rsi)
         5d                      pop    %rbp
         c3                      retq

  clear_bit_unlock (2 copies, 11 calls)
         55                      push   %rbp
         48 89 e5                mov    %rsp,%rbp
         f0 48 0f b3 3e          lock btr %rdi,(%rsi)
         5d                      pop    %rbp
         c3                      retq

This patch works it around via s/inline/__always_inline/.

Code size decrease by ~13.5k after the patch:

      text     data      bss       dec    filename
  92110727 20826144 36417536 149354407    vmlinux.before
  92097234 20826176 36417536 149340946    vmlinux.after
Signed-off-by: default avatarDenys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Graf <tgraf@suug.ch>
Link: http://lkml.kernel.org/r/1454881887-1367-1-git-send-email-dvlasenk@redhat.comSigned-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent d99e1bd1
...@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr) ...@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr)
* If it's called on the same region of memory simultaneously, the effect * If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds. * may be that only one operation succeeds.
*/ */
static inline void __set_bit(long nr, volatile unsigned long *addr) static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
{ {
asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
} }
...@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr) ...@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr)
* clear_bit() is atomic and implies release semantics before the memory * clear_bit() is atomic and implies release semantics before the memory
* operation. It can be used for an unlock. * operation. It can be used for an unlock.
*/ */
static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{ {
barrier(); barrier();
clear_bit(nr, addr); clear_bit(nr, addr);
} }
static inline void __clear_bit(long nr, volatile unsigned long *addr) static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
{ {
asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
} }
...@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr) ...@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr)
* No memory barrier is required here, because x86 cannot reorder stores past * No memory barrier is required here, because x86 cannot reorder stores past
* older loads. Same principle as spin_unlock. * older loads. Same principle as spin_unlock.
*/ */
static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
{ {
barrier(); barrier();
__clear_bit(nr, addr); __clear_bit(nr, addr);
...@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) ...@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
* If it's called on the same region of memory simultaneously, the effect * If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds. * may be that only one operation succeeds.
*/ */
static inline void __change_bit(long nr, volatile unsigned long *addr) static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
{ {
asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
} }
...@@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) ...@@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr)
* Note that @nr may be almost arbitrarily large; this function is not * Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity. * restricted to acting on a single-word quantity.
*/ */
static inline void change_bit(long nr, volatile unsigned long *addr) static __always_inline void change_bit(long nr, volatile unsigned long *addr)
{ {
if (IS_IMMEDIATE(nr)) { if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "xorb %1,%0" asm volatile(LOCK_PREFIX "xorb %1,%0"
...@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) ...@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered. * This operation is atomic and cannot be reordered.
* It also implies a memory barrier. * It also implies a memory barrier.
*/ */
static inline int test_and_set_bit(long nr, volatile unsigned long *addr) static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
{ {
GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
} }
...@@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) ...@@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr)
* If two examples of this operation race, one can appear to succeed * If two examples of this operation race, one can appear to succeed
* but actually fail. You must protect multiple accesses with a lock. * but actually fail. You must protect multiple accesses with a lock.
*/ */
static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
{ {
int oldbit; int oldbit;
...@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) ...@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered. * This operation is atomic and cannot be reordered.
* It also implies a memory barrier. * It also implies a memory barrier.
*/ */
static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
{ {
GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
} }
...@@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) ...@@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
* accessed from a hypervisor on the same CPU if running in a VM: don't change * accessed from a hypervisor on the same CPU if running in a VM: don't change
* this without also updating arch/x86/kernel/kvm.c * this without also updating arch/x86/kernel/kvm.c
*/ */
static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
{ {
int oldbit; int oldbit;
...@@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) ...@@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
} }
/* WARNING: non atomic and it can be reordered! */ /* WARNING: non atomic and it can be reordered! */
static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
{ {
int oldbit; int oldbit;
...@@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) ...@@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered. * This operation is atomic and cannot be reordered.
* It also implies a memory barrier. * It also implies a memory barrier.
*/ */
static inline int test_and_change_bit(long nr, volatile unsigned long *addr) static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr)
{ {
GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c");
} }
...@@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo ...@@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0; (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
} }
static inline int variable_test_bit(long nr, volatile const unsigned long *addr) static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr)
{ {
int oldbit; int oldbit;
...@@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); ...@@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr);
* *
* Undefined if no bit exists, so code should check against 0 first. * Undefined if no bit exists, so code should check against 0 first.
*/ */
static inline unsigned long __ffs(unsigned long word) static __always_inline unsigned long __ffs(unsigned long word)
{ {
asm("rep; bsf %1,%0" asm("rep; bsf %1,%0"
: "=r" (word) : "=r" (word)
...@@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word) ...@@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word)
* *
* Undefined if no zero exists, so code should check against ~0UL first. * Undefined if no zero exists, so code should check against ~0UL first.
*/ */
static inline unsigned long ffz(unsigned long word) static __always_inline unsigned long ffz(unsigned long word)
{ {
asm("rep; bsf %1,%0" asm("rep; bsf %1,%0"
: "=r" (word) : "=r" (word)
...@@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word) ...@@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word)
* *
* Undefined if no set bit exists, so code should check against 0 first. * Undefined if no set bit exists, so code should check against 0 first.
*/ */
static inline unsigned long __fls(unsigned long word) static __always_inline unsigned long __fls(unsigned long word)
{ {
asm("bsr %1,%0" asm("bsr %1,%0"
: "=r" (word) : "=r" (word)
...@@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word) ...@@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word)
* set bit if value is nonzero. The first (least significant) bit * set bit if value is nonzero. The first (least significant) bit
* is at position 1. * is at position 1.
*/ */
static inline int ffs(int x) static __always_inline int ffs(int x)
{ {
int r; int r;
...@@ -434,7 +434,7 @@ static inline int ffs(int x) ...@@ -434,7 +434,7 @@ static inline int ffs(int x)
* set bit if value is nonzero. The last (most significant) bit is * set bit if value is nonzero. The last (most significant) bit is
* at position 32. * at position 32.
*/ */
static inline int fls(int x) static __always_inline int fls(int x)
{ {
int r; int r;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment