Commit 3cf3fabc authored by Linus Torvalds

Merge tag 'locking-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "Futex improvements:

   - Add the 'futex2' syscall ABI, which is an attempt to get away from
     the multiplex syscall and adds a little room for extensions, while
     lifting some limitations.

   - Fix futex PI recursive rt_mutex waiter state bug

   - Fix inter-process shared futexes on no-MMU systems

   - Use folios instead of pages

  Micro-optimizations of locking primitives:

   - Improve arch_spin_value_unlocked() on asm-generic ticket spinlock
     architectures, to improve lockref code generation

   - Improve the x86-32 lockref_get_not_zero() main loop by adding
     build-time CMPXCHG8B support detection for the relevant lockref
     code, and by better interfacing the CMPXCHG8B assembly code with
     the compiler

   - Introduce arch_sync_try_cmpxchg() on x86 to improve
     sync_try_cmpxchg() code generation. Convert some sync_cmpxchg()
     users to sync_try_cmpxchg().

   - Micro-optimize rcuref_put_slowpath()

  Locking debuggability improvements:

   - Improve CONFIG_DEBUG_RT_MUTEXES=y to have a fast-path as well

   - Enforce atomicity of sched_submit_work(), which is de-facto atomic
     but was un-enforced previously.

   - Extend <linux/cleanup.h>'s no_free_ptr() with __must_check
     semantics

   - Fix ww_mutex self-tests

   - Clean up const-propagation in <linux/seqlock.h> and simplify the
     API-instantiation macros a bit

  RT locking improvements:

   - Provide the rt_mutex_*_schedule() primitives/helpers and use them
     in the rtmutex code to avoid recursion vs. rtlock on the PI state.

   - Add nested blocking lockdep asserts to rt_mutex_lock(),
     rtlock_lock() and rwbase_read_lock()

  .. plus misc fixes & cleanups"

* tag 'locking-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (39 commits)
  futex: Don't include process MM in futex key on no-MMU
  locking/seqlock: Fix grammar in comment
  alpha: Fix up new futex syscall numbers
  locking/seqlock: Propagate 'const' pointers within read-only methods, remove forced type casts
  locking/lockdep: Fix string sizing bug that triggers a format-truncation compiler-warning
  locking/seqlock: Change __seqprop() to return the function pointer
  locking/seqlock: Simplify SEQCOUNT_LOCKNAME()
  locking/atomics: Use atomic_try_cmpxchg_release() to micro-optimize rcuref_put_slowpath()
  locking/atomic, xen: Use sync_try_cmpxchg() instead of sync_cmpxchg()
  locking/atomic/x86: Introduce arch_sync_try_cmpxchg()
  locking/atomic: Add generic support for sync_try_cmpxchg() and its fallback
  locking/seqlock: Fix typo in comment
  futex/requeue: Remove unnecessary ‘NULL’ initialization from futex_proxy_trylock_atomic()
  locking/local, arch: Rewrite local_add_unless() as a static inline function
  locking/debug: Fix debugfs API return value checks to use IS_ERR()
  locking/ww_mutex/test: Make sure we bail out instead of livelock
  locking/ww_mutex/test: Fix potential workqueue corruption
  locking/ww_mutex/test: Use prng instead of rng to avoid hangs at bootup
  futex: Add sys_futex_requeue()
  futex: Add flags2 argument to futex_requeue()
  ...
parents 9cda4eb0 c73801ae
...@@ -65,28 +65,27 @@ static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new) ...@@ -65,28 +65,27 @@ static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new)
#define local_xchg(l, n) (xchg_local(&((l)->a.counter), (n))) #define local_xchg(l, n) (xchg_local(&((l)->a.counter), (n)))
/** /**
* local_add_unless - add unless the number is a given value * local_add_unless - add unless the number is already a given value
* @l: pointer of type local_t * @l: pointer of type local_t
* @a: the amount to add to l... * @a: the amount to add to l...
* @u: ...unless l is equal to u. * @u: ...unless l is equal to u.
* *
* Atomically adds @a to @l, so long as it was not @u. * Atomically adds @a to @l, if @v was not already @u.
* Returns non-zero if @l was not @u, and zero otherwise. * Returns true if the addition was done.
*/ */
#define local_add_unless(l, a, u) \ static __inline__ bool
({ \ local_add_unless(local_t *l, long a, long u)
long c, old; \ {
c = local_read(l); \ long c = local_read(l);
for (;;) { \
if (unlikely(c == (u))) \ do {
break; \ if (unlikely(c == u))
old = local_cmpxchg((l), c, c + (a)); \ return false;
if (likely(old == c)) \ } while (!local_try_cmpxchg(l, &c, c + a));
break; \
c = old; \ return true;
} \ }
c != (u); \
})
#define local_inc_not_zero(l) local_add_unless((l), 1, 0) #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
#define local_add_negative(a, l) (local_add_return((a), (l)) < 0) #define local_add_negative(a, l) (local_add_return((a), (l)) < 0)
......
...@@ -492,3 +492,7 @@ ...@@ -492,3 +492,7 @@
560 common set_mempolicy_home_node sys_ni_syscall 560 common set_mempolicy_home_node sys_ni_syscall
561 common cachestat sys_cachestat 561 common cachestat sys_cachestat
562 common fchmodat2 sys_fchmodat2 562 common fchmodat2 sys_fchmodat2
# 563 reserved for map_shadow_stack
564 common futex_wake sys_futex_wake
565 common futex_wait sys_futex_wait
566 common futex_requeue sys_futex_requeue
...@@ -466,3 +466,6 @@ ...@@ -466,3 +466,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
#define __NR_compat_syscalls 453 #define __NR_compat_syscalls 457
#endif #endif
#define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_CLONE
......
...@@ -911,6 +911,12 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) ...@@ -911,6 +911,12 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
__SYSCALL(__NR_cachestat, sys_cachestat) __SYSCALL(__NR_cachestat, sys_cachestat)
#define __NR_fchmodat2 452 #define __NR_fchmodat2 452
__SYSCALL(__NR_fchmodat2, sys_fchmodat2) __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
#define __NR_futex_wake 454
__SYSCALL(__NR_futex_wake, sys_futex_wake)
#define __NR_futex_wait 455
__SYSCALL(__NR_futex_wait, sys_futex_wait)
#define __NR_futex_requeue 456
__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
/* /*
* Please add new compat syscalls above this comment and update * Please add new compat syscalls above this comment and update
......
...@@ -373,3 +373,6 @@ ...@@ -373,3 +373,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -70,22 +70,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) ...@@ -70,22 +70,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
#define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n))) #define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n)))
/** /**
* local_add_unless - add unless the number is a given value * local_add_unless - add unless the number is already a given value
* @l: pointer of type local_t * @l: pointer of type local_t
* @a: the amount to add to l... * @a: the amount to add to l...
* @u: ...unless l is equal to u. * @u: ...unless l is equal to u.
* *
* Atomically adds @a to @l, so long as it was not @u. * Atomically adds @a to @l, if @v was not already @u.
* Returns non-zero if @l was not @u, and zero otherwise. * Returns true if the addition was done.
*/ */
#define local_add_unless(l, a, u) \ static inline bool
({ \ local_add_unless(local_t *l, long a, long u)
long c, old; \ {
c = local_read(l); \ long c = local_read(l);
while (c != (u) && (old = local_cmpxchg((l), c, c + (a))) != c) \
c = old; \ do {
c != (u); \ if (unlikely(c == u))
}) return false;
} while (!local_try_cmpxchg(l, &c, c + a));
return true;
}
#define local_inc_not_zero(l) local_add_unless((l), 1, 0) #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
#define local_dec_return(l) local_sub_return(1, (l)) #define local_dec_return(l) local_sub_return(1, (l))
......
...@@ -452,3 +452,6 @@ ...@@ -452,3 +452,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -458,3 +458,6 @@ ...@@ -458,3 +458,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -108,22 +108,27 @@ static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new) ...@@ -108,22 +108,27 @@ static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new)
#define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n))) #define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n)))
/** /**
* local_add_unless - add unless the number is a given value * local_add_unless - add unless the number is already a given value
* @l: pointer of type local_t * @l: pointer of type local_t
* @a: the amount to add to l... * @a: the amount to add to l...
* @u: ...unless l is equal to u. * @u: ...unless l is equal to u.
* *
* Atomically adds @a to @l, so long as it was not @u. * Atomically adds @a to @l, if @v was not already @u.
* Returns non-zero if @l was not @u, and zero otherwise. * Returns true if the addition was done.
*/ */
#define local_add_unless(l, a, u) \ static __inline__ bool
({ \ local_add_unless(local_t *l, long a, long u)
long c, old; \ {
c = local_read(l); \ long c = local_read(l);
while (c != (u) && (old = local_cmpxchg((l), c, c + (a))) != c) \
c = old; \ do {
c != (u); \ if (unlikely(c == u))
}) return false;
} while (!local_try_cmpxchg(l, &c, c + a));
return true;
}
#define local_inc_not_zero(l) local_add_unless((l), 1, 0) #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
#define local_dec_return(l) local_sub_return(1, (l)) #define local_dec_return(l) local_sub_return(1, (l))
......
...@@ -391,3 +391,6 @@ ...@@ -391,3 +391,6 @@
450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node 450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node
451 n32 cachestat sys_cachestat 451 n32 cachestat sys_cachestat
452 n32 fchmodat2 sys_fchmodat2 452 n32 fchmodat2 sys_fchmodat2
454 n32 futex_wake sys_futex_wake
455 n32 futex_wait sys_futex_wait
456 n32 futex_requeue sys_futex_requeue
...@@ -367,3 +367,6 @@ ...@@ -367,3 +367,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 n64 cachestat sys_cachestat 451 n64 cachestat sys_cachestat
452 n64 fchmodat2 sys_fchmodat2 452 n64 fchmodat2 sys_fchmodat2
454 n64 futex_wake sys_futex_wake
455 n64 futex_wait sys_futex_wait
456 n64 futex_requeue sys_futex_requeue
...@@ -440,3 +440,6 @@ ...@@ -440,3 +440,6 @@
450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node 450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node
451 o32 cachestat sys_cachestat 451 o32 cachestat sys_cachestat
452 o32 fchmodat2 sys_fchmodat2 452 o32 fchmodat2 sys_fchmodat2
454 o32 futex_wake sys_futex_wake
455 o32 futex_wait sys_futex_wait
456 o32 futex_requeue sys_futex_requeue
...@@ -451,3 +451,6 @@ ...@@ -451,3 +451,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -115,23 +115,23 @@ static __inline__ long local_xchg(local_t *l, long n) ...@@ -115,23 +115,23 @@ static __inline__ long local_xchg(local_t *l, long n)
} }
/** /**
* local_add_unless - add unless the number is a given value * local_add_unless - add unless the number is already a given value
* @l: pointer of type local_t * @l: pointer of type local_t
* @a: the amount to add to v... * @a: the amount to add to v...
* @u: ...unless v is equal to u. * @u: ...unless v is equal to u.
* *
* Atomically adds @a to @l, so long as it was not @u. * Atomically adds @a to @l, if @v was not already @u.
* Returns non-zero if @l was not @u, and zero otherwise. * Returns true if the addition was done.
*/ */
static __inline__ int local_add_unless(local_t *l, long a, long u) static __inline__ bool local_add_unless(local_t *l, long a, long u)
{ {
unsigned long flags; unsigned long flags;
int ret = 0; bool ret = false;
powerpc_local_irq_pmu_save(flags); powerpc_local_irq_pmu_save(flags);
if (l->v != u) { if (l->v != u) {
l->v += a; l->v += a;
ret = 1; ret = true;
} }
powerpc_local_irq_pmu_restore(flags); powerpc_local_irq_pmu_restore(flags);
......
...@@ -539,3 +539,6 @@ ...@@ -539,3 +539,6 @@
450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -455,3 +455,6 @@ ...@@ -455,3 +455,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat sys_cachestat 451 common cachestat sys_cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue sys_futex_requeue
...@@ -455,3 +455,6 @@ ...@@ -455,3 +455,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -498,3 +498,6 @@ ...@@ -498,3 +498,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -28,7 +28,6 @@ config X86_64 ...@@ -28,7 +28,6 @@ config X86_64
select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE select NEED_DMA_MAP_STATE
...@@ -118,6 +117,7 @@ config X86 ...@@ -118,6 +117,7 @@ config X86
select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_LTO_CLANG_THIN
select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64
select ARCH_USE_MEMTEST select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_SPINLOCKS
......
...@@ -457,3 +457,6 @@ ...@@ -457,3 +457,6 @@
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
451 i386 cachestat sys_cachestat 451 i386 cachestat sys_cachestat
452 i386 fchmodat2 sys_fchmodat2 452 i386 fchmodat2 sys_fchmodat2
454 i386 futex_wake sys_futex_wake
455 i386 futex_wait sys_futex_wait
456 i386 futex_requeue sys_futex_requeue
...@@ -375,6 +375,9 @@ ...@@ -375,6 +375,9 @@
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
453 64 map_shadow_stack sys_map_shadow_stack 453 64 map_shadow_stack sys_map_shadow_stack
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
# #
# Due to a historical design error, certain syscalls are numbered differently # Due to a historical design error, certain syscalls are numbered differently
......
...@@ -221,12 +221,18 @@ extern void __add_wrong_size(void) ...@@ -221,12 +221,18 @@ extern void __add_wrong_size(void)
#define __try_cmpxchg(ptr, pold, new, size) \ #define __try_cmpxchg(ptr, pold, new, size) \
__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
#define __sync_try_cmpxchg(ptr, pold, new, size) \
__raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ")
#define __try_cmpxchg_local(ptr, pold, new, size) \ #define __try_cmpxchg_local(ptr, pold, new, size) \
__raw_try_cmpxchg((ptr), (pold), (new), (size), "") __raw_try_cmpxchg((ptr), (pold), (new), (size), "")
#define arch_try_cmpxchg(ptr, pold, new) \ #define arch_try_cmpxchg(ptr, pold, new) \
__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
#define arch_sync_try_cmpxchg(ptr, pold, new) \
__sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
#define arch_try_cmpxchg_local(ptr, pold, new) \ #define arch_try_cmpxchg_local(ptr, pold, new) \
__try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr))) __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr)))
......
...@@ -135,28 +135,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) ...@@ -135,28 +135,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
#define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) #define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
/** /**
* local_add_unless - add unless the number is a given value * local_add_unless - add unless the number is already a given value
* @l: pointer of type local_t * @l: pointer of type local_t
* @a: the amount to add to l... * @a: the amount to add to l...
* @u: ...unless l is equal to u. * @u: ...unless l is equal to u.
* *
* Atomically adds @a to @l, so long as it was not @u. * Atomically adds @a to @l, if @v was not already @u.
* Returns non-zero if @l was not @u, and zero otherwise. * Returns true if the addition was done.
*/ */
#define local_add_unless(l, a, u) \ static __always_inline bool
({ \ local_add_unless(local_t *l, long a, long u)
long c, old; \ {
c = local_read((l)); \ long c = local_read(l);
for (;;) { \
if (unlikely(c == (u))) \ do {
break; \ if (unlikely(c == u))
old = local_cmpxchg((l), c, c + (a)); \ return false;
if (likely(old == c)) \ } while (!local_try_cmpxchg(l, &c, c + a));
break; \
c = old; \ return true;
} \ }
c != (u); \
})
#define local_inc_not_zero(l) local_add_unless((l), 1, 0) #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
/* On x86_32, these are no better than the atomic variants. /* On x86_32, these are no better than the atomic variants.
......
...@@ -423,3 +423,6 @@ ...@@ -423,3 +423,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat 451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 452 common fchmodat2 sys_fchmodat2
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
...@@ -226,21 +226,20 @@ static bool evtchn_fifo_is_masked(evtchn_port_t port) ...@@ -226,21 +226,20 @@ static bool evtchn_fifo_is_masked(evtchn_port_t port)
*/ */
static bool clear_masked_cond(volatile event_word_t *word) static bool clear_masked_cond(volatile event_word_t *word)
{ {
event_word_t new, old, w; event_word_t new, old;
w = *word; old = *word;
do { do {
if (!(w & (1 << EVTCHN_FIFO_MASKED))) if (!(old & (1 << EVTCHN_FIFO_MASKED)))
return true; return true;
if (w & (1 << EVTCHN_FIFO_PENDING)) if (old & (1 << EVTCHN_FIFO_PENDING))
return false; return false;
old = w & ~(1 << EVTCHN_FIFO_BUSY); old = old & ~(1 << EVTCHN_FIFO_BUSY);
new = old & ~(1 << EVTCHN_FIFO_MASKED); new = old & ~(1 << EVTCHN_FIFO_MASKED);
w = sync_cmpxchg(word, old, new); } while (!sync_try_cmpxchg(word, &old, new));
} while (w != old);
return true; return true;
} }
...@@ -259,17 +258,16 @@ static void evtchn_fifo_unmask(evtchn_port_t port) ...@@ -259,17 +258,16 @@ static void evtchn_fifo_unmask(evtchn_port_t port)
static uint32_t clear_linked(volatile event_word_t *word) static uint32_t clear_linked(volatile event_word_t *word)
{ {
event_word_t new, old, w; event_word_t new, old;
w = *word; old = *word;
do { do {
old = w; new = (old & ~((1 << EVTCHN_FIFO_LINKED)
new = (w & ~((1 << EVTCHN_FIFO_LINKED)
| EVTCHN_FIFO_LINK_MASK)); | EVTCHN_FIFO_LINK_MASK));
} while ((w = sync_cmpxchg(word, old, new)) != old); } while (!sync_try_cmpxchg(word, &old, new));
return w & EVTCHN_FIFO_LINK_MASK; return old & EVTCHN_FIFO_LINK_MASK;
} }
static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl, static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl,
......
...@@ -427,16 +427,14 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); ...@@ -427,16 +427,14 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref) static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref)
{ {
u16 flags, nflags; u16 *pflags = &gnttab_shared.v1[ref].flags;
u16 *pflags; u16 flags;
pflags = &gnttab_shared.v1[ref].flags; flags = *pflags;
nflags = *pflags;
do { do {
flags = nflags;
if (flags & (GTF_reading|GTF_writing)) if (flags & (GTF_reading|GTF_writing))
return 0; return 0;
} while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); } while (!sync_try_cmpxchg(pflags, &flags, 0));
return 1; return 1;
} }
......
...@@ -68,11 +68,18 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) ...@@ -68,11 +68,18 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
smp_store_release(ptr, (u16)val + 1); smp_store_release(ptr, (u16)val + 1);
} }
static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
u32 val = lock.counter;
return ((val >> 16) == (val & 0xffff));
}
static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock) static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock)
{ {
u32 val = atomic_read(lock); arch_spinlock_t val = READ_ONCE(*lock);
return ((val >> 16) != (val & 0xffff)); return !arch_spin_value_unlocked(val);
} }
static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock) static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
...@@ -82,11 +89,6 @@ static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock) ...@@ -82,11 +89,6 @@ static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
return (s16)((val >> 16) - (val & 0xffff)) > 1; return (s16)((val >> 16) - (val & 0xffff)) > 1;
} }
static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
return !arch_spin_is_locked(&lock);
}
#include <asm/qrwlock.h> #include <asm/qrwlock.h>
#endif /* __ASM_GENERIC_SPINLOCK_H */ #endif /* __ASM_GENERIC_SPINLOCK_H */
...@@ -428,6 +428,19 @@ extern void raw_cmpxchg128_relaxed_not_implemented(void); ...@@ -428,6 +428,19 @@ extern void raw_cmpxchg128_relaxed_not_implemented(void);
#define raw_sync_cmpxchg arch_sync_cmpxchg #define raw_sync_cmpxchg arch_sync_cmpxchg
#ifdef arch_sync_try_cmpxchg
#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg
#else
#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \
({ \
typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \
if (unlikely(___r != ___o)) \
*___op = ___r; \
likely(___r == ___o); \
})
#endif
/** /**
* raw_atomic_read() - atomic load with relaxed ordering * raw_atomic_read() - atomic load with relaxed ordering
* @v: pointer to atomic_t * @v: pointer to atomic_t
...@@ -4649,4 +4662,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v) ...@@ -4649,4 +4662,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v)
} }
#endif /* _LINUX_ATOMIC_FALLBACK_H */ #endif /* _LINUX_ATOMIC_FALLBACK_H */
// 2fdd6702823fa842f9cea57a002e6e4476ae780c // eec048affea735b8464f58e6d96992101f8f85f1
...@@ -4998,6 +4998,14 @@ atomic_long_dec_if_positive(atomic_long_t *v) ...@@ -4998,6 +4998,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
}) })
#define sync_try_cmpxchg(ptr, ...) \
({ \
typeof(ptr) __ai_ptr = (ptr); \
kcsan_mb(); \
instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
})
#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
// 1568f875fef72097413caab8339120c065a39aa4 // 2cc4bc990fef44d3836ec108f11b610f3f438184
...@@ -7,8 +7,9 @@ ...@@ -7,8 +7,9 @@
/* /*
* DEFINE_FREE(name, type, free): * DEFINE_FREE(name, type, free):
* simple helper macro that defines the required wrapper for a __free() * simple helper macro that defines the required wrapper for a __free()
* based cleanup function. @free is an expression using '_T' to access * based cleanup function. @free is an expression using '_T' to access the
* the variable. * variable. @free should typically include a NULL test before calling a
* function, see the example below.
* *
* __free(name): * __free(name):
* variable attribute to add a scoped based cleanup to the variable. * variable attribute to add a scoped based cleanup to the variable.
...@@ -17,6 +18,9 @@ ...@@ -17,6 +18,9 @@
* like a non-atomic xchg(var, NULL), such that the cleanup function will * like a non-atomic xchg(var, NULL), such that the cleanup function will
* be inhibited -- provided it sanely deals with a NULL value. * be inhibited -- provided it sanely deals with a NULL value.
* *
* NOTE: this has __must_check semantics so that it is harder to accidentally
* leak the resource.
*
* return_ptr(p): * return_ptr(p):
* returns p while inhibiting the __free(). * returns p while inhibiting the __free().
* *
...@@ -24,6 +28,8 @@ ...@@ -24,6 +28,8 @@
* *
* DEFINE_FREE(kfree, void *, if (_T) kfree(_T)) * DEFINE_FREE(kfree, void *, if (_T) kfree(_T))
* *
* void *alloc_obj(...)
* {
* struct obj *p __free(kfree) = kmalloc(...); * struct obj *p __free(kfree) = kmalloc(...);
* if (!p) * if (!p)
* return NULL; * return NULL;
...@@ -32,6 +38,24 @@ ...@@ -32,6 +38,24 @@
* return NULL; * return NULL;
* *
* return_ptr(p); * return_ptr(p);
* }
*
* NOTE: the DEFINE_FREE()'s @free expression includes a NULL test even though
* kfree() is fine to be called with a NULL value. This is on purpose. This way
* the compiler sees the end of our alloc_obj() function as:
*
* tmp = p;
* p = NULL;
* if (p)
* kfree(p);
* return tmp;
*
* And through the magic of value-propagation and dead-code-elimination, it
* eliminates the actual cleanup call and compiles into:
*
* return p;
*
* Without the NULL test it turns into a mess and the compiler can't help us.
*/ */
#define DEFINE_FREE(_name, _type, _free) \ #define DEFINE_FREE(_name, _type, _free) \
...@@ -39,8 +63,17 @@ ...@@ -39,8 +63,17 @@
#define __free(_name) __cleanup(__free_##_name) #define __free(_name) __cleanup(__free_##_name)
#define __get_and_null_ptr(p) \
({ __auto_type __ptr = &(p); \
__auto_type __val = *__ptr; \
*__ptr = NULL; __val; })
static inline __must_check
const volatile void * __must_check_fn(const volatile void *val)
{ return val; }
#define no_free_ptr(p) \ #define no_free_ptr(p) \
({ __auto_type __ptr = (p); (p) = NULL; __ptr; }) ((typeof(p)) __must_check_fn(__get_and_null_ptr(p)))
#define return_ptr(p) return no_free_ptr(p) #define return_ptr(p) return no_free_ptr(p)
......
...@@ -912,6 +912,9 @@ struct task_struct { ...@@ -912,6 +912,9 @@ struct task_struct {
* ->sched_remote_wakeup gets used, so it can be in this word. * ->sched_remote_wakeup gets used, so it can be in this word.
*/ */
unsigned sched_remote_wakeup:1; unsigned sched_remote_wakeup:1;
#ifdef CONFIG_RT_MUTEXES
unsigned sched_rt_mutex:1;
#endif
/* Bit to tell LSMs we're in execve(): */ /* Bit to tell LSMs we're in execve(): */
unsigned in_execve:1; unsigned in_execve:1;
......
...@@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) ...@@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
} }
#ifdef CONFIG_RT_MUTEXES #ifdef CONFIG_RT_MUTEXES
extern void rt_mutex_pre_schedule(void);
extern void rt_mutex_schedule(void);
extern void rt_mutex_post_schedule(void);
/* /*
* Must hold either p->pi_lock or task_rq(p)->lock. * Must hold either p->pi_lock or task_rq(p)->lock.
*/ */
......
...@@ -191,11 +191,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) ...@@ -191,11 +191,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
* @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t
* @locktype: LOCKNAME canonical C data type * @locktype: LOCKNAME canonical C data type
* @preemptible: preemptibility of above locktype * @preemptible: preemptibility of above locktype
* @lockmember: argument for lockdep_assert_held() * @lockbase: prefix for associated lock/unlock
* @lockbase: associated lock release function (prefix only)
* @lock_acquire: associated lock acquisition function (full call)
*/ */
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \ #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \
typedef struct seqcount_##lockname { \ typedef struct seqcount_##lockname { \
seqcount_t seqcount; \ seqcount_t seqcount; \
__SEQ_LOCK(locktype *lock); \ __SEQ_LOCK(locktype *lock); \
...@@ -203,6 +201,12 @@ typedef struct seqcount_##lockname { \ ...@@ -203,6 +201,12 @@ typedef struct seqcount_##lockname { \
\ \
static __always_inline seqcount_t * \ static __always_inline seqcount_t * \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \
{ \
return &s->seqcount; \
} \
\
static __always_inline const seqcount_t * \
__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \
{ \ { \
return &s->seqcount; \ return &s->seqcount; \
} \ } \
...@@ -216,7 +220,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ ...@@ -216,7 +220,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
return seq; \ return seq; \
\ \
if (preemptible && unlikely(seq & 1)) { \ if (preemptible && unlikely(seq & 1)) { \
__SEQ_LOCK(lock_acquire); \ __SEQ_LOCK(lockbase##_lock(s->lock)); \
__SEQ_LOCK(lockbase##_unlock(s->lock)); \ __SEQ_LOCK(lockbase##_unlock(s->lock)); \
\ \
/* \ /* \
...@@ -242,7 +246,7 @@ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ ...@@ -242,7 +246,7 @@ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
static __always_inline void \ static __always_inline void \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \
{ \ { \
__SEQ_LOCK(lockdep_assert_held(lockmember)); \ __SEQ_LOCK(lockdep_assert_held(s->lock)); \
} }
/* /*
...@@ -254,6 +258,11 @@ static inline seqcount_t *__seqprop_ptr(seqcount_t *s) ...@@ -254,6 +258,11 @@ static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
return s; return s;
} }
/*
 * Const flavour of the plain seqcount_t pointer property: hands @s back
 * unchanged, keeping the const qualifier on the returned pointer.
 */
static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
{
return s;
}
static inline unsigned __seqprop_sequence(const seqcount_t *s) static inline unsigned __seqprop_sequence(const seqcount_t *s)
{ {
return READ_ONCE(s->sequence); return READ_ONCE(s->sequence);
...@@ -271,10 +280,10 @@ static inline void __seqprop_assert(const seqcount_t *s) ...@@ -271,10 +280,10 @@ static inline void __seqprop_assert(const seqcount_t *s)
#define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) #define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT)
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock, raw_spin, raw_spin_lock(s->lock)) SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin)
SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, s->lock, spin, spin_lock(s->lock)) SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin)
SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, s->lock, read, read_lock(s->lock)) SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read)
SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex, mutex_lock(s->lock)) SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
/* /*
* SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
...@@ -294,19 +303,20 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex ...@@ -294,19 +303,20 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex
#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define __seqprop_case(s, lockname, prop) \ #define __seqprop_case(s, lockname, prop) \
seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s)) seqcount_##lockname##_t: __seqprop_##lockname##_##prop
#define __seqprop(s, prop) _Generic(*(s), \ #define __seqprop(s, prop) _Generic(*(s), \
seqcount_t: __seqprop_##prop((void *)(s)), \ seqcount_t: __seqprop_##prop, \
__seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), raw_spinlock, prop), \
__seqprop_case((s), spinlock, prop), \ __seqprop_case((s), spinlock, prop), \
__seqprop_case((s), rwlock, prop), \ __seqprop_case((s), rwlock, prop), \
__seqprop_case((s), mutex, prop)) __seqprop_case((s), mutex, prop))
#define seqprop_ptr(s) __seqprop(s, ptr) #define seqprop_ptr(s) __seqprop(s, ptr)(s)
#define seqprop_sequence(s) __seqprop(s, sequence) #define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s)
#define seqprop_preemptible(s) __seqprop(s, preemptible) #define seqprop_sequence(s) __seqprop(s, sequence)(s)
#define seqprop_assert(s) __seqprop(s, assert) #define seqprop_preemptible(s) __seqprop(s, preemptible)(s)
#define seqprop_assert(s) __seqprop(s, assert)(s)
/** /**
* __read_seqcount_begin() - begin a seqcount_t read section w/o barrier * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
...@@ -355,7 +365,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex ...@@ -355,7 +365,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex
*/ */
#define read_seqcount_begin(s) \ #define read_seqcount_begin(s) \
({ \ ({ \
seqcount_lockdep_reader_access(seqprop_ptr(s)); \ seqcount_lockdep_reader_access(seqprop_const_ptr(s)); \
raw_read_seqcount_begin(s); \ raw_read_seqcount_begin(s); \
}) })
...@@ -421,7 +431,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex ...@@ -421,7 +431,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex
* Return: true if a read section retry is required, else false * Return: true if a read section retry is required, else false
*/ */
#define __read_seqcount_retry(s, start) \ #define __read_seqcount_retry(s, start) \
do___read_seqcount_retry(seqprop_ptr(s), start) do___read_seqcount_retry(seqprop_const_ptr(s), start)
static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
{ {
...@@ -441,7 +451,7 @@ static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) ...@@ -441,7 +451,7 @@ static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
* Return: true if a read section retry is required, else false * Return: true if a read section retry is required, else false
*/ */
#define read_seqcount_retry(s, start) \ #define read_seqcount_retry(s, start) \
do_read_seqcount_retry(seqprop_ptr(s), start) do_read_seqcount_retry(seqprop_const_ptr(s), start)
static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
{ {
...@@ -574,7 +584,7 @@ static inline void do_write_seqcount_end(seqcount_t *s) ...@@ -574,7 +584,7 @@ static inline void do_write_seqcount_end(seqcount_t *s)
* via WRITE_ONCE): a) to ensure the writes become visible to other threads * via WRITE_ONCE): a) to ensure the writes become visible to other threads
* atomically, avoiding compiler optimizations; b) to document which writes are * atomically, avoiding compiler optimizations; b) to document which writes are
* meant to propagate to the reader critical section. This is necessary because * meant to propagate to the reader critical section. This is necessary because
* neither writes before and after the barrier are enclosed in a seq-writer * neither writes before nor after the barrier are enclosed in a seq-writer
* critical section that would ensure readers are aware of ongoing writes:: * critical section that would ensure readers are aware of ongoing writes::
* *
* seqcount_t seq; * seqcount_t seq;
...@@ -864,7 +874,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) ...@@ -864,7 +874,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
} }
/* /*
* For all seqlock_t write side functions, use the the internal * For all seqlock_t write side functions, use the internal
* do_write_seqcount_begin() instead of generic write_seqcount_begin(). * do_write_seqcount_begin() instead of generic write_seqcount_begin().
* This way, no redundant lockdep_assert_held() checks are added. * This way, no redundant lockdep_assert_held() checks are added.
*/ */
......
...@@ -549,6 +549,16 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, ...@@ -549,6 +549,16 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
unsigned int nr_futexes, unsigned int flags, unsigned int nr_futexes, unsigned int flags,
struct __kernel_timespec __user *timeout, clockid_t clockid); struct __kernel_timespec __user *timeout, clockid_t clockid);
asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags);
asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned long mask,
unsigned int flags, struct __kernel_timespec __user *timespec,
clockid_t clockid);
asmlinkage long sys_futex_requeue(struct futex_waitv __user *waiters,
unsigned int flags, int nr_wake, int nr_requeue);
asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
struct __kernel_timespec __user *rmtp); struct __kernel_timespec __user *rmtp);
asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp, asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
......
...@@ -822,9 +822,15 @@ __SYSCALL(__NR_cachestat, sys_cachestat) ...@@ -822,9 +822,15 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
#define __NR_fchmodat2 452 #define __NR_fchmodat2 452
__SYSCALL(__NR_fchmodat2, sys_fchmodat2) __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
#define __NR_futex_wake 454
__SYSCALL(__NR_futex_wake, sys_futex_wake)
#define __NR_futex_wait 455
__SYSCALL(__NR_futex_wait, sys_futex_wait)
#define __NR_futex_requeue 456
__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
#undef __NR_syscalls #undef __NR_syscalls
#define __NR_syscalls 453 #define __NR_syscalls 457
/* /*
* 32 bit systems traditionally used different * 32 bit systems traditionally used different
......
...@@ -44,10 +44,35 @@ ...@@ -44,10 +44,35 @@
FUTEX_PRIVATE_FLAG) FUTEX_PRIVATE_FLAG)
/* /*
* Flags to specify the bit length of the futex word for futex2 syscalls. * Flags for futex2 syscalls.
* Currently, only 32 is supported. *
* NOTE: these are not pure flags, they can also be seen as:
*
* union {
* u32 flags;
* struct {
* u32 size : 2,
* numa : 1,
* : 4,
* private : 1;
* };
* };
*/ */
#define FUTEX_32 2 #define FUTEX2_SIZE_U8 0x00
#define FUTEX2_SIZE_U16 0x01
#define FUTEX2_SIZE_U32 0x02
#define FUTEX2_SIZE_U64 0x03
#define FUTEX2_NUMA 0x04
/* 0x08 */
/* 0x10 */
/* 0x20 */
/* 0x40 */
#define FUTEX2_PRIVATE FUTEX_PRIVATE_FLAG
#define FUTEX2_SIZE_MASK 0x03
/* do not use */
#define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */
/* /*
* Max numbers of elements in a futex_waitv array * Max numbers of elements in a futex_waitv array
......
...@@ -193,7 +193,7 @@ static u64 get_inode_sequence_number(struct inode *inode) ...@@ -193,7 +193,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
/** /**
* get_futex_key() - Get parameters which are the keys for a futex * get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex * @uaddr: virtual address of the futex
* @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @flags: FLAGS_*
* @key: address where result is stored. * @key: address where result is stored.
* @rw: mapping needs to be read/write (values: FUTEX_READ, * @rw: mapping needs to be read/write (values: FUTEX_READ,
* FUTEX_WRITE) * FUTEX_WRITE)
...@@ -217,14 +217,18 @@ static u64 get_inode_sequence_number(struct inode *inode) ...@@ -217,14 +217,18 @@ static u64 get_inode_sequence_number(struct inode *inode)
* *
* lock_page() might sleep, the caller should not hold a spinlock. * lock_page() might sleep, the caller should not hold a spinlock.
*/ */
int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw) enum futex_access rw)
{ {
unsigned long address = (unsigned long)uaddr; unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
struct page *page, *tail; struct page *page;
struct folio *folio;
struct address_space *mapping; struct address_space *mapping;
int err, ro = 0; int err, ro = 0;
bool fshared;
fshared = flags & FLAGS_SHARED;
/* /*
* The futex address must be "naturally" aligned. * The futex address must be "naturally" aligned.
...@@ -248,7 +252,17 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ...@@ -248,7 +252,17 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
* but access_ok() should be faster than find_vma() * but access_ok() should be faster than find_vma()
*/ */
if (!fshared) { if (!fshared) {
/*
* On no-MMU, shared futexes are treated as private, therefore
* we must not include the current process in the key. Since
* there is only one address space, the address is a unique key
* on its own.
*/
if (IS_ENABLED(CONFIG_MMU))
key->private.mm = mm; key->private.mm = mm;
else
key->private.mm = NULL;
key->private.address = address; key->private.address = address;
return 0; return 0;
} }
...@@ -273,54 +287,52 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ...@@ -273,54 +287,52 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
err = 0; err = 0;
/* /*
* The treatment of mapping from this point on is critical. The page * The treatment of mapping from this point on is critical. The folio
* lock protects many things but in this context the page lock * lock protects many things but in this context the folio lock
* stabilizes mapping, prevents inode freeing in the shared * stabilizes mapping, prevents inode freeing in the shared
* file-backed region case and guards against movement to swap cache. * file-backed region case and guards against movement to swap cache.
* *
* Strictly speaking the page lock is not needed in all cases being * Strictly speaking the folio lock is not needed in all cases being
* considered here and page lock forces unnecessarily serialization * considered here and folio lock forces unnecessary serialization.
* From this point on, mapping will be re-verified if necessary and * From this point on, mapping will be re-verified if necessary and
* page lock will be acquired only if it is unavoidable * folio lock will be acquired only if it is unavoidable
* *
* Mapping checks require the head page for any compound page so the * Mapping checks require the folio so it is looked up now. For
* head page and mapping is looked up now. For anonymous pages, it * anonymous pages, it does not matter if the folio is split
* does not matter if the page splits in the future as the key is * in the future as the key is based on the address. For
* based on the address. For filesystem-backed pages, the tail is * filesystem-backed pages, the precise page is required as the
* required as the index of the page determines the key. For * index of the page determines the key.
* base pages, there is no tail page and tail == page.
*/ */
tail = page; folio = page_folio(page);
page = compound_head(page); mapping = READ_ONCE(folio->mapping);
mapping = READ_ONCE(page->mapping);
/* /*
* If page->mapping is NULL, then it cannot be a PageAnon * If folio->mapping is NULL, then it cannot be an anonymous
* page; but it might be the ZERO_PAGE or in the gate area or * page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail); * in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast * or it may have been a good file page when get_user_pages_fast
* found it, but truncated or holepunched or subjected to * found it, but truncated or holepunched or subjected to
* invalidate_complete_page2 before we got the page lock (also * invalidate_complete_page2 before we got the folio lock (also
* cases which we are happy to fail). And we hold a reference, * cases which we are happy to fail). And we hold a reference,
* so refcount care in invalidate_inode_page's remove_mapping * so refcount care in invalidate_inode_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us. * prevents drop_caches from setting mapping to NULL beneath us.
* *
* The case we do have to guard against is when memory pressure made * The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us: * shmem_writepage move it from filecache to swapcache beneath us:
* an unlikely race, but we do need to retry for page->mapping. * an unlikely race, but we do need to retry for folio->mapping.
*/ */
if (unlikely(!mapping)) { if (unlikely(!mapping)) {
int shmem_swizzled; int shmem_swizzled;
/* /*
* Page lock is required to identify which special case above * Folio lock is required to identify which special case above
* applies. If this is really a shmem page then the page lock * applies. If this is really a shmem page then the folio lock
* will prevent unexpected transitions. * will prevent unexpected transitions.
*/ */
lock_page(page); folio_lock(folio);
shmem_swizzled = PageSwapCache(page) || page->mapping; shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
unlock_page(page); folio_unlock(folio);
put_page(page); folio_put(folio);
if (shmem_swizzled) if (shmem_swizzled)
goto again; goto again;
...@@ -331,14 +343,14 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ...@@ -331,14 +343,14 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
/* /*
* Private mappings are handled in a simple way. * Private mappings are handled in a simple way.
* *
* If the futex key is stored on an anonymous page, then the associated * If the futex key is stored in anonymous memory, then the associated
* object is the mm which is implicitly pinned by the calling process. * object is the mm which is implicitly pinned by the calling process.
* *
* NOTE: When userspace waits on a MAP_SHARED mapping, even if * NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to * it's a read-only handle, it's expected that futexes attach to
* the object not the particular process. * the object not the particular process.
*/ */
if (PageAnon(page)) { if (folio_test_anon(folio)) {
/* /*
* A RO anonymous page will never change and thus doesn't make * A RO anonymous page will never change and thus doesn't make
* sense for futex operations. * sense for futex operations.
...@@ -357,10 +369,10 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ...@@ -357,10 +369,10 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
/* /*
* The associated futex object in this case is the inode and * The associated futex object in this case is the inode and
* the page->mapping must be traversed. Ordinarily this should * the folio->mapping must be traversed. Ordinarily this should
* be stabilised under page lock but it's not strictly * be stabilised under folio lock but it's not strictly
* necessary in this case as we just want to pin the inode, not * necessary in this case as we just want to pin the inode, not
* update the radix tree or anything like that. * update i_pages or anything like that.
* *
* The RCU read lock is taken as the inode is finally freed * The RCU read lock is taken as the inode is finally freed
* under RCU. If the mapping still matches expectations then the * under RCU. If the mapping still matches expectations then the
...@@ -368,9 +380,9 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ...@@ -368,9 +380,9 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
*/ */
rcu_read_lock(); rcu_read_lock();
if (READ_ONCE(page->mapping) != mapping) { if (READ_ONCE(folio->mapping) != mapping) {
rcu_read_unlock(); rcu_read_unlock();
put_page(page); folio_put(folio);
goto again; goto again;
} }
...@@ -378,19 +390,19 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ...@@ -378,19 +390,19 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
inode = READ_ONCE(mapping->host); inode = READ_ONCE(mapping->host);
if (!inode) { if (!inode) {
rcu_read_unlock(); rcu_read_unlock();
put_page(page); folio_put(folio);
goto again; goto again;
} }
key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode); key->shared.i_seq = get_inode_sequence_number(inode);
key->shared.pgoff = page_to_pgoff(tail); key->shared.pgoff = folio->index + folio_page_idx(folio, page);
rcu_read_unlock(); rcu_read_unlock();
} }
out: out:
put_page(page); folio_put(folio);
return err; return err;
} }
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <linux/futex.h> #include <linux/futex.h>
#include <linux/rtmutex.h> #include <linux/rtmutex.h>
#include <linux/sched/wake_q.h> #include <linux/sched/wake_q.h>
#include <linux/compat.h>
#ifdef CONFIG_PREEMPT_RT #ifdef CONFIG_PREEMPT_RT
#include <linux/rcuwait.h> #include <linux/rcuwait.h>
...@@ -16,17 +17,84 @@ ...@@ -16,17 +17,84 @@
* Futex flags used to encode options to functions and preserve them across * Futex flags used to encode options to functions and preserve them across
* restarts. * restarts.
*/ */
#define FLAGS_SIZE_8 0x0000
#define FLAGS_SIZE_16 0x0001
#define FLAGS_SIZE_32 0x0002
#define FLAGS_SIZE_64 0x0003
#define FLAGS_SIZE_MASK 0x0003
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
# define FLAGS_SHARED 0x01 # define FLAGS_SHARED 0x0010
#else #else
/* /*
* NOMMU does not have per process address space. Let the compiler optimize * NOMMU does not have per process address space. Let the compiler optimize
* code away. * code away.
*/ */
# define FLAGS_SHARED 0x00 # define FLAGS_SHARED 0x0000
#endif #endif
#define FLAGS_CLOCKRT 0x02 #define FLAGS_CLOCKRT 0x0020
#define FLAGS_HAS_TIMEOUT 0x04 #define FLAGS_HAS_TIMEOUT 0x0040
#define FLAGS_NUMA 0x0080
#define FLAGS_STRICT 0x0100
/*
 * FUTEX_ to FLAGS_
 *
 * Convert a legacy futex op word into the internal FLAGS_* encoding.
 * The size is always FLAGS_SIZE_32; FLAGS_SHARED is added when
 * FUTEX_PRIVATE_FLAG is absent and FLAGS_CLOCKRT when
 * FUTEX_CLOCK_REALTIME is present.
 */
static inline unsigned int futex_to_flags(unsigned int op)
{
	unsigned int shared  = (op & FUTEX_PRIVATE_FLAG) ? 0 : FLAGS_SHARED;
	unsigned int clockrt = (op & FUTEX_CLOCK_REALTIME) ? FLAGS_CLOCKRT : 0;

	return FLAGS_SIZE_32 | shared | clockrt;
}
/*
 * FUTEX2_ to FLAGS_
 *
 * Convert futex2 syscall flags into the internal FLAGS_* encoding.
 * The size bits (FUTEX2_SIZE_MASK) are carried over as-is; FLAGS_SHARED
 * is added when FUTEX2_PRIVATE is absent and FLAGS_NUMA when FUTEX2_NUMA
 * is present.
 */
static inline unsigned int futex2_to_flags(unsigned int flags2)
{
	unsigned int size   = flags2 & FUTEX2_SIZE_MASK;
	unsigned int shared = (flags2 & FUTEX2_PRIVATE) ? 0 : FLAGS_SHARED;
	unsigned int numa   = (flags2 & FUTEX2_NUMA) ? FLAGS_NUMA : 0;

	return size | shared | numa;
}
/*
 * Size in bytes of the futex word selected by the FLAGS_SIZE_* field:
 * the field value is used as a power-of-two exponent.
 */
static inline unsigned int futex_size(unsigned int flags)
{
	unsigned int shift = flags & FLAGS_SIZE_MASK;

	return 1 << shift;
}
/*
 * Check whether the size encoded in @flags is acceptable.
 *
 * Return: true only for FLAGS_SIZE_32; every other size is rejected.
 */
static inline bool futex_flags_valid(unsigned int flags)
{
	unsigned int size = flags & FLAGS_SIZE_MASK;
	bool native64 = IS_ENABLED(CONFIG_64BIT) && !in_compat_syscall();

	/* Only 64bit futexes for 64bit code */
	if (!native64 && size == FLAGS_SIZE_64)
		return false;

	/* Only 32bit futexes are implemented -- for now */
	return size == FLAGS_SIZE_32;
}
/*
 * Check that @val fits in the futex word size selected by @flags.
 *
 * Return: false if @val has any bit set above the word width, true
 * otherwise. Widths of 64 bits or more accept every value, and the
 * short-circuit keeps the shift count below 64.
 */
static inline bool futex_validate_input(unsigned int flags, u64 val)
{
	int bits = 8 * futex_size(flags);

	return bits >= 64 || !(val >> bits);
}
#ifdef CONFIG_FAIL_FUTEX #ifdef CONFIG_FAIL_FUTEX
extern bool should_fail_futex(bool fshared); extern bool should_fail_futex(bool fshared);
...@@ -116,7 +184,7 @@ enum futex_access { ...@@ -116,7 +184,7 @@ enum futex_access {
FUTEX_WRITE FUTEX_WRITE
}; };
extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw); enum futex_access rw);
extern struct hrtimer_sleeper * extern struct hrtimer_sleeper *
...@@ -260,10 +328,14 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 ...@@ -260,10 +328,14 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
val, ktime_t *abs_time, u32 bitset, u32 __user val, ktime_t *abs_time, u32 bitset, u32 __user
*uaddr2); *uaddr2);
extern int futex_requeue(u32 __user *uaddr1, unsigned int flags, extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 __user *uaddr2, unsigned int flags2,
int nr_wake, int nr_requeue,
u32 *cmpval, int requeue_pi); u32 *cmpval, int requeue_pi);
extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
struct hrtimer_sleeper *to, u32 bitset);
extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset); ktime_t *abs_time, u32 bitset);
......
// SPDX-License-Identifier: GPL-2.0-or-later // SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h> #include <linux/sched/task.h>
#include "futex.h" #include "futex.h"
...@@ -610,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, ...@@ -610,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
/* /*
* Caller must hold a reference on @pi_state. * Caller must hold a reference on @pi_state.
*/ */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) static int wake_futex_pi(u32 __user *uaddr, u32 uval,
struct futex_pi_state *pi_state,
struct rt_mutex_waiter *top_waiter)
{ {
struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner; struct task_struct *new_owner;
bool postunlock = false; bool postunlock = false;
DEFINE_RT_WAKE_Q(wqh); DEFINE_RT_WAKE_Q(wqh);
u32 curval, newval; u32 curval, newval;
int ret = 0; int ret = 0;
top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
if (WARN_ON_ONCE(!top_waiter)) {
/*
* As per the comment in futex_unlock_pi() this should not happen.
*
* When this happens, give up our locks and try again, giving
* the futex_lock_pi() instance time to complete, either by
* waiting on the rtmutex or removing itself from the futex
* queue.
*/
ret = -EAGAIN;
goto out_unlock;
}
new_owner = top_waiter->task; new_owner = top_waiter->task;
/* /*
...@@ -945,7 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl ...@@ -945,7 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
to = futex_setup_timer(time, &timeout, flags, 0); to = futex_setup_timer(time, &timeout, flags, 0);
retry: retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
goto out; goto out;
...@@ -1002,6 +990,12 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl ...@@ -1002,6 +990,12 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
goto no_block; goto no_block;
} }
/*
* Must be done before we enqueue the waiter, here is unfortunately
* under the hb lock, but that *should* work because it does nothing.
*/
rt_mutex_pre_schedule();
rt_mutex_init_waiter(&rt_waiter); rt_mutex_init_waiter(&rt_waiter);
/* /*
...@@ -1039,19 +1033,37 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl ...@@ -1039,19 +1033,37 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
cleanup: cleanup:
spin_lock(q.lock_ptr);
/* /*
* If we failed to acquire the lock (deadlock/signal/timeout), we must * If we failed to acquire the lock (deadlock/signal/timeout), we must
* first acquire the hb->lock before removing the lock from the * unwind the above, however we cannot lock hb->lock because
* rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait * rt_mutex already has a waiter enqueued and hb->lock can itself try
* lists consistent. * and enqueue an rt_waiter through rtlock.
*
* Doing the cleanup without holding hb->lock can cause inconsistent
* state between hb and pi_state, but only in the direction of not
* seeing a waiter that is leaving.
*
* See futex_unlock_pi(), it deals with this inconsistency.
*
* There be dragons here, since we must deal with the inconsistency on
* the way out (here), it is impossible to detect/warn about the race
* the other way around (missing an incoming waiter).
* *
* In particular; it is important that futex_unlock_pi() can not * What could possibly go wrong...
* observe this inconsistency.
*/ */
if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
ret = 0; ret = 0;
/*
* Now that the rt_waiter has been dequeued, it is safe to use
* spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
* the pi_state.
*/
spin_lock(q.lock_ptr);
/*
* Waiter is unqueued.
*/
rt_mutex_post_schedule();
no_block: no_block:
/* /*
* Fixup the pi_state owner and possibly acquire the lock if we * Fixup the pi_state owner and possibly acquire the lock if we
...@@ -1117,7 +1129,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -1117,7 +1129,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
if ((uval & FUTEX_TID_MASK) != vpid) if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM; return -EPERM;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
if (ret) if (ret)
return ret; return ret;
...@@ -1132,6 +1144,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -1132,6 +1144,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
top_waiter = futex_top_waiter(hb, &key); top_waiter = futex_top_waiter(hb, &key);
if (top_waiter) { if (top_waiter) {
struct futex_pi_state *pi_state = top_waiter->pi_state; struct futex_pi_state *pi_state = top_waiter->pi_state;
struct rt_mutex_waiter *rt_waiter;
ret = -EINVAL; ret = -EINVAL;
if (!pi_state) if (!pi_state)
...@@ -1144,22 +1157,39 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -1144,22 +1157,39 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
if (pi_state->owner != current) if (pi_state->owner != current)
goto out_unlock; goto out_unlock;
get_pi_state(pi_state);
/* /*
* By taking wait_lock while still holding hb->lock, we ensure * By taking wait_lock while still holding hb->lock, we ensure
* there is no point where we hold neither; and therefore * there is no point where we hold neither; and thereby
* wake_futex_p() must observe a state consistent with what we * wake_futex_pi() must observe any new waiters.
* observed. *
* Since the cleanup: case in futex_lock_pi() removes the
* rt_waiter without holding hb->lock, it is possible for
* wake_futex_pi() to not find a waiter while the above does,
* in this case the waiter is on the way out and it can be
* ignored.
* *
* In particular; this forces __rt_mutex_start_proxy() to * In particular; this forces __rt_mutex_start_proxy() to
* complete such that we're guaranteed to observe the * complete such that we're guaranteed to observe the
* rt_waiter. Also see the WARN in wake_futex_pi(). * rt_waiter.
*/ */
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Futex vs rt_mutex waiter state -- if there are no rt_mutex
* waiters even though futex thinks there are, then the waiter
* is leaving and the uncontended path is safe to take.
*/
rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
if (!rt_waiter) {
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
goto do_uncontended;
}
get_pi_state(pi_state);
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
/* drops pi_state->pi_mutex.wait_lock */ /* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state); ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
put_pi_state(pi_state); put_pi_state(pi_state);
...@@ -1187,6 +1217,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -1187,6 +1217,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
return ret; return ret;
} }
do_uncontended:
/* /*
* We have no kernel internal state, i.e. no waiters in the * We have no kernel internal state, i.e. no waiters in the
* kernel. Waiters which are about to queue themselves are stuck * kernel. Waiters which are about to queue themselves are stuck
......
...@@ -269,7 +269,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ...@@ -269,7 +269,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
union futex_key *key2, struct futex_pi_state **ps, union futex_key *key2, struct futex_pi_state **ps,
struct task_struct **exiting, int set_waiters) struct task_struct **exiting, int set_waiters)
{ {
struct futex_q *top_waiter = NULL; struct futex_q *top_waiter;
u32 curval; u32 curval;
int ret; int ret;
...@@ -346,8 +346,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ...@@ -346,8 +346,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
/** /**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
* @uaddr1: source futex user address * @uaddr1: source futex user address
* @flags: futex flags (FLAGS_SHARED, etc.) * @flags1: futex flags (FLAGS_SHARED, etc.)
* @uaddr2: target futex user address * @uaddr2: target futex user address
* @flags2: futex flags (FLAGS_SHARED, etc.)
* @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
* @nr_requeue: number of waiters to requeue (0-INT_MAX) * @nr_requeue: number of waiters to requeue (0-INT_MAX)
* @cmpval: @uaddr1 expected value (or %NULL) * @cmpval: @uaddr1 expected value (or %NULL)
...@@ -361,7 +362,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ...@@ -361,7 +362,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
* - >=0 - on success, the number of tasks requeued or woken; * - >=0 - on success, the number of tasks requeued or woken;
* - <0 - on error * - <0 - on error
*/ */
int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
u32 __user *uaddr2, unsigned int flags2,
int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
{ {
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
...@@ -424,10 +426,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, ...@@ -424,10 +426,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
} }
retry: retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
return ret; return ret;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, ret = get_futex_key(uaddr2, flags2, &key2,
requeue_pi ? FUTEX_WRITE : FUTEX_READ); requeue_pi ? FUTEX_WRITE : FUTEX_READ);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
return ret; return ret;
...@@ -459,7 +461,7 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, ...@@ -459,7 +461,7 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
if (ret) if (ret)
return ret; return ret;
if (!(flags & FLAGS_SHARED)) if (!(flags1 & FLAGS_SHARED))
goto retry_private; goto retry_private;
goto retry; goto retry;
...@@ -789,7 +791,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ...@@ -789,7 +791,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/ */
rt_mutex_init_waiter(&rt_waiter); rt_mutex_init_waiter(&rt_waiter);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
goto out; goto out;
...@@ -850,11 +852,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ...@@ -850,11 +852,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
pi_mutex = &q.pi_state->pi_mutex; pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
/* Current is not longer pi_blocked_on */ /*
spin_lock(q.lock_ptr); * See futex_unlock_pi()'s cleanup: comment.
*/
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0; ret = 0;
spin_lock(q.lock_ptr);
debug_rt_mutex_free_waiter(&rt_waiter); debug_rt_mutex_free_waiter(&rt_waiter);
/* /*
* Fixup the pi_state owner and possibly acquire the lock if we * Fixup the pi_state owner and possibly acquire the lock if we
......
// SPDX-License-Identifier: GPL-2.0-or-later // SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/compat.h>
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/time_namespace.h> #include <linux/time_namespace.h>
...@@ -85,15 +84,12 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, ...@@ -85,15 +84,12 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3) u32 __user *uaddr2, u32 val2, u32 val3)
{ {
unsigned int flags = futex_to_flags(op);
int cmd = op & FUTEX_CMD_MASK; int cmd = op & FUTEX_CMD_MASK;
unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG)) if (flags & FLAGS_CLOCKRT) {
flags |= FLAGS_SHARED; if (cmd != FUTEX_WAIT_BITSET &&
cmd != FUTEX_WAIT_REQUEUE_PI &&
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
cmd != FUTEX_LOCK_PI2) cmd != FUTEX_LOCK_PI2)
return -ENOSYS; return -ENOSYS;
} }
...@@ -110,9 +106,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ...@@ -110,9 +106,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_BITSET: case FUTEX_WAKE_BITSET:
return futex_wake(uaddr, flags, val, val3); return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE: case FUTEX_REQUEUE:
return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE: case FUTEX_CMP_REQUEUE:
return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
case FUTEX_WAKE_OP: case FUTEX_WAKE_OP:
return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI: case FUTEX_LOCK_PI:
...@@ -129,7 +125,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ...@@ -129,7 +125,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
uaddr2); uaddr2);
case FUTEX_CMP_REQUEUE_PI: case FUTEX_CMP_REQUEUE_PI:
return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
} }
return -ENOSYS; return -ENOSYS;
} }
...@@ -183,8 +179,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, ...@@ -183,8 +179,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
} }
/* Mask of available flags for each futex in futex_waitv list */ #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
/** /**
* futex_parse_waitv - Parse a waitv array from userspace * futex_parse_waitv - Parse a waitv array from userspace
...@@ -202,16 +197,22 @@ static int futex_parse_waitv(struct futex_vector *futexv, ...@@ -202,16 +197,22 @@ static int futex_parse_waitv(struct futex_vector *futexv,
unsigned int i; unsigned int i;
for (i = 0; i < nr_futexes; i++) { for (i = 0; i < nr_futexes; i++) {
unsigned int flags;
if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
return -EFAULT; return -EFAULT;
if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
return -EINVAL;
flags = futex2_to_flags(aux.flags);
if (!futex_flags_valid(flags))
return -EINVAL; return -EINVAL;
if (!(aux.flags & FUTEX_32)) if (!futex_validate_input(flags, aux.val))
return -EINVAL; return -EINVAL;
futexv[i].w.flags = aux.flags; futexv[i].w.flags = flags;
futexv[i].w.val = aux.val; futexv[i].w.val = aux.val;
futexv[i].w.uaddr = aux.uaddr; futexv[i].w.uaddr = aux.uaddr;
futexv[i].q = futex_q_init; futexv[i].q = futex_q_init;
...@@ -220,6 +221,46 @@ static int futex_parse_waitv(struct futex_vector *futexv, ...@@ -220,6 +221,46 @@ static int futex_parse_waitv(struct futex_vector *futexv,
return 0; return 0;
} }
static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
clockid_t clockid, struct hrtimer_sleeper *to)
{
int flag_clkid = 0, flag_init = 0;
struct timespec64 ts;
ktime_t time;
int ret;
if (!timeout)
return 0;
if (clockid == CLOCK_REALTIME) {
flag_clkid = FLAGS_CLOCKRT;
flag_init = FUTEX_CLOCK_REALTIME;
}
if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
return -EINVAL;
if (get_timespec64(&ts, timeout))
return -EFAULT;
/*
* Since there's no opcode for futex_waitv, use
* FUTEX_WAIT_BITSET that uses absolute timeout as well
*/
ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
if (ret)
return ret;
futex_setup_timer(&time, to, flag_clkid, 0);
return 0;
}
static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
{
hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
}
/** /**
* sys_futex_waitv - Wait on a list of futexes * sys_futex_waitv - Wait on a list of futexes
* @waiters: List of futexes to wait on * @waiters: List of futexes to wait on
...@@ -249,8 +290,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, ...@@ -249,8 +290,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
{ {
struct hrtimer_sleeper to; struct hrtimer_sleeper to;
struct futex_vector *futexv; struct futex_vector *futexv;
struct timespec64 ts;
ktime_t time;
int ret; int ret;
/* This syscall supports no flags for now */ /* This syscall supports no flags for now */
...@@ -260,31 +299,9 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, ...@@ -260,31 +299,9 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
return -EINVAL; return -EINVAL;
if (timeout) { if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
int flag_clkid = 0, flag_init = 0;
if (clockid == CLOCK_REALTIME) {
flag_clkid = FLAGS_CLOCKRT;
flag_init = FUTEX_CLOCK_REALTIME;
}
if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
return -EINVAL;
if (get_timespec64(&ts, timeout))
return -EFAULT;
/*
* Since there's no opcode for futex_waitv, use
* FUTEX_WAIT_BITSET that uses absolute timeout as well
*/
ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
if (ret)
return ret; return ret;
futex_setup_timer(&time, &to, flag_clkid, 0);
}
futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
if (!futexv) { if (!futexv) {
ret = -ENOMEM; ret = -ENOMEM;
...@@ -298,13 +315,125 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, ...@@ -298,13 +315,125 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
kfree(futexv); kfree(futexv);
destroy_timer: destroy_timer:
if (timeout) { if (timeout)
hrtimer_cancel(&to.timer); futex2_destroy_timeout(&to);
destroy_hrtimer_on_stack(&to.timer);
}
return ret; return ret;
} }
/*
* sys_futex_wake - Wake a number of futexes
* @uaddr: Address of the futex(es) to wake
* @mask: bitmask
* @nr: Number of the futexes to wake
* @flags: FUTEX2 flags
*
* Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
* futex2 family of calls.
*/
SYSCALL_DEFINE4(futex_wake,
void __user *, uaddr,
unsigned long, mask,
int, nr,
unsigned int, flags)
{
if (flags & ~FUTEX2_VALID_MASK)
return -EINVAL;
flags = futex2_to_flags(flags);
if (!futex_flags_valid(flags))
return -EINVAL;
if (!futex_validate_input(flags, mask))
return -EINVAL;
return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
}
/*
* sys_futex_wait - Wait on a futex
* @uaddr: Address of the futex to wait on
* @val: Value of @uaddr
* @mask: bitmask
* @flags: FUTEX2 flags
* @timeout: Optional absolute timeout
* @clockid: Clock to be used for the timeout, realtime or monotonic
*
* Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
* futex2 family of calls.
*/
SYSCALL_DEFINE6(futex_wait,
void __user *, uaddr,
unsigned long, val,
unsigned long, mask,
unsigned int, flags,
struct __kernel_timespec __user *, timeout,
clockid_t, clockid)
{
struct hrtimer_sleeper to;
int ret;
if (flags & ~FUTEX2_VALID_MASK)
return -EINVAL;
flags = futex2_to_flags(flags);
if (!futex_flags_valid(flags))
return -EINVAL;
if (!futex_validate_input(flags, val) ||
!futex_validate_input(flags, mask))
return -EINVAL;
if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
return ret;
ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
if (timeout)
futex2_destroy_timeout(&to);
return ret;
}
/*
* sys_futex_requeue - Requeue a waiter from one futex to another
* @waiters: array describing the source and destination futex
* @flags: unused
* @nr_wake: number of futexes to wake
* @nr_requeue: number of futexes to requeue
*
* Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
* futex2 family of calls.
*/
SYSCALL_DEFINE4(futex_requeue,
struct futex_waitv __user *, waiters,
unsigned int, flags,
int, nr_wake,
int, nr_requeue)
{
struct futex_vector futexes[2];
u32 cmpval;
int ret;
if (flags)
return -EINVAL;
if (!waiters)
return -EINVAL;
ret = futex_parse_waitv(futexes, waiters, 2);
if (ret)
return ret;
cmpval = futexes[0].w.val;
return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
nr_wake, nr_requeue, &cmpval, 0);
}
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(set_robust_list, COMPAT_SYSCALL_DEFINE2(set_robust_list,
struct compat_robust_list_head __user *, head, struct compat_robust_list_head __user *, head,
......
...@@ -145,16 +145,19 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ...@@ -145,16 +145,19 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_hash_bucket *hb; struct futex_hash_bucket *hb;
struct futex_q *this, *next; struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT; union futex_key key = FUTEX_KEY_INIT;
int ret;
DEFINE_WAKE_Q(wake_q); DEFINE_WAKE_Q(wake_q);
int ret;
if (!bitset) if (!bitset)
return -EINVAL; return -EINVAL;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
return ret; return ret;
if ((flags & FLAGS_STRICT) && !nr_wake)
return 0;
hb = futex_hash(&key); hb = futex_hash(&key);
/* Make sure we really have tasks to wakeup */ /* Make sure we really have tasks to wakeup */
...@@ -245,10 +248,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, ...@@ -245,10 +248,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
DEFINE_WAKE_Q(wake_q); DEFINE_WAKE_Q(wake_q);
retry: retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
return ret; return ret;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
return ret; return ret;
...@@ -419,11 +422,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo ...@@ -419,11 +422,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo
*/ */
retry: retry:
for (i = 0; i < count; i++) { for (i = 0; i < count; i++) {
if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
continue; continue;
ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
!(vs[i].w.flags & FUTEX_PRIVATE_FLAG), vs[i].w.flags,
&vs[i].q.key, FUTEX_READ); &vs[i].q.key, FUTEX_READ);
if (unlikely(ret)) if (unlikely(ret))
...@@ -435,7 +438,7 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo ...@@ -435,7 +438,7 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo
for (i = 0; i < count; i++) { for (i = 0; i < count; i++) {
u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
struct futex_q *q = &vs[i].q; struct futex_q *q = &vs[i].q;
u32 val = (u32)vs[i].w.val; u32 val = vs[i].w.val;
hb = futex_q_lock(q); hb = futex_q_lock(q);
ret = futex_get_value_locked(&uval, uaddr); ret = futex_get_value_locked(&uval, uaddr);
...@@ -599,7 +602,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ...@@ -599,7 +602,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
* while the syscall executes. * while the syscall executes.
*/ */
retry: retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
return ret; return ret;
...@@ -629,20 +632,18 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ...@@ -629,20 +632,18 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
return ret; return ret;
} }
int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
struct hrtimer_sleeper *to, u32 bitset)
{ {
struct hrtimer_sleeper timeout, *to;
struct restart_block *restart;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init; struct futex_q q = futex_q_init;
struct futex_hash_bucket *hb;
int ret; int ret;
if (!bitset) if (!bitset)
return -EINVAL; return -EINVAL;
q.bitset = bitset; q.bitset = bitset;
to = futex_setup_timer(abs_time, &timeout, flags,
current->timer_slack_ns);
retry: retry:
/* /*
* Prepare to wait on uaddr. On success, it holds hb->lock and q * Prepare to wait on uaddr. On success, it holds hb->lock and q
...@@ -650,18 +651,17 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time ...@@ -650,18 +651,17 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time
*/ */
ret = futex_wait_setup(uaddr, val, flags, &q, &hb); ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret) if (ret)
goto out; return ret;
/* futex_queue and wait for wakeup, timeout, or a signal. */ /* futex_queue and wait for wakeup, timeout, or a signal. */
futex_wait_queue(hb, &q, to); futex_wait_queue(hb, &q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */ /* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
if (!futex_unqueue(&q)) if (!futex_unqueue(&q))
goto out; return 0;
ret = -ETIMEDOUT;
if (to && !to->task) if (to && !to->task)
goto out; return -ETIMEDOUT;
/* /*
* We expect signal_pending(current), but we might be the * We expect signal_pending(current), but we might be the
...@@ -670,10 +670,28 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time ...@@ -670,10 +670,28 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time
if (!signal_pending(current)) if (!signal_pending(current))
goto retry; goto retry;
ret = -ERESTARTSYS; return -ERESTARTSYS;
if (!abs_time) }
goto out;
int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
{
struct hrtimer_sleeper timeout, *to;
struct restart_block *restart;
int ret;
to = futex_setup_timer(abs_time, &timeout, flags,
current->timer_slack_ns);
ret = __futex_wait(uaddr, flags, val, to, bitset);
/* No timeout, nothing to clean up. */
if (!to)
return ret;
hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
if (ret == -ERESTARTSYS) {
restart = &current->restart_block; restart = &current->restart_block;
restart->futex.uaddr = uaddr; restart->futex.uaddr = uaddr;
restart->futex.val = val; restart->futex.val = val;
...@@ -681,13 +699,9 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time ...@@ -681,13 +699,9 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time
restart->futex.bitset = bitset; restart->futex.bitset = bitset;
restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
ret = set_restart_fn(restart, futex_wait_restart); return set_restart_fn(restart, futex_wait_restart);
out:
if (to) {
hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
} }
return ret; return ret;
} }
......
...@@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void) ...@@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void)
struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
int i; int i;
if (!d_counts) if (IS_ERR(d_counts))
goto out; goto out;
/* /*
...@@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void) ...@@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void)
for (i = 0; i < lockevent_num; i++) { for (i = 0; i < lockevent_num; i++) {
if (skip_lockevent(lockevent_names[i])) if (skip_lockevent(lockevent_names[i]))
continue; continue;
if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
(void *)(long)i, &fops_lockevent)) (void *)(long)i, &fops_lockevent)))
goto fail_undo; goto fail_undo;
} }
if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
d_counts, (void *)(long)LOCKEVENT_reset_cnts, d_counts, (void *)(long)LOCKEVENT_reset_cnts,
&fops_lockevent)) &fops_lockevent)))
goto fail_undo; goto fail_undo;
return 0; return 0;
......
...@@ -440,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) ...@@ -440,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
static void seq_time(struct seq_file *m, s64 time) static void seq_time(struct seq_file *m, s64 time)
{ {
char num[15]; char num[22];
snprint_time(num, sizeof(num), time); snprint_time(num, sizeof(num), time);
seq_printf(m, " %14s", num); seq_printf(m, " %14s", num);
......
...@@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, ...@@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
return try_cmpxchg_acquire(&lock->owner, &old, new); return try_cmpxchg_acquire(&lock->owner, &old, new);
} }
static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
{
return rt_mutex_cmpxchg_acquire(lock, NULL, current);
}
static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
struct task_struct *old, struct task_struct *old,
struct task_struct *new) struct task_struct *new)
...@@ -297,6 +302,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, ...@@ -297,6 +302,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
} }
static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
{
/*
* With debug enabled, rt_mutex_cmpxchg_acquire() will always fail.
*
* Avoid unconditionally taking the slow path by using
* rt_mutex_slowtrylock() which is covered by the debug code and can
* acquire a non-contended rtmutex.
*/
return rt_mutex_slowtrylock(lock);
}
static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
struct task_struct *old, struct task_struct *old,
struct task_struct *new) struct task_struct *new)
...@@ -1613,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, ...@@ -1613,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
raw_spin_unlock_irq(&lock->wait_lock); raw_spin_unlock_irq(&lock->wait_lock);
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
schedule(); rt_mutex_schedule();
raw_spin_lock_irq(&lock->wait_lock); raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state); set_current_state(state);
...@@ -1642,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, ...@@ -1642,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
WARN(1, "rtmutex deadlock detected\n"); WARN(1, "rtmutex deadlock detected\n");
while (1) { while (1) {
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
schedule(); rt_mutex_schedule();
} }
} }
...@@ -1737,6 +1756,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, ...@@ -1737,6 +1756,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
unsigned long flags; unsigned long flags;
int ret; int ret;
/*
* Do all pre-schedule work here, before we queue a waiter and invoke
* PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
* otherwise recurse back into task_blocks_on_rt_mutex() through
* rtlock_slowlock() and will then enqueue a second waiter for this
* same task and things get really confusing real fast.
*/
rt_mutex_pre_schedule();
/* /*
* Technically we could use raw_spin_[un]lock_irq() here, but this can * Technically we could use raw_spin_[un]lock_irq() here, but this can
* be called in early boot if the cmpxchg() fast path is disabled * be called in early boot if the cmpxchg() fast path is disabled
...@@ -1748,6 +1776,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, ...@@ -1748,6 +1776,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
raw_spin_lock_irqsave(&lock->wait_lock, flags); raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags); raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
rt_mutex_post_schedule();
return ret; return ret;
} }
...@@ -1755,7 +1784,9 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, ...@@ -1755,7 +1784,9 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
unsigned int state) unsigned int state)
{ {
if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) lockdep_assert(!current->pi_blocked_on);
if (likely(rt_mutex_try_acquire(lock)))
return 0; return 0;
return rt_mutex_slowlock(lock, NULL, state); return rt_mutex_slowlock(lock, NULL, state);
......
...@@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, ...@@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
struct rt_mutex_base *rtm = &rwb->rtmutex; struct rt_mutex_base *rtm = &rwb->rtmutex;
int ret; int ret;
rwbase_pre_schedule();
raw_spin_lock_irq(&rtm->wait_lock); raw_spin_lock_irq(&rtm->wait_lock);
/* /*
...@@ -125,12 +126,15 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, ...@@ -125,12 +126,15 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
rwbase_rtmutex_unlock(rtm); rwbase_rtmutex_unlock(rtm);
trace_contention_end(rwb, ret); trace_contention_end(rwb, ret);
rwbase_post_schedule();
return ret; return ret;
} }
static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
unsigned int state) unsigned int state)
{ {
lockdep_assert(!current->pi_blocked_on);
if (rwbase_read_trylock(rwb)) if (rwbase_read_trylock(rwb))
return 0; return 0;
...@@ -237,6 +241,8 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, ...@@ -237,6 +241,8 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
/* Force readers into slow path */ /* Force readers into slow path */
atomic_sub(READER_BIAS, &rwb->readers); atomic_sub(READER_BIAS, &rwb->readers);
rwbase_pre_schedule();
raw_spin_lock_irqsave(&rtm->wait_lock, flags); raw_spin_lock_irqsave(&rtm->wait_lock, flags);
if (__rwbase_write_trylock(rwb)) if (__rwbase_write_trylock(rwb))
goto out_unlock; goto out_unlock;
...@@ -248,6 +254,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, ...@@ -248,6 +254,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
if (rwbase_signal_pending_state(state, current)) { if (rwbase_signal_pending_state(state, current)) {
rwbase_restore_current_state(); rwbase_restore_current_state();
__rwbase_write_unlock(rwb, 0, flags); __rwbase_write_unlock(rwb, 0, flags);
rwbase_post_schedule();
trace_contention_end(rwb, -EINTR); trace_contention_end(rwb, -EINTR);
return -EINTR; return -EINTR;
} }
...@@ -266,6 +273,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, ...@@ -266,6 +273,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
out_unlock: out_unlock:
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
rwbase_post_schedule();
return 0; return 0;
} }
......
...@@ -1427,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem) ...@@ -1427,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
#define rwbase_signal_pending_state(state, current) \ #define rwbase_signal_pending_state(state, current) \
signal_pending_state(state, current) signal_pending_state(state, current)
#define rwbase_pre_schedule() \
rt_mutex_pre_schedule()
#define rwbase_schedule() \ #define rwbase_schedule() \
schedule() rt_mutex_schedule()
#define rwbase_post_schedule() \
rt_mutex_post_schedule()
#include "rwbase_rt.c" #include "rwbase_rt.c"
......
...@@ -37,6 +37,8 @@ ...@@ -37,6 +37,8 @@
static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
{ {
lockdep_assert(!current->pi_blocked_on);
if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
rtlock_slowlock(rtm); rtlock_slowlock(rtm);
} }
...@@ -184,9 +186,13 @@ static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm) ...@@ -184,9 +186,13 @@ static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
#define rwbase_signal_pending_state(state, current) (0) #define rwbase_signal_pending_state(state, current) (0)
#define rwbase_pre_schedule()
#define rwbase_schedule() \ #define rwbase_schedule() \
schedule_rtlock() schedule_rtlock()
#define rwbase_post_schedule()
#include "rwbase_rt.c" #include "rwbase_rt.c"
/* /*
* The common functions which get wrapped into the rwlock API. * The common functions which get wrapped into the rwlock API.
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/random.h> #include <linux/prandom.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/ww_mutex.h> #include <linux/ww_mutex.h>
...@@ -386,6 +386,19 @@ struct stress { ...@@ -386,6 +386,19 @@ struct stress {
int nlocks; int nlocks;
}; };
struct rnd_state rng;
DEFINE_SPINLOCK(rng_lock);
static inline u32 prandom_u32_below(u32 ceil)
{
u32 ret;
spin_lock(&rng_lock);
ret = prandom_u32_state(&rng) % ceil;
spin_unlock(&rng_lock);
return ret;
}
static int *get_random_order(int count) static int *get_random_order(int count)
{ {
int *order; int *order;
...@@ -399,7 +412,7 @@ static int *get_random_order(int count) ...@@ -399,7 +412,7 @@ static int *get_random_order(int count)
order[n] = n; order[n] = n;
for (n = count - 1; n > 1; n--) { for (n = count - 1; n > 1; n--) {
r = get_random_u32_below(n + 1); r = prandom_u32_below(n + 1);
if (r != n) { if (r != n) {
tmp = order[n]; tmp = order[n];
order[n] = order[r]; order[n] = order[r];
...@@ -452,21 +465,21 @@ static void stress_inorder_work(struct work_struct *work) ...@@ -452,21 +465,21 @@ static void stress_inorder_work(struct work_struct *work)
ww_mutex_unlock(&locks[order[n]]); ww_mutex_unlock(&locks[order[n]]);
if (err == -EDEADLK) { if (err == -EDEADLK) {
if (!time_after(jiffies, stress->timeout)) {
ww_mutex_lock_slow(&locks[order[contended]], &ctx); ww_mutex_lock_slow(&locks[order[contended]], &ctx);
goto retry; goto retry;
} }
}
ww_acquire_fini(&ctx);
if (err) { if (err) {
pr_err_once("stress (%s) failed with %d\n", pr_err_once("stress (%s) failed with %d\n",
__func__, err); __func__, err);
break; break;
} }
ww_acquire_fini(&ctx);
} while (!time_after(jiffies, stress->timeout)); } while (!time_after(jiffies, stress->timeout));
kfree(order); kfree(order);
kfree(stress);
} }
struct reorder_lock { struct reorder_lock {
...@@ -531,7 +544,6 @@ static void stress_reorder_work(struct work_struct *work) ...@@ -531,7 +544,6 @@ static void stress_reorder_work(struct work_struct *work)
list_for_each_entry_safe(ll, ln, &locks, link) list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll); kfree(ll);
kfree(order); kfree(order);
kfree(stress);
} }
static void stress_one_work(struct work_struct *work) static void stress_one_work(struct work_struct *work)
...@@ -552,8 +564,6 @@ static void stress_one_work(struct work_struct *work) ...@@ -552,8 +564,6 @@ static void stress_one_work(struct work_struct *work)
break; break;
} }
} while (!time_after(jiffies, stress->timeout)); } while (!time_after(jiffies, stress->timeout));
kfree(stress);
} }
#define STRESS_INORDER BIT(0) #define STRESS_INORDER BIT(0)
...@@ -564,15 +574,24 @@ static void stress_one_work(struct work_struct *work) ...@@ -564,15 +574,24 @@ static void stress_one_work(struct work_struct *work)
static int stress(int nlocks, int nthreads, unsigned int flags) static int stress(int nlocks, int nthreads, unsigned int flags)
{ {
struct ww_mutex *locks; struct ww_mutex *locks;
int n; struct stress *stress_array;
int n, count;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks) if (!locks)
return -ENOMEM; return -ENOMEM;
stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
GFP_KERNEL);
if (!stress_array) {
kfree(locks);
return -ENOMEM;
}
for (n = 0; n < nlocks; n++) for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class); ww_mutex_init(&locks[n], &ww_class);
count = 0;
for (n = 0; nthreads; n++) { for (n = 0; nthreads; n++) {
struct stress *stress; struct stress *stress;
void (*fn)(struct work_struct *work); void (*fn)(struct work_struct *work);
...@@ -596,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) ...@@ -596,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
if (!fn) if (!fn)
continue; continue;
stress = kmalloc(sizeof(*stress), GFP_KERNEL); stress = &stress_array[count++];
if (!stress)
break;
INIT_WORK(&stress->work, fn); INIT_WORK(&stress->work, fn);
stress->locks = locks; stress->locks = locks;
...@@ -613,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) ...@@ -613,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
for (n = 0; n < nlocks; n++) for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]); ww_mutex_destroy(&locks[n]);
kfree(stress_array);
kfree(locks); kfree(locks);
return 0; return 0;
...@@ -625,6 +643,8 @@ static int __init test_ww_mutex_init(void) ...@@ -625,6 +643,8 @@ static int __init test_ww_mutex_init(void)
printk(KERN_INFO "Beginning ww mutex selftests\n"); printk(KERN_INFO "Beginning ww mutex selftests\n");
prandom_seed_state(&rng, get_random_u64());
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq) if (!wq)
return -ENOMEM; return -ENOMEM;
......
...@@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, ...@@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
} }
mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) { if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
if (ww_ctx) if (ww_ctx)
ww_mutex_set_context_fastpath(lock, ww_ctx); ww_mutex_set_context_fastpath(lock, ww_ctx);
return 0; return 0;
......
...@@ -6720,10 +6720,14 @@ void __noreturn do_task_dead(void) ...@@ -6720,10 +6720,14 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk) static inline void sched_submit_work(struct task_struct *tsk)
{ {
static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
unsigned int task_flags; unsigned int task_flags;
if (task_is_running(tsk)) /*
return; * Establish LD_WAIT_CONFIG context to ensure none of the code called
* will use a blocking primitive -- which would lead to recursion.
*/
lock_map_acquire_try(&sched_map);
task_flags = tsk->flags; task_flags = tsk->flags;
/* /*
...@@ -6749,6 +6753,8 @@ static inline void sched_submit_work(struct task_struct *tsk) ...@@ -6749,6 +6753,8 @@ static inline void sched_submit_work(struct task_struct *tsk)
* make sure to submit it to avoid deadlocks. * make sure to submit it to avoid deadlocks.
*/ */
blk_flush_plug(tsk->plug, true); blk_flush_plug(tsk->plug, true);
lock_map_release(&sched_map);
} }
static void sched_update_worker(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk)
...@@ -6761,16 +6767,26 @@ static void sched_update_worker(struct task_struct *tsk) ...@@ -6761,16 +6767,26 @@ static void sched_update_worker(struct task_struct *tsk)
} }
} }
asmlinkage __visible void __sched schedule(void) static __always_inline void __schedule_loop(unsigned int sched_mode)
{ {
struct task_struct *tsk = current;
sched_submit_work(tsk);
do { do {
preempt_disable(); preempt_disable();
__schedule(SM_NONE); __schedule(sched_mode);
sched_preempt_enable_no_resched(); sched_preempt_enable_no_resched();
} while (need_resched()); } while (need_resched());
}
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
#ifdef CONFIG_RT_MUTEXES
lockdep_assert(!tsk->sched_rt_mutex);
#endif
if (!task_is_running(tsk))
sched_submit_work(tsk);
__schedule_loop(SM_NONE);
sched_update_worker(tsk); sched_update_worker(tsk);
} }
EXPORT_SYMBOL(schedule); EXPORT_SYMBOL(schedule);
...@@ -6834,11 +6850,7 @@ void __sched schedule_preempt_disabled(void) ...@@ -6834,11 +6850,7 @@ void __sched schedule_preempt_disabled(void)
#ifdef CONFIG_PREEMPT_RT #ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void) void __sched notrace schedule_rtlock(void)
{ {
do { __schedule_loop(SM_RTLOCK_WAIT);
preempt_disable();
__schedule(SM_RTLOCK_WAIT);
sched_preempt_enable_no_resched();
} while (need_resched());
} }
NOKPROBE_SYMBOL(schedule_rtlock); NOKPROBE_SYMBOL(schedule_rtlock);
#endif #endif
...@@ -7034,6 +7046,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio) ...@@ -7034,6 +7046,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio)
#ifdef CONFIG_RT_MUTEXES #ifdef CONFIG_RT_MUTEXES
/*
* Would be more useful with typeof()/auto_type but they don't mix with
* bit-fields. Since it's a local thing, use int. Keep the generic sounding
* name such that if someone were to implement this function we get to compare
* notes.
*/
#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
void rt_mutex_pre_schedule(void)
{
lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
sched_submit_work(current);
}
void rt_mutex_schedule(void)
{
lockdep_assert(current->sched_rt_mutex);
__schedule_loop(SM_NONE);
}
void rt_mutex_post_schedule(void)
{
sched_update_worker(current);
lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
}
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{ {
if (pi_task) if (pi_task)
......
...@@ -87,6 +87,9 @@ COND_SYSCALL_COMPAT(set_robust_list); ...@@ -87,6 +87,9 @@ COND_SYSCALL_COMPAT(set_robust_list);
COND_SYSCALL(get_robust_list); COND_SYSCALL(get_robust_list);
COND_SYSCALL_COMPAT(get_robust_list); COND_SYSCALL_COMPAT(get_robust_list);
COND_SYSCALL(futex_waitv); COND_SYSCALL(futex_waitv);
COND_SYSCALL(futex_wake);
COND_SYSCALL(futex_wait);
COND_SYSCALL(futex_requeue);
COND_SYSCALL(kexec_load); COND_SYSCALL(kexec_load);
COND_SYSCALL_COMPAT(kexec_load); COND_SYSCALL_COMPAT(kexec_load);
COND_SYSCALL(init_module); COND_SYSCALL(init_module);
......
...@@ -248,7 +248,7 @@ bool rcuref_put_slowpath(rcuref_t *ref) ...@@ -248,7 +248,7 @@ bool rcuref_put_slowpath(rcuref_t *ref)
* require a retry. If this fails the caller is not * require a retry. If this fails the caller is not
* allowed to deconstruct the object. * allowed to deconstruct the object.
*/ */
if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))
return false; return false;
/* /*
......
...@@ -223,14 +223,15 @@ gen_xchg_fallbacks() ...@@ -223,14 +223,15 @@ gen_xchg_fallbacks()
gen_try_cmpxchg_fallback() gen_try_cmpxchg_fallback()
{ {
local prefix="$1"; shift
local cmpxchg="$1"; shift; local cmpxchg="$1"; shift;
local order="$1"; shift; local suffix="$1"; shift;
cat <<EOF cat <<EOF
#define raw_try_${cmpxchg}${order}(_ptr, _oldp, _new) \\ #define raw_${prefix}try_${cmpxchg}${suffix}(_ptr, _oldp, _new) \\
({ \\ ({ \\
typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \\ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \\
___r = raw_${cmpxchg}${order}((_ptr), ___o, (_new)); \\ ___r = raw_${prefix}${cmpxchg}${suffix}((_ptr), ___o, (_new)); \\
if (unlikely(___r != ___o)) \\ if (unlikely(___r != ___o)) \\
*___op = ___r; \\ *___op = ___r; \\
likely(___r == ___o); \\ likely(___r == ___o); \\
...@@ -259,11 +260,11 @@ gen_try_cmpxchg_order_fallback() ...@@ -259,11 +260,11 @@ gen_try_cmpxchg_order_fallback()
fi fi
printf "#else\n" printf "#else\n"
gen_try_cmpxchg_fallback "${cmpxchg}" "${order}" gen_try_cmpxchg_fallback "" "${cmpxchg}" "${order}"
printf "#endif\n\n" printf "#endif\n\n"
} }
gen_try_cmpxchg_fallbacks() gen_try_cmpxchg_order_fallbacks()
{ {
local cmpxchg="$1"; shift; local cmpxchg="$1"; shift;
...@@ -272,15 +273,17 @@ gen_try_cmpxchg_fallbacks() ...@@ -272,15 +273,17 @@ gen_try_cmpxchg_fallbacks()
done done
} }
gen_cmpxchg_local_fallbacks() gen_def_and_try_cmpxchg_fallback()
{ {
local prefix="$1"; shift
local cmpxchg="$1"; shift local cmpxchg="$1"; shift
local suffix="$1"; shift
printf "#define raw_${cmpxchg} arch_${cmpxchg}\n\n" printf "#define raw_${prefix}${cmpxchg}${suffix} arch_${prefix}${cmpxchg}${suffix}\n\n"
printf "#ifdef arch_try_${cmpxchg}\n" printf "#ifdef arch_${prefix}try_${cmpxchg}${suffix}\n"
printf "#define raw_try_${cmpxchg} arch_try_${cmpxchg}\n" printf "#define raw_${prefix}try_${cmpxchg}${suffix} arch_${prefix}try_${cmpxchg}${suffix}\n"
printf "#else\n" printf "#else\n"
gen_try_cmpxchg_fallback "${cmpxchg}" "" gen_try_cmpxchg_fallback "${prefix}" "${cmpxchg}" "${suffix}"
printf "#endif\n\n" printf "#endif\n\n"
} }
...@@ -302,15 +305,15 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128"; do ...@@ -302,15 +305,15 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128"; do
done done
for cmpxchg in "cmpxchg" "cmpxchg64" "cmpxchg128"; do for cmpxchg in "cmpxchg" "cmpxchg64" "cmpxchg128"; do
gen_try_cmpxchg_fallbacks "${cmpxchg}" gen_try_cmpxchg_order_fallbacks "${cmpxchg}"
done done
for cmpxchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local"; do for cmpxchg in "cmpxchg" "cmpxchg64" "cmpxchg128"; do
gen_cmpxchg_local_fallbacks "${cmpxchg}" "" gen_def_and_try_cmpxchg_fallback "" "${cmpxchg}" "_local"
done done
for cmpxchg in "sync_cmpxchg"; do for cmpxchg in "cmpxchg"; do
printf "#define raw_${cmpxchg} arch_${cmpxchg}\n\n" gen_def_and_try_cmpxchg_fallback "sync_" "${cmpxchg}" ""
done done
grep '^[a-z]' "$1" | while read name meta args; do grep '^[a-z]' "$1" | while read name meta args; do
......
...@@ -169,7 +169,8 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128" "try_cmpxchg" "try_cmpxchg ...@@ -169,7 +169,8 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128" "try_cmpxchg" "try_cmpxchg
done done
done done
for xchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local" "sync_cmpxchg" "try_cmpxchg_local" "try_cmpxchg64_local" "try_cmpxchg128_local"; do for xchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local" "sync_cmpxchg" \
"try_cmpxchg_local" "try_cmpxchg64_local" "try_cmpxchg128_local" "sync_try_cmpxchg"; do
gen_xchg "${xchg}" "" gen_xchg "${xchg}" ""
printf "\n" printf "\n"
done done
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment