Commit 6b4f4bc9 authored by Will Deacon

locking/futex: Allow low-level atomic operations to return -EAGAIN

Some futex() operations, including FUTEX_WAKE_OP, require the kernel to
perform an atomic read-modify-write of the futex word via the userspace
mapping. These operations are implemented by each architecture in
arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which
are called in atomic context with the relevant hash bucket locks held.

Although these routines may return -EFAULT in response to a page fault
generated when accessing userspace, they are expected to succeed (i.e.
return 0) in all other cases. This poses a problem for architectures
that do not provide bounded forward progress guarantees or fairness of
contended atomic operations and can lead to starvation in some cases.

In these problematic scenarios, we must return back to the core futex
code so that we can drop the hash bucket locks and reschedule if
necessary, much like we do in the case of a page fault.

Allow architectures to return -EAGAIN from their implementations of
arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which
will cause the core futex code to reschedule if necessary and return
back to the architecture code later on.

Cc: <stable@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
parent 84ff7a09
...@@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, ...@@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{ {
int err;
u32 uninitialized_var(curval); u32 uninitialized_var(curval);
if (unlikely(should_fail_futex(true))) if (unlikely(should_fail_futex(true)))
return -EFAULT; return -EFAULT;
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
return -EFAULT; if (unlikely(err))
return err;
/* If user space value changed, let the caller retry */ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0; return curval != uval ? -EAGAIN : 0;
...@@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ ...@@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
if (unlikely(should_fail_futex(true))) if (unlikely(should_fail_futex(true)))
ret = -EFAULT; ret = -EFAULT;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
ret = -EFAULT; if (!ret && (curval != uval)) {
} else if (curval != uval) {
/* /*
* If a unconditional UNLOCK_PI operation (user space did not * If a unconditional UNLOCK_PI operation (user space did not
* try the TID->0 transition) raced with a waiter setting the * try the TID->0 transition) raced with a waiter setting the
...@@ -1700,32 +1700,32 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, ...@@ -1700,32 +1700,32 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
double_lock_hb(hb1, hb2); double_lock_hb(hb1, hb2);
op_ret = futex_atomic_op_inuser(op, uaddr2); op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) { if (unlikely(op_ret < 0)) {
double_unlock_hb(hb1, hb2); double_unlock_hb(hb1, hb2);
#ifndef CONFIG_MMU if (!IS_ENABLED(CONFIG_MMU) ||
/* unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
* we don't get EFAULT from MMU faults if we don't have an MMU, /*
* but we might get them from range checking * we don't get EFAULT from MMU faults if we don't have
*/ * an MMU, but we might get them from range checking
ret = op_ret; */
goto out_put_keys;
#endif
if (unlikely(op_ret != -EFAULT)) {
ret = op_ret; ret = op_ret;
goto out_put_keys; goto out_put_keys;
} }
ret = fault_in_user_writeable(uaddr2); if (op_ret == -EFAULT) {
if (ret) ret = fault_in_user_writeable(uaddr2);
goto out_put_keys; if (ret)
goto out_put_keys;
}
if (!(flags & FLAGS_SHARED)) if (!(flags & FLAGS_SHARED)) {
cond_resched();
goto retry_private; goto retry_private;
}
put_futex_key(&key2); put_futex_key(&key2);
put_futex_key(&key1); put_futex_key(&key1);
cond_resched();
goto retry; goto retry;
} }
...@@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, ...@@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
u32 uval, uninitialized_var(curval), newval; u32 uval, uninitialized_var(curval), newval;
struct task_struct *oldowner, *newowner; struct task_struct *oldowner, *newowner;
u32 newtid; u32 newtid;
int ret; int ret, err = 0;
lockdep_assert_held(q->lock_ptr); lockdep_assert_held(q->lock_ptr);
...@@ -2421,14 +2421,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, ...@@ -2421,14 +2421,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
if (!pi_state->owner) if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED; newtid |= FUTEX_OWNER_DIED;
if (get_futex_value_locked(&uval, uaddr)) err = get_futex_value_locked(&uval, uaddr);
goto handle_fault; if (err)
goto handle_err;
for (;;) { for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid; newval = (uval & FUTEX_OWNER_DIED) | newtid;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
goto handle_fault; if (err)
goto handle_err;
if (curval == uval) if (curval == uval)
break; break;
uval = curval; uval = curval;
...@@ -2456,23 +2459,37 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, ...@@ -2456,23 +2459,37 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
return 0; return 0;
/* /*
* To handle the page fault we need to drop the locks here. That gives * In order to reschedule or handle a page fault, we need to drop the
* the other task (either the highest priority waiter itself or the * locks here. In the case of a fault, this gives the other task
* task which stole the rtmutex) the chance to try the fixup of the * (either the highest priority waiter itself or the task which stole
* pi_state. So once we are back from handling the fault we need to * the rtmutex) the chance to try the fixup of the pi_state. So once we
* check the pi_state after reacquiring the locks and before trying to * are back from handling the fault we need to check the pi_state after
* do another fixup. When the fixup has been done already we simply * reacquiring the locks and before trying to do another fixup. When
* return. * the fixup has been done already we simply return.
* *
* Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
* drop hb->lock since the caller owns the hb -> futex_q relation. * drop hb->lock since the caller owns the hb -> futex_q relation.
* Dropping the pi_mutex->wait_lock requires the state revalidate. * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/ */
handle_fault: handle_err:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr); spin_unlock(q->lock_ptr);
ret = fault_in_user_writeable(uaddr); switch (err) {
case -EFAULT:
ret = fault_in_user_writeable(uaddr);
break;
case -EAGAIN:
cond_resched();
ret = 0;
break;
default:
WARN_ON_ONCE(1);
ret = err;
break;
}
spin_lock(q->lock_ptr); spin_lock(q->lock_ptr);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
...@@ -3041,10 +3058,8 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -3041,10 +3058,8 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
* A unconditional UNLOCK_PI op raced against a waiter * A unconditional UNLOCK_PI op raced against a waiter
* setting the FUTEX_WAITERS bit. Try again. * setting the FUTEX_WAITERS bit. Try again.
*/ */
if (ret == -EAGAIN) { if (ret == -EAGAIN)
put_futex_key(&key); goto pi_retry;
goto retry;
}
/* /*
* wake_futex_pi has detected invalid state. Tell user * wake_futex_pi has detected invalid state. Tell user
* space. * space.
...@@ -3059,9 +3074,19 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -3059,9 +3074,19 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
* preserve the WAITERS bit not the OWNER_DIED one. We are the * preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner. * owner.
*/ */
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
goto pi_faulted; switch (ret) {
case -EFAULT:
goto pi_faulted;
case -EAGAIN:
goto pi_retry;
default:
WARN_ON_ONCE(1);
goto out_putkey;
}
} }
/* /*
...@@ -3075,6 +3100,11 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -3075,6 +3100,11 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
put_futex_key(&key); put_futex_key(&key);
return ret; return ret;
pi_retry:
put_futex_key(&key);
cond_resched();
goto retry;
pi_faulted: pi_faulted:
put_futex_key(&key); put_futex_key(&key);
...@@ -3435,6 +3465,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, ...@@ -3435,6 +3465,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{ {
u32 uval, uninitialized_var(nval), mval; u32 uval, uninitialized_var(nval), mval;
int err;
/* Futex address must be 32bit aligned */ /* Futex address must be 32bit aligned */
if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
...@@ -3444,42 +3475,57 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int p ...@@ -3444,42 +3475,57 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int p
if (get_user(uval, uaddr)) if (get_user(uval, uaddr))
return -1; return -1;
if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
/* return 0;
* Ok, this dying thread is truly holding a futex
* of interest. Set the OWNER_DIED bit atomically /*
* via cmpxchg, and if the value had FUTEX_WAITERS * Ok, this dying thread is truly holding a futex
* set, wake up a waiter (if any). (We have to do a * of interest. Set the OWNER_DIED bit atomically
* futex_wake() even if OWNER_DIED is already set - * via cmpxchg, and if the value had FUTEX_WAITERS
* to handle the rare but possible case of recursive * set, wake up a waiter (if any). (We have to do a
* thread-death.) The rest of the cleanup is done in * futex_wake() even if OWNER_DIED is already set -
* userspace. * to handle the rare but possible case of recursive
*/ * thread-death.) The rest of the cleanup is done in
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; * userspace.
/* */
* We are not holding a lock here, but we want to have mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
* the pagefault_disable/enable() protection because
* we want to handle the fault gracefully. If the /*
* access fails we try to fault in the futex with R/W * We are not holding a lock here, but we want to have
* verification via get_user_pages. get_user() above * the pagefault_disable/enable() protection because
* does not guarantee R/W access. If that fails we * we want to handle the fault gracefully. If the
* give up and leave the futex locked. * access fails we try to fault in the futex with R/W
*/ * verification via get_user_pages. get_user() above
if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { * does not guarantee R/W access. If that fails we
* give up and leave the futex locked.
*/
if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
switch (err) {
case -EFAULT:
if (fault_in_user_writeable(uaddr)) if (fault_in_user_writeable(uaddr))
return -1; return -1;
goto retry; goto retry;
}
if (nval != uval) case -EAGAIN:
cond_resched();
goto retry; goto retry;
/* default:
* Wake robust non-PI futexes here. The wakeup of WARN_ON_ONCE(1);
* PI futexes happens in exit_pi_state(): return err;
*/ }
if (!pi && (uval & FUTEX_WAITERS))
futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
} }
if (nval != uval)
goto retry;
/*
* Wake robust non-PI futexes here. The wakeup of
* PI futexes happens in exit_pi_state():
*/
if (!pi && (uval & FUTEX_WAITERS))
futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment