Commit 9b91d73b authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Add FUTEX_CMP_REQUEUE futex op

From: Jakub Jelinek <jakub@redhat.com>

The FUTEX_REQUEUE operation was added to the kernel mainly to improve
pthread_cond_broadcast, which previously used the FUTEX_WAKE INT_MAX op.
pthread_cond_broadcast releases the internal condvar mutex before the FUTEX_REQUEUE
operation, because otherwise the woken-up thread would most likely immediately go
back to sleep on the internal condvar mutex until the broadcasting thread released it.

Unfortunately this is racy and causes e.g.
http://sources.redhat.com/cgi-bin/cvsweb.cgi/libc/nptl/tst-cond16.c?rev=1.1&content-type=text/x-cvsweb-markup&cvsroot=glibc
to hang on SMP.

http://listman.redhat.com/archives/phil-list/2004-May/msg00023.html contains an
analysis of how the hang happens.  The problem is that if any thread calls
pthread_cond_*wait between the release of the internal condvar mutex and the
FUTEX_REQUEUE operation, the wrong thread might be woken up (and immediately go
back to sleep because it does not satisfy the conditions for returning from
pthread_cond_*wait), while the right thread is requeued onto the associated mutex
and nobody is left to wake it up.

The patch below extends the FUTEX_REQUEUE operation with something FUTEX_WAIT
already uses:

FUTEX_CMP_REQUEUE is passed an additional argument, the expected value of
*futex.  While holding the futex locks, the kernel checks whether *futex !=
expected and returns -EAGAIN in that case; if the values are equal, it continues
with a normal FUTEX_REQUEUE operation.  If the syscall returns -EAGAIN, NPTL
can fall back to the FUTEX_WAKE INT_MAX operation, which does not have this
problem but is less efficient, while in the likely case that nobody hit the
(small) window the efficient FUTEX_REQUEUE path is used.
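
For illustration, a minimal user-space sketch of the fallback described above
(this is not the actual NPTL code): it issues FUTEX_CMP_REQUEUE with the value it
last read from the condvar futex and, if the kernel reports -EAGAIN, retries with
the unconditional FUTEX_WAKE INT_MAX path.  The futex words, the
broadcast_requeue() helper and the raw syscall(2) invocation are illustrative
assumptions; only the op numbers and the argument layout come from this patch.

	/* Minimal sketch, not NPTL: hypothetical futex words and helper. */
	#include <errno.h>
	#include <limits.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef FUTEX_WAKE
	#define FUTEX_WAKE		1
	#endif
	#ifndef FUTEX_CMP_REQUEUE
	#define FUTEX_CMP_REQUEUE	4	/* added by this patch */
	#endif

	static int cond_futex;		/* stand-in for the condvar futex word */
	static int mutex_futex;		/* stand-in for the mutex futex word */

	static void broadcast_requeue(int expected)
	{
		/* Wake one waiter and requeue the rest onto the mutex, but only
		   if the condvar futex word still equals 'expected'.  For the
		   requeue ops the 'utime' slot carries nr_requeue instead of a
		   timeout pointer.  */
		long ret = syscall(SYS_futex, &cond_futex, FUTEX_CMP_REQUEUE,
				   1, (unsigned long) INT_MAX, &mutex_futex,
				   expected);

		if (ret == -1 && errno == EAGAIN)
			/* Someone touched the futex word in the window: fall
			   back to the slower but race-free wake-everybody path. */
			syscall(SYS_futex, &cond_futex, FUTEX_WAKE, INT_MAX,
				NULL, NULL, 0);
	}

	int main(void)
	{
		broadcast_requeue(cond_futex);	/* no waiters: both calls are no-ops */
		return 0;
	}
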
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 673a68e6
@@ -8,9 +8,10 @@
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
 #define FUTEX_REQUEUE (3)
+#define FUTEX_CMP_REQUEUE (4)
 
 long do_futex(unsigned long uaddr, int op, int val,
-	      unsigned long timeout, unsigned long uaddr2, int val2);
+	      unsigned long timeout, unsigned long uaddr2, int val2,
+	      int val3);
 
 #endif

@@ -210,7 +210,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
 #ifdef CONFIG_FUTEX
 asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
-		struct compat_timespec __user *utime, u32 __user *uaddr2)
+		struct compat_timespec __user *utime, u32 __user *uaddr2,
+		int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -221,11 +222,11 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
 			return -EFAULT;
 		timeout = timespec_to_jiffies(&t) + 1;
 	}
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 #endif

@@ -96,6 +96,7 @@ struct futex_q {
  */
 struct futex_hash_bucket {
 	spinlock_t		lock;
+	unsigned int		nqueued;
 	struct list_head	chain;
 };
@@ -318,13 +319,14 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
  * physical page.
  */
 static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-				int nr_wake, int nr_requeue)
+				int nr_wake, int nr_requeue, int *valp)
 {
 	union futex_key key1, key2;
 	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
+	unsigned int nqueued;
 
 	down_read(&current->mm->mmap_sem);
@@ -338,12 +340,41 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
 	bh1 = hash_futex(&key1);
 	bh2 = hash_futex(&key2);
 
+	nqueued = bh1->nqueued;
+	if (likely(valp != NULL)) {
+		int curval;
+
+		/* In order to avoid doing get_user while
+		   holding bh1->lock and bh2->lock, nqueued
+		   (monotonically increasing field) must be first
+		   read, then *uaddr1 fetched from userland and
+		   after acquiring lock nqueued field compared with
+		   the stored value.  The smp_mb () below
+		   makes sure that bh1->nqueued is read from memory
+		   before *uaddr1.  */
+		smp_mb();
+
+		if (get_user(curval, (int *)uaddr1) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (curval != *valp) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
 	if (bh1 < bh2)
 		spin_lock(&bh1->lock);
 	spin_lock(&bh2->lock);
 	if (bh1 > bh2)
 		spin_lock(&bh1->lock);
 
+	if (unlikely(nqueued != bh1->nqueued && valp != NULL)) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
 	head1 = &bh1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
@@ -365,6 +396,7 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
 		}
 	}
 
+out_unlock:
 	spin_unlock(&bh1->lock);
 	if (bh1 != bh2)
 		spin_unlock(&bh2->lock);
@@ -398,6 +430,7 @@ static void queue_me(struct futex_q *q, int fd, struct file *filp)
 	q->lock_ptr = &bh->lock;
 
 	spin_lock(&bh->lock);
+	bh->nqueued++;
 	list_add_tail(&q->list, &bh->chain);
 	spin_unlock(&bh->lock);
 }
@@ -625,7 +658,7 @@ static int futex_fd(unsigned long uaddr, int signal)
 }
 
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2)
+		unsigned long uaddr2, int val2, int val3)
 {
 	int ret;
@@ -641,7 +674,10 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 		ret = futex_fd(uaddr, val);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2);
+		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+		break;
+	case FUTEX_CMP_REQUEUE:
+		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 		break;
 	default:
 		ret = -ENOSYS;
@@ -651,7 +687,8 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
-		struct timespec __user *utime, u32 __user *uaddr2)
+		struct timespec __user *utime, u32 __user *uaddr2,
+		int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -665,11 +702,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 
 static struct super_block *