Commit fe6e674c authored by Michel Lespinasse, committed by Linus Torvalds

rwsem: implement support for write lock stealing on the fastpath

When we decide to wake up readers, we must first grant them as many read
locks as necessary, and then actually wake up all these readers.  But in
order to know how many read shares to grant, we must first count the
readers at the head of the queue.  This might take a while if there are
many readers, and we want to be protected against a writer stealing the
lock while we're counting.  To that end, we grant the first reader lock
before counting how many more readers are queued.

We also require some adjustments to the wake_type semantics.

RWSEM_WAKE_NO_ACTIVE used to mean that we had found the count to be
RWSEM_WAITING_BIAS, in which case the rwsem was known to be free as
nobody could steal it while we hold the wait_lock.  This doesn't make
sense once we implement fastpath write lock stealing, so we now use
RWSEM_WAKE_ANY in that case.

Similarly, when rwsem_down_write_failed found that a read lock was
active, it would use RWSEM_WAKE_READ_OWNED which signalled that new
readers could be woken without checking first that the rwsem was
available.  We can't do that anymore since the existing readers might
release their read locks, and a writer could steal the lock before we
wake up additional readers.  So, we have to use a new RWSEM_WAKE_READERS
value to indicate we only want to wake readers, but we don't currently
hold any read lock.
Signed-off-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Acked-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 8cf5322c
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
* Derived from arch/i386/kernel/semaphore.c * Derived from arch/i386/kernel/semaphore.c
* *
* Writer lock-stealing by Alex Shi <alex.shi@intel.com> * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
* and Michel Lespinasse <walken@google.com>
*/ */
#include <linux/rwsem.h> #include <linux/rwsem.h>
#include <linux/sched.h> #include <linux/sched.h>
...@@ -41,13 +42,11 @@ struct rwsem_waiter { ...@@ -41,13 +42,11 @@ struct rwsem_waiter {
enum rwsem_waiter_type type; enum rwsem_waiter_type type;
}; };
/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and enum rwsem_wake_type {
* RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
* since the rwsem value was observed. RWSEM_WAKE_READERS, /* Wake readers only */
*/ RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
#define RWSEM_WAKE_ANY 0 /* Wake whatever's at head of wait list */ };
#define RWSEM_WAKE_NO_ACTIVE 1 /* rwsem was observed with no active thread */
#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */
/* /*
* handle the lock release when processes blocked on it that can now run * handle the lock release when processes blocked on it that can now run
...@@ -60,16 +59,16 @@ struct rwsem_waiter { ...@@ -60,16 +59,16 @@ struct rwsem_waiter {
* - writers are only woken if downgrading is false * - writers are only woken if downgrading is false
*/ */
static struct rw_semaphore * static struct rw_semaphore *
__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
{ {
struct rwsem_waiter *waiter; struct rwsem_waiter *waiter;
struct task_struct *tsk; struct task_struct *tsk;
struct list_head *next; struct list_head *next;
signed long woken, loop, adjustment; signed long oldcount, woken, loop, adjustment;
waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
if (wake_type != RWSEM_WAKE_READ_OWNED) if (wake_type == RWSEM_WAKE_ANY)
/* Wake writer at the front of the queue, but do not /* Wake writer at the front of the queue, but do not
* grant it the lock yet as we want other writers * grant it the lock yet as we want other writers
* to be able to steal it. Readers, on the other hand, * to be able to steal it. Readers, on the other hand,
...@@ -79,24 +78,24 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) ...@@ -79,24 +78,24 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
goto out; goto out;
} }
/* If we come here from up_xxxx(), another thread might have reached /* Writers might steal the lock before we grant it to the next reader.
* rwsem_down_failed_common() before we acquired the spinlock and * We prefer to do the first reader grant before counting readers
* woken up a waiter, making it now active. We prefer to check for * so we can bail out early if a writer stole the lock.
* this first in order to not spend too much time with the spinlock
* held if we're not going to be able to wake up readers in the end.
*
* Note that we do not need to update the rwsem count: any writer
* trying to acquire rwsem will run rwsem_down_write_failed() due
* to the waiting threads and block trying to acquire the spinlock.
*
* We use a dummy atomic update in order to acquire the cache line
* exclusively since we expect to succeed and run the final rwsem
* count adjustment pretty soon.
*/ */
if (wake_type == RWSEM_WAKE_ANY && adjustment = 0;
rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS) if (wake_type != RWSEM_WAKE_READ_OWNED) {
/* Someone grabbed the sem for write already */ adjustment = RWSEM_ACTIVE_READ_BIAS;
try_reader_grant:
oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
/* A writer stole the lock. Undo our reader grant. */
if (rwsem_atomic_update(-adjustment, sem) &
RWSEM_ACTIVE_MASK)
goto out; goto out;
/* Last active locker left. Retry waking readers. */
goto try_reader_grant;
}
}
/* Grant an infinite number of read locks to the readers at the front /* Grant an infinite number of read locks to the readers at the front
* of the queue. Note we increment the 'active part' of the count by * of the queue. Note we increment the 'active part' of the count by
...@@ -114,11 +113,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) ...@@ -114,11 +113,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
} while (waiter->type != RWSEM_WAITING_FOR_WRITE); } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
adjustment = woken * RWSEM_ACTIVE_READ_BIAS; adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
if (waiter->type != RWSEM_WAITING_FOR_WRITE) if (waiter->type != RWSEM_WAITING_FOR_WRITE)
/* hit end of list above */ /* hit end of list above */
adjustment -= RWSEM_WAITING_BIAS; adjustment -= RWSEM_WAITING_BIAS;
if (adjustment)
rwsem_atomic_add(adjustment, sem); rwsem_atomic_add(adjustment, sem);
next = sem->wait_list.next; next = sem->wait_list.next;
...@@ -164,8 +164,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) ...@@ -164,8 +164,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
count = rwsem_atomic_update(adjustment, sem); count = rwsem_atomic_update(adjustment, sem);
/* If there are no active locks, wake the front queued process(es). */ /* If there are no active locks, wake the front queued process(es). */
if (count == RWSEM_WAITING_BIAS) if (!(count & RWSEM_ACTIVE_MASK))
sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
raw_spin_unlock_irq(&sem->wait_lock); raw_spin_unlock_irq(&sem->wait_lock);
...@@ -209,7 +209,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) ...@@ -209,7 +209,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
* any read locks that were queued ahead of us. */ * any read locks that were queued ahead of us. */
if (count > RWSEM_WAITING_BIAS && if (count > RWSEM_WAITING_BIAS &&
adjustment == -RWSEM_ACTIVE_WRITE_BIAS) adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
/* wait until we successfully acquire the lock */ /* wait until we successfully acquire the lock */
set_task_state(tsk, TASK_UNINTERRUPTIBLE); set_task_state(tsk, TASK_UNINTERRUPTIBLE);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment