Commit abf137dd authored by Jens Axboe

aio: make the lookup_ioctx() lockless

The mm->ioctx_list is currently protected by a reader-writer lock,
so we always grab that lock on the read side for doing ioctx
lookups. As the workload is extremely reader biased, turn this into
an rcu hlist so we can make lookup_ioctx() lockless. Get rid of
the rwlock and use a spinlock for providing update side exclusion.

There's usually only 1 entry on this list, so it doesn't make sense
to look into fancier data structures.
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
parent 392ddc32
...@@ -263,7 +263,7 @@ int s390_enable_sie(void) ...@@ -263,7 +263,7 @@ int s390_enable_sie(void)
/* lets check if we are allowed to replace the mm */ /* lets check if we are allowed to replace the mm */
task_lock(tsk); task_lock(tsk);
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
task_unlock(tsk); task_unlock(tsk);
return -EINVAL; return -EINVAL;
} }
...@@ -279,7 +279,7 @@ int s390_enable_sie(void) ...@@ -279,7 +279,7 @@ int s390_enable_sie(void)
/* Now lets check again if something happened */ /* Now lets check again if something happened */
task_lock(tsk); task_lock(tsk);
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
mmput(mm); mmput(mm);
task_unlock(tsk); task_unlock(tsk);
return -EINVAL; return -EINVAL;
......
...@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx) ...@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx)
kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
} while(0) } while(0)
/* ctx_rcu_free
 *	RCU callback handed to call_rcu() by __put_ioctx(). Releases the
 *	kioctx that embeds @head back to kioctx_cachep and, if the context
 *	had reserved any events, subtracts that reservation from the
 *	system-wide aio_nr counter under aio_nr_lock.
 */
static void ctx_rcu_free(struct rcu_head *head)
{
	struct kioctx *ioctx = container_of(head, struct kioctx, rcu_head);
	/* Sample the reservation now: ioctx must not be touched once freed. */
	unsigned reserved = ioctx->max_reqs;

	kmem_cache_free(kioctx_cachep, ioctx);

	/* Nothing was accounted for this context, so nothing to give back. */
	if (!reserved)
		return;

	spin_lock(&aio_nr_lock);
	/* Unsigned wrap-around here would mean aio_nr < reserved. */
	BUG_ON(aio_nr - reserved > aio_nr);
	aio_nr -= reserved;
	spin_unlock(&aio_nr_lock);
}
/* __put_ioctx /* __put_ioctx
* Called when the last user of an aio context has gone away, * Called when the last user of an aio context has gone away,
...@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx) ...@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx)
*/ */
static void __put_ioctx(struct kioctx *ctx) static void __put_ioctx(struct kioctx *ctx)
{ {
unsigned nr_events = ctx->max_reqs;
BUG_ON(ctx->reqs_active); BUG_ON(ctx->reqs_active);
cancel_delayed_work(&ctx->wq); cancel_delayed_work(&ctx->wq);
...@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx) ...@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx)
mmdrop(ctx->mm); mmdrop(ctx->mm);
ctx->mm = NULL; ctx->mm = NULL;
pr_debug("__put_ioctx: freeing %p\n", ctx); pr_debug("__put_ioctx: freeing %p\n", ctx);
kmem_cache_free(kioctx_cachep, ctx); call_rcu(&ctx->rcu_head, ctx_rcu_free);
if (nr_events) {
spin_lock(&aio_nr_lock);
BUG_ON(aio_nr - nr_events > aio_nr);
aio_nr -= nr_events;
spin_unlock(&aio_nr_lock);
}
} }
#define get_ioctx(kioctx) do { \ #define get_ioctx(kioctx) do { \
...@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ...@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
{ {
struct mm_struct *mm; struct mm_struct *mm;
struct kioctx *ctx; struct kioctx *ctx;
int did_sync = 0;
/* Prevent overflows */ /* Prevent overflows */
if ((nr_events > (0x10000000U / sizeof(struct io_event))) || if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
...@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ...@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
goto out_freectx; goto out_freectx;
/* limit the number of system wide aios */ /* limit the number of system wide aios */
spin_lock(&aio_nr_lock); do {
if (aio_nr + ctx->max_reqs > aio_max_nr || spin_lock_bh(&aio_nr_lock);
aio_nr + ctx->max_reqs < aio_nr) if (aio_nr + nr_events > aio_max_nr ||
aio_nr + nr_events < aio_nr)
ctx->max_reqs = 0; ctx->max_reqs = 0;
else else
aio_nr += ctx->max_reqs; aio_nr += ctx->max_reqs;
spin_unlock(&aio_nr_lock); spin_unlock_bh(&aio_nr_lock);
if (ctx->max_reqs || did_sync)
break;
/* wait for rcu callbacks to have completed before giving up */
synchronize_rcu();
did_sync = 1;
ctx->max_reqs = nr_events;
} while (1);
if (ctx->max_reqs == 0) if (ctx->max_reqs == 0)
goto out_cleanup; goto out_cleanup;
/* now link into global list. */ /* now link into global list. */
write_lock(&mm->ioctx_list_lock); spin_lock(&mm->ioctx_lock);
ctx->next = mm->ioctx_list; hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
mm->ioctx_list = ctx; spin_unlock(&mm->ioctx_lock);
write_unlock(&mm->ioctx_list_lock);
dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, current->mm, ctx->ring_info.nr); ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
...@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) ...@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
*/ */
void exit_aio(struct mm_struct *mm) void exit_aio(struct mm_struct *mm)
{ {
struct kioctx *ctx = mm->ioctx_list; struct kioctx *ctx;
mm->ioctx_list = NULL;
while (ctx) { while (!hlist_empty(&mm->ioctx_list)) {
struct kioctx *next = ctx->next; ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
ctx->next = NULL; hlist_del_rcu(&ctx->list);
aio_cancel_all(ctx); aio_cancel_all(ctx);
wait_for_all_aios(ctx); wait_for_all_aios(ctx);
...@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm) ...@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm)
atomic_read(&ctx->users), ctx->dead, atomic_read(&ctx->users), ctx->dead,
ctx->reqs_active); ctx->reqs_active);
put_ioctx(ctx); put_ioctx(ctx);
ctx = next;
} }
} }
...@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req) ...@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req)
static struct kioctx *lookup_ioctx(unsigned long ctx_id) static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{ {
struct kioctx *ioctx; struct mm_struct *mm = current->mm;
struct mm_struct *mm; struct kioctx *ctx = NULL;
struct hlist_node *n;
rcu_read_lock();
mm = current->mm; hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
read_lock(&mm->ioctx_list_lock); if (ctx->user_id == ctx_id && !ctx->dead) {
for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) get_ioctx(ctx);
if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
get_ioctx(ioctx);
break; break;
} }
read_unlock(&mm->ioctx_list_lock); }
return ioctx; rcu_read_unlock();
return ctx;
} }
/* /*
...@@ -1215,19 +1232,14 @@ static int read_events(struct kioctx *ctx, ...@@ -1215,19 +1232,14 @@ static int read_events(struct kioctx *ctx,
static void io_destroy(struct kioctx *ioctx) static void io_destroy(struct kioctx *ioctx)
{ {
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
struct kioctx **tmp;
int was_dead; int was_dead;
/* delete the entry from the list is someone else hasn't already */ /* delete the entry from the list is someone else hasn't already */
write_lock(&mm->ioctx_list_lock); spin_lock(&mm->ioctx_lock);
was_dead = ioctx->dead; was_dead = ioctx->dead;
ioctx->dead = 1; ioctx->dead = 1;
for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx; hlist_del_rcu(&ioctx->list);
tmp = &(*tmp)->next) spin_unlock(&mm->ioctx_lock);
;
if (*tmp)
*tmp = ioctx->next;
write_unlock(&mm->ioctx_list_lock);
dprintk("aio_release(%p)\n", ioctx); dprintk("aio_release(%p)\n", ioctx);
if (likely(!was_dead)) if (likely(!was_dead))
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/aio_abi.h> #include <linux/aio_abi.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/rcupdate.h>
#include <asm/atomic.h> #include <asm/atomic.h>
...@@ -183,7 +184,7 @@ struct kioctx { ...@@ -183,7 +184,7 @@ struct kioctx {
/* This needs improving */ /* This needs improving */
unsigned long user_id; unsigned long user_id;
struct kioctx *next; struct hlist_node list;
wait_queue_head_t wait; wait_queue_head_t wait;
...@@ -199,6 +200,8 @@ struct kioctx { ...@@ -199,6 +200,8 @@ struct kioctx {
struct aio_ring_info ring_info; struct aio_ring_info ring_info;
struct delayed_work wq; struct delayed_work wq;
struct rcu_head rcu_head;
}; };
/* prototypes */ /* prototypes */
......
...@@ -232,8 +232,9 @@ struct mm_struct { ...@@ -232,8 +232,9 @@ struct mm_struct {
struct core_state *core_state; /* coredumping support */ struct core_state *core_state; /* coredumping support */
/* aio bits */ /* aio bits */
rwlock_t ioctx_list_lock; /* aio lock */ spinlock_t ioctx_lock;
struct kioctx *ioctx_list; struct hlist_head ioctx_list;
#ifdef CONFIG_MM_OWNER #ifdef CONFIG_MM_OWNER
/* /*
* "owner" points to a task that is regarded as the canonical * "owner" points to a task that is regarded as the canonical
......
...@@ -415,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) ...@@ -415,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0); set_mm_counter(mm, anon_rss, 0);
spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->page_table_lock);
rwlock_init(&mm->ioctx_list_lock); spin_lock_init(&mm->ioctx_lock);
mm->ioctx_list = NULL; INIT_HLIST_HEAD(&mm->ioctx_list);
mm->free_area_cache = TASK_UNMAPPED_BASE; mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL; mm->cached_hole_size = ~0UL;
mm_init_owner(mm, p); mm_init_owner(mm, p);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment