Commit 5cf4f52e authored by Jens Axboe's avatar Jens Axboe

io_uring: free io_buffer_list entries via RCU

mmap_lock nests under uring_lock out of necessity, as we may be doing
user copies with uring_lock held. However, for mmap of provided buffer
rings, we attempt to grab uring_lock with mmap_lock already held from
do_mmap(). This makes lockdep, rightfully, complain:

WARNING: possible circular locking dependency detected
6.7.0-rc1-00009-gff3337ebaf94-dirty #4438 Not tainted
------------------------------------------------------
buf-ring.t/442 is trying to acquire lock:
ffff00020e1480a8 (&ctx->uring_lock){+.+.}-{3:3}, at: io_uring_validate_mmap_request.isra.0+0x4c/0x140

but task is already holding lock:
ffff0000dc226190 (&mm->mmap_lock){++++}-{3:3}, at: vm_mmap_pgoff+0x124/0x264

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&mm->mmap_lock){++++}-{3:3}:
       __might_fault+0x90/0xbc
       io_register_pbuf_ring+0x94/0x488
       __arm64_sys_io_uring_register+0x8dc/0x1318
       invoke_syscall+0x5c/0x17c
       el0_svc_common.constprop.0+0x108/0x130
       do_el0_svc+0x2c/0x38
       el0_svc+0x4c/0x94
       el0t_64_sync_handler+0x118/0x124
       el0t_64_sync+0x168/0x16c

-> #0 (&ctx->uring_lock){+.+.}-{3:3}:
       __lock_acquire+0x19a0/0x2d14
       lock_acquire+0x2e0/0x44c
       __mutex_lock+0x118/0x564
       mutex_lock_nested+0x20/0x28
       io_uring_validate_mmap_request.isra.0+0x4c/0x140
       io_uring_mmu_get_unmapped_area+0x3c/0x98
       get_unmapped_area+0xa4/0x158
       do_mmap+0xec/0x5b4
       vm_mmap_pgoff+0x158/0x264
       ksys_mmap_pgoff+0x1d4/0x254
       __arm64_sys_mmap+0x80/0x9c
       invoke_syscall+0x5c/0x17c
       el0_svc_common.constprop.0+0x108/0x130
       do_el0_svc+0x2c/0x38
       el0_svc+0x4c/0x94
       el0t_64_sync_handler+0x118/0x124
       el0t_64_sync+0x168/0x16c

From that mmap(2) path, we really just need to ensure that the buffer
list doesn't go away from underneath us. For the lower indexed entries,
they never go away until the ring is freed and we can always sanely
reference those as long as the caller has a file reference. For the
higher indexed ones in our xarray, we just need to ensure that the
buffer list remains valid while we return the address of it.

Free the higher indexed io_buffer_list entries via RCU. With that we can
avoid needing ->uring_lock inside mmap(2), and simply hold the RCU read
lock around the buffer list lookup and address check.

To ensure that the arrayed lookup either returns a valid fully formulated
entry via RCU lookup, add an 'is_ready' flag that we access with store
and release memory ordering. This isn't needed for the xarray lookups,
but doesn't hurt either. Since this isn't a fast path, retain it across
both types. Similarly, for the allocated array inside the ctx, ensure
we use the proper load/acquire as setup could in theory be running in
parallel with mmap.

While in there, add a few lockdep checks for documentation purposes.

Cc: stable@vger.kernel.org
Fixes: c56e022c ("io_uring: add support for user mapped provided buffer ring")
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 07d6063d
...@@ -3498,9 +3498,9 @@ static void *io_uring_validate_mmap_request(struct file *file, ...@@ -3498,9 +3498,9 @@ static void *io_uring_validate_mmap_request(struct file *file,
unsigned int bgid; unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
mutex_lock(&ctx->uring_lock); rcu_read_lock();
ptr = io_pbuf_get_address(ctx, bgid); ptr = io_pbuf_get_address(ctx, bgid);
mutex_unlock(&ctx->uring_lock); rcu_read_unlock();
if (!ptr) if (!ptr)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
break; break;
......
...@@ -40,19 +40,35 @@ struct io_buf_free { ...@@ -40,19 +40,35 @@ struct io_buf_free {
int inuse; int inuse;
}; };
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
struct io_buffer_list *bl,
unsigned int bgid) unsigned int bgid)
{ {
if (ctx->io_bl && bgid < BGID_ARRAY) if (bl && bgid < BGID_ARRAY)
return &ctx->io_bl[bgid]; return &bl[bgid];
return xa_load(&ctx->io_bl_xa, bgid); return xa_load(&ctx->io_bl_xa, bgid);
} }
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
lockdep_assert_held(&ctx->uring_lock);
return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx, static int io_buffer_add_list(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned int bgid) struct io_buffer_list *bl, unsigned int bgid)
{ {
/*
* Store buffer group ID and finally mark the list as visible.
* The normal lookup doesn't care about the visibility as we're
* always under the ->uring_lock, but the RCU lookup from mmap does.
*/
bl->bgid = bgid; bl->bgid = bgid;
smp_store_release(&bl->is_ready, 1);
if (bgid < BGID_ARRAY) if (bgid < BGID_ARRAY)
return 0; return 0;
...@@ -203,18 +219,19 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, ...@@ -203,18 +219,19 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
static __cold int io_init_bl_list(struct io_ring_ctx *ctx) static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{ {
struct io_buffer_list *bl;
int i; int i;
ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
GFP_KERNEL); if (!bl)
if (!ctx->io_bl)
return -ENOMEM; return -ENOMEM;
for (i = 0; i < BGID_ARRAY; i++) { for (i = 0; i < BGID_ARRAY; i++) {
INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); INIT_LIST_HEAD(&bl[i].buf_list);
ctx->io_bl[i].bgid = i; bl[i].bgid = i;
} }
smp_store_release(&ctx->io_bl, bl);
return 0; return 0;
} }
...@@ -303,7 +320,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) ...@@ -303,7 +320,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
xa_for_each(&ctx->io_bl_xa, index, bl) { xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid); xa_erase(&ctx->io_bl_xa, bl->bgid);
__io_remove_buffers(ctx, bl, -1U); __io_remove_buffers(ctx, bl, -1U);
kfree(bl); kfree_rcu(bl, rcu);
} }
/* /*
...@@ -497,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) ...@@ -497,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
INIT_LIST_HEAD(&bl->buf_list); INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid); ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) { if (ret) {
kfree(bl); /*
* Doesn't need rcu free as it was never visible, but
* let's keep it consistent throughout. Also can't
* be a lower indexed array group, as adding one
* where lookup failed cannot happen.
*/
if (p->bgid >= BGID_ARRAY)
kfree_rcu(bl, rcu);
else
WARN_ON_ONCE(1);
goto err; goto err;
} }
} }
...@@ -636,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) ...@@ -636,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
struct io_buffer_list *bl, *free_bl = NULL; struct io_buffer_list *bl, *free_bl = NULL;
int ret; int ret;
lockdep_assert_held(&ctx->uring_lock);
if (copy_from_user(&reg, arg, sizeof(reg))) if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT; return -EFAULT;
...@@ -690,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) ...@@ -690,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0; return 0;
} }
kfree(free_bl); kfree_rcu(free_bl, rcu);
return ret; return ret;
} }
...@@ -699,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) ...@@ -699,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
struct io_uring_buf_reg reg; struct io_uring_buf_reg reg;
struct io_buffer_list *bl; struct io_buffer_list *bl;
lockdep_assert_held(&ctx->uring_lock);
if (copy_from_user(&reg, arg, sizeof(reg))) if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT; return -EFAULT;
if (reg.resv[0] || reg.resv[1] || reg.resv[2]) if (reg.resv[0] || reg.resv[1] || reg.resv[2])
...@@ -715,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) ...@@ -715,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
__io_remove_buffers(ctx, bl, -1U); __io_remove_buffers(ctx, bl, -1U);
if (bl->bgid >= BGID_ARRAY) { if (bl->bgid >= BGID_ARRAY) {
xa_erase(&ctx->io_bl_xa, bl->bgid); xa_erase(&ctx->io_bl_xa, bl->bgid);
kfree(bl); kfree_rcu(bl, rcu);
} }
return 0; return 0;
} }
...@@ -724,7 +754,15 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) ...@@ -724,7 +754,15 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{ {
struct io_buffer_list *bl; struct io_buffer_list *bl;
bl = io_buffer_get_list(ctx, bgid); bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
/*
* Ensure the list is fully setup. Only strictly needed for RCU lookup
* via mmap, and in that case only for the array indexed groups. For
* the xarray lookups, it's either visible and ready, or not at all.
*/
if (!smp_load_acquire(&bl->is_ready))
return NULL;
if (!bl || !bl->is_mmap) if (!bl || !bl->is_mmap)
return NULL; return NULL;
......
...@@ -15,6 +15,7 @@ struct io_buffer_list { ...@@ -15,6 +15,7 @@ struct io_buffer_list {
struct page **buf_pages; struct page **buf_pages;
struct io_uring_buf_ring *buf_ring; struct io_uring_buf_ring *buf_ring;
}; };
struct rcu_head rcu;
}; };
__u16 bgid; __u16 bgid;
...@@ -28,6 +29,8 @@ struct io_buffer_list { ...@@ -28,6 +29,8 @@ struct io_buffer_list {
__u8 is_mapped; __u8 is_mapped;
/* ring mapped provided buffers, but mmap'ed by application */ /* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap; __u8 is_mmap;
/* bl is visible from an RCU point of view for lookup */
__u8 is_ready;
}; };
struct io_buffer { struct io_buffer {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment