Commit ffa059b2 authored by Linus Torvalds

Merge tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
 "This contains the core io_uring updates, of which there are not many,
  and adds support for using WAITID through io_uring and hence not
  needing to block on these kinds of events.

  Outside of that, tweaks to the legacy provided buffer handling and
  some cleanups related to cancelations for uring_cmd support"

* tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux:
  io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups
  io_uring/kbuf: Use slab for struct io_buffer objects
  io_uring/kbuf: Allow the full buffer id space for provided buffers
  io_uring/kbuf: Fix check of BID wrapping in provided buffers
  io_uring/rsrc: cleanup io_pin_pages()
  io_uring: cancelable uring_cmd
  io_uring: retain top 8bits of uring_cmd flags for kernel internal use
  io_uring: add IORING_OP_WAITID support
  exit: add internal include file with helpers
  exit: add kernel_waitid_prepare() helper
  exit: move core of do_wait() into helper
  exit: abstract out should_wake helper for child_wait_callback()
  io_uring/rw: add support for IORING_OP_READ_MULTISHOT
  io_uring/rw: mark readv/writev as vectored in the opcode definition
  io_uring/rw: split io_read() into a helper
parents ca995ce4 6ce4a93d
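
As a rough userspace sketch of the new WAITID support (hypothetical helper, raw SQE field layout taken from io_waitid_prep() further down in this diff; assumes headers new enough to define IORING_OP_WAITID): the idtype goes in sqe->len, the id in sqe->fd, the wait options in sqe->file_index and an optional siginfo pointer in sqe->addr2, while addr, buf_index, addr3 and waitid_flags must stay zero.

#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/wait.h>
#include <linux/io_uring.h>

/* Fill an SQE to wait for child `pid` to exit; completion posts one CQE. */
static void prep_waitid_sqe(struct io_uring_sqe *sqe, pid_t pid, siginfo_t *infop)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode	= IORING_OP_WAITID;
	sqe->fd		= pid;				/* read as iw->upid */
	sqe->len	= P_PID;			/* idtype, read as iw->which */
	sqe->file_index	= WEXITED;			/* options, read as iw->options */
	sqe->addr2	= (uint64_t)(uintptr_t)infop;	/* siginfo destination, may be 0 */
	sqe->user_data	= 0x1234;
	/* addr, buf_index, addr3 and waitid_flags must remain zero or prep fails */
}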
@@ -20,8 +20,15 @@ enum io_uring_cmd_flags {
	IO_URING_F_SQE128		= (1 << 8),
	IO_URING_F_CQE32		= (1 << 9),
	IO_URING_F_IOPOLL		= (1 << 10),
+	/* set when uring wants to cancel a previously issued command */
+	IO_URING_F_CANCEL		= (1 << 11),
};

+/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_CANCELABLE	(1U << 30)
+#define IORING_URING_CMD_POLLED		(1U << 31)
+
struct io_uring_cmd {
	struct file	*file;
	const struct io_uring_sqe *sqe;
@@ -82,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk)
		__io_uring_free(tsk);
}
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags);
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
			struct iov_iter *iter, void *ioucmd)
@@ -122,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
{
	return -EOPNOTSUPP;
}
+static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+}
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return NULL;
+}
#endif
#endif
@@ -265,6 +265,12 @@ struct io_ring_ctx {
		 */
		struct io_wq_work_list	iopoll_list;
		bool			poll_multi_queue;
+
+		/*
+		 * Any cancelable uring_cmd is added to this list in
+		 * ->uring_cmd() by io_uring_cmd_mark_cancelable()
+		 */
+		struct hlist_head	cancelable_uring_cmd;
	} ____cacheline_aligned_in_smp;

	struct {
@@ -313,6 +319,8 @@ struct io_ring_ctx {
	struct list_head	cq_overflow_list;
	struct io_hash_table	cancel_table;

+	struct hlist_head	waitid_list;
+
	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data	*sq_data;	/* if using sq thread polling */
@@ -342,8 +350,6 @@ struct io_ring_ctx {
	struct wait_queue_head		rsrc_quiesce_wq;
	unsigned			rsrc_quiesce;

-	struct list_head	io_buffers_pages;
-
#if defined(CONFIG_UNIX)
	struct socket		*ring_sock;
#endif
......
@@ -65,6 +65,7 @@ struct io_uring_sqe {
		__u32		xattr_flags;
		__u32		msg_ring_flags;
		__u32		uring_cmd_flags;
+		__u32		waitid_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
@@ -240,19 +241,20 @@ enum io_uring_op {
	IORING_OP_URING_CMD,
	IORING_OP_SEND_ZC,
	IORING_OP_SENDMSG_ZC,
+	IORING_OP_READ_MULTISHOT,
+	IORING_OP_WAITID,

	/* this goes last, obviously */
	IORING_OP_LAST,
};

/*
- * sqe->uring_cmd_flags
+ * sqe->uring_cmd_flags		top 8bits aren't available for userspace
 * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
 *				along with setting sqe->buf_index.
- * IORING_URING_CMD_POLLED	driver use only
 */
#define IORING_URING_CMD_FIXED	(1U << 0)
-#define IORING_URING_CMD_POLLED	(1U << 31)
+#define IORING_URING_CMD_MASK	IORING_URING_CMD_FIXED

/*
......
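
Because both new opcodes are appended to the end of enum io_uring_op, an older kernel simply fails them with -EINVAL; a small probe (a sketch assuming liburing's probe helpers and headers that already define the new opcode constants) can check support up front:

#include <stdio.h>
#include <liburing.h>

int main(void)
{
	/* ask the kernel which opcodes this ring implementation supports */
	struct io_uring_probe *p = io_uring_get_probe();

	if (!p)
		return 1;
	printf("READ_MULTISHOT supported: %d\n",
	       io_uring_opcode_supported(p, IORING_OP_READ_MULTISHOT));
	printf("WAITID supported:         %d\n",
	       io_uring_opcode_supported(p, IORING_OP_WAITID));
	io_uring_free_probe(p);
	return 0;
}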
@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
					openclose.o uring_cmd.o epoll.o \
					statx.o net.o msg_ring.o timeout.o \
					sqpoll.o fdinfo.o tctx.o poll.o \
-					cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+					cancel.o kbuf.o rsrc.o rw.o opdef.o \
+					notif.o waitid.o
obj-$(CONFIG_IO_WQ) += io-wq.o
@@ -15,6 +15,7 @@
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
+#include "waitid.h"
#include "cancel.h"

struct io_cancel {
@@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
	if (ret != -ENOENT)
		return ret;

+	ret = io_waitid_cancel(ctx, cd, issue_flags);
+	if (ret != -ENOENT)
+		return ret;
+
	spin_lock(&ctx->completion_lock);
	if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
		ret = io_timeout_cancel(ctx, cd);
......
@@ -92,6 +92,7 @@
#include "cancel.h"
#include "net.h"
#include "notif.h"
+#include "waitid.h"
#include "timeout.h"
#include "poll.h"
@@ -338,7 +339,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
-	INIT_LIST_HEAD(&ctx->io_buffers_pages);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
@@ -348,8 +348,10 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
+	INIT_HLIST_HEAD(&ctx->waitid_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
	return ctx;
err:
	kfree(ctx->cancel_table.hbs);
@@ -3276,6 +3278,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
	return ret;
}

+static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+		struct task_struct *task, bool cancel_all)
+{
+	struct hlist_node *tmp;
+	struct io_kiocb *req;
+	bool ret = false;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+			hash_node) {
+		struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+				struct io_uring_cmd);
+		struct file *file = req->file;
+
+		if (!cancel_all && req->task != task)
+			continue;
+
+		if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+			/* ->sqe isn't available if no async data */
+			if (!req_has_async_data(req))
+				cmd->sqe = NULL;
+			file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
+			ret = true;
+		}
+	}
+	io_submit_flush_completions(ctx);
+	return ret;
+}
+
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
						bool cancel_all)
@@ -3323,6 +3356,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
	ret |= io_cancel_defer_files(ctx, task, cancel_all);
	mutex_lock(&ctx->uring_lock);
	ret |= io_poll_remove_all(ctx, task, cancel_all);
+	ret |= io_waitid_remove_all(ctx, task, cancel_all);
+	ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
	mutex_unlock(&ctx->uring_lock);
	ret |= io_kill_timeouts(ctx, task, cancel_all);
	if (task)
@@ -4686,6 +4721,9 @@ static int __init io_uring_init(void)

	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

+	/* top 8bits are for internal use */
+	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+
	io_uring_optable_init();

	/*
@@ -4701,6 +4739,9 @@ static int __init io_uring_init(void)
				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
				offsetof(struct io_kiocb, cmd.data),
				sizeof_field(struct io_kiocb, cmd.data), NULL);
+	io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
+					  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+					  NULL);

#ifdef CONFIG_SYSCTL
	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
......
@@ -343,6 +343,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
}

extern struct kmem_cache *req_cachep;
+extern struct kmem_cache *io_buf_cachep;

static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{
......
@@ -19,12 +19,17 @@

#define BGID_ARRAY	64

+/* BIDs are addressed by a 16-bit field in a CQE */
+#define MAX_BIDS_PER_BGID (1 << 16)
+
+struct kmem_cache *io_buf_cachep;
+
struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
-	__u16				nbufs;
+	__u32				nbufs;
	__u16				bid;
};
@@ -255,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
+	struct list_head *item, *tmp;
+	struct io_buffer *buf;
	unsigned long index;
	int i;
@@ -270,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
		kfree(bl);
	}

-	while (!list_empty(&ctx->io_buffers_pages)) {
-		struct page *page;
-
-		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
-		list_del_init(&page->lru);
-		__free_page(page);
+	list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
+		buf = list_entry(item, struct io_buffer, list);
+		kmem_cache_free(io_buf_cachep, buf);
	}
}
@@ -289,7 +293,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
-	if (!tmp || tmp > USHRT_MAX)
+	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
@@ -332,7 +336,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
-	if (!tmp || tmp > USHRT_MAX)
+	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
@@ -352,17 +356,18 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
-	if (tmp + p->nbufs >= USHRT_MAX)
+	if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}

+#define IO_BUFFER_ALLOC_BATCH 64
+
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
-	struct io_buffer *buf;
-	struct page *page;
-	int bufs_in_page;
+	struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
+	int allocated;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
@@ -382,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
	/*
	 * No free buffers and no completion entries either. Allocate a new
-	 * page worth of buffer entries and add those to our freelist.
+	 * batch of buffer entries and add those to our freelist.
	 */
-	page = alloc_page(GFP_KERNEL_ACCOUNT);
-	if (!page)
-		return -ENOMEM;
-
-	list_add(&page->lru, &ctx->io_buffers_pages);
-
-	buf = page_address(page);
-	bufs_in_page = PAGE_SIZE / sizeof(*buf);
-	while (bufs_in_page) {
-		list_add_tail(&buf->list, &ctx->io_buffers_cache);
-		buf++;
-		bufs_in_page--;
+	allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
+					  ARRAY_SIZE(bufs), (void **) bufs);
+	if (unlikely(!allocated)) {
+		/*
+		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
+		 * retry single alloc to be on the safe side.
+		 */
+		bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
+		if (!bufs[0])
+			return -ENOMEM;
+		allocated = 1;
	}
+
+	while (allocated)
+		list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
+
	return 0;
}
......
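
With nbufs widened to __u32 and the prep checks switched from USHRT_MAX to MAX_BIDS_PER_BGID, a single IORING_OP_PROVIDE_BUFFERS request can now cover the whole 16-bit buffer ID space (the old `>= USHRT_MAX` check rejected bid 0 plus 65535 buffers). A hedged userspace sketch of such a request, with the SQE field mapping taken from io_provide_buffers_prep() above:

#include <stdint.h>
#include <string.h>
#include <linux/io_uring.h>

#define BUF_SIZE	4096
#define NR_BUFS		65536	/* full BID space: bids 0..65535 in one group */

/* base must point at NR_BUFS * BUF_SIZE bytes of memory owned by the app */
static void prep_provide_buffers(struct io_uring_sqe *sqe, void *base)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode	= IORING_OP_PROVIDE_BUFFERS;
	sqe->fd		= NR_BUFS;			/* nbufs, read from sqe->fd */
	sqe->addr	= (uint64_t)(uintptr_t)base;	/* start of the buffer range */
	sqe->len	= BUF_SIZE;			/* size of each buffer */
	sqe->buf_group	= 0;				/* bgid */
	sqe->off	= 0;				/* first bid */
}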
@@ -33,6 +33,7 @@
#include "poll.h"
#include "cancel.h"
#include "rw.h"
+#include "waitid.h"

static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -63,6 +64,7 @@ const struct io_issue_def io_issue_defs[] = {
		.ioprio			= 1,
		.iopoll			= 1,
		.iopoll_queue		= 1,
+		.vectored		= 1,
		.prep			= io_prep_rw,
		.issue			= io_read,
	},
@@ -76,6 +78,7 @@ const struct io_issue_def io_issue_defs[] = {
		.ioprio			= 1,
		.iopoll			= 1,
		.iopoll_queue		= 1,
+		.vectored		= 1,
		.prep			= io_prep_rw,
		.issue			= io_write,
	},
@@ -428,9 +431,21 @@ const struct io_issue_def io_issue_defs[] = {
		.prep			= io_eopnotsupp_prep,
#endif
	},
+	[IORING_OP_READ_MULTISHOT] = {
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollin			= 1,
+		.buffer_select		= 1,
+		.audit_skip		= 1,
+		.prep			= io_read_mshot_prep,
+		.issue			= io_read_mshot,
+	},
+	[IORING_OP_WAITID] = {
+		.prep			= io_waitid_prep,
+		.issue			= io_waitid,
+	},
};

const struct io_cold_def io_cold_defs[] = {
	[IORING_OP_NOP] = {
		.name			= "NOP",
@@ -648,6 +663,13 @@ const struct io_cold_def io_cold_defs[] = {
		.fail			= io_sendrecv_fail,
#endif
	},
+	[IORING_OP_READ_MULTISHOT] = {
+		.name			= "READ_MULTISHOT",
+	},
+	[IORING_OP_WAITID] = {
+		.name			= "WAITID",
+		.async_size		= sizeof(struct io_waitid_async),
+	},
};

const char *io_uring_get_opcode(u8 opcode)
......
@@ -29,6 +29,8 @@ struct io_issue_def {
	unsigned		iopoll_queue : 1;
	/* opcode specific path will handle ->async_data allocation if needed */
	unsigned		manual_alloc : 1;
+	/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
+	unsigned		vectored : 1;

	int (*issue)(struct io_kiocb *, unsigned int);
	int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
......
@@ -370,7 +370,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask)
	req->io_task_work.func = io_poll_task_func;

	trace_io_uring_task_add(req, mask);
-	io_req_task_work_add(req);
+	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}

static inline void io_poll_execute(struct io_kiocb *req, int res)
......
@@ -1037,39 +1037,36 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct page **pages = NULL;
-	int pret, ret = -ENOMEM;
+	int ret;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;
+	WARN_ON(!nr_pages);

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
-		goto done;
+		return ERR_PTR(-ENOMEM);

-	ret = 0;
	mmap_read_lock(current->mm);
-	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
-			      pages);
-	if (pret == nr_pages)
+	ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
+	mmap_read_unlock(current->mm);
+
+	/* success, mapped all pages */
+	if (ret == nr_pages) {
		*npages = nr_pages;
-	else
-		ret = pret < 0 ? pret : -EFAULT;
+		return pages;
+	}

-	mmap_read_unlock(current->mm);
-	if (ret) {
+	/* partial map, or didn't map anything */
+	if (ret >= 0) {
		/* if we did partial map, release any pages we did get */
-		if (pret > 0)
-			unpin_user_pages(pages, pret);
-		goto done;
+		if (ret)
+			unpin_user_pages(pages, ret);
+		ret = -EFAULT;
	}
-	ret = 0;
-done:
-	if (ret < 0) {
-		kvfree(pages);
-		pages = ERR_PTR(ret);
-	}
-	return pages;
+	kvfree(pages);
+	return ERR_PTR(ret);
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
......
@@ -123,6 +123,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	return 0;
}

+/*
+ * Multishot read is prepared just like a normal read/write request, only
+ * difference is that we set the MULTISHOT flag.
+ */
+int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	int ret;
+
+	ret = io_prep_rw(req, sqe);
+	if (unlikely(ret))
+		return ret;
+
+	req->flags |= REQ_F_APOLL_MULTISHOT;
+	return 0;
+}
+
void io_readv_writev_cleanup(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;
@@ -388,8 +404,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
	buf = u64_to_user_ptr(rw->addr);
	sqe_len = rw->len;

-	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
-	    (req->flags & REQ_F_BUFFER_SELECT)) {
+	if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
		if (io_do_buffer_select(req)) {
			buf = io_buffer_select(req, &sqe_len, issue_flags);
			if (!buf)
@@ -708,7 +723,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
	return 0;
}

-int io_read(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_rw_state __s, *s = &__s;
@@ -776,8 +791,11 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
-		/* if we can poll, just do that */
-		if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+		/*
+		 * If we can poll, just do that. For a vectored read, we'll
+		 * need to copy state first.
+		 */
+		if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
			return -EAGAIN;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -853,7 +871,69 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
	/* it's faster to check here then delegate to kfree */
	if (iovec)
		kfree(iovec);
+	return ret;
+}
+
+int io_read(struct io_kiocb *req, unsigned int issue_flags)
+{
+	int ret;
+
+	ret = __io_read(req, issue_flags);
+	if (ret >= 0)
		return kiocb_done(req, ret, issue_flags);
+
+	return ret;
+}
+
+int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
+{
+	unsigned int cflags = 0;
+	int ret;
+
+	/*
+	 * Multishot MUST be used on a pollable file
+	 */
+	if (!file_can_poll(req->file))
+		return -EBADFD;
+
+	ret = __io_read(req, issue_flags);
+
+	/*
+	 * If we get -EAGAIN, recycle our buffer and just let normal poll
+	 * handling arm it.
+	 */
+	if (ret == -EAGAIN) {
+		io_kbuf_recycle(req, issue_flags);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Any successful return value will keep the multishot read armed.
+	 */
+	if (ret > 0) {
+		/*
+		 * Put our buffer and post a CQE. If we fail to post a CQE, then
+		 * jump to the termination path. This request is then done.
+		 */
+		cflags = io_put_kbuf(req, issue_flags);
+		if (io_fill_cqe_req_aux(req,
+					issue_flags & IO_URING_F_COMPLETE_DEFER,
+					ret, cflags | IORING_CQE_F_MORE)) {
+			if (issue_flags & IO_URING_F_MULTISHOT)
+				return IOU_ISSUE_SKIP_COMPLETE;
+			return -EAGAIN;
+		}
+	}
+
+	/*
+	 * Either an error, or we've hit overflow posting the CQE. For any
+	 * multishot request, hitting overflow will terminate it.
+	 */
+	io_req_set_res(req, ret, cflags);
+	if (issue_flags & IO_URING_F_MULTISHOT)
+		return IOU_STOP_MULTISHOT;
+	return IOU_OK;
}

int io_write(struct io_kiocb *req, unsigned int issue_flags)
......
@@ -23,3 +23,5 @@ int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
+int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
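
A hedged userspace sketch of the new multishot read: the request must target a pollable file (io_read_mshot() above returns -EBADFD otherwise), is normally paired with provided buffers, and each completion carries IORING_CQE_F_MORE while the request stays armed. Hypothetical raw-SQE setup:

#include <stdint.h>
#include <string.h>
#include <linux/io_uring.h>

/*
 * Arm a multishot read on a pollable fd (pipe, socket, ...), selecting
 * buffers from provided-buffer group `bgid`. One CQE is posted per read;
 * keep reaping CQEs until one arrives without IORING_CQE_F_MORE set.
 */
static void prep_read_multishot(struct io_uring_sqe *sqe, int fd, int bgid)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode	= IORING_OP_READ_MULTISHOT;
	sqe->fd		= fd;
	sqe->flags	= IOSQE_BUFFER_SELECT;
	sqe->buf_group	= bgid;
	sqe->addr	= 0;	/* buffer comes from the group */
	sqe->len	= 0;	/* let the selected buffer's size cap each read */
	sqe->off	= 0;
}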
@@ -13,6 +13,51 @@
#include "rsrc.h"
#include "uring_cmd.h"

+static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+	struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
+		return;
+
+	cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
+	io_ring_submit_lock(ctx, issue_flags);
+	hlist_del(&req->hash_node);
+	io_ring_submit_unlock(ctx, issue_flags);
+}
+
+/*
+ * Mark this command as cancelable, then io_uring_try_cancel_uring_cmd()
+ * will try to cancel this issued command by sending ->uring_cmd() with
+ * issue_flags of IO_URING_F_CANCEL.
+ *
+ * The command is guaranteed to not be done when calling ->uring_cmd()
+ * with IO_URING_F_CANCEL, but it is driver's responsibility to deal
+ * with race between io_uring canceling and normal completion.
+ */
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+	struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
+		cmd->flags |= IORING_URING_CMD_CANCELABLE;
+		io_ring_submit_lock(ctx, issue_flags);
+		hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
+		io_ring_submit_unlock(ctx, issue_flags);
+	}
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
+
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return cmd_to_io_kiocb(cmd)->task;
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
+
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -56,6 +101,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
{
	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);

+	io_uring_cmd_del_cancelable(ioucmd, issue_flags);
+
	if (ret < 0)
		req_set_fail(req);
@@ -91,7 +138,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
		return -EINVAL;

	ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
-	if (ioucmd->flags & ~IORING_URING_CMD_FIXED)
+	if (ioucmd->flags & ~IORING_URING_CMD_MASK)
		return -EINVAL;

	if (ioucmd->flags & IORING_URING_CMD_FIXED) {
......
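
On the driver side, the comment above io_uring_cmd_mark_cancelable() translates into roughly the following pattern; mydrv_start_cmd() and mydrv_abort_cmd() are hypothetical driver internals, only the io_uring calls come from this diff:

#include <linux/io_uring.h>

/* Hypothetical ->uring_cmd() handler supporting IO_URING_F_CANCEL. */
static int mydrv_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	if (issue_flags & IO_URING_F_CANCEL) {
		/* the ring is going away or an async cancel matched this command */
		mydrv_abort_cmd(cmd);	/* must cope with a racing normal completion */
		return 0;
	}

	/* kick off the long-running operation, then allow it to be canceled */
	mydrv_start_cmd(cmd);
	io_uring_cmd_mark_cancelable(cmd, issue_flags);

	/*
	 * Later, from the driver's completion path:
	 *	io_uring_cmd_done(cmd, ret, 0, issue_flags);
	 * which also drops the command off the cancelable list.
	 */
	return -EIOCBQUEUED;
}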
// SPDX-License-Identifier: GPL-2.0
/*
* Support for async notification of waitid
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "cancel.h"
#include "waitid.h"
#include "../kernel/exit.h"
static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts);
#define IO_WAITID_CANCEL_FLAG BIT(31)
#define IO_WAITID_REF_MASK GENMASK(30, 0)
struct io_waitid {
struct file *file;
int which;
pid_t upid;
int options;
atomic_t refs;
struct wait_queue_head *head;
struct siginfo __user *infop;
struct waitid_info info;
};
static void io_waitid_free(struct io_kiocb *req)
{
struct io_waitid_async *iwa = req->async_data;
put_pid(iwa->wo.wo_pid);
kfree(req->async_data);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
}
#ifdef CONFIG_COMPAT
static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
{
struct compat_siginfo __user *infop;
bool ret;
infop = (struct compat_siginfo __user *) iw->infop;
if (!user_write_access_begin(infop, sizeof(*infop)))
return false;
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
unsafe_put_user(iw->info.cause, &infop->si_code, Efault);
unsafe_put_user(iw->info.pid, &infop->si_pid, Efault);
unsafe_put_user(iw->info.uid, &infop->si_uid, Efault);
unsafe_put_user(iw->info.status, &infop->si_status, Efault);
ret = true;
done:
user_write_access_end();
return ret;
Efault:
ret = false;
goto done;
}
#endif
static bool io_waitid_copy_si(struct io_kiocb *req, int signo)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
bool ret;
if (!iw->infop)
return true;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
return io_waitid_compat_copy_si(iw, signo);
#endif
if (!user_write_access_begin(iw->infop, sizeof(*iw->infop)))
return false;
unsafe_put_user(signo, &iw->infop->si_signo, Efault);
unsafe_put_user(0, &iw->infop->si_errno, Efault);
unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault);
unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault);
unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault);
unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault);
ret = true;
done:
user_write_access_end();
return ret;
Efault:
ret = false;
goto done;
}
static int io_waitid_finish(struct io_kiocb *req, int ret)
{
int signo = 0;
if (ret > 0) {
signo = SIGCHLD;
ret = 0;
}
if (!io_waitid_copy_si(req, signo))
ret = -EFAULT;
io_waitid_free(req);
return ret;
}
static void io_waitid_complete(struct io_kiocb *req, int ret)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct io_tw_state ts = { .locked = true };
/* anyone completing better be holding a reference */
WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
lockdep_assert_held(&req->ctx->uring_lock);
/*
* Did cancel find it meanwhile?
*/
if (hlist_unhashed(&req->hash_node))
return;
hlist_del_init(&req->hash_node);
ret = io_waitid_finish(req, ret);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
io_req_task_complete(req, &ts);
}
static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct io_waitid_async *iwa = req->async_data;
/*
* Mark us canceled regardless of ownership. This will prevent a
* potential retry from a spurious wakeup.
*/
atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs);
/* claim ownership */
if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
return false;
spin_lock_irq(&iw->head->lock);
list_del_init(&iwa->wo.child_wait.entry);
spin_unlock_irq(&iw->head->lock);
io_waitid_complete(req, -ECANCELED);
return true;
}
int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags)
{
struct hlist_node *tmp;
struct io_kiocb *req;
int nr = 0;
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
return -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
if (req->cqe.user_data != cd->data &&
!(cd->flags & IORING_ASYNC_CANCEL_ANY))
continue;
if (__io_waitid_cancel(ctx, req))
nr++;
if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
break;
}
io_ring_submit_unlock(ctx, issue_flags);
if (nr)
return nr;
return -ENOENT;
}
bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
bool found = false;
lockdep_assert_held(&ctx->uring_lock);
hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
if (!io_match_task_safe(req, task, cancel_all))
continue;
__io_waitid_cancel(ctx, req);
found = true;
}
return found;
}
static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct io_waitid_async *iwa = req->async_data;
if (!atomic_sub_return(1, &iw->refs))
return false;
/*
* Wakeup triggered, racing with us. It was prevented from
* completing because of that, queue up the tw to do that.
*/
req->io_task_work.func = io_waitid_cb;
io_req_task_work_add(req);
remove_wait_queue(iw->head, &iwa->wo.child_wait);
return true;
}
static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_waitid_async *iwa = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
int ret;
io_tw_lock(ctx, ts);
ret = __do_wait(&iwa->wo);
/*
* If we get -ERESTARTSYS here, we need to re-arm and check again
* to ensure we get another callback. If the retry works, then we can
* just remove ourselves from the waitqueue again and finish the
* request.
*/
if (unlikely(ret == -ERESTARTSYS)) {
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
/* Don't retry if cancel found it meanwhile */
ret = -ECANCELED;
if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) {
iw->head = &current->signal->wait_chldexit;
add_wait_queue(iw->head, &iwa->wo.child_wait);
ret = __do_wait(&iwa->wo);
if (ret == -ERESTARTSYS) {
/* retry armed, drop our ref */
io_waitid_drop_issue_ref(req);
return;
}
remove_wait_queue(iw->head, &iwa->wo.child_wait);
}
}
io_waitid_complete(req, ret);
}
static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait);
struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo);
struct io_kiocb *req = iwa->req;
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct task_struct *p = key;
if (!pid_child_should_wake(wo, p))
return 0;
/* cancel is in progress */
if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
return 1;
req->io_task_work.func = io_waitid_cb;
io_req_task_work_add(req);
list_del_init(&wait->entry);
return 1;
}
int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags)
return -EINVAL;
iw->which = READ_ONCE(sqe->len);
iw->upid = READ_ONCE(sqe->fd);
iw->options = READ_ONCE(sqe->file_index);
iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2));
return 0;
}
int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct io_ring_ctx *ctx = req->ctx;
struct io_waitid_async *iwa;
int ret;
if (io_alloc_async_data(req))
return -ENOMEM;
iwa = req->async_data;
iwa->req = req;
ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
iw->options, NULL);
if (ret)
goto done;
/*
* Mark the request as busy upfront, in case we're racing with the
* wakeup. If we are, then we'll notice when we drop this initial
* reference again after arming.
*/
atomic_set(&iw->refs, 1);
/*
* Cancel must hold the ctx lock, so there's no risk of cancelation
* finding us until a) we remain on the list, and b) the lock is
* dropped. We only need to worry about racing with the wakeup
* callback.
*/
io_ring_submit_lock(ctx, issue_flags);
hlist_add_head(&req->hash_node, &ctx->waitid_list);
init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
iwa->wo.child_wait.private = req->task;
iw->head = &current->signal->wait_chldexit;
add_wait_queue(iw->head, &iwa->wo.child_wait);
ret = __do_wait(&iwa->wo);
if (ret == -ERESTARTSYS) {
/*
* Nobody else grabbed a reference, it'll complete when we get
* a waitqueue callback, or if someone cancels it.
*/
if (!io_waitid_drop_issue_ref(req)) {
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
/*
* Wakeup triggered, racing with us. It was prevented from
* completing because of that, queue up the tw to do that.
*/
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
hlist_del_init(&req->hash_node);
remove_wait_queue(iw->head, &iwa->wo.child_wait);
ret = io_waitid_finish(req, ret);
io_ring_submit_unlock(ctx, issue_flags);
done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
// SPDX-License-Identifier: GPL-2.0
#include "../kernel/exit.h"
struct io_waitid_async {
struct io_kiocb *req;
struct wait_opts wo;
};
int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_waitid(struct io_kiocb *req, unsigned int issue_flags);
int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags);
bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
bool cancel_all);
@@ -74,6 +74,8 @@
#include <asm/unistd.h>
#include <asm/mmu_context.h>

+#include "exit.h"
+
/*
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
@@ -1037,26 +1039,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
	return 0;
}

-struct waitid_info {
-	pid_t pid;
-	uid_t uid;
-	int status;
-	int cause;
-};
-
-struct wait_opts {
-	enum pid_type		wo_type;
-	int			wo_flags;
-	struct pid		*wo_pid;
-
-	struct waitid_info	*wo_info;
-	int			wo_stat;
-	struct rusage		*wo_rusage;
-
-	wait_queue_entry_t	child_wait;
-	int			notask_error;
-};
-
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
	return wo->wo_type == PIDTYPE_MAX ||
@@ -1520,6 +1502,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
	return 0;
}

+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+	if (!eligible_pid(wo, p))
+		return false;
+
+	if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+		return false;
+
+	return true;
+}
+
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
				int sync, void *key)
{
@@ -1527,13 +1520,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
						child_wait);
	struct task_struct *p = key;

-	if (!eligible_pid(wo, p))
-		return 0;
-
-	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
-		return 0;
+	if (pid_child_should_wake(wo, p))
+		return default_wake_function(wait, mode, sync, key);

-	return default_wake_function(wait, mode, sync, key);
+	return 0;
}

void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
@@ -1582,16 +1572,10 @@ static int do_wait_pid(struct wait_opts *wo)
	return 0;
}

-static long do_wait(struct wait_opts *wo)
+long __do_wait(struct wait_opts *wo)
{
-	int retval;
+	long retval;

-	trace_sched_process_wait(wo->wo_pid);
-	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
-	wo->child_wait.private = current;
-	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
	/*
	 * If there is nothing that can match our criteria, just get out.
	 * We will clear ->notask_error to zero if we see any child that
@@ -1603,24 +1587,23 @@ static long do_wait(struct wait_opts *wo)
	    (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
		goto notask;

-	set_current_state(TASK_INTERRUPTIBLE);
	read_lock(&tasklist_lock);
	if (wo->wo_type == PIDTYPE_PID) {
		retval = do_wait_pid(wo);
		if (retval)
-			goto end;
+			return retval;
	} else {
		struct task_struct *tsk = current;

		do {
			retval = do_wait_thread(wo, tsk);
			if (retval)
-				goto end;
+				return retval;

			retval = ptrace_do_wait(wo, tsk);
			if (retval)
-				goto end;
+				return retval;

			if (wo->wo_flags & __WNOTHREAD)
				break;
@@ -1630,27 +1613,44 @@ static long do_wait(struct wait_opts *wo)
notask:
	retval = wo->notask_error;
-	if (!retval && !(wo->wo_flags & WNOHANG)) {
-		retval = -ERESTARTSYS;
-		if (!signal_pending(current)) {
-			schedule();
-			goto repeat;
-		}
-	}
-end:
+	if (!retval && !(wo->wo_flags & WNOHANG))
+		return -ERESTARTSYS;
+
+	return retval;
+}
+
+static long do_wait(struct wait_opts *wo)
+{
+	int retval;
+
+	trace_sched_process_wait(wo->wo_pid);
+	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+	wo->child_wait.private = current;
+	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		retval = __do_wait(wo);
+		if (retval != -ERESTARTSYS)
+			break;
+		if (signal_pending(current))
+			break;
+		schedule();
+	} while (1);
+
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
	return retval;
}

-static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
-			  int options, struct rusage *ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+			  struct waitid_info *infop, int options,
+			  struct rusage *ru)
{
-	struct wait_opts wo;
+	unsigned int f_flags = 0;
	struct pid *pid = NULL;
	enum pid_type type;
-	long ret;
-	unsigned int f_flags = 0;

	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
			__WNOTHREAD|__WCLONE|__WALL))
@@ -1693,19 +1693,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
		return -EINVAL;
	}

-	wo.wo_type	= type;
-	wo.wo_pid	= pid;
-	wo.wo_flags	= options;
-	wo.wo_info	= infop;
-	wo.wo_rusage	= ru;
+	wo->wo_type	= type;
+	wo->wo_pid	= pid;
+	wo->wo_flags	= options;
+	wo->wo_info	= infop;
+	wo->wo_rusage	= ru;
	if (f_flags & O_NONBLOCK)
-		wo.wo_flags |= WNOHANG;
+		wo->wo_flags |= WNOHANG;
+
+	return 0;
+}
+
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+			  int options, struct rusage *ru)
+{
+	struct wait_opts wo;
+	long ret;
+
+	ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+	if (ret)
+		return ret;

	ret = do_wait(&wo);
-	if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
+	if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
		ret = -EAGAIN;

-	put_pid(pid);
+	put_pid(wo.wo_pid);
	return ret;
}
......
// SPDX-License-Identifier: GPL-2.0-only
#ifndef LINUX_WAITID_H
#define LINUX_WAITID_H
struct waitid_info {
pid_t pid;
uid_t uid;
int status;
int cause;
};
struct wait_opts {
enum pid_type wo_type;
int wo_flags;
struct pid *wo_pid;
struct waitid_info *wo_info;
int wo_stat;
struct rusage *wo_rusage;
wait_queue_entry_t child_wait;
int notask_error;
};
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
long __do_wait(struct wait_opts *wo);
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
struct waitid_info *infop, int options,
struct rusage *ru);
#endif