Commit c1b7fcf3 authored by Linus Torvalds

Merge tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
 "Fairly quiet round in terms of features, mostly just improvements all
  over the map for existing code. In detail:

   - Initial support for socket operations through io_uring. Latter half
     of this will likely land with the 6.7 kernel, then allowing things
     like get/setsockopt (Breno)

   - Cleanup of the cancel code, and then adding support for canceling
     requests with the opcode as the key (me)

   - Improvements for the io-wq locking (me)

   - Fix affinity setting for SQPOLL based io-wq (me)

   - Remove the io_uring userspace code. These were added initially as
     copies from liburing, but all of them have since bitrotted and are
     way out of date at this point. Rather than attempt to keep them in
     sync, just get rid of them. People will have liburing available
     anyway for these examples. (Pavel)

   - Series improving the CQ/SQ ring caching (Pavel)

   - Misc fixes and cleanups (Pavel, Yue, me)"

* tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux: (47 commits)
  io_uring: move iopoll ctx fields around
  io_uring: move multishot cqe cache in ctx
  io_uring: separate task_work/waiting cache line
  io_uring: banish non-hot data to end of io_ring_ctx
  io_uring: move non aligned field to the end
  io_uring: add option to remove SQ indirection
  io_uring: compact SQ/CQ heads/tails
  io_uring: force inline io_fill_cqe_req
  io_uring: merge iopoll and normal completion paths
  io_uring: reorder cqring_flush and wakeups
  io_uring: optimise extra io_get_cqe null check
  io_uring: refactor __io_get_cqe()
  io_uring: simplify big_cqe handling
  io_uring: cqe init hardening
  io_uring: improve cqe !tracing hot path
  io_uring/rsrc: Annotate struct io_mapped_ubuf with __counted_by
  io_uring/sqpoll: fix io-wq affinity when IORING_SETUP_SQPOLL is used
  io_uring: simplify io_run_task_work_sig return
  io_uring/rsrc: keep one global dummy_ubuf
  io_uring: never overflow io_aux_cqe
  ...
parents adfd6716 644c4a7a
...@@ -10966,7 +10966,6 @@ F: include/linux/io_uring_types.h ...@@ -10966,7 +10966,6 @@ F: include/linux/io_uring_types.h
F: include/trace/events/io_uring.h F: include/trace/events/io_uring.h
F: include/uapi/linux/io_uring.h F: include/uapi/linux/io_uring.h
F: io_uring/ F: io_uring/
F: tools/io_uring/
IPMI SUBSYSTEM IPMI SUBSYSTEM
M: Corey Minyard <minyard@acm.org> M: Corey Minyard <minyard@acm.org>
......
...@@ -81,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk) ...@@ -81,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk)
if (tsk->io_uring) if (tsk->io_uring)
__io_uring_free(tsk); __io_uring_free(tsk);
} }
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
#else #else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd) struct iov_iter *iter, void *ioucmd)
...@@ -116,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode) ...@@ -116,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode)
{ {
return ""; return "";
} }
static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
#endif #endif
#endif #endif
...@@ -69,8 +69,8 @@ struct io_uring_task { ...@@ -69,8 +69,8 @@ struct io_uring_task {
}; };
struct io_uring { struct io_uring {
u32 head ____cacheline_aligned_in_smp; u32 head;
u32 tail ____cacheline_aligned_in_smp; u32 tail;
}; };
/* /*
...@@ -176,7 +176,6 @@ struct io_submit_state { ...@@ -176,7 +176,6 @@ struct io_submit_state {
unsigned short submit_nr; unsigned short submit_nr;
unsigned int cqes_count; unsigned int cqes_count;
struct blk_plug plug; struct blk_plug plug;
struct io_uring_cqe cqes[16];
}; };
struct io_ev_fd { struct io_ev_fd {
...@@ -205,25 +204,17 @@ struct io_ring_ctx { ...@@ -205,25 +204,17 @@ struct io_ring_ctx {
unsigned int has_evfd: 1; unsigned int has_evfd: 1;
/* all CQEs should be posted only by the submitter task */ /* all CQEs should be posted only by the submitter task */
unsigned int task_complete: 1; unsigned int task_complete: 1;
unsigned int lockless_cq: 1;
unsigned int syscall_iopoll: 1; unsigned int syscall_iopoll: 1;
unsigned int poll_activated: 1; unsigned int poll_activated: 1;
unsigned int drain_disabled: 1; unsigned int drain_disabled: 1;
unsigned int compat: 1; unsigned int compat: 1;
enum task_work_notify_mode notify_method; struct task_struct *submitter_task;
struct io_rings *rings;
struct percpu_ref refs;
/* enum task_work_notify_mode notify_method;
* If IORING_SETUP_NO_MMAP is used, then the below holds
* the gup'ed pages for the two rings, and the sqes.
*/
unsigned short n_ring_pages;
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
struct io_rings *rings;
struct task_struct *submitter_task;
struct percpu_ref refs;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* submission data */ /* submission data */
...@@ -261,31 +252,20 @@ struct io_ring_ctx { ...@@ -261,31 +252,20 @@ struct io_ring_ctx {
struct io_buffer_list *io_bl; struct io_buffer_list *io_bl;
struct xarray io_bl_xa; struct xarray io_bl_xa;
struct list_head io_buffers_cache;
struct io_hash_table cancel_table_locked; struct io_hash_table cancel_table_locked;
struct list_head cq_overflow_list;
struct io_alloc_cache apoll_cache; struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache; struct io_alloc_cache netmsg_cache;
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
unsigned long check_cq; /*
* ->iopoll_list is protected by the ctx->uring_lock for
unsigned int file_alloc_start; * io_uring instances that don't use IORING_SETUP_SQPOLL.
unsigned int file_alloc_end; * For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
struct xarray personalities; */
u32 pers_next; struct io_wq_work_list iopoll_list;
bool poll_multi_queue;
} ____cacheline_aligned_in_smp;
struct { struct {
/* /*
...@@ -298,39 +278,55 @@ struct io_ring_ctx { ...@@ -298,39 +278,55 @@ struct io_ring_ctx {
unsigned cached_cq_tail; unsigned cached_cq_tail;
unsigned cq_entries; unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd; struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra; unsigned cq_extra;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/*
* task_work and async notification delivery cacheline. Expected to
* regularly bounce b/w CPUs.
*/
struct { struct {
spinlock_t completion_lock;
bool poll_multi_queue;
atomic_t cq_wait_nr;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct io_wq_work_list iopoll_list;
struct io_hash_table cancel_table;
struct llist_head work_llist; struct llist_head work_llist;
unsigned long check_cq;
struct list_head io_buffers_comp; atomic_t cq_wait_nr;
atomic_t cq_timeouts;
struct wait_queue_head cq_wait;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* timeouts */ /* timeouts */
struct { struct {
spinlock_t timeout_lock; spinlock_t timeout_lock;
atomic_t cq_timeouts;
struct list_head timeout_list; struct list_head timeout_list;
struct list_head ltimeout_list; struct list_head ltimeout_list;
unsigned cq_last_tm_flush; unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
struct io_uring_cqe completion_cqes[16];
spinlock_t completion_lock;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
struct list_head io_buffers_comp;
struct list_head cq_overflow_list;
struct io_hash_table cancel_table;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
unsigned int file_alloc_start;
unsigned int file_alloc_end;
struct xarray personalities;
u32 pers_next;
struct list_head io_buffers_cache;
/* Keep this last, we don't need it for the fast path */ /* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq; struct wait_queue_head poll_wq;
struct io_restriction restrictions; struct io_restriction restrictions;
...@@ -374,6 +370,15 @@ struct io_ring_ctx { ...@@ -374,6 +370,15 @@ struct io_ring_ctx {
unsigned sq_thread_idle; unsigned sq_thread_idle;
/* protected by ->completion_lock */ /* protected by ->completion_lock */
unsigned evfd_last_cq_tail; unsigned evfd_last_cq_tail;
/*
* If IORING_SETUP_NO_MMAP is used, then the below holds
* the gup'ed pages for the two rings, and the sqes.
*/
unsigned short n_ring_pages;
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
}; };
struct io_tw_state { struct io_tw_state {
...@@ -409,7 +414,6 @@ enum { ...@@ -409,7 +414,6 @@ enum {
REQ_F_SINGLE_POLL_BIT, REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT, REQ_F_PARTIAL_IO_BIT,
REQ_F_CQE32_INIT_BIT,
REQ_F_APOLL_MULTISHOT_BIT, REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT, REQ_F_CLEAR_POLLIN_BIT,
REQ_F_HASH_LOCKED_BIT, REQ_F_HASH_LOCKED_BIT,
...@@ -479,8 +483,6 @@ enum { ...@@ -479,8 +483,6 @@ enum {
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
/* fast poll multishot mode */ /* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
/* ->extra1 and ->extra2 are initialised */
REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
/* recvmsg special flag, clear EPOLLIN */ /* recvmsg special flag, clear EPOLLIN */
REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */ /* hashed into ->cancel_hash_locked, protected by ->uring_lock */
...@@ -579,13 +581,7 @@ struct io_kiocb { ...@@ -579,13 +581,7 @@ struct io_kiocb {
struct io_task_work io_task_work; struct io_task_work io_task_work;
unsigned nr_tw; unsigned nr_tw;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
union { struct hlist_node hash_node;
struct hlist_node hash_node;
struct {
u64 extra1;
u64 extra2;
};
};
/* internal polling, see IORING_FEAT_FAST_POLL */ /* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll; struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */ /* opcode allocated if it needs to store data for async defer */
...@@ -595,6 +591,11 @@ struct io_kiocb { ...@@ -595,6 +591,11 @@ struct io_kiocb {
/* custom credentials, valid IFF REQ_F_CREDS is set */ /* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds; const struct cred *creds;
struct io_wq_work work; struct io_wq_work work;
struct {
u64 extra1;
u64 extra2;
} big_cqe;
}; };
struct io_overflow_cqe { struct io_overflow_cqe {
......
...@@ -185,6 +185,11 @@ enum { ...@@ -185,6 +185,11 @@ enum {
*/ */
#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) #define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15)
/*
* Removes indirection through the SQ index array.
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
enum io_uring_op { enum io_uring_op {
IORING_OP_NOP, IORING_OP_NOP,
IORING_OP_READV, IORING_OP_READV,
...@@ -299,11 +304,15 @@ enum io_uring_op { ...@@ -299,11 +304,15 @@ enum io_uring_op {
* request 'user_data' * request 'user_data'
* IORING_ASYNC_CANCEL_ANY Match any request * IORING_ASYNC_CANCEL_ANY Match any request
* IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor
* IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key
* IORING_ASYNC_CANCEL_OP Match request based on opcode
*/ */
#define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_ALL (1U << 0)
#define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_FD (1U << 1)
#define IORING_ASYNC_CANCEL_ANY (1U << 2) #define IORING_ASYNC_CANCEL_ANY (1U << 2)
#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) #define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3)
#define IORING_ASYNC_CANCEL_USERDATA (1U << 4)
#define IORING_ASYNC_CANCEL_OP (1U << 5)
/* /*
* send/sendmsg and recv/recvmsg flags (sqe->ioprio) * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
...@@ -697,7 +706,9 @@ struct io_uring_sync_cancel_reg { ...@@ -697,7 +706,9 @@ struct io_uring_sync_cancel_reg {
__s32 fd; __s32 fd;
__u32 flags; __u32 flags;
struct __kernel_timespec timeout; struct __kernel_timespec timeout;
__u64 pad[4]; __u8 opcode;
__u8 pad[7];
__u64 pad2[3];
}; };
/* /*
...@@ -717,6 +728,14 @@ struct io_uring_recvmsg_out { ...@@ -717,6 +728,14 @@ struct io_uring_recvmsg_out {
__u32 flags; __u32 flags;
}; };
/*
* Argument for IORING_OP_URING_CMD when file is a socket
*/
enum {
SOCKET_URING_OP_SIOCINQ = 0,
SOCKET_URING_OP_SIOCOUTQ,
};
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
...@@ -22,35 +22,56 @@ struct io_cancel { ...@@ -22,35 +22,56 @@ struct io_cancel {
u64 addr; u64 addr;
u32 flags; u32 flags;
s32 fd; s32 fd;
u8 opcode;
}; };
#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \
IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP)
static bool io_cancel_cb(struct io_wq_work *work, void *data) /*
* Returns true if the request matches the criteria outlined by 'cd'.
*/
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd)
{ {
struct io_kiocb *req = container_of(work, struct io_kiocb, work); bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA;
struct io_cancel_data *cd = data;
if (req->ctx != cd->ctx) if (req->ctx != cd->ctx)
return false; return false;
if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
; if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP)))
} else if (cd->flags & IORING_ASYNC_CANCEL_FD) { match_user_data = true;
if (cd->flags & IORING_ASYNC_CANCEL_ANY)
goto check_seq;
if (cd->flags & IORING_ASYNC_CANCEL_FD) {
if (req->file != cd->file) if (req->file != cd->file)
return false; return false;
} else { }
if (req->cqe.user_data != cd->data) if (cd->flags & IORING_ASYNC_CANCEL_OP) {
if (req->opcode != cd->opcode)
return false; return false;
} }
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { if (match_user_data && req->cqe.user_data != cd->data)
return false;
if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
check_seq:
if (cd->seq == req->work.cancel_seq) if (cd->seq == req->work.cancel_seq)
return false; return false;
req->work.cancel_seq = cd->seq; req->work.cancel_seq = cd->seq;
} }
return true; return true;
} }
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_cancel_data *cd = data;
return io_cancel_req_match(req, cd);
}
static int io_async_cancel_one(struct io_uring_task *tctx, static int io_async_cancel_one(struct io_uring_task *tctx,
struct io_cancel_data *cd) struct io_cancel_data *cd)
{ {
...@@ -111,7 +132,7 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ...@@ -111,7 +132,7 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL; return -EINVAL;
if (sqe->off || sqe->len || sqe->splice_fd_in) if (sqe->off || sqe->splice_fd_in)
return -EINVAL; return -EINVAL;
cancel->addr = READ_ONCE(sqe->addr); cancel->addr = READ_ONCE(sqe->addr);
...@@ -123,6 +144,11 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ...@@ -123,6 +144,11 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL; return -EINVAL;
cancel->fd = READ_ONCE(sqe->fd); cancel->fd = READ_ONCE(sqe->fd);
} }
if (cancel->flags & IORING_ASYNC_CANCEL_OP) {
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
return -EINVAL;
cancel->opcode = READ_ONCE(sqe->len);
}
return 0; return 0;
} }
...@@ -169,6 +195,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) ...@@ -169,6 +195,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
.ctx = req->ctx, .ctx = req->ctx,
.data = cancel->addr, .data = cancel->addr,
.flags = cancel->flags, .flags = cancel->flags,
.opcode = cancel->opcode,
.seq = atomic_inc_return(&req->ctx->cancel_seq), .seq = atomic_inc_return(&req->ctx->cancel_seq),
}; };
struct io_uring_task *tctx = req->task->io_uring; struct io_uring_task *tctx = req->task->io_uring;
...@@ -238,17 +265,22 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) ...@@ -238,17 +265,22 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
struct io_uring_sync_cancel_reg sc; struct io_uring_sync_cancel_reg sc;
struct fd f = { }; struct fd f = { };
DEFINE_WAIT(wait); DEFINE_WAIT(wait);
int ret; int ret, i;
if (copy_from_user(&sc, arg, sizeof(sc))) if (copy_from_user(&sc, arg, sizeof(sc)))
return -EFAULT; return -EFAULT;
if (sc.flags & ~CANCEL_FLAGS) if (sc.flags & ~CANCEL_FLAGS)
return -EINVAL; return -EINVAL;
if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) for (i = 0; i < ARRAY_SIZE(sc.pad); i++)
return -EINVAL; if (sc.pad[i])
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(sc.pad2); i++)
if (sc.pad2[i])
return -EINVAL;
cd.data = sc.addr; cd.data = sc.addr;
cd.flags = sc.flags; cd.flags = sc.flags;
cd.opcode = sc.opcode;
/* we can grab a normal file descriptor upfront */ /* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) && if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
......
...@@ -8,11 +8,11 @@ struct io_cancel_data { ...@@ -8,11 +8,11 @@ struct io_cancel_data {
u64 data; u64 data;
struct file *file; struct file *file;
}; };
u8 opcode;
u32 flags; u32 flags;
int seq; int seq;
}; };
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
...@@ -21,3 +21,4 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, ...@@ -21,3 +21,4 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
void init_hash_table(struct io_hash_table *table, unsigned size); void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
...@@ -46,9 +46,13 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, ...@@ -46,9 +46,13 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0; return 0;
} }
static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, /*
struct seq_file *m) * Caller holds a reference to the file already, we don't need to do
* anything else to get an extra reference.
*/
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{ {
struct io_ring_ctx *ctx = f->private_data;
struct io_sq_data *sq = NULL; struct io_sq_data *sq = NULL;
struct io_overflow_cqe *ocqe; struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings; struct io_rings *r = ctx->rings;
...@@ -203,14 +207,4 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, ...@@ -203,14 +207,4 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
spin_unlock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock);
} }
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
if (percpu_ref_tryget(&ctx->refs)) {
__io_uring_show_fdinfo(ctx, m);
percpu_ref_put(&ctx->refs);
}
}
#endif #endif
...@@ -232,17 +232,25 @@ static void io_worker_exit(struct io_worker *worker) ...@@ -232,17 +232,25 @@ static void io_worker_exit(struct io_worker *worker)
do_exit(0); do_exit(0);
} }
static inline bool io_acct_run_queue(struct io_wq_acct *acct) static inline bool __io_acct_run_queue(struct io_wq_acct *acct)
{ {
bool ret = false; return !test_bit(IO_ACCT_STALLED_BIT, &acct->flags) &&
!wq_list_empty(&acct->work_list);
}
/*
* If there's work to do, returns true with acct->lock acquired. If not,
* returns false with no lock held.
*/
static inline bool io_acct_run_queue(struct io_wq_acct *acct)
__acquires(&acct->lock)
{
raw_spin_lock(&acct->lock); raw_spin_lock(&acct->lock);
if (!wq_list_empty(&acct->work_list) && if (__io_acct_run_queue(acct))
!test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) return true;
ret = true;
raw_spin_unlock(&acct->lock);
return ret; raw_spin_unlock(&acct->lock);
return false;
} }
/* /*
...@@ -268,11 +276,14 @@ static bool io_wq_activate_free_worker(struct io_wq *wq, ...@@ -268,11 +276,14 @@ static bool io_wq_activate_free_worker(struct io_wq *wq,
io_worker_release(worker); io_worker_release(worker);
continue; continue;
} }
if (wake_up_process(worker->task)) { /*
io_worker_release(worker); * If the worker is already running, it's either already
return true; * starting work or finishing work. In either case, if it does
} * to go sleep, we'll kick off a new task for this work anyway.
*/
wake_up_process(worker->task);
io_worker_release(worker); io_worker_release(worker);
return true;
} }
return false; return false;
...@@ -397,6 +408,7 @@ static void io_wq_dec_running(struct io_worker *worker) ...@@ -397,6 +408,7 @@ static void io_wq_dec_running(struct io_worker *worker)
if (!io_acct_run_queue(acct)) if (!io_acct_run_queue(acct))
return; return;
raw_spin_unlock(&acct->lock);
atomic_inc(&acct->nr_running); atomic_inc(&acct->nr_running);
atomic_inc(&wq->worker_refs); atomic_inc(&wq->worker_refs);
io_queue_worker_create(worker, acct, create_worker_cb); io_queue_worker_create(worker, acct, create_worker_cb);
...@@ -521,9 +533,13 @@ static void io_assign_current_work(struct io_worker *worker, ...@@ -521,9 +533,13 @@ static void io_assign_current_work(struct io_worker *worker,
raw_spin_unlock(&worker->lock); raw_spin_unlock(&worker->lock);
} }
static void io_worker_handle_work(struct io_worker *worker) /*
* Called with acct->lock held, drops it before returning
*/
static void io_worker_handle_work(struct io_wq_acct *acct,
struct io_worker *worker)
__releases(&acct->lock)
{ {
struct io_wq_acct *acct = io_wq_get_acct(worker);
struct io_wq *wq = worker->wq; struct io_wq *wq = worker->wq;
bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
...@@ -537,7 +553,6 @@ static void io_worker_handle_work(struct io_worker *worker) ...@@ -537,7 +553,6 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will * can't make progress, any work completion or insertion will
* clear the stalled flag. * clear the stalled flag.
*/ */
raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker); work = io_get_next_work(acct, worker);
raw_spin_unlock(&acct->lock); raw_spin_unlock(&acct->lock);
if (work) { if (work) {
...@@ -591,6 +606,10 @@ static void io_worker_handle_work(struct io_worker *worker) ...@@ -591,6 +606,10 @@ static void io_worker_handle_work(struct io_worker *worker)
wake_up(&wq->hash->wait); wake_up(&wq->hash->wait);
} }
} while (work); } while (work);
if (!__io_acct_run_queue(acct))
break;
raw_spin_lock(&acct->lock);
} while (1); } while (1);
} }
...@@ -611,8 +630,13 @@ static int io_wq_worker(void *data) ...@@ -611,8 +630,13 @@ static int io_wq_worker(void *data)
long ret; long ret;
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
/*
* If we have work to do, io_acct_run_queue() returns with
* the acct->lock held. If not, it will drop it.
*/
while (io_acct_run_queue(acct)) while (io_acct_run_queue(acct))
io_worker_handle_work(worker); io_worker_handle_work(acct, worker);
raw_spin_lock(&wq->lock); raw_spin_lock(&wq->lock);
/* /*
...@@ -645,8 +669,8 @@ static int io_wq_worker(void *data) ...@@ -645,8 +669,8 @@ static int io_wq_worker(void *data)
} }
} }
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) if (test_bit(IO_WQ_BIT_EXIT, &wq->state) && io_acct_run_queue(acct))
io_worker_handle_work(worker); io_worker_handle_work(acct, worker);
io_worker_exit(worker); io_worker_exit(worker);
return 0; return 0;
...@@ -909,13 +933,10 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) ...@@ -909,13 +933,10 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
raw_spin_unlock(&acct->lock); raw_spin_unlock(&acct->lock);
raw_spin_lock(&wq->lock);
rcu_read_lock(); rcu_read_lock();
do_create = !io_wq_activate_free_worker(wq, acct); do_create = !io_wq_activate_free_worker(wq, acct);
rcu_read_unlock(); rcu_read_unlock();
raw_spin_unlock(&wq->lock);
if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))) { !atomic_read(&acct->nr_running))) {
bool did_create; bool did_create;
...@@ -1285,13 +1306,16 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) ...@@ -1285,13 +1306,16 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
return __io_wq_cpu_online(wq, cpu, false); return __io_wq_cpu_online(wq, cpu, false);
} }
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
{ {
if (!tctx || !tctx->io_wq)
return -EINVAL;
rcu_read_lock(); rcu_read_lock();
if (mask) if (mask)
cpumask_copy(wq->cpu_mask, mask); cpumask_copy(tctx->io_wq->cpu_mask, mask);
else else
cpumask_copy(wq->cpu_mask, cpu_possible_mask); cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
rcu_read_unlock(); rcu_read_unlock();
return 0; return 0;
......
...@@ -50,7 +50,7 @@ void io_wq_put_and_exit(struct io_wq *wq); ...@@ -50,7 +50,7 @@ void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val); void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count); int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work) static inline bool io_wq_is_hashed(struct io_wq_work *work)
......
...@@ -147,8 +147,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ...@@ -147,8 +147,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all); bool cancel_all);
static void io_queue_sqe(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req);
static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
struct kmem_cache *req_cachep; struct kmem_cache *req_cachep;
...@@ -229,7 +227,6 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) ...@@ -229,7 +227,6 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{ {
wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
kasan_poison_object_data(req_cachep, req);
} }
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
...@@ -292,13 +289,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ...@@ -292,13 +289,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err; goto err;
if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
goto err; goto err;
ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
if (!ctx->dummy_ubuf)
goto err;
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
0, GFP_KERNEL)) 0, GFP_KERNEL))
goto err; goto err;
...@@ -337,7 +327,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ...@@ -337,7 +327,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx; return ctx;
err: err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs); kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->io_bl); kfree(ctx->io_bl);
...@@ -626,7 +615,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) ...@@ -626,7 +615,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
static inline void __io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{ {
if (!ctx->task_complete) if (!ctx->lockless_cq)
spin_lock(&ctx->completion_lock); spin_lock(&ctx->completion_lock);
} }
...@@ -639,19 +628,14 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) ...@@ -639,19 +628,14 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx)
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{ {
io_commit_cqring(ctx); io_commit_cqring(ctx);
if (!ctx->task_complete) {
if (ctx->task_complete) { if (!ctx->lockless_cq)
/* spin_unlock(&ctx->completion_lock);
* ->task_complete implies that only current might be waiting /* IOPOLL rings only need to wake up if it's also SQPOLL */
* for CQEs, and obviously, we currently don't. No one is if (!ctx->syscall_iopoll)
* waiting, wakeups are futile, skip them. io_cqring_wake(ctx);
*/
io_commit_cqring_flush(ctx);
} else {
spin_unlock(&ctx->completion_lock);
io_commit_cqring_flush(ctx);
io_cqring_wake(ctx);
} }
io_commit_cqring_flush(ctx);
} }
static void io_cq_unlock_post(struct io_ring_ctx *ctx) static void io_cq_unlock_post(struct io_ring_ctx *ctx)
...@@ -659,8 +643,8 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) ...@@ -659,8 +643,8 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
{ {
io_commit_cqring(ctx); io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock);
io_commit_cqring_flush(ctx);
io_cqring_wake(ctx); io_cqring_wake(ctx);
io_commit_cqring_flush(ctx);
} }
/* Returns true if there are no backlogged entries after the flush */ /* Returns true if there are no backlogged entries after the flush */
...@@ -693,10 +677,10 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) ...@@ -693,10 +677,10 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
io_cq_lock(ctx); io_cq_lock(ctx);
while (!list_empty(&ctx->cq_overflow_list)) { while (!list_empty(&ctx->cq_overflow_list)) {
struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true); struct io_uring_cqe *cqe;
struct io_overflow_cqe *ocqe; struct io_overflow_cqe *ocqe;
if (!cqe) if (!io_get_cqe_overflow(ctx, &cqe, true))
break; break;
ocqe = list_first_entry(&ctx->cq_overflow_list, ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list); struct io_overflow_cqe, list);
...@@ -815,15 +799,12 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, ...@@ -815,15 +799,12 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
return true; return true;
} }
bool io_req_cqe_overflow(struct io_kiocb *req) void io_req_cqe_overflow(struct io_kiocb *req)
{ {
if (!(req->flags & REQ_F_CQE32_INIT)) { io_cqring_event_overflow(req->ctx, req->cqe.user_data,
req->extra1 = 0; req->cqe.res, req->cqe.flags,
req->extra2 = 0; req->big_cqe.extra1, req->big_cqe.extra2);
} memset(&req->big_cqe, 0, sizeof(req->big_cqe));
return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
req->cqe.res, req->cqe.flags,
req->extra1, req->extra2);
} }
/* /*
...@@ -831,7 +812,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req) ...@@ -831,7 +812,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
* control dependency is enough as we're using WRITE_ONCE to * control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry * fill the cq entry
*/ */
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
{ {
struct io_rings *rings = ctx->rings; struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
...@@ -843,7 +824,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) ...@@ -843,7 +824,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
* Force overflow the completion. * Force overflow the completion.
*/ */
if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
return NULL; return false;
/* userspace may cheat modifying the tail, be safe and do min */ /* userspace may cheat modifying the tail, be safe and do min */
queued = min(__io_cqring_events(ctx), ctx->cq_entries); queued = min(__io_cqring_events(ctx), ctx->cq_entries);
...@@ -851,7 +832,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) ...@@ -851,7 +832,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
/* we need a contiguous range, limit based on the current array offset */ /* we need a contiguous range, limit based on the current array offset */
len = min(free, ctx->cq_entries - off); len = min(free, ctx->cq_entries - off);
if (!len) if (!len)
return NULL; return false;
if (ctx->flags & IORING_SETUP_CQE32) { if (ctx->flags & IORING_SETUP_CQE32) {
off <<= 1; off <<= 1;
...@@ -860,12 +841,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) ...@@ -860,12 +841,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_cached = &rings->cqes[off];
ctx->cqe_sentinel = ctx->cqe_cached + len; ctx->cqe_sentinel = ctx->cqe_cached + len;
return true;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return &rings->cqes[off];
} }
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
...@@ -880,8 +856,7 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, ...@@ -880,8 +856,7 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
* submission (by quite a lot). Increment the overflow count in * submission (by quite a lot). Increment the overflow count in
* the ring. * the ring.
*/ */
cqe = io_get_cqe(ctx); if (likely(io_get_cqe(ctx, &cqe))) {
if (likely(cqe)) {
trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->user_data, user_data);
...@@ -905,7 +880,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) ...@@ -905,7 +880,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
lockdep_assert_held(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock);
for (i = 0; i < state->cqes_count; i++) { for (i = 0; i < state->cqes_count; i++) {
struct io_uring_cqe *cqe = &state->cqes[i]; struct io_uring_cqe *cqe = &ctx->completion_cqes[i];
if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
if (ctx->task_complete) { if (ctx->task_complete) {
...@@ -941,19 +916,22 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags ...@@ -941,19 +916,22 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
return __io_post_aux_cqe(ctx, user_data, res, cflags, true); return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
} }
bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, /*
bool allow_overflow) * A helper for multishot requests posting additional CQEs.
* Should only be used from a task_work including IO_URING_F_MULTISHOT.
*/
bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
u64 user_data = req->cqe.user_data; u64 user_data = req->cqe.user_data;
struct io_uring_cqe *cqe; struct io_uring_cqe *cqe;
if (!defer) if (!defer)
return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
lockdep_assert_held(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock);
if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) { if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
__io_cq_lock(ctx); __io_cq_lock(ctx);
__io_flush_post_cqes(ctx); __io_flush_post_cqes(ctx);
/* no need to flush - flush is deferred */ /* no need to flush - flush is deferred */
...@@ -964,10 +942,10 @@ bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, ...@@ -964,10 +942,10 @@ bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags,
* however it's main job is to prevent unbounded posted completions, * however it's main job is to prevent unbounded posted completions,
* and in that it works just as well. * and in that it works just as well.
*/ */
if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
return false; return false;
cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
cqe->user_data = user_data; cqe->user_data = user_data;
cqe->res = res; cqe->res = res;
cqe->flags = cflags; cqe->flags = cflags;
...@@ -980,8 +958,10 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) ...@@ -980,8 +958,10 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
struct io_rsrc_node *rsrc_node = NULL; struct io_rsrc_node *rsrc_node = NULL;
io_cq_lock(ctx); io_cq_lock(ctx);
if (!(req->flags & REQ_F_CQE_SKIP)) if (!(req->flags & REQ_F_CQE_SKIP)) {
io_fill_cqe_req(ctx, req); if (!io_fill_cqe_req(ctx, req))
io_req_cqe_overflow(req);
}
/* /*
* If we're the last reference to this request, add to our locked * If we're the last reference to this request, add to our locked
...@@ -999,8 +979,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) ...@@ -999,8 +979,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
io_put_kbuf_comp(req); io_put_kbuf_comp(req);
if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req); io_clean_op(req);
if (!(req->flags & REQ_F_FIXED_FILE)) io_put_file(req);
io_put_file(req->file);
rsrc_node = req->rsrc_node; rsrc_node = req->rsrc_node;
/* /*
...@@ -1062,7 +1041,8 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) ...@@ -1062,7 +1041,8 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
req->link = NULL; req->link = NULL;
req->async_data = NULL; req->async_data = NULL;
/* not necessary, but safer to zero */ /* not necessary, but safer to zero */
req->cqe.res = 0; memset(&req->cqe, 0, sizeof(req->cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
} }
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
...@@ -1507,7 +1487,8 @@ void io_queue_next(struct io_kiocb *req) ...@@ -1507,7 +1487,8 @@ void io_queue_next(struct io_kiocb *req)
io_req_task_queue(nxt); io_req_task_queue(nxt);
} }
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock) __must_hold(&ctx->uring_lock)
{ {
do { do {
...@@ -1534,8 +1515,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) ...@@ -1534,8 +1515,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req); io_clean_op(req);
} }
if (!(req->flags & REQ_F_FIXED_FILE)) io_put_file(req);
io_put_file(req->file);
io_req_put_rsrc_locked(req, ctx); io_req_put_rsrc_locked(req, ctx);
...@@ -1545,7 +1525,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) ...@@ -1545,7 +1525,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
} while (node); } while (node);
} }
static void __io_submit_flush_completions(struct io_ring_ctx *ctx) void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock) __must_hold(&ctx->uring_lock)
{ {
struct io_submit_state *state = &ctx->submit_state; struct io_submit_state *state = &ctx->submit_state;
...@@ -1560,7 +1540,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) ...@@ -1560,7 +1540,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
comp_list); comp_list);
if (!(req->flags & REQ_F_CQE_SKIP) && if (!(req->flags & REQ_F_CQE_SKIP) &&
unlikely(!__io_fill_cqe_req(ctx, req))) { unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->task_complete) { if (ctx->task_complete) {
spin_lock(&ctx->completion_lock); spin_lock(&ctx->completion_lock);
io_req_cqe_overflow(req); io_req_cqe_overflow(req);
...@@ -1616,7 +1596,6 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) ...@@ -1616,7 +1596,6 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
static int io_iopoll_check(struct io_ring_ctx *ctx, long min) static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{ {
unsigned int nr_events = 0; unsigned int nr_events = 0;
int ret = 0;
unsigned long check_cq; unsigned long check_cq;
if (!io_allowed_run_tw(ctx)) if (!io_allowed_run_tw(ctx))
...@@ -1642,6 +1621,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) ...@@ -1642,6 +1621,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
return 0; return 0;
do { do {
int ret = 0;
/* /*
* If a submit got punted to a workqueue, we can have the * If a submit got punted to a workqueue, we can have the
* application entering polling for a command before it gets * application entering polling for a command before it gets
...@@ -1670,13 +1651,18 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) ...@@ -1670,13 +1651,18 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
break; break;
} }
ret = io_do_iopoll(ctx, !min); ret = io_do_iopoll(ctx, !min);
if (ret < 0) if (unlikely(ret < 0))
return ret;
if (task_sigpending(current))
return -EINTR;
if (need_resched())
break; break;
nr_events += ret; nr_events += ret;
ret = 0; } while (nr_events < min);
} while (nr_events < min && !need_resched());
return ret; return 0;
} }
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
...@@ -2361,8 +2347,21 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) ...@@ -2361,8 +2347,21 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
*/ */
static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
{ {
unsigned head, mask = ctx->sq_entries - 1; unsigned mask = ctx->sq_entries - 1;
unsigned sq_idx = ctx->cached_sq_head++ & mask; unsigned head = ctx->cached_sq_head++ & mask;
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
head = READ_ONCE(ctx->sq_array[head]);
if (unlikely(head >= ctx->sq_entries)) {
/* drop invalid entries */
spin_lock(&ctx->completion_lock);
ctx->cq_extra--;
spin_unlock(&ctx->completion_lock);
WRITE_ONCE(ctx->rings->sq_dropped,
READ_ONCE(ctx->rings->sq_dropped) + 1);
return false;
}
}
/* /*
* The cached sq head (or cq tail) serves two purposes: * The cached sq head (or cq tail) serves two purposes:
...@@ -2372,20 +2371,12 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) ...@@ -2372,20 +2371,12 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
* 2) allows the kernel side to track the head on its own, even * 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it. * though the application is the one updating it.
*/ */
head = READ_ONCE(ctx->sq_array[sq_idx]);
if (likely(head < ctx->sq_entries)) {
/* double index for 128-byte SQEs, twice as long */
if (ctx->flags & IORING_SETUP_SQE128)
head <<= 1;
*sqe = &ctx->sq_sqes[head];
return true;
}
/* drop invalid entries */ /* double index for 128-byte SQEs, twice as long */
ctx->cq_extra--; if (ctx->flags & IORING_SETUP_SQE128)
WRITE_ONCE(ctx->rings->sq_dropped, head <<= 1;
READ_ONCE(ctx->rings->sq_dropped) + 1); *sqe = &ctx->sq_sqes[head];
return false; return true;
} }
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
...@@ -2484,10 +2475,10 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) ...@@ -2484,10 +2475,10 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
if (!llist_empty(&ctx->work_llist)) { if (!llist_empty(&ctx->work_llist)) {
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
if (io_run_local_work(ctx) > 0) if (io_run_local_work(ctx) > 0)
return 1; return 0;
} }
if (io_run_task_work() > 0) if (io_run_task_work() > 0)
return 1; return 0;
if (task_sigpending(current)) if (task_sigpending(current))
return -EINTR; return -EINTR;
return 0; return 0;
...@@ -2761,6 +2752,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries ...@@ -2761,6 +2752,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return SIZE_MAX; return SIZE_MAX;
#endif #endif
if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
if (sq_offset)
*sq_offset = SIZE_MAX;
return off;
}
if (sq_offset) if (sq_offset)
*sq_offset = off; *sq_offset = off;
...@@ -2903,7 +2900,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) ...@@ -2903,7 +2900,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map); io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs); kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->dummy_ubuf);
kfree(ctx->io_bl); kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa); xa_destroy(&ctx->io_bl_xa);
kfree(ctx); kfree(ctx);
...@@ -3733,7 +3729,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ...@@ -3733,7 +3729,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return PTR_ERR(rings); return PTR_ERR(rings);
ctx->rings = rings; ctx->rings = rings;
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
rings->sq_ring_mask = p->sq_entries - 1; rings->sq_ring_mask = p->sq_entries - 1;
rings->cq_ring_mask = p->cq_entries - 1; rings->cq_ring_mask = p->cq_entries - 1;
rings->sq_ring_entries = p->sq_entries; rings->sq_ring_entries = p->sq_entries;
...@@ -3862,6 +3859,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ...@@ -3862,6 +3859,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
!(ctx->flags & IORING_SETUP_SQPOLL)) !(ctx->flags & IORING_SETUP_SQPOLL))
ctx->task_complete = true; ctx->task_complete = true;
if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
ctx->lockless_cq = true;
/* /*
* lazy poll_wq activation relies on ->task_complete for synchronisation * lazy poll_wq activation relies on ->task_complete for synchronisation
* purposes, see io_activate_pollwq() * purposes, see io_activate_pollwq()
...@@ -3941,7 +3941,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ...@@ -3941,7 +3941,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
p->sq_off.flags = offsetof(struct io_rings, sq_flags); p->sq_off.flags = offsetof(struct io_rings, sq_flags);
p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
p->sq_off.resv1 = 0; p->sq_off.resv1 = 0;
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) if (!(ctx->flags & IORING_SETUP_NO_MMAP))
p->sq_off.user_addr = 0; p->sq_off.user_addr = 0;
...@@ -4030,7 +4031,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) ...@@ -4030,7 +4031,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY)) IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
IORING_SETUP_NO_SQARRAY))
return -EINVAL; return -EINVAL;
return io_uring_create(entries, &p, params); return io_uring_create(entries, &p, params);
...@@ -4193,16 +4195,28 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) ...@@ -4193,16 +4195,28 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
return 0; return 0;
} }
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
cpumask_var_t new_mask)
{
int ret;
if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_wq_cpu_affinity(current->io_uring, new_mask);
} else {
mutex_unlock(&ctx->uring_lock);
ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
mutex_lock(&ctx->uring_lock);
}
return ret;
}
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
void __user *arg, unsigned len) void __user *arg, unsigned len)
{ {
struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask; cpumask_var_t new_mask;
int ret; int ret;
if (!tctx || !tctx->io_wq)
return -EINVAL;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM; return -ENOMEM;
...@@ -4223,19 +4237,14 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, ...@@ -4223,19 +4237,14 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
return -EFAULT; return -EFAULT;
} }
ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); ret = __io_register_iowq_aff(ctx, new_mask);
free_cpumask_var(new_mask); free_cpumask_var(new_mask);
return ret; return ret;
} }
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{ {
struct io_uring_task *tctx = current->io_uring; return __io_register_iowq_aff(ctx, NULL);
if (!tctx || !tctx->io_wq)
return -EINVAL;
return io_wq_cpu_affinity(tctx->io_wq, NULL);
} }
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
...@@ -4611,8 +4620,20 @@ static int __init io_uring_init(void) ...@@ -4611,8 +4620,20 @@ static int __init io_uring_init(void)
io_uring_optable_init(); io_uring_optable_init();
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | /*
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); * Allow user copy in the per-command field, which starts after the
* file in io_kiocb and until the opcode field. The openat2 handling
* requires copying in user memory into the io_kiocb object in that
* range, and HARDENED_USERCOPY will complain if we haven't
* correctly annotated this range.
*/
req_cachep = kmem_cache_create_usercopy("io_kiocb",
sizeof(struct io_kiocb), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
offsetof(struct io_kiocb, cmd.data),
sizeof_field(struct io_kiocb, cmd.data), NULL);
return 0; return 0;
}; };
__initcall(io_uring_init); __initcall(io_uring_init);
...@@ -38,14 +38,13 @@ enum { ...@@ -38,14 +38,13 @@ enum {
IOU_STOP_MULTISHOT = -ECANCELED, IOU_STOP_MULTISHOT = -ECANCELED,
}; };
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
bool io_req_cqe_overflow(struct io_kiocb *req); void io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx); int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_defer_failed(struct io_kiocb *req, s32 res);
void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags);
bool allow_overflow);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
...@@ -73,7 +72,7 @@ int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, ...@@ -73,7 +72,7 @@ int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); void __io_submit_flush_completions(struct io_ring_ctx *ctx);
int io_req_prep_async(struct io_kiocb *req); int io_req_prep_async(struct io_kiocb *req);
struct io_wq_work *io_wq_free_work(struct io_wq_work *work); struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
...@@ -110,31 +109,31 @@ static inline void io_req_task_work_add(struct io_kiocb *req) ...@@ -110,31 +109,31 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
#define io_for_each_link(pos, head) \ #define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link) for (pos = (head); pos; pos = pos->link)
static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
bool overflow) struct io_uring_cqe **ret,
bool overflow)
{ {
io_lockdep_assert_cq_locked(ctx); io_lockdep_assert_cq_locked(ctx);
if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
struct io_uring_cqe *cqe = ctx->cqe_cached; if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
return false;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return cqe;
} }
*ret = ctx->cqe_cached;
return __io_get_cqe(ctx, overflow); ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return true;
} }
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
{ {
return io_get_cqe_overflow(ctx, false); return io_get_cqe_overflow(ctx, ret, false);
} }
static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req) struct io_kiocb *req)
{ {
struct io_uring_cqe *cqe; struct io_uring_cqe *cqe;
...@@ -143,39 +142,22 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, ...@@ -143,39 +142,22 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
* submission (by quite a lot). Increment the overflow count in * submission (by quite a lot). Increment the overflow count in
* the ring. * the ring.
*/ */
cqe = io_get_cqe(ctx); if (unlikely(!io_get_cqe(ctx, &cqe)))
if (unlikely(!cqe))
return false; return false;
trace_io_uring_complete(req->ctx, req, req->cqe.user_data, if (trace_io_uring_complete_enabled())
req->cqe.res, req->cqe.flags, trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, req->cqe.res, req->cqe.flags,
(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); req->big_cqe.extra1, req->big_cqe.extra2);
memcpy(cqe, &req->cqe, sizeof(*cqe)); memcpy(cqe, &req->cqe, sizeof(*cqe));
if (ctx->flags & IORING_SETUP_CQE32) { if (ctx->flags & IORING_SETUP_CQE32) {
u64 extra1 = 0, extra2 = 0; memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
if (req->flags & REQ_F_CQE32_INIT) {
extra1 = req->extra1;
extra2 = req->extra2;
}
WRITE_ONCE(cqe->big_cqe[0], extra1);
WRITE_ONCE(cqe->big_cqe[1], extra2);
} }
return true; return true;
} }
static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
if (likely(__io_fill_cqe_req(ctx, req)))
return true;
return io_req_cqe_overflow(req);
}
static inline void req_set_fail(struct io_kiocb *req) static inline void req_set_fail(struct io_kiocb *req)
{ {
req->flags |= REQ_F_FAIL; req->flags |= REQ_F_FAIL;
...@@ -196,10 +178,10 @@ static inline bool req_has_async_data(struct io_kiocb *req) ...@@ -196,10 +178,10 @@ static inline bool req_has_async_data(struct io_kiocb *req)
return req->flags & REQ_F_ASYNC_DATA; return req->flags & REQ_F_ASYNC_DATA;
} }
static inline void io_put_file(struct file *file) static inline void io_put_file(struct io_kiocb *req)
{ {
if (file) if (!(req->flags & REQ_F_FIXED_FILE) && req->file)
fput(file); fput(req->file);
} }
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
...@@ -354,7 +336,6 @@ static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) ...@@ -354,7 +336,6 @@ static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
struct io_kiocb *req; struct io_kiocb *req;
req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list);
kasan_unpoison_object_data(req_cachep, req);
wq_stack_extract(&ctx->submit_state.free_list); wq_stack_extract(&ctx->submit_state.free_list);
return req; return req;
} }
......
...@@ -641,8 +641,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, ...@@ -641,8 +641,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
} }
if (!mshot_finished) { if (!mshot_finished) {
if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
*ret, cflags | IORING_CQE_F_MORE, true)) { *ret, cflags | IORING_CQE_F_MORE)) {
io_recv_prep_retry(req); io_recv_prep_retry(req);
/* Known not-empty or unknown state, retry */ /* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY || if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
...@@ -1366,8 +1366,8 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) ...@@ -1366,8 +1366,8 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0) if (ret < 0)
return ret; return ret;
if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
IORING_CQE_F_MORE, true)) ret, IORING_CQE_F_MORE))
goto retry; goto retry;
return -ECANCELED; return -ECANCELED;
......
...@@ -300,8 +300,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) ...@@ -300,8 +300,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
__poll_t mask = mangle_poll(req->cqe.res & __poll_t mask = mangle_poll(req->cqe.res &
req->apoll_events); req->apoll_events);
if (!io_aux_cqe(req, ts->locked, mask, if (!io_fill_cqe_req_aux(req, ts->locked, mask,
IORING_CQE_F_MORE, false)) { IORING_CQE_F_MORE)) {
io_req_set_res(req, mask, 0); io_req_set_res(req, mask, 0);
return IOU_POLL_REMOVE_POLL_USE_RES; return IOU_POLL_REMOVE_POLL_USE_RES;
} }
...@@ -824,14 +824,10 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, ...@@ -824,14 +824,10 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
spin_lock(&hb->lock); spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node) { hlist_for_each_entry(req, &hb->list, hash_node) {
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && if (io_cancel_req_match(req, cd)) {
req->file != cd->file) *out_bucket = hb;
continue; return req;
if (cd->seq == req->work.cancel_seq) }
continue;
req->work.cancel_seq = cd->seq;
*out_bucket = hb;
return req;
} }
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
} }
...@@ -855,7 +851,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, ...@@ -855,7 +851,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
struct io_hash_bucket *bucket; struct io_hash_bucket *bucket;
struct io_kiocb *req; struct io_kiocb *req;
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
IORING_ASYNC_CANCEL_ANY))
req = io_poll_file_find(ctx, cd, table, &bucket); req = io_poll_file_find(ctx, cd, table, &bucket);
else else
req = io_poll_find(ctx, false, cd, table, &bucket); req = io_poll_find(ctx, false, cd, table, &bucket);
...@@ -972,8 +969,8 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ...@@ -972,8 +969,8 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{ {
struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
struct io_cancel_data cd = { .data = poll_update->old_user_data, };
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
struct io_hash_bucket *bucket; struct io_hash_bucket *bucket;
struct io_kiocb *preq; struct io_kiocb *preq;
int ret2, ret = 0; int ret2, ret = 0;
......
...@@ -33,6 +33,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, ...@@ -33,6 +33,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14) #define IORING_MAX_REG_BUFFERS (1U << 14)
/*
 * Shared placeholder buffer descriptor. Callers compare table entries
 * against its address to recognise slots that carry no real mapping.
 */
static const struct io_mapped_ubuf dummy_ubuf = {
/* set invalid range, so io_import_fixed() fails meeting it */
.ubuf = -1UL,
.ubuf_end = 0,
};
int __io_account_mem(struct user_struct *user, unsigned long nr_pages) int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{ {
unsigned long page_limit, cur_pages, new_pages; unsigned long page_limit, cur_pages, new_pages;
...@@ -132,7 +138,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo ...@@ -132,7 +138,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
struct io_mapped_ubuf *imu = *slot; struct io_mapped_ubuf *imu = *slot;
unsigned int i; unsigned int i;
if (imu != ctx->dummy_ubuf) { if (imu != &dummy_ubuf) {
for (i = 0; i < imu->nr_bvecs; i++) for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page); unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages) if (imu->acct_pages)
...@@ -459,14 +465,14 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, ...@@ -459,14 +465,14 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
break; break;
i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
if (ctx->user_bufs[i] != ctx->dummy_ubuf) { if (ctx->user_bufs[i] != &dummy_ubuf) {
err = io_queue_rsrc_removal(ctx->buf_data, i, err = io_queue_rsrc_removal(ctx->buf_data, i,
ctx->user_bufs[i]); ctx->user_bufs[i]);
if (unlikely(err)) { if (unlikely(err)) {
io_buffer_unmap(ctx, &imu); io_buffer_unmap(ctx, &imu);
break; break;
} }
ctx->user_bufs[i] = ctx->dummy_ubuf; ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
} }
ctx->user_bufs[i] = imu; ctx->user_bufs[i] = imu;
...@@ -1077,7 +1083,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, ...@@ -1077,7 +1083,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
int ret, nr_pages, i; int ret, nr_pages, i;
struct folio *folio = NULL; struct folio *folio = NULL;
*pimu = ctx->dummy_ubuf; *pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base) if (!iov->iov_base)
return 0; return 0;
......
...@@ -54,10 +54,9 @@ struct io_mapped_ubuf { ...@@ -54,10 +54,9 @@ struct io_mapped_ubuf {
u64 ubuf_end; u64 ubuf_end;
unsigned int nr_bvecs; unsigned int nr_bvecs;
unsigned long acct_pages; unsigned long acct_pages;
struct bio_vec bvec[]; struct bio_vec bvec[] __counted_by(nr_bvecs);
}; };
void io_rsrc_put_tw(struct callback_head *cb);
void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
......
...@@ -989,13 +989,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ...@@ -989,13 +989,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
return ret; return ret;
} }
/*
 * Post-completion notification for the iopoll path: flush the CQ ring
 * commit work first, then wake CQ waiters, but only when the ring was set
 * up with IORING_SETUP_SQPOLL.
 */
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
io_commit_cqring_flush(ctx);
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_wake(ctx);
}
void io_rw_fail(struct io_kiocb *req) void io_rw_fail(struct io_kiocb *req)
{ {
int res; int res;
...@@ -1066,24 +1059,17 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) ...@@ -1066,24 +1059,17 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (!smp_load_acquire(&req->iopoll_completed)) if (!smp_load_acquire(&req->iopoll_completed))
break; break;
nr_events++; nr_events++;
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
req->cqe.flags = io_put_kbuf(req, 0); req->cqe.flags = io_put_kbuf(req, 0);
if (unlikely(!__io_fill_cqe_req(ctx, req))) {
spin_lock(&ctx->completion_lock);
io_req_cqe_overflow(req);
spin_unlock(&ctx->completion_lock);
}
} }
if (unlikely(!nr_events)) if (unlikely(!nr_events))
return 0; return 0;
io_commit_cqring(ctx);
io_cqring_ev_posted_iopoll(ctx);
pos = start ? start->next : ctx->iopoll_list.first; pos = start ? start->next : ctx->iopoll_list.first;
wq_list_cut(&ctx->iopoll_list, prev, start); wq_list_cut(&ctx->iopoll_list, prev, start);
io_free_batch_list(ctx, pos);
if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
return 0;
ctx->submit_state.compl_reqs.first = pos;
__io_submit_flush_completions(ctx);
return nr_events; return nr_events;
} }
...@@ -68,7 +68,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) ...@@ -68,7 +68,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
ret = do_tee(in, out, sp->len, flags); ret = do_tee(in, out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in); fput(in);
done: done:
if (ret != sp->len) if (ret != sp->len)
req_set_fail(req); req_set_fail(req);
...@@ -112,7 +112,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) ...@@ -112,7 +112,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in); fput(in);
done: done:
if (ret != sp->len) if (ret != sp->len)
req_set_fail(req); req_set_fail(req);
......
...@@ -421,3 +421,18 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, ...@@ -421,3 +421,18 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
io_sq_thread_finish(ctx); io_sq_thread_finish(ctx);
return ret; return ret;
} }
/*
 * Apply @mask as the CPU affinity of the io-wq that belongs to the SQPOLL
 * thread serving @ctx.
 *
 * The SQ thread is parked around the update. Returns -EINVAL when the ring
 * has no SQPOLL data or the SQ thread is not present, otherwise the result
 * of io_wq_cpu_affinity().
 */
__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
				     cpumask_var_t mask)
{
	struct io_sq_data *sqd = ctx->sq_data;
	int ret = -EINVAL;

	if (sqd) {
		io_sq_thread_park(sqd);
		/*
		 * sqd->thread can be NULL (e.g. the thread is gone or dying);
		 * never dereference it blindly.
		 */
		if (sqd->thread)
			ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
		io_sq_thread_unpark(sqd);
	}

	return ret;
}
...@@ -27,3 +27,4 @@ void io_sq_thread_park(struct io_sq_data *sqd); ...@@ -27,3 +27,4 @@ void io_sq_thread_park(struct io_sq_data *sqd);
void io_sq_thread_unpark(struct io_sq_data *sqd); void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
...@@ -73,8 +73,8 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) ...@@ -73,8 +73,8 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts)
if (!io_timeout_finish(timeout, data)) { if (!io_timeout_finish(timeout, data)) {
bool filled; bool filled;
filled = io_aux_cqe(req, ts->locked, -ETIME, IORING_CQE_F_MORE, filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME,
false); IORING_CQE_F_MORE);
if (filled) { if (filled) {
/* re-arm timer */ /* re-arm timer */
spin_lock_irq(&ctx->timeout_lock); spin_lock_irq(&ctx->timeout_lock);
...@@ -268,16 +268,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, ...@@ -268,16 +268,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
list_for_each_entry(timeout, &ctx->timeout_list, list) { list_for_each_entry(timeout, &ctx->timeout_list, list) {
struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && if (io_cancel_req_match(tmp, cd)) {
cd->data != tmp->cqe.user_data) req = tmp;
continue; break;
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
if (cd->seq == tmp->work.cancel_seq)
continue;
tmp->work.cancel_seq = cd->seq;
} }
req = tmp;
break;
} }
if (!req) if (!req)
return ERR_PTR(-ENOENT); return ERR_PTR(-ENOENT);
...@@ -409,7 +403,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, ...@@ -409,7 +403,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode) struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock) __must_hold(&ctx->timeout_lock)
{ {
struct io_cancel_data cd = { .data = user_data, }; struct io_cancel_data cd = { .ctx = ctx, .data = user_data, };
struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_kiocb *req = io_timeout_extract(ctx, &cd);
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_timeout_data *data; struct io_timeout_data *data;
...@@ -473,7 +467,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) ...@@ -473,7 +467,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
int ret; int ret;
if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { if (!(tr->flags & IORING_TIMEOUT_UPDATE)) {
struct io_cancel_data cd = { .data = tr->addr, }; struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, };
spin_lock(&ctx->completion_lock); spin_lock(&ctx->completion_lock);
ret = io_timeout_cancel(ctx, &cd); ret = io_timeout_cancel(ctx, &cd);
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <linux/nospec.h> #include <linux/nospec.h>
#include <uapi/linux/io_uring.h> #include <uapi/linux/io_uring.h>
#include <uapi/asm-generic/ioctls.h>
#include "io_uring.h" #include "io_uring.h"
#include "rsrc.h" #include "rsrc.h"
...@@ -42,9 +43,8 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); ...@@ -42,9 +43,8 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy);
static inline void io_req_set_cqe32_extra(struct io_kiocb *req, static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
u64 extra1, u64 extra2) u64 extra1, u64 extra2)
{ {
req->extra1 = extra1; req->big_cqe.extra1 = extra1;
req->extra2 = extra2; req->big_cqe.extra2 = extra2;
req->flags |= REQ_F_CQE32_INIT;
} }
/* /*
...@@ -164,3 +164,30 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, ...@@ -164,3 +164,30 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
return io_import_fixed(rw, iter, req->imu, ubuf, len); return io_import_fixed(rw, iter, req->imu, ubuf, len);
} }
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
/*
 * uring_cmd handler for sockets.
 *
 * Translates the socket-specific command opcode found in the SQE into the
 * matching protocol ioctl and returns the value the ioctl produced. Returns
 * -EOPNOTSUPP when the protocol has no ioctl hook or the opcode is unknown,
 * or the ioctl's own non-zero result on failure.
 */
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct socket *sock = cmd->file->private_data;
	struct sock *sk = sock->sk;
	struct proto *prot = READ_ONCE(sk->sk_prot);
	int ioctl_nr, err, value = 0;

	if (!prot || !prot->ioctl)
		return -EOPNOTSUPP;

	/* Map the io_uring opcode onto the classic ioctl number. */
	switch (cmd->sqe->cmd_op) {
	case SOCKET_URING_OP_SIOCINQ:
		ioctl_nr = SIOCINQ;
		break;
	case SOCKET_URING_OP_SIOCOUTQ:
		ioctl_nr = SIOCOUTQ;
		break;
	default:
		return -EOPNOTSUPP;
	}

	err = prot->ioctl(sk, ioctl_nr, &value);
	if (err)
		return err;
	return value;
}
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
...@@ -88,6 +88,7 @@ ...@@ -88,6 +88,7 @@
#include <linux/xattr.h> #include <linux/xattr.h>
#include <linux/nospec.h> #include <linux/nospec.h>
#include <linux/indirect_call_wrapper.h> #include <linux/indirect_call_wrapper.h>
#include <linux/io_uring.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <asm/unistd.h> #include <asm/unistd.h>
...@@ -160,6 +161,7 @@ static const struct file_operations socket_file_ops = { ...@@ -160,6 +161,7 @@ static const struct file_operations socket_file_ops = {
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl, .compat_ioctl = compat_sock_ioctl,
#endif #endif
.uring_cmd = io_uring_cmd_sock,
.mmap = sock_mmap, .mmap = sock_mmap,
.release = sock_close, .release = sock_close,
.fasync = sock_fasync, .fasync = sock_fasync,
......
# SPDX-License-Identifier: GPL-2.0
# Makefile for io_uring test tools
# Flags shared by every compile/link line below.
CFLAGS += -Wall -Wextra -g -D_GNU_SOURCE
LDLIBS += -lpthread
# Default goal: build both example programs.
all: io_uring-cp io_uring-bench
# Generic rule: build a program from the same-named .c file plus any other
# prerequisites listed for it.
%: %.c
$(CC) $(CFLAGS) -o $@ $^
# The benchmark has its own link line so that $(LDLIBS) is appended.
io_uring-bench: syscall.o io_uring-bench.o
$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)
# Extra objects for io_uring-cp; no recipe is given here.
io_uring-cp: setup.o syscall.o queue.o
clean:
$(RM) io_uring-cp io_uring-bench *.o
.PHONY: all clean
This directory includes a few programs that demonstrate how to use io_uring
in an application. The examples are:
io_uring-cp
A very basic io_uring implementation of cp(1). It takes two
arguments, copies the first argument to the second. This example
is part of liburing, and hence uses the simplified liburing API
for setting up an io_uring instance, submitting IO, completing IO,
etc. The support functions in queue.c and setup.c are straight
out of liburing.
io_uring-bench
Benchmark program that does random reads on a number of files. This
app demonstrates the various features of io_uring, like fixed files,
fixed buffers, and polled IO. There are options in the program to
control which features to use. The arguments are the file (or files) that
io_uring-bench should operate on. This program uses the raw io_uring
interface.
liburing can be cloned with git here:
git://git.kernel.dk/liburing
and contains a number of unit tests as well for testing io_uring. It also
comes with man pages for the three system calls.
Fio includes an io_uring engine, you can clone fio here:
git://git.kernel.dk/fio
#ifndef LIBURING_BARRIER_H
#define LIBURING_BARRIER_H
#if defined(__x86_64) || defined(__i386__)
#define read_barrier() __asm__ __volatile__("":::"memory")
#define write_barrier() __asm__ __volatile__("":::"memory")
#else
/*
* Add arch appropriate definitions. Be safe and use full barriers for
* archs we don't have support for.
*/
#define read_barrier() __sync_synchronize()
#define write_barrier() __sync_synchronize()
#endif
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Simple benchmark program that uses the various features of io_uring
* to provide fast random access to a device/file. It has various
* options that control how we use io_uring, see the OPTIONS section
* below. This uses the raw io_uring interface.
*
* Copyright (C) 2018-2019 Jens Axboe
*/
#include <stdio.h>
#include <errno.h>
#include <assert.h>
#include <stdlib.h>
#include <stddef.h>
#include <signal.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/resource.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <pthread.h>
#include <sched.h>
#include "liburing.h"
#include "barrier.h"
/*
 * Smaller of two values. Arguments are fully parenthesized so that
 * expressions with low-precedence operators (e.g. `x | y`) compare
 * correctly. Each argument may be evaluated twice, so do not pass
 * expressions with side effects.
 */
#define min(a, b) (((a) < (b)) ? (a) : (b))
/*
 * Pointers into the mmap'ed submission queue ring. setup_ring() fills each
 * one by adding the matching p.sq_off offset to the mapping base.
 */
struct io_sq_ring {
unsigned *head;
unsigned *tail;
unsigned *ring_mask;
unsigned *ring_entries;
unsigned *flags;
unsigned *array;
};
/*
 * Pointers into the mmap'ed completion queue ring. setup_ring() fills each
 * one by adding the matching p.cq_off offset to the mapping base.
 */
struct io_cq_ring {
unsigned *head;
unsigned *tail;
unsigned *ring_mask;
unsigned *ring_entries;
struct io_uring_cqe *cqes;
};
/* Ring size requested from io_uring_setup() and number of I/O buffers. */
#define DEPTH 128
/* Upper bound on requests prepared per submitter loop iteration. */
#define BATCH_SUBMIT 32
/* Upper bound on completions waited for per io_uring_enter() call. */
#define BATCH_COMPLETE 32
/* Size in bytes of each I/O buffer and of each read. */
#define BS 4096
/* Capacity of the per-submitter file table. */
#define MAX_FDS 16
/* Copies of the SQ/CQ ring masks, cached by setup_ring(). */
static unsigned sq_ring_mask, cq_ring_mask;
/* One input file or block device being read by the benchmark. */
struct file {
/* size in BS-sized blocks; main() stores one less than the real count */
unsigned long max_blocks;
/* requests prepared for this file and not yet reaped */
unsigned pending_ios;
/* descriptor returned by open() */
int real_fd;
/* index of this file in the registered-files table */
int fixed_fd;
};
/* All state owned by one submitter thread and its ring. */
struct submitter {
/* thread running submitter_fn() */
pthread_t thread;
/* descriptor returned by io_uring_setup() */
int ring_fd;
/* drand48 state used to pick random offsets */
struct drand48_data rand;
struct io_sq_ring sq_ring;
/* mmap'ed SQE array */
struct io_uring_sqe *sqes;
/* one BS-sized buffer per ring slot */
struct iovec iovecs[DEPTH];
struct io_cq_ring cq_ring;
/* raised by the prepared count in submitter_fn(), lowered in reap_events() */
int inflight;
/* running totals sampled once a second by main() */
unsigned long reaps;
unsigned long done;
unsigned long calls;
/* set to stop the submitter loop */
volatile int finish;
/* descriptor array handed to IORING_REGISTER_FILES */
__s32 *fds;
struct file files[MAX_FDS];
unsigned nr_files;
/* index of the file currently receiving new requests */
unsigned cur_file;
};
/* The benchmark runs a single submitter. */
static struct submitter submitters[1];
/* Set by the signal handler or the submitter thread to stop main()'s loop. */
static volatile int finish;
/*
 * OPTIONS: Set these to test the various features of io_uring.
 */
static int polled = 1; /* use IO polling */
static int fixedbufs = 1; /* use fixed user buffers */
static int register_files = 1; /* use fixed files */
static int buffered = 0; /* use buffered IO, not O_DIRECT */
static int sq_thread_poll = 0; /* use kernel submission/poller thread */
static int sq_thread_cpu = -1; /* pin above thread to this CPU */
static int do_nop = 0; /* no-op SQ ring commands */
/*
 * Register the submitter's DEPTH I/O vectors as fixed buffers on its ring.
 * In no-op mode nothing is registered and 0 is returned; otherwise the
 * result of io_uring_register() is passed through.
 */
static int io_uring_register_buffers(struct submitter *s)
{
	int rc = 0;

	if (!do_nop)
		rc = io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS,
				       s->iovecs, DEPTH);
	return rc;
}
/*
 * Register every opened file with the ring so requests can refer to them
 * through IOSQE_FIXED_FILE.
 *
 * Builds s->fds from the real descriptors and records each file's index as
 * its fixed_fd. Returns 0 in no-op mode, -1 if the descriptor array cannot
 * be allocated, otherwise the result of io_uring_register().
 */
static int io_uring_register_files(struct submitter *s)
{
	unsigned i;

	if (do_nop)
		return 0;

	s->fds = calloc(s->nr_files, sizeof(__s32));
	if (!s->fds)
		return -1;	/* caller reports the failure via perror() */

	for (i = 0; i < s->nr_files; i++) {
		s->fds[i] = s->files[i].real_fd;
		s->files[i].fixed_fd = i;
	}

	return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds,
				 s->nr_files);
}
/* Return the calling thread's id, obtained through the raw gettid syscall. */
static int lk_gettid(void)
{
	long tid = syscall(__NR_gettid);

	return tid;
}
/*
 * Per-file share of the queue depth: DEPTH divided by the number of files,
 * rounded up.
 */
static unsigned file_depth(struct submitter *s)
{
	unsigned nr = s->nr_files;

	return (DEPTH + nr - 1) / nr;
}
/*
 * Fill in SQE number @index with the next request.
 *
 * In no-op mode only the opcode is set. Otherwise a file is chosen (moving
 * to the next file once the current one has reached its share of the queue
 * depth), a random block-aligned offset is picked, and the SQE is set up as
 * either a fixed-buffer read or a one-segment readv, depending on the
 * `fixedbufs` option. The file pointer is stored as user_data so that the
 * completion side can find the file again.
 */
static void init_io(struct submitter *s, unsigned index)
{
	struct io_uring_sqe *sqe = &s->sqes[index];
	unsigned long offset, span;
	struct file *f;
	long r;

	if (do_nop) {
		sqe->opcode = IORING_OP_NOP;
		return;
	}

	if (s->nr_files == 1) {
		f = &s->files[0];
	} else {
		f = &s->files[s->cur_file];
		if (f->pending_ios >= file_depth(s)) {
			s->cur_file++;
			if (s->cur_file == s->nr_files)
				s->cur_file = 0;
			f = &s->files[s->cur_file];
		}
	}
	f->pending_ios++;

	lrand48_r(&s->rand, &r);
	/*
	 * max_blocks can legitimately be 1 here (a two-block file after
	 * main()'s decrement); never use a zero modulus.
	 */
	span = f->max_blocks > 1 ? f->max_blocks - 1 : 1;
	offset = (r % span) * BS;

	if (register_files) {
		sqe->flags = IOSQE_FIXED_FILE;
		sqe->fd = f->fixed_fd;
	} else {
		sqe->flags = 0;
		sqe->fd = f->real_fd;
	}
	if (fixedbufs) {
		sqe->opcode = IORING_OP_READ_FIXED;
		sqe->addr = (unsigned long) s->iovecs[index].iov_base;
		sqe->len = BS;
		sqe->buf_index = index;
	} else {
		sqe->opcode = IORING_OP_READV;
		sqe->addr = (unsigned long) &s->iovecs[index];
		sqe->len = 1;
		sqe->buf_index = 0;
	}
	sqe->ioprio = 0;
	sqe->off = offset;
	sqe->user_data = (unsigned long) f;
}
/*
 * Prepare up to @max_ios new requests in the SQ ring and publish them by
 * advancing the ring tail. Returns the number of requests prepared.
 */
static int prep_more_ios(struct submitter *s, unsigned max_ios)
{
struct io_sq_ring *ring = &s->sq_ring;
unsigned index, tail, next_tail, prepped = 0;
next_tail = tail = *ring->tail;
do {
next_tail++;
read_barrier();
/* stop once the next tail value would meet the ring head */
if (next_tail == *ring->head)
break;
index = tail & sq_ring_mask;
init_io(s, index);
/* SQ array slot i always refers to SQE i */
ring->array[index] = index;
prepped++;
tail = next_tail;
} while (prepped < max_ios);
/* only touch the shared tail if something was actually prepared */
if (*ring->tail != tail) {
/* order tail store with writes to sqes above */
write_barrier();
*ring->tail = tail;
write_barrier();
}
return prepped;
}
/*
 * Determine the size of @f in BS-sized blocks and store it in
 * f->max_blocks. Regular files use st_size, block devices are queried with
 * BLKGETSIZE64. Returns 0 on success, -1 on any failure or for other file
 * types.
 */
static int get_file_size(struct file *f)
{
	unsigned long long bytes;
	struct stat st;

	if (fstat(f->real_fd, &st) < 0)
		return -1;

	if (S_ISREG(st.st_mode)) {
		f->max_blocks = st.st_size / BS;
		return 0;
	}

	if (!S_ISBLK(st.st_mode))
		return -1;

	if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
		return -1;

	f->max_blocks = bytes / BS;
	return 0;
}
/*
 * Consume every completion currently visible in the CQ ring.
 *
 * Returns the number of completions reaped, or -1 as soon as a completion
 * carries a result other than BS (outside no-op mode). On that error path
 * the ring head and s->inflight are left untouched.
 */
static int reap_events(struct submitter *s)
{
struct io_cq_ring *ring = &s->cq_ring;
struct io_uring_cqe *cqe;
unsigned head, reaped = 0;
head = *ring->head;
do {
struct file *f;
read_barrier();
/* ring is empty once our head catches up with the tail */
if (head == *ring->tail)
break;
cqe = &ring->cqes[head & cq_ring_mask];
if (!do_nop) {
/* user_data carries the struct file pointer stored by init_io() */
f = (struct file *) (uintptr_t) cqe->user_data;
f->pending_ios--;
if (cqe->res != BS) {
printf("io: unexpected ret=%d\n", cqe->res);
if (polled && cqe->res == -EOPNOTSUPP)
printf("Your filesystem doesn't support poll\n");
return -1;
}
}
reaped++;
head++;
} while (1);
s->inflight -= reaped;
/* publish the new head */
*ring->head = head;
write_barrier();
return reaped;
}
/*
 * Thread body of the benchmark submitter.
 *
 * Repeatedly prepares a batch of requests, submits them (directly through
 * io_uring_enter(), or implicitly when the kernel SQ poll thread is
 * running), and reaps completions, until s->finish is set or an
 * unrecoverable error occurs. Sets the global `finish` flag on exit so that
 * main() stops as well.
 *
 * @data: the struct submitter to drive.
 * Returns NULL.
 */
static void *submitter_fn(void *data)
{
	struct submitter *s = data;
	struct io_sq_ring *ring = &s->sq_ring;
	int ret, prepped;

	printf("submitter=%d\n", lk_gettid());

	srand48_r(pthread_self(), &s->rand);

	prepped = 0;
	do {
		int to_wait, to_submit, this_reap, to_prep;

		if (!prepped && s->inflight < DEPTH) {
			to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT);
			prepped = prep_more_ios(s, to_prep);
		}
		s->inflight += prepped;
submit_more:
		to_submit = prepped;
submit:
		if (to_submit && (s->inflight + to_submit <= DEPTH))
			to_wait = 0;
		else
			to_wait = min(s->inflight + to_submit, BATCH_COMPLETE);

		/*
		 * Only need to call io_uring_enter if we're not using SQ thread
		 * poll, or if IORING_SQ_NEED_WAKEUP is set.
		 */
		if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) {
			unsigned flags = 0;

			if (to_wait)
				flags = IORING_ENTER_GETEVENTS;
			if ((*ring->flags & IORING_SQ_NEED_WAKEUP))
				flags |= IORING_ENTER_SQ_WAKEUP;
			ret = io_uring_enter(s->ring_fd, to_submit, to_wait,
					     flags, NULL);
			s->calls++;
		} else {
			/*
			 * The SQ poll thread picks the entries up by itself;
			 * treat everything as submitted so that `ret` is never
			 * read uninitialized below.
			 */
			ret = to_submit;
		}

		/*
		 * For non SQ thread poll, we already got the events we needed
		 * through the io_uring_enter() above. For SQ thread poll, we
		 * need to loop here until we find enough events.
		 */
		this_reap = 0;
		do {
			int r;

			r = reap_events(s);
			if (r == -1) {
				s->finish = 1;
				break;
			} else if (r > 0)
				this_reap += r;
		} while (sq_thread_poll && this_reap < to_wait);
		s->reaps += this_reap;

		if (ret >= 0) {
			if (!ret) {
				/* nothing accepted: just wait for completions */
				to_submit = 0;
				if (s->inflight)
					goto submit;
				continue;
			} else if (ret < to_submit) {
				/* partial submit: retry the remainder */
				int diff = to_submit - ret;

				s->done += ret;
				prepped -= diff;
				goto submit_more;
			}
			s->done += ret;
			prepped = 0;
			continue;
		} else {
			if (errno == EAGAIN) {
				if (s->finish)
					break;
				if (this_reap)
					goto submit;
				to_submit = 0;
				goto submit;
			}
			printf("io_submit: %s\n", strerror(errno));
			break;
		}
	} while (!s->finish);

	finish = 1;
	return NULL;
}
/*
 * SIGINT handler: announce the signal and raise both stop flags so that the
 * submitter thread and main() leave their loops.
 * NOTE(review): printf() is not async-signal-safe; tolerated here because
 * this is a test tool.
 */
static void sig_int(int sig)
{
printf("Exiting on signal %d\n", sig);
submitters[0].finish = 1;
finish = 1;
}
static void arm_sig_int(void)
{
struct sigaction act;
memset(&act, 0, sizeof(act));
act.sa_handler = sig_int;
act.sa_flags = SA_RESTART;
sigaction(SIGINT, &act, NULL);
}
/*
 * Create the io_uring instance for @s and map its rings.
 *
 * Builds the setup flags from the global options, optionally registers the
 * fixed buffers and files, then maps the SQ ring, the SQE array and the CQ
 * ring and resolves the ring field pointers from the offsets reported by
 * the kernel. The ring masks are cached in sq_ring_mask / cq_ring_mask.
 *
 * Returns 0 on success and 1 on failure, after reporting the failing step
 * with perror().
 */
static int setup_ring(struct submitter *s)
{
	struct io_sq_ring *sring = &s->sq_ring;
	struct io_cq_ring *cring = &s->cq_ring;
	struct io_uring_params p;
	int ret, fd;
	void *ptr;

	memset(&p, 0, sizeof(p));

	if (polled && !do_nop)
		p.flags |= IORING_SETUP_IOPOLL;
	if (sq_thread_poll) {
		p.flags |= IORING_SETUP_SQPOLL;
		if (sq_thread_cpu != -1) {
			p.flags |= IORING_SETUP_SQ_AFF;
			p.sq_thread_cpu = sq_thread_cpu;
		}
	}

	fd = io_uring_setup(DEPTH, &p);
	if (fd < 0) {
		perror("io_uring_setup");
		return 1;
	}
	s->ring_fd = fd;

	if (fixedbufs) {
		ret = io_uring_register_buffers(s);
		if (ret < 0) {
			perror("io_uring_register_buffers");
			return 1;
		}
	}

	if (register_files) {
		ret = io_uring_register_files(s);
		if (ret < 0) {
			perror("io_uring_register_files");
			return 1;
		}
	}

	/* Submission queue ring. */
	ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   IORING_OFF_SQ_RING);
	if (ptr == MAP_FAILED) {
		perror("mmap sq_ring");
		return 1;
	}
	printf("sq_ring ptr = %p\n", ptr);
	sring->head = ptr + p.sq_off.head;
	sring->tail = ptr + p.sq_off.tail;
	sring->ring_mask = ptr + p.sq_off.ring_mask;
	sring->ring_entries = ptr + p.sq_off.ring_entries;
	sring->flags = ptr + p.sq_off.flags;
	sring->array = ptr + p.sq_off.array;
	sq_ring_mask = *sring->ring_mask;

	/* Submission queue entries. */
	s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		       IORING_OFF_SQES);
	if (s->sqes == MAP_FAILED) {
		perror("mmap sqes");
		return 1;
	}
	printf("sqes ptr = %p\n", (void *) s->sqes);

	/* Completion queue ring. */
	ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   IORING_OFF_CQ_RING);
	if (ptr == MAP_FAILED) {
		perror("mmap cq_ring");
		return 1;
	}
	printf("cq_ring ptr = %p\n", ptr);
	cring->head = ptr + p.cq_off.head;
	cring->tail = ptr + p.cq_off.tail;
	cring->ring_mask = ptr + p.cq_off.ring_mask;
	cring->ring_entries = ptr + p.cq_off.ring_entries;
	cring->cqes = ptr + p.cq_off.cqes;
	cq_ring_mask = *cring->ring_mask;
	return 0;
}
/*
 * Format the pending-I/O count of every file of submitter 0 into @buf as a
 * comma-separated list ("3, 0, 7"). With no files the result is the empty
 * string. The caller must provide a buffer large enough for all entries.
 */
static void file_depths(char *buf)
{
	struct submitter *s = &submitters[0];
	unsigned i;
	char *p;

	buf[0] = '\0';
	p = buf;
	for (i = 0; i < s->nr_files; i++) {
		struct file *f = &s->files[i];

		/* pending_ios is unsigned, so print it with %u */
		if (i + 1 == s->nr_files)
			p += sprintf(p, "%u", f->pending_ios);
		else
			p += sprintf(p, "%u, ", f->pending_ios);
	}
}
/*
 * Benchmark entry point.
 *
 * Opens every file named on the command line (up to MAX_FDS), allocates the
 * I/O buffers, sets up the ring, starts the submitter thread and then
 * prints throughput statistics once a second until `finish` is set.
 *
 * Returns 0 on a normal run and 1 on any setup failure.
 */
int main(int argc, char *argv[])
{
	struct submitter *s = &submitters[0];
	unsigned long done, calls, reap;
	int err, i, flags, fd;
	char *fdepths;
	void *ret;

	if (!do_nop && argc < 2) {
		printf("%s: filename\n", argv[0]);
		return 1;
	}

	flags = O_RDONLY | O_NOATIME;
	if (!buffered)
		flags |= O_DIRECT;

	i = 1;
	while (!do_nop && i < argc) {
		struct file *f;

		if (s->nr_files == MAX_FDS) {
			printf("Max number of files (%d) reached\n", MAX_FDS);
			break;
		}
		fd = open(argv[i], flags);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		f = &s->files[s->nr_files];
		f->real_fd = fd;
		if (get_file_size(f)) {
			printf("failed getting size of device/file\n");
			return 1;
		}
		if (f->max_blocks <= 1) {
			printf("Zero file/device size?\n");
			return 1;
		}
		f->max_blocks--;

		printf("Added file %s\n", argv[i]);
		s->nr_files++;
		i++;
	}

	if (fixedbufs) {
		struct rlimit rlim;

		/* fixed buffers are pinned, so lift the locked-memory limit */
		rlim.rlim_cur = RLIM_INFINITY;
		rlim.rlim_max = RLIM_INFINITY;
		if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
			perror("setrlimit");
			return 1;
		}
	}

	arm_sig_int();

	for (i = 0; i < DEPTH; i++) {
		void *buf;

		if (posix_memalign(&buf, BS, BS)) {
			printf("failed alloc\n");
			return 1;
		}
		s->iovecs[i].iov_base = buf;
		s->iovecs[i].iov_len = BS;
	}

	err = setup_ring(s);
	if (err) {
		printf("ring setup failed: %s, %d\n", strerror(errno), err);
		return 1;
	}
	printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered);
	printf(" QD=%d, sq_ring=%u, cq_ring=%u\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);

	/*
	 * Room for the per-file depth string; the extra byte keeps the
	 * terminating NUL in bounds even when there are no files.
	 */
	fdepths = malloc(8 * s->nr_files + 1);
	if (!fdepths) {
		printf("failed alloc\n");
		return 1;
	}

	/* pthread_create() returns the error number instead of setting errno */
	err = pthread_create(&s->thread, NULL, submitter_fn, s);
	if (err) {
		printf("pthread_create: %s\n", strerror(err));
		free(fdepths);
		return 1;
	}

	reap = calls = done = 0;
	do {
		unsigned long this_done = 0;
		unsigned long this_reap = 0;
		unsigned long this_call = 0;
		unsigned long rpc = 0, ipc = 0;

		sleep(1);
		this_done += s->done;
		this_call += s->calls;
		this_reap += s->reaps;
		if (this_call - calls) {
			rpc = (this_done - done) / (this_call - calls);
			ipc = (this_reap - reap) / (this_call - calls);
		} else
			rpc = ipc = -1;
		file_depths(fdepths);
		printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n",
		       this_done - done, rpc, ipc, s->inflight,
		       fdepths);
		done = this_done;
		calls = this_call;
		reap = this_reap;
	} while (!finish);

	pthread_join(s->thread, &ret);
	close(s->ring_fd);
	free(fdepths);
	return 0;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Simple test program that demonstrates a file copy through io_uring. This
* uses the API exposed by liburing.
*
* Copyright (C) 2018-2019 Jens Axboe
*/
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include "liburing.h"
/* Ring size, and cap on reads plus writes outstanding at once. */
#define QD 64
/* Largest chunk copied by a single read/write pair. */
#define BS (32*1024)
/* Source and destination descriptors, opened in main(). */
static int infd, outfd;
/*
 * Bookkeeping for one chunk being copied. The data buffer is allocated
 * directly behind this structure (see queue_read()).
 */
struct io_data {
/* 1 while the chunk is being read, 0 once it is being written */
int read;
/* first_offset: where the chunk starts; offset: where the current I/O resumes */
off_t first_offset, offset;
/* full length of the chunk */
size_t first_len;
/* describes the part of the buffer still to be transferred */
struct iovec iov;
};
/*
 * Initialise @ring with @entries entries and default flags.
 * Returns 0 on success; on failure prints the reason to stderr and
 * returns -1.
 */
static int setup_context(unsigned entries, struct io_uring *ring)
{
	int rc = io_uring_queue_init(entries, ring, 0);

	if (rc >= 0)
		return 0;

	fprintf(stderr, "queue_init: %s\n", strerror(-rc));
	return -1;
}
/*
 * Store the size in bytes of the regular file or block device behind @fd
 * in *@size. Returns 0 on success; returns -1 and leaves *@size untouched
 * if the descriptor cannot be examined or refers to another file type.
 */
static int get_file_size(int fd, off_t *size)
{
	unsigned long long bytes;
	struct stat st;

	if (fstat(fd, &st) < 0)
		return -1;

	if (S_ISBLK(st.st_mode)) {
		if (ioctl(fd, BLKGETSIZE64, &bytes) != 0)
			return -1;
		*size = bytes;
		return 0;
	}

	if (!S_ISREG(st.st_mode))
		return -1;

	*size = st.st_size;
	return 0;
}
/*
 * Grab an SQE and prepare it from the current state of @data: a readv from
 * infd or a writev to outfd, at data->offset. The SQE must be available
 * (asserted); the request is not submitted here.
 */
static void queue_prepped(struct io_uring *ring, struct io_data *data)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	assert(sqe);

	if (!data->read)
		io_uring_prep_writev(sqe, outfd, &data->iov, 1, data->offset);
	else
		io_uring_prep_readv(sqe, infd, &data->iov, 1, data->offset);

	io_uring_sqe_set_data(sqe, data);
}
/*
 * Allocate a chunk descriptor with @size bytes of buffer behind it and
 * prepare a read of that many bytes from infd at @offset. The request is
 * only prepared, not submitted.
 * Returns 0 on success, 1 if memory or an SQE could not be obtained.
 */
static int queue_read(struct io_uring *ring, off_t size, off_t offset)
{
	struct io_data *chunk = malloc(size + sizeof(*chunk));
	struct io_uring_sqe *sqe;

	if (!chunk)
		return 1;

	sqe = io_uring_get_sqe(ring);
	if (!sqe) {
		free(chunk);
		return 1;
	}

	chunk->read = 1;
	chunk->first_offset = offset;
	chunk->offset = offset;
	chunk->first_len = size;

	/* the payload buffer starts right after the descriptor */
	chunk->iov.iov_base = chunk + 1;
	chunk->iov.iov_len = size;

	io_uring_prep_readv(sqe, infd, &chunk->iov, 1, offset);
	io_uring_sqe_set_data(sqe, chunk);
	return 0;
}
static void queue_write(struct io_uring *ring, struct io_data *data)
{
data->read = 0;
data->offset = data->first_offset;
data->iov.iov_base = data + 1;
data->iov.iov_len = data->first_len;
queue_prepped(ring, data);
io_uring_submit(ring);
}
/*
 * Copy @insize bytes from infd to outfd through @ring.
 *
 * Reads are queued in chunks of at most BS bytes while fewer than QD
 * requests are outstanding; every completed read is turned into a write of
 * the same chunk. Returns 1 on a completion or wait error.
 * NOTE(review): a failing io_uring_submit() only breaks out of the main
 * loop, so the function then drains pending writes and can still return 0.
 */
static int copy_file(struct io_uring *ring, off_t insize)
{
unsigned long reads, writes;
struct io_uring_cqe *cqe;
off_t write_left, offset;
int ret;
/* insize counts bytes not yet queued for reading, write_left bytes not yet queued for writing */
write_left = insize;
writes = reads = offset = 0;
while (insize || write_left) {
int had_reads, got_comp;
/*
 * Queue up as many reads as we can
 */
had_reads = reads;
while (insize) {
off_t this_size = insize;
if (reads + writes >= QD)
break;
if (this_size > BS)
this_size = BS;
else if (!this_size)
break;
if (queue_read(ring, this_size, offset))
break;
insize -= this_size;
offset += this_size;
reads++;
}
/* submit only if the loop above queued something new */
if (had_reads != reads) {
ret = io_uring_submit(ring);
if (ret < 0) {
fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
break;
}
}
/*
 * Queue is full at this point. Find at least one completion.
 */
got_comp = 0;
while (write_left) {
struct io_data *data;
/* block for the first completion, then only poll for more */
if (!got_comp) {
ret = io_uring_wait_cqe(ring, &cqe);
got_comp = 1;
} else {
ret = io_uring_peek_cqe(ring, &cqe);
if (ret == -EAGAIN) {
cqe = NULL;
ret = 0;
}
}
if (ret < 0) {
fprintf(stderr, "io_uring_peek_cqe: %s\n",
strerror(-ret));
return 1;
}
if (!cqe)
break;
data = io_uring_cqe_get_data(cqe);
if (cqe->res < 0) {
/* -EAGAIN: prepare the very same request again */
if (cqe->res == -EAGAIN) {
queue_prepped(ring, data);
io_uring_cqe_seen(ring, cqe);
continue;
}
fprintf(stderr, "cqe failed: %s\n",
strerror(-cqe->res));
return 1;
} else if (cqe->res != data->iov.iov_len) {
/* Short read/write, adjust and requeue */
data->iov.iov_base += cqe->res;
data->iov.iov_len -= cqe->res;
data->offset += cqe->res;
queue_prepped(ring, data);
io_uring_cqe_seen(ring, cqe);
continue;
}
/*
 * All done. if write, nothing else to do. if read,
 * queue up corresponding write.
 */
if (data->read) {
queue_write(ring, data);
write_left -= data->first_len;
reads--;
writes++;
} else {
free(data);
writes--;
}
io_uring_cqe_seen(ring, cqe);
}
}
/* wait out pending writes */
while (writes) {
struct io_data *data;
ret = io_uring_wait_cqe(ring, &cqe);
if (ret) {
fprintf(stderr, "wait_cqe=%d\n", ret);
return 1;
}
if (cqe->res < 0) {
fprintf(stderr, "write res=%d\n", cqe->res);
return 1;
}
data = io_uring_cqe_get_data(cqe);
free(data);
writes--;
io_uring_cqe_seen(ring, cqe);
}
return 0;
}
/*
 * io_uring-cp entry point: copy argv[1] to argv[2].
 *
 * Opens both files, sets up a ring of QD entries, determines the input size
 * and hands over to copy_file(). Every resource acquired so far is released
 * on each exit path.
 *
 * Returns the result of copy_file(), or 1 on any setup failure.
 */
int main(int argc, char *argv[])
{
	struct io_uring ring;
	off_t insize;
	int ret = 1;

	if (argc < 3) {
		printf("%s: infile outfile\n", argv[0]);
		return 1;
	}

	infd = open(argv[1], O_RDONLY);
	if (infd < 0) {
		perror("open infile");
		return 1;
	}

	outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (outfd < 0) {
		perror("open outfile");
		goto out_close_in;
	}

	if (setup_context(QD, &ring))
		goto out_close_out;

	if (get_file_size(infd, &insize))
		goto out_ring;

	ret = copy_file(&ring, insize);

out_ring:
	io_uring_queue_exit(&ring);
out_close_out:
	close(outfd);
out_close_in:
	close(infd);
	return ret;
}
#ifndef LIB_URING_H
#define LIB_URING_H
#ifdef __cplusplus
extern "C" {
#endif
#include <sys/uio.h>
#include <signal.h>
#include <string.h>
#include "../../include/uapi/linux/io_uring.h"
#include <inttypes.h>
#include <linux/swab.h>
#include "barrier.h"
/*
* Library interface to io_uring
*/
/*
 * Submission queue state.
 * NOTE(review): the k-prefixed members look like pointers into the ring
 * memory shared with the kernel; confirm against the setup code, which is
 * not part of this header.
 */
struct io_uring_sq {
unsigned *khead;
unsigned *ktail;
unsigned *kring_mask;
unsigned *kring_entries;
unsigned *kflags;
unsigned *kdropped;
unsigned *array;
struct io_uring_sqe *sqes;
unsigned sqe_head;
unsigned sqe_tail;
size_t ring_sz;
};
/*
 * Completion queue state. io_uring_cqe_seen() advances *khead once an
 * entry has been consumed.
 */
struct io_uring_cq {
unsigned *khead;
unsigned *ktail;
unsigned *kring_mask;
unsigned *kring_entries;
unsigned *koverflow;
struct io_uring_cqe *cqes;
size_t ring_sz;
};
/* One io_uring instance: both queues plus the ring file descriptor. */
struct io_uring {
struct io_uring_sq sq;
struct io_uring_cq cq;
int ring_fd;
};
/*
 * System calls
 */
/* NOTE(review): presumably thin wrappers around the raw syscalls; the
 * definitions live outside this header. */
extern int io_uring_setup(unsigned entries, struct io_uring_params *p);
extern int io_uring_enter(int fd, unsigned to_submit,
unsigned min_complete, unsigned flags, sigset_t *sig);
extern int io_uring_register(int fd, unsigned int opcode, void *arg,
unsigned int nr_args);
/*
 * Library interface
 */
/* Ring lifetime: create, map and tear down a struct io_uring. */
extern int io_uring_queue_init(unsigned entries, struct io_uring *ring,
unsigned flags);
extern int io_uring_queue_mmap(int fd, struct io_uring_params *p,
struct io_uring *ring);
extern void io_uring_queue_exit(struct io_uring *ring);
/* Completion retrieval; pair each returned cqe with io_uring_cqe_seen(). */
extern int io_uring_peek_cqe(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr);
extern int io_uring_wait_cqe(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr);
/* Submission side: obtain an SQE to fill in, then submit what was queued. */
extern int io_uring_submit(struct io_uring *ring);
extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
/*
 * Mark a completion as consumed. Call this once the application is done
 * with a cqe obtained from io_uring_{peek,wait}_cqe(). A NULL cqe is
 * accepted and ignored.
 */
static inline void io_uring_cqe_seen(struct io_uring *ring,
				     struct io_uring_cqe *cqe)
{
	if (!cqe)
		return;

	/* Consume one entry by advancing the CQ head shared with the kernel. */
	*ring->cq.khead += 1;

	/*
	 * Make the new head visible to the kernel; the kernel side has the
	 * matching read barrier.
	 */
	write_barrier();
}
/*
* Command prep helpers
*/
/*
 * Attach an application pointer to a request by storing it in
 * sqe->user_data.
 *
 * The pointer goes through uintptr_t, the same integer type that
 * io_uring_cqe_get_data() uses to turn the value back into a pointer, so
 * the two helpers round-trip through one type.
 */
static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data)
{
	sqe->user_data = (uintptr_t) data;
}
/*
 * Return cqe->user_data converted back to a pointer.
 */
static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe)
{
	/* Narrow to pointer width first, then convert to a pointer. */
	uintptr_t cookie = (uintptr_t) cqe->user_data;

	return (void *) cookie;
}
/*
 * Common initialiser behind the read/write prep helpers.
 *
 * Clears the whole sqe, then records the opcode, the target fd, the file
 * offset, the address passed in 'addr' and the count passed in 'len'.
 */
static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
				    const void *addr, unsigned len,
				    off_t offset)
{
	/* Start from a zeroed entry so every field not set below is 0. */
	memset(sqe, 0, sizeof(struct io_uring_sqe));

	sqe->opcode = op;
	sqe->fd = fd;
	sqe->len = len;
	sqe->addr = (unsigned long) addr;
	sqe->off = offset;
}
/*
 * Prepare a vectored read: fills sqe through io_uring_prep_rw() with
 * opcode IORING_OP_READV, the iovec array as the address and nr_vecs as
 * the length.
 */
static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd,
const struct iovec *iovecs,
unsigned nr_vecs, off_t offset)
{
io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset);
}
/*
 * Prepare a read with opcode IORING_OP_READ_FIXED: fills sqe through
 * io_uring_prep_rw() with buf as the address and nbytes as the length.
 */
static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd,
void *buf, unsigned nbytes,
off_t offset)
{
io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset);
}
/*
 * Prepare a vectored write: fills sqe through io_uring_prep_rw() with
 * opcode IORING_OP_WRITEV, the iovec array as the address and nr_vecs as
 * the length.
 */
static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd,
const struct iovec *iovecs,
unsigned nr_vecs, off_t offset)
{
io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);
}
/*
 * Prepare a write with opcode IORING_OP_WRITE_FIXED: fills sqe through
 * io_uring_prep_rw() with buf as the address and nbytes as the length.
 */
static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd,
const void *buf, unsigned nbytes,
off_t offset)
{
io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset);
}
/*
 * Prepare an IORING_OP_POLL_ADD request on 'fd' for the events in
 * 'poll_mask'. The sqe is cleared first, so all other fields are zero.
 */
static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd,
unsigned poll_mask)
{
memset(sqe, 0, sizeof(*sqe));
sqe->opcode = IORING_OP_POLL_ADD;
sqe->fd = fd;
/*
 * On big-endian builds the mask is passed through __swahw32() before it
 * is stored (presumably a swap of the two 16-bit halves; see
 * <linux/swab.h>).
 * NOTE(review): this test relies on some included header defining
 * __BYTE_ORDER and __BIG_ENDIAN. If neither is defined, both evaluate to
 * 0 in #if and the swap is compiled in unconditionally; confirm.
 */
#if __BYTE_ORDER == __BIG_ENDIAN
poll_mask = __swahw32(poll_mask);
#endif
sqe->poll_events = poll_mask;
}
/*
 * Prepare an IORING_OP_POLL_REMOVE request. The sqe is cleared and
 * 'user_data' is stored in sqe->addr.
 *
 * The pointer goes through uintptr_t, the integer type that
 * io_uring_cqe_get_data() uses for the user-data round trip, so a pointer
 * is converted the same way on both sides.
 */
static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe,
					     void *user_data)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_REMOVE;
	sqe->addr = (uintptr_t) user_data;
}
/*
 * Prepare an IORING_OP_FSYNC request on 'fd' with the given fsync flags.
 * The sqe is cleared first, so all other fields are zero.
 */
static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd,
				       unsigned fsync_flags)
{
	memset(sqe, 0, sizeof(struct io_uring_sqe));

	sqe->fsync_flags = fsync_flags;
	sqe->fd = fd;
	sqe->opcode = IORING_OP_FSYNC;
}
/*
 * Prepare an IORING_OP_NOP request: a cleared sqe carrying only the
 * opcode.
 */
static inline void io_uring_prep_nop(struct io_uring_sqe *sqe)
{
	memset(sqe, 0, sizeof(struct io_uring_sqe));

	sqe->opcode = IORING_OP_NOP;
}
#ifdef __cplusplus
}
#endif
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include "liburing.h"
#include "barrier.h"
/*
 * Look up the completion at the current CQ head without consuming it.
 *
 * *cqe_ptr is set to NULL up front and to the entry at the head once the
 * head differs from the tail. With 'wait' zero the function returns after
 * a single check; otherwise it calls io_uring_enter() asking for one
 * completion and checks again.
 *
 * Returns 0 (with *cqe_ptr either set or still NULL), or -errno when
 * io_uring_enter() fails.
 */
static int __io_uring_get_cqe(struct io_uring *ring,
			      struct io_uring_cqe **cqe_ptr, int wait)
{
	struct io_uring_cq *cq = &ring->cq;
	const unsigned mask = *cq->kring_mask;
	unsigned head;

	*cqe_ptr = NULL;
	head = *cq->khead;

	for (;;) {
		/*
		 * The kernel updates the CQ tail locklessly, so a
		 * read_barrier() is needed before reading it. The kernel
		 * has the matching store barrier for the update, and also
		 * orders its earlier stores to the CQEs before the tail
		 * update.
		 */
		read_barrier();
		if (head != *cq->ktail) {
			*cqe_ptr = &cq->cqes[head & mask];
			return 0;
		}

		if (!wait)
			return 0;

		if (io_uring_enter(ring->ring_fd, 0, 1,
				   IORING_ENTER_GETEVENTS, NULL) < 0)
			return -errno;
	}
}
/*
 * Return an IO completion, if one is readily available.
 *
 * Calls __io_uring_get_cqe() with waiting disabled, so the kernel is never
 * entered. Returns 0 either way: *cqe_ptr points at the completion when
 * one is available and is NULL when none is.
 */
int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
{
return __io_uring_get_cqe(ring, cqe_ptr, 0);
}
/*
 * Return an IO completion, waiting for it if necessary.
 *
 * Calls __io_uring_get_cqe() with waiting enabled. Returns 0 with *cqe_ptr
 * filled in on success, or -errno if io_uring_enter() fails while waiting
 * (in which case *cqe_ptr is NULL).
 */
int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
{
return __io_uring_get_cqe(ring, cqe_ptr, 1);
}
/*
 * Submit sqes acquired from io_uring_get_sqe() to the kernel.
 *
 * Returns 0 when nothing is queued, the return value of io_uring_enter()
 * (the number of sqes submitted) on success, or -errno on failure.
 */
int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned mask = *sq->kring_mask;
	unsigned tail, pending, submitted;
	int ret;

	/*
	 * If the kernel ring still holds entries it has not consumed,
	 * submit those first. The read barrier pairs with the kernel's
	 * store barrier when it updates the SQ head.
	 */
	read_barrier();
	if (*sq->khead != *sq->ktail) {
		submitted = *sq->kring_entries;
		goto submit;
	}

	if (sq->sqe_head == sq->sqe_tail)
		return 0;

	/*
	 * Publish every locally queued sqe by writing its index into the
	 * kernel-visible index array.
	 */
	submitted = 0;
	tail = *sq->ktail;
	for (pending = sq->sqe_tail - sq->sqe_head; pending; pending--) {
		read_barrier();
		sq->array[tail & mask] = sq->sqe_head & mask;
		tail++;
		sq->sqe_head++;
		submitted++;
	}

	if (!submitted)
		return 0;

	if (*sq->ktail != tail) {
		/*
		 * The first write barrier orders the SQE stores before the
		 * tail update, so the kernel never sees a new tail without
		 * the preceding SQE stores being done.
		 */
		write_barrier();
		*sq->ktail = tail;
		/*
		 * The kernel has the matching read barrier for reading the
		 * SQ tail.
		 */
		write_barrier();
	}

submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
			     IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}
/*
 * Hand out the next free sqe for the application to fill in. Nothing is
 * passed to the kernel until io_uring_submit() is called; this function
 * may be called several times before that.
 *
 * Returns a vacant sqe, or NULL when every sqe is already in use.
 */
struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned tail = sq->sqe_tail;
	struct io_uring_sqe *slot;

	/* Taking one more would exceed the ring size: all sqes are used. */
	if (tail + 1 - sq->sqe_head > *sq->kring_entries)
		return NULL;

	slot = &sq->sqes[tail & *sq->kring_mask];
	sq->sqe_tail = tail + 1;
	return slot;
}
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include "liburing.h"
static int io_uring_mmap(int fd, struct io_uring_params *p,
struct io_uring_sq *sq, struct io_uring_cq *cq)
{
size_t size;
void *ptr;
int ret;
sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
if (ptr == MAP_FAILED)
return -errno;
sq->khead = ptr + p->sq_off.head;
sq->ktail = ptr + p->sq_off.tail;
sq->kring_mask = ptr + p->sq_off.ring_mask;
sq->kring_entries = ptr + p->sq_off.ring_entries;
sq->kflags = ptr + p->sq_off.flags;
sq->kdropped = ptr + p->sq_off.dropped;
sq->array = ptr + p->sq_off.array;
size = p->sq_entries * sizeof(struct io_uring_sqe);
sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd,
IORING_OFF_SQES);
if (sq->sqes == MAP_FAILED) {
ret = -errno;
err:
munmap(sq->khead, sq->ring_sz);
return ret;
}
cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
if (ptr == MAP_FAILED) {
ret = -errno;
munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
goto err;
}
cq->khead = ptr + p->cq_off.head;
cq->ktail = ptr + p->cq_off.tail;
cq->kring_mask = ptr + p->cq_off.ring_mask;
cq->kring_entries = ptr + p->cq_off.ring_entries;
cq->koverflow = ptr + p->cq_off.overflow;
cq->cqes = ptr + p->cq_off.cqes;
return 0;
}
/*
 * For users that want to specify sq_thread_cpu or sq_thread_idle, this
 * interface is a convenient helper for mmap()ing the rings of a
 * descriptor they set up themselves.
 *
 * 'ring' is zeroed first. Returns zero on success, in which case 'ring'
 * contains the information needed to read/write the rings and records
 * 'fd' as its descriptor. On failure, returns the negative error code
 * from mapping the rings and leaves ring->ring_fd at zero.
 */
int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring)
{
	int err;

	memset(ring, 0, sizeof(struct io_uring));

	err = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
	if (err)
		return err;

	ring->ring_fd = fd;
	return err;
}
/*
 * Create an io_uring instance with 'entries' entries and the given setup
 * flags, and map its rings into 'ring'.
 *
 * Returns zero on success, in which case 'ring' contains the information
 * needed to read/write the rings. Returns -errno on failure, both when
 * io_uring_setup() fails and when mapping the rings fails; in the latter
 * case the ring descriptor is closed again.
 */
int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags)
{
	struct io_uring_params p;
	int fd, ret;

	memset(&p, 0, sizeof(p));
	p.flags = flags;

	/*
	 * io_uring_setup() is a plain syscall() wrapper: it reports failure
	 * as -1 with the cause in errno, so convert it to the -errno form
	 * the other failure paths in this file return.
	 */
	fd = io_uring_setup(entries, &p);
	if (fd < 0)
		return -errno;

	ret = io_uring_queue_mmap(fd, &p, ring);
	if (ret)
		close(fd);
	return ret;
}
/*
 * Tear down a ring: unmap the sqe array, the SQ ring and the CQ ring, then
 * close the ring descriptor.
 */
void io_uring_queue_exit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	struct io_uring_cq *cq = &ring->cq;
	/* Read the entry count while the SQ ring is still mapped. */
	size_t sqes_sz = *sq->kring_entries * sizeof(struct io_uring_sqe);

	munmap(sq->sqes, sqes_sz);
	munmap(sq->khead, sq->ring_sz);
	munmap(cq->khead, cq->ring_sz);
	close(ring->ring_fd);
}
/*
* Will go away once libc support is there
*/
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <signal.h>
#include "liburing.h"
/*
 * Fallback system call numbers for io_uring_setup/enter/register. Each one
 * is defined only if the headers included above have not already provided
 * it.
 */
#ifdef __alpha__
/*
 * alpha is the only exception, all other architectures
 * have common numbers for new system calls.
 */
# ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 535
# endif
# ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 536
# endif
# ifndef __NR_io_uring_register
# define __NR_io_uring_register 537
# endif
#else /* !__alpha__ */
# ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 425
# endif
# ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 426
# endif
# ifndef __NR_io_uring_register
# define __NR_io_uring_register 427
# endif
#endif
/*
 * Thin wrapper that invokes the io_uring_register system call through
 * syscall() with the arguments passed through unchanged, and returns
 * syscall()'s result.
 */
int io_uring_register(int fd, unsigned int opcode, void *arg,
unsigned int nr_args)
{
return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}
/*
 * Thin wrapper that invokes the io_uring_setup system call through
 * syscall() and returns its result. io_uring_queue_init() treats a
 * non-negative result as the ring descriptor.
 */
int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
return syscall(__NR_io_uring_setup, entries, p);
}
/*
 * Thin wrapper that invokes the io_uring_enter system call through
 * syscall() and returns its result.
 *
 * In addition to the caller's arguments it passes _NSIG / 8 as a sixth
 * argument (presumably the size in bytes of the signal mask 'sig' points
 * to; confirm against io_uring_enter(2)).
 */
int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete,
unsigned int flags, sigset_t *sig)
{
return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
flags, sig, _NSIG / 8);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment