Commit fb4b3d3f authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.5/io_uring-20191121' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "A lot of stuff has been going on this cycle, with improving the
  support for networked IO (and hence unbounded request completion
  times) being one of the major themes. There's been a set of fixes done
  this week, I'll send those out as well once we're certain we're fully
  happy with them.

  This contains:

   - Unification of the "normal" submit path and the SQPOLL path (Pavel)

   - Support for sparse (and bigger) file sets, and updating of those
     file sets without needing to unregister/register again.

   - Independently sized CQ ring, instead of just making it always 2x
     the SQ ring size. This makes it more flexible for networked
     applications.

   - Support for overflowed CQ ring, never dropping events but providing
     backpressure on submits.

   - Add support for absolute timeouts, not just relative ones.

   - Support for generic cancellations. This divorces io_uring from
     workqueues as well, which additionally gets us one step closer to
     generic async system call support.

   - With cancellations, we can support grabbing the process file table
     as well, just like we do mm context. This allows support for system
     calls that create file descriptors, like accept4() support that's
     built on top of that.

   - Support for io_uring tracing (Dmitrii)

   - Support for linked timeouts. These abort an operation if it isn't
     completed by the time noted in the linke timeout.

   - Speedup tracking of poll requests

   - Various cleanups making the coder easier to follow (Jackie, Pavel,
     Bob, YueHaibing, me)

   - Update MAINTAINERS with new io_uring list"

* tag 'for-5.5/io_uring-20191121' of git://git.kernel.dk/linux-block: (64 commits)
  io_uring: make POLL_ADD/POLL_REMOVE scale better
  io-wq: remove now redundant struct io_wq_nulls_list
  io_uring: Fix getting file for non-fd opcodes
  io_uring: introduce req_need_defer()
  io_uring: clean up io_uring_cancel_files()
  io-wq: ensure free/busy list browsing see all items
  io-wq: ensure we have a stable view of ->cur_work for cancellations
  io_wq: add get/put_work handlers to io_wq_create()
  io_uring: check for validity of ->rings in teardown
  io_uring: fix potential deadlock in io_poll_wake()
  io_uring: use correct "is IO worker" helper
  io_uring: fix -ENOENT issue with linked timer with short timeout
  io_uring: don't do flush cancel under inflight_lock
  io_uring: flag SQPOLL busy condition to userspace
  io_uring: make ASYNC_CANCEL work with poll and timeout
  io_uring: provide fallback request for OOM situations
  io_uring: convert accept4() -ERESTARTSYS into -EINTR
  io_uring: fix error clear of ->file_table in io_sqe_files_register()
  io_uring: separate the io_free_req and io_free_req_find_next interface
  io_uring: keep io_put_req only responsible for release and put req
  ...
parents 54f0e540 eac406c6
......@@ -8564,12 +8564,13 @@ F: include/linux/iova.h
IO_URING
M: Jens Axboe <axboe@kernel.dk>
L: linux-block@vger.kernel.org
L: linux-fsdevel@vger.kernel.org
L: io-uring@vger.kernel.org
T: git git://git.kernel.dk/linux-block
T: git git://git.kernel.dk/liburing
S: Maintained
F: fs/io_uring.c
F: fs/io-wq.c
F: fs/io-wq.h
F: include/uapi/linux/io_uring.h
IPMI SUBSYSTEM
......
......@@ -322,4 +322,7 @@ source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
source "fs/unicode/Kconfig"
config IO_WQ
bool
endmenu
......@@ -32,6 +32,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
......
This diff is collapsed.
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_NEEDS_USER = 8,
IO_WQ_WORK_NEEDS_FILES = 16,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_INTERNAL = 64,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
enum io_wq_cancel {
IO_WQ_CANCEL_OK, /* cancelled before started */
IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
struct io_wq_work {
struct list_head list;
void (*func)(struct io_wq_work **);
unsigned flags;
struct files_struct *files;
};
#define INIT_IO_WORK(work, _func) \
do { \
(work)->func = _func; \
(work)->flags = 0; \
(work)->files = NULL; \
} while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
struct user_struct *user,
get_work_fn *get_work, put_work_fn *put_work);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
void io_wq_flush(struct io_wq *wq);
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data);
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER);
}
#endif
This diff is collapsed.
......@@ -1027,6 +1027,7 @@ header-test- += trace/events/fsi_master_gpio.h
header-test- += trace/events/huge_memory.h
header-test- += trace/events/ib_mad.h
header-test- += trace/events/ib_umad.h
header-test- += trace/events/io_uring.h
header-test- += trace/events/iscsi.h
header-test- += trace/events/jbd2.h
header-test- += trace/events/kvm.h
......
......@@ -1468,6 +1468,7 @@ extern struct pid *cad_pid;
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
......
......@@ -392,6 +392,9 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
extern int __sys_sendto(int fd, void __user *buff, size_t len,
unsigned int flags, struct sockaddr __user *addr,
int addr_len);
extern int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
......
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM io_uring
#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IO_URING_H
#include <linux/tracepoint.h>
struct io_wq_work;
/**
* io_uring_create - called after a new io_uring context was prepared
*
* @fd: corresponding file descriptor
* @ctx: pointer to a ring context structure
* @sq_entries: actual SQ size
* @cq_entries: actual CQ size
* @flags: SQ ring flags, provided to io_uring_setup(2)
*
* Allows to trace io_uring creation and provide pointer to a context, that can
* be used later to find correlated events.
*/
TRACE_EVENT(io_uring_create,
TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags),
TP_ARGS(fd, ctx, sq_entries, cq_entries, flags),
TP_STRUCT__entry (
__field( int, fd )
__field( void *, ctx )
__field( u32, sq_entries )
__field( u32, cq_entries )
__field( u32, flags )
),
TP_fast_assign(
__entry->fd = fd;
__entry->ctx = ctx;
__entry->sq_entries = sq_entries;
__entry->cq_entries = cq_entries;
__entry->flags = flags;
),
TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d",
__entry->ctx, __entry->fd, __entry->sq_entries,
__entry->cq_entries, __entry->flags)
);
/**
* io_uring_register - called after a buffer/file/eventfd was succesfully
* registered for a ring
*
* @ctx: pointer to a ring context structure
* @opcode: describes which operation to perform
* @nr_user_files: number of registered files
* @nr_user_bufs: number of registered buffers
* @cq_ev_fd: whether eventfs registered or not
* @ret: return code
*
* Allows to trace fixed files/buffers/eventfds, that could be registered to
* avoid an overhead of getting references to them for every operation. This
* event, together with io_uring_file_get, can provide a full picture of how
* much overhead one can reduce via fixing.
*/
TRACE_EVENT(io_uring_register,
TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
unsigned nr_bufs, bool eventfd, long ret),
TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret),
TP_STRUCT__entry (
__field( void *, ctx )
__field( unsigned, opcode )
__field( unsigned, nr_files )
__field( unsigned, nr_bufs )
__field( bool, eventfd )
__field( long, ret )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->nr_files = nr_files;
__entry->nr_bufs = nr_bufs;
__entry->eventfd = eventfd;
__entry->ret = ret;
),
TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, "
"eventfd %d, ret %ld",
__entry->ctx, __entry->opcode, __entry->nr_files,
__entry->nr_bufs, __entry->eventfd, __entry->ret)
);
/**
* io_uring_file_get - called before getting references to an SQE file
*
* @ctx: pointer to a ring context structure
* @fd: SQE file descriptor
*
* Allows to trace out how often an SQE file reference is obtained, which can
* help figuring out if it makes sense to use fixed files, or check that fixed
* files are used correctly.
*/
TRACE_EVENT(io_uring_file_get,
TP_PROTO(void *ctx, int fd),
TP_ARGS(ctx, fd),
TP_STRUCT__entry (
__field( void *, ctx )
__field( int, fd )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->fd = fd;
),
TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd)
);
/**
* io_uring_queue_async_work - called before submitting a new async work
*
* @ctx: pointer to a ring context structure
* @hashed: type of workqueue, hashed or normal
* @req: pointer to a submitted request
* @work: pointer to a submitted io_wq_work
*
* Allows to trace asynchronous work submission.
*/
TRACE_EVENT(io_uring_queue_async_work,
TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work,
unsigned int flags),
TP_ARGS(ctx, rw, req, work, flags),
TP_STRUCT__entry (
__field( void *, ctx )
__field( int, rw )
__field( void *, req )
__field( struct io_wq_work *, work )
__field( unsigned int, flags )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->rw = rw;
__entry->req = req;
__entry->work = work;
__entry->flags = flags;
),
TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
__entry->ctx, __entry->req, __entry->flags,
__entry->rw ? "hashed" : "normal", __entry->work)
);
/**
* io_uring_defer_list - called before the io_uring work added into defer_list
*
* @ctx: pointer to a ring context structure
* @req: pointer to a deferred request
* @shadow: whether request is shadow or not
*
* Allows to track deferred requests, to get an insight about what requests are
* not started immediately.
*/
TRACE_EVENT(io_uring_defer,
TP_PROTO(void *ctx, void *req, bool shadow),
TP_ARGS(ctx, req, shadow),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( bool, shadow )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->shadow = shadow;
),
TP_printk("ring %p, request %p%s", __entry->ctx, __entry->req,
__entry->shadow ? ", shadow": "")
);
/**
* io_uring_link - called before the io_uring request added into link_list of
* another request
*
* @ctx: pointer to a ring context structure
* @req: pointer to a linked request
* @target_req: pointer to a previous request, that would contain @req
*
* Allows to track linked requests, to understand dependencies between requests
* and how does it influence their execution flow.
*/
TRACE_EVENT(io_uring_link,
TP_PROTO(void *ctx, void *req, void *target_req),
TP_ARGS(ctx, req, target_req),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( void *, target_req )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->target_req = target_req;
),
TP_printk("ring %p, request %p linked after %p",
__entry->ctx, __entry->req, __entry->target_req)
);
/**
* io_uring_cqring_wait - called before start waiting for an available CQE
*
* @ctx: pointer to a ring context structure
* @min_events: minimal number of events to wait for
*
* Allows to track waiting for CQE, so that we can e.g. troubleshoot
* situations, when an application wants to wait for an event, that never
* comes.
*/
TRACE_EVENT(io_uring_cqring_wait,
TP_PROTO(void *ctx, int min_events),
TP_ARGS(ctx, min_events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( int, min_events )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->min_events = min_events;
),
TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events)
);
/**
* io_uring_fail_link - called before failing a linked request
*
* @req: request, which links were cancelled
* @link: cancelled link
*
* Allows to track linked requests cancellation, to see not only that some work
* was cancelled, but also which request was the reason.
*/
TRACE_EVENT(io_uring_fail_link,
TP_PROTO(void *req, void *link),
TP_ARGS(req, link),
TP_STRUCT__entry (
__field( void *, req )
__field( void *, link )
),
TP_fast_assign(
__entry->req = req;
__entry->link = link;
),
TP_printk("request %p, link %p", __entry->req, __entry->link)
);
/**
* io_uring_complete - called when completing an SQE
*
* @ctx: pointer to a ring context structure
* @user_data: user data associated with the request
* @res: result of the request
*
*/
TRACE_EVENT(io_uring_complete,
TP_PROTO(void *ctx, u64 user_data, long res),
TP_ARGS(ctx, user_data, res),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u64, user_data )
__field( long, res )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->user_data = user_data;
__entry->res = res;
),
TP_printk("ring %p, user_data 0x%llx, result %ld",
__entry->ctx, (unsigned long long)__entry->user_data,
__entry->res)
);
/**
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
* @user_data: user data associated with the request
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
*
* Allows to track SQE submitting, to understand what was the source of it, SQ
* thread or io_uring_enter call.
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u64, user_data )
__field( bool, force_nonblock )
__field( bool, sq_thread )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->user_data = user_data;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
),
TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, (unsigned long long) __entry->user_data,
__entry->force_nonblock, __entry->sq_thread)
);
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
......@@ -19,7 +19,10 @@ struct io_uring_sqe {
__u8 flags; /* IOSQE_ flags */
__u16 ioprio; /* ioprio for the request */
__s32 fd; /* file descriptor to do IO on */
__u64 off; /* offset into file */
union {
__u64 off; /* offset into file */
__u64 addr2;
};
__u64 addr; /* pointer to buffer or iovecs */
__u32 len; /* buffer size or number of iovecs */
union {
......@@ -29,6 +32,8 @@ struct io_uring_sqe {
__u32 sync_range_flags;
__u32 msg_flags;
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
};
__u64 user_data; /* data to be passed back at completion time */
union {
......@@ -50,6 +55,7 @@ struct io_uring_sqe {
#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#define IORING_OP_NOP 0
#define IORING_OP_READV 1
......@@ -63,12 +69,21 @@ struct io_uring_sqe {
#define IORING_OP_SENDMSG 9
#define IORING_OP_RECVMSG 10
#define IORING_OP_TIMEOUT 11
#define IORING_OP_TIMEOUT_REMOVE 12
#define IORING_OP_ACCEPT 13
#define IORING_OP_ASYNC_CANCEL 14
#define IORING_OP_LINK_TIMEOUT 15
/*
* sqe->fsync_flags
*/
#define IORING_FSYNC_DATASYNC (1U << 0)
/*
* sqe->timeout_flags
*/
#define IORING_TIMEOUT_ABS (1U << 0)
/*
* IO completion data structure (Completion Queue Entry)
*/
......@@ -140,6 +155,7 @@ struct io_uring_params {
* io_uring_params->features flags
*/
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
#define IORING_FEAT_NODROP (1U << 1)
/*
* io_uring_register(2) opcodes and arguments
......@@ -150,5 +166,11 @@ struct io_uring_params {
#define IORING_UNREGISTER_FILES 3
#define IORING_REGISTER_EVENTFD 4
#define IORING_UNREGISTER_EVENTFD 5
#define IORING_REGISTER_FILES_UPDATE 6
struct io_uring_files_update {
__u32 offset;
__s32 *fds;
};
#endif
......@@ -1548,6 +1548,7 @@ config AIO
config IO_URING
bool "Enable IO uring support" if EXPERT
select ANON_INODES
select IO_WQ
default y
help
This option enables support for the io_uring interface, enabling
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment