Commit e59cd880 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Here are the io_uring changes for this merge window. Light on new
  features this time around (just splice + buffer selection), lots of
  cleanups, fixes, and improvements to existing support. In particular,
  this contains:

   - Cleanup fixed file update handling for stack fallback (Hillf)

   - Re-work of how pollable async IO is handled, we no longer require
     thread offload to handle that. Instead we rely on poll to drive
     this, with task_work execution.

   - In conjunction with the above, allow expendable buffer selection,
     so that poll+recv (for example) no longer has to be a split
     operation.

   - Make sure we honor RLIMIT_FSIZE for buffered writes

   - Add support for splice (Pavel)

   - Linked work inheritance fixes and optimizations (Pavel)

   - Async work fixes and cleanups (Pavel)

   - Improve io-wq locking (Pavel)

   - Hashed link write improvements (Pavel)

   - SETUP_IOPOLL|SETUP_SQPOLL improvements (Xiaoguang)"

* tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block: (54 commits)
  io_uring: cleanup io_alloc_async_ctx()
  io_uring: fix missing 'return' in comment
  io-wq: handle hashed writes in chains
  io-uring: drop 'free_pfile' in struct io_file_put
  io-uring: drop completion when removing file
  io_uring: Fix ->data corruption on re-enqueue
  io-wq: close cancel gap for hashed linked work
  io_uring: make spdxcheck.py happy
  io_uring: honor original task RLIMIT_FSIZE
  io-wq: hash dependent work
  io-wq: split hashing and enqueueing
  io-wq: don't resched if there is no work
  io-wq: remove duplicated cancel code
  io_uring: fix truncated async read/readv and write/writev retry
  io_uring: dual license io_uring.h uapi header
  io_uring: io_uring_enter(2) don't poll while SETUP_IOPOLL|SETUP_SQPOLL enabled
  io_uring: Fix unused function warnings
  io_uring: add end-of-bits marker and build time verify it
  io_uring: provide means of removing buffers
  io_uring: add IOSQE_BUFFER_SELECT support for IORING_OP_RECVMSG
  ...
parents 15926148 3d9932a8
This diff is collapsed.
...@@ -5,10 +5,8 @@ struct io_wq; ...@@ -5,10 +5,8 @@ struct io_wq;
enum { enum {
IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256, IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512, IO_WQ_WORK_CONCURRENT = 512,
...@@ -30,6 +28,18 @@ struct io_wq_work_list { ...@@ -30,6 +28,18 @@ struct io_wq_work_list {
struct io_wq_work_node *last; struct io_wq_work_node *last;
}; };
/*
 * Insert @node into @list immediately after @pos.
 * If @pos was the tail, @node becomes the new tail.
 */
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
{
node->next = pos->next;
pos->next = node;
if (!node->next)
list->last = node;
}
static inline void wq_list_add_tail(struct io_wq_work_node *node, static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list) struct io_wq_work_list *list)
{ {
...@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, ...@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
} }
} }
static inline void wq_node_del(struct io_wq_work_list *list, static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *node, struct io_wq_work_node *last,
struct io_wq_work_node *prev) struct io_wq_work_node *prev)
{ {
if (node == list->first) /* first in the list, if prev==NULL */
WRITE_ONCE(list->first, node->next); if (!prev)
if (node == list->last) WRITE_ONCE(list->first, last->next);
else
prev->next = last->next;
if (last == list->last)
list->last = prev; list->last = prev;
if (prev) last->next = NULL;
prev->next = node->next; }
node->next = NULL;
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
{
wq_list_cut(list, node, prev);
} }
#define wq_list_for_each(pos, prv, head) \ #define wq_list_for_each(pos, prv, head) \
...@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, ...@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
} while (0) } while (0)
struct io_wq_work { struct io_wq_work {
union { struct io_wq_work_node list;
struct io_wq_work_node list;
void *data;
};
void (*func)(struct io_wq_work **); void (*func)(struct io_wq_work **);
struct files_struct *files; struct files_struct *files;
struct mm_struct *mm; struct mm_struct *mm;
...@@ -83,14 +99,20 @@ struct io_wq_work { ...@@ -83,14 +99,20 @@ struct io_wq_work {
*(work) = (struct io_wq_work){ .func = _func }; \ *(work) = (struct io_wq_work){ .func = _func }; \
} while (0) \ } while (0) \
typedef void (get_work_fn)(struct io_wq_work *); static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
typedef void (put_work_fn)(struct io_wq_work *); {
if (!work->list.next)
return NULL;
return container_of(work->list.next, struct io_wq_work, list);
}
typedef void (free_work_fn)(struct io_wq_work *);
struct io_wq_data { struct io_wq_data {
struct user_struct *user; struct user_struct *user;
get_work_fn *get_work; free_work_fn *free_work;
put_work_fn *put_work;
}; };
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
...@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); ...@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq); void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); void io_wq_hash_work(struct io_wq_work *work, void *val);
/* Report whether @work carries the IO_WQ_WORK_HASHED flag. */
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return (work->flags & IO_WQ_WORK_HASHED) != 0;
}
void io_wq_cancel_all(struct io_wq *wq); void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
......
This diff is collapsed.
...@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, ...@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/* /*
* Determine where to splice to/from. * Determine where to splice to/from.
*/ */
static long do_splice(struct file *in, loff_t __user *off_in, long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out, struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags) size_t len, unsigned int flags)
{ {
struct pipe_inode_info *ipipe; struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe; struct pipe_inode_info *opipe;
......
...@@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, ...@@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
struct user_msghdr __user *umsg, unsigned flags, struct user_msghdr __user *umsg, unsigned flags,
struct sockaddr __user **uaddr, struct sockaddr __user **uaddr,
struct iovec **iov); struct iovec **iov);
extern int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs);
/* helpers which do the actual work for syscalls */ /* helpers which do the actual work for syscalls */
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
......
...@@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, ...@@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
struct pipe_buffer *); struct pipe_buffer *);
extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
splice_direct_actor *); splice_direct_actor *);
extern long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags);
/* /*
* for dynamic pipe sizing * for dynamic pipe sizing
......
...@@ -38,6 +38,9 @@ struct compat_cmsghdr { ...@@ -38,6 +38,9 @@ struct compat_cmsghdr {
#define compat_mmsghdr mmsghdr #define compat_mmsghdr mmsghdr
#endif /* defined(CONFIG_COMPAT) */ #endif /* defined(CONFIG_COMPAT) */
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr, compat_uptr_t *ptr,
compat_size_t *len);
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
struct sockaddr __user **, struct iovec **); struct sockaddr __user **, struct iovec **);
struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval); struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval);
......
...@@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe, ...@@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->force_nonblock, __entry->sq_thread) __entry->force_nonblock, __entry->sq_thread)
); );
/*
 * io_uring_poll_arm - tracepoint recording a ring context pointer, request
 * opcode and user_data, plus a poll @mask and the @events of interest.
 * Emits "ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x".
 */
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
TP_ARGS(ctx, opcode, user_data, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
__field( int, events )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
__entry->events = events;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask, __entry->events)
);
/*
 * io_uring_poll_wake - tracepoint recording a ring context pointer, request
 * opcode and user_data, plus the poll @mask observed on wakeup.
 * Emits "ring %p, op %d, data 0x%llx, mask 0x%x".
 */
TRACE_EVENT(io_uring_poll_wake,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
/*
 * io_uring_task_add - tracepoint recording a ring context pointer, request
 * opcode and user_data, plus a poll @mask.
 * NOTE(review): the format string uses "mask %x" while the sibling poll
 * events use "mask 0x%x" — confirm whether the missing "0x" is intentional.
 */
TRACE_EVENT(io_uring_task_add,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask %x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
/*
 * io_uring_task_run - tracepoint recording a ring context pointer, request
 * opcode and user_data (no mask, unlike the other events in this group).
 * Emits "ring %p, op %d, data 0x%llx".
 */
TRACE_EVENT(io_uring_task_run,
TP_PROTO(void *ctx, u8 opcode, u64 user_data),
TP_ARGS(ctx, opcode, user_data),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
),
TP_printk("ring %p, op %d, data 0x%llx",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data)
);
#endif /* _TRACE_IO_URING_H */ #endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/* /*
* Header file for the io_uring interface. * Header file for the io_uring interface.
* *
...@@ -23,7 +23,10 @@ struct io_uring_sqe { ...@@ -23,7 +23,10 @@ struct io_uring_sqe {
__u64 off; /* offset into file */ __u64 off; /* offset into file */
__u64 addr2; __u64 addr2;
}; };
__u64 addr; /* pointer to buffer or iovecs */ union {
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
};
__u32 len; /* buffer size or number of iovecs */ __u32 len; /* buffer size or number of iovecs */
union { union {
__kernel_rwf_t rw_flags; __kernel_rwf_t rw_flags;
...@@ -37,14 +40,21 @@ struct io_uring_sqe { ...@@ -37,14 +40,21 @@ struct io_uring_sqe {
__u32 open_flags; __u32 open_flags;
__u32 statx_flags; __u32 statx_flags;
__u32 fadvise_advice; __u32 fadvise_advice;
__u32 splice_flags;
}; };
__u64 user_data; /* data to be passed back at completion time */ __u64 user_data; /* data to be passed back at completion time */
union { union {
struct { struct {
/* index into fixed buffers, if used */ /* pack this to avoid bogus arm OABI complaints */
__u16 buf_index; union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */ /* personality to use, if used */
__u16 personality; __u16 personality;
__s32 splice_fd_in;
}; };
__u64 __pad2[3]; __u64 __pad2[3];
}; };
...@@ -56,6 +66,7 @@ enum { ...@@ -56,6 +66,7 @@ enum {
IOSQE_IO_LINK_BIT, IOSQE_IO_LINK_BIT,
IOSQE_IO_HARDLINK_BIT, IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT, IOSQE_ASYNC_BIT,
IOSQE_BUFFER_SELECT_BIT,
}; };
/* /*
...@@ -71,6 +82,8 @@ enum { ...@@ -71,6 +82,8 @@ enum {
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */ /* always go async */
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
/* /*
* io_uring_setup() flags * io_uring_setup() flags
...@@ -113,6 +126,9 @@ enum { ...@@ -113,6 +126,9 @@ enum {
IORING_OP_RECV, IORING_OP_RECV,
IORING_OP_OPENAT2, IORING_OP_OPENAT2,
IORING_OP_EPOLL_CTL, IORING_OP_EPOLL_CTL,
IORING_OP_SPLICE,
IORING_OP_PROVIDE_BUFFERS,
IORING_OP_REMOVE_BUFFERS,
/* this goes last, obviously */ /* this goes last, obviously */
IORING_OP_LAST, IORING_OP_LAST,
...@@ -128,6 +144,12 @@ enum { ...@@ -128,6 +144,12 @@ enum {
*/ */
#define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_ABS (1U << 0)
/*
* sqe->splice_flags
* extends splice(2) flags
*/
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
/* /*
* IO completion data structure (Completion Queue Entry) * IO completion data structure (Completion Queue Entry)
*/ */
...@@ -137,6 +159,17 @@ struct io_uring_cqe { ...@@ -137,6 +159,17 @@ struct io_uring_cqe {
__u32 flags; __u32 flags;
}; };
/*
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
*/
#define IORING_CQE_F_BUFFER (1U << 0)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
};
/* /*
* Magic offsets for the application to mmap the data it needs * Magic offsets for the application to mmap the data it needs
*/ */
...@@ -204,6 +237,7 @@ struct io_uring_params { ...@@ -204,6 +237,7 @@ struct io_uring_params {
#define IORING_FEAT_SUBMIT_STABLE (1U << 2) #define IORING_FEAT_SUBMIT_STABLE (1U << 2)
#define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_RW_CUR_POS (1U << 3)
#define IORING_FEAT_CUR_PERSONALITY (1U << 4) #define IORING_FEAT_CUR_PERSONALITY (1U << 4)
#define IORING_FEAT_FAST_POLL (1U << 5)
/* /*
* io_uring_register(2) opcodes and arguments * io_uring_register(2) opcodes and arguments
......
...@@ -97,16 +97,26 @@ void task_work_run(void) ...@@ -97,16 +97,26 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set * work->func() can do task_work_add(), do not set
* work_exited unless the list is empty. * work_exited unless the list is empty.
*/ */
raw_spin_lock_irq(&task->pi_lock);
do { do {
head = NULL;
work = READ_ONCE(task->task_works); work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ? if (!work) {
&work_exited : NULL; if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
} while (cmpxchg(&task->task_works, work, head) != work); } while (cmpxchg(&task->task_works, work, head) != work);
raw_spin_unlock_irq(&task->pi_lock);
if (!work) if (!work)
break; break;
/*
* Synchronize with task_work_cancel(). It can not remove
* the first entry == work, cmpxchg(task_works) must fail.
* But it can remove another entry from the ->next list.
*/
raw_spin_lock_irq(&task->pi_lock);
raw_spin_unlock_irq(&task->pi_lock);
do { do {
next = work->next; next = work->next;
......
...@@ -33,10 +33,10 @@ ...@@ -33,10 +33,10 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <net/compat.h> #include <net/compat.h>
int get_compat_msghdr(struct msghdr *kmsg, int __get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg, struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr, struct sockaddr __user **save_addr,
struct iovec **iov) compat_uptr_t *ptr, compat_size_t *len)
{ {
struct compat_msghdr msg; struct compat_msghdr msg;
ssize_t err; ssize_t err;
...@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg, ...@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg,
return -EMSGSIZE; return -EMSGSIZE;
kmsg->msg_iocb = NULL; kmsg->msg_iocb = NULL;
*ptr = msg.msg_iov;
*len = msg.msg_iovlen;
return 0;
}
int get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
compat_uptr_t ptr;
compat_size_t len;
ssize_t err;
err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
if (err)
return err;
err = compat_import_iovec(save_addr ? READ : WRITE, err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr),
compat_ptr(msg.msg_iov), msg.msg_iovlen, len, UIO_FASTIOV, iov, &kmsg->msg_iter);
UIO_FASTIOV, iov, &kmsg->msg_iter);
return err < 0 ? err : 0; return err < 0 ? err : 0;
} }
......
...@@ -2228,10 +2228,10 @@ struct used_address { ...@@ -2228,10 +2228,10 @@ struct used_address {
unsigned int name_len; unsigned int name_len;
}; };
static int copy_msghdr_from_user(struct msghdr *kmsg, int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg, struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr, struct sockaddr __user **save_addr,
struct iovec **iov) struct iovec __user **uiov, size_t *nsegs)
{ {
struct user_msghdr msg; struct user_msghdr msg;
ssize_t err; ssize_t err;
...@@ -2273,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, ...@@ -2273,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
return -EMSGSIZE; return -EMSGSIZE;
kmsg->msg_iocb = NULL; kmsg->msg_iocb = NULL;
*uiov = msg.msg_iov;
*nsegs = msg.msg_iovlen;
return 0;
}
static int copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
struct user_msghdr msg;
ssize_t err;
err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
&msg.msg_iovlen);
if (err)
return err;
err = import_iovec(save_addr ? READ : WRITE, err = import_iovec(save_addr ? READ : WRITE,
msg.msg_iov, msg.msg_iovlen, msg.msg_iov, msg.msg_iovlen,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment