Commit c288d9cd authored by Linus Torvalds

Merge tag 'for-5.14/io_uring-2021-06-30' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - Multi-queue iopoll improvement (Fam)

 - Allow configurable io-wq CPU masks (me)

 - renameat/linkat tightening (me)

 - poll re-arm improvement (Olivier)

 - SQPOLL race fix (Olivier)

 - Cancelation unification (Pavel)

 - SQPOLL cleanups (Pavel)

 - Enable file backed buffers for shmem/memfd (Pavel)

 - A ton of cleanups and performance improvements (Pavel)

 - Followup and misc fixes (Colin, Fam, Hao, Olivier)

* tag 'for-5.14/io_uring-2021-06-30' of git://git.kernel.dk/linux-block: (83 commits)
  io_uring: code clean for kiocb_done()
  io_uring: spin in iopoll() only when reqs are in a single queue
  io_uring: pre-initialise some of req fields
  io_uring: refactor io_submit_flush_completions
  io_uring: optimise hot path restricted checks
  io_uring: remove not needed PF_EXITING check
  io_uring: mainstream sqpoll task_work running
  io_uring: refactor io_arm_poll_handler()
  io_uring: reduce latency by reissueing the operation
  io_uring: add IOPOLL and reserved field checks to IORING_OP_UNLINKAT
  io_uring: add IOPOLL and reserved field checks to IORING_OP_RENAMEAT
  io_uring: refactor io_openat2()
  io_uring: simplify struct io_uring_sqe layout
  io_uring: update sqe layout build checks
  io_uring: fix code style problems
  io_uring: refactor io_sq_thread()
  io_uring: don't change sqpoll creds if not needed
  io_uring: Create define to modify a SQPOLL parameter
  io_uring: Fix race condition when sqp thread goes to sleep
  io_uring: improve in tctx_task_work() resubmission
  ...
parents 911a2997 e149bd74
@@ -9,8 +9,6 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/sched/signal.h>
-#include <linux/mm.h>
-#include <linux/sched/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/rculist_nulls.h>
@@ -96,13 +94,14 @@ struct io_wqe {
 	struct io_wq *wq;
 	struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
+
+	cpumask_var_t cpu_mask;
 };
 
 /*
  * Per io_wq state
  */
 struct io_wq {
-	struct io_wqe **wqes;
 	unsigned long state;
 
 	free_work_fn *free_work;
@@ -110,14 +109,14 @@ struct io_wq {
 	struct io_wq_hash *hash;
 
-	refcount_t refs;
 	atomic_t worker_refs;
 	struct completion worker_done;
 
 	struct hlist_node cpuhp_node;
 
 	struct task_struct *task;
+
+	struct io_wqe *wqes[];
 };
 
 static enum cpuhp_state io_wq_online;
@@ -241,7 +240,8 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
 	 * Most likely an attempt to queue unbounded work on an io_wq that
 	 * wasn't setup with any unbounded workers.
 	 */
-	WARN_ON_ONCE(!acct->max_workers);
+	if (unlikely(!acct->max_workers))
+		pr_warn_once("io-wq is not configured for unbound workers");
 
 	rcu_read_lock();
 	ret = io_wqe_activate_free_worker(wqe);
@@ -560,17 +560,13 @@ static int io_wqe_worker(void *data)
 		if (ret)
 			continue;
 		/* timed out, exit unless we're the fixed worker */
-		if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
-		    !(worker->flags & IO_WORKER_F_FIXED))
+		if (!(worker->flags & IO_WORKER_F_FIXED))
 			break;
 	}
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		raw_spin_lock_irq(&wqe->lock);
-		if (!wq_list_empty(&wqe->work_list))
-			io_worker_handle_work(worker);
-		else
-			raw_spin_unlock_irq(&wqe->lock);
+		io_worker_handle_work(worker);
 	}
 
 	io_worker_exit(worker);
@@ -645,7 +641,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	tsk->pf_io_worker = worker;
 	worker->task = tsk;
-	set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node));
+	set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
 	tsk->flags |= PF_NO_SETAFFINITY;
 
 	raw_spin_lock_irq(&wqe->lock);
@@ -901,23 +897,20 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 {
-	int ret = -ENOMEM, node;
+	int ret, node;
 	struct io_wq *wq;
 
 	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
 		return ERR_PTR(-EINVAL);
+	if (WARN_ON_ONCE(!bounded))
+		return ERR_PTR(-EINVAL);
 
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
 	if (!wq)
 		return ERR_PTR(-ENOMEM);
-
-	wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
-	if (!wq->wqes)
-		goto err_wq;
-
 	ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	if (ret)
-		goto err_wqes;
+		goto err_wq;
 
 	refcount_inc(&data->hash->refs);
 	wq->hash = data->hash;
@@ -934,6 +927,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
 		if (!wqe)
 			goto err;
+		if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL))
+			goto err;
+		cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
 		wq->wqes[node] = wqe;
 		wqe->node = alloc_node;
 		wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND;
@@ -953,17 +949,18 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	}
 
 	wq->task = get_task_struct(data->task);
-	refcount_set(&wq->refs, 1);
 	atomic_set(&wq->worker_refs, 1);
 	init_completion(&wq->worker_done);
 	return wq;
 err:
 	io_wq_put_hash(data->hash);
 	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
-	for_each_node(node)
+	for_each_node(node) {
+		if (!wq->wqes[node])
+			continue;
+		free_cpumask_var(wq->wqes[node]->cpu_mask);
 		kfree(wq->wqes[node]);
-err_wqes:
-	kfree(wq->wqes);
+	}
err_wq:
 	kfree(wq);
 	return ERR_PTR(ret);
@@ -1033,10 +1030,10 @@ static void io_wq_destroy(struct io_wq *wq)
 			.cancel_all = true,
 		};
 
 		io_wqe_cancel_pending_work(wqe, &match);
+		free_cpumask_var(wqe->cpu_mask);
 		kfree(wqe);
 	}
 	io_wq_put_hash(wq->hash);
-	kfree(wq->wqes);
 	kfree(wq);
 }
@@ -1045,25 +1042,67 @@ void io_wq_put_and_exit(struct io_wq *wq)
 	WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
 
 	io_wq_exit_workers(wq);
-	if (refcount_dec_and_test(&wq->refs))
-		io_wq_destroy(wq);
+	io_wq_destroy(wq);
 }
 
+struct online_data {
+	unsigned int cpu;
+	bool online;
+};
+
 static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
 {
-	set_cpus_allowed_ptr(worker->task, cpumask_of_node(worker->wqe->node));
+	struct online_data *od = data;
+
+	if (od->online)
+		cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask);
+	else
+		cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask);
 	return false;
 }
 
+static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
+{
+	struct online_data od = {
+		.cpu = cpu,
+		.online = online
+	};
+	int i;
+
+	rcu_read_lock();
+	for_each_node(i)
+		io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od);
+	rcu_read_unlock();
+	return 0;
+}
+
 static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
 {
 	struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
+
+	return __io_wq_cpu_online(wq, cpu, true);
+}
+
+static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
+
+	return __io_wq_cpu_online(wq, cpu, false);
+}
+
+int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
+{
 	int i;
 
 	rcu_read_lock();
-	for_each_node(i)
-		io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL);
+	for_each_node(i) {
+		struct io_wqe *wqe = wq->wqes[i];
+
+		if (mask)
+			cpumask_copy(wqe->cpu_mask, mask);
+		else
+			cpumask_copy(wqe->cpu_mask, cpumask_of_node(i));
+	}
 	rcu_read_unlock();
 	return 0;
 }
@@ -1073,7 +1112,7 @@ static __init int io_wq_init(void)
 	int ret;
 
 	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
-					io_wq_cpu_online, NULL);
+					io_wq_cpu_online, io_wq_cpu_offline);
 	if (ret < 0)
 		return ret;
 	io_wq_online = ret;
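Two structural changes stand out in the hunks above: each io_wqe now carries its own cpu_mask (seeded from its NUMA node and adjusted by the new CPU hotplug callbacks), and struct io_wq embeds the per-node wqes[] pointer array as a flexible array member allocated together with the struct via struct_size(), instead of a separate kcalloc(). The following is a minimal, self-contained userspace illustration of that single-allocation pattern; plain sizeof arithmetic stands in for the kernel-only struct_size() helper, and the struct names are simplified stand-ins rather than the kernel types:

	#include <stdio.h>
	#include <stdlib.h>

	struct wqe { int node; };

	struct wq {
		unsigned long state;
		struct wqe *wqes[];	/* flexible array member, one slot per node */
	};

	int main(void)
	{
		size_t nr_nodes = 4;
		/* one allocation covers the header and the pointer array,
		 * mirroring kzalloc(struct_size(wq, wqes, nr_node_ids), ...) */
		struct wq *wq = calloc(1, sizeof(*wq) + nr_nodes * sizeof(wq->wqes[0]));

		if (!wq)
			return 1;
		for (size_t i = 0; i < nr_nodes; i++) {
			/* per-node allocations; error handling elided for brevity */
			wq->wqes[i] = calloc(1, sizeof(*wq->wqes[i]));
			wq->wqes[i]->node = (int)i;
		}
		printf("node of wqe[2]: %d\n", wq->wqes[2]->node);
		for (size_t i = 0; i < nr_nodes; i++)
			free(wq->wqes[i]);
		free(wq);
		return 0;
	}

Besides saving an allocation, the embedded array removes the separate err_wqes unwind label, which is why the error path in io_wq_create() now only needs err and err_wq.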
@@ -87,7 +87,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,
 struct io_wq_work {
 	struct io_wq_work_node list;
-	const struct cred *creds;
 	unsigned flags;
 };
 
@@ -128,6 +127,8 @@ void io_wq_put_and_exit(struct io_wq *wq);
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
 void io_wq_hash_work(struct io_wq_work *work, void *val);
 
+int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
+
 static inline bool io_wq_is_hashed(struct io_wq_work *work)
 {
 	return work->flags & IO_WQ_WORK_HASHED;
This diff is collapsed (one file's changes are not shown here).
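The collapsed diff presumably carries the main io_uring changes that wire the new register opcodes up to the exported io_wq_cpu_affinity() helper declared above. For orientation only, a hypothetical handler might copy a user-supplied mask into a cpumask and pass it down roughly as follows; the function name, locals, and error handling are assumptions, not the merged code:

	/* hypothetical sketch -- the real handler is in the collapsed diff */
	static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
					unsigned int len)
	{
		struct io_uring_task *tctx = current->io_uring;
		cpumask_var_t new_mask;
		int ret;

		if (!tctx || !tctx->io_wq)
			return -EINVAL;
		if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
			return -ENOMEM;

		cpumask_clear(new_mask);
		if (len > cpumask_size())
			len = cpumask_size();
		if (copy_from_user(new_mask, arg, len)) {
			free_cpumask_var(new_mask);
			return -EFAULT;
		}

		ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
		free_cpumask_var(new_mask);
		return ret;
	}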
@@ -318,13 +318,14 @@ TRACE_EVENT(io_uring_complete,
 		__entry->res, __entry->cflags)
 );
 
 /**
  * io_uring_submit_sqe - called before submitting one SQE
  *
  * @ctx:		pointer to a ring context structure
+ * @req:		pointer to a submitted request
  * @opcode:		opcode of request
  * @user_data:		user data associated with the request
+ * @flags		request flags
  * @force_nonblock:	whether a context blocking or not
  * @sq_thread:		true if sq_thread has submitted this SQE
  *
@@ -333,41 +334,60 @@ TRACE_EVENT(io_uring_complete,
  */
 TRACE_EVENT(io_uring_submit_sqe,
 
-	TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
-		 bool sq_thread),
+	TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags,
+		 bool force_nonblock, bool sq_thread),
 
-	TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
+	TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread),
 
 	TP_STRUCT__entry (
 		__field( void *, ctx )
+		__field( void *, req )
 		__field( u8, opcode )
 		__field( u64, user_data )
+		__field( u32, flags )
 		__field( bool, force_nonblock )
 		__field( bool, sq_thread )
 	),
 
 	TP_fast_assign(
 		__entry->ctx = ctx;
+		__entry->req = req;
 		__entry->opcode = opcode;
 		__entry->user_data = user_data;
+		__entry->flags = flags;
 		__entry->force_nonblock = force_nonblock;
 		__entry->sq_thread = sq_thread;
 	),
 
-	TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
-		  __entry->ctx, __entry->opcode,
-		  (unsigned long long) __entry->user_data,
-		  __entry->force_nonblock, __entry->sq_thread)
+	TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, "
+		  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
+		  __entry->opcode, (unsigned long long)__entry->user_data,
+		  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
 );
 
+/*
+ * io_uring_poll_arm - called after arming a poll wait if successful
+ *
+ * @ctx:		pointer to a ring context structure
+ * @req:		pointer to the armed request
+ * @opcode:		opcode of request
+ * @user_data:		user data associated with the request
+ * @mask:		request poll events mask
+ * @events:		registered events of interest
+ *
+ * Allows to track which fds are waiting for and what are the events of
+ * interest.
+ */
 TRACE_EVENT(io_uring_poll_arm,
 
-	TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
+	TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data,
+		 int mask, int events),
 
-	TP_ARGS(ctx, opcode, user_data, mask, events),
+	TP_ARGS(ctx, req, opcode, user_data, mask, events),
 
 	TP_STRUCT__entry (
 		__field( void *, ctx )
+		__field( void *, req )
 		__field( u8, opcode )
 		__field( u64, user_data )
 		__field( int, mask )
@@ -376,14 +396,15 @@ TRACE_EVENT(io_uring_poll_arm,
 
 	TP_fast_assign(
 		__entry->ctx = ctx;
+		__entry->req = req;
 		__entry->opcode = opcode;
 		__entry->user_data = user_data;
 		__entry->mask = mask;
 		__entry->events = events;
 	),
 
-	TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
-		  __entry->ctx, __entry->opcode,
+	TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
+		  __entry->ctx, __entry->req, __entry->opcode,
 		  (unsigned long long) __entry->user_data,
 		  __entry->mask, __entry->events)
 );
@@ -440,26 +461,39 @@ TRACE_EVENT(io_uring_task_add,
 		__entry->mask)
 );
 
+/*
+ * io_uring_task_run - called when task_work_run() executes the poll events
+ *                     notification callbacks
+ *
+ * @ctx:		pointer to a ring context structure
+ * @req:		pointer to the armed request
+ * @opcode:		opcode of request
+ * @user_data:		user data associated with the request
+ *
+ * Allows to track when notified poll events are processed
+ */
 TRACE_EVENT(io_uring_task_run,
 
-	TP_PROTO(void *ctx, u8 opcode, u64 user_data),
+	TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data),
 
-	TP_ARGS(ctx, opcode, user_data),
+	TP_ARGS(ctx, req, opcode, user_data),
 
 	TP_STRUCT__entry (
 		__field( void *, ctx )
+		__field( void *, req )
 		__field( u8, opcode )
 		__field( u64, user_data )
 	),
 
 	TP_fast_assign(
		__entry->ctx = ctx;
+		__entry->req = req;
 		__entry->opcode = opcode;
 		__entry->user_data = user_data;
 	),
 
-	TP_printk("ring %p, op %d, data 0x%llx",
-		  __entry->ctx, __entry->opcode,
+	TP_printk("ring %p, req %p, op %d, data 0x%llx",
+		  __entry->ctx, __entry->req, __entry->opcode,
 		  (unsigned long long) __entry->user_data)
 );
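All three tracepoints gain the request pointer, and io_uring_submit_sqe additionally gains the request flags, so a single request can be correlated across submit, poll-arm, and task-work events. The call sites live in the collapsed io_uring diff; hypothetically they would now look roughly like the following, with variable names assumed rather than taken from the merged code:

	/* hypothetical call sites -- argument order follows the TP_PROTOs above */
	trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
				  req->flags, force_nonblock, sq_thread);
	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
				mask, poll->events);
	trace_io_uring_task_run(ctx, req, req->opcode, req->user_data);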
@@ -46,8 +46,6 @@ struct io_uring_sqe {
 		__u32	unlink_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
-	union {
-	struct {
 	/* pack this to avoid bogus arm OABI complaints */
 	union {
 		/* index into fixed buffers, if used */
@@ -58,9 +56,7 @@ struct io_uring_sqe {
 	/* personality to use, if used */
 	__u16	personality;
 	__s32	splice_fd_in;
-	};
-	__u64	__pad2[3];
-	};
+	__u64	__pad2[2];
 };
 
 enum {
@@ -306,6 +302,10 @@ enum {
 	IORING_REGISTER_BUFFERS2 = 15,
 	IORING_REGISTER_BUFFERS_UPDATE = 16,
 
+	/* set/clear io-wq thread affinities */
+	IORING_REGISTER_IOWQ_AFF = 17,
+	IORING_UNREGISTER_IOWQ_AFF = 18,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
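IORING_REGISTER_IOWQ_AFF and IORING_UNREGISTER_IOWQ_AFF are the uapi side of the "configurable io-wq CPU masks" item in the pull message. A hedged userspace sketch of pinning a ring's async workers via the raw io_uring_register() syscall follows; the opcode values come from the diff above, while the assumptions that the fourth syscall argument is the mask length in bytes and that unregistering falls back to the per-node defaults seen in io_wq_cpu_affinity() should be checked against the final man pages or a liburing release that grows a helper for this:

	#define _GNU_SOURCE
	#include <sched.h>		/* cpu_set_t, CPU_ZERO, CPU_SET */
	#include <sys/syscall.h>	/* __NR_io_uring_register, if the headers know it */
	#include <unistd.h>

	#ifndef IORING_REGISTER_IOWQ_AFF
	#define IORING_REGISTER_IOWQ_AFF	17
	#define IORING_UNREGISTER_IOWQ_AFF	18
	#endif

	/* Restrict this ring's io-wq workers to CPUs 0 and 1. Passing
	 * IORING_UNREGISTER_IOWQ_AFF with no mask appears intended to restore
	 * the per-node default masks. */
	static int set_iowq_affinity(int ring_fd)
	{
		cpu_set_t mask;

		CPU_ZERO(&mask);
		CPU_SET(0, &mask);
		CPU_SET(1, &mask);
		return (int)syscall(__NR_io_uring_register, ring_fd,
				    IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
	}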