Commit 74dea5d9 authored by Linus Torvalds

Merge tag 'io_uring-5.6-2020-02-28' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

 - Fix for a race with IOPOLL used with SQPOLL (Xiaoguang)

 - Only show ->fdinfo if procfs is enabled (Tobias)

 - Fix for a chain with multiple personalities in the SQEs

 - Fix for a missing free of personality idr on exit

 - Removal of the spin-for-work optimization

 - Fix for next work lookup on request completion

 - Fix for non-vec read/write result propagation in case of links

 - Fix for fileset references on switch

 - Fix for recvmsg/sendmsg 32-bit compatibility mode

* tag 'io_uring-5.6-2020-02-28' of git://git.kernel.dk/linux-block:
  io_uring: fix 32-bit compatability with sendmsg/recvmsg
  io_uring: define and set show_fdinfo only if procfs is enabled
  io_uring: drop file set ref put/get on switch
  io_uring: import_single_range() returns 0/-ERROR
  io_uring: pick up link work on submit reference drop
  io-wq: ensure work->task_pid is cleared on init
  io-wq: remove spin-for-work optimization
  io_uring: fix poll_list race for SETUP_IOPOLL|SETUP_SQPOLL
  io_uring: fix personality idr leak
  io_uring: handle multiple personalities in link chains
parents c60c0402 d8768362
diff --git a/fs/io-wq.c b/fs/io-wq.c
@@ -535,42 +535,23 @@ static void io_worker_handle_work(struct io_worker *worker)
 	} while (1);
 }
 
-static inline void io_worker_spin_for_work(struct io_wqe *wqe)
-{
-	int i = 0;
-
-	while (++i < 1000) {
-		if (io_wqe_run_queue(wqe))
-			break;
-		if (need_resched())
-			break;
-		cpu_relax();
-	}
-}
-
 static int io_wqe_worker(void *data)
 {
 	struct io_worker *worker = data;
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
-	bool did_work;
 
 	io_worker_start(wqe, worker);
 
-	did_work = false;
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 loop:
-		if (did_work)
-			io_worker_spin_for_work(wqe);
 		spin_lock_irq(&wqe->lock);
 		if (io_wqe_run_queue(wqe)) {
 			__set_current_state(TASK_RUNNING);
 			io_worker_handle_work(worker);
-			did_work = true;
 			goto loop;
 		}
-		did_work = false;
 		/* drops the lock on success, retry */
 		if (__io_worker_idle(wqe, worker)) {
 			__release(&wqe->lock);
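The dropped optimization had each worker spin for up to 1000 cpu_relax() iterations after finishing work, hoping more would arrive before it had to sleep. Without it, the worker relies on the plain sleep/wake cycle alone; a minimal sketch of that pattern (the predicate and handler names are illustrative, not io-wq symbols):

	/* inside the worker's main loop */
	set_current_state(TASK_INTERRUPTIBLE);
	if (!have_work()) {		/* hypothetical "queue is empty" check */
		schedule();		/* sleep until a queuer wakes us */
		continue;
	}
	__set_current_state(TASK_RUNNING);
	handle_work();			/* hypothetical work handler */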
diff --git a/fs/io-wq.h b/fs/io-wq.h
@@ -79,16 +79,10 @@ struct io_wq_work {
 	pid_t task_pid;
 };
 
-#define INIT_IO_WORK(work, _func)			\
-	do {						\
-		(work)->list.next = NULL;		\
-		(work)->func = _func;			\
-		(work)->files = NULL;			\
-		(work)->mm = NULL;			\
-		(work)->creds = NULL;			\
-		(work)->fs = NULL;			\
-		(work)->flags = 0;			\
-	} while (0)					\
+#define INIT_IO_WORK(work, _func)				\
+	do {							\
+		*(work) = (struct io_wq_work){ .func = _func };	\
+	} while (0)						\
 
 typedef void (get_work_fn)(struct io_wq_work *);
 typedef void (put_work_fn)(struct io_wq_work *);
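The old macro initialized io_wq_work field by field and silently missed the newly added task_pid member. Assigning from a compound literal zero-initializes every member that is not named, so future additions are covered automatically. A standalone illustration of the idiom (the struct is a hypothetical stand-in, not the real io_wq_work):

	#include <stdio.h>
	#include <string.h>

	struct work_demo {			/* stand-in for io_wq_work */
		void (*func)(struct work_demo *);
		int task_pid;
		unsigned flags;
	};

	static void work_fn(struct work_demo *w) { (void)w; }

	int main(void)
	{
		struct work_demo w;

		memset(&w, 0xff, sizeof(w));	/* garbage, as from a recycled allocation */
		w = (struct work_demo){ .func = work_fn };
		printf("pid=%d flags=%u\n", w.task_pid, w.flags);	/* prints pid=0 flags=0 */
		return 0;
	}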
diff --git a/fs/io_uring.c b/fs/io_uring.c
@@ -183,17 +183,12 @@ struct fixed_file_table {
 	struct file		**files;
 };
 
-enum {
-	FFD_F_ATOMIC,
-};
-
 struct fixed_file_data {
 	struct fixed_file_table	*table;
 	struct io_ring_ctx	*ctx;
 
 	struct percpu_ref	refs;
 	struct llist_head	put_llist;
-	unsigned long		state;
 	struct work_struct	ref_work;
 	struct completion	done;
 };
@@ -1483,10 +1478,10 @@ static void io_free_req(struct io_kiocb *req)
 __attribute__((nonnull))
 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
-	io_req_find_next(req, nxtptr);
-
-	if (refcount_dec_and_test(&req->refs))
+	if (refcount_dec_and_test(&req->refs)) {
+		io_req_find_next(req, nxtptr);
 		__io_free_req(req);
+	}
 }
 
 static void io_put_req(struct io_kiocb *req)
@@ -1821,6 +1816,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 		list_add(&req->list, &ctx->poll_list);
 	else
 		list_add_tail(&req->list, &ctx->poll_list);
+
+	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+	    wq_has_sleeper(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
 }
 
 static void io_file_put(struct io_submit_state *state)
@@ -2071,7 +2070,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 		ssize_t ret;
 		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
 		*iovec = NULL;
-		return ret;
+		return ret < 0 ? ret : sqe_len;
 	}
 
 	if (req->io) {
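io_import_iovec() is expected to return the number of bytes to transfer, but import_single_range() follows the usual 0/-ERRNO convention, so the bare "return ret" made every non-vectored read/write report a length of zero and broke result propagation across linked requests. The corrected calling convention, in sketch form:

	ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
	if (ret < 0)
		return ret;	/* -ERRNO: propagate the failure */
	return sqe_len;		/* success: report the byte count ourselves */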
@@ -3002,6 +3001,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
 	if (!io || req->opcode == IORING_OP_SEND)
 		return 0;
 	/* iovec is already imported */
@@ -3154,6 +3158,11 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
 	if (!io || req->opcode == IORING_OP_RECV)
 		return 0;
 	/* iovec is already imported */
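MSG_CMSG_COMPAT tells the msghdr import code to parse the 32-bit structure layouts. Regular socket paths get this from their compat entry points, roughly like the following sketch of that pattern (not io_uring code):

	if (in_compat_syscall())
		flags |= MSG_CMSG_COMPAT;	/* parse 32-bit msghdr/cmsghdr */

An io_uring request, however, may be prepped and issued from an io-wq worker thread, where in_compat_syscall() no longer reflects the submitting task, so these prep handlers consult the compat flag the ring recorded at setup time (ctx->compat) instead.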
@@ -4705,11 +4714,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt = NULL;
+	const struct cred *old_creds = NULL;
 	int ret;
 
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
+	if (req->work.creds && req->work.creds != current_cred()) {
+		if (old_creds)
+			revert_creds(old_creds);
+		if (old_creds == req->work.creds)
+			old_creds = NULL; /* restored original creds */
+		else
+			old_creds = override_creds(req->work.creds);
+	}
+
 	ret = io_issue_sqe(req, sqe, &nxt, true);
 
 	/*
@@ -4735,7 +4754,7 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 err:
 	/* drop submission reference */
-	io_put_req(req);
+	io_put_req_find_next(req, &nxt);
 
 	if (linked_timeout) {
 		if (!ret)
@@ -4759,6 +4778,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 			goto punt;
 		goto again;
 	}
+	if (old_creds)
+		revert_creds(old_creds);
 }
 
 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4803,7 +4824,6 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			  struct io_submit_state *state, struct io_kiocb **link)
 {
-	const struct cred *old_creds = NULL;
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned int sqe_flags;
 	int ret, id;
@@ -4818,14 +4838,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		const struct cred *personality_creds;
-
-		personality_creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!personality_creds)) {
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds)) {
 			ret = -EINVAL;
 			goto err_req;
 		}
-		old_creds = override_creds(personality_creds);
+		get_cred(req->work.creds);
 	}
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -4837,8 +4855,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 err_req:
 		io_cqring_add_event(req, ret);
 		io_double_put_req(req);
-		if (old_creds)
-			revert_creds(old_creds);
 		return false;
 	}
@@ -4899,8 +4915,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		}
 	}
 
-	if (old_creds)
-		revert_creds(old_creds);
 	return true;
 }
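Previously io_submit_sqe() overrode credentials once around submission, so every request in a link chain ran under whichever personality the head request named. Stashing the creds in req->work.creds (with a get_cred() reference, since the request may outlive submission) lets __io_queue_sqe() switch identities per request inside the chain. A minimal sketch of the override/revert pairing itself (only the cred helpers are real kernel API here):

	const struct cred *old_creds;

	old_creds = override_creds(new_creds);	/* returns the creds to restore */
	do_request_as(new_creds);		/* hypothetical per-request work */
	revert_creds(old_creds);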
@@ -5081,9 +5095,8 @@ static int io_sq_thread(void *data)
 	const struct cred *old_cred;
 	mm_segment_t old_fs;
 	DEFINE_WAIT(wait);
-	unsigned inflight;
 	unsigned long timeout;
-	int ret;
+	int ret = 0;
 
 	complete(&ctx->completions[1]);
@@ -5091,39 +5104,19 @@ static int io_sq_thread(void *data)
 	set_fs(USER_DS);
 	old_cred = override_creds(ctx->creds);
 
-	ret = timeout = inflight = 0;
+	timeout = jiffies + ctx->sq_thread_idle;
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
 
-		if (inflight) {
+		if (!list_empty(&ctx->poll_list)) {
 			unsigned nr_events = 0;
 
-			if (ctx->flags & IORING_SETUP_IOPOLL) {
-				/*
-				 * inflight is the count of the maximum possible
-				 * entries we submitted, but it can be smaller
-				 * if we dropped some of them. If we don't have
-				 * poll entries available, then we know that we
-				 * have nothing left to poll for. Reset the
-				 * inflight count to zero in that case.
-				 */
-				mutex_lock(&ctx->uring_lock);
-				if (!list_empty(&ctx->poll_list))
-					io_iopoll_getevents(ctx, &nr_events, 0);
-				else
-					inflight = 0;
-				mutex_unlock(&ctx->uring_lock);
-			} else {
-				/*
-				 * Normal IO, just pretend everything completed.
-				 * We don't have to poll completions for that.
-				 */
-				nr_events = inflight;
-			}
-			inflight -= nr_events;
-			if (!inflight)
+			mutex_lock(&ctx->uring_lock);
+			if (!list_empty(&ctx->poll_list))
+				io_iopoll_getevents(ctx, &nr_events, 0);
+			else
 				timeout = jiffies + ctx->sq_thread_idle;
+			mutex_unlock(&ctx->uring_lock);
 		}
 
 		to_submit = io_sqring_entries(ctx);
@@ -5152,7 +5145,7 @@ static int io_sq_thread(void *data)
 		 * more IO, we should wait for the application to
 		 * reap events and wake us up.
 		 */
-		if (inflight ||
+		if (!list_empty(&ctx->poll_list) ||
 		    (!time_after(jiffies, timeout) && ret != -EBUSY &&
 		    !percpu_ref_is_dying(&ctx->refs))) {
 			cond_resched();
@@ -5162,6 +5155,19 @@ static int io_sq_thread(void *data)
 			prepare_to_wait(&ctx->sqo_wait, &wait,
 						TASK_INTERRUPTIBLE);
 
+			/*
+			 * While doing polled IO, before going to sleep, we need
+			 * to check if there are new reqs added to poll_list, it
+			 * is because reqs may have been punted to io worker and
+			 * will be added to poll_list later, hence check the
+			 * poll_list again.
+			 */
+			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+			    !list_empty_careful(&ctx->poll_list)) {
+				finish_wait(&ctx->sqo_wait, &wait);
+				continue;
+			}
+
 			/* Tell userspace we may need a wakeup call */
 			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
 			/* make sure to read SQ tail after writing flags */
@@ -5189,8 +5195,7 @@ static int io_sq_thread(void *data)
 		mutex_lock(&ctx->uring_lock);
 		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
 		mutex_unlock(&ctx->uring_lock);
-		if (ret > 0)
-			inflight += ret;
+		timeout = jiffies + ctx->sq_thread_idle;
 	}
 
 	set_fs(old_fs);
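The race these io_sq_thread changes close: a request punted to io-wq could land on poll_list after the SQPOLL thread had checked it but before it slept, leaving no one to reap the completion. The fix is the standard lost-wakeup-free pairing, sketched here:

	/* waker side, io_iopoll_req_issued() */
	if (wq_has_sleeper(&ctx->sqo_wait))
		wake_up(&ctx->sqo_wait);

	/* sleeper side, io_sq_thread() */
	prepare_to_wait(&ctx->sqo_wait, &wait, TASK_INTERRUPTIBLE);
	if (list_empty_careful(&ctx->poll_list))
		schedule();		/* a late add sees the sleeper and wakes us */
	finish_wait(&ctx->sqo_wait, &wait);

prepare_to_wait() puts the task on the waitqueue and sets its state before the condition is rechecked, so whichever side runs second observes the other: either the waker sees a sleeper to wake, or the sleeper sees the new poll_list entry and continues instead of sleeping.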
@@ -5595,7 +5600,6 @@ static void io_ring_file_ref_switch(struct work_struct *work)
 	data = container_of(work, struct fixed_file_data, ref_work);
 	io_ring_file_ref_flush(data);
-	percpu_ref_get(&data->refs);
 	percpu_ref_switch_to_percpu(&data->refs);
 }
@@ -5771,8 +5775,13 @@ static void io_atomic_switch(struct percpu_ref *ref)
 {
 	struct fixed_file_data *data;
 
+	/*
+	 * Juggle reference to ensure we hit zero, if needed, so we can
+	 * switch back to percpu mode
+	 */
 	data = container_of(ref, struct fixed_file_data, refs);
-	clear_bit(FFD_F_ATOMIC, &data->state);
+	percpu_ref_put(&data->refs);
+	percpu_ref_get(&data->refs);
 }
 
 static bool io_queue_file_removal(struct fixed_file_data *data,
@@ -5795,11 +5804,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data,
 	llist_add(&pfile->llist, &data->put_llist);
 
 	if (pfile == &pfile_stack) {
-		if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-			percpu_ref_put(&data->refs);
-			percpu_ref_switch_to_atomic(&data->refs,
-							io_atomic_switch);
-		}
+		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
 		wait_for_completion(&done);
 		flush_work(&data->ref_work);
 		return false;
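With the FFD_F_ATOMIC guard bit gone, percpu_ref_switch_to_atomic() is simply called unconditionally; overlapping switch requests are serialized inside the percpu_ref code itself. For contrast, the conventional way to quiesce a percpu_ref for good is the kill-and-wait pattern (a generic sketch, not this code, which deliberately keeps the ref alive so io_atomic_switch() can juggle it back to percpu mode):

	percpu_ref_kill_and_confirm(&ref, ref_now_atomic);	/* hypothetical confirm callback */
	wait_for_completion(&released);		/* release callback fires once the count hits zero */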
@@ -5873,10 +5878,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		up->offset++;
 	}
 
-	if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-		percpu_ref_put(&data->refs);
+	if (ref_switch)
 		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
-	}
 
 	return done ? done : err;
 }
@@ -6334,6 +6337,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_sqe_buffer_unregister(ctx);
 	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
+	idr_destroy(&ctx->personality_idr);
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock) {
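idr_destroy() releases the radix-tree node memory backing the idr itself; it does not put the pointers stored in it. The general teardown idiom for an idr whose entries still hold references looks like this sketch (the callback is hypothetical, not io_uring code):

	static int free_personality(int id, void *p, void *data)
	{
		put_cred(p);		/* drop the reference the idr entry held */
		return 0;
	}

	idr_for_each(&ctx->personality_idr, free_personality, NULL);
	idr_destroy(&ctx->personality_idr);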
@@ -6647,6 +6651,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	return submitted ? submitted : ret;
 }
 
+#ifdef CONFIG_PROC_FS
 static int io_uring_show_cred(int id, void *p, void *data)
 {
 	const struct cred *cred = p;
@@ -6720,6 +6725,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 		percpu_ref_put(&ctx->refs);
 	}
 }
+#endif
 
 static const struct file_operations io_uring_fops = {
 	.release	= io_uring_release,
@@ -6731,7 +6737,9 @@ static const struct file_operations io_uring_fops = {
 #endif
 	.poll		= io_uring_poll,
 	.fasync		= io_uring_fasync,
+#ifdef CONFIG_PROC_FS
 	.show_fdinfo	= io_uring_show_fdinfo,
+#endif
 };
 
 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,