Commit e59cd880 authored by Linus Torvalds

Merge tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Here are the io_uring changes for this merge window. Light on new
  features this time around (just splice + buffer selection), lots of
  cleanups, fixes, and improvements to existing support. In particular,
  this contains:

   - Cleanup fixed file update handling for stack fallback (Hillf)

   - Re-work of how pollable async IO is handled; we no longer require
     thread offload to handle that. Instead we rely on poll to drive
     this, with task_work execution.

   - In conjunction with the above, allow expendable buffer selection,
     so that poll+recv (for example) no longer has to be a split
     operation (see the userspace sketch after this list).

   - Make sure we honor RLIMIT_FSIZE for buffered writes

   - Add support for splice (Pavel)

   - Linked work inheritance fixes and optimizations (Pavel)

   - Async work fixes and cleanups (Pavel)

   - Improve io-wq locking (Pavel)

   - Hashed link write improvements (Pavel)

   - SETUP_IOPOLL|SETUP_SQPOLL improvements (Xiaoguang)"
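For context, a minimal userspace sketch of the new provided-buffer flow.
The raw SQE/CQE field usage mirrors io_provide_buffers_prep() and
io_put_kbuf() in the diff below; the liburing-style ring plumbing
(io_uring_queue_init() and friends) is assumed and error handling is
omitted:

#include <liburing.h>
#include <string.h>

#define NBUFS	4
#define BUFLEN	4096
#define BGID	7	/* arbitrary buffer group ID */

/* Hand the kernel NBUFS buffers carved from one contiguous region. */
static void provide_buffers(struct io_uring *ring, void *base)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode	= IORING_OP_PROVIDE_BUFFERS;
	sqe->fd		= NBUFS;		/* number of buffers */
	sqe->addr	= (unsigned long) base;	/* start of the region */
	sqe->len	= BUFLEN;		/* size of each buffer */
	sqe->off	= 0;			/* first buffer ID (bid) */
	sqe->buf_group	= BGID;
	io_uring_submit(ring);
}

/* Let the kernel pick a buffer at recv time instead of splitting
 * poll+recv into two operations. */
static void recv_with_selection(struct io_uring *ring, int sockfd,
				void *base)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode	= IORING_OP_RECV;
	sqe->fd		= sockfd;
	sqe->len	= BUFLEN;
	sqe->flags	= IOSQE_BUFFER_SELECT;	/* kernel picks the buffer */
	sqe->buf_group	= BGID;
	io_uring_submit(ring);

	io_uring_wait_cqe(ring, &cqe);
	if (cqe->res >= 0 && (cqe->flags & IORING_CQE_F_BUFFER)) {
		int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
		char *data = (char *) base + bid * BUFLEN;

		(void) data;	/* cqe->res bytes are valid in 'data' */
	}
	io_uring_cqe_seen(ring, cqe);
}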

* tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block: (54 commits)
  io_uring: cleanup io_alloc_async_ctx()
  io_uring: fix missing 'return' in comment
  io-wq: handle hashed writes in chains
  io-uring: drop 'free_pfile' in struct io_file_put
  io-uring: drop completion when removing file
  io_uring: Fix ->data corruption on re-enqueue
  io-wq: close cancel gap for hashed linked work
  io_uring: make spdxcheck.py happy
  io_uring: honor original task RLIMIT_FSIZE
  io-wq: hash dependent work
  io-wq: split hashing and enqueueing
  io-wq: don't resched if there is no work
  io-wq: remove duplicated cancel code
  io_uring: fix truncated async read/readv and write/writev retry
  io_uring: dual license io_uring.h uapi header
  io_uring: io_uring_enter(2) don't poll while SETUP_IOPOLL|SETUP_SQPOLL enabled
  io_uring: Fix unused function warnings
  io_uring: add end-of-bits marker and build time verify it
  io_uring: provide means of removing buffers
  io_uring: add IOSQE_BUFFER_SELECT support for IORING_OP_RECVMSG
  ...
parents 15926148 3d9932a8
......@@ -69,6 +69,8 @@ struct io_worker {
#define IO_WQ_HASH_ORDER 5
#endif
#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
struct io_wqe_acct {
unsigned nr_workers;
unsigned max_workers;
......@@ -98,6 +100,7 @@ struct io_wqe {
struct list_head all_list;
struct io_wq *wq;
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};
/*
......@@ -107,8 +110,7 @@ struct io_wq {
struct io_wqe **wqes;
unsigned long state;
get_work_fn *get_work;
put_work_fn *put_work;
free_work_fn *free_work;
struct task_struct *manager;
struct user_struct *user;
......@@ -376,26 +378,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
return __io_worker_unuse(wqe, worker);
}
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
return work->flags >> IO_WQ_HASH_SHIFT;
}
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
struct io_wq_work *work, *tail;
unsigned int hash;
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
if (!(work->flags & IO_WQ_WORK_HASHED)) {
wq_node_del(&wqe->work_list, node, prev);
if (!io_wq_is_hashed(work)) {
wq_list_del(&wqe->work_list, node, prev);
return work;
}
/* hashed, can run if not already running */
*hash = work->flags >> IO_WQ_HASH_SHIFT;
if (!(wqe->hash_map & BIT_ULL(*hash))) {
wqe->hash_map |= BIT_ULL(*hash);
wq_node_del(&wqe->work_list, node, prev);
hash = io_get_work_hash(work);
if (!(wqe->hash_map & BIT(hash))) {
wqe->hash_map |= BIT(hash);
/* all items with this hash lie in [work, tail] */
tail = wqe->hash_tail[hash];
wqe->hash_tail[hash] = NULL;
wq_list_cut(&wqe->work_list, &tail->list, prev);
return work;
}
}
......@@ -440,16 +451,49 @@ static void io_wq_switch_creds(struct io_worker *worker,
worker->saved_creds = old_creds;
}
static void io_impersonate_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
if (work->fs && current->fs != work->fs)
current->fs = work->fs;
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
}
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work) {
/* flush pending signals before assigning new work */
if (signal_pending(current))
flush_signals(current);
cond_resched();
}
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
do {
unsigned hash = -1U;
struct io_wq_work *work;
unsigned int hash;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
* the list isn't empty, it means we stalled on hashed work.
......@@ -457,81 +501,60 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
work = io_get_next_work(wqe, &hash);
work = io_get_next_work(wqe);
if (work)
__io_worker_busy(wqe, worker, work);
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
spin_unlock_irq(&wqe->lock);
if (put_work && wq->put_work)
wq->put_work(old_work);
if (!work)
break;
next:
/* flush any pending signals before assigning new work */
if (signal_pending(current))
flush_signals(current);
cond_resched();
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
if (work->flags & IO_WQ_WORK_CB)
work->func(&work);
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
if (work->fs && current->fs != work->fs)
current->fs = work->fs;
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
* the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
work->flags |= IO_WQ_WORK_HAS_MM;
if (wq->get_work) {
put_work = work;
wq->get_work(work);
}
old_work = work;
work->func(&work);
spin_lock_irq(&worker->lock);
worker->cur_work = NULL;
spin_unlock_irq(&worker->lock);
spin_lock_irq(&wqe->lock);
if (hash != -1U) {
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
}
if (work && work != old_work) {
spin_unlock_irq(&wqe->lock);
if (put_work && wq->put_work) {
wq->put_work(put_work);
put_work = NULL;
io_assign_current_work(worker, work);
/* handle a whole dependent link */
do {
struct io_wq_work *old_work, *next_hashed, *linked;
next_hashed = wq_next_work(work);
io_impersonate_work(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable
* work, the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
hash = io_get_work_hash(work);
linked = old_work = work;
linked->func(&linked);
linked = (old_work == linked) ? NULL : linked;
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
linked = NULL;
}
io_assign_current_work(worker, work);
wq->free_work(old_work);
if (linked)
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* dependent work is not hashed */
hash = -1U;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
spin_unlock_irq(&wqe->lock);
}
} while (work);
/* dependent work not hashed */
hash = -1U;
goto next;
}
spin_lock_irq(&wqe->lock);
} while (1);
}
......@@ -747,17 +770,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
return true;
}
static void io_run_cancel(struct io_wq_work *work)
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
struct io_wq *wq = wqe->wq;
do {
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
work->func(&work);
work = (work == old_work) ? NULL : work;
wq->free_work(old_work);
} while (work);
}
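The loop above leans on the io-wq calling convention that work->func()
may replace *workptr with a dependent (linked) work item, while handing
the same pointer back means the chain is done. A hypothetical,
userspace-flavoured sketch of that contract (the types and helpers here
are invented for illustration; works are assumed heap-allocated):

#include <stdlib.h>

struct work {
	void (*func)(struct work **);
	struct work *link;	/* next dependent work, if any */
};

static void work_fn(struct work **workptr)
{
	struct work *w = *workptr;

	/* ... process w, then hand back the dependent link, if any ... */
	if (w->link)
		*workptr = w->link;
}

/* Mirrors the loop shape of io_run_cancel()/io_worker_handle_work(). */
static void run_chain(struct work *work)
{
	do {
		struct work *old_work = work;

		work->func(&work);	/* may swap in a linked work */
		work = (work == old_work) ? NULL : work;
		free(old_work);		/* wq->free_work() in the real code */
	} while (work);
}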
static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
{
unsigned int hash;
struct io_wq_work *tail;
if (!io_wq_is_hashed(work)) {
append:
wq_list_add_tail(&work->list, &wqe->work_list);
return;
}
hash = io_get_work_hash(work);
tail = wqe->hash_tail[hash];
wqe->hash_tail[hash] = work;
if (!tail)
goto append;
wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
......@@ -771,13 +817,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
* It's close enough to not be an issue; fork() has the same delay.
*/
if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
io_run_cancel(work);
io_run_cancel(work, wqe);
return;
}
work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_add_tail(&work->list, &wqe->work_list);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
......@@ -794,19 +840,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
}
/*
* Enqueue work, hashed by some key. Work items that hash to the same value
* will not be done in parallel. Used to limit concurrent writes, generally
* hashed by inode.
* Work items that hash to the same value will not be done in parallel.
* Used to limit concurrent writes, generally hashed by inode.
*/
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
void io_wq_hash_work(struct io_wq_work *work, void *val)
{
struct io_wqe *wqe = wq->wqes[numa_node_id()];
unsigned bit;
unsigned int bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER);
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
io_wqe_enqueue(wqe, work);
}
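With io_wq_enqueue_hashed() gone (see the header changes below), callers
tag the work first and then use the single enqueue path; io_wqe_enqueue()
routes hashed work through io_wqe_insert_work() above. A condensed sketch
of the caller side, as io_prep_async_work() does later in this series:

	if (def->hash_reg_file)
		io_wq_hash_work(&req->work, file_inode(req->file));
	/* ... */
	io_wq_enqueue(ctx->io_wq, &req->work);	/* hashed and plain alike */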
static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
......@@ -856,14 +898,13 @@ void io_wq_cancel_all(struct io_wq *wq)
}
struct io_cb_cancel_data {
struct io_wqe *wqe;
work_cancel_fn *cancel;
void *caller_data;
work_cancel_fn *fn;
void *data;
};
static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *data = cancel_data;
struct io_cb_cancel_data *match = data;
unsigned long flags;
bool ret = false;
......@@ -874,83 +915,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
data->cancel(worker->cur_work, data->caller_data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
work_cancel_fn *cancel,
void *cancel_data)
{
struct io_cb_cancel_data data = {
.wqe = wqe,
.cancel = cancel,
.caller_data = cancel_data,
};
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
if (cancel(work, cancel_data)) {
wq_node_del(&wqe->work_list, node, prev);
found = true;
break;
}
}
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
io_run_cancel(work);
return IO_WQ_CANCEL_OK;
}
rcu_read_lock();
found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
rcu_read_unlock();
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data)
{
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
ret = io_wqe_cancel_cb_work(wqe, cancel, data);
if (ret != IO_WQ_CANCEL_NOTFOUND)
break;
}
return ret;
}
struct work_match {
bool (*fn)(struct io_wq_work *, void *data);
void *data;
};
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct work_match *match = data;
unsigned long flags;
bool ret = false;
spin_lock_irqsave(&worker->lock, flags);
if (match->fn(worker->cur_work, match->data) &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
......@@ -960,7 +925,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
}
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
struct work_match *match)
struct io_cb_cancel_data *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
......@@ -977,7 +942,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
work = container_of(node, struct io_wq_work, list);
if (match->fn(work, match->data)) {
wq_node_del(&wqe->work_list, node, prev);
wq_list_del(&wqe->work_list, node, prev);
found = true;
break;
}
......@@ -985,7 +950,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
io_run_cancel(work);
io_run_cancel(work, wqe);
return IO_WQ_CANCEL_OK;
}
......@@ -1001,22 +966,16 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
static bool io_wq_work_match(struct io_wq_work *work, void *data)
{
return work == data;
}
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data)
{
struct work_match match = {
.fn = io_wq_work_match,
.data = cwork
struct io_cb_cancel_data match = {
.fn = cancel,
.data = data,
};
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
cwork->flags |= IO_WQ_WORK_CANCEL;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
......@@ -1028,33 +987,28 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
return ret;
}
static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
{
return work == data;
}
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
}
static bool io_wq_pid_match(struct io_wq_work *work, void *data)
{
pid_t pid = (pid_t) (unsigned long) data;
if (work)
return work->task_pid == pid;
return false;
return work->task_pid == pid;
}
enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
{
struct work_match match = {
.fn = io_wq_pid_match,
.data = (void *) (unsigned long) pid
};
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
void *data = (void *) (unsigned long) pid;
ret = io_wqe_cancel_work(wqe, &match);
if (ret != IO_WQ_CANCEL_NOTFOUND)
break;
}
return ret;
return io_wq_cancel_cb(wq, io_wq_pid_match, data);
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
......@@ -1062,6 +1016,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret = -ENOMEM, node;
struct io_wq *wq;
if (WARN_ON_ONCE(!data->free_work))
return ERR_PTR(-EINVAL);
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
......@@ -1072,8 +1029,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(-ENOMEM);
}
wq->get_work = data->get_work;
wq->put_work = data->put_work;
wq->free_work = data->free_work;
/* caller must already hold a reference to this */
wq->user = data->user;
......@@ -1130,7 +1086,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
if (data->get_work != wq->get_work || data->put_work != wq->put_work)
if (data->free_work != wq->free_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
......
......@@ -5,10 +5,8 @@ struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
......@@ -30,6 +28,18 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
{
struct io_wq_work_node *next = pos->next;
pos->next = node;
node->next = next;
if (!next)
list->last = node;
}
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
......@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
static inline void wq_node_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
{
if (node == list->first)
WRITE_ONCE(list->first, node->next);
if (node == list->last)
/* first in the list, if prev==NULL */
if (!prev)
WRITE_ONCE(list->first, last->next);
else
prev->next = last->next;
if (last == list->last)
list->last = prev;
if (prev)
prev->next = node->next;
node->next = NULL;
last->next = NULL;
}
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
{
wq_list_cut(list, node, prev);
}
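wq_list_cut() detaches the whole span (prev, last] in O(1), which is what
lets io_get_next_work() grab an entire same-hash run in one go. A
hypothetical fragment showing the semantics (node names invented):

	/* list: A -> B -> C -> D, where A, B, C share one hash and
	 * c points at C, the tail of the run (wqe->hash_tail[hash]) */
	wq_list_cut(&list, &c->list, NULL);	/* prev == NULL: cut from head */
	/* now: list->first points at D; A -> B -> C is a detached chain
	 * with C's ->next cleared */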
#define wq_list_for_each(pos, prv, head) \
......@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
} while (0)
struct io_wq_work {
union {
struct io_wq_work_node list;
void *data;
};
struct io_wq_work_node list;
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
......@@ -83,14 +99,20 @@ struct io_wq_work {
*(work) = (struct io_wq_work){ .func = _func }; \
} while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
{
if (!work->list.next)
return NULL;
return container_of(work->list.next, struct io_wq_work, list);
}
typedef void (free_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
get_work_fn *get_work;
put_work_fn *put_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
......@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
void io_wq_hash_work(struct io_wq_work *work, void *val);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
}
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
......
......@@ -44,6 +44,7 @@
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>
......@@ -76,6 +77,8 @@
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
......@@ -193,6 +196,13 @@ struct fixed_file_data {
struct completion done;
};
struct io_buffer {
struct list_head list;
__u64 addr;
__s32 len;
__u16 bid;
};
struct io_ring_ctx {
struct {
struct percpu_ref refs;
......@@ -270,6 +280,8 @@ struct io_ring_ctx {
struct socket *ring_sock;
#endif
struct idr io_buffer_idr;
struct idr personality_idr;
struct {
......@@ -290,7 +302,6 @@ struct io_ring_ctx {
struct {
spinlock_t completion_lock;
struct llist_head poll_llist;
/*
* ->poll_list is protected by the ctx->uring_lock for
......@@ -386,7 +397,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
int bgid;
size_t len;
struct io_buffer *kbuf;
};
struct io_open {
......@@ -430,6 +443,24 @@ struct io_epoll {
struct epoll_event event;
};
struct io_splice {
struct file *file_out;
struct file *file_in;
loff_t off_out;
loff_t off_in;
u64 len;
unsigned int flags;
};
struct io_provide_buf {
struct file *file;
__u64 addr;
__s32 len;
__u32 bgid;
__u16 nbufs;
__u16 bid;
};
struct io_async_connect {
struct sockaddr_storage address;
};
......@@ -464,6 +495,7 @@ enum {
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
......@@ -479,6 +511,11 @@ enum {
REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_OVERFLOW_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
};
enum {
......@@ -492,6 +529,8 @@ enum {
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
......@@ -521,6 +560,15 @@ enum {
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* in overflow list */
REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
/* already went through poll handler */
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
};
struct async_poll {
struct io_poll_iocb poll;
struct io_wq_work work;
};
/*
......@@ -546,32 +594,45 @@ struct io_kiocb {
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
struct io_splice splice;
struct io_provide_buf pbuf;
};
struct io_async_ctx *io;
/*
* llist_node is only used for poll deferred completions
*/
struct llist_node llist_node;
bool in_async;
bool needs_fixed_file;
u8 opcode;
struct io_ring_ctx *ctx;
union {
struct list_head list;
struct hlist_node hash_node;
};
struct list_head link_list;
struct list_head list;
unsigned int flags;
refcount_t refs;
union {
struct task_struct *task;
unsigned long fsize;
};
u64 user_data;
u32 result;
u32 sequence;
struct list_head link_list;
struct list_head inflight_entry;
struct io_wq_work work;
union {
/*
* Only commands that never go async can use the below fields,
* obviously. Right now only IORING_OP_POLL_ADD uses them, and
* async armed poll handlers for regular commands. The latter
* restore the work, if needed.
*/
struct {
struct callback_head task_work;
struct hlist_node hash_node;
struct async_poll *apoll;
int cflags;
};
struct io_wq_work work;
};
};
#define IO_PLUG_THRESHOLD 2
......@@ -615,6 +676,11 @@ struct io_op_def {
unsigned file_table : 1;
/* needs ->fs */
unsigned needs_fs : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
};
static const struct io_op_def io_op_defs[] = {
......@@ -624,6 +690,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
},
[IORING_OP_WRITEV] = {
.async_ctx = 1,
......@@ -631,6 +699,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
......@@ -638,11 +707,13 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
......@@ -658,6 +729,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.needs_fs = 1,
.pollout = 1,
},
[IORING_OP_RECVMSG] = {
.async_ctx = 1,
......@@ -665,6 +737,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.needs_fs = 1,
.pollin = 1,
.buffer_select = 1,
},
[IORING_OP_TIMEOUT] = {
.async_ctx = 1,
......@@ -676,6 +750,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.file_table = 1,
.pollin = 1,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
......@@ -687,6 +762,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
......@@ -715,11 +791,14 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
},
[IORING_OP_WRITE] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
......@@ -731,11 +810,14 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
},
[IORING_OP_RECV] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
},
[IORING_OP_OPENAT2] = {
.needs_file = 1,
......@@ -747,6 +829,13 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.file_table = 1,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
};
static void io_wq_submit_work(struct io_wq_work **workptr);
......@@ -761,6 +850,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
static int io_grab_files(struct io_kiocb *req);
static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
const struct io_uring_sqe *sqe);
static struct kmem_cache *req_cachep;
......@@ -827,11 +920,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
init_completion(&ctx->completions[1]);
idr_init(&ctx->io_buffer_idr);
idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
init_llist_head(&ctx->poll_llist);
INIT_LIST_HEAD(&ctx->poll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
......@@ -952,15 +1045,14 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
}
}
static inline bool io_prep_async_work(struct io_kiocb *req,
static inline void io_prep_async_work(struct io_kiocb *req,
struct io_kiocb **link)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
bool do_hashed = false;
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file)
do_hashed = true;
io_wq_hash_work(&req->work, file_inode(req->file));
} else {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
......@@ -969,25 +1061,18 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
return do_hashed;
}
static inline void io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
bool do_hashed;
do_hashed = io_prep_async_work(req, &link);
io_prep_async_work(req, &link);
trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
req->flags);
if (!do_hashed) {
io_wq_enqueue(ctx->io_wq, &req->work);
} else {
io_wq_enqueue_hashed(ctx->io_wq, &req->work,
file_inode(req->file));
}
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
io_wq_enqueue(ctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
......@@ -1054,24 +1139,19 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
return false;
if (!ctx->eventfd_async)
return true;
return io_wq_current_is_worker() || in_interrupt();
return io_wq_current_is_worker();
}
static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
if (trigger_ev)
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
}
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
__io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
}
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
......@@ -1108,7 +1188,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
WRITE_ONCE(cqe->flags, 0);
WRITE_ONCE(cqe->flags, req->cflags);
} else {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
......@@ -1132,7 +1212,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
return cqe != NULL;
}
static void io_cqring_fill_event(struct io_kiocb *req, long res)
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
......@@ -1148,7 +1228,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
if (likely(cqe)) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, 0);
WRITE_ONCE(cqe->flags, cflags);
} else if (ctx->cq_overflow_flushed) {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
......@@ -1160,23 +1240,34 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
req->flags |= REQ_F_OVERFLOW;
refcount_inc(&req->refs);
req->result = res;
req->cflags = cflags;
list_add_tail(&req->list, &ctx->cq_overflow_list);
}
}
static void io_cqring_add_event(struct io_kiocb *req, long res)
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
__io_cqring_fill_event(req, res, 0);
}
static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
io_cqring_fill_event(req, res);
__io_cqring_fill_event(req, res, cflags);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
static void io_cqring_add_event(struct io_kiocb *req, long res)
{
__io_cqring_add_event(req, res, 0);
}
static inline bool io_is_fallback_req(struct io_kiocb *req)
{
return req == (struct io_kiocb *)
......@@ -1246,6 +1337,15 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
return NULL;
}
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
if (fixed)
percpu_ref_put(&req->ctx->file_data->refs);
else
fput(file);
}
static void __io_req_do_free(struct io_kiocb *req)
{
if (likely(!io_is_fallback_req(req)))
......@@ -1256,18 +1356,12 @@ static void __io_req_do_free(struct io_kiocb *req)
static void __io_req_aux_free(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
if (req->flags & REQ_F_NEED_CLEANUP)
io_cleanup_req(req);
kfree(req->io);
if (req->file) {
if (req->flags & REQ_F_FIXED_FILE)
percpu_ref_put(&ctx->file_data->refs);
else
fput(req->file);
}
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
io_req_work_drop_env(req);
}
......@@ -1474,6 +1568,30 @@ static void io_free_req(struct io_kiocb *req)
io_queue_async_work(nxt);
}
static void io_link_work_cb(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *link;
link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
io_queue_linked_timeout(link);
io_wq_submit_work(workptr);
}
static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
{
struct io_kiocb *link;
const struct io_op_def *def = &io_op_defs[nxt->opcode];
if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
io_wq_hash_work(&nxt->work, file_inode(nxt->file));
*workptr = &nxt->work;
link = io_prep_linked_timeout(nxt);
if (link)
nxt->work.func = io_link_work_cb;
}
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
......@@ -1493,6 +1611,26 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
static void io_steal_work(struct io_kiocb *req,
struct io_wq_work **workptr)
{
/*
* It's in an io-wq worker, so there should always be at least
* one reference, which will be dropped in io_put_work() just
* after the current handler returns.
*
* It also means that if the counter dropped to 1, then there are
* no asynchronous users left, so it's safe to steal the next work.
*/
if (refcount_read(&req->refs) == 1) {
struct io_kiocb *nxt = NULL;
io_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
}
}
/*
* Must only be used if we don't need to care about links, usually from
* within the completion handling itself.
......@@ -1554,6 +1692,19 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
return true;
}
static int io_put_kbuf(struct io_kiocb *req)
{
struct io_buffer *kbuf;
int cflags;
kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
cflags |= IORING_CQE_F_BUFFER;
req->rw.addr = 0;
kfree(kbuf);
return cflags;
}
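io_put_kbuf() packs the consumed buffer ID into the CQE flags; the
userspace side of that encoding is a sketch like the following, where
buf_base and buf_len are the application's own bookkeeping from
PROVIDE_BUFFERS:

	if (cqe->flags & IORING_CQE_F_BUFFER) {
		unsigned int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
		void *data = (char *) buf_base + (size_t) bid * buf_len;
		/* cqe->res bytes are valid; the kernel won't reuse this
		 * buffer until it is provided again */
	}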
/*
* Find and free completed poll iocbs
*/
......@@ -1565,10 +1716,15 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
int cflags = 0;
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
io_cqring_fill_event(req, req->result);
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_kbuf(req);
__io_cqring_fill_event(req, req->result, cflags);
(*nr_events)++;
if (refcount_dec_and_test(&req->refs) &&
......@@ -1577,6 +1733,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
io_commit_cqring(ctx);
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
}
......@@ -1743,13 +1901,16 @@ static inline void req_set_fail_links(struct io_kiocb *req)
static void io_complete_rw_common(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
int cflags = 0;
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (res != req->result)
req_set_fail_links(req);
io_cqring_add_event(req, res);
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_kbuf(req);
__io_cqring_add_event(req, res, cflags);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
......@@ -1760,17 +1921,6 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
io_put_req(req);
}
static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_kiocb *nxt = NULL;
io_complete_rw_common(kiocb, res);
io_put_req_find_next(req, &nxt);
return nxt;
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
......@@ -1841,7 +1991,7 @@ static void io_file_put(struct io_submit_state *state)
* assuming most submissions are for one file, or at least that each file
* has more than one submission.
*/
static struct file *io_file_get(struct io_submit_state *state, int fd)
static struct file *__io_file_get(struct io_submit_state *state, int fd)
{
if (!state)
return fget(fd);
......@@ -1938,7 +2088,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
/* we own ->private, reuse it for the buffer index */
/* we own ->private, reuse it for the buffer index / buffer ID */
req->rw.kiocb.private = (void *) (unsigned long)
READ_ONCE(sqe->buf_index);
return 0;
......@@ -1965,15 +2115,14 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
bool in_async)
static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
*nxt = __io_complete_rw(kiocb, ret);
if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
io_complete_rw(kiocb, ret, 0);
else
io_rw_done(kiocb, ret);
}
......@@ -2052,11 +2201,147 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
return len;
}
static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
{
if (needs_lock)
mutex_unlock(&ctx->uring_lock);
}
static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
{
/*
* "Normal" inline submissions always hold the uring_lock, since we
* grab it from the system call. Same is true for the SQPOLL offload.
* The only exception is when we've detached the request and issue it
* from an async worker thread; grab the lock for that case.
*/
if (needs_lock)
mutex_lock(&ctx->uring_lock);
}
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
int bgid, struct io_buffer *kbuf,
bool needs_lock)
{
struct io_buffer *head;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
io_ring_submit_lock(req->ctx, needs_lock);
lockdep_assert_held(&req->ctx->uring_lock);
head = idr_find(&req->ctx->io_buffer_idr, bgid);
if (head) {
if (!list_empty(&head->list)) {
kbuf = list_last_entry(&head->list, struct io_buffer,
list);
list_del(&kbuf->list);
} else {
kbuf = head;
idr_remove(&req->ctx->io_buffer_idr, bgid);
}
if (*len > kbuf->len)
*len = kbuf->len;
} else {
kbuf = ERR_PTR(-ENOBUFS);
}
io_ring_submit_unlock(req->ctx, needs_lock);
return kbuf;
}
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
bool needs_lock)
{
struct io_buffer *kbuf;
int bgid;
kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
bgid = (int) (unsigned long) req->rw.kiocb.private;
kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
if (IS_ERR(kbuf))
return kbuf;
req->rw.addr = (u64) (unsigned long) kbuf;
req->flags |= REQ_F_BUFFER_SELECTED;
return u64_to_user_ptr(kbuf->addr);
}
#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
bool needs_lock)
{
struct compat_iovec __user *uiov;
compat_ssize_t clen;
void __user *buf;
ssize_t len;
uiov = u64_to_user_ptr(req->rw.addr);
if (!access_ok(uiov, sizeof(*uiov)))
return -EFAULT;
if (__get_user(clen, &uiov->iov_len))
return -EFAULT;
if (clen < 0)
return -EINVAL;
len = clen;
buf = io_rw_buffer_select(req, &len, needs_lock);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
iov[0].iov_len = (compat_size_t) len;
return 0;
}
#endif
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
bool needs_lock)
{
struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
void __user *buf;
ssize_t len;
if (copy_from_user(iov, uiov, sizeof(*uiov)))
return -EFAULT;
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
buf = io_rw_buffer_select(req, &len, needs_lock);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
iov[0].iov_len = len;
return 0;
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
bool needs_lock)
{
if (req->flags & REQ_F_BUFFER_SELECTED)
return 0;
if (!req->rw.len)
return 0;
else if (req->rw.len > 1)
return -EINVAL;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
return io_compat_import(req, iov, needs_lock);
#endif
return __io_iov_buffer_select(req, iov, needs_lock);
}
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter)
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
{
void __user *buf = u64_to_user_ptr(req->rw.addr);
size_t sqe_len = req->rw.len;
ssize_t ret;
u8 opcode;
opcode = req->opcode;
......@@ -2065,12 +2350,20 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return io_import_fixed(req, rw, iter);
}
/* buffer index only valid with fixed read/write */
if (req->rw.kiocb.private)
/* buffer index only valid with fixed read/write, or buffer select */
if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
ssize_t ret;
if (req->flags & REQ_F_BUFFER_SELECT) {
buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
if (IS_ERR(buf)) {
*iovec = NULL;
return PTR_ERR(buf);
}
req->rw.len = sqe_len;
}
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
return ret < 0 ? ret : sqe_len;
......@@ -2086,6 +2379,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return iorw->size;
}
if (req->flags & REQ_F_BUFFER_SELECT) {
ret = io_iov_buffer_select(req, *iovec, needs_lock);
if (!ret) {
ret = (*iovec)->iov_len;
iov_iter_init(iter, rw, *iovec, 1, ret);
}
*iovec = NULL;
return ret;
}
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
......@@ -2169,12 +2472,18 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
}
}
static inline int __io_alloc_async_ctx(struct io_kiocb *req)
{
req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
return req->io == NULL;
}
static int io_alloc_async_ctx(struct io_kiocb *req)
{
if (!io_op_defs[req->opcode].async_ctx)
return 0;
req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
return req->io == NULL;
return __io_alloc_async_ctx(req);
}
static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
......@@ -2184,7 +2493,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
if (!io_op_defs[req->opcode].async_ctx)
return 0;
if (!req->io) {
if (io_alloc_async_ctx(req))
if (__io_alloc_async_ctx(req))
return -ENOMEM;
io_req_map_rw(req, io_size, iovec, fast_iov, iter);
......@@ -2213,7 +2522,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
io = req->io;
io->rw.iov = io->rw.fast_iov;
req->io = NULL;
ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
req->io = io;
if (ret < 0)
return ret;
......@@ -2222,8 +2531,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_read(struct io_kiocb *req, bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
......@@ -2231,13 +2539,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
size_t iov_count;
ssize_t io_size, ret;
ret = io_import_iovec(READ, req, &iovec, &iter);
ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
kiocb->ki_flags &= ~IOCB_NOWAIT;
req->result = 0;
io_size = ret;
......@@ -2248,10 +2556,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
if (force_nonblock && !io_file_supports_async(req->file)) {
req->flags |= REQ_F_MUST_PUNT;
if (force_nonblock && !io_file_supports_async(req->file))
goto copy_iov;
}
iov_count = iov_iter_count(&iter);
ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
......@@ -2265,13 +2571,16 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async);
kiocb_done(kiocb, ret2);
} else {
copy_iov:
ret = io_setup_async_rw(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
/* any defer here is final, must retry from blocking context */
if (!(req->flags & REQ_F_NOWAIT))
req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
......@@ -2295,6 +2604,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
req->fsize = rlimit(RLIMIT_FSIZE);
/* either don't need iovec imported or already have it */
if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
......@@ -2302,7 +2613,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
io = req->io;
io->rw.iov = io->rw.fast_iov;
req->io = NULL;
ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
req->io = io;
if (ret < 0)
return ret;
......@@ -2311,8 +2622,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_write(struct io_kiocb *req, bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
......@@ -2320,7 +2630,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
size_t iov_count;
ssize_t ret, io_size;
ret = io_import_iovec(WRITE, req, &iovec, &iter);
ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
......@@ -2337,10 +2647,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
if (force_nonblock && !io_file_supports_async(req->file)) {
req->flags |= REQ_F_MUST_PUNT;
if (force_nonblock && !io_file_supports_async(req->file))
goto copy_iov;
}
/* file path doesn't support NOWAIT for non-direct_IO */
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
......@@ -2367,24 +2675,33 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
}
kiocb->ki_flags |= IOCB_WRITE;
if (!force_nonblock)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
if (req->file->f_op->write_iter)
ret2 = call_write_iter(req->file, kiocb, &iter);
else
ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
if (!force_nonblock)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/*
* Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just
* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
* retry them without IOCB_NOWAIT.
*/
if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
ret2 = -EAGAIN;
if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async);
kiocb_done(kiocb, ret2);
} else {
copy_iov:
ret = io_setup_async_rw(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
/* any defer here is final, must retry from blocking context */
req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
......@@ -2394,6 +2711,76 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
return ret;
}
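The RLIMIT_FSIZE handling above captures the submitter's limit at prep
time (req->fsize) and applies it around the actual write in the worker.
A hedged userspace check that the limit is now honoured for buffered
io_uring writes, assuming liburing's standard helpers:

#include <liburing.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>

int main(void)
{
	struct rlimit rl = { .rlim_cur = 4096, .rlim_max = 4096 };
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	static char buf[8192];
	int fd;

	signal(SIGXFSZ, SIG_IGN);	/* we want an error, not a signal */
	setrlimit(RLIMIT_FSIZE, &rl);

	fd = open("testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	io_uring_queue_init(4, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_write(sqe, fd, buf, sizeof(buf), 0);	/* 8K > 4K cap */
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	/* with this series the write is capped at 4096 (or fails with
	 * -EFBIG past the limit) instead of ignoring RLIMIT_FSIZE */
	printf("res = %d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}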
static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_splice *sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
int ret;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
sp->file_in = NULL;
sp->off_in = READ_ONCE(sqe->splice_off_in);
sp->off_out = READ_ONCE(sqe->off);
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
(sp->flags & SPLICE_F_FD_IN_FIXED));
if (ret)
return ret;
req->flags |= REQ_F_NEED_CLEANUP;
if (!S_ISREG(file_inode(sp->file_in)->i_mode))
req->work.flags |= IO_WQ_WORK_UNBOUND;
return 0;
}
static bool io_splice_punt(struct file *file)
{
if (get_pipe_info(file))
return false;
if (!io_file_supports_async(file))
return true;
return !(file->f_mode & O_NONBLOCK);
}
static int io_splice(struct io_kiocb *req, bool force_nonblock)
{
struct io_splice *sp = &req->splice;
struct file *in = sp->file_in;
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
long ret;
if (force_nonblock) {
if (io_splice_punt(in) || io_splice_punt(out))
return -EAGAIN;
flags |= SPLICE_F_NONBLOCK;
}
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
req->flags &= ~REQ_F_NEED_CLEANUP;
io_cqring_add_event(req, ret);
if (ret != sp->len)
req_set_fail_links(req);
io_put_req(req);
return 0;
}
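Userspace drives the new IORING_OP_SPLICE through the SQE fields read by
io_splice_prep() above. A raw sketch (ring plumbing assumed, error
handling omitted):

	/* move 'len' bytes from a pipe into a socket */
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode		= IORING_OP_SPLICE;
	sqe->splice_fd_in	= pipefd;	/* source fd */
	sqe->fd			= sockfd;	/* destination fd */
	sqe->splice_off_in	= -1;		/* -1: use the fd's own offset */
	sqe->off		= -1;		/* same, for the output side */
	sqe->len		= len;
	sqe->splice_flags	= 0;		/* or SPLICE_F_FD_IN_FIXED */
	io_uring_submit(&ring);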
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
......@@ -2442,85 +2829,63 @@ static bool io_req_cancelled(struct io_kiocb *req)
return false;
}
static void io_link_work_cb(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *link = work->data;
io_queue_linked_timeout(link);
work->func = io_wq_submit_work;
}
static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
{
struct io_kiocb *link;
io_prep_async_work(nxt, &link);
*workptr = &nxt->work;
if (link) {
nxt->work.flags |= IO_WQ_WORK_CB;
nxt->work.func = io_link_work_cb;
nxt->work.data = link;
}
}
static void io_fsync_finish(struct io_wq_work **workptr)
static void __io_fsync(struct io_kiocb *req)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
loff_t end = req->sync.off + req->sync.len;
struct io_kiocb *nxt = NULL;
int ret;
if (io_req_cancelled(req))
return;
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
io_put_req(req);
}
static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static void io_fsync_finish(struct io_wq_work **workptr)
{
struct io_wq_work *work, *old_work;
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
if (io_req_cancelled(req))
return;
__io_fsync(req);
io_steal_work(req, workptr);
}
static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
/* fsync always requires a blocking context */
if (force_nonblock) {
io_put_req(req);
req->work.func = io_fsync_finish;
return -EAGAIN;
}
work = old_work = &req->work;
io_fsync_finish(&work);
if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work);
__io_fsync(req);
return 0;
}
static void io_fallocate_finish(struct io_wq_work **workptr)
static void __io_fallocate(struct io_kiocb *req)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
int ret;
if (io_req_cancelled(req))
return;
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
io_put_req(req);
}
static void io_fallocate_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
if (io_req_cancelled(req))
return;
__io_fallocate(req);
io_steal_work(req, workptr);
}
static int io_fallocate_prep(struct io_kiocb *req,
......@@ -2532,26 +2897,19 @@ static int io_fallocate_prep(struct io_kiocb *req,
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
req->sync.mode = READ_ONCE(sqe->len);
req->fsize = rlimit(RLIMIT_FSIZE);
return 0;
}
static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
struct io_wq_work *work, *old_work;
/* fallocate always requires a blocking context */
if (force_nonblock) {
io_put_req(req);
req->work.func = io_fallocate_finish;
return -EAGAIN;
}
work = old_work = &req->work;
io_fallocate_finish(&work);
if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work);
__io_fallocate(req);
return 0;
}
......@@ -2626,8 +2984,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_openat2(struct io_kiocb *req, bool force_nonblock)
{
struct open_flags op;
struct file *file;
......@@ -2658,15 +3015,171 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
}
static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_openat(struct io_kiocb *req, bool force_nonblock)
{
req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
return io_openat2(req, nxt, force_nonblock);
return io_openat2(req, force_nonblock);
}
static int io_remove_buffers_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
if (!tmp || tmp > USHRT_MAX)
return -EINVAL;
memset(p, 0, sizeof(*p));
p->nbufs = tmp;
p->bgid = READ_ONCE(sqe->buf_group);
return 0;
}
static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
int bgid, unsigned nbufs)
{
unsigned i = 0;
/* shouldn't happen */
if (!nbufs)
return 0;
/* the head kbuf is the list itself */
while (!list_empty(&buf->list)) {
struct io_buffer *nxt;
nxt = list_first_entry(&buf->list, struct io_buffer, list);
list_del(&nxt->list);
kfree(nxt);
if (++i == nbufs)
return i;
}
i++;
kfree(buf);
idr_remove(&ctx->io_buffer_idr, bgid);
return i;
}
static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head;
int ret = 0;
io_ring_submit_lock(ctx, !force_nonblock);
lockdep_assert_held(&ctx->uring_lock);
ret = -ENOENT;
head = idr_find(&ctx->io_buffer_idr, p->bgid);
if (head)
ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
io_ring_submit_unlock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
return 0;
}
static int io_provide_buffers_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
if (!tmp || tmp > USHRT_MAX)
return -E2BIG;
p->nbufs = tmp;
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
if (!access_ok(u64_to_user_ptr(p->addr), p->len))
return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group);
tmp = READ_ONCE(sqe->off);
if (tmp > USHRT_MAX)
return -E2BIG;
p->bid = tmp;
return 0;
}
static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
buf = kmalloc(sizeof(*buf), GFP_KERNEL);
if (!buf)
break;
buf->addr = addr;
buf->len = pbuf->len;
buf->bid = bid;
addr += pbuf->len;
bid++;
if (!*head) {
INIT_LIST_HEAD(&buf->list);
*head = buf;
} else {
list_add_tail(&buf->list, &(*head)->list);
}
}
return i ? i : -ENOMEM;
}
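io_add_buffers() slices one contiguous user region into nbufs buffers
with consecutive IDs. A worked example of the addressing:

	/* addr = X, len = 4096, nbufs = 4, bid = 0 yields, in group bgid:
	 *   bid 0 -> [X,           X +  4096)
	 *   bid 1 -> [X +  4096,   X +  8192)
	 *   bid 2 -> [X +  8192,   X + 12288)
	 *   bid 3 -> [X + 12288,   X + 16384)
	 * each buffer is handed out at most once per PROVIDE_BUFFERS round */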
static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head, *list;
int ret = 0;
io_ring_submit_lock(ctx, !force_nonblock);
lockdep_assert_held(&ctx->uring_lock);
list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
ret = io_add_buffers(p, &head);
if (ret < 0)
goto out;
if (!list) {
ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
GFP_KERNEL);
if (ret < 0) {
__io_remove_buffers(ctx, head, p->bgid, -1U);
goto out;
}
}
out:
io_ring_submit_unlock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
return 0;
}
static int io_epoll_ctl_prep(struct io_kiocb *req,
......@@ -2694,8 +3207,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#endif
}
static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_EPOLL)
struct io_epoll *ie = &req->epoll;
......@@ -2708,7 +3220,7 @@ static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
......@@ -2730,8 +3242,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#endif
}
static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_madvise(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = &req->madvise;
......@@ -2744,7 +3255,7 @@ static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
......@@ -2762,8 +3273,7 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
{
struct io_fadvise *fa = &req->fadvise;
int ret;
......@@ -2783,7 +3293,7 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
}
......@@ -2820,8 +3330,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_statx(struct io_kiocb *req, bool force_nonblock)
{
struct io_open *ctx = &req->open;
unsigned lookup_flags;
......@@ -2858,7 +3367,7 @@ static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
}
......@@ -2885,7 +3394,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
/* only called when __close_fd_get_file() is done */
static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
static void __io_close_finish(struct io_kiocb *req)
{
int ret;
......@@ -2894,22 +3403,19 @@ static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
fput(req->close.put_file);
io_put_req_find_next(req, nxt);
io_put_req(req);
}
static void io_close_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
/* not cancellable, don't do io_req_cancelled() */
__io_close_finish(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
__io_close_finish(req);
io_steal_work(req, workptr);
}
static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_close(struct io_kiocb *req, bool force_nonblock)
{
int ret;
......@@ -2919,23 +3425,25 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
return ret;
/* if the file has a flush method, be safe and punt to async */
if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
goto eagain;
if (req->close.put_file->f_op->flush && force_nonblock) {
/* submission ref will be dropped, take it for async */
refcount_inc(&req->refs);
req->work.func = io_close_finish;
/*
* Do manual async queue here to avoid grabbing files - we don't
* need the files, and it'll cause io_close_finish() to close
* the file again and cause a double CQE entry for this request
*/
io_queue_async_work(req);
return 0;
}
/*
* No ->flush(), safely close from here and just punt the
* fput() to async context.
*/
__io_close_finish(req, nxt);
return 0;
eagain:
req->work.func = io_close_finish;
/*
* Do manual async queue here to avoid grabbing files - we don't
* need the files, and it'll cause io_close_finish() to close
* the file again and cause a double CQE entry for this request
*/
io_queue_async_work(req);
__io_close_finish(req);
return 0;
}
......@@ -2957,47 +3465,62 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
static void io_sync_file_range_finish(struct io_wq_work **workptr)
static void __io_sync_file_range(struct io_kiocb *req)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
int ret;
if (io_req_cancelled(req))
return;
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, &nxt);
io_put_req(req);
}
static void io_sync_file_range_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
if (io_req_cancelled(req))
return;
__io_sync_file_range(req);
io_put_req(req); /* put submission ref */
if (nxt)
io_wq_assign_next(workptr, nxt);
}
static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
{
struct io_wq_work *work, *old_work;
/* sync_file_range always requires a blocking context */
if (force_nonblock) {
io_put_req(req);
req->work.func = io_sync_file_range_finish;
return -EAGAIN;
}
work = old_work = &req->work;
io_sync_file_range_finish(&work);
if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work);
__io_sync_file_range(req);
return 0;
}
#if defined(CONFIG_NET)
static int io_setup_async_msg(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
{
if (req->io)
return -EAGAIN;
if (io_alloc_async_ctx(req)) {
if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
return -ENOMEM;
}
req->flags |= REQ_F_NEED_CLEANUP;
memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
return -EAGAIN;
}
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
int ret;
......@@ -3023,15 +3546,10 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
#else
return -EOPNOTSUPP;
#endif
}
static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_NET)
struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
int ret;
......@@ -3071,18 +3589,8 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
flags |= MSG_DONTWAIT;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (force_nonblock && ret == -EAGAIN) {
if (req->io)
return -EAGAIN;
if (io_alloc_async_ctx(req)) {
if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
return -ENOMEM;
}
req->flags |= REQ_F_NEED_CLEANUP;
memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
return -EAGAIN;
}
if (force_nonblock && ret == -EAGAIN)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
......@@ -3093,17 +3601,12 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_send(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_NET)
struct socket *sock;
int ret;
......@@ -3144,17 +3647,120 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
{
struct io_sr_msg *sr = &req->sr_msg;
struct iovec __user *uiov;
size_t iov_len;
int ret;
ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
&uiov, &iov_len);
if (ret)
return ret;
if (req->flags & REQ_F_BUFFER_SELECT) {
if (iov_len > 1)
return -EINVAL;
if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
return -EFAULT;
sr->len = io->msg.iov[0].iov_len;
iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
sr->len);
io->msg.iov = NULL;
} else {
ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
&io->msg.iov, &io->msg.msg.msg_iter);
if (ret > 0)
ret = 0;
}
return ret;
}
#ifdef CONFIG_COMPAT
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
struct io_async_ctx *io)
{
struct compat_msghdr __user *msg_compat;
struct io_sr_msg *sr = &req->sr_msg;
struct compat_iovec __user *uiov;
compat_uptr_t ptr;
compat_size_t len;
int ret;
msg_compat = (struct compat_msghdr __user *) sr->msg;
ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
&ptr, &len);
if (ret)
return ret;
uiov = compat_ptr(ptr);
if (req->flags & REQ_F_BUFFER_SELECT) {
compat_ssize_t clen;
if (len > 1)
return -EINVAL;
if (!access_ok(uiov, sizeof(*uiov)))
return -EFAULT;
if (__get_user(clen, &uiov->iov_len))
return -EFAULT;
if (clen < 0)
return -EINVAL;
sr->len = io->msg.iov[0].iov_len;
io->msg.iov = NULL;
} else {
ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
&io->msg.iov,
&io->msg.msg.msg_iter);
if (ret < 0)
return ret;
}
return 0;
}
#endif
static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
{
io->msg.iov = io->msg.fast_iov;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
return __io_compat_recvmsg_copy_hdr(req, io);
#endif
return __io_recvmsg_copy_hdr(req, io);
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
int *cflags, bool needs_lock)
{
struct io_sr_msg *sr = &req->sr_msg;
struct io_buffer *kbuf;
if (!(req->flags & REQ_F_BUFFER_SELECT))
return NULL;
kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
if (IS_ERR(kbuf))
return kbuf;
sr->kbuf = kbuf;
req->flags |= REQ_F_BUFFER_SELECTED;
*cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
*cflags |= IORING_CQE_F_BUFFER;
return kbuf;
}
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
int ret;
......@@ -3162,6 +3768,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->bgid = READ_ONCE(sqe->buf_group);
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
......@@ -3174,30 +3781,24 @@ static int io_recvmsg_prep(struct io_kiocb *req,
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
io->msg.iov = io->msg.fast_iov;
ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.uaddr, &io->msg.iov);
ret = io_recvmsg_copy_hdr(req, io);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
#else
return -EOPNOTSUPP;
#endif
}
static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_NET)
struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
int ret;
int ret, cflags = 0;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_buffer *kbuf;
struct io_async_ctx io;
unsigned flags;
......@@ -3209,19 +3810,23 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
kmsg->iov = kmsg->fast_iov;
kmsg->msg.msg_iter.iov = kmsg->iov;
} else {
struct io_sr_msg *sr = &req->sr_msg;
kmsg = &io.msg;
kmsg->msg.msg_name = &io.msg.addr;
io.msg.iov = io.msg.fast_iov;
ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
sr->msg_flags, &io.msg.uaddr,
&io.msg.iov);
ret = io_recvmsg_copy_hdr(req, &io);
if (ret)
return ret;
}
kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
if (IS_ERR(kbuf)) {
return PTR_ERR(kbuf);
} else if (kbuf) {
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
1, req->sr_msg.len);
}
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
......@@ -3230,18 +3835,8 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
kmsg->uaddr, flags);
if (force_nonblock && ret == -EAGAIN) {
if (req->io)
return -EAGAIN;
if (io_alloc_async_ctx(req)) {
if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
return -ENOMEM;
}
memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
req->flags |= REQ_F_NEED_CLEANUP;
return -EAGAIN;
}
if (force_nonblock && ret == -EAGAIN)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
......@@ -3249,22 +3844,18 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
if (kmsg && kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_cqring_add_event(req, ret);
__io_cqring_add_event(req, ret, cflags);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
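As __io_recvmsg_copy_hdr() above enforces, IORING_OP_RECVMSG with buffer selection takes exactly one iovec: its iov_len caps the transfer and its iov_base is superseded by the selected buffer. A hedged sketch of the submission side (helper name and sizes are illustrative):

#include <string.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <linux/io_uring.h>

/* Hypothetical helper: receive into a kernel-selected buffer from
 * group 1. msg and iov must stay live until completion. */
static void prep_recvmsg_select(struct io_uring_sqe *sqe, int sockfd,
				struct msghdr *msg, struct iovec *iov)
{
	iov->iov_base = NULL;	/* replaced by the selected buffer */
	iov->iov_len = 4096;	/* caps the receive size */
	memset(msg, 0, sizeof(*msg));
	msg->msg_iov = iov;
	msg->msg_iovlen = 1;	/* more than one iovec fails with -EINVAL */

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RECVMSG;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long) msg;
	sqe->buf_group = 1;	/* group id (bgid) to select from */
	sqe->flags = IOSQE_BUFFER_SELECT;
}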
static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_recv(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_NET)
struct io_buffer *kbuf = NULL;
struct socket *sock;
int ret;
int ret, cflags = 0;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
......@@ -3272,15 +3863,25 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
void __user *buf = sr->buf;
struct msghdr msg;
struct iovec iov;
unsigned flags;
ret = import_single_range(READ, sr->buf, sr->len, &iov,
kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
else if (kbuf)
buf = u64_to_user_ptr(kbuf->addr);
ret = import_single_range(READ, buf, sr->len, &iov,
&msg.msg_iter);
if (ret)
if (ret) {
kfree(kbuf);
return ret;
}
req->flags |= REQ_F_NEED_CLEANUP;
msg.msg_name = NULL;
msg.msg_control = NULL;
msg.msg_controllen = 0;
......@@ -3301,20 +3902,17 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
ret = -EINTR;
}
io_cqring_add_event(req, ret);
kfree(kbuf);
req->flags &= ~REQ_F_NEED_CLEANUP;
__io_cqring_add_event(req, ret, cflags);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_accept *accept = &req->accept;
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
......@@ -3327,14 +3925,9 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
#if defined(CONFIG_NET)
static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int __io_accept(struct io_kiocb *req, bool force_nonblock)
{
struct io_accept *accept = &req->accept;
unsigned file_flags;
......@@ -3351,44 +3944,34 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
}
static void io_accept_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
if (io_req_cancelled(req))
return;
__io_accept(req, &nxt, false);
if (nxt)
io_wq_assign_next(workptr, nxt);
__io_accept(req, false);
io_steal_work(req, workptr);
}
#endif
static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_NET)
int ret;
ret = __io_accept(req, nxt, force_nonblock);
ret = __io_accept(req, force_nonblock);
if (ret == -EAGAIN && force_nonblock) {
req->work.func = io_accept_finish;
io_put_req(req);
return -EAGAIN;
}
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_connect *conn = &req->connect;
struct io_async_ctx *io = req->io;
......@@ -3405,15 +3988,10 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return move_addr_to_kernel(conn->addr, conn->addr_len,
&io->connect.address);
#else
return -EOPNOTSUPP;
#endif
}
static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_connect(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_NET)
struct io_async_ctx __io, *io;
unsigned file_flags;
int ret;
......@@ -3449,25 +4027,301 @@ static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
#else /* !CONFIG_NET */
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return -EOPNOTSUPP;
#endif
}
static void io_poll_remove_one(struct io_kiocb *req)
static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
{
return -EOPNOTSUPP;
}
static int io_send(struct io_kiocb *req, bool force_nonblock)
{
return -EOPNOTSUPP;
}
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
return -EOPNOTSUPP;
}
static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
{
return -EOPNOTSUPP;
}
static int io_recv(struct io_kiocb *req, bool force_nonblock)
{
return -EOPNOTSUPP;
}
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return -EOPNOTSUPP;
}
static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
return -EOPNOTSUPP;
}
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return -EOPNOTSUPP;
}
static int io_connect(struct io_kiocb *req, bool force_nonblock)
{
return -EOPNOTSUPP;
}
#endif /* CONFIG_NET */
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
int error;
};
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
struct wait_queue_head *head)
{
if (unlikely(poll->head)) {
pt->error = -EINVAL;
return;
}
pt->error = 0;
poll->head = head;
add_wait_queue(head, &poll->wait);
}
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
__io_queue_proc(&pt->req->apoll->poll, pt, head);
}
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
struct task_struct *tsk;
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
list_del_init(&poll->wait.entry);
tsk = req->task;
req->result = mask;
init_task_work(&req->task_work, func);
/*
* If this fails, then the task is exiting. If that is the case, then
* the exit check will ultimately cancel these work items. Hence we
* don't need to check here and handle it specifically.
*/
task_work_add(tsk, &req->task_work, true);
wake_up_process(tsk);
return 1;
}
static void io_async_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
if (hash_hashed(&req->hash_node)) {
spin_lock_irq(&ctx->completion_lock);
hash_del(&req->hash_node);
spin_unlock_irq(&ctx->completion_lock);
}
/* restore ->work in case we need to retry again */
memcpy(&req->work, &apoll->work, sizeof(req->work));
__set_current_state(TASK_RUNNING);
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
kfree(apoll);
}
static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
struct io_kiocb *req = wait->private;
struct io_poll_iocb *poll = &req->apoll->poll;
trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
key_to_poll(key));
return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
}
static void io_poll_req_insert(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct hlist_head *list;
list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
hlist_add_head(&req->hash_node, list);
}
static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
struct io_poll_iocb *poll,
struct io_poll_table *ipt, __poll_t mask,
wait_queue_func_t wake_func)
__acquires(&ctx->completion_lock)
{
struct io_ring_ctx *ctx = req->ctx;
bool cancel = false;
poll->file = req->file;
poll->head = NULL;
poll->done = poll->canceled = false;
poll->events = mask;
ipt->pt._key = mask;
ipt->req = req;
ipt->error = -EINVAL;
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, wake_func);
poll->wait.private = req;
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
spin_lock_irq(&ctx->completion_lock);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
if (unlikely(list_empty(&poll->wait.entry))) {
if (ipt->error)
cancel = true;
ipt->error = 0;
mask = 0;
}
if (mask || ipt->error)
list_del_init(&poll->wait.entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
io_poll_req_insert(req);
spin_unlock(&poll->head->lock);
}
return mask;
}
static bool io_arm_poll_handler(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
if (!req->file || !file_can_poll(req->file))
return false;
if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
return false;
if (!def->pollin && !def->pollout)
return false;
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return false;
req->flags |= REQ_F_POLLED;
memcpy(&apoll->work, &req->work, sizeof(req->work));
/*
* Don't need a reference here, as we're adding it to the task's
* task_works list. If the task exits, the list is pruned.
*/
req->task = current;
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
mask = 0;
if (def->pollin)
mask |= POLLIN | POLLRDNORM;
if (def->pollout)
mask |= POLLOUT | POLLWRNORM;
mask |= POLLERR | POLLPRI;
ipt.pt._qproc = io_async_queue_proc;
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
if (ret) {
ipt.error = 0;
apoll->poll.done = true;
spin_unlock_irq(&ctx->completion_lock);
memcpy(&req->work, &apoll->work, sizeof(req->work));
kfree(apoll);
return false;
}
spin_unlock_irq(&ctx->completion_lock);
trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
apoll->poll.events);
return true;
}
static bool __io_poll_remove_one(struct io_kiocb *req,
struct io_poll_iocb *poll)
{
struct io_poll_iocb *poll = &req->poll;
bool do_complete = false;
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
list_del_init(&poll->wait.entry);
io_queue_async_work(req);
do_complete = true;
}
spin_unlock(&poll->head->lock);
return do_complete;
}
static bool io_poll_remove_one(struct io_kiocb *req)
{
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &req->apoll->poll);
if (do_complete)
io_put_req(req);
}
hash_del(&req->hash_node);
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
req->flags |= REQ_F_COMP_LOCKED;
io_put_req(req);
}
return do_complete;
}
static void io_poll_remove_all(struct io_ring_ctx *ctx)
......@@ -3485,6 +4339,8 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
......@@ -3494,10 +4350,11 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
hlist_for_each_entry(req, list, hash_node) {
if (sqe_addr == req->user_data) {
io_poll_remove_one(req);
if (sqe_addr != req->user_data)
continue;
if (io_poll_remove_one(req))
return 0;
}
return -EALREADY;
}
return -ENOENT;
......@@ -3543,186 +4400,54 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
struct io_ring_ctx *ctx = req->ctx;
req->poll.done = true;
if (error)
io_cqring_fill_event(req, error);
else
io_cqring_fill_event(req, mangle_poll(mask));
io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
}
static void io_poll_complete_work(struct io_wq_work **workptr)
static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_poll_iocb *poll = &req->poll;
struct poll_table_struct pt = { ._key = poll->events };
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt = NULL;
__poll_t mask = 0;
int ret = 0;
if (work->flags & IO_WQ_WORK_CANCEL) {
WRITE_ONCE(poll->canceled, true);
ret = -ECANCELED;
} else if (READ_ONCE(poll->canceled)) {
ret = -ECANCELED;
}
if (ret != -ECANCELED)
mask = vfs_poll(poll->file, &pt) & poll->events;
/*
* Note that ->ki_cancel callers also delete iocb from active_reqs after
* calling ->ki_cancel. We need the ctx_lock roundtrip here to
* synchronize with them. In the cancellation case the list_del_init
* itself is not actually needed, but harmless so we keep it in to
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->completion_lock);
if (!mask && ret != -ECANCELED) {
add_wait_queue(poll->head, &poll->wait);
spin_unlock_irq(&ctx->completion_lock);
return;
}
hash_del(&req->hash_node);
io_poll_complete(req, mask, ret);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
io_put_req_find_next(req, nxt);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
}
static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
static void io_poll_task_func(struct callback_head *cb)
{
struct io_kiocb *req, *tmp;
struct req_batch rb;
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_kiocb *nxt = NULL;
rb.to_free = rb.need_iter = 0;
spin_lock_irq(&ctx->completion_lock);
llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
io_poll_task_handler(req, &nxt);
if (nxt) {
struct io_ring_ctx *ctx = nxt->ctx;
if (refcount_dec_and_test(&req->refs) &&
!io_req_multi_free(&rb, req)) {
req->flags |= REQ_F_COMP_LOCKED;
io_free_req(req);
}
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(nxt, NULL);
mutex_unlock(&ctx->uring_lock);
}
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
}
static void io_poll_flush(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct llist_node *nodes;
nodes = llist_del_all(&req->ctx->poll_llist);
if (nodes)
__io_poll_flush(req->ctx, nodes);
}
static void io_poll_trigger_evfd(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
eventfd_signal(req->ctx->cq_ev_fd, 1);
io_put_req(req);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
struct io_poll_iocb *poll = wait->private;
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key);
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
list_del_init(&poll->wait.entry);
/*
* Run completion inline if we can. We're using trylock here because
* we are violating the completion_lock -> poll wq lock ordering.
* If we have a link timeout we're going to need the completion_lock
* for finalizing the request, mark us as having grabbed that already.
*/
if (mask) {
unsigned long flags;
if (llist_empty(&ctx->poll_llist) &&
spin_trylock_irqsave(&ctx->completion_lock, flags)) {
bool trigger_ev;
hash_del(&req->hash_node);
io_poll_complete(req, mask, 0);
trigger_ev = io_should_trigger_evfd(ctx);
if (trigger_ev && eventfd_signal_count()) {
trigger_ev = false;
req->work.func = io_poll_trigger_evfd;
} else {
req->flags |= REQ_F_COMP_LOCKED;
io_put_req(req);
req = NULL;
}
spin_unlock_irqrestore(&ctx->completion_lock, flags);
__io_cqring_ev_posted(ctx, trigger_ev);
} else {
req->result = mask;
req->llist_node.next = NULL;
/* if the list wasn't empty, we're done */
if (!llist_add(&req->llist_node, &ctx->poll_llist))
req = NULL;
else
req->work.func = io_poll_flush;
}
}
if (req)
io_queue_async_work(req);
struct io_kiocb *req = wait->private;
struct io_poll_iocb *poll = &req->poll;
return 1;
return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
}
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
int error;
};
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
if (unlikely(pt->req->poll.head)) {
pt->error = -EINVAL;
return;
}
pt->error = 0;
pt->req->poll.head = head;
add_wait_queue(head, &pt->req->poll.wait);
}
static void io_poll_req_insert(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct hlist_head *list;
list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
hlist_add_head(&req->hash_node, list);
__io_queue_proc(&pt->req->poll, pt, head);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
......@@ -3739,55 +4464,29 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
/*
* Don't need a reference here, as we're adding it to the task's
* task_works list. If the task exits, the list is pruned.
*/
req->task = current;
return 0;
}
static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
static int io_poll_add(struct io_kiocb *req)
{
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
struct io_poll_table ipt;
bool cancel = false;
__poll_t mask;
INIT_IO_WORK(&req->work, io_poll_complete_work);
INIT_HLIST_NODE(&req->hash_node);
poll->head = NULL;
poll->done = false;
poll->canceled = false;
ipt.pt._qproc = io_poll_queue_proc;
ipt.pt._key = poll->events;
ipt.req = req;
ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, io_poll_wake);
poll->wait.private = poll;
INIT_LIST_HEAD(&req->list);
ipt.pt._qproc = io_poll_queue_proc;
mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
io_poll_wake);
spin_lock_irq(&ctx->completion_lock);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
if (unlikely(list_empty(&poll->wait.entry))) {
if (ipt.error)
cancel = true;
ipt.error = 0;
mask = 0;
}
if (mask || ipt.error)
list_del_init(&poll->wait.entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
io_poll_req_insert(req);
spin_unlock(&poll->head->lock);
}
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
io_poll_complete(req, mask, 0);
......@@ -3796,7 +4495,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
if (mask) {
io_cqring_ev_posted(ctx);
io_put_req_find_next(req, nxt);
io_put_req(req);
}
return ipt.error;
}
......@@ -4045,7 +4744,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
struct io_kiocb *req, __u64 sqe_addr,
struct io_kiocb **nxt, int success_ret)
int success_ret)
{
unsigned long flags;
int ret;
......@@ -4071,7 +4770,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
io_put_req(req);
}
static int io_async_cancel_prep(struct io_kiocb *req,
......@@ -4087,11 +4786,11 @@ static int io_async_cancel_prep(struct io_kiocb *req,
return 0;
}
static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
static int io_async_cancel(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
return 0;
}
......@@ -4226,6 +4925,15 @@ static int io_req_defer_prep(struct io_kiocb *req,
case IORING_OP_EPOLL_CTL:
ret = io_epoll_ctl_prep(req, sqe);
break;
case IORING_OP_SPLICE:
ret = io_splice_prep(req, sqe);
break;
case IORING_OP_PROVIDE_BUFFERS:
ret = io_provide_buffers_prep(req, sqe);
break;
case IORING_OP_REMOVE_BUFFERS:
ret = io_remove_buffers_prep(req, sqe);
break;
default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
......@@ -4272,29 +4980,43 @@ static void io_cleanup_req(struct io_kiocb *req)
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
if (req->flags & REQ_F_BUFFER_SELECTED)
kfree((void *)(unsigned long)req->rw.addr);
/* fallthrough */
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
if (io->rw.iov != io->rw.fast_iov)
kfree(io->rw.iov);
break;
case IORING_OP_SENDMSG:
case IORING_OP_RECVMSG:
if (req->flags & REQ_F_BUFFER_SELECTED)
kfree(req->sr_msg.kbuf);
/* fallthrough */
case IORING_OP_SENDMSG:
if (io->msg.iov != io->msg.fast_iov)
kfree(io->msg.iov);
break;
case IORING_OP_RECV:
if (req->flags & REQ_F_BUFFER_SELECTED)
kfree(req->sr_msg.kbuf);
break;
case IORING_OP_OPENAT:
case IORING_OP_OPENAT2:
case IORING_OP_STATX:
putname(req->open.filename);
break;
case IORING_OP_SPLICE:
io_put_file(req, req->splice.file_in,
(req->splice.flags & SPLICE_F_FD_IN_FIXED));
break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_kiocb **nxt, bool force_nonblock)
bool force_nonblock)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
......@@ -4311,7 +5033,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
ret = io_read(req, nxt, force_nonblock);
ret = io_read(req, force_nonblock);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
......@@ -4321,7 +5043,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
ret = io_write(req, nxt, force_nonblock);
ret = io_write(req, force_nonblock);
break;
case IORING_OP_FSYNC:
if (sqe) {
......@@ -4329,7 +5051,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
ret = io_fsync(req, nxt, force_nonblock);
ret = io_fsync(req, force_nonblock);
break;
case IORING_OP_POLL_ADD:
if (sqe) {
......@@ -4337,7 +5059,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_poll_add(req, nxt);
ret = io_poll_add(req);
break;
case IORING_OP_POLL_REMOVE:
if (sqe) {
......@@ -4353,7 +5075,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
ret = io_sync_file_range(req, nxt, force_nonblock);
ret = io_sync_file_range(req, force_nonblock);
break;
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
......@@ -4363,9 +5085,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_SENDMSG)
ret = io_sendmsg(req, nxt, force_nonblock);
ret = io_sendmsg(req, force_nonblock);
else
ret = io_send(req, nxt, force_nonblock);
ret = io_send(req, force_nonblock);
break;
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
......@@ -4375,9 +5097,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_RECVMSG)
ret = io_recvmsg(req, nxt, force_nonblock);
ret = io_recvmsg(req, force_nonblock);
else
ret = io_recv(req, nxt, force_nonblock);
ret = io_recv(req, force_nonblock);
break;
case IORING_OP_TIMEOUT:
if (sqe) {
......@@ -4401,7 +5123,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_accept(req, nxt, force_nonblock);
ret = io_accept(req, force_nonblock);
break;
case IORING_OP_CONNECT:
if (sqe) {
......@@ -4409,7 +5131,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_connect(req, nxt, force_nonblock);
ret = io_connect(req, force_nonblock);
break;
case IORING_OP_ASYNC_CANCEL:
if (sqe) {
......@@ -4417,7 +5139,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_async_cancel(req, nxt);
ret = io_async_cancel(req);
break;
case IORING_OP_FALLOCATE:
if (sqe) {
......@@ -4425,7 +5147,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_fallocate(req, nxt, force_nonblock);
ret = io_fallocate(req, force_nonblock);
break;
case IORING_OP_OPENAT:
if (sqe) {
......@@ -4433,7 +5155,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_openat(req, nxt, force_nonblock);
ret = io_openat(req, force_nonblock);
break;
case IORING_OP_CLOSE:
if (sqe) {
......@@ -4441,7 +5163,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_close(req, nxt, force_nonblock);
ret = io_close(req, force_nonblock);
break;
case IORING_OP_FILES_UPDATE:
if (sqe) {
......@@ -4457,7 +5179,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_statx(req, nxt, force_nonblock);
ret = io_statx(req, force_nonblock);
break;
case IORING_OP_FADVISE:
if (sqe) {
......@@ -4465,7 +5187,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_fadvise(req, nxt, force_nonblock);
ret = io_fadvise(req, force_nonblock);
break;
case IORING_OP_MADVISE:
if (sqe) {
......@@ -4473,7 +5195,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_madvise(req, nxt, force_nonblock);
ret = io_madvise(req, force_nonblock);
break;
case IORING_OP_OPENAT2:
if (sqe) {
......@@ -4481,7 +5203,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_openat2(req, nxt, force_nonblock);
ret = io_openat2(req, force_nonblock);
break;
case IORING_OP_EPOLL_CTL:
if (sqe) {
......@@ -4489,7 +5211,31 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
ret = io_epoll_ctl(req, nxt, force_nonblock);
ret = io_epoll_ctl(req, force_nonblock);
break;
case IORING_OP_SPLICE:
if (sqe) {
ret = io_splice_prep(req, sqe);
if (ret < 0)
break;
}
ret = io_splice(req, force_nonblock);
break;
case IORING_OP_PROVIDE_BUFFERS:
if (sqe) {
ret = io_provide_buffers_prep(req, sqe);
if (ret)
break;
}
ret = io_provide_buffers(req, force_nonblock);
break;
case IORING_OP_REMOVE_BUFFERS:
if (sqe) {
ret = io_remove_buffers_prep(req, sqe);
if (ret)
break;
}
ret = io_remove_buffers(req, force_nonblock);
break;
default:
ret = -EINVAL;
......@@ -4522,7 +5268,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
int ret = 0;
/* if NO_CANCEL is set, we must still run the work */
......@@ -4532,9 +5277,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
}
if (!ret) {
req->in_async = true;
do {
ret = io_issue_sqe(req, NULL, &nxt, false);
ret = io_issue_sqe(req, NULL, false);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
......@@ -4546,18 +5290,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
} while (1);
}
/* drop submission reference */
io_put_req(req);
if (ret) {
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
}
/* if a dependent link is ready, pass it back */
if (!ret && nxt)
io_wq_assign_next(workptr, nxt);
io_steal_work(req, workptr);
}
static int io_req_needs_file(struct io_kiocb *req, int fd)
......@@ -4578,41 +5317,52 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
return table->files[index & IORING_FILE_TABLE_MASK];
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
int fd;
flags = READ_ONCE(sqe->flags);
fd = READ_ONCE(sqe->fd);
if (!io_req_needs_file(req, fd))
return 0;
struct file *file;
if (flags & IOSQE_FIXED_FILE) {
if (fixed) {
if (unlikely(!ctx->file_data ||
(unsigned) fd >= ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
req->file = io_file_from_index(ctx, fd);
if (!req->file)
file = io_file_from_index(ctx, fd);
if (!file)
return -EBADF;
req->flags |= REQ_F_FIXED_FILE;
percpu_ref_get(&ctx->file_data->refs);
} else {
if (req->needs_fixed_file)
return -EBADF;
trace_io_uring_file_get(ctx, fd);
req->file = io_file_get(state, fd);
if (unlikely(!req->file))
file = __io_file_get(state, fd);
if (unlikely(!file))
return -EBADF;
}
*out_file = file;
return 0;
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
unsigned flags;
int fd;
bool fixed;
flags = READ_ONCE(sqe->flags);
fd = READ_ONCE(sqe->fd);
if (!io_req_needs_file(req, fd))
return 0;
fixed = (flags & IOSQE_FIXED_FILE);
if (unlikely(!fixed && req->needs_fixed_file))
return -EBADF;
return io_file_get(state, req, fd, &req->file, fixed);
}
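The fixed branch above resolves sqe->fd against the table registered via IORING_REGISTER_FILES rather than the process fd table. A hedged sketch of what that looks like from userspace (helper name and slot are illustrative):

#include <string.h>
#include <linux/io_uring.h>

/* Hypothetical helper: read from slot 'idx' of the registered file
 * table rather than a regular file descriptor. */
static void prep_read_fixed_file(struct io_uring_sqe *sqe, int idx,
				 void *buf, unsigned int len)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READ;
	sqe->flags = IOSQE_FIXED_FILE;
	sqe->fd = idx;			/* table index, not an fd */
	sqe->addr = (unsigned long) buf;
	sqe->len = len;
	sqe->off = 0;
}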
static int io_grab_files(struct io_kiocb *req)
{
int ret = -EBADF;
......@@ -4672,8 +5422,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (prev) {
req_set_fail_links(prev);
io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
-ETIME);
io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
io_put_req(prev);
} else {
io_cqring_add_event(req, -ETIME);
......@@ -4710,6 +5459,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
if (!(req->flags & REQ_F_LINK))
return NULL;
/* for polled retry, if flag is set, we already went through here */
if (req->flags & REQ_F_POLLED)
return NULL;
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
link_list);
......@@ -4723,7 +5475,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt = NULL;
struct io_kiocb *nxt;
const struct cred *old_creds = NULL;
int ret;
......@@ -4739,7 +5491,7 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
old_creds = override_creds(req->work.creds);
}
ret = io_issue_sqe(req, sqe, &nxt, true);
ret = io_issue_sqe(req, sqe, true);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
......@@ -4747,6 +5499,11 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
if (io_arm_poll_handler(req)) {
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
goto exit;
}
punt:
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
......@@ -4759,10 +5516,11 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
* submit reference when the iocb is actually submitted.
*/
io_queue_async_work(req);
goto done_req;
goto exit;
}
err:
nxt = NULL;
/* drop submission reference */
io_put_req_find_next(req, &nxt);
......@@ -4779,15 +5537,14 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req_set_fail_links(req);
io_put_req(req);
}
done_req:
if (nxt) {
req = nxt;
nxt = NULL;
if (req->flags & REQ_F_FORCE_ASYNC)
goto punt;
goto again;
}
exit:
if (old_creds)
revert_creds(old_creds);
}
......@@ -4829,7 +5586,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
}
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
IOSQE_BUFFER_SELECT)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
......@@ -4846,6 +5604,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
goto err_req;
}
if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
!io_op_defs[req->opcode].buffer_select) {
ret = -EOPNOTSUPP;
goto err_req;
}
id = READ_ONCE(sqe->personality);
if (id) {
req->work.creds = idr_find(&ctx->personality_idr, id);
......@@ -4857,8 +5621,9 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
IOSQE_ASYNC);
req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
IOSQE_ASYNC | IOSQE_FIXED_FILE |
IOSQE_BUFFER_SELECT);
ret = io_req_set_file(state, req, sqe);
if (unlikely(ret)) {
......@@ -5079,7 +5844,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
*mm = ctx->sqo_mm;
}
req->in_async = async;
req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, async);
......@@ -5163,6 +5927,8 @@ static int io_sq_thread(void *data)
if (!list_empty(&ctx->poll_list) ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
if (current->task_works)
task_work_run();
cond_resched();
continue;
}
......@@ -5194,6 +5960,10 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
break;
}
if (current->task_works) {
task_work_run();
continue;
}
if (signal_pending(current))
flush_signals(current);
schedule();
......@@ -5213,6 +5983,9 @@ static int io_sq_thread(void *data)
timeout = jiffies + ctx->sq_thread_idle;
}
if (current->task_works)
task_work_run();
set_fs(old_fs);
if (cur_mm) {
unuse_mm(cur_mm);
......@@ -5277,8 +6050,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
struct io_rings *rings = ctx->rings;
int ret = 0;
if (io_cqring_events(ctx, false) >= min_events)
return 0;
do {
if (io_cqring_events(ctx, false) >= min_events)
return 0;
if (!current->task_works)
break;
task_work_run();
} while (1);
if (sig) {
#ifdef CONFIG_COMPAT
......@@ -5298,6 +6076,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
if (current->task_works)
task_work_run();
if (io_should_wake(&iowq, false))
break;
schedule();
......@@ -5607,7 +6387,6 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
struct io_file_put {
struct llist_node llist;
struct file *file;
struct completion *done;
};
static void io_ring_file_ref_flush(struct fixed_file_data *data)
......@@ -5618,10 +6397,7 @@ static void io_ring_file_ref_flush(struct fixed_file_data *data)
while ((node = llist_del_all(&data->put_llist)) != NULL) {
llist_for_each_entry_safe(pfile, tmp, node, llist) {
io_ring_file_put(data->ctx, pfile->file);
if (pfile->done)
complete(pfile->done);
else
kfree(pfile);
kfree(pfile);
}
}
}
......@@ -5816,33 +6592,18 @@ static void io_atomic_switch(struct percpu_ref *ref)
percpu_ref_get(&data->refs);
}
static bool io_queue_file_removal(struct fixed_file_data *data,
static int io_queue_file_removal(struct fixed_file_data *data,
struct file *file)
{
struct io_file_put *pfile, pfile_stack;
DECLARE_COMPLETION_ONSTACK(done);
struct io_file_put *pfile;
/*
* If we fail allocating the struct we need for doing async removal
* of this file, just punt to sync and wait for it.
*/
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile) {
pfile = &pfile_stack;
pfile->done = &done;
}
if (!pfile)
return -ENOMEM;
pfile->file = file;
llist_add(&pfile->llist, &data->put_llist);
if (pfile == &pfile_stack) {
percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
wait_for_completion(&done);
flush_work(&data->ref_work);
return false;
}
return true;
return 0;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
......@@ -5877,9 +6638,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
index = i & IORING_FILE_TABLE_MASK;
if (table->files[index]) {
file = io_file_from_index(ctx, index);
err = io_queue_file_removal(data, file);
if (err)
break;
table->files[index] = NULL;
if (io_queue_file_removal(data, file))
ref_switch = true;
ref_switch = true;
}
if (fd != -1) {
file = fget(fd);
......@@ -5932,20 +6695,14 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
return __io_sqe_files_update(ctx, &up, nr_args);
}
static void io_put_work(struct io_wq_work *work)
static void io_free_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
/* Consider that io_steal_work() relies on this ref */
io_put_req(req);
}
static void io_get_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
refcount_inc(&req->refs);
}
static int io_init_wq_offload(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
......@@ -5956,8 +6713,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
int ret = 0;
data.user = ctx->user;
data.get_work = io_get_work;
data.put_work = io_put_work;
data.free_work = io_free_work;
if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
......@@ -6359,6 +7115,21 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
return -ENXIO;
}
static int __io_destroy_buffers(int id, void *p, void *data)
{
struct io_ring_ctx *ctx = data;
struct io_buffer *buf = p;
__io_remove_buffers(ctx, buf, id, -1U);
return 0;
}
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
idr_destroy(&ctx->io_buffer_idr);
}
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
......@@ -6369,6 +7140,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_sqe_buffer_unregister(ctx);
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
idr_destroy(&ctx->personality_idr);
#if defined(CONFIG_UNIX)
......@@ -6623,6 +7395,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
int submitted = 0;
struct fd f;
if (current->task_works)
task_work_run();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
return -EINVAL;
......@@ -6669,7 +7444,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
min_complete = min(min_complete, ctx->cq_entries);
if (ctx->flags & IORING_SETUP_IOPOLL) {
/*
* When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
* space applications don't need to do io completion events
* polling again; they can rely on io_sq_thread to do the polling
* work, which reduces cpu usage and uring_lock contention.
*/
if (ctx->flags & IORING_SETUP_IOPOLL &&
!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_iopoll_check(ctx, &nr_events, min_complete);
} else {
ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
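In other words, a SETUP_IOPOLL ring driven by SETUP_SQPOLL no longer needs io_uring_enter() calls for completion polling. A minimal setup sketch, assuming the raw syscall wrapper (helper name and values are illustrative; SQPOLL requires privileges on kernels of this era):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Hypothetical helper: create a ring whose sq thread both submits and
 * IOPOLLs completions, so userspace can simply watch the CQ ring. */
static int setup_sqpoll_iopoll_ring(unsigned int entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL;
	p.sq_thread_idle = 2000;	/* ms before the sq thread idles */
	return syscall(__NR_io_uring_setup, entries, &p);
}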
......@@ -6745,6 +7527,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "Personalities:\n");
idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
}
seq_printf(m, "PollList:\n");
spin_lock_irq(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list = &ctx->cancel_hash[i];
struct io_kiocb *req;
hlist_for_each_entry(req, list, hash_node)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
req->task->task_works != NULL);
}
spin_unlock_irq(&ctx->completion_lock);
mutex_unlock(&ctx->uring_lock);
}
......@@ -6961,7 +7754,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY;
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
......@@ -7239,6 +8032,7 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(8, __u64, off);
BUILD_BUG_SQE_ELEM(8, __u64, addr2);
BUILD_BUG_SQE_ELEM(16, __u64, addr);
BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
BUILD_BUG_SQE_ELEM(24, __u32, len);
BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
......@@ -7253,11 +8047,14 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
BUILD_BUG_SQE_ELEM(32, __u64, user_data);
BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
BUILD_BUG_SQE_ELEM(42, __u16, personality);
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
return 0;
};
......
......@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/*
* Determine where to splice to/from.
*/
static long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
......
......@@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
struct user_msghdr __user *umsg, unsigned flags,
struct sockaddr __user **uaddr,
struct iovec **iov);
extern int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs);
/* helpers which do the actual work for syscalls */
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
......
......@@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
struct pipe_buffer *);
extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
splice_direct_actor *);
extern long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags);
/*
* for dynamic pipe sizing
......
......@@ -38,6 +38,9 @@ struct compat_cmsghdr {
#define compat_mmsghdr mmsghdr
#endif /* defined(CONFIG_COMPAT) */
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr, compat_uptr_t *ptr,
compat_size_t *len);
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
struct sockaddr __user **, struct iovec **);
struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval);
......
......@@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->force_nonblock, __entry->sq_thread)
);
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
TP_ARGS(ctx, opcode, user_data, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
__field( int, events )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
__entry->events = events;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask, __entry->events)
);
TRACE_EVENT(io_uring_poll_wake,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
TRACE_EVENT(io_uring_task_add,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask %x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
TRACE_EVENT(io_uring_task_run,
TP_PROTO(void *ctx, u8 opcode, u64 user_data),
TP_ARGS(ctx, opcode, user_data),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
),
TP_printk("ring %p, op %d, data 0x%llx",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data)
);
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
......
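The four new tracepoints map onto the poll-driven retry path that replaces thread offload: io_uring_poll_arm fires when a request that would block arms an internal poll handler, io_uring_poll_wake when the waitqueue callback sees the file become ready, io_uring_task_add when the retry is queued to the submitting task via task_work, and io_uring_task_run when that task executes it. A sketch of the flow, with illustrative call sites rather than the exact fs/io_uring.c code:

/*
 * Fast-poll flow (call sites are illustrative):
 *
 *   submit: recv() would block
 *     -> trace_io_uring_poll_arm(ctx, opcode, user_data, mask, events);
 *   socket becomes readable, waitqueue callback runs
 *     -> trace_io_uring_poll_wake(ctx, opcode, user_data, mask);
 *     -> task_work_add() queues the retry to the submitting task
 *        trace_io_uring_task_add(ctx, opcode, user_data, mask);
 *   task runs its task_work and retries the request inline
 *     -> trace_io_uring_task_run(ctx, opcode, user_data);
 */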
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
* Header file for the io_uring interface.
*
......@@ -23,7 +23,10 @@ struct io_uring_sqe {
__u64 off; /* offset into file */
__u64 addr2;
};
__u64 addr; /* pointer to buffer or iovecs */
union {
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
};
__u32 len; /* buffer size or number of iovecs */
union {
__kernel_rwf_t rw_flags;
......@@ -37,14 +40,21 @@ struct io_uring_sqe {
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
};
__u64 user_data; /* data to be passed back at completion time */
union {
struct {
/* index into fixed buffers, if used */
__u16 buf_index;
/* pack this to avoid bogus arm OABI complaints */
union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
__s32 splice_fd_in;
};
__u64 __pad2[3];
};
......@@ -56,6 +66,7 @@ enum {
IOSQE_IO_LINK_BIT,
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
IOSQE_BUFFER_SELECT_BIT,
};
/*
......@@ -71,6 +82,8 @@ enum {
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
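With IOSQE_BUFFER_SELECT, the application donates a pool of buffers up front (IORING_OP_PROVIDE_BUFFERS) and lets the kernel pick one at completion time, so poll+recv no longer needs a committed buffer at submit time. A minimal userspace sketch against the raw SQE layout; the field encoding below matches this patchset, but the helper names and sizes are arbitrary:

#include <stdint.h>
#include <string.h>
#include <linux/io_uring.h>

/* Donate `nbufs` buffers of `buf_len` bytes each, starting at `base`,
 * to buffer group `gid`; buffer IDs start at 0. */
static void prep_provide_buffers(struct io_uring_sqe *sqe, void *base,
				 int buf_len, int nbufs, uint16_t gid)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_PROVIDE_BUFFERS;
	sqe->fd = nbufs;			/* number of buffers */
	sqe->addr = (uint64_t)(uintptr_t)base;	/* start of the pool */
	sqe->len = buf_len;			/* size of each buffer */
	sqe->off = 0;				/* first buffer ID */
	sqe->buf_group = gid;
}

/* A recv that lets the kernel select a buffer from group `gid`. */
static void prep_recv_select(struct io_uring_sqe *sqe, int sockfd,
			     unsigned int max_len, uint16_t gid)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RECV;
	sqe->fd = sockfd;
	sqe->len = max_len;
	sqe->flags = IOSQE_BUFFER_SELECT;
	sqe->buf_group = gid;
}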
/*
* io_uring_setup() flags
......@@ -113,6 +126,9 @@ enum {
IORING_OP_RECV,
IORING_OP_OPENAT2,
IORING_OP_EPOLL_CTL,
IORING_OP_SPLICE,
IORING_OP_PROVIDE_BUFFERS,
IORING_OP_REMOVE_BUFFERS,
/* this goes last, obviously */
IORING_OP_LAST,
......@@ -128,6 +144,12 @@ enum {
*/
#define IORING_TIMEOUT_ABS (1U << 0)
/*
* sqe->splice_flags
* extends splice(2) flags
*/
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
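IORING_OP_SPLICE reuses existing SQE fields: fd is the output descriptor, splice_fd_in the input, splice_off_in and off the two offsets (-1 means "use the file position"), and splice_flags carries the splice(2) flags plus SPLICE_F_FD_IN_FIXED when splice_fd_in indexes the fixed-file table. A hedged userspace sketch of filling in such an SQE:

#include <stdint.h>
#include <string.h>
#include <linux/io_uring.h>

/* Queue a splice of `len` bytes from `fd_in` to `fd_out`. */
static void prep_splice(struct io_uring_sqe *sqe, int fd_in,
			int64_t off_in, int fd_out, int64_t off_out,
			unsigned int len, unsigned int splice_flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SPLICE;
	sqe->splice_fd_in = fd_in;	/* fixed-file index if
					 * SPLICE_F_FD_IN_FIXED is set */
	sqe->splice_off_in = off_in;
	sqe->fd = fd_out;
	sqe->off = off_out;
	sqe->len = len;
	sqe->splice_flags = splice_flags;
}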
/*
* IO completion data structure (Completion Queue Entry)
*/
......@@ -137,6 +159,17 @@ struct io_uring_cqe {
__u32 flags;
};
/*
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
*/
#define IORING_CQE_F_BUFFER (1U << 0)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
};
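On completion of a buffer-select request, the kernel reports which buffer it consumed in the upper 16 bits of cqe->flags; that buffer is then owned by the application again and must be re-provided before the kernel may reuse it. Decoding is a flag test and a shift:

#include <linux/io_uring.h>

/* Returns the selected buffer ID, or -1 if this completion did not
 * consume a provided buffer. */
static int cqe_buffer_id(const struct io_uring_cqe *cqe)
{
	if (!(cqe->flags & IORING_CQE_F_BUFFER))
		return -1;
	return cqe->flags >> IORING_CQE_BUFFER_SHIFT;
}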
/*
* Magic offsets for the application to mmap the data it needs
*/
......@@ -204,6 +237,7 @@ struct io_uring_params {
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
#define IORING_FEAT_RW_CUR_POS (1U << 3)
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
#define IORING_FEAT_FAST_POLL (1U << 5)
/*
* io_uring_register(2) opcodes and arguments
......
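IORING_FEAT_FAST_POLL advertises the new poll-driven retry path, so an application can detect at setup time whether blocking socket I/O still costs an io-wq thread offload. A minimal probe using the raw io_uring_setup(2) syscall:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

int main(void)
{
	struct io_uring_params p;
	int fd;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, 8, &p);
	if (fd < 0)
		return 1;	/* no io_uring on this kernel */

	printf("fast poll: %s\n",
	       (p.features & IORING_FEAT_FAST_POLL) ? "yes" : "no");
	close(fd);
	return 0;
}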
......@@ -97,16 +97,26 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
raw_spin_lock_irq(&task->pi_lock);
do {
head = NULL;
work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
if (!work) {
if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
} while (cmpxchg(&task->task_works, work, head) != work);
raw_spin_unlock_irq(&task->pi_lock);
if (!work)
break;
/*
* Synchronize with task_work_cancel(). It can not remove
* the first entry == work, cmpxchg(task_works) must fail.
* But it can remove another entry from the ->next list.
*/
raw_spin_lock_irq(&task->pi_lock);
raw_spin_unlock_irq(&task->pi_lock);
do {
next = work->next;
......
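The detach of the pending list now happens inside ->pi_lock, replacing the empty lock/unlock pair that used to follow it: task_work_cancel() edits the list under the same lock, so once task_work_run() drops it, the detached chain is guaranteed to be private to the running task. For reference, the cancel side of that era looked roughly like the sketch below (reconstructed from memory, not part of this diff):

#include <linux/sched.h>
#include <linux/task_work.h>

/* Sketch of task_work_cancel(): walks and unlinks entries under
 * ->pi_lock, the same lock task_work_run() now holds while it
 * swaps in the work_exited sentinel. */
static struct callback_head *
task_work_cancel_sketch(struct task_struct *task, task_work_func_t func)
{
	struct callback_head **pprev = &task->task_works;
	struct callback_head *work;
	unsigned long flags;

	raw_spin_lock_irqsave(&task->pi_lock, flags);
	while ((work = READ_ONCE(*pprev))) {
		if (work->func != func)
			pprev = &work->next;
		else if (cmpxchg(pprev, work, work->next) == work)
			break;
	}
	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
	return work;
}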
......@@ -33,10 +33,10 @@
#include <linux/uaccess.h>
#include <net/compat.h>
int get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
int __get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
compat_uptr_t *ptr, compat_size_t *len)
{
struct compat_msghdr msg;
ssize_t err;
......@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
*ptr = msg.msg_iov;
*len = msg.msg_iovlen;
return 0;
}
int get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
compat_uptr_t ptr;
compat_size_t len;
ssize_t err;
err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
if (err)
return err;
err = compat_import_iovec(save_addr ? READ : WRITE,
compat_ptr(msg.msg_iov), msg.msg_iovlen,
UIO_FASTIOV, iov, &kmsg->msg_iter);
err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr),
len, UIO_FASTIOV, iov, &kmsg->msg_iter);
return err < 0 ? err : 0;
}
......
......@@ -2228,10 +2228,10 @@ struct used_address {
unsigned int name_len;
};
static int copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs)
{
struct user_msghdr msg;
ssize_t err;
......@@ -2273,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
*uiov = msg.msg_iov;
*nsegs = msg.msg_iovlen;
return 0;
}
static int copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
struct user_msghdr msg;
ssize_t err;
err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
&msg.msg_iovlen);
if (err)
return err;
err = import_iovec(save_addr ? READ : WRITE,
msg.msg_iov, msg.msg_iovlen,
......
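The same split is applied on both the compat and native paths: the new double-underscore helpers copy and validate the header but hand the user iovec pointer and segment count back to the caller instead of importing them. That lets io_uring copy the msghdr once at prep time and defer the iovec import to its own async-context storage. A hedged sketch of such a caller, with illustrative names:

#include <linux/fs.h>
#include <linux/socket.h>
#include <linux/uio.h>

/* Sketch: validate the msghdr now, import the iovec separately.
 * An io_uring-style prep step could stash uiov/nsegs in per-request
 * storage and run the import later from async context. */
static int prep_recvmsg_sketch(struct msghdr *kmsg,
			       struct user_msghdr __user *umsg,
			       struct sockaddr __user **uaddr,
			       struct iovec **fast_iov)
{
	struct iovec __user *uiov;
	size_t nsegs;
	ssize_t err;

	err = __copy_msghdr_from_user(kmsg, umsg, uaddr, &uiov, &nsegs);
	if (err)
		return err;

	err = import_iovec(READ, uiov, nsegs, UIO_FASTIOV,
			   fast_iov, &kmsg->msg_iter);
	return err < 0 ? err : 0;
}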