Commit 8d1f0177 authored by Linus Torvalds

Merge tag 'for-5.16/io_uring-2021-10-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Light on new features - basically just the hybrid mode support.

  Outside of that it's just fixes, cleanups, and performance
  improvements.

  In detail:

   - Add ring related information to the fdinfo output (Hao)

   - Hybrid async mode (Hao)

   - Support for batched issue on block (me)

   - sqe error trace improvement (me)

   - IOPOLL efficiency improvements (Pavel)

   - submit state cleanups and improvements (Pavel)

   - Completion side improvements (Pavel)

   - Drain improvements (Pavel)

   - Buffer selection cleanups (Pavel)

   - Fixed file node improvements (Pavel)

   - io-wq setup cancelation fix (Pavel)

   - Various other performance improvements and cleanups (Pavel)

   - Misc fixes (Arnd, Bixuan, Changcheng, Hao, me, Noah)"

* tag 'for-5.16/io_uring-2021-10-29' of git://git.kernel.dk/linux-block: (97 commits)
  io-wq: remove worker to owner tw dependency
  io_uring: harder fdinfo sq/cq ring iterating
  io_uring: don't assign write hint in the read path
  io_uring: clusterise ki_flags access in rw_prep
  io_uring: kill unused param from io_file_supports_nowait
  io_uring: clean up timeout async_data allocation
  io_uring: don't try io-wq polling if not supported
  io_uring: check if opcode needs poll first on arming
  io_uring: clean iowq submit work cancellation
  io_uring: clean io_wq_submit_work()'s main loop
  io-wq: use helper for worker refcounting
  io_uring: implement async hybrid mode for pollable requests
  io_uring: Use ERR_CAST() instead of ERR_PTR(PTR_ERR())
  io_uring: split logic of force_nonblock
  io_uring: warning about unused-but-set parameter
  io_uring: inform block layer of how many requests we are submitting
  io_uring: simplify io_file_supports_nowait()
  io_uring: combine REQ_F_NOWAIT_{READ,WRITE} flags
  io_uring: arm poll for non-nowait files
  fs/io_uring: Prioritise checking faster conditions first in io_write
  ...
parents 643a7234 1d5f5ea7
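
The block-layer batching and IOPOLL items in the summary above are kernel-internal, but the pattern they speed up is a userspace application queueing many SQEs and submitting them with a single io_uring_enter(). The sketch below shows that pattern; it assumes liburing is installed, and the file path, batch size and block size are made up for illustration and are not part of this merge.

	/*
	 * Illustrative only: queue a batch of O_DIRECT reads and submit them
	 * with one io_uring_submit() call, the pattern the batched block
	 * issue and IOPOLL work in this merge targets.
	 */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <liburing.h>

	#define BATCH     8
	#define BLOCK_SZ  4096

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_cqe *cqe;
		void *bufs[BATCH];
		int fd, i;

		/* IOPOLL wants O_DIRECT I/O on a device with a polled queue. */
		fd = open("/tmp/testfile", O_RDONLY | O_DIRECT);
		if (fd < 0 || io_uring_queue_init(BATCH, &ring, IORING_SETUP_IOPOLL))
			return 1;

		for (i = 0; i < BATCH; i++) {
			struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

			if (posix_memalign(&bufs[i], BLOCK_SZ, BLOCK_SZ))
				return 1;
			io_uring_prep_read(sqe, fd, bufs[i], BLOCK_SZ,
					   (__u64)i * BLOCK_SZ);
		}

		/* One submit call covers all queued SQEs. */
		io_uring_submit(&ring);

		for (i = 0; i < BATCH; i++) {
			if (io_uring_wait_cqe(&ring, &cqe))
				break;
			printf("read %d: res=%d\n", i, cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}
		io_uring_queue_exit(&ring);
		return 0;
	}
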
@@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match);
static void create_worker_cb(struct callback_head *cb);
static bool io_worker_get(struct io_worker *worker)
{
@@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq)
complete(&wq->worker_done);
}
static void io_worker_cancel_cb(struct io_worker *worker)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
atomic_dec(&acct->nr_running);
raw_spin_lock(&worker->wqe->lock);
acct->nr_workers--;
raw_spin_unlock(&worker->wqe->lock);
io_worker_ref_put(wq);
clear_bit_unlock(0, &worker->create_state);
io_worker_release(worker);
}
static bool io_task_worker_match(struct callback_head *cb, void *data)
{
struct io_worker *worker;
if (cb->func != create_worker_cb)
return false;
worker = container_of(cb, struct io_worker, create_work);
return worker == data;
}
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
if (refcount_dec_and_test(&worker->ref))
complete(&worker->ref_done);
while (1) {
struct callback_head *cb = task_work_cancel_match(wq->task,
io_task_worker_match, worker);
if (!cb)
break;
io_worker_cancel_cb(worker);
}
io_worker_release(worker);
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wqe->lock);
@@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker,
init_task_work(&worker->create_work, func);
worker->create_index = acct->index;
if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
clear_bit_unlock(0, &worker->create_state);
return true;
}
clear_bit_unlock(0, &worker->create_state);
fail_release:
io_worker_release(worker);
@@ -716,11 +753,8 @@ static void io_workqueue_create(struct work_struct *work)
struct io_worker *worker = container_of(work, struct io_worker, work);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
clear_bit_unlock(0, &worker->create_state);
io_worker_release(worker);
if (!io_queue_worker_create(worker, acct, create_worker_cont))
kfree(worker);
}
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq)
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
struct io_wqe_acct *acct;
worker = container_of(cb, struct io_worker, create_work);
acct = io_wqe_get_acct(worker);
atomic_dec(&acct->nr_running);
raw_spin_lock(&worker->wqe->lock);
acct->nr_workers--;
raw_spin_unlock(&worker->wqe->lock);
io_worker_ref_put(wq);
clear_bit_unlock(0, &worker->create_state);
io_worker_release(worker);
io_worker_cancel_cb(worker);
}
rcu_read_lock();
@@ -29,6 +29,17 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
#define wq_list_for_each(pos, prv, head) \
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
#define wq_list_for_each_resume(pos, prv) \
for (; pos; prv = pos, pos = (pos)->next)
#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
#define INIT_WQ_LIST(list) do { \
(list)->first = NULL; \
} while (0)
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
@@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
static inline void wq_list_add_head(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
node->next = list->first;
if (!node->next)
list->last = node;
WRITE_ONCE(list->first, node);
}
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
@@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
last->next = NULL;
}
static inline void __wq_list_splice(struct io_wq_work_list *list,
struct io_wq_work_node *to)
{
list->last->next = to->next;
to->next = list->first;
INIT_WQ_LIST(list);
}
static inline bool wq_list_splice(struct io_wq_work_list *list,
struct io_wq_work_node *to)
{
if (!wq_list_empty(list)) {
__wq_list_splice(list, to);
return true;
}
return false;
}
static inline void wq_stack_add_head(struct io_wq_work_node *node,
struct io_wq_work_node *stack)
{
node->next = stack->next;
stack->next = node;
}
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
@@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
wq_list_cut(list, node, prev);
}
#define wq_list_for_each(pos, prv, head) \
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
static inline
struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
{
struct io_wq_work_node *node = stack->next;
#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
#define INIT_WQ_LIST(list) do { \
(list)->first = NULL; \
(list)->last = NULL; \
} while (0)
stack->next = node->next;
return node;
}
struct io_wq_work {
struct io_wq_work_node list;
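
The io-wq.h hunks above add singly linked helpers in two flavours: the wq_list_* functions keep a FIFO with first/last pointers, while wq_stack_add_head() and wq_stack_extract() reuse a node's next pointer as a LIFO. The fragment below is a userspace re-declaration of just the stack helpers, for illustration only (the kernel header is not exported), showing how push and pop pair up around a dummy head node.

	/*
	 * Minimal userspace mirror of the wq_stack_* helpers added in io-wq.h.
	 * A dummy "stack" head node anchors the LIFO.
	 */
	#include <stdio.h>
	#include <stddef.h>

	struct io_wq_work_node {
		struct io_wq_work_node *next;
	};

	static inline void wq_stack_add_head(struct io_wq_work_node *node,
					     struct io_wq_work_node *stack)
	{
		node->next = stack->next;
		stack->next = node;
	}

	static inline struct io_wq_work_node *
	wq_stack_extract(struct io_wq_work_node *stack)
	{
		struct io_wq_work_node *node = stack->next;

		stack->next = node->next;
		return node;
	}

	int main(void)
	{
		struct io_wq_work_node head = { .next = NULL };
		struct io_wq_work_node a, b, c;

		/* Push three nodes; the most recently pushed one sits on top. */
		wq_stack_add_head(&a, &head);
		wq_stack_add_head(&b, &head);
		wq_stack_add_head(&c, &head);

		/* Pops come back LIFO: c, then b, then a. */
		while (head.next) {
			struct io_wq_work_node *n = wq_stack_extract(&head);

			printf("popped %s\n", n == &c ? "c" : n == &b ? "b" : "a");
		}
		return 0;
	}
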
This source diff could not be displayed because it is too large.
@@ -6,6 +6,7 @@
#define _TRACE_IO_URING_H
#include <linux/tracepoint.h>
#include <uapi/linux/io_uring.h>
struct io_wq_work;
@@ -497,6 +498,66 @@ TRACE_EVENT(io_uring_task_run,
(unsigned long long) __entry->user_data)
);
/*
* io_uring_req_failed - called when an sqe is errored during submission
*
* @sqe: pointer to the io_uring_sqe that failed
* @error: error it failed with
*
* Allows easier diagnosing of malformed requests in production systems.
*/
TRACE_EVENT(io_uring_req_failed,
TP_PROTO(const struct io_uring_sqe *sqe, int error),
TP_ARGS(sqe, error),
TP_STRUCT__entry (
__field( u8, opcode )
__field( u8, flags )
__field( u8, ioprio )
__field( u64, off )
__field( u64, addr )
__field( u32, len )
__field( u32, op_flags )
__field( u64, user_data )
__field( u16, buf_index )
__field( u16, personality )
__field( u32, file_index )
__field( u64, pad1 )
__field( u64, pad2 )
__field( int, error )
),
TP_fast_assign(
__entry->opcode = sqe->opcode;
__entry->flags = sqe->flags;
__entry->ioprio = sqe->ioprio;
__entry->off = sqe->off;
__entry->addr = sqe->addr;
__entry->len = sqe->len;
__entry->op_flags = sqe->rw_flags;
__entry->user_data = sqe->user_data;
__entry->buf_index = sqe->buf_index;
__entry->personality = sqe->personality;
__entry->file_index = sqe->file_index;
__entry->pad1 = sqe->__pad2[0];
__entry->pad2 = sqe->__pad2[1];
__entry->error = error;
),
TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, "
"len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, "
"personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
__entry->opcode, __entry->flags, __entry->ioprio,
(unsigned long long)__entry->off,
(unsigned long long) __entry->addr, __entry->len,
__entry->op_flags, (unsigned long long) __entry->user_data,
__entry->buf_index, __entry->personality, __entry->file_index,
(unsigned long long) __entry->pad1,
(unsigned long long) __entry->pad2, __entry->error)
);
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
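
The new io_uring_req_failed event is a regular tracepoint in the io_uring group, so once a kernel containing this merge is running it can be enabled through tracefs like any other event. A small sketch is below; it assumes tracefs is mounted at /sys/kernel/tracing, which on some systems is /sys/kernel/debug/tracing instead.

	/*
	 * Illustration only: turn on the io_uring_req_failed tracepoint.
	 * Path assumes tracefs at /sys/kernel/tracing.
	 */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/kernel/tracing/events/io_uring/"
				"io_uring_req_failed/enable", "w");

		if (!f) {
			perror("enable tracepoint");
			return 1;
		}
		fputs("1", f);
		fclose(f);

		/* Failed SQE submissions now appear in the trace buffer, */
		/* readable from /sys/kernel/tracing/trace_pipe.          */
		return 0;
	}
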
@@ -158,6 +158,7 @@ enum {
#define IORING_TIMEOUT_BOOTTIME (1U << 2)
#define IORING_TIMEOUT_REALTIME (1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
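
IORING_TIMEOUT_ETIME_SUCCESS is the only new uapi flag in this hunk. My reading of the merge is that it lets a timeout SQE that expires with -ETIME count as a successful completion for linking purposes, so SQEs linked behind it are not cancelled; the CQE result itself still carries -ETIME. The sketch below is a hedged liburing example of that usage, assuming a liburing with headers new enough to expose the flag; the behaviour described in the comments is my interpretation, not text from this diff.

	/*
	 * Sketch (liburing assumed): a 100ms timeout with a NOP linked behind
	 * it. With IORING_TIMEOUT_ETIME_SUCCESS set, the timeout expiring
	 * with -ETIME should not sever the link, so the NOP still completes
	 * instead of returning -ECANCELED.
	 */
	#include <stdio.h>
	#include <liburing.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		struct __kernel_timespec ts = { .tv_nsec = 100 * 1000 * 1000 };
		int i;

		if (io_uring_queue_init(4, &ring, 0))
			return 1;

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_ETIME_SUCCESS);
		sqe->flags |= IOSQE_IO_LINK;	/* next SQE depends on this one */

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_nop(sqe);

		io_uring_submit(&ring);

		for (i = 0; i < 2; i++) {
			if (io_uring_wait_cqe(&ring, &cqe))
				break;
			printf("cqe %d: res=%d\n", i, cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}
		io_uring_queue_exit(&ring);
		return 0;
	}
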