Commit 1d9bd516 authored by Tejun Heo's avatar Tejun Heo Committed by Jens Axboe

blk-mq: replace timeout synchronization with a RCU and generation based scheme

Currently, blk-mq timeout path synchronizes against the usual
issue/completion path using a complex scheme involving atomic
bitflags, REQ_ATOM_*, memory barriers and subtle memory coherence
rules.  Unfortunately, it contains quite a few holes.

There's a complex dancing around REQ_ATOM_STARTED and
REQ_ATOM_COMPLETE between issue/completion and timeout paths; however,
they don't have a synchronization point across request recycle
instances and it isn't clear what the barriers add.
blk_mq_check_expired() can easily read STARTED from N-2'th iteration,
deadline from N-1'th, blk_mark_rq_complete() against Nth instance.

In fact, it's pretty easy to make blk_mq_check_expired() terminate a
later instance of a request.  If we induce 5 sec delay before
time_after_eq() test in blk_mq_check_expired(), shorten the timeout to
2s, and issue back-to-back large IOs, blk-mq starts timing out
requests spuriously pretty quickly.  Nothing actually timed out.  It
just made the call on a recycle instance of a request and then
terminated a later instance long after the original instance finished.
The scenario isn't theoretical either.

This patch replaces the broken synchronization mechanism with a RCU
and generation number based one.

1. Each request has a u64 generation + state value, which can be
   updated only by the request owner.  Whenever a request becomes
   in-flight, the generation number gets bumped up too.  This provides
   the basis for the timeout path to distinguish different recycle
   instances of the request.

   Also, marking a request in-flight and setting its deadline are
   protected with a seqcount so that the timeout path can fetch both
   values coherently.

2. The timeout path fetches the generation, state and deadline.  If
   the verdict is timeout, it records the generation into a dedicated
   request abortion field and does RCU wait.

3. The completion path is also protected by RCU (from the previous
   patch) and checks whether the current generation number and state
   match the abortion field.  If so, it skips completion.

4. The timeout path, after RCU wait, scans requests again and
   terminates the ones whose generation and state still match the ones
   requested for abortion.

   By now, the timeout path knows that either the generation number
   and state changed if it lost the race or the completion will yield
   to it and can safely timeout the request.

While it's more lines of code, it's conceptually simpler, doesn't
depend on direct use of subtle memory ordering or coherence, and
hopefully doesn't terminate the wrong instance.

While this change makes REQ_ATOM_COMPLETE synchronization unnecessary
between issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't
removed yet as it's still used in other places.  Future patches will
move all state tracking to the new mechanism and remove all bitops in
the hot paths.

Note that this patch adds a comment explaining a race condition in
BLK_EH_RESET_TIMER path.  The race has always been there and this
patch doesn't change it.  It's just documenting the existing race.

v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao.
    - s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter.
    - READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter.

v3: - Fixed possible extended seqcount / u64_stats_sync read looping
      spotted by Peter.
    - MQ_RQ_IDLE was incorrectly being set in complete_request instead
      of free_request.  Fixed.

v4: - Rebased on top of hctx_lock() refactoring patch.
    - Added comment explaining the use of hctx_lock() in completion path.

v5: - Added comments requested by Bart.
    - Note the addition of BLK_EH_RESET_TIMER race condition in the
      commit message.
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <Bart.VanAssche@wdc.com>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 5197c05e
......@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
seqcount_init(&rq->gstate_seq);
u64_stats_init(&rq->aborted_gstate_sync);
}
EXPORT_SYMBOL(blk_rq_init);
......
This diff is collapsed.
......@@ -27,6 +27,19 @@ struct blk_mq_ctx {
struct kobject kobj;
} ____cacheline_aligned_in_smp;
/*
* Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
* and the upper bits the generation number.
*/
enum mq_rq_state {
MQ_RQ_IDLE = 0,
MQ_RQ_IN_FLIGHT = 1,
MQ_RQ_STATE_BITS = 2,
MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
};
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
......@@ -85,6 +98,39 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
void blk_mq_release(struct request_queue *q);
/**
* blk_mq_rq_state() - read the current MQ_RQ_* state of a request
* @rq: target request.
*/
static inline int blk_mq_rq_state(struct request *rq)
{
return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
}
/**
* blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
* @rq: target request.
* @state: new state to set.
*
* Set @rq's state to @state. The caller is responsible for ensuring that
* there are no other updaters. A request can transition into IN_FLIGHT
* only from IDLE and doing so increments the generation number.
*/
static inline void blk_mq_rq_update_state(struct request *rq,
enum mq_rq_state state)
{
u64 old_val = READ_ONCE(rq->gstate);
u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
if (state == MQ_RQ_IN_FLIGHT) {
WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
new_val += MQ_RQ_GEN_INC;
}
/* avoid exposing interim values */
WRITE_ONCE(rq->gstate, new_val);
}
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
......
......@@ -208,7 +208,7 @@ void blk_add_timer(struct request *req)
if (!req->timeout)
req->timeout = q->rq_timeout;
WRITE_ONCE(req->deadline, jiffies + req->timeout);
req->deadline = jiffies + req->timeout;
/*
* Only the non-mq case needs to add the request to a protected list.
......
......@@ -123,12 +123,6 @@ void blk_account_io_done(struct request *req);
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
/*
* Keep these two bits first - not because we depend on the
* value of them, but we do depend on them being in the same
* byte of storage to ensure ordering on writes. Keeping them
* first will achieve that nicely.
*/
REQ_ATOM_COMPLETE = 0,
REQ_ATOM_STARTED,
......
......@@ -51,6 +51,7 @@ struct blk_mq_hw_ctx {
unsigned int queue_num;
atomic_t nr_active;
unsigned int nr_expired;
struct hlist_node cpuhp_dead;
struct kobject kobj;
......
......@@ -27,6 +27,8 @@
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
#include <linux/blkzoned.h>
#include <linux/seqlock.h>
#include <linux/u64_stats_sync.h>
struct module;
struct scsi_ioctl_command;
......@@ -230,6 +232,27 @@ struct request {
unsigned short write_hint;
/*
* On blk-mq, the lower bits of ->gstate (generation number and
* state) carry the MQ_RQ_* state value and the upper bits the
* generation number which is monotonically incremented and used to
* distinguish the reuse instances.
*
* ->gstate_seq allows updates to ->gstate and other fields
* (currently ->deadline) during request start to be read
* atomically from the timeout path, so that it can operate on a
* coherent set of information.
*/
seqcount_t gstate_seq;
u64 gstate;
/*
* ->aborted_gstate is used by the timeout to claim a specific
* recycle instance of this request. See blk_mq_timeout_work().
*/
struct u64_stats_sync aborted_gstate_sync;
u64 aborted_gstate;
unsigned long deadline;
struct list_head timeout_list;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment