Commit da55f2cc authored by Omar Sandoval's avatar Omar Sandoval Committed by Jens Axboe

blk-mq: use sbq wait queues instead of restart for driver tags

Commit 50e1dab8 ("blk-mq-sched: fix starvation for multiple hardware
queues and shared tags") fixed one starvation issue for shared tags.
However, we can still get into a situation where we fail to allocate a
tag because all tags are allocated but we don't have any pending
requests on any hardware queue.

One solution for this would be to restart all queues that share a tag
map, but that really sucks. Ideally, we could just block and wait for a
tag, but that isn't always possible from blk_mq_dispatch_rq_list().

However, we can still use the struct sbitmap_queue wait queues with a
custom callback instead of blocking. This has a few benefits:

1. It avoids iterating over all hardware queues when completing an I/O,
   which the current restart code has to do.
2. It benefits from the existing rolling wakeup code.
3. It avoids punting to another thread just to have it block.
Signed-off-by: default avatarOmar Sandoval <osandov@fb.com>
Signed-off-by: default avatarJens Axboe <axboe@fb.com>
parent 2d19020b
...@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list) ...@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list)
return first != NULL; return first != NULL;
} }
static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
void *key)
{
struct blk_mq_hw_ctx *hctx;
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
list_del(&wait->task_list);
clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
blk_mq_run_hw_queue(hctx, true);
return 1;
}
static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
{
struct sbq_wait_state *ws;
/*
* The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
* The thread which wins the race to grab this bit adds the hardware
* queue to the wait queue.
*/
if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
return false;
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
/*
* As soon as this returns, it's no longer safe to fiddle with
* hctx->dispatch_wait, since a completion can wake up the wait queue
* and unlock the bit.
*/
add_wait_queue(&ws->wait, &hctx->dispatch_wait);
return true;
}
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{ {
struct request_queue *q = hctx->queue; struct request_queue *q = hctx->queue;
...@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) ...@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
continue; continue;
/* /*
* We failed getting a driver tag. Mark the queue(s) * The initial allocation attempt failed, so we need to
* as needing a restart. Retry getting a tag again, * rerun the hardware queue when a tag is freed.
* in case the needed IO completed right before we
* marked the queue as needing a restart.
*/ */
blk_mq_sched_mark_restart(hctx); if (blk_mq_dispatch_wait_add(hctx)) {
if (!blk_mq_get_driver_tag(rq, &hctx, false)) /*
* It's possible that a tag was freed in the
* window between the allocation failure and
* adding the hardware queue to the wait queue.
*/
if (!blk_mq_get_driver_tag(rq, &hctx, false))
break;
} else {
break; break;
}
} }
list_del_init(&rq->queuelist); list_del_init(&rq->queuelist);
bd.rq = rq; bd.rq = rq;
...@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) ...@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
* *
* blk_mq_run_hw_queue() already checks the STOPPED bit * blk_mq_run_hw_queue() already checks the STOPPED bit
* *
* If RESTART is set, then let completion restart the queue * If RESTART or TAG_WAITING is set, then let completion restart
* instead of potentially looping here. * the queue instead of potentially looping here.
*/ */
if (!blk_mq_sched_needs_restart(hctx)) if (!blk_mq_sched_needs_restart(hctx) &&
!test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
blk_mq_run_hw_queue(hctx, true); blk_mq_run_hw_queue(hctx, true);
} }
......
...@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx { ...@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx {
struct blk_mq_ctx **ctxs; struct blk_mq_ctx **ctxs;
unsigned int nr_ctx; unsigned int nr_ctx;
wait_queue_t dispatch_wait;
atomic_t wait_index; atomic_t wait_index;
struct blk_mq_tags *tags; struct blk_mq_tags *tags;
...@@ -160,6 +161,7 @@ enum { ...@@ -160,6 +161,7 @@ enum {
BLK_MQ_S_STOPPED = 0, BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_S_SCHED_RESTART = 2,
BLK_MQ_S_TAG_WAITING = 3,
BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_MAX_DEPTH = 10240,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment