blk-mq: allow changing of queue depth through sysfs

For request_fn based devices, the block layer exports a 'nr_requests' file through sysfs to allow adjusting of queue depth on the fly. Currently this returns -EINVAL for blk-mq, since it's not wired up. Wire this up for blk-mq, so that it now also always dynamic adjustments of the allowed queue depth for any given block device managed by blk-mq. Signed-off-by: Jens Axboe <axboe@fb.com>

blk-mq: allow changing of queue depth through sysfs
For request_fn based devices, the block layer exports a 'nr_requests' file through sysfs to allow adjusting of queue depth on the fly. Currently this returns -EINVAL for blk-mq, since it's not wired up. Wire this up for blk-mq, so that it now also always dynamic adjustments of the allowed queue depth for any given block device managed by blk-mq. Signed-off-by: Jens Axboe <axboe@fb.com>
e3a2b3f9 · Jens Axboe · 64b14519 · e3a2b3f9 · e3a2b3f9 · e3a2b3f9
Commit e3a2b3f9 authored May 20, 2014 by Jens Axboe
8 changed files
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -848,6 +848,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)
 		__freed_request(rl, sync ^ 1);
 }
+int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+	struct request_list *rl;
+	spin_lock_irq(q->queue_lock);
+	q->nr_requests = nr;
+	blk_queue_congestion_threshold(q);
+	/* congestion isn't cgroup aware and follows root blkcg for now */
+	rl = &q->root_rl;
+	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
+		blk_set_queue_congested(q, BLK_RW_SYNC);
+	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, BLK_RW_SYNC);
+	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
+		blk_set_queue_congested(q, BLK_RW_ASYNC);
+	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, BLK_RW_ASYNC);
+	blk_queue_for_each_rl(rl, q) {
+		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+			blk_set_rl_full(rl, BLK_RW_SYNC);
+		} else {
+			blk_clear_rl_full(rl, BLK_RW_SYNC);
+			wake_up(&rl->wait[BLK_RW_SYNC]);
+		}
+		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+			blk_set_rl_full(rl, BLK_RW_ASYNC);
+		} else {
+			blk_clear_rl_full(rl, BLK_RW_ASYNC);
+			wake_up(&rl->wait[BLK_RW_ASYNC]);
+		}
+	}
+	spin_unlock_irq(q->queue_lock);
+	return 0;
+}
 /*
 * Determine if elevator data should be initialized when allocating the
 * request associated with @bio.

--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -57,23 +57,13 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 }
 /*
- * If a previously busy queue goes inactive, potential waiters could now
+ * Wakeup all potentially sleeping on normal (non-reserved) tags
- * be allowed to queue. Wake them up and check.
 */
-void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
 {
-	struct blk_mq_tags *tags = hctx->tags;
 	struct blk_mq_bitmap_tags *bt;
 	int i, wake_index;
-	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
-		return;
-	atomic_dec(&tags->active_queues);
-	/*
-	 * Will only throttle depth on non-reserved tags
-	 */
 	bt = &tags->bitmap_tags;
 	wake_index = bt->wake_index;
 	for (i = 0; i < BT_WAIT_QUEUES; i++) {
@@ -86,6 +76,22 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	}
 }
+/*
+ * If a previously busy queue goes inactive, potential waiters could now
+ * be allowed to queue. Wake them up and check.
+ */
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return;
+	atomic_dec(&tags->active_queues);
+	blk_mq_tag_wakeup_all(tags);
+}
 /*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
@@ -408,6 +414,28 @@ static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
 	return bt->depth - used;
 }
+static void bt_update_count(struct blk_mq_bitmap_tags *bt,
+			    unsigned int depth)
+{
+	unsigned int tags_per_word = 1U << bt->bits_per_word;
+	unsigned int map_depth = depth;
+	if (depth) {
+		int i;
+		for (i = 0; i < bt->map_nr; i++) {
+			bt->map[i].depth = min(map_depth, tags_per_word);
+			map_depth -= bt->map[i].depth;
+		}
+	}
+	bt->wake_cnt = BT_WAIT_BATCH;
+	if (bt->wake_cnt > depth / 4)
+		bt->wake_cnt = max(1U, depth / 4);
+	bt->depth = depth;
+}
 static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 			int node, bool reserved)
 {
@@ -420,7 +448,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 	 * condition.
 	 */
 	if (depth) {
-		unsigned int nr, i, map_depth, tags_per_word;
+		unsigned int nr, tags_per_word;
 		tags_per_word = (1 << bt->bits_per_word);
@@ -444,11 +472,6 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 			return -ENOMEM;
 		bt->map_nr = nr;
-		map_depth = depth;
-		for (i = 0; i < nr; i++) {
-			bt->map[i].depth = min(map_depth, tags_per_word);
-			map_depth -= tags_per_word;
-		}
 	}
 	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
@@ -460,11 +483,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 	for (i = 0; i < BT_WAIT_QUEUES; i++)
 		init_waitqueue_head(&bt->bs[i].wait);
-	bt->wake_cnt = BT_WAIT_BATCH;
+	bt_update_count(bt, depth);
-	if (bt->wake_cnt > depth / 4)
-		bt->wake_cnt = max(1U, depth / 4);
-	bt->depth = depth;
 	return 0;
 }
@@ -525,6 +544,21 @@ void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
 	*tag = prandom_u32() % depth;
 }
+int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
+{
+	tdepth -= tags->nr_reserved_tags;
+	if (tdepth > tags->nr_tags)
+		return -EINVAL;
+	/*
+	 * Don't need (or can't) update reserved tags here, they remain
+	 * static and should never need resizing.
+	 */
+	bt_update_count(&tags->bitmap_tags, tdepth);
+	blk_mq_tag_wakeup_all(tags);
+	return 0;
+}
 ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 {
 	char *orig_page = page;

--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -55,6 +55,7 @@ extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
 extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
+extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
 enum {
 	BLK_MQ_TAG_CACHE_MIN	= 1,

--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1789,6 +1789,28 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_free_tag_set);
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+	struct blk_mq_hw_ctx *hctx;
+	int i, ret;
+	if (!set || nr > set->queue_depth)
+		return -EINVAL;
+	ret = 0;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		ret = blk_mq_tag_update_depth(hctx->tags, nr);
+		if (ret)
+			break;
+	}
+	if (!ret)
+		q->nr_requests = nr;
+	return ret;
+}
 void blk_mq_disable_hotplug(void)
 {
 	mutex_lock(&all_q_mutex);

--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -32,6 +32,7 @@ void blk_mq_drain_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
 		struct request *orig_rq);
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 /*
 * CPU hotplug helpers

--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
 static ssize_t
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
-	struct request_list *rl;
 	unsigned long nr;
-	int ret;
+	int ret, err;
-	if (!q->request_fn)
+	if (!q->request_fn && !q->mq_ops)
 		return -EINVAL;
 	ret = queue_var_store(&nr, page, count);
@@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	if (nr < BLKDEV_MIN_RQ)
 		nr = BLKDEV_MIN_RQ;
-	spin_lock_irq(q->queue_lock);
+	if (q->request_fn)
-	q->nr_requests = nr;
+		err = blk_update_nr_requests(q, nr);
-	blk_queue_congestion_threshold(q);
+	else
+		err = blk_mq_update_nr_requests(q, nr);
-	/* congestion isn't cgroup aware and follows root blkcg for now */
-	rl = &q->root_rl;
-	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_SYNC);
-	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_SYNC);
-	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_ASYNC);
-	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_ASYNC);
-	blk_queue_for_each_rl(rl, q) {
-		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
-			blk_set_rl_full(rl, BLK_RW_SYNC);
-		} else {
-			blk_clear_rl_full(rl, BLK_RW_SYNC);
-			wake_up(&rl->wait[BLK_RW_SYNC]);
-		}
-		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+	if (err)
-			blk_set_rl_full(rl, BLK_RW_ASYNC);
+		return err;
-		} else {
-			blk_clear_rl_full(rl, BLK_RW_ASYNC);
-			wake_up(&rl->wait[BLK_RW_ASYNC]);
-		}
-	}
-	spin_unlock_irq(q->queue_lock);
 	return ret;
 }

--- a/block/blk.h
+++ b/block/blk.h
@@ -188,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 	return q->nr_congestion_off;
 }
+extern int blk_update_nr_requests(struct request_queue *, unsigned int);
 /*
 * Contribute to IO statistics IFF:
 *

--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -63,7 +63,7 @@ struct blk_mq_hw_ctx {
 struct blk_mq_tag_set {
 	struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
-	unsigned int		queue_depth;
+	unsigned int		queue_depth;	/* max hw supported */
 	unsigned int		reserved_tags;
 	unsigned int		cmd_size;	/* per-request extra data */
 	int			numa_node;