Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe: "Another week, another round of fixes. These have been brewing for a bit and in various iterations, but I feel pretty comfortable about the quality of them. They fix real issues. The pull request is mostly blk-mq related, and the only one not fixing a real bug, is the tag iterator abstraction from Christoph. But it's pretty trivial, and we'll need it for another fix soon. Apart from the blk-mq fixes, there's an NVMe affinity fix from Keith, and a single fix for xen-blkback from Roger fixing failure to free requests on disconnect" * 'for-linus' of git://git.kernel.dk/linux-block: blk-mq: factor out a helper to iterate all tags for a request_queue blk-mq: fix racy updates of rq->errors blk-mq: fix deadlock when reading cpu_list blk-mq: avoid inserting requests before establishing new mapping blk-mq: fix q->mq_usage_counter access race blk-mq: Fix use after of free q->mq_map blk-mq: fix sysfs registration/unregistration race blk-mq: avoid setting hctx->tags->cpumask before allocation NVMe: Set affinity after allocating request queues xen/blkback: free requests on disconnection

Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe: "Another week, another round of fixes. These have been brewing for a bit and in various iterations, but I feel pretty comfortable about the quality of them. They fix real issues. The pull request is mostly blk-mq related, and the only one not fixing a real bug, is the tag iterator abstraction from Christoph. But it's pretty trivial, and we'll need it for another fix soon. Apart from the blk-mq fixes, there's an NVMe affinity fix from Keith, and a single fix for xen-blkback from Roger fixing failure to free requests on disconnect" * 'for-linus' of git://git.kernel.dk/linux-block: blk-mq: factor out a helper to iterate all tags for a request_queue blk-mq: fix racy updates of rq->errors blk-mq: fix deadlock when reading cpu_list blk-mq: avoid inserting requests before establishing new mapping blk-mq: fix q->mq_usage_counter access race blk-mq: Fix use after of free q->mq_map blk-mq: fix sysfs registration/unregistration race blk-mq: avoid setting hctx->tags->cpumask before allocation NVMe: Set affinity after allocating request queues xen/blkback: free requests on disconnection
27728bf0 · Linus Torvalds · 36f8dafe · 0bf6cd5b · 27728bf0 · 27728bf0
Commit 27728bf0 authored Oct 02, 2015 by Linus Torvalds
15 changed files
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
 	return cpu;
 }

-int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
+int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+			    const struct cpumask *online_mask)
 {
 	unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
 	cpumask_var_t cpus;
@@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)

 	cpumask_clear(cpus);
 	nr_cpus = nr_uniq_cpus = 0;
-	for_each_online_cpu(i) {
+	for_each_cpu(i, online_mask) {
 		nr_cpus++;
 		first_sibling = get_first_sibling(i);
 		if (!cpumask_test_cpu(first_sibling, cpus))
@@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)

 	queue = 0;
 	for_each_possible_cpu(i) {
-		if (!cpu_online(i)) {
+		if (!cpumask_test_cpu(i, online_mask)) {
 			map[i] = 0;
 			continue;
 		}
@@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
 	if (!map)
 		return NULL;

-	if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
+	if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
 		return map;

 	kfree(map);

--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -229,8 +229,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 	unsigned int i, first = 1;
 	ssize_t ret = 0;

-	blk_mq_disable_hotplug();
-
 	for_each_cpu(i, hctx->cpumask) {
 		if (first)
 			ret += sprintf(ret + page, "%u", i);
@@ -240,8 +238,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 		first = 0;
 	}

-	blk_mq_enable_hotplug();
-
 	ret += sprintf(ret + page, "\n");
 	return ret;
 }
@@ -343,7 +339,7 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
 	struct blk_mq_ctx *ctx;
 	int i;

-	if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+	if (!hctx->nr_ctx)
 		return;

 	hctx_for_each_ctx(hctx, ctx, i)
@@ -358,7 +354,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 	struct blk_mq_ctx *ctx;
 	int i, ret;

-	if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+	if (!hctx->nr_ctx)
 		return 0;

 	ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
@@ -381,6 +377,8 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	struct blk_mq_ctx *ctx;
 	int i, j;

+	blk_mq_disable_hotplug();
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		blk_mq_unregister_hctx(hctx);

@@ -395,6 +393,9 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	kobject_put(&q->mq_kobj);

 	kobject_put(&disk_to_dev(disk)->kobj);
+
+	q->mq_sysfs_init_done = false;
+	blk_mq_enable_hotplug();
 }

 static void blk_mq_sysfs_init(struct request_queue *q)
@@ -425,27 +426,30 @@ int blk_mq_register_disk(struct gendisk *disk)
 	struct blk_mq_hw_ctx *hctx;
 	int ret, i;

+	blk_mq_disable_hotplug();
+
 	blk_mq_sysfs_init(q);

 	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
 	if (ret < 0)
-		return ret;
+		goto out;

 	kobject_uevent(&q->mq_kobj, KOBJ_ADD);

 	queue_for_each_hw_ctx(q, hctx, i) {
-		hctx->flags |= BLK_MQ_F_SYSFS_UP;
 		ret = blk_mq_register_hctx(hctx);
 		if (ret)
 			break;
 	}

-	if (ret) {
+	if (ret)
 		blk_mq_unregister_disk(disk);
-		return ret;
-	}
+	else
+		q->mq_sysfs_init_done = true;
+out:
+	blk_mq_enable_hotplug();

-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_mq_register_disk);

@@ -454,6 +458,9 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;

+	if (!q->mq_sysfs_init_done)
+		return;
+
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
 }
@@ -463,6 +470,9 @@ int blk_mq_sysfs_register(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i, ret = 0;

+	if (!q->mq_sysfs_init_done)
+		return ret;
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_register_hctx(hctx);
 		if (ret)

--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -471,17 +471,30 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 }
 EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);

-void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv)
 {
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+
+	queue_for_each_hw_ctx(q, hctx, i) {
 		struct blk_mq_tags *tags = hctx->tags;

+		/*
+		 * If not software queues are currently mapped to this
+		 * hardware queue, there's nothing to check
+		 */
+		if (!blk_mq_hw_queue_mapped(hctx))
+			continue;
+
 		if (tags->nr_reserved_tags)
 			bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
 		bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
 		      false);
+	}
+
 }
-EXPORT_SYMBOL(blk_mq_tag_busy_iter);

 static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
 {

--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -58,6 +58,8 @@ extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
 extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
 extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+		void *priv);

 enum {
 	BLK_MQ_TAG_CACHE_MIN	= 1,

--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -393,14 +393,16 @@ void __blk_mq_complete_request(struct request *rq)
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through a IPI handler.
 **/
-void blk_mq_complete_request(struct request *rq)
+void blk_mq_complete_request(struct request *rq, int error)
 {
 	struct request_queue *q = rq->q;

 	if (unlikely(blk_should_fake_timeout(q)))
 		return;
-	if (!blk_mark_rq_complete(rq))
+	if (!blk_mark_rq_complete(rq)) {
+		rq->errors = error;
 		__blk_mq_complete_request(rq);
+	}
 }
 EXPORT_SYMBOL(blk_mq_complete_request);

@@ -616,10 +618,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		 * If a request wasn't started before the queue was
 		 * marked dying, kill it here or it'll go unnoticed.
 		 */
-		if (unlikely(blk_queue_dying(rq->q))) {
-			rq->errors = -EIO;
-			blk_mq_complete_request(rq);
-		}
+		if (unlikely(blk_queue_dying(rq->q)))
+			blk_mq_complete_request(rq, -EIO);
 		return;
 	}
 	if (rq->cmd_flags & REQ_NO_TIMEOUT)
@@ -641,24 +641,16 @@ static void blk_mq_rq_timer(unsigned long priv)
 		.next		= 0,
 		.next_set	= 0,
 	};
-	struct blk_mq_hw_ctx *hctx;
 	int i;

-	queue_for_each_hw_ctx(q, hctx, i) {
-		/*
-		 * If not software queues are currently mapped to this
-		 * hardware queue, there's nothing to check
-		 */
-		if (!blk_mq_hw_queue_mapped(hctx))
-			continue;
-
-		blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
-	}
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);

 	if (data.next_set) {
 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
 		mod_timer(&q->timeout, data.next);
 	} else {
+		struct blk_mq_hw_ctx *hctx;
+
 		queue_for_each_hw_ctx(q, hctx, i) {
 			/* the hctx may be unmapped, so check it here */
 			if (blk_mq_hw_queue_mapped(hctx))
@@ -1789,13 +1781,19 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 	}
 }

-static void blk_mq_map_swqueue(struct request_queue *q)
+static void blk_mq_map_swqueue(struct request_queue *q,
+			       const struct cpumask *online_mask)
 {
 	unsigned int i;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_tag_set *set = q->tag_set;

+	/*
+	 * Avoid others reading imcomplete hctx->cpumask through sysfs
+	 */
+	mutex_lock(&q->sysfs_lock);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		cpumask_clear(hctx->cpumask);
 		hctx->nr_ctx = 0;
@@ -1806,16 +1804,17 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	 */
 	queue_for_each_ctx(q, ctx, i) {
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
-		if (!cpu_online(i))
+		if (!cpumask_test_cpu(i, online_mask))
 			continue;

 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
-		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}

+	mutex_unlock(&q->sysfs_lock);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		struct blk_mq_ctxmap *map = &hctx->ctx_map;

@@ -1851,6 +1850,14 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		hctx->next_cpu = cpumask_first(hctx->cpumask);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
+
+	queue_for_each_ctx(q, ctx, i) {
+		if (!cpumask_test_cpu(i, online_mask))
+			continue;
+
+		hctx = q->mq_ops->map_queue(q, i);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
+	}
 }

 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
@@ -1918,6 +1925,9 @@ void blk_mq_release(struct request_queue *q)
 		kfree(hctx);
 	}

+	kfree(q->mq_map);
+	q->mq_map = NULL;
+
 	kfree(q->queue_hw_ctx);

 	/* ctx kobj stays in queue_ctx */
@@ -2027,13 +2037,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (blk_mq_init_hw_queues(q, set))
 		goto err_hctxs;

+	get_online_cpus();
 	mutex_lock(&all_q_mutex);
-	list_add_tail(&q->all_q_node, &all_q_list);
-	mutex_unlock(&all_q_mutex);

+	list_add_tail(&q->all_q_node, &all_q_list);
 	blk_mq_add_queue_tag_set(set, q);
+	blk_mq_map_swqueue(q, cpu_online_mask);

-	blk_mq_map_swqueue(q);
+	mutex_unlock(&all_q_mutex);
+	put_online_cpus();

 	return q;

@@ -2057,30 +2069,27 @@ void blk_mq_free_queue(struct request_queue *q)
 {
 	struct blk_mq_tag_set	*set = q->tag_set;

+	mutex_lock(&all_q_mutex);
+	list_del_init(&q->all_q_node);
+	mutex_unlock(&all_q_mutex);
+
 	blk_mq_del_queue_tag_set(q);

 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
 	blk_mq_free_hw_queues(q, set);

 	percpu_ref_exit(&q->mq_usage_counter);
-
-	kfree(q->mq_map);
-
-	q->mq_map = NULL;
-
-	mutex_lock(&all_q_mutex);
-	list_del_init(&q->all_q_node);
-	mutex_unlock(&all_q_mutex);
 }

 /* Basically redo blk_mq_init_queue with queue frozen */
-static void blk_mq_queue_reinit(struct request_queue *q)
+static void blk_mq_queue_reinit(struct request_queue *q,
+				const struct cpumask *online_mask)
 {
 	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));

 	blk_mq_sysfs_unregister(q);

-	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
+	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);

 	/*
 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
@@ -2088,7 +2097,7 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 	 * involves free and re-allocate memory, worthy doing?)
 	 */

-	blk_mq_map_swqueue(q);
+	blk_mq_map_swqueue(q, online_mask);

 	blk_mq_sysfs_register(q);
 }
@@ -2097,16 +2106,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 				      unsigned long action, void *hcpu)
 {
 	struct request_queue *q;
+	int cpu = (unsigned long)hcpu;
+	/*
+	 * New online cpumask which is going to be set in this hotplug event.
+	 * Declare this cpumasks as global as cpu-hotplug operation is invoked
+	 * one-by-one and dynamically allocating this could result in a failure.
+	 */
+	static struct cpumask online_new;

 	/*
-	 * Before new mappings are established, hotadded cpu might already
-	 * start handling requests. This doesn't break anything as we map
-	 * offline CPUs to first hardware queue. We will re-init the queue
-	 * below to get optimal settings.
+	 * Before hotadded cpu starts handling requests, new mappings must
+	 * be established.  Otherwise, these requests in hw queue might
+	 * never be dispatched.
+	 *
+	 * For example, there is a single hw queue (hctx) and two CPU queues
+	 * (ctx0 for CPU0, and ctx1 for CPU1).
+	 *
+	 * Now CPU1 is just onlined and a request is inserted into
+	 * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is
+	 * still zero.
+	 *
+	 * And then while running hw queue, flush_busy_ctxs() finds bit0 is
+	 * set in pending bitmap and tries to retrieve requests in
+	 * hctx->ctxs[0]->rq_list.  But htx->ctxs[0] is a pointer to ctx0,
+	 * so the request in ctx1->rq_list is ignored.
 	 */
-	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
-	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		cpumask_copy(&online_new, cpu_online_mask);
+		break;
+	case CPU_UP_PREPARE:
+		cpumask_copy(&online_new, cpu_online_mask);
+		cpumask_set_cpu(cpu, &online_new);
+		break;
+	default:
 		return NOTIFY_OK;
+	}

 	mutex_lock(&all_q_mutex);

@@ -2130,7 +2166,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	}

 	list_for_each_entry(q, &all_q_list, all_q_node)
-		blk_mq_queue_reinit(q);
+		blk_mq_queue_reinit(q, &online_new);

 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_unfreeze_queue(q);

--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
 * CPU -> queue mappings
 */
 extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
-extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+				   const struct cpumask *online_mask);
 extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);

 /*

--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1486,17 +1486,16 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 {
 	const bool write = cmd->rq->cmd_flags & REQ_WRITE;
 	struct loop_device *lo = cmd->rq->q->queuedata;
-	int ret = -EIO;
+	int ret = 0;

-	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
+		ret = -EIO;
 		goto failed;
+	}

 	ret = do_req_filebacked(lo, cmd->rq);
-
 failed:
-	if (ret)
-		cmd->rq->errors = -EIO;
-	blk_mq_complete_request(cmd->rq);
+	blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
 }

 static void loop_queue_write_work(struct work_struct *work)

--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -289,7 +289,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
 	case NULL_IRQ_SOFTIRQ:
 		switch (queue_mode)  {
 		case NULL_Q_MQ:
-			blk_mq_complete_request(cmd->rq);
+			blk_mq_complete_request(cmd->rq, cmd->rq->errors);
 			break;
 		case NULL_Q_RQ:
 			blk_complete_request(cmd->rq);

--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -618,16 +618,15 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 			spin_unlock_irqrestore(req->q->queue_lock, flags);
 			return;
 		}
+
 		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
-				req->errors = -EINTR;
-			else
-				req->errors = status;
+				status = -EINTR;
 		} else {
-			req->errors = nvme_error_status(status);
+			status = nvme_error_status(status);
 		}
-	} else
-		req->errors = 0;
+	}
+
 	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		u32 result = le32_to_cpup(&cqe->result);
 		req->special = (void *)(uintptr_t)result;
@@ -650,7 +649,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 	}
 	nvme_free_iod(nvmeq->dev, iod);

-	blk_mq_complete_request(req);
+	blk_mq_complete_request(req, status);
 }

 /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@ -863,8 +862,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ns && ns->ms && !blk_integrity_rq(req)) {
 		if (!(ns->pi_type && ns->ms == 8) &&
 					req->cmd_type != REQ_TYPE_DRV_PRIV) {
-			req->errors = -EFAULT;
-			blk_mq_complete_request(req);
+			blk_mq_complete_request(req, -EFAULT);
 			return BLK_MQ_RQ_QUEUE_OK;
 		}
 	}
@@ -2439,6 +2437,22 @@ static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
 	list_sort(NULL, &dev->namespaces, ns_cmp);
 }

+static void nvme_set_irq_hints(struct nvme_dev *dev)
+{
+	struct nvme_queue *nvmeq;
+	int i;
+
+	for (i = 0; i < dev->online_queues; i++) {
+		nvmeq = dev->queues[i];
+
+		if (!nvmeq->tags || !(*nvmeq->tags))
+			continue;
+
+		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
+					blk_mq_tags_cpumask(*nvmeq->tags));
+	}
+}
+
 static void nvme_dev_scan(struct work_struct *work)
 {
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
@@ -2450,6 +2464,7 @@ static void nvme_dev_scan(struct work_struct *work)
 		return;
 	nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn));
 	kfree(ctrl);
+	nvme_set_irq_hints(dev);
 }

 /*
@@ -2953,22 +2968,6 @@ static const struct file_operations nvme_dev_fops = {
 	.compat_ioctl	= nvme_dev_ioctl,
 };

-static void nvme_set_irq_hints(struct nvme_dev *dev)
-{
-	struct nvme_queue *nvmeq;
-	int i;
-
-	for (i = 0; i < dev->online_queues; i++) {
-		nvmeq = dev->queues[i];
-
-		if (!nvmeq->tags || !(*nvmeq->tags))
-			continue;
-
-		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
-					blk_mq_tags_cpumask(*nvmeq->tags));
-	}
-}
-
 static int nvme_dev_start(struct nvme_dev *dev)
 {
 	int result;
@@ -3010,8 +3009,6 @@ static int nvme_dev_start(struct nvme_dev *dev)
 	if (result)
 		goto free_tags;

-	nvme_set_irq_hints(dev);
-
 	dev->event_limit = 1;
 	return result;

@@ -3062,7 +3059,6 @@ static int nvme_dev_resume(struct nvme_dev *dev)
 	} else {
 		nvme_unfreeze_queues(dev);
 		nvme_dev_add(dev);
-		nvme_set_irq_hints(dev);
 	}
 	return 0;
 }

--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -144,7 +144,7 @@ static void virtblk_done(struct virtqueue *vq)
 	do {
 		virtqueue_disable_cb(vq);
 		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
-			blk_mq_complete_request(vbr->req);
+			blk_mq_complete_request(vbr->req, vbr->req->errors);
 			req_done = true;
 		}
 		if (unlikely(virtqueue_is_broken(vq)))

--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -212,6 +212,9 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,

 static int xen_blkif_disconnect(struct xen_blkif *blkif)
 {
+	struct pending_req *req, *n;
+	int i = 0, j;
+
 	if (blkif->xenblkd) {
 		kthread_stop(blkif->xenblkd);
 		wake_up(&blkif->shutdown_wq);
@@ -238,13 +241,28 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 	/* Remove all persistent grants and the cache of ballooned pages. */
 	xen_blkbk_free_caches(blkif);

+	/* Check that there is no request in use */
+	list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+		list_del(&req->free_list);
+
+		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+			kfree(req->segments[j]);
+
+		for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+			kfree(req->indirect_pages[j]);
+
+		kfree(req);
+		i++;
+	}
+
+	WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+	blkif->nr_ring_pages = 0;
+
 	return 0;
 }

 static void xen_blkif_free(struct xen_blkif *blkif)
 {
-	struct pending_req *req, *n;
-	int i = 0, j;

 	xen_blkif_disconnect(blkif);
 	xen_vbd_free(&blkif->vbd);
@@ -257,22 +275,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
 	BUG_ON(!list_empty(&blkif->free_pages));
 	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));

-	/* Check that there is no request in use */
-	list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
-		list_del(&req->free_list);
-
-		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
-			kfree(req->segments[j]);
-
-		for (j = 0; j < MAX_INDIRECT_PAGES; j++)
-			kfree(req->indirect_pages[j]);
-
-		kfree(req);
-		i++;
-	}
-
-	WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
-
 	kmem_cache_free(xen_blkif_cachep, blkif);
 }


--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1142,6 +1142,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	RING_IDX i, rp;
 	unsigned long flags;
 	struct blkfront_info *info = (struct blkfront_info *)dev_id;
+	int error;

 	spin_lock_irqsave(&info->io_lock, flags);

@@ -1182,37 +1183,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 			continue;
 		}

-		req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+		error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
 		switch (bret->operation) {
 		case BLKIF_OP_DISCARD:
 			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
 				struct request_queue *rq = info->rq;
 				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
 					   info->gd->disk_name, op_name(bret->operation));
-				req->errors = -EOPNOTSUPP;
+				error = -EOPNOTSUPP;
 				info->feature_discard = 0;
 				info->feature_secdiscard = 0;
 				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
 				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
 			}
-			blk_mq_complete_request(req);
+			blk_mq_complete_request(req, error);
 			break;
 		case BLKIF_OP_FLUSH_DISKCACHE:
 		case BLKIF_OP_WRITE_BARRIER:
 			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
 				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
 				       info->gd->disk_name, op_name(bret->operation));
-				req->errors = -EOPNOTSUPP;
+				error = -EOPNOTSUPP;
 			}
 			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
 				     info->shadow[id].req.u.rw.nr_segments == 0)) {
 				printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
 				       info->gd->disk_name, op_name(bret->operation));
-				req->errors = -EOPNOTSUPP;
+				error = -EOPNOTSUPP;
 			}
-			if (unlikely(req->errors)) {
-				if (req->errors == -EOPNOTSUPP)
-					req->errors = 0;
+			if (unlikely(error)) {
+				if (error == -EOPNOTSUPP)
+					error = 0;
 				info->feature_flush = 0;
 				xlvbd_flush(info);
 			}
@@ -1223,7 +1224,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
 					"request: %x\n", bret->status);

-			blk_mq_complete_request(req);
+			blk_mq_complete_request(req, error);
 			break;
 		default:
 			BUG();

--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1957,7 +1957,7 @@ static int scsi_mq_prep_fn(struct request *req)
 static void scsi_mq_done(struct scsi_cmnd *cmd)
 {
 	trace_scsi_dispatch_cmd_done(cmd);
-	blk_mq_complete_request(cmd->request);
+	blk_mq_complete_request(cmd->request, cmd->request->errors);
 }

 static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,

--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -145,7 +145,6 @@ enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_TAG_SHARED	= 1 << 1,
 	BLK_MQ_F_SG_MERGE	= 1 << 2,
-	BLK_MQ_F_SYSFS_UP	= 1 << 3,
 	BLK_MQ_F_DEFER_ISSUE	= 1 << 4,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
@@ -215,7 +214,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
 void blk_mq_cancel_requeue_work(struct request_queue *q);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_abort_requeue_list(struct request_queue *q);
-void blk_mq_complete_request(struct request *rq);
+void blk_mq_complete_request(struct request *rq, int error);

 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
@@ -224,8 +223,6 @@ void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
-		void *priv);
 void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 		void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);

--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -456,6 +456,8 @@ struct request_queue {
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
 	struct bio_set		*bio_split;
+
+	bool			mq_sysfs_init_done;
 };

 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */