Merge tag 'for-linus-20190524' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe: - NVMe pull request from Keith, with fixes from a few folks. - bio and sbitmap before atomic barrier fixes (Andrea) - Hang fix for blk-mq freeze and unfreeze (Bob) - Single segment count regression fix (Christoph) - AoE now has a new maintainer - tools/io_uring/ Makefile fix, and sync with liburing (me) * tag 'for-linus-20190524' of git://git.kernel.dk/linux-block: (23 commits) tools/io_uring: sync with liburing tools/io_uring: fix Makefile for pthread library link blk-mq: fix hang caused by freeze/unfreeze sequence block: remove the bi_seg_{front,back}_size fields in struct bio block: remove the segment size check in bio_will_gap block: force an unlimited segment size on queues with a virt boundary block: don't decrement nr_phys_segments for physically contigous segments sbitmap: fix improper use of smp_mb__before_atomic() bio: fix improper use of smp_mb__before_atomic() aoe: list new maintainer for aoe driver nvme-pci: use blk-mq mapping for unmanaged irqs nvme: update MAINTAINERS nvme: copy MTFA field from identify controller nvme: fix memory leak for power latency tolerance nvme: release namespace SRCU protection before performing controller ioctls nvme: merge nvme_ns_ioctl into nvme_ioctl nvme: remove the ifdef around nvme_nvm_ioctl nvme: fix srcu locking on error return in nvme_get_ns_from_disk nvme: Fix known effects nvme-pci: Sync queues on reset ...

Merge tag 'for-linus-20190524' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe: - NVMe pull request from Keith, with fixes from a few folks. - bio and sbitmap before atomic barrier fixes (Andrea) - Hang fix for blk-mq freeze and unfreeze (Bob) - Single segment count regression fix (Christoph) - AoE now has a new maintainer - tools/io_uring/ Makefile fix, and sync with liburing (me) * tag 'for-linus-20190524' of git://git.kernel.dk/linux-block: (23 commits) tools/io_uring: sync with liburing tools/io_uring: fix Makefile for pthread library link blk-mq: fix hang caused by freeze/unfreeze sequence block: remove the bi_seg_{front,back}_size fields in struct bio block: remove the segment size check in bio_will_gap block: force an unlimited segment size on queues with a virt boundary block: don't decrement nr_phys_segments for physically contigous segments sbitmap: fix improper use of smp_mb__before_atomic() bio: fix improper use of smp_mb__before_atomic() aoe: list new maintainer for aoe driver nvme-pci: use blk-mq mapping for unmanaged irqs nvme: update MAINTAINERS nvme: copy MTFA field from identify controller nvme: fix memory leak for power latency tolerance nvme: release namespace SRCU protection before performing controller ioctls nvme: merge nvme_ns_ioctl into nvme_ioctl nvme: remove the ifdef around nvme_nvm_ioctl nvme: fix srcu locking on error return in nvme_get_ns_from_disk nvme: Fix known effects nvme-pci: Sync queues on reset ...
7fbc78e3 · Linus Torvalds · 7f8b40e3 · 096c7a6d · 7fbc78e3 · 7fbc78e3
Commit 7fbc78e3 authored May 24, 2019 by Linus Torvalds
18 changed files
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2627,7 +2627,7 @@ F:	Documentation/devicetree/bindings/eeprom/at24.txt
 F:	drivers/misc/eeprom/at24.c
 ATA OVER ETHERNET (AOE) DRIVER
-M:	"Ed L. Cashin" <ed.cashin@acm.org>
+M:	"Justin Sanders" <justin@coraid.com>
 W:	http://www.openaoe.org/
 S:	Supported
 F:	Documentation/aoe/
@@ -11226,7 +11226,7 @@ F:	drivers/video/fbdev/riva/
 F:	drivers/video/fbdev/nvidia/
 NVM EXPRESS DRIVER
-M:	Keith Busch <keith.busch@intel.com>
+M:	Keith Busch <kbusch@kernel.org>
 M:	Jens Axboe <axboe@fb.com>
 M:	Christoph Hellwig <hch@lst.de>
 M:	Sagi Grimberg <sagi@grimberg.me>

--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -413,7 +413,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 		smp_rmb();
 		wait_event(q->mq_freeze_wq,
-			   (atomic_read(&q->mq_freeze_depth) == 0 &&
+			   (!q->mq_freeze_depth &&
 			    (pm || (blk_pm_request_resume(q),
 				    !blk_queue_pm_only(q)))) ||
 			   blk_queue_dying(q));
@@ -503,6 +503,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	spin_lock_init(&q->queue_lock);
 	init_waitqueue_head(&q->mq_freeze_wq);
+	mutex_init(&q->mq_freeze_lock);
 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.

--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -12,23 +12,6 @@
 #include "blk.h"
-/*
- * Check if the two bvecs from two bios can be merged to one segment.  If yes,
- * no need to check gap between the two bios since the 1st bio and the 1st bvec
- * in the 2nd bio can be handled in one segment.
- */
-static inline bool bios_segs_mergeable(struct request_queue *q,
-		struct bio *prev, struct bio_vec *prev_last_bv,
-		struct bio_vec *next_first_bv)
-{
-	if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv))
-		return false;
-	if (prev->bi_seg_back_size + next_first_bv->bv_len >
-			queue_max_segment_size(q))
-		return false;
-	return true;
-}
 static inline bool bio_will_gap(struct request_queue *q,
 		struct request *prev_rq, struct bio *prev, struct bio *next)
 {
@@ -60,7 +43,7 @@ static inline bool bio_will_gap(struct request_queue *q,
 	 */
 	bio_get_last_bvec(prev, &pb);
 	bio_get_first_bvec(next, &nb);
-	if (bios_segs_mergeable(q, prev, &pb, &nb))
+	if (biovec_phys_mergeable(q, &pb, &nb))
 		return false;
 	return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
 }
@@ -179,8 +162,7 @@ static unsigned get_max_segment_size(struct request_queue *q,
 * variables.
 */
 static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
-		unsigned *nsegs, unsigned *last_seg_size,
+		unsigned *nsegs, unsigned *sectors, unsigned max_segs)
-		unsigned *front_seg_size, unsigned *sectors, unsigned max_segs)
 {
 	unsigned len = bv->bv_len;
 	unsigned total_len = 0;
@@ -202,27 +184,11 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
 			break;
 	}
-	if (!new_nsegs)
+	if (new_nsegs) {
-		return !!len;
-	/* update front segment size */
-	if (!*nsegs) {
-		unsigned first_seg_size;
-		if (new_nsegs == 1)
-			first_seg_size = get_max_segment_size(q, bv->bv_offset);
-		else
-			first_seg_size = queue_max_segment_size(q);
-		if (*front_seg_size < first_seg_size)
-			*front_seg_size = first_seg_size;
-	}
-	/* update other varibles */
-	*last_seg_size = seg_size;
 		*nsegs += new_nsegs;
 		if (sectors)
 			*sectors += total_len >> 9;
+	}
 	/* split in the middle of the bvec if len != 0 */
 	return !!len;
@@ -235,8 +201,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
-	unsigned seg_size = 0, nsegs = 0, sectors = 0;
+	unsigned nsegs = 0, sectors = 0;
-	unsigned front_seg_size = bio->bi_seg_front_size;
 	bool do_split = true;
 	struct bio *new = NULL;
 	const unsigned max_sectors = get_max_io_size(q, bio);
@@ -260,8 +225,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 				/* split in the middle of bvec */
 				bv.bv_len = (max_sectors - sectors) << 9;
 				bvec_split_segs(q, &bv, &nsegs,
-						&seg_size,
-						&front_seg_size,
 						&sectors, max_segs);
 			}
 			goto split;
@@ -275,12 +238,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 		if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
 			nsegs++;
-			seg_size = bv.bv_len;
 			sectors += bv.bv_len >> 9;
-			if (nsegs == 1 && seg_size > front_seg_size)
+		} else if (bvec_split_segs(q, &bv, &nsegs, &sectors,
-				front_seg_size = seg_size;
+				max_segs)) {
-		} else if (bvec_split_segs(q, &bv, &nsegs, &seg_size,
-				    &front_seg_size, &sectors, max_segs)) {
 			goto split;
 		}
 	}
@@ -295,10 +255,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 			bio = new;
 	}
-	bio->bi_seg_front_size = front_seg_size;
-	if (seg_size > bio->bi_seg_back_size)
-		bio->bi_seg_back_size = seg_size;
 	return do_split ? new : NULL;
 }
@@ -353,18 +309,13 @@ EXPORT_SYMBOL(blk_queue_split);
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 					     struct bio *bio)
 {
-	struct bio_vec uninitialized_var(bv), bvprv = { NULL };
+	unsigned int nr_phys_segs = 0;
-	unsigned int seg_size, nr_phys_segs;
-	unsigned front_seg_size;
-	struct bio *fbio, *bbio;
 	struct bvec_iter iter;
-	bool new_bio = false;
+	struct bio_vec bv;
 	if (!bio)
 		return 0;
-	front_seg_size = bio->bi_seg_front_size;
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
@@ -374,42 +325,11 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 		return 1;
 	}
-	fbio = bio;
-	seg_size = 0;
-	nr_phys_segs = 0;
 	for_each_bio(bio) {
-		bio_for_each_bvec(bv, bio, iter) {
+		bio_for_each_bvec(bv, bio, iter)
-			if (new_bio) {
+			bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX);
-				if (seg_size + bv.bv_len
-				    > queue_max_segment_size(q))
-					goto new_segment;
-				if (!biovec_phys_mergeable(q, &bvprv, &bv))
-					goto new_segment;
-				seg_size += bv.bv_len;
-				if (nr_phys_segs == 1 && seg_size >
-						front_seg_size)
-					front_seg_size = seg_size;
-				continue;
-			}
-new_segment:
-			bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size,
-					&front_seg_size, NULL, UINT_MAX);
-			new_bio = false;
-		}
-		bbio = bio;
-		if (likely(bio->bi_iter.bi_size)) {
-			bvprv = bv;
-			new_bio = true;
-		}
 	}
-	fbio->bi_seg_front_size = front_seg_size;
-	if (seg_size > bbio->bi_seg_back_size)
-		bbio->bi_seg_back_size = seg_size;
 	return nr_phys_segs;
 }
@@ -429,24 +349,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
 	bio_set_flag(bio, BIO_SEG_VALID);
 }
-static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
-				   struct bio *nxt)
-{
-	struct bio_vec end_bv = { NULL }, nxt_bv;
-	if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
-	    queue_max_segment_size(q))
-		return 0;
-	if (!bio_has_data(bio))
-		return 1;
-	bio_get_last_bvec(bio, &end_bv);
-	bio_get_first_bvec(nxt, &nxt_bv);
-	return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
-}
 static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
 		struct scatterlist *sglist)
 {
@@ -706,8 +608,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 				struct request *next)
 {
 	int total_phys_segments;
-	unsigned int seg_size =
-		req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
 	if (req_gap_back_merge(req, next->bio))
 		return 0;
@@ -720,14 +620,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 		return 0;
 	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
-	if (blk_phys_contig_segment(q, req->biotail, next->bio)) {
-		if (req->nr_phys_segments == 1)
-			req->bio->bi_seg_front_size = seg_size;
-		if (next->nr_phys_segments == 1)
-			next->biotail->bi_seg_back_size = seg_size;
-		total_phys_segments--;
-	}
 	if (total_phys_segments > queue_max_segments(q))
 		return 0;

--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -144,13 +144,14 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 void blk_freeze_queue_start(struct request_queue *q)
 {
-	int freeze_depth;
+	mutex_lock(&q->mq_freeze_lock);
+	if (++q->mq_freeze_depth == 1) {
-	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
+		mutex_unlock(&q->mq_freeze_lock);
 		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
+	} else {
+		mutex_unlock(&q->mq_freeze_lock);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -199,14 +200,14 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	int freeze_depth;
+	mutex_lock(&q->mq_freeze_lock);
+	q->mq_freeze_depth--;
-	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
-	WARN_ON_ONCE(freeze_depth < 0);
+	if (!q->mq_freeze_depth) {
-	if (!freeze_depth) {
 		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
+	mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -310,6 +310,9 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
 		       __func__, max_size);
 	}
+	/* see blk_queue_virt_boundary() for the explanation */
+	WARN_ON_ONCE(q->limits.virt_boundary_mask);
 	q->limits.max_segment_size = max_size;
 }
 EXPORT_SYMBOL(blk_queue_max_segment_size);
@@ -742,6 +745,14 @@ EXPORT_SYMBOL(blk_queue_segment_boundary);
 void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
 {
 	q->limits.virt_boundary_mask = mask;
+	/*
+	 * Devices that require a virtual boundary do not support scatter/gather
+	 * I/O natively, but instead require a descriptor list entry for each
+	 * page (which might not be idential to the Linux PAGE_SIZE).  Because
+	 * of that they are not limited by our notion of "segment size".
+	 */
+	q->limits.max_segment_size = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_queue_virt_boundary);

--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1257,9 +1257,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		return 0;
 	}
-	effects |= nvme_known_admin_effects(opcode);
 	if (ctrl->effects)
 		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
+	effects |= nvme_known_admin_effects(opcode);
 	/*
 	 * For simplicity, IO to all namespaces is quiesced even if the command
@@ -1361,9 +1361,14 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
 {
 #ifdef CONFIG_NVME_MULTIPATH
 	if (disk->fops == &nvme_ns_head_ops) {
+		struct nvme_ns *ns;
 		*head = disk->private_data;
 		*srcu_idx = srcu_read_lock(&(*head)->srcu);
-		return nvme_find_path(*head);
+		ns = nvme_find_path(*head);
+		if (!ns)
+			srcu_read_unlock(&(*head)->srcu, *srcu_idx);
+		return ns;
 	}
 #endif
 	*head = NULL;
@@ -1377,42 +1382,56 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
 		srcu_read_unlock(&head->srcu, idx);
 }
-static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
-{
-	switch (cmd) {
-	case NVME_IOCTL_ID:
-		force_successful_syscall_return();
-		return ns->head->ns_id;
-	case NVME_IOCTL_ADMIN_CMD:
-		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
-	case NVME_IOCTL_IO_CMD:
-		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
-	case NVME_IOCTL_SUBMIT_IO:
-		return nvme_submit_io(ns, (void __user *)arg);
-	default:
-#ifdef CONFIG_NVM
-		if (ns->ndev)
-			return nvme_nvm_ioctl(ns, cmd, arg);
-#endif
-		if (is_sed_ioctl(cmd))
-			return sed_ioctl(ns->ctrl->opal_dev, cmd,
-					 (void __user *) arg);
-		return -ENOTTY;
-	}
-}
 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
 		unsigned int cmd, unsigned long arg)
 {
 	struct nvme_ns_head *head = NULL;
+	void __user *argp = (void __user *)arg;
 	struct nvme_ns *ns;
 	int srcu_idx, ret;
 	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
 	if (unlikely(!ns))
-		ret = -EWOULDBLOCK;
+		return -EWOULDBLOCK;
+	/*
+	 * Handle ioctls that apply to the controller instead of the namespace
+	 * seperately and drop the ns SRCU reference early.  This avoids a
+	 * deadlock when deleting namespaces using the passthrough interface.
+	 */
+	if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) {
+		struct nvme_ctrl *ctrl = ns->ctrl;
+		nvme_get_ctrl(ns->ctrl);
+		nvme_put_ns_from_disk(head, srcu_idx);
+		if (cmd == NVME_IOCTL_ADMIN_CMD)
+			ret = nvme_user_cmd(ctrl, NULL, argp);
 		else
-		ret = nvme_ns_ioctl(ns, cmd, arg);
+			ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+		nvme_put_ctrl(ctrl);
+		return ret;
+	}
+	switch (cmd) {
+	case NVME_IOCTL_ID:
+		force_successful_syscall_return();
+		ret = ns->head->ns_id;
+		break;
+	case NVME_IOCTL_IO_CMD:
+		ret = nvme_user_cmd(ns->ctrl, ns, argp);
+		break;
+	case NVME_IOCTL_SUBMIT_IO:
+		ret = nvme_submit_io(ns, argp);
+		break;
+	default:
+		if (ns->ndev)
+			ret = nvme_nvm_ioctl(ns, cmd, arg);
+		else
+			ret = -ENOTTY;
+	}
 	nvme_put_ns_from_disk(head, srcu_idx);
 	return ret;
 }
@@ -2557,6 +2576,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->oacs = le16_to_cpu(id->oacs);
 	ctrl->oncs = le16_to_cpu(id->oncs);
+	ctrl->mtfa = le16_to_cpu(id->mtfa);
 	ctrl->oaes = le32_to_cpu(id->oaes);
 	atomic_set(&ctrl->abort_limit, id->acl + 1);
 	ctrl->vwc = id->vwc;
@@ -3681,6 +3701,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
 {
+	dev_pm_qos_hide_latency_tolerance(ctrl->device);
 	cdev_device_del(&ctrl->cdev, ctrl->device);
 }
 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
@@ -3880,6 +3901,18 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
 /*
 * Check we didn't inadvertently grow the command structure sizes:
 */

--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -441,6 +441,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_unfreeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);

--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -464,7 +464,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 		 * affinity), so use the regular blk-mq cpu mapping
 		 */
 		map->queue_offset = qoff;
-		if (i != HCTX_TYPE_POLL)
+		if (i != HCTX_TYPE_POLL && offset)
 			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
 		else
 			blk_mq_map_queues(map);
@@ -1257,7 +1257,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *abort_req;
 	struct nvme_command cmd;
-	bool shutdown = false;
 	u32 csts = readl(dev->bar + NVME_REG_CSTS);
 	/* If PCI error recovery process is happening, we cannot reset or
@@ -1294,17 +1293,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 * shutdown, so we return BLK_EH_DONE.
 	 */
 	switch (dev->ctrl.state) {
-	case NVME_CTRL_DELETING:
-		shutdown = true;
-		/* fall through */
 	case NVME_CTRL_CONNECTING:
-	case NVME_CTRL_RESETTING:
+		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+		/* fall through */
+	case NVME_CTRL_DELETING:
 		dev_warn_ratelimited(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, shutdown);
+		nvme_dev_disable(dev, true);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		return BLK_EH_DONE;
+	case NVME_CTRL_RESETTING:
+		return BLK_EH_RESET_TIMER;
 	default:
 		break;
 	}
@@ -2376,7 +2376,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 {
-	bool dead = true;
+	bool dead = true, freeze = false;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	mutex_lock(&dev->shutdown_lock);
@@ -2384,8 +2384,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING)
+		    dev->ctrl.state == NVME_CTRL_RESETTING) {
+			freeze = true;
 			nvme_start_freeze(&dev->ctrl);
+		}
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
 			pdev->error_state  != pci_channel_io_normal);
 	}
@@ -2394,10 +2396,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	 * Give the controller a chance to complete all entered requests if
 	 * doing a safe shutdown.
 	 */
-	if (!dead) {
+	if (!dead && shutdown && freeze)
-		if (shutdown)
 		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-	}
 	nvme_stop_queues(&dev->ctrl);
@@ -2492,6 +2492,7 @@ static void nvme_reset_work(struct work_struct *work)
 	 */
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
 	mutex_lock(&dev->shutdown_lock);
 	result = nvme_pci_enable(dev);

--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -210,7 +210,7 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
 {
 	if (count != 1) {
 		bio->bi_flags |= (1 << BIO_REFFED);
-		smp_mb__before_atomic();
+		smp_mb();
 	}
 	atomic_set(&bio->__bi_cnt, count);
 }

--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -159,13 +159,6 @@ struct bio {
 	 */
 	unsigned int		bi_phys_segments;
-	/*
-	 * To keep track of the max segment size, we account for the
-	 * sizes of the first and last mergeable segments in this bio.
-	 */
-	unsigned int		bi_seg_front_size;
-	unsigned int		bi_seg_back_size;
 	struct bvec_iter	bi_iter;
 	atomic_t		__bi_remaining;

--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -542,7 +542,7 @@ struct request_queue {
 	struct list_head	unused_hctx_list;
 	spinlock_t		unused_hctx_lock;
-	atomic_t		mq_freeze_depth;
+	int			mq_freeze_depth;
 #if defined(CONFIG_BLK_DEV_BSG)
 	struct bsg_class_device bsg_dev;
@@ -554,6 +554,11 @@ struct request_queue {
 #endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
+	/*
+	 * Protect concurrent access to q_usage_counter by
+	 * percpu_ref_kill() and percpu_ref_reinit().
+	 */
+	struct mutex		mq_freeze_lock;
 	struct percpu_ref	q_usage_counter;
 	struct blk_mq_tag_set	*tag_set;

--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -435,7 +435,7 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
 		 * to ensure that the batch size is updated before the wait
 		 * counts.
 		 */
-		smp_mb__before_atomic();
+		smp_mb();
 		for (i = 0; i < SBQ_WAIT_QUEUES; i++)
 			atomic_set(&sbq->ws[i].wait_cnt, 1);
 	}

--- a/tools/io_uring/Makefile
+++ b/tools/io_uring/Makefile
@@ -8,7 +8,7 @@ all: io_uring-cp io_uring-bench
 	$(CC) $(CFLAGS) -o $@ $^
 io_uring-bench: syscall.o io_uring-bench.o
-	$(CC) $(CFLAGS) $(LDLIBS) -o $@ $^
+	$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)
 io_uring-cp: setup.o syscall.o queue.o

--- a/tools/io_uring/io_uring-cp.c
+++ b/tools/io_uring/io_uring-cp.c
@@ -13,6 +13,7 @@
 #include <assert.h>
 #include <errno.h>
 #include <inttypes.h>
+#include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
@@ -85,11 +86,16 @@ static int queue_read(struct io_uring *ring, off_t size, off_t offset)
 	struct io_uring_sqe *sqe;
 	struct io_data *data;
+	data = malloc(size + sizeof(*data));
+	if (!data)
+		return 1;
 	sqe = io_uring_get_sqe(ring);
-	if (!sqe)
+	if (!sqe) {
+		free(data);
 		return 1;
+	}
-	data = malloc(size + sizeof(*data));
 	data->read = 1;
 	data->offset = data->first_offset = offset;
@@ -166,22 +172,23 @@ static int copy_file(struct io_uring *ring, off_t insize)
 			struct io_data *data;
 			if (!got_comp) {
-				ret = io_uring_wait_completion(ring, &cqe);
+				ret = io_uring_wait_cqe(ring, &cqe);
 				got_comp = 1;
 			} else
-				ret = io_uring_get_completion(ring, &cqe);
+				ret = io_uring_peek_cqe(ring, &cqe);
 			if (ret < 0) {
-				fprintf(stderr, "io_uring_get_completion: %s\n",
+				fprintf(stderr, "io_uring_peek_cqe: %s\n",
 							strerror(-ret));
 				return 1;
 			}
 			if (!cqe)
 				break;
-			data = (struct io_data *) (uintptr_t) cqe->user_data;
+			data = io_uring_cqe_get_data(cqe);
 			if (cqe->res < 0) {
 				if (cqe->res == -EAGAIN) {
 					queue_prepped(ring, data);
+					io_uring_cqe_seen(ring, cqe);
 					continue;
 				}
 				fprintf(stderr, "cqe failed: %s\n",
@@ -193,6 +200,7 @@ static int copy_file(struct io_uring *ring, off_t insize)
 				data->iov.iov_len -= cqe->res;
 				data->offset += cqe->res;
 				queue_prepped(ring, data);
+				io_uring_cqe_seen(ring, cqe);
 				continue;
 			}
@@ -209,6 +217,7 @@ static int copy_file(struct io_uring *ring, off_t insize)
 				free(data);
 				writes--;
 			}
+			io_uring_cqe_seen(ring, cqe);
 		}
 	}

--- a/tools/io_uring/liburing.h
+++ b/tools/io_uring/liburing.h
 #ifndef LIB_URING_H
 #define LIB_URING_H
+#ifdef __cplusplus
+extern "C" {
+#endif
 #include <sys/uio.h>
 #include <signal.h>
 #include <string.h>
 #include "../../include/uapi/linux/io_uring.h"
+#include <inttypes.h>
+#include "barrier.h"
 /*
 * Library interface to io_uring
@@ -46,7 +52,7 @@ struct io_uring {
 * System calls
 */
 extern int io_uring_setup(unsigned entries, struct io_uring_params *p);
-extern int io_uring_enter(unsigned fd, unsigned to_submit,
+extern int io_uring_enter(int fd, unsigned to_submit,
 	unsigned min_complete, unsigned flags, sigset_t *sig);
 extern int io_uring_register(int fd, unsigned int opcode, void *arg,
 	unsigned int nr_args);
@@ -59,13 +65,32 @@ extern int io_uring_queue_init(unsigned entries, struct io_uring *ring,
 extern int io_uring_queue_mmap(int fd, struct io_uring_params *p,
 	struct io_uring *ring);
 extern void io_uring_queue_exit(struct io_uring *ring);
-extern int io_uring_get_completion(struct io_uring *ring,
+extern int io_uring_peek_cqe(struct io_uring *ring,
 	struct io_uring_cqe **cqe_ptr);
-extern int io_uring_wait_completion(struct io_uring *ring,
+extern int io_uring_wait_cqe(struct io_uring *ring,
 	struct io_uring_cqe **cqe_ptr);
 extern int io_uring_submit(struct io_uring *ring);
 extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
+/*
+ * Must be called after io_uring_{peek,wait}_cqe() after the cqe has
+ * been processed by the application.
+ */
+static inline void io_uring_cqe_seen(struct io_uring *ring,
+				     struct io_uring_cqe *cqe)
+{
+	if (cqe) {
+		struct io_uring_cq *cq = &ring->cq;
+		(*cq->khead)++;
+		/*
+		 * Ensure that the kernel sees our new head, the kernel has
+		 * the matching read barrier.
+		 */
+		write_barrier();
+	}
+}
 /*
 * Command prep helpers
 */
@@ -74,8 +99,14 @@ static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data)
 	sqe->user_data = (unsigned long) data;
 }
+static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe)
+{
+	return (void *) (uintptr_t) cqe->user_data;
+}
 static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
-				    void *addr, unsigned len, off_t offset)
+				    const void *addr, unsigned len,
+				    off_t offset)
 {
 	memset(sqe, 0, sizeof(*sqe));
 	sqe->opcode = op;
@@ -86,8 +117,8 @@ static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
 }
 static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd,
-				       struct iovec *iovecs, unsigned nr_vecs,
+				       const struct iovec *iovecs,
-				       off_t offset)
+				       unsigned nr_vecs, off_t offset)
 {
 	io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset);
 }
@@ -100,14 +131,14 @@ static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd,
 }
 static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd,
-				        struct iovec *iovecs, unsigned nr_vecs,
+					const struct iovec *iovecs,
-					off_t offset)
+					unsigned nr_vecs, off_t offset)
 {
 	io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);
 }
 static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd,
-					     void *buf, unsigned nbytes,
+					     const void *buf, unsigned nbytes,
 					     off_t offset)
 {
 	io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset);
@@ -131,13 +162,22 @@ static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe,
 }
 static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd,
-				       int datasync)
+				       unsigned fsync_flags)
 {
 	memset(sqe, 0, sizeof(*sqe));
 	sqe->opcode = IORING_OP_FSYNC;
 	sqe->fd = fd;
-	if (datasync)
+	sqe->fsync_flags = fsync_flags;
-		sqe->fsync_flags = IORING_FSYNC_DATASYNC;
 }
+static inline void io_uring_prep_nop(struct io_uring_sqe *sqe)
+{
+	memset(sqe, 0, sizeof(*sqe));
+	sqe->opcode = IORING_OP_NOP;
+}
+#ifdef __cplusplus
+}
+#endif
 #endif
--- a/tools/io_uring/queue.c
+++ b/tools/io_uring/queue.c
@@ -8,7 +8,7 @@
 #include "liburing.h"
 #include "barrier.h"
-static int __io_uring_get_completion(struct io_uring *ring,
+static int __io_uring_get_cqe(struct io_uring *ring,
 			      struct io_uring_cqe **cqe_ptr, int wait)
 {
 	struct io_uring_cq *cq = &ring->cq;
@@ -39,34 +39,25 @@ static int __io_uring_get_completion(struct io_uring *ring,
 			return -errno;
 	} while (1);
-	if (*cqe_ptr) {
-		*cq->khead = head + 1;
-		/*
-		 * Ensure that the kernel sees our new head, the kernel has
-		 * the matching read barrier.
-		 */
-		write_barrier();
-	}
 	return 0;
 }
 /*
- * Return an IO completion, if one is readily available
+ * Return an IO completion, if one is readily available. Returns 0 with
+ * cqe_ptr filled in on success, -errno on failure.
 */
-int io_uring_get_completion(struct io_uring *ring,
+int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
-			    struct io_uring_cqe **cqe_ptr)
 {
-	return __io_uring_get_completion(ring, cqe_ptr, 0);
+	return __io_uring_get_cqe(ring, cqe_ptr, 0);
 }
 /*
- * Return an IO completion, waiting for it if necessary
+ * Return an IO completion, waiting for it if necessary. Returns 0 with
+ * cqe_ptr filled in on success, -errno on failure.
 */
-int io_uring_wait_completion(struct io_uring *ring,
+int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
-			     struct io_uring_cqe **cqe_ptr)
 {
-	return __io_uring_get_completion(ring, cqe_ptr, 1);
+	return __io_uring_get_cqe(ring, cqe_ptr, 1);
 }
 /*
@@ -78,7 +69,7 @@ int io_uring_submit(struct io_uring *ring)
 {
 	struct io_uring_sq *sq = &ring->sq;
 	const unsigned mask = *sq->kring_mask;
-	unsigned ktail, ktail_next, submitted;
+	unsigned ktail, ktail_next, submitted, to_submit;
 	int ret;
 	/*
@@ -100,7 +91,8 @@ int io_uring_submit(struct io_uring *ring)
 	 */
 	submitted = 0;
 	ktail = ktail_next = *sq->ktail;
-	while (sq->sqe_head < sq->sqe_tail) {
+	to_submit = sq->sqe_tail - sq->sqe_head;
+	while (to_submit--) {
 		ktail_next++;
 		read_barrier();
@@ -136,7 +128,7 @@ int io_uring_submit(struct io_uring *ring)
 	if (ret < 0)
 		return -errno;
-	return 0;
+	return ret;
 }
 /*

--- a/tools/io_uring/setup.c
+++ b/tools/io_uring/setup.c
@@ -27,7 +27,7 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
 	sq->kdropped = ptr + p->sq_off.dropped;
 	sq->array = ptr + p->sq_off.array;
-	size = p->sq_entries * sizeof(struct io_uring_sqe),
+	size = p->sq_entries * sizeof(struct io_uring_sqe);
 	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd,
 				IORING_OFF_SQES);
@@ -79,7 +79,7 @@ int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring
 int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags)
 {
 	struct io_uring_params p;
-	int fd;
+	int fd, ret;
 	memset(&p, 0, sizeof(p));
 	p.flags = flags;
@@ -88,7 +88,11 @@ int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags)
 	if (fd < 0)
 		return fd;
-	return io_uring_queue_mmap(fd, &p, ring);
+	ret = io_uring_queue_mmap(fd, &p, ring);
+	if (ret)
+		close(fd);
+	return ret;
 }
 void io_uring_queue_exit(struct io_uring *ring)

--- a/tools/io_uring/syscall.c
+++ b/tools/io_uring/syscall.c
@@ -7,34 +7,46 @@
 #include <signal.h>
 #include "liburing.h"
-#if defined(__x86_64) || defined(__i386__)
+#ifdef __alpha__
-#ifndef __NR_sys_io_uring_setup
+/*
-#define __NR_sys_io_uring_setup		425
+ * alpha is the only exception, all other architectures
-#endif
+ * have common numbers for new system calls.
-#ifndef __NR_sys_io_uring_enter
+ */
-#define __NR_sys_io_uring_enter		426
+# ifndef __NR_io_uring_setup
-#endif
+#  define __NR_io_uring_setup		535
-#ifndef __NR_sys_io_uring_register
+# endif
-#define __NR_sys_io_uring_register	427
+# ifndef __NR_io_uring_enter
-#endif
+#  define __NR_io_uring_enter		536
-#else
+# endif
-#error "Arch not supported yet"
+# ifndef __NR_io_uring_register
+#  define __NR_io_uring_register	537
+# endif
+#else /* !__alpha__ */
+# ifndef __NR_io_uring_setup
+#  define __NR_io_uring_setup		425
+# endif
+# ifndef __NR_io_uring_enter
+#  define __NR_io_uring_enter		426
+# endif
+# ifndef __NR_io_uring_register
+#  define __NR_io_uring_register	427
+# endif
 #endif
 int io_uring_register(int fd, unsigned int opcode, void *arg,
 		      unsigned int nr_args)
 {
-	return syscall(__NR_sys_io_uring_register, fd, opcode, arg, nr_args);
+	return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
 }
-int io_uring_setup(unsigned entries, struct io_uring_params *p)
+int io_uring_setup(unsigned int entries, struct io_uring_params *p)
 {
-	return syscall(__NR_sys_io_uring_setup, entries, p);
+	return syscall(__NR_io_uring_setup, entries, p);
 }
-int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete,
+int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete,
-		   unsigned flags, sigset_t *sig)
+		   unsigned int flags, sigset_t *sig)
 {
-	return syscall(__NR_sys_io_uring_enter, fd, to_submit, min_complete,
+	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
 			flags, sig, _NSIG / 8);
 }