Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe: "It's been a few weeks, so here's a small collection of fixes that should go into the current series. This contains: - NVMe pull request from Christoph, with a few important fixes. - kyber hang fix from Omar. - A blk-throttl fix from Shaohua, fixing a case where we double charge a bio. - Two call_single_data alignment fixes from me, fixing up some unfortunate changes that went into 4.14 without being properly reviewed on the block side (since nobody was CC'ed on the patch...). - A bounce buffer fix in two parts, one from me and one from Ming. - Revert bdi debug error handling patch. It's causing boot issues for some folks, and a week down the line, we're still no closer to a fix. Revert this patch for now until it's figured out, then we can retry for 4.16" * 'for-linus' of git://git.kernel.dk/linux-block: Revert "bdi: add error handle for bdi_debug_register" null_blk: unalign call_single_data block: unalign call_single_data in struct request block-throttle: avoid double charge block: fix blk_rq_append_bio block: don't let passthrough IO go into .make_request_fn() nvme: setup streams after initializing namespace head nvme: check hw sectors before setting chunk sectors nvme: call blk_integrity_unregister after queue is cleaned up nvme-fc: remove double put reference if admin connect fails nvme: set discard_alignment to zero kyber: fix another domain token wait queue hang

Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe: "It's been a few weeks, so here's a small collection of fixes that should go into the current series. This contains: - NVMe pull request from Christoph, with a few important fixes. - kyber hang fix from Omar. - A blk-throttl fix from Shaohua, fixing a case where we double charge a bio. - Two call_single_data alignment fixes from me, fixing up some unfortunate changes that went into 4.14 without being properly reviewed on the block side (since nobody was CC'ed on the patch...). - A bounce buffer fix in two parts, one from me and one from Ming. - Revert bdi debug error handling patch. It's causing boot issues for some folks, and a week down the line, we're still no closer to a fix. Revert this patch for now until it's figured out, then we can retry for 4.16" * 'for-linus' of git://git.kernel.dk/linux-block: Revert "bdi: add error handle for bdi_debug_register" null_blk: unalign call_single_data block: unalign call_single_data in struct request block-throttle: avoid double charge block: fix blk_rq_append_bio block: don't let passthrough IO go into .make_request_fn() nvme: setup streams after initializing namespace head nvme: check hw sectors before setting chunk sectors nvme: call blk_integrity_unregister after queue is cleaned up nvme-fc: remove double put reference if admin connect fails nvme: set discard_alignment to zero kyber: fix another domain token wait queue hang
9035a896 · Linus Torvalds · 409232a4 · 6d0e4827 · 9035a896 · 9035a896
Commit 9035a896 authored Dec 21, 2017 by Linus Torvalds
14 changed files
--- a/block/bio.c
+++ b/block/bio.c
@@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_disk = bio_src->bi_disk;
 	bio->bi_partno = bio_src->bi_partno;
 	bio_set_flag(bio, BIO_CLONED);
+	if (bio_flagged(bio_src, BIO_THROTTLED))
+		bio_set_flag(bio, BIO_THROTTLED);
 	bio->bi_opf = bio_src->bi_opf;
 	bio->bi_write_hint = bio_src->bi_write_hint;
 	bio->bi_iter = bio_src->bi_iter;

--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -12,22 +12,29 @@
 #include "blk.h"

 /*
- * Append a bio to a passthrough request.  Only works can be merged into
- * the request based on the driver constraints.
+ * Append a bio to a passthrough request.  Only works if the bio can be merged
+ * into the request based on the driver constraints.
 */
-int blk_rq_append_bio(struct request *rq, struct bio *bio)
+int blk_rq_append_bio(struct request *rq, struct bio **bio)
 {
-	blk_queue_bounce(rq->q, &bio);
+	struct bio *orig_bio = *bio;
+
+	blk_queue_bounce(rq->q, bio);

 	if (!rq->bio) {
-		blk_rq_bio_prep(rq->q, rq, bio);
+		blk_rq_bio_prep(rq->q, rq, *bio);
 	} else {
-		if (!ll_back_merge_fn(rq->q, rq, bio))
+		if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+			if (orig_bio != *bio) {
+				bio_put(*bio);
+				*bio = orig_bio;
+			}
 			return -EINVAL;
+		}

-		rq->biotail->bi_next = bio;
-		rq->biotail = bio;
-		rq->__data_len += bio->bi_iter.bi_size;
+		rq->biotail->bi_next = *bio;
+		rq->biotail = *bio;
+		rq->__data_len += (*bio)->bi_iter.bi_size;
 	}

 	return 0;
@@ -73,14 +80,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
 	 * We link the bounce buffer in and could have to traverse it
 	 * later so we have to get a ref to prevent it from being freed
 	 */
-	ret = blk_rq_append_bio(rq, bio);
-	bio_get(bio);
+	ret = blk_rq_append_bio(rq, &bio);
 	if (ret) {
-		bio_endio(bio);
 		__blk_rq_unmap_user(orig_bio);
-		bio_put(bio);
 		return ret;
 	}
+	bio_get(bio);

 	return 0;
 }
@@ -213,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	int reading = rq_data_dir(rq) == READ;
 	unsigned long addr = (unsigned long) kbuf;
 	int do_copy = 0;
-	struct bio *bio;
+	struct bio *bio, *orig_bio;
 	int ret;

 	if (len > (queue_max_hw_sectors(q) << 9))
@@ -236,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (do_copy)
 		rq->rq_flags |= RQF_COPY_USER;

-	ret = blk_rq_append_bio(rq, bio);
+	orig_bio = bio;
+	ret = blk_rq_append_bio(rq, &bio);
 	if (unlikely(ret)) {
 		/* request is too big */
-		bio_put(bio);
+		bio_put(orig_bio);
 		return ret;
 	}


--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2226,13 +2226,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
 out:
-	/*
-	 * As multiple blk-throtls may stack in the same issue path, we
-	 * don't want bios to leave with the flag set.  Clear the flag if
-	 * being issued.
-	 */
-	if (!throttled)
-		bio_clear_flag(bio, BIO_THROTTLED);
+	bio_set_flag(bio, BIO_THROTTLED);

 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (throttled || !td->track_bio_latency)

--- a/block/bounce.c
+++ b/block/bounce.c
@@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	unsigned i = 0;
 	bool bounce = false;
 	int sectors = 0;
+	bool passthrough = bio_is_passthrough(*bio_orig);

 	bio_for_each_segment(from, *bio_orig, iter) {
 		if (i++ < BIO_MAX_PAGES)
@@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	if (!bounce)
 		return;

-	if (sectors < bio_sectors(*bio_orig)) {
+	if (!passthrough && sectors < bio_sectors(*bio_orig)) {
 		bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
 		bio_chain(bio, *bio_orig);
 		generic_make_request(*bio_orig);
 		*bio_orig = bio;
 	}
-	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
+	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+			bounce_bio_set);

 	bio_for_each_segment_all(to, bio, i) {
 		struct page *page = to->bv_page;

--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -100,9 +100,13 @@ struct kyber_hctx_data {
 	unsigned int cur_domain;
 	unsigned int batching;
 	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
+	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 	atomic_t wait_index[KYBER_NUM_DOMAINS];
 };

+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
+			     void *key);
+
 static int rq_sched_domain(const struct request *rq)
 {
 	unsigned int op = rq->cmd_flags;
@@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)

 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		INIT_LIST_HEAD(&khd->rqs[i]);
+		init_waitqueue_func_entry(&khd->domain_wait[i],
+					  kyber_domain_wake);
+		khd->domain_wait[i].private = hctx;
 		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 		atomic_set(&khd->wait_index[i], 0);
 	}
@@ -524,35 +531,39 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 	int nr;

 	nr = __sbitmap_queue_get(domain_tokens);
-	if (nr >= 0)
-		return nr;

 	/*
 	 * If we failed to get a domain token, make sure the hardware queue is
 	 * run when one becomes available. Note that this is serialized on
 	 * khd->lock, but we still need to be careful about the waker.
 	 */
-	if (list_empty_careful(&wait->entry)) {
-		init_waitqueue_func_entry(wait, kyber_domain_wake);
-		wait->private = hctx;
+	if (nr < 0 && list_empty_careful(&wait->entry)) {
 		ws = sbq_wait_ptr(domain_tokens,
 				  &khd->wait_index[sched_domain]);
+		khd->domain_ws[sched_domain] = ws;
 		add_wait_queue(&ws->wait, wait);

 		/*
 		 * Try again in case a token was freed before we got on the wait
-		 * queue. The waker may have already removed the entry from the
-		 * wait queue, but list_del_init() is okay with that.
+		 * queue.
 		 */
 		nr = __sbitmap_queue_get(domain_tokens);
-		if (nr >= 0) {
-			unsigned long flags;
+	}

-			spin_lock_irqsave(&ws->wait.lock, flags);
+	/*
+	 * If we got a token while we were on the wait queue, remove ourselves
+	 * from the wait queue to ensure that all wake ups make forward
+	 * progress. It's possible that the waker already deleted the entry
+	 * between the !list_empty_careful() check and us grabbing the lock, but
+	 * list_del_init() is okay with that.
+	 */
+	if (nr >= 0 && !list_empty_careful(&wait->entry)) {
+		ws = khd->domain_ws[sched_domain];
+		spin_lock_irq(&ws->wait.lock);
 		list_del_init(&wait->entry);
-			spin_unlock_irqrestore(&ws->wait.lock, flags);
-		}
+		spin_unlock_irq(&ws->wait.lock);
 	}
+
 	return nr;
 }


--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -35,13 +35,13 @@ static inline u64 mb_per_tick(int mbps)
 struct nullb_cmd {
 	struct list_head list;
 	struct llist_node ll_list;
-	call_single_data_t csd;
+	struct __call_single_data csd;
 	struct request *rq;
 	struct bio *bio;
 	unsigned int tag;
+	blk_status_t error;
 	struct nullb_queue *nq;
 	struct hrtimer timer;
-	blk_status_t error;
 };

 struct nullb_queue {

--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1287,7 +1287,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl,
 	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
 			NVME_DSM_MAX_RANGES);

-	queue->limits.discard_alignment = size;
+	queue->limits.discard_alignment = 0;
 	queue->limits.discard_granularity = size;

 	blk_queue_max_discard_sectors(queue, UINT_MAX);
@@ -1705,7 +1705,8 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
 		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
 	}
-	if (ctrl->quirks & NVME_QUIRK_STRIPE_SIZE)
+	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+	    is_power_of_2(ctrl->max_hw_sectors))
 		blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
 	blk_queue_virt_boundary(q, ctrl->page_size - 1);
 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
@@ -2869,7 +2870,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)

 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
 	nvme_set_queue_limits(ctrl, ns->queue);
-	nvme_setup_streams_ns(ctrl, ns);

 	id = nvme_identify_ns(ctrl, nsid);
 	if (!id)
@@ -2880,6 +2880,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)

 	if (nvme_init_ns_head(ns, nsid, id, &new))
 		goto out_free_id;
+	nvme_setup_streams_ns(ctrl, ns);
 	
 #ifdef CONFIG_NVME_MULTIPATH
 	/*
@@ -2965,8 +2966,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 		return;

 	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
-		if (blk_get_integrity(ns->disk))
-			blk_integrity_unregister(ns->disk);
 		nvme_mpath_remove_disk_links(ns);
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_id_attr_group);
@@ -2974,6 +2973,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 			nvme_nvm_unregister_sysfs(ns);
 		del_gendisk(ns->disk);
 		blk_cleanup_queue(ns->queue);
+		if (blk_get_integrity(ns->disk))
+			blk_integrity_unregister(ns->disk);
 	}

 	mutex_lock(&ns->ctrl->subsys->lock);

--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3221,7 +3221,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,

 		/* initiate nvme ctrl ref counting teardown */
 		nvme_uninit_ctrl(&ctrl->ctrl);
-		nvme_put_ctrl(&ctrl->ctrl);

 		/* Remove core ctrl ref. */
 		nvme_put_ctrl(&ctrl->ctrl);

--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1576,7 +1576,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
 		return req;

 	for_each_bio(bio) {
-		ret = blk_rq_append_bio(req, bio);
+		struct bio *bounce_bio = bio;
+
+		ret = blk_rq_append_bio(req, &bounce_bio);
 		if (ret)
 			return ERR_PTR(ret);
 	}

--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -920,7 +920,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 					" %d i: %d bio: %p, allocating another"
 					" bio\n", bio->bi_vcnt, i, bio);

-				rc = blk_rq_append_bio(req, bio);
+				rc = blk_rq_append_bio(req, &bio);
 				if (rc) {
 					pr_err("pSCSI: failed to append bio\n");
 					goto fail;
@@ -938,7 +938,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	}

 	if (bio) {
-		rc = blk_rq_append_bio(req, bio);
+		rc = blk_rq_append_bio(req, &bio);
 		if (rc) {
 			pr_err("pSCSI: failed to append bio\n");
 			goto fail;

--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -492,6 +492,8 @@ extern unsigned int bvec_nr_vecs(unsigned short idx);

 #define bio_set_dev(bio, bdev) 			\
 do {						\
+	if ((bio)->bi_disk != (bdev)->bd_disk)	\
+		bio_clear_flag(bio, BIO_THROTTLED);\
 	(bio)->bi_disk = (bdev)->bd_disk;	\
 	(bio)->bi_partno = (bdev)->bd_partno;	\
 } while (0)

--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -50,8 +50,6 @@ struct blk_issue_stat {
 struct bio {
 	struct bio		*bi_next;	/* request queue link */
 	struct gendisk		*bi_disk;
-	u8			bi_partno;
-	blk_status_t		bi_status;
 	unsigned int		bi_opf;		/* bottom bits req flags,
 						 * top bits REQ_OP. Use
 						 * accessors.
@@ -59,8 +57,8 @@ struct bio {
 	unsigned short		bi_flags;	/* status, etc and bvec pool number */
 	unsigned short		bi_ioprio;
 	unsigned short		bi_write_hint;
-
-	struct bvec_iter	bi_iter;
+	blk_status_t		bi_status;
+	u8			bi_partno;

 	/* Number of segments in this BIO after
 	 * physical address coalescing is performed.
@@ -74,8 +72,9 @@ struct bio {
 	unsigned int		bi_seg_front_size;
 	unsigned int		bi_seg_back_size;

-	atomic_t		__bi_remaining;
+	struct bvec_iter	bi_iter;

+	atomic_t		__bi_remaining;
 	bio_end_io_t		*bi_end_io;

 	void			*bi_private;

--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -135,7 +135,7 @@ typedef __u32 __bitwise req_flags_t;
 struct request {
 	struct list_head queuelist;
 	union {
-		call_single_data_t csd;
+		struct __call_single_data csd;
 		u64 fifo_time;
 	};

@@ -241,14 +241,24 @@ struct request {
 	struct request *next_rq;
 };

+static inline bool blk_op_is_scsi(unsigned int op)
+{
+	return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
+}
+
+static inline bool blk_op_is_private(unsigned int op)
+{
+	return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
+}
+
 static inline bool blk_rq_is_scsi(struct request *rq)
 {
-	return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
+	return blk_op_is_scsi(req_op(rq));
 }

 static inline bool blk_rq_is_private(struct request *rq)
 {
-	return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
+	return blk_op_is_private(req_op(rq));
 }

 static inline bool blk_rq_is_passthrough(struct request *rq)
@@ -256,6 +266,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq)
 	return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
 }

+static inline bool bio_is_passthrough(struct bio *bio)
+{
+	unsigned op = bio_op(bio);
+
+	return blk_op_is_scsi(op) || blk_op_is_private(op);
+}
+
 static inline unsigned short req_get_ioprio(struct request *req)
 {
 	return req->ioprio;
@@ -948,7 +965,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
-extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
+extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_queue_split(struct request_queue *, struct bio **);
 extern void blk_recount_segments(struct request_queue *, struct bio *);

--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -882,13 +882,10 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
 	if (IS_ERR(dev))
 		return PTR_ERR(dev);

-	if (bdi_debug_register(bdi, dev_name(dev))) {
-		device_destroy(bdi_class, dev->devt);
-		return -ENOMEM;
-	}
 	cgwb_bdi_register(bdi);
 	bdi->dev = dev;

+	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(WB_registered, &bdi->wb.state);

 	spin_lock_bh(&bdi_lock);