Merge tag 'for-4.15/dm' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer: - a few conversions from atomic_t to ref_count_t - a DM core fix for a race during device destruction that could result in a BUG_ON - a stable@ fix for a DM cache race condition that could lead to data corruption when operating in writeback mode (writethrough is default) - various DM cache cleanups and improvements - add DAX support to the DM log-writes target - a fix for the DM zoned target's ability to deal with the last zone of the drive being smaller than all others - a stable@ DM crypt and DM integrity fix for a negative check that was to restrictive (prevented slab debug with XFS ontop of DM crypt from working) - a DM raid target fix for a panic that can occur when forcing a raid to sync * tag 'for-4.15/dm' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (25 commits) dm cache: lift common migration preparation code to alloc_migration() dm cache: remove usused deferred_cells member from struct cache dm cache policy smq: allocate cache blocks in order dm cache policy smq: change max background work from 10240 to 4096 blocks dm cache background tracker: limit amount of background work that may be issued at once dm cache policy smq: take origin idle status into account when queuing writebacks dm cache policy smq: handle races with queuing background_work dm raid: fix panic when attempting to force a raid to sync dm integrity: allow unaligned bv_offset dm crypt: allow unaligned bv_offset dm: small cleanup in dm_get_md() dm: fix race between dm_get_from_kobject() and __dm_destroy() dm: allocate struct mapped_device with kvzalloc dm zoned: ignore last smaller runt zone dm space map metadata: use ARRAY_SIZE dm log writes: add support for DAX dm log writes: add support for inline data buffers dm cache: simplify get_per_bio_data() by removing data_size argument dm cache: remove all obsolete writethrough-specific code dm cache: submit writethrough writes in parallel to origin and cache ...

Merge tag 'for-4.15/dm' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - a few conversions from atomic_t to ref_count_t - a DM core fix for a race during device destruction that could result in a BUG_ON - a stable@ fix for a DM cache race condition that could lead to data corruption when operating in writeback mode (writethrough is default) - various DM cache cleanups and improvements - add DAX support to the DM log-writes target - a fix for the DM zoned target's ability to deal with the last zone of the drive being smaller than all others - a stable@ DM crypt and DM integrity fix for a negative check that was to restrictive (prevented slab debug with XFS ontop of DM crypt from working) - a DM raid target fix for a panic that can occur when forcing a raid to sync * tag 'for-4.15/dm' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (25 commits) dm cache: lift common migration preparation code to alloc_migration() dm cache: remove usused deferred_cells member from struct cache dm cache policy smq: allocate cache blocks in order dm cache policy smq: change max background work from 10240 to 4096 blocks dm cache background tracker: limit amount of background work that may be issued at once dm cache policy smq: take origin idle status into account when queuing writebacks dm cache policy smq: handle races with queuing background_work dm raid: fix panic when attempting to force a raid to sync dm integrity: allow unaligned bv_offset dm crypt: allow unaligned bv_offset dm: small cleanup in dm_get_md() dm: fix race between dm_get_from_kobject() and __dm_destroy() dm: allocate struct mapped_device with kvzalloc dm zoned: ignore last smaller runt zone dm space map metadata: use ARRAY_SIZE dm log writes: add support for DAX dm log writes: add support for inline data buffers dm cache: simplify get_per_bio_data() by removing data_size argument dm cache: remove all obsolete writethrough-specific code dm cache: submit writethrough writes in parallel to origin and cache ...
b91593fa · Linus Torvalds · e2c5923c · ef7afb36 · b91593fa · b91593fa
Commit b91593fa authored Nov 14, 2017 by Linus Torvalds
14 changed files
--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -161,8 +161,17 @@ EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
 static bool max_work_reached(struct background_tracker *b)
 {
-	// FIXME: finish
+	return atomic_read(&b->pending_promotes) +
-	return false;
+		atomic_read(&b->pending_writebacks) +
+		atomic_read(&b->pending_demotes) >= b->max_work;
+}
+struct bt_work *alloc_work(struct background_tracker *b)
+{
+	if (max_work_reached(b))
+		return NULL;
+	return kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
 }
 int btracker_queue(struct background_tracker *b,
@@ -174,10 +183,7 @@ int btracker_queue(struct background_tracker *b,
 	if (pwork)
 		*pwork = NULL;
-	if (max_work_reached(b))
+	w = alloc_work(b);
-		return -ENOMEM;
-	w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
 	if (!w)
 		return -ENOMEM;

--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -13,6 +13,7 @@
 #include "persistent-data/dm-transaction-manager.h"
 #include <linux/device-mapper.h>
+#include <linux/refcount.h>
 /*----------------------------------------------------------------*/
@@ -100,7 +101,7 @@ struct cache_disk_superblock {
 } __packed;
 struct dm_cache_metadata {
-	atomic_t ref_count;
+	refcount_t ref_count;
 	struct list_head list;
 	unsigned version;
@@ -753,7 +754,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
 	}
 	cmd->version = metadata_version;
-	atomic_set(&cmd->ref_count, 1);
+	refcount_set(&cmd->ref_count, 1);
 	init_rwsem(&cmd->root_lock);
 	cmd->bdev = bdev;
 	cmd->data_block_size = data_block_size;
@@ -791,7 +792,7 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev)
 	list_for_each_entry(cmd, &table, list)
 		if (cmd->bdev == bdev) {
-			atomic_inc(&cmd->ref_count);
+			refcount_inc(&cmd->ref_count);
 			return cmd;
 		}
@@ -862,7 +863,7 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
 void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
 {
-	if (atomic_dec_and_test(&cmd->ref_count)) {
+	if (refcount_dec_and_test(&cmd->ref_count)) {
 		mutex_lock(&table_lock);
 		list_del(&cmd->list);
 		mutex_unlock(&table_lock);

--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -213,6 +213,19 @@ static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
 		l->nr_elts--;
 }
+static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
+{
+	struct entry *e;
+	for (e = l_head(es, l); e; e = l_next(es, e))
+		if (!e->sentinel) {
+			l_del(es, l, e);
+			return e;
+		}
+	return NULL;
+}
 static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
 {
 	struct entry *e;
@@ -719,7 +732,7 @@ static struct entry *alloc_entry(struct entry_alloc *ea)
 	if (l_empty(&ea->free))
 		return NULL;
-	e = l_pop_tail(ea->es, &ea->free);
+	e = l_pop_head(ea->es, &ea->free);
 	init_entry(e);
 	ea->nr_allocated++;
@@ -1158,13 +1171,13 @@ static void clear_pending(struct smq_policy *mq, struct entry *e)
 	e->pending_work = false;
 }
-static void queue_writeback(struct smq_policy *mq)
+static void queue_writeback(struct smq_policy *mq, bool idle)
 {
 	int r;
 	struct policy_work work;
 	struct entry *e;
-	e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed);
+	e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
 	if (e) {
 		mark_pending(mq, e);
 		q_del(&mq->dirty, e);
@@ -1174,12 +1187,16 @@ static void queue_writeback(struct smq_policy *mq)
 		work.cblock = infer_cblock(mq, e);
 		r = btracker_queue(mq->bg_work, &work, NULL);
-		WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
+		if (r) {
+			clear_pending(mq, e);
+			q_push_front(&mq->dirty, e);
+		}
 	}
 }
 static void queue_demotion(struct smq_policy *mq)
 {
+	int r;
 	struct policy_work work;
 	struct entry *e;
@@ -1189,7 +1206,7 @@ static void queue_demotion(struct smq_policy *mq)
 	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
 	if (!e) {
 		if (!clean_target_met(mq, true))
-			queue_writeback(mq);
+			queue_writeback(mq, false);
 		return;
 	}
@@ -1199,12 +1216,17 @@ static void queue_demotion(struct smq_policy *mq)
 	work.op = POLICY_DEMOTE;
 	work.oblock = e->oblock;
 	work.cblock = infer_cblock(mq, e);
-	btracker_queue(mq->bg_work, &work, NULL);
+	r = btracker_queue(mq->bg_work, &work, NULL);
+	if (r) {
+		clear_pending(mq, e);
+		q_push_front(&mq->clean, e);
+	}
 }
 static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
 			    struct policy_work **workp)
 {
+	int r;
 	struct entry *e;
 	struct policy_work work;
@@ -1234,7 +1256,9 @@ static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
 	work.op = POLICY_PROMOTE;
 	work.oblock = oblock;
 	work.cblock = infer_cblock(mq, e);
-	btracker_queue(mq->bg_work, &work, workp);
+	r = btracker_queue(mq->bg_work, &work, workp);
+	if (r)
+		free_entry(&mq->cache_alloc, e);
 }
 /*----------------------------------------------------------------*/
@@ -1418,7 +1442,7 @@ static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
 	r = btracker_issue(mq->bg_work, result);
 	if (r == -ENODATA) {
 		if (!clean_target_met(mq, idle)) {
-			queue_writeback(mq);
+			queue_writeback(mq, idle);
 			r = btracker_issue(mq->bg_work, result);
 		}
 	}
@@ -1778,7 +1802,7 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
 	mq->next_hotspot_period = jiffies;
 	mq->next_cache_period = jiffies;
-	mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
+	mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
 	if (!mq->bg_work)
 		goto bad_btracker;

--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -408,9 +408,7 @@ struct cache {
 	int sectors_per_block_shift;
 	spinlock_t lock;
-	struct list_head deferred_cells;
 	struct bio_list deferred_bios;
-	struct bio_list deferred_writethrough_bios;
 	sector_t migration_threshold;
 	wait_queue_head_t migration_wait;
 	atomic_t nr_allocated_migrations;
@@ -446,10 +444,10 @@ struct cache {
 	struct dm_kcopyd_client *copier;
 	struct workqueue_struct *wq;
 	struct work_struct deferred_bio_worker;
-	struct work_struct deferred_writethrough_worker;
 	struct work_struct migration_worker;
 	struct delayed_work waker;
 	struct dm_bio_prison_v2 *prison;
+	struct bio_set *bs;
 	mempool_t *migration_pool;
@@ -490,15 +488,6 @@ struct per_bio_data {
 	struct dm_bio_prison_cell_v2 *cell;
 	struct dm_hook_info hook_info;
 	sector_t len;
-	/*
-	 * writethrough fields.  These MUST remain at the end of this
-	 * structure and the 'cache' member must be the first as it
-	 * is used to determine the offset of the writethrough fields.
-	 */
-	struct cache *cache;
-	dm_cblock_t cblock;
-	struct dm_bio_details bio_details;
 };
 struct dm_cache_migration {
@@ -515,19 +504,19 @@ struct dm_cache_migration {
 /*----------------------------------------------------------------*/
-static bool writethrough_mode(struct cache_features *f)
+static bool writethrough_mode(struct cache *cache)
 {
-	return f->io_mode == CM_IO_WRITETHROUGH;
+	return cache->features.io_mode == CM_IO_WRITETHROUGH;
 }
-static bool writeback_mode(struct cache_features *f)
+static bool writeback_mode(struct cache *cache)
 {
-	return f->io_mode == CM_IO_WRITEBACK;
+	return cache->features.io_mode == CM_IO_WRITEBACK;
 }
-static inline bool passthrough_mode(struct cache_features *f)
+static inline bool passthrough_mode(struct cache *cache)
 {
-	return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
+	return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
 }
 /*----------------------------------------------------------------*/
@@ -537,14 +526,9 @@ static void wake_deferred_bio_worker(struct cache *cache)
 	queue_work(cache->wq, &cache->deferred_bio_worker);
 }
-static void wake_deferred_writethrough_worker(struct cache *cache)
-{
-	queue_work(cache->wq, &cache->deferred_writethrough_worker);
-}
 static void wake_migration_worker(struct cache *cache)
 {
-	if (passthrough_mode(&cache->features))
+	if (passthrough_mode(cache))
 		return;
 	queue_work(cache->wq, &cache->migration_worker);
@@ -567,10 +551,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
 	struct dm_cache_migration *mg;
 	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
-	if (mg) {
+	if (!mg)
-		mg->cache = cache;
+		return NULL;
-		atomic_inc(&mg->cache->nr_allocated_migrations);
-	}
+	memset(mg, 0, sizeof(*mg));
+	mg->cache = cache;
+	atomic_inc(&cache->nr_allocated_migrations);
 	return mg;
 }
@@ -618,27 +605,16 @@ static unsigned lock_level(struct bio *bio)
 * Per bio data
 *--------------------------------------------------------------*/
-/*
+static struct per_bio_data *get_per_bio_data(struct bio *bio)
- * If using writeback, leave out struct per_bio_data's writethrough fields.
- */
-#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
-#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
-static size_t get_per_bio_data_size(struct cache *cache)
 {
-	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
-}
-static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
-{
-	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
 	BUG_ON(!pb);
 	return pb;
 }
-static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
+static struct per_bio_data *init_per_bio_data(struct bio *bio)
 {
-	struct per_bio_data *pb = get_per_bio_data(bio, data_size);
+	struct per_bio_data *pb = get_per_bio_data(bio);
 	pb->tick = false;
 	pb->req_nr = dm_bio_get_target_bio_nr(bio);
@@ -678,7 +654,6 @@ static void defer_bios(struct cache *cache, struct bio_list *bios)
 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
 {
 	bool r;
-	size_t pb_size;
 	struct per_bio_data *pb;
 	struct dm_cell_key_v2 key;
 	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
@@ -703,8 +678,7 @@ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bi
 	if (cell != cell_prealloc)
 		free_prison_cell(cache, cell_prealloc);
-	pb_size = get_per_bio_data_size(cache);
+	pb = get_per_bio_data(bio);
-	pb = get_per_bio_data(bio, pb_size);
 	pb->cell = cell;
 	return r;
@@ -856,28 +830,35 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 {
 	unsigned long flags;
-	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb;
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	spin_lock_irqsave(&cache->lock, flags);
 	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
 	    bio_op(bio) != REQ_OP_DISCARD) {
+		pb = get_per_bio_data(bio);
 		pb->tick = true;
 		cache->need_tick_bio = false;
 	}
 	spin_unlock_irqrestore(&cache->lock, flags);
 }
-static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
-					  dm_oblock_t oblock)
+					    dm_oblock_t oblock, bool bio_has_pbd)
 {
-	// FIXME: this is called way too much.
+	if (bio_has_pbd)
-	check_if_tick_bio_needed(cache, bio);
+		check_if_tick_bio_needed(cache, bio);
 	remap_to_origin(cache, bio);
 	if (bio_data_dir(bio) == WRITE)
 		clear_discard(cache, oblock_to_dblock(cache, oblock));
 }
+static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+					  dm_oblock_t oblock)
+{
+	// FIXME: check_if_tick_bio_needed() is called way too much through this interface
+	__remap_to_origin_clear_discard(cache, bio, oblock, true);
+}
 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
 				 dm_oblock_t oblock, dm_cblock_t cblock)
 {
@@ -908,10 +889,10 @@ static bool accountable_bio(struct cache *cache, struct bio *bio)
 static void accounted_begin(struct cache *cache, struct bio *bio)
 {
-	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb;
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	if (accountable_bio(cache, bio)) {
+		pb = get_per_bio_data(bio);
 		pb->len = bio_sectors(bio);
 		iot_io_begin(&cache->tracker, pb->len);
 	}
@@ -919,8 +900,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
 static void accounted_complete(struct cache *cache, struct bio *bio)
 {
-	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio);
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	iot_io_end(&cache->tracker, pb->len);
 }
@@ -937,57 +917,26 @@ static void issue_op(struct bio *bio, void *context)
 	accounted_request(cache, bio);
 }
-static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&cache->lock, flags);
-	bio_list_add(&cache->deferred_writethrough_bios, bio);
-	spin_unlock_irqrestore(&cache->lock, flags);
-	wake_deferred_writethrough_worker(cache);
-}
-static void writethrough_endio(struct bio *bio)
-{
-	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
-	dm_unhook_bio(&pb->hook_info, bio);
-	if (bio->bi_status) {
-		bio_endio(bio);
-		return;
-	}
-	dm_bio_restore(&pb->bio_details, bio);
-	remap_to_cache(pb->cache, bio, pb->cblock);
-	/*
-	 * We can't issue this bio directly, since we're in interrupt
-	 * context.  So it gets put on a bio list for processing by the
-	 * worker thread.
-	 */
-	defer_writethrough_bio(pb->cache, bio);
-}
 /*
- * FIXME: send in parallel, huge latency as is.
 * When running in writethrough mode we need to send writes to clean blocks
- * to both the cache and origin devices.  In future we'd like to clone the
+ * to both the cache and origin devices.  Clone the bio and send them in parallel.
- * bio and send them in parallel, but for now we're doing them in
- * series as this is easier.
 */
-static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
+static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
-				       dm_oblock_t oblock, dm_cblock_t cblock)
+				      dm_oblock_t oblock, dm_cblock_t cblock)
 {
-	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+	struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
+	BUG_ON(!origin_bio);
-	pb->cache = cache;
+	bio_chain(origin_bio, bio);
-	pb->cblock = cblock;
+	/*
-	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
+	 * Passing false to __remap_to_origin_clear_discard() skips
-	dm_bio_record(&pb->bio_details, bio);
+	 * all code that might use per_bio_data (since clone doesn't have it)
+	 */
+	__remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
+	submit_bio(origin_bio);
-	remap_to_origin_clear_discard(pb->cache, bio, oblock);
+	remap_to_cache(cache, bio, cblock);
 }
 /*----------------------------------------------------------------
@@ -1201,6 +1150,18 @@ static void background_work_end(struct cache *cache)
 /*----------------------------------------------------------------*/
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+{
+	return (bio_data_dir(bio) == WRITE) &&
+		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+}
+static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
+{
+	return writeback_mode(cache) &&
+		(is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
+}
 static void quiesce(struct dm_cache_migration *mg,
 		    void (*continuation)(struct work_struct *))
 {
@@ -1248,8 +1209,7 @@ static int copy(struct dm_cache_migration *mg, bool promote)
 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
 {
-	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio);
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
 		free_prison_cell(cache, pb->cell);
@@ -1260,23 +1220,21 @@ static void overwrite_endio(struct bio *bio)
 {
 	struct dm_cache_migration *mg = bio->bi_private;
 	struct cache *cache = mg->cache;
-	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio);
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	dm_unhook_bio(&pb->hook_info, bio);
 	if (bio->bi_status)
 		mg->k.input = bio->bi_status;
-	queue_continuation(mg->cache->wq, &mg->k);
+	queue_continuation(cache->wq, &mg->k);
 }
 static void overwrite(struct dm_cache_migration *mg,
 		      void (*continuation)(struct work_struct *))
 {
 	struct bio *bio = mg->overwrite_bio;
-	size_t pb_data_size = get_per_bio_data_size(mg->cache);
+	struct per_bio_data *pb = get_per_bio_data(bio);
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
@@ -1474,12 +1432,50 @@ static void mg_upgrade_lock(struct work_struct *ws)
 	}
 }
+static void mg_full_copy(struct work_struct *ws)
+{
+	struct dm_cache_migration *mg = ws_to_mg(ws);
+	struct cache *cache = mg->cache;
+	struct policy_work *op = mg->op;
+	bool is_policy_promote = (op->op == POLICY_PROMOTE);
+	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
+	    is_discarded_oblock(cache, op->oblock)) {
+		mg_upgrade_lock(ws);
+		return;
+	}
+	init_continuation(&mg->k, mg_upgrade_lock);
+	if (copy(mg, is_policy_promote)) {
+		DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
+		mg->k.input = BLK_STS_IOERR;
+		mg_complete(mg, false);
+	}
+}
 static void mg_copy(struct work_struct *ws)
 {
-	int r;
 	struct dm_cache_migration *mg = ws_to_mg(ws);
 	if (mg->overwrite_bio) {
+		/*
+		 * No exclusive lock was held when we last checked if the bio
+		 * was optimisable.  So we have to check again in case things
+		 * have changed (eg, the block may no longer be discarded).
+		 */
+		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
+			/*
+			 * Fallback to a real full copy after doing some tidying up.
+			 */
+			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
+			BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */
+			mg->overwrite_bio = NULL;
+			inc_io_migrations(mg->cache);
+			mg_full_copy(ws);
+			return;
+		}
 		/*
 		 * It's safe to do this here, even though it's new data
 		 * because all IO has been locked out of the block.
@@ -1489,26 +1485,8 @@ static void mg_copy(struct work_struct *ws)
 		 */
 		overwrite(mg, mg_update_metadata_after_copy);
-	} else {
+	} else
-		struct cache *cache = mg->cache;
+		mg_full_copy(ws);
-		struct policy_work *op = mg->op;
-		bool is_policy_promote = (op->op == POLICY_PROMOTE);
-		if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
-		    is_discarded_oblock(cache, op->oblock)) {
-			mg_upgrade_lock(ws);
-			return;
-		}
-		init_continuation(&mg->k, mg_upgrade_lock);
-		r = copy(mg, is_policy_promote);
-		if (r) {
-			DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
-			mg->k.input = BLK_STS_IOERR;
-			mg_complete(mg, false);
-		}
-	}
 }
 static int mg_lock_writes(struct dm_cache_migration *mg)
@@ -1567,9 +1545,6 @@ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio
 		return -ENOMEM;
 	}
-	memset(mg, 0, sizeof(*mg));
-	mg->cache = cache;
 	mg->op = op;
 	mg->overwrite_bio = bio;
@@ -1703,9 +1678,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
 		return -ENOMEM;
 	}
-	memset(mg, 0, sizeof(*mg));
-	mg->cache = cache;
 	mg->overwrite_bio = bio;
 	mg->invalidate_cblock = cblock;
 	mg->invalidate_oblock = oblock;
@@ -1748,26 +1720,12 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
 /*----------------------------------------------------------------*/
-static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
-{
-	return (bio_data_dir(bio) == WRITE) &&
-		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
-}
-static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
-{
-	return writeback_mode(&cache->features) &&
-		(is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
-}
 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
 		   bool *commit_needed)
 {
 	int r, data_dir;
 	bool rb, background_queued;
 	dm_cblock_t cblock;
-	size_t pb_data_size = get_per_bio_data_size(cache);
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	*commit_needed = false;
@@ -1816,6 +1774,8 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
 	}
 	if (r == -ENOENT) {
+		struct per_bio_data *pb = get_per_bio_data(bio);
 		/*
 		 * Miss.
 		 */
@@ -1823,7 +1783,6 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
 		if (pb->req_nr == 0) {
 			accounted_begin(cache, bio);
 			remap_to_origin_clear_discard(cache, bio, block);
 		} else {
 			/*
 			 * This is a duplicate writethrough io that is no
@@ -1842,18 +1801,17 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
 		 * Passthrough always maps to the origin, invalidating any
 		 * cache blocks that are written to.
 		 */
-		if (passthrough_mode(&cache->features)) {
+		if (passthrough_mode(cache)) {
 			if (bio_data_dir(bio) == WRITE) {
 				bio_drop_shared_lock(cache, bio);
 				atomic_inc(&cache->stats.demotion);
 				invalidate_start(cache, cblock, block, bio);
 			} else
 				remap_to_origin_clear_discard(cache, bio, block);
 		} else {
-			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+			if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
 			    !is_dirty(cache, cblock)) {
-				remap_to_origin_then_cache(cache, bio, block, cblock);
+				remap_to_origin_and_cache(cache, bio, block, cblock);
 				accounted_begin(cache, bio);
 			} else
 				remap_to_cache_dirty(cache, bio, block, cblock);
@@ -1922,8 +1880,7 @@ static blk_status_t commit_op(void *context)
 static bool process_flush_bio(struct cache *cache, struct bio *bio)
 {
-	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio);
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	if (!pb->req_nr)
 		remap_to_origin(cache, bio);
@@ -1983,28 +1940,6 @@ static void process_deferred_bios(struct work_struct *ws)
 		schedule_commit(&cache->committer);
 }
-static void process_deferred_writethrough_bios(struct work_struct *ws)
-{
-	struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
-	unsigned long flags;
-	struct bio_list bios;
-	struct bio *bio;
-	bio_list_init(&bios);
-	spin_lock_irqsave(&cache->lock, flags);
-	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
-	bio_list_init(&cache->deferred_writethrough_bios);
-	spin_unlock_irqrestore(&cache->lock, flags);
-	/*
-	 * These bios have already been through accounted_begin()
-	 */
-	while ((bio = bio_list_pop(&bios)))
-		generic_make_request(bio);
-}
 /*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/
@@ -2112,6 +2047,9 @@ static void destroy(struct cache *cache)
 		kfree(cache->ctr_args[i]);
 	kfree(cache->ctr_args);
+	if (cache->bs)
+		bioset_free(cache->bs);
 	kfree(cache);
 }
@@ -2555,8 +2493,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	ti->discards_supported = true;
 	ti->split_discard_bios = false;
+	ti->per_io_data_size = sizeof(struct per_bio_data);
 	cache->features = ca->features;
-	ti->per_io_data_size = get_per_bio_data_size(cache);
+	if (writethrough_mode(cache)) {
+		/* Create bioset for writethrough bios issued to origin */
+		cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
+		if (!cache->bs)
+			goto bad;
+	}
 	cache->callbacks.congested_fn = cache_is_congested;
 	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
@@ -2618,7 +2563,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 		goto bad;
 	}
-	if (passthrough_mode(&cache->features)) {
+	if (passthrough_mode(cache)) {
 		bool all_clean;
 		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
@@ -2637,9 +2582,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	}
 	spin_lock_init(&cache->lock);
-	INIT_LIST_HEAD(&cache->deferred_cells);
 	bio_list_init(&cache->deferred_bios);
-	bio_list_init(&cache->deferred_writethrough_bios);
 	atomic_set(&cache->nr_allocated_migrations, 0);
 	atomic_set(&cache->nr_io_migrations, 0);
 	init_waitqueue_head(&cache->migration_wait);
@@ -2678,8 +2621,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 		goto bad;
 	}
 	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
-	INIT_WORK(&cache->deferred_writethrough_worker,
-		  process_deferred_writethrough_bios);
 	INIT_WORK(&cache->migration_worker, check_migrations);
 	INIT_DELAYED_WORK(&cache->waker, do_waker);
@@ -2795,9 +2736,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	int r;
 	bool commit_needed;
 	dm_oblock_t block = get_bio_block(cache, bio);
-	size_t pb_data_size = get_per_bio_data_size(cache);
-	init_per_bio_data(bio, pb_data_size);
+	init_per_bio_data(bio);
 	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
 		/*
 		 * This can only occur if the io goes to a partial block at
@@ -2821,13 +2761,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	return r;
 }
-static int cache_end_io(struct dm_target *ti, struct bio *bio,
+static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
-		blk_status_t *error)
 {
 	struct cache *cache = ti->private;
 	unsigned long flags;
-	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio);
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	if (pb->tick) {
 		policy_tick(cache->policy, false);
@@ -3243,13 +3181,13 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("1 ");
-		if (writethrough_mode(&cache->features))
+		if (writethrough_mode(cache))
 			DMEMIT("writethrough ");
-		else if (passthrough_mode(&cache->features))
+		else if (passthrough_mode(cache))
 			DMEMIT("passthrough ");
-		else if (writeback_mode(&cache->features))
+		else if (writeback_mode(cache))
 			DMEMIT("writeback ");
 		else {
@@ -3415,7 +3353,7 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun
 	unsigned i;
 	struct cblock_range range;
-	if (!passthrough_mode(&cache->features)) {
+	if (!passthrough_mode(cache)) {
 		DMERR("%s: cache has to be in passthrough mode for invalidation",
 		      cache_device_name(cache));
 		return -EPERM;

--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -29,7 +29,6 @@ struct dm_kobject_holder {
 * DM targets must _not_ deference a mapped_device to directly access its members!
 */
 struct mapped_device {
-	struct srcu_struct io_barrier;
 	struct mutex suspend_lock;
 	/*
@@ -127,6 +126,8 @@ struct mapped_device {
 	struct blk_mq_tag_set *tag_set;
 	bool use_blk_mq:1;
 	bool init_tio_pdu:1;
+	struct srcu_struct io_barrier;
 };
 void dm_init_md_queue(struct mapped_device *md);

--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1075,7 +1075,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
 	BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
 	/* Reject unexpected unaligned bio. */
-	if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+	if (unlikely(bv_in.bv_len & (cc->sector_size - 1)))
 		return -EIO;
 	dmreq = dmreq_of_req(cc, req);
@@ -1168,7 +1168,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
 	int r = 0;
 	/* Reject unexpected unaligned bio. */
-	if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+	if (unlikely(bv_in.bv_len & (cc->sector_size - 1)))
 		return -EIO;
 	dmreq = dmreq_of_req(cc, req);

--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1377,7 +1377,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 		struct bvec_iter iter;
 		struct bio_vec bv;
 		bio_for_each_segment(bv, bio, iter) {
-			if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
+			if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
 				DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
 					bv.bv_offset, bv.bv_len, ic->sectors_per_block);
 				return DM_MAPIO_KILL;

--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -10,9 +10,11 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
+#include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/uio.h>
 #define DM_MSG_PREFIX "log-writes"
@@ -246,27 +248,108 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
 	return -1;
 }
+static int write_inline_data(struct log_writes_c *lc, void *entry,
+			     size_t entrylen, void *data, size_t datalen,
+			     sector_t sector)
+{
+	int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
+	struct page *page;
+	struct bio *bio;
+	size_t ret;
+	void *ptr;
+	while (datalen) {
+		num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
+		bio_pages = min(num_pages, BIO_MAX_PAGES);
+		atomic_inc(&lc->io_blocks);
+		bio = bio_alloc(GFP_KERNEL, bio_pages);
+		if (!bio) {
+			DMERR("Couldn't alloc inline data bio");
+			goto error;
+		}
+		bio->bi_iter.bi_size = 0;
+		bio->bi_iter.bi_sector = sector;
+		bio_set_dev(bio, lc->logdev->bdev);
+		bio->bi_end_io = log_end_io;
+		bio->bi_private = lc;
+		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+		for (i = 0; i < bio_pages; i++) {
+			pg_datalen = min_t(int, datalen, PAGE_SIZE);
+			pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize);
+			page = alloc_page(GFP_KERNEL);
+			if (!page) {
+				DMERR("Couldn't alloc inline data page");
+				goto error_bio;
+			}
+			ptr = kmap_atomic(page);
+			memcpy(ptr, data, pg_datalen);
+			if (pg_sectorlen > pg_datalen)
+				memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen);
+			kunmap_atomic(ptr);
+			ret = bio_add_page(bio, page, pg_sectorlen, 0);
+			if (ret != pg_sectorlen) {
+				DMERR("Couldn't add page of inline data");
+				__free_page(page);
+				goto error_bio;
+			}
+			datalen -= pg_datalen;
+			data	+= pg_datalen;
+		}
+		submit_bio(bio);
+		sector += bio_pages * PAGE_SECTORS;
+	}
+	return 0;
+error_bio:
+	bio_free_pages(bio);
+	bio_put(bio);
+error:
+	put_io_block(lc);
+	return -1;
+}
 static int log_one_block(struct log_writes_c *lc,
 			 struct pending_block *block, sector_t sector)
 {
 	struct bio *bio;
 	struct log_write_entry entry;
-	size_t ret;
+	size_t metadatalen, ret;
 	int i;
 	entry.sector = cpu_to_le64(block->sector);
 	entry.nr_sectors = cpu_to_le64(block->nr_sectors);
 	entry.flags = cpu_to_le64(block->flags);
 	entry.data_len = cpu_to_le64(block->datalen);
+	metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0;
 	if (write_metadata(lc, &entry, sizeof(entry), block->data,
-			   block->datalen, sector)) {
+			   metadatalen, sector)) {
 		free_pending_block(lc, block);
 		return -1;
 	}
+	sector += dev_to_bio_sectors(lc, 1);
+	if (block->datalen && metadatalen == 0) {
+		if (write_inline_data(lc, &entry, sizeof(entry), block->data,
+				      block->datalen, sector)) {
+			free_pending_block(lc, block);
+			return -1;
+		}
+		/* we don't support both inline data & bio data */
+		goto out;
+	}
 	if (!block->vec_cnt)
 		goto out;
-	sector += dev_to_bio_sectors(lc, 1);
 	atomic_inc(&lc->io_blocks);
 	bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
@@ -527,6 +610,51 @@ static int log_mark(struct log_writes_c *lc, char *data)
 	return 0;
 }
+static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
+		   struct iov_iter *i)
+{
+	struct pending_block *block;
+	if (!bytes)
+		return 0;
+	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+	if (!block) {
+		DMERR("Error allocating dax pending block");
+		return -ENOMEM;
+	}
+	block->data = kzalloc(bytes, GFP_KERNEL);
+	if (!block->data) {
+		DMERR("Error allocating dax data space");
+		kfree(block);
+		return -ENOMEM;
+	}
+	/* write data provided via the iterator */
+	if (!copy_from_iter(block->data, bytes, i)) {
+		DMERR("Error copying dax data");
+		kfree(block->data);
+		kfree(block);
+		return -EIO;
+	}
+	/* rewind the iterator so that the block driver can use it */
+	iov_iter_revert(i, bytes);
+	block->datalen = bytes;
+	block->sector = bio_to_dev_sectors(lc, sector);
+	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;
+	atomic_inc(&lc->pending_blocks);
+	spin_lock_irq(&lc->blocks_lock);
+	list_add_tail(&block->list, &lc->unflushed_blocks);
+	spin_unlock_irq(&lc->blocks_lock);
+	wake_up_process(lc->log_kthread);
+	return 0;
+}
 static void log_writes_dtr(struct dm_target *ti)
 {
 	struct log_writes_c *lc = ti->private;
@@ -792,9 +920,46 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit
 	limits->io_min = limits->physical_block_size;
 }
+static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+					 long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	struct log_writes_c *lc = ti->private;
+	sector_t sector = pgoff * PAGE_SECTORS;
+	int ret;
+	ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
+	if (ret)
+		return ret;
+	return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn);
+}
+static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
+					    pgoff_t pgoff, void *addr, size_t bytes,
+					    struct iov_iter *i)
+{
+	struct log_writes_c *lc = ti->private;
+	sector_t sector = pgoff * PAGE_SECTORS;
+	int err;
+	if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+		return 0;
+	/* Don't bother doing anything if logging has been disabled */
+	if (!lc->logging_enabled)
+		goto dax_copy;
+	err = log_dax(lc, sector, bytes, i);
+	if (err) {
+		DMWARN("Error %d logging DAX write", err);
+		return 0;
+	}
+dax_copy:
+	return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
+}
 static struct target_type log_writes_target = {
 	.name   = "log-writes",
-	.version = {1, 0, 0},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr    = log_writes_ctr,
 	.dtr    = log_writes_dtr,
@@ -805,6 +970,8 @@ static struct target_type log_writes_target = {
 	.message = log_writes_message,
 	.iterate_devices = log_writes_iterate_devices,
 	.io_hints = log_writes_io_hints,
+	.direct_access = log_writes_dax_direct_access,
+	.dax_copy_from_iter = log_writes_dax_copy_from_iter,
 };
 static int __init dm_log_writes_init(void)

--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2143,13 +2143,6 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 	struct dm_raid_superblock *refsb;
 	uint64_t events_sb, events_refsb;
-	rdev->sb_start = 0;
-	rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
-	if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) {
-		DMERR("superblock size of a logical block is no longer valid");
-		return -EINVAL;
-	}
 	r = read_disk_sb(rdev, rdev->sb_size, false);
 	if (r)
 		return r;
@@ -2494,6 +2487,17 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		if (test_bit(Journal, &rdev->flags))
 			continue;
+		if (!rdev->meta_bdev)
+			continue;
+		/* Set superblock offset/size for metadata device. */
+		rdev->sb_start = 0;
+		rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
+		if (rdev->sb_size < sizeof(struct dm_raid_superblock) || rdev->sb_size > PAGE_SIZE) {
+			DMERR("superblock size of a logical block is no longer valid");
+			return -EINVAL;
+		}
 		/*
 		 * Skipping super_load due to CTR_FLAG_SYNC will cause
 		 * the array to undergo initialization again as
@@ -2506,9 +2510,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
 			continue;
-		if (!rdev->meta_bdev)
-			continue;
 		r = super_load(rdev, freshest);
 		switch (r) {

--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -451,15 +451,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 			return r;
 		}
-		atomic_set(&dd->count, 0);
+		refcount_set(&dd->count, 1);
 		list_add(&dd->list, &t->devices);
 	} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
 		r = upgrade_mode(dd, mode, t->md);
 		if (r)
 			return r;
+		refcount_inc(&dd->count);
 	}
-	atomic_inc(&dd->count);
 	*result = dd->dm_dev;
 	return 0;
@@ -515,7 +515,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 		       dm_device_name(ti->table->md), d->name);
 		return;
 	}
-	if (atomic_dec_and_test(&dd->count)) {
+	if (refcount_dec_and_test(&dd->count)) {
 		dm_put_table_device(ti->table->md, d);
 		list_del(&dd->list);
 		kfree(dd);

--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -660,6 +660,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
 	struct dmz_target *dmz = ti->private;
 	struct request_queue *q;
 	struct dmz_dev *dev;
+	sector_t aligned_capacity;
 	int ret;
 	/* Get the target device */
@@ -685,15 +686,17 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
 		goto err;
 	}
+	q = bdev_get_queue(dev->bdev);
 	dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
-	if (ti->begin || (ti->len != dev->capacity)) {
+	aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1);
+	if (ti->begin ||
+	    ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
 		ti->error = "Partial mapping not supported";
 		ret = -EINVAL;
 		goto err;
 	}
-	q = bdev_get_queue(dev->bdev);
+	dev->zone_nr_sectors = blk_queue_zone_sectors(q);
-	dev->zone_nr_sectors = q->limits.chunk_sectors;
 	dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);
 	dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
@@ -929,8 +932,10 @@ static int dmz_iterate_devices(struct dm_target *ti,
 			       iterate_devices_callout_fn fn, void *data)
 {
 	struct dmz_target *dmz = ti->private;
+	struct dmz_dev *dev = dmz->dev;
+	sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1);
-	return fn(ti, dmz->ddev, 0, dmz->dev->capacity, data);
+	return fn(ti, dmz->ddev, 0, capacity, data);
 }
 static struct target_type dmz_type = {

--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,7 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/pr.h>
+#include <linux/refcount.h>
 #define DM_MSG_PREFIX "core"
@@ -98,7 +99,7 @@ struct dm_md_mempools {
 struct table_device {
 	struct list_head list;
-	atomic_t count;
+	refcount_t count;
 	struct dm_dev dm_dev;
 };
@@ -685,10 +686,11 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 		format_dev_t(td->dm_dev.name, dev);
-		atomic_set(&td->count, 0);
+		refcount_set(&td->count, 1);
 		list_add(&td->list, &md->table_devices);
+	} else {
+		refcount_inc(&td->count);
 	}
-	atomic_inc(&td->count);
 	mutex_unlock(&md->table_devices_lock);
 	*result = &td->dm_dev;
@@ -701,7 +703,7 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 	struct table_device *td = container_of(d, struct table_device, dm_dev);
 	mutex_lock(&md->table_devices_lock);
-	if (atomic_dec_and_test(&td->count)) {
+	if (refcount_dec_and_test(&td->count)) {
 		close_table_device(td, md);
 		list_del(&td->list);
 		kfree(td);
@@ -718,7 +720,7 @@ static void free_table_devices(struct list_head *devices)
 		struct table_device *td = list_entry(tmp, struct table_device, list);
 		DMWARN("dm_destroy: %s still exists with %d references",
-		       td->dm_dev.name, atomic_read(&td->count));
+		       td->dm_dev.name, refcount_read(&td->count));
 		kfree(td);
 	}
 }
@@ -1684,7 +1686,7 @@ static struct mapped_device *alloc_dev(int minor)
 	struct mapped_device *md;
 	void *old_md;
-	md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
+	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
 	if (!md) {
 		DMWARN("unable to allocate device, out of memory.");
 		return NULL;
@@ -1784,7 +1786,7 @@ static struct mapped_device *alloc_dev(int minor)
 bad_minor:
 	module_put(THIS_MODULE);
 bad_module_get:
-	kfree(md);
+	kvfree(md);
 	return NULL;
 }
@@ -1803,7 +1805,7 @@ static void free_dev(struct mapped_device *md)
 	free_minor(minor);
 	module_put(THIS_MODULE);
-	kfree(md);
+	kvfree(md);
 }
 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
@@ -2061,17 +2063,12 @@ struct mapped_device *dm_get_md(dev_t dev)
 	spin_lock(&_minor_lock);
 	md = idr_find(&_minor_idr, minor);
-	if (md) {
+	if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
-		if ((md == MINOR_ALLOCED ||
+	    test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
-		     (MINOR(disk_devt(dm_disk(md))) != minor) ||
+		md = NULL;
-		     dm_deleting_md(md) ||
+		goto out;
-		     test_bit(DMF_FREEING, &md->flags))) {
-			md = NULL;
-			goto out;
-		}
-		dm_get(md);
 	}
+	dm_get(md);
 out:
 	spin_unlock(&_minor_lock);
@@ -2698,11 +2695,15 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
 	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
-	if (test_bit(DMF_FREEING, &md->flags) ||
+	spin_lock(&_minor_lock);
-	    dm_deleting_md(md))
+	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
-		return NULL;
+		md = NULL;
+		goto out;
+	}
 	dm_get(md);
+out:
+	spin_unlock(&_minor_lock);
 	return md;
 }

--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -19,6 +19,7 @@
 #include <linux/hdreg.h>
 #include <linux/completion.h>
 #include <linux/kobject.h>
+#include <linux/refcount.h>
 #include "dm-stats.h"
@@ -38,7 +39,7 @@
 */
 struct dm_dev_internal {
 	struct list_head list;
-	atomic_t count;
+	refcount_t count;
 	struct dm_dev *dm_dev;
 };

--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -11,6 +11,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/device-mapper.h>
+#include <linux/kernel.h>
 #define DM_MSG_PREFIX "space map metadata"
@@ -111,7 +112,7 @@ static bool brb_empty(struct bop_ring_buffer *brb)
 static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
 {
 	unsigned r = old + 1;
-	return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+	return r >= ARRAY_SIZE(brb->bops) ? 0 : r;
 }
 static int brb_push(struct bop_ring_buffer *brb,