Commit 8b211aac authored by Ming Lei, committed by Mike Snitzer

dm: add two stage requeue mechanism

Commit 61b6e2e5 ("dm: fix BLK_STS_DM_REQUEUE handling when dm_io
represents split bio") reverted DM core's bio splitting back to using
bio_split()+bio_chain() because it was found that otherwise DM's
BLK_STS_DM_REQUEUE would trigger a live-lock waiting for bio
completion that would never occur.

Restore using bio_trim()+bio_inc_remaining(), as was done in commit
7dd76d1f ("dm: improve bio splitting and associated IO
accounting"), but this time with proper handling for the above
scenario, which is covered in more detail in the commit header of
61b6e2e5.

Solve this issue by adding a two-stage dm_io requeue mechanism that
uses the new dm_bio_rewind() via dm_io_rewind():

1) requeue the dm_io into the requeue_list added to struct
   mapped_device, and schedule it via the newly added requeue work.
   This work clones dm_io->orig_bio (which DM saves and whose end
   sector is never modified). dm_io_rewind() uses the sectors and
   sector_offset members of the dm_io, which are recorded relative
   to the end of orig_bio: dm_bio_rewind()+bio_trim() are then used
   to make that cloned bio reflect the subset of the original bio
   that is represented by the dm_io being requeued (the offset
   arithmetic is illustrated in the sketch after this list).

2) the 2nd-stage requeue is the same as the original requeue path,
   except that io->orig_bio now points to the new cloned bio (which
   matches the requeued dm_io as described above).
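
For illustration only (not part of the patch), here is a minimal
userspace sketch of the offset arithmetic that stage 1 relies on. The
plain integers below merely stand in for bio fields, and the concrete
sector numbers are made up for the example; it shows that rewinding the
clone by (sector_offset << 9) minus the current remainder size, then
trimming it to io->sectors, recovers exactly the range that was mapped
to the dm_io:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Original bio from the FS: sectors [1000, 1256). */
	unsigned long long orig_end = 1256;

	/* The dm_io being requeued mapped 64 sectors starting at 1064. */
	unsigned long long io_start = 1064;
	unsigned long long io_sectors = 64;                     /* io->sectors */
	unsigned long long sector_offset = orig_end - io_start; /* io->sector_offset */

	/* Meanwhile orig_bio was trimmed to the remainder after this dm_io. */
	unsigned long long cur_start = io_start + io_sectors;
	unsigned long long cur_bytes = (orig_end - cur_start) << 9;

	/* Stage 1 rewinds the clone of orig_bio by this many bytes ... */
	unsigned long long rewind_bytes = (sector_offset << 9) - cur_bytes;
	unsigned long long new_start = cur_start - (rewind_bytes >> 9);

	/* ... and then trims it to io->sectors: the dm_io's range remains. */
	assert(new_start == io_start);
	printf("requeued range: sectors [%llu, %llu)\n",
	       new_start, new_start + io_sectors);
	return 0;
}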

This allows DM core to shift the need for bio cloning from bio-split
time (during IO submission) to the less likely BLK_STS_DM_REQUEUE
handling (after IO completes with that error).
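
A second sketch (again illustration only, with locking simplified: in
the kernel code the caller of dm_requeue_add_io() already holds
md->deferred_lock) shows the list handling the first stage relies on:
requeued dm_ios are pushed LIFO onto a singly linked requeue_list, and
the requeue worker detaches the whole list under the lock, then walks
it with the lock dropped. The struct and function names here are
stand-ins, not the real kernel symbols:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_io {			/* stands in for struct dm_io */
	int id;
	struct fake_io *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* plays deferred_lock */
static struct fake_io *requeue_list;			 /* plays md->requeue_list */

static void requeue_add_io(struct fake_io *io)	/* ~ dm_requeue_add_io(io, true) */
{
	pthread_mutex_lock(&lock);
	io->next = requeue_list;
	requeue_list = io;
	pthread_mutex_unlock(&lock);
}

static void *requeue_work(void *arg)		/* ~ dm_wq_requeue_work() */
{
	struct fake_io *io;

	pthread_mutex_lock(&lock);
	io = requeue_list;			/* detach the whole list */
	requeue_list = NULL;
	pthread_mutex_unlock(&lock);

	while (io) {				/* process with the lock dropped */
		struct fake_io *next = io->next;

		printf("second-stage requeue of io %d\n", io->id);
		free(io);
		io = next;
	}
	return NULL;
}

int main(void)
{
	pthread_t worker;
	int i;

	for (i = 0; i < 3; i++) {
		struct fake_io *io = calloc(1, sizeof(*io));

		io->id = i;
		requeue_add_io(io);
	}
	pthread_create(&worker, NULL, requeue_work, NULL);
	pthread_join(&worker, NULL);
	return 0;
}

Build with something like "cc sketch.c -lpthread"; the worker prints
the ios in LIFO order, mirroring how md->requeue_list is drained before
each entry is completed a second time via __dm_io_complete(io, false).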
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
parent 61cbe788
@@ -22,6 +22,8 @@
 #define DM_RESERVED_MAX_IOS		1024
 
+struct dm_io;
+
 struct dm_kobject_holder {
 	struct kobject kobj;
 	struct completion completion;
@@ -91,6 +93,14 @@ struct mapped_device {
 	spinlock_t deferred_lock;
 	struct bio_list deferred;
 
+	/*
+	 * requeue work context is needed for cloning one new bio
+	 * to represent the dm_io to be requeued, since each
+	 * dm_io may point to the original bio from FS.
+	 */
+	struct work_struct requeue_work;
+	struct dm_io *requeue_list;
+
 	void *interface_ptr;
 
 	/*
@@ -275,7 +285,6 @@ struct dm_io {
 	atomic_t io_count;
 	struct mapped_device *md;
 
-	struct bio *split_bio;
 	/* The three fields represent mapped part of original bio */
 	struct bio *orig_bio;
 	unsigned int sector_offset; /* offset to end of orig_bio */
@@ -303,6 +312,8 @@ static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit)
 	io->flags |= (1U << bit);
 }
 
+void dm_io_rewind(struct dm_io *io, struct bio_set *bs);
+
 static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
 {
 	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
@@ -319,6 +330,4 @@ extern atomic_t dm_global_event_nr;
 extern wait_queue_head_t dm_global_eventq;
 void dm_issue_global_event(void);
 
-void dm_bio_rewind(struct bio *bio, unsigned bytes);
-
 #endif
@@ -131,7 +131,7 @@ static inline void dm_bio_rewind_iter(const struct bio *bio,
  * rewinding from end of bio and restoring its original position.
  * Caller is also responsibile for restoring bio's size.
  */
-void dm_bio_rewind(struct bio *bio, unsigned bytes)
+static void dm_bio_rewind(struct bio *bio, unsigned bytes)
 {
 	if (bio_integrity(bio))
 		dm_bio_integrity_rewind(bio, bytes);
@@ -141,3 +141,26 @@ void dm_bio_rewind(struct bio *bio, unsigned bytes)
 	dm_bio_rewind_iter(bio, &bio->bi_iter, bytes);
 }
+
+void dm_io_rewind(struct dm_io *io, struct bio_set *bs)
+{
+	struct bio *orig = io->orig_bio;
+	struct bio *new_orig = bio_alloc_clone(orig->bi_bdev, orig,
+					       GFP_NOIO, bs);
+
+	/*
+	 * dm_bio_rewind can restore to previous position since the
+	 * end sector is fixed for original bio, but we still need
+	 * to restore bio's size manually (using io->sectors).
+	 */
+	dm_bio_rewind(new_orig, ((io->sector_offset << 9) -
+				 orig->bi_iter.bi_size));
+	bio_trim(new_orig, 0, io->sectors);
+	bio_chain(new_orig, orig);
+	/*
+	 * __bi_remaining was increased (by dm_split_and_process_bio),
+	 * so must drop the one added in bio_chain.
+	 */
+	atomic_dec(&orig->__bi_remaining);
+	io->orig_bio = new_orig;
+}
@@ -590,7 +590,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	atomic_set(&io->io_count, 2);
 	this_cpu_inc(*md->pending_io);
 	io->orig_bio = bio;
-	io->split_bio = NULL;
 	io->md = md;
 	spin_lock_init(&io->lock);
 	io->start_time = jiffies;
@@ -880,13 +879,35 @@ static int __noflush_suspending(struct mapped_device *md)
 	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 }
 
+static void dm_requeue_add_io(struct dm_io *io, bool first_stage)
+{
+	struct mapped_device *md = io->md;
+
+	if (first_stage) {
+		struct dm_io *next = md->requeue_list;
+
+		md->requeue_list = io;
+		io->next = next;
+	} else {
+		bio_list_add_head(&md->deferred, io->orig_bio);
+	}
+}
+
+static void dm_kick_requeue(struct mapped_device *md, bool first_stage)
+{
+	if (first_stage)
+		queue_work(md->wq, &md->requeue_work);
+	else
+		queue_work(md->wq, &md->work);
+}
+
 /*
  * Return true if the dm_io's original bio is requeued.
  * io->status is updated with error if requeue disallowed.
  */
-static bool dm_handle_requeue(struct dm_io *io)
+static bool dm_handle_requeue(struct dm_io *io, bool first_stage)
 {
-	struct bio *bio = io->split_bio ? io->split_bio : io->orig_bio;
+	struct bio *bio = io->orig_bio;
 	bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE);
 	bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) &&
 				     (bio->bi_opf & REQ_POLLED));
@@ -912,8 +933,8 @@ static bool dm_handle_requeue(struct dm_io *io)
 		spin_lock_irqsave(&md->deferred_lock, flags);
 		if ((__noflush_suspending(md) &&
 		     !WARN_ON_ONCE(dm_is_zone_write(md, bio))) ||
-		    handle_polled_eagain) {
-			bio_list_add_head(&md->deferred, bio);
+		    handle_polled_eagain || first_stage) {
+			dm_requeue_add_io(io, first_stage);
 			requeued = true;
 		} else {
 			/*
@@ -926,19 +947,21 @@
 	}
 
 	if (requeued)
-		queue_work(md->wq, &md->work);
+		dm_kick_requeue(md, first_stage);
 
 	return requeued;
 }
 
-static void dm_io_complete(struct dm_io *io)
+static void __dm_io_complete(struct dm_io *io, bool first_stage)
 {
-	struct bio *bio = io->split_bio ? io->split_bio : io->orig_bio;
+	struct bio *bio = io->orig_bio;
 	struct mapped_device *md = io->md;
 	blk_status_t io_error;
 	bool requeued;
 
-	requeued = dm_handle_requeue(io);
+	requeued = dm_handle_requeue(io, first_stage);
+	if (requeued && first_stage)
+		return;
 
 	io_error = io->status;
 	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
@@ -978,6 +1001,58 @@ static void dm_io_complete(struct dm_io *io)
 	}
 }
 
+static void dm_wq_requeue_work(struct work_struct *work)
+{
+	struct mapped_device *md = container_of(work, struct mapped_device,
+						requeue_work);
+	unsigned long flags;
+	struct dm_io *io;
+
+	/* reuse deferred lock to simplify dm_handle_requeue */
+	spin_lock_irqsave(&md->deferred_lock, flags);
+	io = md->requeue_list;
+	md->requeue_list = NULL;
+	spin_unlock_irqrestore(&md->deferred_lock, flags);
+
+	while (io) {
+		struct dm_io *next = io->next;
+
+		dm_io_rewind(io, &md->queue->bio_split);
+		io->next = NULL;
+		__dm_io_complete(io, false);
+		io = next;
+	}
+}
+
+/*
+ * Two staged requeue:
+ *
+ * 1) io->orig_bio points to the real original bio, and the part mapped to
+ *    this io must be requeued, instead of other parts of the original bio.
+ *
+ * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io.
+ */
+static void dm_io_complete(struct dm_io *io)
+{
+	bool first_requeue;
+
+	/*
+	 * Only dm_io that has been split needs two stage requeue, otherwise
+	 * we may run into long bio clone chain during suspend and OOM could
+	 * be triggered.
+	 *
+	 * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they
+	 * also aren't handled via the first stage requeue.
+	 */
+	if (dm_io_flagged(io, DM_IO_WAS_SPLIT))
+		first_requeue = true;
+	else
+		first_requeue = false;
+
+	__dm_io_complete(io, first_requeue);
+}
+
 /*
  * Decrements the number of outstanding ios that a bio has been
  * cloned into, completing the original io if necc.
@@ -1256,6 +1331,7 @@ static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 {
 	struct dm_target_io *tio = clone_to_tio(bio);
+	struct dm_io *io = tio->io;
 	unsigned bio_sectors = bio_sectors(bio);
 
 	BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
@@ -1271,8 +1347,9 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 	 * __split_and_process_bio() may have already saved mapped part
 	 * for accounting but it is being reduced so update accordingly.
 	 */
-	dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT);
-	tio->io->sectors = n_sectors;
+	dm_io_set_flag(io, DM_IO_WAS_SPLIT);
+	io->sectors = n_sectors;
+	io->sector_offset = bio_sectors(io->orig_bio);
 }
 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
@@ -1395,17 +1472,7 @@ static void setup_split_accounting(struct clone_info *ci, unsigned len)
 		 */
 		dm_io_set_flag(io, DM_IO_WAS_SPLIT);
 		io->sectors = len;
-	}
-
-	if (static_branch_unlikely(&stats_enabled) &&
-	    unlikely(dm_stats_used(&io->md->stats))) {
-		/*
-		 * Save bi_sector in terms of its offset from end of
-		 * original bio, only needed for DM-stats' benefit.
-		 * - saved regardless of whether split needed so that
-		 *   dm_accept_partial_bio() doesn't need to.
-		 */
-		io->sector_offset = bio_end_sector(ci->bio) - ci->sector;
+		io->sector_offset = bio_sectors(ci->bio);
 	}
 }
@@ -1705,11 +1772,9 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	 * Remainder must be passed to submit_bio_noacct() so it gets handled
 	 * *after* bios already submitted have been completely processed.
 	 */
-	WARN_ON_ONCE(!dm_io_flagged(io, DM_IO_WAS_SPLIT));
-	io->split_bio = bio_split(bio, io->sectors, GFP_NOIO,
-				  &md->queue->bio_split);
-	bio_chain(io->split_bio, bio);
-	trace_block_split(io->split_bio, bio->bi_iter.bi_sector);
+	bio_trim(bio, io->sectors, ci.sector_count);
+	trace_block_split(bio, bio->bi_iter.bi_sector);
+	bio_inc_remaining(bio);
 	submit_bio_noacct(bio);
 out:
 	/*
@@ -1985,9 +2050,11 @@ static struct mapped_device *alloc_dev(int minor)
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
+	INIT_WORK(&md->requeue_work, dm_wq_requeue_work);
 	init_waitqueue_head(&md->eventq);
 	init_completion(&md->kobj_holder.completion);
 
+	md->requeue_list = NULL;
 	md->swap_bios = get_swap_bios();
 	sema_init(&md->swap_bios_semaphore, md->swap_bios);
 	mutex_init(&md->swap_bios_lock);
...