Commit fc9977dd authored by NeilBrown, committed by Shaohua Li

md/raid10: simplify the splitting of requests.

raid10 splits requests in two different ways for two different
reasons.

First, bio_split() is used to ensure the bio fits within a chunk.
Second, multiple r10bio structures are allocated to represent the
different sections that need to go to different devices, to avoid
known bad blocks.

This can be simplified to just use bio_split() once, and not to use
multiple r10bios.
We delay the split until we know a maximum bio size that can
be handled with a single r10bio, and then split the bio and queue
the remainder for later handling.

As with raid1, we allocate a new bio_set to help with the splitting.
It is not a good idea to use fs_bio_set in a device driver.
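
For readers who want the shape of the change in isolation, here is a minimal sketch of the split-and-chain pattern the message describes, using a driver-private bio_set instead of fs_bio_set. This is illustrative only, not the raid10 code: the structure my_dev and the functions my_dev_init()/my_dev_submit() are made-up names, and the two-argument bioset_create() form matches what this patch itself uses.

        /* Illustrative sketch only -- not the raid10 code itself. */
        #include <linux/bio.h>
        #include <linux/blkdev.h>
        #include <linux/gfp.h>

        struct my_dev {
                struct bio_set *split_set;      /* private pool, not fs_bio_set */
        };

        static int my_dev_init(struct my_dev *dev)
        {
                /* BIO_POOL_SIZE entries, no extra front padding */
                dev->split_set = bioset_create(BIO_POOL_SIZE, 0);
                return dev->split_set ? 0 : -ENOMEM;
        }

        static void my_dev_submit(struct my_dev *dev, struct bio *bio,
                                  unsigned int max_sectors)
        {
                if (bio_sectors(bio) > max_sectors) {
                        /* Carve off the front piece that fits in one request */
                        struct bio *split = bio_split(bio, max_sectors,
                                                      GFP_NOIO, dev->split_set);

                        /* Completion of 'split' is accounted to the parent bio */
                        bio_chain(split, bio);

                        /* Requeue the remainder; it re-enters this path later */
                        generic_make_request(bio);
                        bio = split;
                }

                /* ... map and issue 'bio', which now fits a single request ... */
        }

In the patch itself the same steps appear inline in raid10_read_request() and raid10_write_request(), with the bio_set stored in conf->bio_split and created in setup_conf().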
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
parent 673ca68d
drivers/md/raid10.c

@@ -1127,7 +1127,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	struct bio *read_bio;
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
-	int sectors_handled;
 	int max_sectors;
 	sector_t sectors;
 	struct md_rdev *rdev;
@@ -1140,7 +1139,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	 */
 	wait_barrier(conf);
 
-	sectors = bio_sectors(bio);
+	sectors = r10_bio->sectors;
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
 	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1157,17 +1156,23 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		wait_barrier(conf);
 	}
 
-read_again:
 	rdev = read_balance(conf, r10_bio, &max_sectors);
 	if (!rdev) {
 		raid_end_bio_io(r10_bio);
 		return;
 	}
+	if (max_sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, max_sectors,
+					      GFP_NOIO, conf->bio_split);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		r10_bio->master_bio = bio;
+		r10_bio->sectors = max_sectors;
+	}
 	slot = r10_bio->read_slot;
 
 	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-	bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
-		 max_sectors);
 
 	r10_bio->devs[slot].bio = read_bio;
 	r10_bio->devs[slot].rdev = rdev;
@@ -1186,40 +1191,13 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
 			      read_bio, disk_devt(mddev->gendisk),
 			      r10_bio->sector);
-	if (max_sectors < r10_bio->sectors) {
-		/*
-		 * Could not read all from this device, so we will need another
-		 * r10_bio.
-		 */
-		sectors_handled = (r10_bio->sector + max_sectors
-				   - bio->bi_iter.bi_sector);
-		r10_bio->sectors = max_sectors;
-		inc_pending(conf);
-		bio_inc_remaining(bio);
-		/*
-		 * Cannot call generic_make_request directly as that will be
-		 * queued in __generic_make_request and subsequent
-		 * mempool_alloc might block waiting for it. so hand bio over
-		 * to raid10d.
-		 */
-		reschedule_retry(r10_bio);
-
-		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-		r10_bio->master_bio = bio;
-		r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-		r10_bio->state = 0;
-		r10_bio->mddev = mddev;
-		r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-		goto read_again;
-	} else
-		generic_make_request(read_bio);
+	generic_make_request(read_bio);
 	return;
 }
 
 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 				  struct bio *bio, bool replacement,
-				  int n_copy, int max_sectors)
+				  int n_copy)
 {
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
@@ -1243,7 +1221,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	rdev = conf->mirrors[devnum].rdev;
 
 	mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-	bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
 	if (replacement)
 		r10_bio->devs[n_copy].repl_bio = mbio;
 	else
@@ -1294,7 +1271,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	int i;
 	struct md_rdev *blocked_rdev;
 	sector_t sectors;
-	int sectors_handled;
 	int max_sectors;
 
 	md_write_start(mddev, bio);
@@ -1306,7 +1282,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 	wait_barrier(conf);
 
-	sectors = bio_sectors(bio);
+	sectors = r10_bio->sectors;
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
 	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1476,44 +1452,29 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
 	if (max_sectors < r10_bio->sectors)
 		r10_bio->sectors = max_sectors;
-	sectors_handled = r10_bio->sector + max_sectors -
-		bio->bi_iter.bi_sector;
+
+	if (r10_bio->sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, r10_bio->sectors,
+					      GFP_NOIO, conf->bio_split);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		r10_bio->master_bio = bio;
+	}
 
 	atomic_set(&r10_bio->remaining, 1);
 	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
 	for (i = 0; i < conf->copies; i++) {
 		if (r10_bio->devs[i].bio)
-			raid10_write_one_disk(mddev, r10_bio, bio, false,
-					      i, max_sectors);
+			raid10_write_one_disk(mddev, r10_bio, bio, false, i);
 		if (r10_bio->devs[i].repl_bio)
-			raid10_write_one_disk(mddev, r10_bio, bio, true,
-					      i, max_sectors);
-	}
-
-	/* Don't remove the bias on 'remaining' (one_write_done) until
-	 * after checking if we need to go around again.
-	 */
-
-	if (sectors_handled < bio_sectors(bio)) {
-		/* We need another r10_bio and it needs to be counted */
-		inc_pending(conf);
-		bio_inc_remaining(bio);
-		one_write_done(r10_bio);
-		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-		r10_bio->master_bio = bio;
-		r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-
-		r10_bio->mddev = mddev;
-		r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-		r10_bio->state = 0;
-		goto retry_write;
+			raid10_write_one_disk(mddev, r10_bio, bio, true, i);
 	}
 	one_write_done(r10_bio);
 }
 
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 {
 	struct r10conf *conf = mddev->private;
 	struct r10bio *r10_bio;
@@ -1521,7 +1482,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 	r10_bio->master_bio = bio;
-	r10_bio->sectors = bio_sectors(bio);
+	r10_bio->sectors = sectors;
 
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_iter.bi_sector;
@@ -1538,54 +1499,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 	struct r10conf *conf = mddev->private;
 	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
 	int chunk_sects = chunk_mask + 1;
-	struct bio *split;
+	int sectors = bio_sectors(bio);
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
 
-	do {
-
-		/*
-		 * If this request crosses a chunk boundary, we need to split
-		 * it.
-		 */
-		if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
-			     bio_sectors(bio) > chunk_sects
-			     && (conf->geo.near_copies < conf->geo.raid_disks
-				 || conf->prev.near_copies <
-				 conf->prev.raid_disks))) {
-			split = bio_split(bio, chunk_sects -
-					  (bio->bi_iter.bi_sector &
-					   (chunk_sects - 1)),
-					  GFP_NOIO, fs_bio_set);
-			bio_chain(split, bio);
-		} else {
-			split = bio;
-		}
-
-		/*
-		 * If a bio is splitted, the first part of bio will pass
-		 * barrier but the bio is queued in current->bio_list (see
-		 * generic_make_request). If there is a raise_barrier() called
-		 * here, the second part of bio can't pass barrier. But since
-		 * the first part bio isn't dispatched to underlaying disks
-		 * yet, the barrier is never released, hence raise_barrier will
-		 * alays wait. We have a deadlock.
-		 * Note, this only happens in read path. For write path, the
-		 * first part of bio is dispatched in a schedule() call
-		 * (because of blk plug) or offloaded to raid10d.
-		 * Quitting from the function immediately can change the bio
-		 * order queued in bio_list and avoid the deadlock.
-		 */
-		__make_request(mddev, split);
-		if (split != bio && bio_data_dir(bio) == READ) {
-			generic_make_request(bio);
-			break;
-		}
-	} while (split != bio);
+	/*
+	 * If this request crosses a chunk boundary, we need to split
+	 * it.
+	 */
+	if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+		     sectors > chunk_sects
+		     && (conf->geo.near_copies < conf->geo.raid_disks
+			 || conf->prev.near_copies <
+			 conf->prev.raid_disks)))
+		sectors = chunk_sects -
+			(bio->bi_iter.bi_sector &
+			 (chunk_sects - 1));
+	__make_request(mddev, bio, sectors);
 
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 }
@@ -2873,13 +2806,8 @@ static void raid10d(struct md_thread *thread)
 			recovery_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_ReadError, &r10_bio->state))
 			handle_read_error(mddev, r10_bio);
-		else {
-			/* just a partial read to be scheduled from a
-			 * separate context
-			 */
-			int slot = r10_bio->read_slot;
-			generic_make_request(r10_bio->devs[slot].bio);
-		}
+		else
+			WARN_ON_ONCE(1);
 
 		cond_resched();
 		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3652,6 +3580,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	if (!conf->r10bio_pool)
 		goto out;
 
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!conf->bio_split)
+		goto out;
+
 	calc_sectors(conf, mddev->dev_sectors);
 	if (mddev->reshape_position == MaxSector) {
 		conf->prev = conf->geo;
@@ -3689,6 +3621,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		mempool_destroy(conf->r10bio_pool);
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
+		if (conf->bio_split)
+			bioset_free(conf->bio_split);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
@@ -3899,6 +3833,8 @@ static void raid10_free(struct mddev *mddev, void *priv)
 	kfree(conf->mirrors);
 	kfree(conf->mirrors_old);
 	kfree(conf->mirrors_new);
+	if (conf->bio_split)
+		bioset_free(conf->bio_split);
 	kfree(conf);
 }
 

drivers/md/raid10.h

@@ -82,6 +82,7 @@ struct r10conf {
 	mempool_t		*r10bio_pool;
 	mempool_t		*r10buf_pool;
 	struct page		*tmppage;
+	struct bio_set		*bio_split;
 
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.