Commit 8d3ca83d authored by NeilBrown, committed by Shaohua Li

md/raid10: add failfast handling for reads.

If a device is marked FailFast, and it is not the only
device we can read from, we mark the bio as MD_FAILFAST.

If such a fail-fast read does fail, we don't try read
repair but just allow the failure.

If it was the last device, it doesn't get marked Faulty so
the retry happens on the same device - this time without
FAILFAST.  A subsequent failure will not retry but will just
pass up the error.

During resync we may use FAILFAST requests, and on a failure
we will simply use the other device(s).

During recovery we will only use FAILFAST in the unusual
case where there are multiple places to read from - i.e. if
there are > 2 devices.  If we get a failure we will fail the
device and complete the resync/recovery with the remaining
devices.
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
parent 212e7eb7
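Before the diff, a minimal sketch of the read-path policy this patch implements, as illustrative userspace C rather than kernel code (the struct and helper are invented; only the names FailFast, R10BIO_FailFast and MD_FAILFAST come from md): read_balance() marks the r10_bio failfast-eligible only once a second readable copy exists, and submission applies MD_FAILFAST only if the chosen device itself carries the FailFast flag.

    /* Illustrative model only -- not kernel code. */
    #include <stdbool.h>
    #include <stdio.h>

    struct copy {
        bool readable;   /* device holds valid data for this read */
        bool failfast;   /* admin set the FailFast flag on the rdev */
    };

    /* Models read_balance() + __make_request(): failfast is used only
     * when a second readable copy exists, and only on FailFast devices. */
    static bool use_failfast(const struct copy *c, int ncopies, int chosen)
    {
        int readable = 0;

        for (int i = 0; i < ncopies; i++)
            if (c[i].readable)
                readable++;
        return readable >= 2 && c[chosen].failfast;
    }

    int main(void)
    {
        struct copy c[2] = { { true, true }, { true, true } };

        printf("both copies alive: %d\n", use_failfast(c, 2, 0)); /* 1 */
        c[1].readable = false;   /* last device: never failfast */
        printf("last device left:  %d\n", use_failfast(c, 2, 0)); /* 0 */
        return 0;
    }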
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -719,6 +719,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
+	clear_bit(R10BIO_FailFast, &r10_bio->state);
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on (recovery is ok), or below
@@ -783,15 +784,18 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 		if (!do_balance)
 			break;
 
+		if (best_slot >= 0)
+			/* At least 2 disks to choose from so failfast is OK */
+			set_bit(R10BIO_FailFast, &r10_bio->state);
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
 		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
-			break;
+			new_distance = 0;
 
 		/* for far > 1 always use the lowest address */
-		if (geo->far_copies > 1)
+		else if (geo->far_copies > 1)
 			new_distance = r10_bio->devs[slot].addr;
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
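Note on the hunk above: previously the loop broke out as soon as it found an idle device in a 'near copies' layout, so read_balance() never examined the remaining slots. Setting new_distance = 0 instead (the best possible distance, so that device still wins the comparison) keeps the loop scanning, which is presumably what allows the best_slot >= 0 test to see a second readable disk and set R10BIO_FailFast.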
@@ -1170,6 +1174,9 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		read_bio->bi_bdev = rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
 		bio_set_op_attrs(read_bio, op, do_sync);
+		if (test_bit(FailFast, &rdev->flags) &&
+		    test_bit(R10BIO_FailFast, &r10_bio->state))
+			read_bio->bi_opf |= MD_FAILFAST;
 		read_bio->bi_private = r10_bio;
 
 		if (mddev->gendisk)
@@ -1988,6 +1995,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	/* now find blocks with errors */
 	for (i=0 ; i < conf->copies ; i++) {
 		int  j, d;
+		struct md_rdev *rdev;
 
 		tbio = r10_bio->devs[i].bio;
 
@@ -1995,6 +2003,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			continue;
 		if (i == first)
 			continue;
+		d = r10_bio->devs[i].devnum;
+		rdev = conf->mirrors[d].rdev;
 		if (!r10_bio->devs[i].bio->bi_error) {
 			/* We know that the bi_io_vec layout is the same for
 			 * both 'first' and 'i', so we just compare them.
@@ -2017,6 +2027,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
 				/* Don't fix anything. */
 				continue;
+		} else if (test_bit(FailFast, &rdev->flags)) {
+			/* Just give up on this device */
+			md_error(rdev->mddev, rdev);
+			continue;
 		}
 		/* Ok, we need to write this bio, either to correct an
 		 * inconsistency or to correct an unreadable block.
@@ -2034,7 +2048,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 
 		bio_copy_data(tbio, fbio);
 
-		d = r10_bio->devs[i].devnum;
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
@@ -2541,12 +2554,14 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	bio_put(bio);
 	r10_bio->devs[slot].bio = NULL;
 
-	if (mddev->ro == 0) {
+	if (mddev->ro)
+		r10_bio->devs[slot].bio = IO_BLOCKED;
+	else if (!test_bit(FailFast, &rdev->flags)) {
 		freeze_array(conf, 1);
 		fix_read_error(conf, mddev, r10_bio);
 		unfreeze_array(conf);
 	} else
-		r10_bio->devs[slot].bio = IO_BLOCKED;
+		md_error(mddev, rdev);
 
 	rdev_dec_pending(rdev, mddev);
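The hunk above restructures handle_read_error() into a three-way choice: a read-only array still just blocks the bad slot with IO_BLOCKED; a normal device keeps the old freeze-and-repair path through fix_read_error(); and a FailFast device is failed outright with md_error() — consistent with the commit message, no read repair is attempted when a fail-fast read fails.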
@@ -2575,6 +2590,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		+ choose_data_offset(r10_bio, rdev);
 	bio->bi_bdev = rdev->bdev;
 	bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
+	if (test_bit(FailFast, &rdev->flags) &&
+	    test_bit(R10BIO_FailFast, &r10_bio->state))
+		bio->bi_opf |= MD_FAILFAST;
 	bio->bi_private = r10_bio;
 	bio->bi_end_io = raid10_end_read_request;
 	trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
@@ -3096,6 +3114,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_read;
 				bio_set_op_attrs(bio, REQ_OP_READ, 0);
+				if (test_bit(FailFast, &rdev->flags))
+					bio->bi_opf |= MD_FAILFAST;
 				from_addr = r10_bio->devs[j].addr;
 				bio->bi_iter.bi_sector = from_addr +
 					rdev->data_offset;
@@ -3201,6 +3221,23 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			rdev_dec_pending(mrdev, mddev);
 			if (mreplace)
 				rdev_dec_pending(mreplace, mddev);
+			if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
+				/* Only want this if there is elsewhere to
+				 * read from. 'j' is currently the first
+				 * readable copy.
+				 */
+				int targets = 1;
+				for (; j < conf->copies; j++) {
+					int d = r10_bio->devs[j].devnum;
+					if (conf->mirrors[d].rdev &&
+					    test_bit(In_sync,
+						     &conf->mirrors[d].rdev->flags))
+						targets++;
+				}
+				if (targets == 1)
+					r10_bio->devs[0].bio->bi_opf
+						&= ~MD_FAILFAST;
+			}
 		}
 		if (biolist == NULL) {
 			while (r10_bio) {
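The targets-counting hunk above is easy to misread, so here is a hedged standalone model in userspace C (names invented except In_sync and MD_FAILFAST; the exact starting index of the kernel's loop depends on surrounding code not shown in this hunk): the recovery read keeps MD_FAILFAST only if some other In_sync copy could serve as a fallback source, which is what makes failfast recovery useful only with more than 2 devices.

    /* Illustrative model only -- not kernel code. */
    #include <stdbool.h>
    #include <stdio.h>

    /* in_sync[] marks which copies are In_sync; 'chosen' is the copy the
     * recovery read was issued to.  Returns whether MD_FAILFAST should
     * stay set on that read. */
    static bool keep_failfast(const bool *in_sync, int ncopies, int chosen)
    {
        int targets = 1;    /* the chosen source itself */

        for (int j = chosen + 1; j < ncopies; j++)
            if (in_sync[j])
                targets++;
        /* targets == 1: nowhere else to read from, so a fast failure
         * could not be retried elsewhere -- strip MD_FAILFAST. */
        return targets > 1;
    }

    int main(void)
    {
        bool two[2]   = { true, false };      /* classic 2-copy RAID10 */
        bool three[3] = { true, true, true }; /* > 2 devices */

        printf("2 copies: keep=%d\n", keep_failfast(two, 2, 0));   /* 0 */
        printf("3 copies: keep=%d\n", keep_failfast(three, 3, 0)); /* 1 */
        return 0;
    }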
@@ -3279,6 +3316,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_read;
 			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+				bio->bi_opf |= MD_FAILFAST;
 			bio->bi_iter.bi_sector = sector + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
 			count++;
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -156,5 +156,7 @@ enum r10bio_state {
 				 * flag is set
 				 */
 	R10BIO_Previous,
+	/* failfast devices did receive failfast requests. */
+	R10BIO_FailFast,
 };
 #endif
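For context beyond this commit: MD_FAILFAST is defined in the md headers, by an earlier patch in this series, as a combination of the block layer's REQ_FAILFAST_* request flags, so a failure can be reported quickly by the device, transport or driver instead of being retried at lower layers. The per-device FailFast bit tested throughout this patch is the one an administrator opts into, e.g. by writing "failfast" to the rdev's sysfs state file or adding the device with mdadm's --failfast option.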