Commit d2eb35ac authored by NeilBrown's avatar NeilBrown

md/raid1: avoid reading from known bad blocks.

Now that we have a bad block list, we should not read from those
blocks.
There are several main parts to this:
  1/ read_balance needs to check for bad blocks, and return not only
     the chosen device, but also how many good blocks are available
     there.
  2/ fix_read_error needs to avoid trying to read from bad blocks.
  3/ read submission must be ready to issue multiple reads to
     different devices as different bad blocks on different devices
     could mean that a single large read cannot be served by any one
     device, but can still be served by the array.
     This requires keeping count of the number of outstanding requests
     per bio.  This count is stored in 'bi_phys_segments'
  4/ retrying a read needs to also be ready to submit a smaller read
     and queue another request for the rest.

This does not yet handle bad blocks when reading to perform resync,
recovery, or check.

'md_trim_bio' will also be used for RAID10, so put it in md.c and
export it.
Signed-off-by: default avatarNeilBrown <neilb@suse.de>
parent 9f2f3830
...@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, ...@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
} }
EXPORT_SYMBOL_GPL(bio_clone_mddev); EXPORT_SYMBOL_GPL(bio_clone_mddev);
void md_trim_bio(struct bio *bio, int offset, int size)
{
/* 'bio' is a cloned bio which we need to trim to match
* the given offset and size.
* This requires adjusting bi_sector, bi_size, and bi_io_vec
*/
int i;
struct bio_vec *bvec;
int sofar = 0;
size <<= 9;
if (offset == 0 && size == bio->bi_size)
return;
bio->bi_sector += offset;
bio->bi_size = size;
offset <<= 9;
clear_bit(BIO_SEG_VALID, &bio->bi_flags);
while (bio->bi_idx < bio->bi_vcnt &&
bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
/* remove this whole bio_vec */
offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
bio->bi_idx++;
}
if (bio->bi_idx < bio->bi_vcnt) {
bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
}
/* avoid any complications with bi_idx being non-zero*/
if (bio->bi_idx) {
memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
bio->bi_vcnt -= bio->bi_idx;
bio->bi_idx = 0;
}
/* Make sure vcnt and last bv are not too big */
bio_for_each_segment(bvec, bio, i) {
if (sofar + bvec->bv_len > size)
bvec->bv_len = size - sofar;
if (bvec->bv_len == 0) {
bio->bi_vcnt = i;
break;
}
sofar += bvec->bv_len;
}
}
EXPORT_SYMBOL_GPL(md_trim_bio);
/* /*
* We have a system wide 'event count' that is incremented * We have a system wide 'event count' that is incremented
* on any 'interesting' event, and readers of /proc/mdstat * on any 'interesting' event, and readers of /proc/mdstat
......
...@@ -575,4 +575,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, ...@@ -575,4 +575,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
mddev_t *mddev); mddev_t *mddev);
extern int mddev_check_plugged(mddev_t *mddev); extern int mddev_check_plugged(mddev_t *mddev);
extern void md_trim_bio(struct bio *bio, int offset, int size);
#endif /* _MD_MD_H */ #endif /* _MD_MD_H */
This diff is collapsed.
...@@ -123,6 +123,10 @@ struct r1bio_s { ...@@ -123,6 +123,10 @@ struct r1bio_s {
#define R1BIO_IsSync 1 #define R1BIO_IsSync 1
#define R1BIO_Degraded 2 #define R1BIO_Degraded 2
#define R1BIO_BehindIO 3 #define R1BIO_BehindIO 3
/* Set ReadError on bios that experience a readerror so that
* raid1d knows what to do with them.
*/
#define R1BIO_ReadError 4
/* For write-behind requests, we call bi_end_io when /* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing * the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when * any write was successful. Otherwise we call when
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment