Commit af8e2d1d authored by Miao Xie's avatar Miao Xie

Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted

This patch implements the RAID5/6 common data repair function. The
implementation is similar to scrub on the other RAID profiles such as
RAID1; the difference is that we don't read the data from a mirror,
but use the data repair function of RAID5/6 instead.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
parent b89e1b01
......@@ -58,6 +58,15 @@
*/
#define RBIO_CACHE_READY_BIT 3
/*
* bbio and raid_map are managed by the caller, so we shouldn't free
* them here. Besides that, rbios with this flag must not be cached:
* checking whether two rbios cover the same stripe needs raid_map,
* and it is very likely that the caller has already freed raid_map,
* so don't cache those rbios.
*/
#define RBIO_HOLD_BBIO_MAP_BIT 4
#define RBIO_CACHE_SIZE 1024
struct btrfs_raid_bio {
......@@ -799,6 +808,21 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
remove_rbio_from_cache(rbio);
}
/*
 * Release @bbio and @raid_map, but only when @need is non-zero.
 *
 * Callers pass need == 0 when both objects stay in use elsewhere and
 * must survive this call; in that case nothing is touched.
 */
static inline void
__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
{
	if (!need)
		return;

	kfree(raid_map);
	kfree(bbio);
}
/*
 * Drop the bbio and raid_map attached to @rbio, unless the rbio carries
 * RBIO_HOLD_BBIO_MAP_BIT, in which case both are left alone.
 */
static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
{
	int caller_owned = test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);

	__free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, !caller_owned);
}
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
......@@ -817,8 +841,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
rbio->stripe_pages[i] = NULL;
}
}
kfree(rbio->raid_map);
kfree(rbio->bbio);
free_bbio_and_raid_map(rbio);
kfree(rbio);
}
......@@ -933,11 +958,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
GFP_NOFS);
if (!rbio) {
kfree(raid_map);
kfree(bbio);
if (!rbio)
return ERR_PTR(-ENOMEM);
}
bio_list_init(&rbio->bio_list);
INIT_LIST_HEAD(&rbio->plug_list);
......@@ -1692,8 +1714,10 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct blk_plug_cb *cb;
rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
if (IS_ERR(rbio))
if (IS_ERR(rbio)) {
__free_bbio_and_raid_map(bbio, raid_map, 1);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_iter.bi_size;
......@@ -1888,7 +1912,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
cleanup_io:
if (rbio->read_rebuild) {
if (err == 0)
if (err == 0 &&
!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
......@@ -2038,15 +2063,19 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
*/
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map,
u64 stripe_len, int mirror_num)
u64 stripe_len, int mirror_num, int hold_bbio)
{
struct btrfs_raid_bio *rbio;
int ret;
rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
if (IS_ERR(rbio))
if (IS_ERR(rbio)) {
__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
return PTR_ERR(rbio);
}
if (hold_bbio)
set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
rbio->read_rebuild = 1;
bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_iter.bi_size;
......@@ -2054,8 +2083,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
BUG();
kfree(raid_map);
kfree(bbio);
__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
kfree(rbio);
return -EIO;
}
......
......@@ -41,7 +41,7 @@ static inline int nr_data_stripes(struct map_lookup *map)
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map,
u64 stripe_len, int mirror_num);
u64 stripe_len, int mirror_num, int hold_bbio);
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map,
u64 stripe_len);
......
......@@ -63,6 +63,13 @@ struct scrub_ctx;
*/
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
/*
 * Mapping information attached to a recheck page so that the page can be
 * read back through the RAID5/6 recovery path.  The structure is reference
 * counted via scrub_get_recover()/scrub_put_recover(); bbio and raid_map
 * are freed together with it when the last reference is dropped.
 */
struct scrub_recover {
/* number of holders; everything is freed when this drops to zero */
atomic_t refs;
/* block mapping obtained from btrfs_map_sblock() */
struct btrfs_bio *bbio;
/* raid map obtained from btrfs_map_sblock(); NULL for non-RAID5/6 mappings */
u64 *raid_map;
/* mapped length from btrfs_map_sblock(); passed on as stripe_len for recovery */
u64 map_length;
};
struct scrub_page {
struct scrub_block *sblock;
struct page *page;
......@@ -79,6 +86,8 @@ struct scrub_page {
unsigned int io_error:1;
};
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
};
struct scrub_bio {
......@@ -196,7 +205,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation,
u16 csum_size);
u16 csum_size, int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int is_metadata, int have_csum,
......@@ -790,6 +799,20 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
scrub_pending_trans_workers_dec(sctx);
}
static inline void scrub_get_recover(struct scrub_recover *recover)
{
atomic_inc(&recover->refs);
}
/*
 * Drop one reference on @recover.  When the last reference goes away the
 * bbio, the raid_map and the structure itself are freed.
 */
static inline void scrub_put_recover(struct scrub_recover *recover)
{
	/* Somebody else still holds a reference: nothing to release yet. */
	if (!atomic_dec_and_test(&recover->refs))
		return;

	kfree(recover->bbio);
	kfree(recover->raid_map);
	kfree(recover);
}
/*
* scrub_handle_errored_block gets called when either verification of the
* pages failed or the bio failed to read, e.g. with EIO. In the latter
......@@ -906,7 +929,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
csum, generation, sctx->csum_size);
csum, generation, sctx->csum_size, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
......@@ -1019,7 +1042,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, is_metadata,
have_csum, csum, generation,
sctx->csum_size);
sctx->csum_size, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
......@@ -1169,7 +1192,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
*/
scrub_recheck_block(fs_info, sblock_bad,
is_metadata, have_csum, csum,
generation, sctx->csum_size);
generation, sctx->csum_size, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
......@@ -1201,11 +1224,18 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
mirror_index++) {
struct scrub_block *sblock = sblocks_for_recheck +
mirror_index;
struct scrub_recover *recover;
int page_index;
for (page_index = 0; page_index < sblock->page_count;
page_index++) {
sblock->pagev[page_index]->sblock = NULL;
recover = sblock->pagev[page_index]->recover;
if (recover) {
scrub_put_recover(recover);
sblock->pagev[page_index]->recover =
NULL;
}
scrub_page_put(sblock->pagev[page_index]);
}
}
......@@ -1215,14 +1245,63 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
return 0;
}
/*
 * Number of "mirrors" scrub should try for the given mapping.
 *
 * Without a raid_map every stripe of @bbio counts as one mirror.  With a
 * raid_map (RAID5/6) the answer is 3 when the last raid_map entry is the
 * RAID6 Q stripe and 2 otherwise.
 * NOTE(review): presumably 2/3 means "direct read plus one/two parity
 * rebuilds" -- confirm against the raid56 recovery code.
 */
static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
{
	if (!raid_map)
		return (int)bbio->num_stripes;

	return raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE ? 3 : 2;
}
/*
 * Work out which stripe of a mapping holds @logical and at which byte
 * offset inside that stripe.
 *
 * @logical:        logical address being looked up
 * @raid_map:       per-stripe logical start addresses for RAID5/6, or NULL
 * @mapped_length:  length covered by each raid_map entry
 * @nstripes:       number of entries in @raid_map
 * @mirror:         mirror index, used only when @raid_map is NULL
 * @stripe_index:   out: index of the stripe to use
 * @stripe_offset:  out: offset of @logical inside that stripe
 *
 * Without a raid_map the stripe is simply @mirror and the offset is 0.
 * With a raid_map the P/Q entries are skipped and the first data stripe
 * whose range [start, start + mapped_length) contains @logical is used.
 *
 * NOTE(review): if no data stripe matches, the index ends up equal to
 * @nstripes and raid_map[] is read one past its last entry; callers
 * appear to rely on @logical always being covered -- confirm.
 */
static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int idx = 0;

	if (!raid_map) {
		/* The other RAID types: one stripe per mirror. */
		*stripe_index = mirror;
		*stripe_offset = 0;
		return;
	}

	/* RAID5/6: find the data stripe that covers @logical. */
	while (idx < nstripes) {
		u64 start = raid_map[idx];

		if (start != RAID6_Q_STRIPE && start != RAID5_P_STRIPE &&
		    logical >= start && logical < start + mapped_length)
			break;
		idx++;
	}

	*stripe_index = idx;
	*stripe_offset = logical - raid_map[idx];
}
static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info,
struct scrub_block *original_sblock,
u64 length, u64 logical,
struct scrub_block *sblocks_for_recheck)
{
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 *raid_map;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
int page_index;
int mirror_index;
int nmirrors;
int ret;
/*
......@@ -1233,23 +1312,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
page_index = 0;
while (length > 0) {
u64 sublen = min_t(u64, length, PAGE_SIZE);
u64 mapped_length = sublen;
struct btrfs_bio *bbio = NULL;
sublen = min_t(u64, length, PAGE_SIZE);
mapped_length = sublen;
bbio = NULL;
raid_map = NULL;
/*
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
&mapped_length, &bbio, 0);
ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
&mapped_length, &bbio, 0, &raid_map);
if (ret || !bbio || mapped_length < sublen) {
kfree(bbio);
kfree(raid_map);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
kfree(bbio);
kfree(raid_map);
return -ENOMEM;
}
atomic_set(&recover->refs, 1);
recover->bbio = bbio;
recover->raid_map = raid_map;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_page *page;
......@@ -1265,26 +1360,38 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
kfree(bbio);
scrub_put_recover(recover);
return -ENOMEM;
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
page->logical = logical;
page->physical = bbio->stripes[mirror_index].physical;
scrub_stripe_index_and_offset(logical, raid_map,
mapped_length,
bbio->num_stripes,
mirror_index,
&stripe_index,
&stripe_offset);
page->physical = bbio->stripes[stripe_index].physical +
stripe_offset;
page->dev = bbio->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
page->physical_for_dev_replace =
original_sblock->pagev[page_index]->
physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */
page->dev = bbio->stripes[mirror_index].dev;
page->mirror_num = mirror_index + 1;
sblock->page_count++;
page->page = alloc_page(GFP_NOFS);
if (!page->page)
goto leave_nomem;
scrub_get_recover(recover);
page->recover = recover;
}
kfree(bbio);
scrub_put_recover(recover);
length -= sublen;
logical += sublen;
page_index++;
......@@ -1293,6 +1400,51 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
return 0;
}
/*
 * Context used to wait synchronously for a bio: it is stored in
 * bio->bi_private and filled in by scrub_bio_wait_endio().
 */
struct scrub_bio_ret {
/* completed by scrub_bio_wait_endio() once the bio has ended */
struct completion event;
/* error value passed to the end_io callback; non-zero is treated as failure */
int error;
};
static void scrub_bio_wait_endio(struct bio *bio, int error)
{
struct scrub_bio_ret *ret = bio->bi_private;
ret->error = error;
complete(&ret->event);
}
/*
 * Return 1 when @page carries recover information that includes a
 * raid_map (i.e. it was mapped as RAID5/6), 0 otherwise.
 */
static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
	if (!page->recover)
		return 0;

	return page->recover->raid_map ? 1 : 0;
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
struct scrub_page *page)
{
struct scrub_bio_ret done;
int ret;
init_completion(&done.event);
done.error = 0;
bio->bi_iter.bi_sector = page->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
page->recover->raid_map,
page->recover->map_length,
page->mirror_num, 1);
if (ret)
return ret;
wait_for_completion(&done.event);
if (done.error)
return -EIO;
return 0;
}
/*
* this function will check the on disk data for checksum errors, header
* errors and read I/O errors. If any I/O errors happen, the exact pages
......@@ -1303,7 +1455,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation,
u16 csum_size)
u16 csum_size, int retry_failed_mirror)
{
int page_num;
......@@ -1329,11 +1481,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
continue;
}
bio->bi_bdev = page->dev->bdev;
bio->bi_iter.bi_sector = page->physical >> 9;
bio_add_page(bio, page->page, PAGE_SIZE, 0);
if (btrfsic_submit_bio_wait(READ, bio))
sblock->no_io_error_seen = 0;
if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
sblock->no_io_error_seen = 0;
} else {
bio->bi_iter.bi_sector = page->physical >> 9;
if (btrfsic_submit_bio_wait(READ, bio))
sblock->no_io_error_seen = 0;
}
bio_put(bio);
}
......
......@@ -5161,7 +5161,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
BTRFS_BLOCK_GROUP_RAID6)) {
u64 tmp;
if (raid_map_ret && ((rw & REQ_WRITE) || mirror_num > 1)) {
if (raid_map_ret &&
((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
int i, rot;
/* push stripe_nr back to the start of the full stripe */
......@@ -5440,6 +5442,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
mirror_num, NULL);
}
/*
 * Mapping entry point for scrub/replace.
 *
 * Forwards every argument to __btrfs_map_block(); unlike btrfs_map_block()
 * it passes @raid_map_ret through, so the caller can receive the raid map.
 */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
		     u64 logical, u64 *length,
		     struct btrfs_bio **bbio_ret, int mirror_num,
		     u64 **raid_map_ret)
{
	int ret;

	ret = __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
				mirror_num, raid_map_ret);
	return ret;
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len)
......@@ -5809,7 +5821,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
} else {
ret = raid56_parity_recover(root, bio, bbio,
raid_map, map_length,
mirror_num);
mirror_num, 0);
}
/*
* FIXME, replace doesn't support raid56 yet, please fix
......
......@@ -393,6 +393,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
u64 **raid_map_ret);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment