Commit 05616be5 authored by NeilBrown

md/raid5: Use correct data_offset for all IO.

As there can now be two different data_offsets - an 'old' and
a 'new' - we need to carefully choose between them.
Signed-off-by: NeilBrown <neilb@suse.de>
parent c6563a8c
...@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector, ...@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
return sh; return sh;
} }
/* Choose which per-device offset applies to the IO of a stripe_head.
 *
 * Returns 1 when 'new_data_offset' must be used for @sh, and 0 when
 * the plain 'data_offset' is the right one.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t reshape_pos = conf->reshape_progress;

	/* reshape_progress has to be sampled first.  The read barrier
	 * ensures that conf->generation (and ->data_offset), which are
	 * written before reshape_progress is updated, are seen with
	 * values at least as recent as the progress value read above.
	 */
	smp_rmb();

	/* MaxSector means no reshape is in progress. */
	if (reshape_pos == MaxSector)
		return 0;

	/* During a reshape only a stripe of the previous generation keeps
	 * using the old offset; every other stripe gets new_data_offset.
	 */
	return sh->generation != conf->generation - 1;
}
static void static void
raid5_end_read_request(struct bio *bi, int error); raid5_end_read_request(struct bio *bi, int error);
static void static void
...@@ -603,7 +624,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -603,7 +624,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
__func__, (unsigned long long)sh->sector, __func__, (unsigned long long)sh->sector,
bi->bi_rw, i); bi->bi_rw, i);
atomic_inc(&sh->count); atomic_inc(&sh->count);
bi->bi_sector = sh->sector + rdev->data_offset; if (use_new_offset(conf, sh))
bi->bi_sector = (sh->sector
+ rdev->new_data_offset);
else
bi->bi_sector = (sh->sector
+ rdev->data_offset);
bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_idx = 0; bi->bi_idx = 0;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
...@@ -627,7 +653,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -627,7 +653,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
__func__, (unsigned long long)sh->sector, __func__, (unsigned long long)sh->sector,
rbi->bi_rw, i); rbi->bi_rw, i);
atomic_inc(&sh->count); atomic_inc(&sh->count);
rbi->bi_sector = sh->sector + rrdev->data_offset; if (use_new_offset(conf, sh))
rbi->bi_sector = (sh->sector
+ rrdev->new_data_offset);
else
rbi->bi_sector = (sh->sector
+ rrdev->data_offset);
rbi->bi_flags = 1 << BIO_UPTODATE; rbi->bi_flags = 1 << BIO_UPTODATE;
rbi->bi_idx = 0; rbi->bi_idx = 0;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
...@@ -1648,7 +1679,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1648,7 +1679,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
struct md_rdev *rdev = NULL; struct md_rdev *rdev = NULL;
sector_t s;
for (i=0 ; i<disks; i++) for (i=0 ; i<disks; i++)
if (bi == &sh->dev[i].req) if (bi == &sh->dev[i].req)
...@@ -1671,6 +1702,10 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1671,6 +1702,10 @@ static void raid5_end_read_request(struct bio * bi, int error)
if (!rdev) if (!rdev)
rdev = conf->disks[i].rdev; rdev = conf->disks[i].rdev;
if (use_new_offset(conf, sh))
s = sh->sector + rdev->new_data_offset;
else
s = sh->sector + rdev->data_offset;
if (uptodate) { if (uptodate) {
set_bit(R5_UPTODATE, &sh->dev[i].flags); set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) { if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
...@@ -1683,8 +1718,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1683,8 +1718,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error corrected" "md/raid:%s: read error corrected"
" (%lu sectors at %llu on %s)\n", " (%lu sectors at %llu on %s)\n",
mdname(conf->mddev), STRIPE_SECTORS, mdname(conf->mddev), STRIPE_SECTORS,
(unsigned long long)(sh->sector (unsigned long long)s,
+ rdev->data_offset),
bdevname(rdev->bdev, b)); bdevname(rdev->bdev, b));
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReadError, &sh->dev[i].flags);
...@@ -1704,8 +1738,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1704,8 +1738,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error on replacement device " "md/raid:%s: read error on replacement device "
"(sector %llu on %s).\n", "(sector %llu on %s).\n",
mdname(conf->mddev), mdname(conf->mddev),
(unsigned long long)(sh->sector (unsigned long long)s,
+ rdev->data_offset),
bdn); bdn);
else if (conf->mddev->degraded >= conf->max_degraded) else if (conf->mddev->degraded >= conf->max_degraded)
printk_ratelimited( printk_ratelimited(
...@@ -1713,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1713,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error not correctable " "md/raid:%s: read error not correctable "
"(sector %llu on %s).\n", "(sector %llu on %s).\n",
mdname(conf->mddev), mdname(conf->mddev),
(unsigned long long)(sh->sector (unsigned long long)s,
+ rdev->data_offset),
bdn); bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */ /* Oh, no!!! */
...@@ -1723,8 +1755,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1723,8 +1755,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error NOT corrected!! " "md/raid:%s: read error NOT corrected!! "
"(sector %llu on %s).\n", "(sector %llu on %s).\n",
mdname(conf->mddev), mdname(conf->mddev),
(unsigned long long)(sh->sector (unsigned long long)s,
+ rdev->data_offset),
bdn); bdn);
else if (atomic_read(&rdev->read_errors) else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes) > conf->max_nr_stripes)
...@@ -3842,6 +3873,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) ...@@ -3842,6 +3873,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
raid_bio->bi_next = (void*)rdev; raid_bio->bi_next = (void*)rdev;
align_bi->bi_bdev = rdev->bdev; align_bi->bi_bdev = rdev->bdev;
align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
/* No reshape active, so we can trust rdev->data_offset */
align_bi->bi_sector += rdev->data_offset; align_bi->bi_sector += rdev->data_offset;
if (!bio_fits_rdev(align_bi) || if (!bio_fits_rdev(align_bi) ||
...@@ -5182,9 +5214,12 @@ static int run(struct mddev *mddev) ...@@ -5182,9 +5214,12 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size * blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded)); (conf->raid_disks - conf->max_degraded));
rdev_for_each(rdev, mddev) rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
}
} }
return 0; return 0;
...@@ -5539,12 +5574,16 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5539,12 +5574,16 @@ static int raid5_start_reshape(struct mddev *mddev)
conf->chunk_sectors = mddev->new_chunk_sectors; conf->chunk_sectors = mddev->new_chunk_sectors;
conf->prev_algo = conf->algorithm; conf->prev_algo = conf->algorithm;
conf->algorithm = mddev->new_layout; conf->algorithm = mddev->new_layout;
conf->generation++;
/* Code that selects data_offset needs to see the generation update
* if reshape_progress has been set - so a memory barrier needed.
*/
smp_mb();
if (mddev->reshape_backwards) if (mddev->reshape_backwards)
conf->reshape_progress = raid5_size(mddev, 0, 0); conf->reshape_progress = raid5_size(mddev, 0, 0);
else else
conf->reshape_progress = 0; conf->reshape_progress = 0;
conf->reshape_safe = conf->reshape_progress; conf->reshape_safe = conf->reshape_progress;
conf->generation++;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
/* Add some new drives, as many as will fit. /* Add some new drives, as many as will fit.
...@@ -5596,6 +5635,9 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5596,6 +5635,9 @@ static int raid5_start_reshape(struct mddev *mddev)
mddev->recovery = 0; mddev->recovery = 0;
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
rdev_for_each(rdev, mddev)
rdev->new_data_offset = rdev->data_offset;
smp_wmb();
conf->reshape_progress = MaxSector; conf->reshape_progress = MaxSector;
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
...@@ -5614,9 +5656,13 @@ static void end_reshape(struct r5conf *conf) ...@@ -5614,9 +5656,13 @@ static void end_reshape(struct r5conf *conf)
{ {
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
struct md_rdev *rdev;
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks; conf->previous_raid_disks = conf->raid_disks;
rdev_for_each(rdev, conf->mddev)
rdev->data_offset = rdev->new_data_offset;
smp_wmb();
conf->reshape_progress = MaxSector; conf->reshape_progress = MaxSector;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap); wake_up(&conf->wait_for_overlap);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment