Commit 11e47232 authored by Heinz Mauelshagen, committed by Mike Snitzer

dm raid: stop keeping raid set frozen altogether

In order to avoid redoing synchronization/recovery/reshape partially,
the raid set got frozen until after all passed in table line flags had
been cleared.  The related table reload sequence had to be precisely
followed, or reshaping may lead to data corruption caused by the active
mapping carrying on with a reshape when the inactive mapping already
had retrieved a stale reshape position.

Harden by retrieving the actual resync/recovery/reshape position
during resume whilst the active table is suspended thus avoiding
to keep the raid set frozen altogether.  This prevents superfluous
redoing of an already resynchronized or recovered segment and,
most importantly, potential for redoing of an already reshaped
segment causing data corruption.

Fixes: d39f0010 ("dm raid: fix raid_resume() to keep raid set frozen as needed")
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
parent 53bf5384
...@@ -347,3 +347,4 @@ Version History ...@@ -347,3 +347,4 @@ Version History
1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A')
1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an
state races. state races.
1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen
...@@ -29,6 +29,9 @@ ...@@ -29,6 +29,9 @@
*/ */
#define MIN_RAID456_JOURNAL_SPACE (4*2048) #define MIN_RAID456_JOURNAL_SPACE (4*2048)
/*
 * List of all raid sets, linked through raid_set.list.
 *
 * Defined static: internal linkage keeps the symbol out of the
 * global namespace.
 */
static LIST_HEAD(raid_sets);
static bool devices_handle_discard_safely = false; static bool devices_handle_discard_safely = false;
/* /*
...@@ -105,8 +108,6 @@ struct raid_dev { ...@@ -105,8 +108,6 @@ struct raid_dev {
#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) #define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE)
#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
/* /*
* Definitions of various constructor flags to * Definitions of various constructor flags to
* be used in checks of valid / invalid flags * be used in checks of valid / invalid flags
...@@ -226,6 +227,7 @@ struct rs_layout { ...@@ -226,6 +227,7 @@ struct rs_layout {
struct raid_set { struct raid_set {
struct dm_target *ti; struct dm_target *ti;
struct list_head list;
uint32_t stripe_cache_entries; uint32_t stripe_cache_entries;
unsigned long ctr_flags; unsigned long ctr_flags;
...@@ -271,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l) ...@@ -271,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
mddev->new_chunk_sectors = l->new_chunk_sectors; mddev->new_chunk_sectors = l->new_chunk_sectors;
} }
/* Find any raid_set in active slot for @rs on global list */
static struct raid_set *rs_find_active(struct raid_set *rs)
{
struct raid_set *r;
struct mapped_device *md = dm_table_get_md(rs->ti->table);
list_for_each_entry(r, &raid_sets, list)
if (r != rs && dm_table_get_md(r->ti->table) == md)
return r;
return NULL;
}
/* raid10 algorithms (i.e. formats) */ /* raid10 algorithms (i.e. formats) */
#define ALGORITHM_RAID10_DEFAULT 0 #define ALGORITHM_RAID10_DEFAULT 0
#define ALGORITHM_RAID10_NEAR 1 #define ALGORITHM_RAID10_NEAR 1
...@@ -749,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r ...@@ -749,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
mddev_init(&rs->md); mddev_init(&rs->md);
INIT_LIST_HEAD(&rs->list);
rs->raid_disks = raid_devs; rs->raid_disks = raid_devs;
rs->delta_disks = 0; rs->delta_disks = 0;
...@@ -766,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r ...@@ -766,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
for (i = 0; i < raid_devs; i++) for (i = 0; i < raid_devs; i++)
md_rdev_init(&rs->dev[i].rdev); md_rdev_init(&rs->dev[i].rdev);
/* Add @rs to global list. */
list_add(&rs->list, &raid_sets);
/* /*
* Remaining items to be initialized by further RAID params: * Remaining items to be initialized by further RAID params:
* rs->md.persistent * rs->md.persistent
...@@ -778,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r ...@@ -778,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
return rs; return rs;
} }
/* Free all @rs allocations and remove it from global list. */
static void raid_set_free(struct raid_set *rs) static void raid_set_free(struct raid_set *rs)
{ {
int i; int i;
...@@ -795,6 +815,8 @@ static void raid_set_free(struct raid_set *rs) ...@@ -795,6 +815,8 @@ static void raid_set_free(struct raid_set *rs)
dm_put_device(rs->ti, rs->dev[i].data_dev); dm_put_device(rs->ti, rs->dev[i].data_dev);
} }
list_del(&rs->list);
kfree(rs); kfree(rs);
} }
...@@ -2371,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) ...@@ -2371,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
DMERR("new device%s provided without 'rebuild'", DMERR("new device%s provided without 'rebuild'",
new_devs > 1 ? "s" : ""); new_devs > 1 ? "s" : "");
return -EINVAL; return -EINVAL;
} else if (rs_is_recovering(rs)) { } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
(unsigned long long) mddev->recovery_cp); (unsigned long long) mddev->recovery_cp);
return -EINVAL; return -EINVAL;
...@@ -3173,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) ...@@ -3173,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad; goto bad;
} }
/* /* Out-of-place space has to be available to allow for a reshape unless raid1! */
* We can only prepare for a reshape here, because the if (reshape_sectors || rs_is_raid1(rs)) {
* raid set needs to run to provide the repective reshape /*
* check functions via its MD personality instance. * We can only prepare for a reshape here, because the
* * raid set needs to run to provide the repective reshape
* So do the reshape check after md_run() succeeded. * check functions via its MD personality instance.
*/ *
r = rs_prepare_reshape(rs); * So do the reshape check after md_run() succeeded.
if (r) */
return r; r = rs_prepare_reshape(rs);
if (r)
return r;
/* Reshaping ain't recovery, so disable recovery */ /* Reshaping ain't recovery, so disable recovery */
rs_setup_recovery(rs, MaxSector); rs_setup_recovery(rs, MaxSector);
}
rs_set_cur(rs); rs_set_cur(rs);
} else { } else {
/* May not set recovery when a device rebuild is requested */ /* May not set recovery when a device rebuild is requested */
...@@ -3395,7 +3420,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, ...@@ -3395,7 +3420,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
} else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) ||
test_bit(MD_RECOVERY_RUNNING, &recovery)) test_bit(MD_RECOVERY_RUNNING, &recovery))
r = mddev->curr_resync_completed; r = mddev->curr_resync_completed;
else else
r = mddev->recovery_cp; r = mddev->recovery_cp;
...@@ -3904,10 +3928,33 @@ static int raid_preresume(struct dm_target *ti) ...@@ -3904,10 +3928,33 @@ static int raid_preresume(struct dm_target *ti)
struct raid_set *rs = ti->private; struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md; struct mddev *mddev = &rs->md;
/* This is a resume after a suspend of the set -> it's already started */ /* This is a resume after a suspend of the set -> it's already started. */
if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
return 0; return 0;
if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
struct raid_set *rs_active = rs_find_active(rs);
if (rs_active) {
/*
* In case no rebuilds have been requested
* and an active table slot exists, copy
* current resynchronization completed and
* reshape position pointers across from
* suspended raid set in the active slot.
*
* This resumes the new mapping at current
* offsets to continue recover/reshape without
* necessarily redoing a raid set partially or
* causing data corruption in case of a reshape.
*/
if (rs_active->md.curr_resync_completed != MaxSector)
mddev->curr_resync_completed = rs_active->md.curr_resync_completed;
if (rs_active->md.reshape_position != MaxSector)
mddev->reshape_position = rs_active->md.reshape_position;
}
}
/* /*
* The superblocks need to be updated on disk if the * The superblocks need to be updated on disk if the
* array is new or new devices got added (thus zeroed * array is new or new devices got added (thus zeroed
...@@ -3968,28 +4015,13 @@ static void raid_resume(struct dm_target *ti) ...@@ -3968,28 +4015,13 @@ static void raid_resume(struct dm_target *ti)
attempt_restore_of_faulty_devices(rs); attempt_restore_of_faulty_devices(rs);
} }
/* Only reduce raid set size before running a disk removing reshape. */
if (mddev->delta_disks < 0)
rs_set_capacity(rs);
/*
* Keep the RAID set frozen if reshape/rebuild flags are set.
* The RAID set is unfrozen once the next table load/resume,
* which clears the reshape/rebuild flags, occurs.
* This ensures that the constructor for the inactive table
* retrieves an up-to-date reshape_position.
*/
if (!test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags) &&
!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) {
if (rs_is_reshapable(rs)) {
if (!rs_is_reshaping(rs) || _get_reshape_sectors(rs))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
} else
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
/* Only reduce raid set size before running a disk removing reshape. */
if (mddev->delta_disks < 0)
rs_set_capacity(rs);
mddev_lock_nointr(mddev); mddev_lock_nointr(mddev);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0; mddev->ro = 0;
mddev->in_sync = 0; mddev->in_sync = 0;
mddev_resume(mddev); mddev_resume(mddev);
...@@ -3999,7 +4031,7 @@ static void raid_resume(struct dm_target *ti) ...@@ -3999,7 +4031,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = { static struct target_type raid_target = {
.name = "raid", .name = "raid",
.version = {1, 13, 1}, .version = {1, 13, 2},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = raid_ctr, .ctr = raid_ctr,
.dtr = raid_dtr, .dtr = raid_dtr,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment