Commit 98202f32, authored by Neil Brown and committed by Linus Torvalds

[PATCH] md: make raid5 and raid6 robust against failure during recovery.

Two problems are fixed here.
1/ if the array is known to require a resync (parity update),
  but there are too many failed devices,  the resync cannot complete
  but will be retried indefinitely.
2/ if the array has too many failed drives to be usable and a spare is
  available, reconstruction will be attempted, but cannot work.  This
  also is retried indefinitely.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent ea9694b2
...@@ -3545,18 +3545,18 @@ void md_check_recovery(mddev_t *mddev) ...@@ -3545,18 +3545,18 @@ void md_check_recovery(mddev_t *mddev)
/* no recovery is running. /* no recovery is running.
* remove any failed drives, then * remove any failed drives, then
* add spares if possible * add spares if possible.
* Spare are also removed and re-added, to allow
* the personality to fail the re-add.
*/ */
ITERATE_RDEV(mddev,rdev,rtmp) { ITERATE_RDEV(mddev,rdev,rtmp)
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
rdev->faulty && (rdev->faulty || ! rdev->in_sync) &&
atomic_read(&rdev->nr_pending)==0) { atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
rdev->raid_disk = -1; rdev->raid_disk = -1;
} }
if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
spares++;
}
if (mddev->degraded) { if (mddev->degraded) {
ITERATE_RDEV(mddev,rdev,rtmp) ITERATE_RDEV(mddev,rdev,rtmp)
if (rdev->raid_disk < 0 if (rdev->raid_disk < 0
......
...@@ -1493,6 +1493,15 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -1493,6 +1493,15 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
unplug_slaves(mddev); unplug_slaves(mddev);
return 0; return 0;
} }
/* if there is 1 or more failed drives and we are trying
* to resync, then assert that we are finished, because there is
* nothing we can do.
*/
if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
int rv = (mddev->size << 1) - sector_nr;
md_done_sync(mddev, rv, 1);
return rv;
}
x = sector_nr; x = sector_nr;
chunk_offset = sector_div(x, sectors_per_chunk); chunk_offset = sector_div(x, sectors_per_chunk);
...@@ -1884,6 +1893,10 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1884,6 +1893,10 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
int disk; int disk;
struct disk_info *p; struct disk_info *p;
if (mddev->degraded > 1)
/* no point adding a device */
return 0;
/* /*
* find the disk ... * find the disk ...
*/ */
......
...@@ -1652,6 +1652,15 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -1652,6 +1652,15 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
unplug_slaves(mddev); unplug_slaves(mddev);
return 0; return 0;
} }
/* if there are 2 or more failed drives and we are trying
* to resync, then assert that we are finished, because there is
* nothing we can do.
*/
if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
int rv = (mddev->size << 1) - sector_nr;
md_done_sync(mddev, rv, 1);
return rv;
}
x = sector_nr; x = sector_nr;
chunk_offset = sector_div(x, sectors_per_chunk); chunk_offset = sector_div(x, sectors_per_chunk);
...@@ -2050,6 +2059,9 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -2050,6 +2059,9 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
int disk; int disk;
struct disk_info *p; struct disk_info *p;
if (mddev->degraded > 2)
/* no point adding a device */
return 0;
/* /*
* find the disk ... * find the disk ...
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment