Commit 2943c833 authored by Linus Torvalds

Merge tag 'md-3.3' of git://neil.brown.name/md

md update for 3.3

The big change is the new hot-replacement support.
A slot in an array can now hold 2 devices - one that
wants replacement and one that is the replacement.
Once the replacement is built - either from the
original or (in the case of errors) from elsewhere -
the want_replacement device will be removed.

* tag 'md-3.3' of git://neil.brown.name/md: (36 commits)
  md/raid1: Mark device want_replacement when we see a write error.
  md/raid1: If there is a spare and a want_replacement device, start replacement.
  md/raid1: recognise replacements when assembling arrays.
  md/raid1: handle activation of replacement device when recovery completes.
  md/raid1: Allow a failed replacement device to be removed.
  md/raid1: Allocate spare to store replacement devices and their bios.
  md/raid1:  Replace use of mddev->raid_disks with conf->raid_disks.
  md/raid10: If there is a spare and a want_replacement device, start replacement.
  md/raid10: recognise replacements when assembling array.
  md/raid10: Allow replacement device to be replace old drive.
  md/raid10: handle recovery of replacement devices.
  md/raid10:  Handle replacement devices during resync.
  md/raid10: writes should get directed to replacement as well as original.
  md/raid10: allow removal of failed replacement devices.
  md/raid10: preferentially read from replacement device if possible.
  md/raid10:  change read_balance to return an rdev
  md/raid10: prepare data structures for handling replacement.
  md/raid5: Mark device want_replacement when we see a write error.
  md/raid5: If there is a spare and a want_replacement device, start replacement.
  md/raid5: recognise replacements when assembling array.
  ...
parents 98793265 19d67169
...@@ -360,7 +360,7 @@ Each directory contains: ...@@ -360,7 +360,7 @@ Each directory contains:
A file recording the current state of the device in the array A file recording the current state of the device in the array
which can be a comma separated list of which can be a comma separated list of
faulty - device has been kicked from active use due to faulty - device has been kicked from active use due to
a detected fault or it has unacknowledged bad a detected fault, or it has unacknowledged bad
blocks blocks
in_sync - device is a fully in-sync member of the array in_sync - device is a fully in-sync member of the array
writemostly - device will only be subject to read writemostly - device will only be subject to read
...@@ -374,6 +374,13 @@ Each directory contains: ...@@ -374,6 +374,13 @@ Each directory contains:
This includes spares that are in the process This includes spares that are in the process
of being recovered to of being recovered to
write_error - device has ever seen a write error. write_error - device has ever seen a write error.
want_replacement - device is (mostly) working but probably
should be replaced, either due to errors or
due to user request.
replacement - device is a replacement for another active
device with the same raid_disk.
This list may grow in future. This list may grow in future.
This can be written to. This can be written to.
Writing "faulty" simulates a failure on the device. Writing "faulty" simulates a failure on the device.
...@@ -386,6 +393,13 @@ Each directory contains: ...@@ -386,6 +393,13 @@ Each directory contains:
Writing "in_sync" sets the in_sync flag. Writing "in_sync" sets the in_sync flag.
Writing "write_error" sets writeerrorseen flag. Writing "write_error" sets writeerrorseen flag.
Writing "-write_error" clears writeerrorseen flag. Writing "-write_error" clears writeerrorseen flag.
Writing "want_replacement" is allowed at any time except to a
replacement device or a spare. It sets the flag.
Writing "-want_replacement" is allowed at any time. It clears
the flag.
Writing "replacement" or "-replacement" is only allowed before
starting the array. It sets or clears the flag.
This file responds to select/poll. Any change to 'faulty' This file responds to select/poll. Any change to 'faulty'
or 'blocked' causes an event. or 'blocked' causes an event.
......
...@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev)
return; return;
} }
if (time_before(jiffies, bitmap->daemon_lastrun if (time_before(jiffies, bitmap->daemon_lastrun
+ bitmap->mddev->bitmap_info.daemon_sleep)) + mddev->bitmap_info.daemon_sleep))
goto done; goto done;
bitmap->daemon_lastrun = jiffies; bitmap->daemon_lastrun = jiffies;
if (bitmap->allclean) { if (bitmap->allclean) {
bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
goto done; goto done;
} }
bitmap->allclean = 1; bitmap->allclean = 1;
...@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev)
* sure that events_cleared is up-to-date. * sure that events_cleared is up-to-date.
*/ */
if (bitmap->need_sync && if (bitmap->need_sync &&
bitmap->mddev->bitmap_info.external == 0) { mddev->bitmap_info.external == 0) {
bitmap_super_t *sb; bitmap_super_t *sb;
bitmap->need_sync = 0; bitmap->need_sync = 0;
sb = kmap_atomic(bitmap->sb_page, KM_USER0); sb = kmap_atomic(bitmap->sb_page, KM_USER0);
...@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev)
done: done:
if (bitmap->allclean == 0) if (bitmap->allclean == 0)
bitmap->mddev->thread->timeout = mddev->thread->timeout =
bitmap->mddev->bitmap_info.daemon_sleep; mddev->bitmap_info.daemon_sleep;
mutex_unlock(&mddev->bitmap_info.mutex); mutex_unlock(&mddev->bitmap_info.mutex);
} }
...@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n ...@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
} }
if (!*bmc) { if (!*bmc) {
struct page *page; struct page *page;
*bmc = 1 | (needed ? NEEDED_MASK : 0); *bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1); bitmap_count_page(bitmap, offset, 1);
page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
......
...@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
} }
if (sb->devflags & WriteMostly1) if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags); set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
} else /* MULTIPATH are always insync */ } else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
...@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->recovery_offset = sb->recovery_offset =
cpu_to_le64(rdev->recovery_offset); cpu_to_le64(rdev->recovery_offset);
} }
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
if (mddev->reshape_position != MaxSector) { if (mddev->reshape_position != MaxSector) {
sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
...@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page) ...@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%swrite_error", sep); len += sprintf(page+len, "%swrite_error", sep);
sep = ","; sep = ",";
} }
if (test_bit(WantReplacement, &rdev->flags)) {
len += sprintf(page+len, "%swant_replacement", sep);
sep = ",";
}
if (test_bit(Replacement, &rdev->flags)) {
len += sprintf(page+len, "%sreplacement", sep);
sep = ",";
}
return len+sprintf(page+len, "\n"); return len+sprintf(page+len, "\n");
} }
...@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "-write_error")) { } else if (cmd_match(buf, "-write_error")) {
clear_bit(WriteErrorSeen, &rdev->flags); clear_bit(WriteErrorSeen, &rdev->flags);
err = 0; err = 0;
} else if (cmd_match(buf, "want_replacement")) {
/* Any non-spare device that is not a replacement can
* become want_replacement at any time, but we then need to
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
err = 0;
} else if (cmd_match(buf, "-want_replacement")) {
/* Clearing 'want_replacement' is always allowed.
* Once replacement starts it is too late though.
*/
err = 0;
clear_bit(WantReplacement, &rdev->flags);
} else if (cmd_match(buf, "replacement")) {
/* Can only set a device as a replacement when array has not
* yet been started. Once running, replacement is automatic
* from spares, or by assigning 'slot'.
*/
if (rdev->mddev->pers)
err = -EBUSY;
else {
set_bit(Replacement, &rdev->flags);
err = 0;
}
} else if (cmd_match(buf, "-replacement")) {
/* Similarly, can only clear Replacement before start */
if (rdev->mddev->pers)
err = -EBUSY;
else {
clear_bit(Replacement, &rdev->flags);
err = 0;
}
} }
if (!err) if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state); sysfs_notify_dirent_safe(rdev->sysfs_state);
...@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_remove_disk == NULL) if (rdev->mddev->pers->hot_remove_disk == NULL)
return -EINVAL; return -EINVAL;
err = rdev->mddev->pers-> err = rdev->mddev->pers->
hot_remove_disk(rdev->mddev, rdev->raid_disk); hot_remove_disk(rdev->mddev, rdev);
if (err) if (err)
return err; return err;
sysfs_unlink_rdev(rdev->mddev, rdev); sysfs_unlink_rdev(rdev->mddev, rdev);
...@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread); md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) { } else if (rdev->mddev->pers) {
struct md_rdev *rdev2;
/* Activating a spare .. or possibly reactivating /* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here. * if we ever get bitmaps working here.
*/ */
...@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_add_disk == NULL) if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL; return -EINVAL;
list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
if (rdev2->raid_disk == slot)
return -EEXIST;
if (slot >= rdev->mddev->raid_disks && if (slot >= rdev->mddev->raid_disks &&
slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC; return -ENOSPC;
...@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, ...@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
struct mddev *mddev = NULL; struct mddev *mddev = NULL;
int ro; int ro;
switch (cmd) {
case RAID_VERSION:
case GET_ARRAY_INFO:
case GET_DISK_INFO:
break;
default:
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_ADMIN))
return -EACCES; return -EACCES;
}
/* /*
* Commands dealing with the RAID driver but not any * Commands dealing with the RAID driver but not any
...@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (test_bit(Faulty, &rdev->flags)) { if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)"); seq_printf(seq, "(F)");
continue; continue;
} else if (rdev->raid_disk < 0) }
if (rdev->raid_disk < 0)
seq_printf(seq, "(S)"); /* spare */ seq_printf(seq, "(S)"); /* spare */
if (test_bit(Replacement, &rdev->flags))
seq_printf(seq, "(R)");
sectors += rdev->sectors; sectors += rdev->sectors;
} }
...@@ -7337,13 +7392,12 @@ static int remove_and_add_spares(struct mddev *mddev) ...@@ -7337,13 +7392,12 @@ static int remove_and_add_spares(struct mddev *mddev)
! test_bit(In_sync, &rdev->flags)) && ! test_bit(In_sync, &rdev->flags)) &&
atomic_read(&rdev->nr_pending)==0) { atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk( if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) { mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev); sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1; rdev->raid_disk = -1;
} }
} }
if (mddev->degraded) {
list_for_each_entry(rdev, &mddev->disks, same_set) { list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags) &&
...@@ -7362,7 +7416,6 @@ static int remove_and_add_spares(struct mddev *mddev) ...@@ -7362,7 +7416,6 @@ static int remove_and_add_spares(struct mddev *mddev)
} }
} }
} }
}
return spares; return spares;
} }
...@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev) ...@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev)
test_bit(Faulty, &rdev->flags) && test_bit(Faulty, &rdev->flags) &&
atomic_read(&rdev->nr_pending)==0) { atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk( if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) { mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev); sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1; rdev->raid_disk = -1;
} }
......
...@@ -72,34 +72,7 @@ struct md_rdev { ...@@ -72,34 +72,7 @@ struct md_rdev {
* This reduces the burden of testing multiple flags in many cases * This reduces the burden of testing multiple flags in many cases
*/ */
unsigned long flags; unsigned long flags; /* bit set of 'enum flag_bits' bits. */
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
#define AutoDetected 7 /* added by auto-detect */
#define Blocked 8 /* An error occurred but has not yet
* been acknowledged by the metadata
* handler, so don't allow writes
* until it is cleared */
#define WriteErrorSeen 9 /* A write error has been seen on this
* device
*/
#define FaultRecorded 10 /* Intermediate state for clearing
* Blocked. The Fault is/will-be
* recorded in the metadata, but that
* metadata hasn't been stored safely
* on disk yet.
*/
#define BlockedBadBlocks 11 /* A writer is blocked because they
* found an unacknowledged bad-block.
* This can safely be cleared at any
* time, and the writer will re-check.
* It may be set at any time, and at
* worst the writer will timeout and
* re-check. So setting it as
* accurately as possible is good, but
* not absolutely critical.
*/
wait_queue_head_t blocked_wait; wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */ int desc_nr; /* descriptor index in the superblock */
...@@ -152,6 +125,44 @@ struct md_rdev { ...@@ -152,6 +125,44 @@ struct md_rdev {
sector_t size; /* in sectors */ sector_t size; /* in sectors */
} badblocks; } badblocks;
}; };
/*
 * Per-device state flags.  Each enumerator is a bit number in the
 * 'flags' word of struct md_rdev, for use with set_bit(), clear_bit()
 * and test_bit().  The enumerators are numbered implicitly from zero,
 * so their order determines the bit positions.
 */
enum flag_bits {
	/* Device is known to have a fault. */
	Faulty,

	/* Device is in_sync with the rest of the array. */
	In_sync,

	/* Avoid reading from this device if at all possible. */
	WriteMostly,

	/* Device was added by auto-detect. */
	AutoDetected,

	/*
	 * An error occurred but has not yet been acknowledged by the
	 * metadata handler, so don't allow writes until it is cleared.
	 */
	Blocked,

	/* A write error has been seen on this device. */
	WriteErrorSeen,

	/*
	 * Intermediate state for clearing Blocked.  The fault is, or
	 * will be, recorded in the metadata, but that metadata hasn't
	 * been stored safely on disk yet.
	 */
	FaultRecorded,

	/*
	 * A writer is blocked because it found an unacknowledged
	 * bad-block.  This can safely be cleared at any time, and the
	 * writer will re-check.  It may be set at any time, and at
	 * worst the writer will time out and re-check.  So setting it
	 * as accurately as possible is good, but not absolutely
	 * critical.
	 */
	BlockedBadBlocks,

	/*
	 * This device is a candidate to be hot-replaced, either
	 * because it has reported some faults, or because of an
	 * explicit request.
	 */
	WantReplacement,

	/*
	 * This device is a replacement for a want_replacement device
	 * with the same raid_disk number.
	 */
	Replacement,
};
#define BB_LEN_MASK (0x00000000000001FFULL) #define BB_LEN_MASK (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
...@@ -428,7 +439,7 @@ struct md_personality ...@@ -428,7 +439,7 @@ struct md_personality
*/ */
void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*hot_remove_disk) (struct mddev *mddev, int number); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*spare_active) (struct mddev *mddev); int (*spare_active) (struct mddev *mddev);
sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
int (*resize) (struct mddev *mddev, sector_t sectors); int (*resize) (struct mddev *mddev, sector_t sectors);
...@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev) ...@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{ {
char nm[20]; char nm[20];
if (!test_bit(Replacement, &rdev->flags)) {
sprintf(nm, "rd%d", rdev->raid_disk); sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
return 0;
} }
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{ {
char nm[20]; char nm[20];
if (!test_bit(Replacement, &rdev->flags)) {
sprintf(nm, "rd%d", rdev->raid_disk); sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm); sysfs_remove_link(&mddev->kobj, nm);
}
} }
/* /*
......
...@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
return err; return err;
} }
static int multipath_remove_disk(struct mddev *mddev, int number) static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct mpconf *conf = mddev->private; struct mpconf *conf = mddev->private;
int err = 0; int err = 0;
struct md_rdev *rdev; int number = rdev->raid_disk;
struct multipath_info *p = conf->multipaths + number; struct multipath_info *p = conf->multipaths + number;
print_multipath_conf(conf); print_multipath_conf(conf);
rdev = p->rdev; if (rdev == p->rdev) {
if (rdev) {
if (test_bit(In_sync, &rdev->flags) || if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) { atomic_read(&rdev->nr_pending)) {
printk(KERN_ERR "hot-remove-disk, slot %d is identified" printk(KERN_ERR "hot-remove-disk, slot %d is identified"
......
...@@ -135,7 +135,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) ...@@ -135,7 +135,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
j = -1; j = -1;
out_free_bio: out_free_bio:
while ( ++j < pi->raid_disks ) while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]); bio_put(r1_bio->bios[j]);
r1bio_pool_free(r1_bio, data); r1bio_pool_free(r1_bio, data);
return NULL; return NULL;
...@@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) ...@@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
{ {
int i; int i;
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks * 2; i++) {
struct bio **bio = r1_bio->bios + i; struct bio **bio = r1_bio->bios + i;
if (!BIO_SPECIAL(*bio)) if (!BIO_SPECIAL(*bio))
bio_put(*bio); bio_put(*bio);
...@@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio) ...@@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio)
struct r1conf *conf = r1_bio->mddev->private; struct r1conf *conf = r1_bio->mddev->private;
int i; int i;
for (i=0; i<conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks * 2; i++) {
struct bio *bio = r1_bio->bios[i]; struct bio *bio = r1_bio->bios[i];
if (bio->bi_end_io) if (bio->bi_end_io)
rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
...@@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio) ...@@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio)
static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
{ {
int mirror; int mirror;
int raid_disks = r1_bio->mddev->raid_disks; struct r1conf *conf = r1_bio->mddev->private;
int raid_disks = conf->raid_disks;
for (mirror = 0; mirror < raid_disks; mirror++) for (mirror = 0; mirror < raid_disks * 2; mirror++)
if (r1_bio->bios[mirror] == bio) if (r1_bio->bios[mirror] == bio)
break; break;
BUG_ON(mirror == raid_disks); BUG_ON(mirror == raid_disks * 2);
update_head_pos(mirror, r1_bio); update_head_pos(mirror, r1_bio);
return mirror; return mirror;
...@@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error) ...@@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
if (!uptodate) { if (!uptodate) {
set_bit(WriteErrorSeen, set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags); &conf->mirrors[mirror].rdev->flags);
if (!test_and_set_bit(WantReplacement,
&conf->mirrors[mirror].rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
conf->mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state); set_bit(R1BIO_WriteError, &r1_bio->state);
} else { } else {
/* /*
...@@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect ...@@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
start_disk = conf->last_used; start_disk = conf->last_used;
} }
for (i = 0 ; i < conf->raid_disks ; i++) { for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
sector_t dist; sector_t dist;
sector_t first_bad; sector_t first_bad;
int bad_sectors; int bad_sectors;
...@@ -609,7 +615,7 @@ int md_raid1_congested(struct mddev *mddev, int bits) ...@@ -609,7 +615,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
return 1; return 1;
rcu_read_lock(); rcu_read_lock();
for (i = 0; i < mddev->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) { if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev); struct request_queue *q = bdev_get_queue(rdev->bdev);
...@@ -974,7 +980,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -974,7 +980,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
*/ */
plugged = mddev_check_plugged(mddev); plugged = mddev_check_plugged(mddev);
disks = conf->raid_disks; disks = conf->raid_disks * 2;
retry_write: retry_write:
blocked_rdev = NULL; blocked_rdev = NULL;
rcu_read_lock(); rcu_read_lock();
...@@ -988,6 +994,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -988,6 +994,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
} }
r1_bio->bios[i] = NULL; r1_bio->bios[i] = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) { if (!rdev || test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state); set_bit(R1BIO_Degraded, &r1_bio->state);
continue; continue;
} }
...@@ -1263,6 +1270,25 @@ static int raid1_spare_active(struct mddev *mddev) ...@@ -1263,6 +1270,25 @@ static int raid1_spare_active(struct mddev *mddev)
*/ */
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev; struct md_rdev *rdev = conf->mirrors[i].rdev;
struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
if (repl
&& repl->recovery_offset == MaxSector
&& !test_bit(Faulty, &repl->flags)
&& !test_and_set_bit(In_sync, &repl->flags)) {
/* replacement has just become active */
if (!rdev ||
!test_and_clear_bit(In_sync, &rdev->flags))
count++;
if (rdev) {
/* Replaced device not technically
* faulty, but we need to be sure
* it gets removed and never re-added
*/
set_bit(Faulty, &rdev->flags);
sysfs_notify_dirent_safe(
rdev->sysfs_state);
}
}
if (rdev if (rdev
&& !test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) { && !test_and_set_bit(In_sync, &rdev->flags)) {
...@@ -1286,7 +1312,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1286,7 +1312,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int mirror = 0; int mirror = 0;
struct mirror_info *p; struct mirror_info *p;
int first = 0; int first = 0;
int last = mddev->raid_disks - 1; int last = conf->raid_disks - 1;
if (mddev->recovery_disabled == conf->recovery_disabled) if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY; return -EBUSY;
...@@ -1294,8 +1320,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1294,8 +1320,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0) if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk; first = last = rdev->raid_disk;
for (mirror = first; mirror <= last; mirror++) for (mirror = first; mirror <= last; mirror++) {
if ( !(p=conf->mirrors+mirror)->rdev) { p = conf->mirrors+mirror;
if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
...@@ -1322,21 +1349,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1322,21 +1349,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev); rcu_assign_pointer(p->rdev, rdev);
break; break;
} }
if (test_bit(WantReplacement, &p->rdev->flags) &&
p[conf->raid_disks].rdev == NULL) {
/* Add this device as a replacement */
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
conf->fullsync = 1;
rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
break;
}
}
md_integrity_add_rdev(rdev, mddev); md_integrity_add_rdev(rdev, mddev);
print_conf(conf); print_conf(conf);
return err; return err;
} }
static int raid1_remove_disk(struct mddev *mddev, int number) static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
int err = 0; int err = 0;
struct md_rdev *rdev; int number = rdev->raid_disk;
struct mirror_info *p = conf->mirrors+ number; struct mirror_info *p = conf->mirrors+ number;
if (rdev != p->rdev)
p = conf->mirrors + conf->raid_disks + number;
print_conf(conf); print_conf(conf);
rdev = p->rdev; if (rdev == p->rdev) {
if (rdev) {
if (test_bit(In_sync, &rdev->flags) || if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) { atomic_read(&rdev->nr_pending)) {
err = -EBUSY; err = -EBUSY;
...@@ -1358,7 +1399,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number) ...@@ -1358,7 +1399,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number)
err = -EBUSY; err = -EBUSY;
p->rdev = rdev; p->rdev = rdev;
goto abort; goto abort;
} } else if (conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
* doing this to avoid confusion.
*/
struct md_rdev *repl =
conf->mirrors[conf->raid_disks + number].rdev;
raise_barrier(conf);
clear_bit(Replacement, &repl->flags);
p->rdev = repl;
conf->mirrors[conf->raid_disks + number].rdev = NULL;
lower_barrier(conf);
clear_bit(WantReplacement, &rdev->flags);
} else
clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev); err = md_integrity_register(mddev);
} }
abort: abort:
...@@ -1411,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error) ...@@ -1411,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error)
} while (sectors_to_go > 0); } while (sectors_to_go > 0);
set_bit(WriteErrorSeen, set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags); &conf->mirrors[mirror].rdev->flags);
if (!test_and_set_bit(WantReplacement,
&conf->mirrors[mirror].rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state); set_bit(R1BIO_WriteError, &r1_bio->state);
} else if (is_badblock(conf->mirrors[mirror].rdev, } else if (is_badblock(conf->mirrors[mirror].rdev,
r1_bio->sector, r1_bio->sector,
...@@ -1441,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, ...@@ -1441,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */ /* success */
return 1; return 1;
if (rw == WRITE) if (rw == WRITE) {
set_bit(WriteErrorSeen, &rdev->flags); set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement,
&rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
rdev->mddev->recovery);
}
/* need to record an error - either for the block or the device */ /* need to record an error - either for the block or the device */
if (!rdev_set_badblocks(rdev, sector, sectors, 0)) if (!rdev_set_badblocks(rdev, sector, sectors, 0))
md_error(rdev->mddev, rdev); md_error(rdev->mddev, rdev);
...@@ -1493,7 +1557,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) ...@@ -1493,7 +1557,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
} }
} }
d++; d++;
if (d == conf->raid_disks) if (d == conf->raid_disks * 2)
d = 0; d = 0;
} while (!success && d != r1_bio->read_disk); } while (!success && d != r1_bio->read_disk);
...@@ -1510,7 +1574,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) ...@@ -1510,7 +1574,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
mdname(mddev), mdname(mddev),
bdevname(bio->bi_bdev, b), bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector); (unsigned long long)r1_bio->sector);
for (d = 0; d < conf->raid_disks; d++) { for (d = 0; d < conf->raid_disks * 2; d++) {
rdev = conf->mirrors[d].rdev; rdev = conf->mirrors[d].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags)) if (!rdev || test_bit(Faulty, &rdev->flags))
continue; continue;
...@@ -1536,7 +1600,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) ...@@ -1536,7 +1600,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
/* write it back and re-read */ /* write it back and re-read */
while (d != r1_bio->read_disk) { while (d != r1_bio->read_disk) {
if (d == 0) if (d == 0)
d = conf->raid_disks; d = conf->raid_disks * 2;
d--; d--;
if (r1_bio->bios[d]->bi_end_io != end_sync_read) if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue; continue;
...@@ -1551,7 +1615,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) ...@@ -1551,7 +1615,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
d = start; d = start;
while (d != r1_bio->read_disk) { while (d != r1_bio->read_disk) {
if (d == 0) if (d == 0)
d = conf->raid_disks; d = conf->raid_disks * 2;
d--; d--;
if (r1_bio->bios[d]->bi_end_io != end_sync_read) if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue; continue;
...@@ -1584,7 +1648,7 @@ static int process_checks(struct r1bio *r1_bio) ...@@ -1584,7 +1648,7 @@ static int process_checks(struct r1bio *r1_bio)
int primary; int primary;
int i; int i;
for (primary = 0; primary < conf->raid_disks; primary++) for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read && if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
r1_bio->bios[primary]->bi_end_io = NULL; r1_bio->bios[primary]->bi_end_io = NULL;
...@@ -1592,7 +1656,7 @@ static int process_checks(struct r1bio *r1_bio) ...@@ -1592,7 +1656,7 @@ static int process_checks(struct r1bio *r1_bio)
break; break;
} }
r1_bio->read_disk = primary; r1_bio->read_disk = primary;
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks * 2; i++) {
int j; int j;
int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
struct bio *pbio = r1_bio->bios[primary]; struct bio *pbio = r1_bio->bios[primary];
...@@ -1656,7 +1720,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) ...@@ -1656,7 +1720,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
{ {
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
int i; int i;
int disks = conf->raid_disks; int disks = conf->raid_disks * 2;
struct bio *bio, *wbio; struct bio *bio, *wbio;
bio = r1_bio->bios[r1_bio->read_disk]; bio = r1_bio->bios[r1_bio->read_disk];
...@@ -1737,7 +1801,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, ...@@ -1737,7 +1801,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
success = 1; success = 1;
else { else {
d++; d++;
if (d == conf->raid_disks) if (d == conf->raid_disks * 2)
d = 0; d = 0;
} }
} while (!success && d != read_disk); } while (!success && d != read_disk);
...@@ -1753,7 +1817,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, ...@@ -1753,7 +1817,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
start = d; start = d;
while (d != read_disk) { while (d != read_disk) {
if (d==0) if (d==0)
d = conf->raid_disks; d = conf->raid_disks * 2;
d--; d--;
rdev = conf->mirrors[d].rdev; rdev = conf->mirrors[d].rdev;
if (rdev && if (rdev &&
...@@ -1765,7 +1829,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, ...@@ -1765,7 +1829,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
while (d != read_disk) { while (d != read_disk) {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
if (d==0) if (d==0)
d = conf->raid_disks; d = conf->raid_disks * 2;
d--; d--;
rdev = conf->mirrors[d].rdev; rdev = conf->mirrors[d].rdev;
if (rdev && if (rdev &&
...@@ -1887,7 +1951,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio ...@@ -1887,7 +1951,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
{ {
int m; int m;
int s = r1_bio->sectors; int s = r1_bio->sectors;
for (m = 0; m < conf->raid_disks ; m++) { for (m = 0; m < conf->raid_disks * 2 ; m++) {
struct md_rdev *rdev = conf->mirrors[m].rdev; struct md_rdev *rdev = conf->mirrors[m].rdev;
struct bio *bio = r1_bio->bios[m]; struct bio *bio = r1_bio->bios[m];
if (bio->bi_end_io == NULL) if (bio->bi_end_io == NULL)
...@@ -1909,7 +1973,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio ...@@ -1909,7 +1973,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{ {
int m; int m;
for (m = 0; m < conf->raid_disks ; m++) for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) { if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev; struct md_rdev *rdev = conf->mirrors[m].rdev;
rdev_clear_badblocks(rdev, rdev_clear_badblocks(rdev,
...@@ -2184,7 +2248,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp ...@@ -2184,7 +2248,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
r1_bio->state = 0; r1_bio->state = 0;
set_bit(R1BIO_IsSync, &r1_bio->state); set_bit(R1BIO_IsSync, &r1_bio->state);
for (i=0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev; struct md_rdev *rdev;
bio = r1_bio->bios[i]; bio = r1_bio->bios[i];
...@@ -2203,6 +2267,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp ...@@ -2203,6 +2267,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
rdev = rcu_dereference(conf->mirrors[i].rdev); rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL || if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) { test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
still_degraded = 1; still_degraded = 1;
} else if (!test_bit(In_sync, &rdev->flags)) { } else if (!test_bit(In_sync, &rdev->flags)) {
bio->bi_rw = WRITE; bio->bi_rw = WRITE;
...@@ -2254,7 +2319,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp ...@@ -2254,7 +2319,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
* need to mark them bad on all write targets * need to mark them bad on all write targets
*/ */
int ok = 1; int ok = 1;
for (i = 0 ; i < conf->raid_disks ; i++) for (i = 0 ; i < conf->raid_disks * 2 ; i++)
if (r1_bio->bios[i]->bi_end_io == end_sync_write) { if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
struct md_rdev *rdev = struct md_rdev *rdev =
rcu_dereference(conf->mirrors[i].rdev); rcu_dereference(conf->mirrors[i].rdev);
...@@ -2323,7 +2388,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp ...@@ -2323,7 +2388,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
len = sync_blocks<<9; len = sync_blocks<<9;
} }
for (i=0 ; i < conf->raid_disks; i++) { for (i = 0 ; i < conf->raid_disks * 2; i++) {
bio = r1_bio->bios[i]; bio = r1_bio->bios[i];
if (bio->bi_end_io) { if (bio->bi_end_io) {
page = bio->bi_io_vec[bio->bi_vcnt].bv_page; page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
...@@ -2356,7 +2421,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp ...@@ -2356,7 +2421,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
*/ */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
atomic_set(&r1_bio->remaining, read_targets); atomic_set(&r1_bio->remaining, read_targets);
for (i=0; i<conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks * 2; i++) {
bio = r1_bio->bios[i]; bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) { if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors); md_sync_acct(bio->bi_bdev, nr_sectors);
...@@ -2393,7 +2458,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -2393,7 +2458,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf) if (!conf)
goto abort; goto abort;
conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, conf->mirrors = kzalloc(sizeof(struct mirror_info)
* mddev->raid_disks * 2,
GFP_KERNEL); GFP_KERNEL);
if (!conf->mirrors) if (!conf->mirrors)
goto abort; goto abort;
...@@ -2405,7 +2471,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -2405,7 +2471,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
if (!conf->poolinfo) if (!conf->poolinfo)
goto abort; goto abort;
conf->poolinfo->raid_disks = mddev->raid_disks; conf->poolinfo->raid_disks = mddev->raid_disks * 2;
conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free, r1bio_pool_free,
conf->poolinfo); conf->poolinfo);
...@@ -2414,14 +2480,20 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -2414,14 +2480,20 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->poolinfo->mddev = mddev; conf->poolinfo->mddev = mddev;
err = -EINVAL;
spin_lock_init(&conf->device_lock); spin_lock_init(&conf->device_lock);
list_for_each_entry(rdev, &mddev->disks, same_set) { list_for_each_entry(rdev, &mddev->disks, same_set) {
int disk_idx = rdev->raid_disk; int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks if (disk_idx >= mddev->raid_disks
|| disk_idx < 0) || disk_idx < 0)
continue; continue;
if (test_bit(Replacement, &rdev->flags))
disk = conf->mirrors + conf->raid_disks + disk_idx;
else
disk = conf->mirrors + disk_idx; disk = conf->mirrors + disk_idx;
if (disk->rdev)
goto abort;
disk->rdev = rdev; disk->rdev = rdev;
disk->head_position = 0; disk->head_position = 0;
...@@ -2437,11 +2509,27 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -2437,11 +2509,27 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->pending_count = 0; conf->pending_count = 0;
conf->recovery_disabled = mddev->recovery_disabled - 1; conf->recovery_disabled = mddev->recovery_disabled - 1;
err = -EIO;
conf->last_used = -1; conf->last_used = -1;
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i; disk = conf->mirrors + i;
if (i < conf->raid_disks &&
disk[conf->raid_disks].rdev) {
/* This slot has a replacement. */
if (!disk->rdev) {
/* No original, just make the replacement
* a recovering spare
*/
disk->rdev =
disk[conf->raid_disks].rdev;
disk[conf->raid_disks].rdev = NULL;
} else if (!test_bit(In_sync, &disk->rdev->flags))
/* Original is not in_sync - bad */
goto abort;
}
if (!disk->rdev || if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) { !test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0; disk->head_position = 0;
...@@ -2455,7 +2543,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -2455,7 +2543,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->last_used = i; conf->last_used = i;
} }
err = -EIO;
if (conf->last_used < 0) { if (conf->last_used < 0) {
printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
mdname(mddev)); mdname(mddev));
...@@ -2665,7 +2752,7 @@ static int raid1_reshape(struct mddev *mddev) ...@@ -2665,7 +2752,7 @@ static int raid1_reshape(struct mddev *mddev)
if (!newpoolinfo) if (!newpoolinfo)
return -ENOMEM; return -ENOMEM;
newpoolinfo->mddev = mddev; newpoolinfo->mddev = mddev;
newpoolinfo->raid_disks = raid_disks; newpoolinfo->raid_disks = raid_disks * 2;
newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free, newpoolinfo); r1bio_pool_free, newpoolinfo);
...@@ -2673,7 +2760,8 @@ static int raid1_reshape(struct mddev *mddev) ...@@ -2673,7 +2760,8 @@ static int raid1_reshape(struct mddev *mddev)
kfree(newpoolinfo); kfree(newpoolinfo);
return -ENOMEM; return -ENOMEM;
} }
newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
GFP_KERNEL);
if (!newmirrors) { if (!newmirrors) {
kfree(newpoolinfo); kfree(newpoolinfo);
mempool_destroy(newpool); mempool_destroy(newpool);
......
...@@ -12,6 +12,9 @@ struct mirror_info { ...@@ -12,6 +12,9 @@ struct mirror_info {
* pool was allocated for, so they know how much to allocate and free. * pool was allocated for, so they know how much to allocate and free.
* mddev->raid_disks cannot be used, as it can change while a pool is active * mddev->raid_disks cannot be used, as it can change while a pool is active
* These two datums are stored in a kmalloced struct. * These two datums are stored in a kmalloced struct.
* The 'raid_disks' here is twice the raid_disks in r1conf.
* This leaves room for each 'real' device to have a replacement in the
* second half of the array.
*/ */
struct pool_info { struct pool_info {
...@@ -21,7 +24,9 @@ struct pool_info { ...@@ -21,7 +24,9 @@ struct pool_info {
struct r1conf { struct r1conf {
struct mddev *mddev; struct mddev *mddev;
struct mirror_info *mirrors; struct mirror_info *mirrors; /* twice 'raid_disks' to
* allow for replacements.
*/
int raid_disks; int raid_disks;
/* When choose the best device for a read (read_balance()) /* When choose the best device for a read (read_balance())
......
...@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) ...@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
struct r10conf *conf = data; struct r10conf *conf = data;
int size = offsetof(struct r10bio, devs[conf->copies]); int size = offsetof(struct r10bio, devs[conf->copies]);
/* allocate a r10bio with room for raid_disks entries in the bios array */ /* allocate a r10bio with room for raid_disks entries in the
* bios array */
return kzalloc(size, gfp_flags); return kzalloc(size, gfp_flags);
} }
...@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) ...@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
if (!bio) if (!bio)
goto out_free_bio; goto out_free_bio;
r10_bio->devs[j].bio = bio; r10_bio->devs[j].bio = bio;
if (!conf->have_replacement)
continue;
bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
if (!bio)
goto out_free_bio;
r10_bio->devs[j].repl_bio = bio;
} }
/* /*
* Allocate RESYNC_PAGES data pages and attach them * Allocate RESYNC_PAGES data pages and attach them
* where needed. * where needed.
*/ */
for (j = 0 ; j < nalloc; j++) { for (j = 0 ; j < nalloc; j++) {
struct bio *rbio = r10_bio->devs[j].repl_bio;
bio = r10_bio->devs[j].bio; bio = r10_bio->devs[j].bio;
for (i = 0; i < RESYNC_PAGES; i++) { for (i = 0; i < RESYNC_PAGES; i++) {
if (j == 1 && !test_bit(MD_RECOVERY_SYNC, if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
...@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) ...@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
goto out_free_pages; goto out_free_pages;
bio->bi_io_vec[i].bv_page = page; bio->bi_io_vec[i].bv_page = page;
if (rbio)
rbio->bi_io_vec[i].bv_page = page;
} }
} }
...@@ -156,8 +166,11 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) ...@@ -156,8 +166,11 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
j = -1; j = -1;
out_free_bio: out_free_bio:
while ( ++j < nalloc ) while (++j < nalloc) {
bio_put(r10_bio->devs[j].bio); bio_put(r10_bio->devs[j].bio);
if (r10_bio->devs[j].repl_bio)
bio_put(r10_bio->devs[j].repl_bio);
}
r10bio_pool_free(r10_bio, conf); r10bio_pool_free(r10_bio, conf);
return NULL; return NULL;
} }
...@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data) ...@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
} }
bio_put(bio); bio_put(bio);
} }
bio = r10bio->devs[j].repl_bio;
if (bio)
bio_put(bio);
} }
r10bio_pool_free(r10bio, conf); r10bio_pool_free(r10bio, conf);
} }
...@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
if (!BIO_SPECIAL(*bio)) if (!BIO_SPECIAL(*bio))
bio_put(*bio); bio_put(*bio);
*bio = NULL; *bio = NULL;
bio = &r10_bio->devs[i].repl_bio;
if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
bio_put(*bio);
*bio = NULL;
} }
} }
...@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio) ...@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
* Find the disk number which triggered given bio * Find the disk number which triggered given bio
*/ */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
struct bio *bio, int *slotp) struct bio *bio, int *slotp, int *replp)
{ {
int slot; int slot;
int repl = 0;
for (slot = 0; slot < conf->copies; slot++) for (slot = 0; slot < conf->copies; slot++) {
if (r10_bio->devs[slot].bio == bio) if (r10_bio->devs[slot].bio == bio)
break; break;
if (r10_bio->devs[slot].repl_bio == bio) {
repl = 1;
break;
}
}
BUG_ON(slot == conf->copies); BUG_ON(slot == conf->copies);
update_head_pos(slot, r10_bio); update_head_pos(slot, r10_bio);
if (slotp) if (slotp)
*slotp = slot; *slotp = slot;
if (replp)
*replp = repl;
return r10_bio->devs[slot].devnum; return r10_bio->devs[slot].devnum;
} }
...@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error) ...@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error)
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private; struct r10bio *r10_bio = bio->bi_private;
int slot, dev; int slot, dev;
struct md_rdev *rdev;
struct r10conf *conf = r10_bio->mddev->private; struct r10conf *conf = r10_bio->mddev->private;
slot = r10_bio->read_slot; slot = r10_bio->read_slot;
dev = r10_bio->devs[slot].devnum; dev = r10_bio->devs[slot].devnum;
rdev = r10_bio->devs[slot].rdev;
/* /*
* this branch is our 'one mirror IO has finished' event handler: * this branch is our 'one mirror IO has finished' event handler:
*/ */
...@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error) ...@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
*/ */
set_bit(R10BIO_Uptodate, &r10_bio->state); set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio); raid_end_bio_io(r10_bio);
rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} else { } else {
/* /*
* oops, read error - keep the refcount on the rdev * oops, read error - keep the refcount on the rdev
...@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error) ...@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
printk_ratelimited(KERN_ERR printk_ratelimited(KERN_ERR
"md/raid10:%s: %s: rescheduling sector %llu\n", "md/raid10:%s: %s: rescheduling sector %llu\n",
mdname(conf->mddev), mdname(conf->mddev),
bdevname(conf->mirrors[dev].rdev->bdev, b), bdevname(rdev->bdev, b),
(unsigned long long)r10_bio->sector); (unsigned long long)r10_bio->sector);
set_bit(R10BIO_ReadError, &r10_bio->state); set_bit(R10BIO_ReadError, &r10_bio->state);
reschedule_retry(r10_bio); reschedule_retry(r10_bio);
...@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error) ...@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error)
int dev; int dev;
int dec_rdev = 1; int dec_rdev = 1;
struct r10conf *conf = r10_bio->mddev->private; struct r10conf *conf = r10_bio->mddev->private;
int slot; int slot, repl;
struct md_rdev *rdev = NULL;
dev = find_bio_disk(conf, r10_bio, bio, &slot); dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[dev].replacement;
if (!rdev) {
smp_rmb();
repl = 0;
rdev = conf->mirrors[dev].rdev;
}
/* /*
* this branch is our 'one mirror IO has finished' event handler: * this branch is our 'one mirror IO has finished' event handler:
*/ */
if (!uptodate) { if (!uptodate) {
set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); if (repl)
/* Never record new bad blocks to replacement,
* just fail it.
*/
md_error(rdev->mddev, rdev);
else {
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state); set_bit(R10BIO_WriteError, &r10_bio->state);
dec_rdev = 0; dec_rdev = 0;
}
} else { } else {
/* /*
* Set R10BIO_Uptodate in our master bio, so that * Set R10BIO_Uptodate in our master bio, so that
...@@ -393,11 +441,14 @@ static void raid10_end_write_request(struct bio *bio, int error) ...@@ -393,11 +441,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
set_bit(R10BIO_Uptodate, &r10_bio->state); set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */ /* Maybe we can clear some bad blocks. */
if (is_badblock(conf->mirrors[dev].rdev, if (is_badblock(rdev,
r10_bio->devs[slot].addr, r10_bio->devs[slot].addr,
r10_bio->sectors, r10_bio->sectors,
&first_bad, &bad_sectors)) { &first_bad, &bad_sectors)) {
bio_put(bio); bio_put(bio);
if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
else
r10_bio->devs[slot].bio = IO_MADE_GOOD; r10_bio->devs[slot].bio = IO_MADE_GOOD;
dec_rdev = 0; dec_rdev = 0;
set_bit(R10BIO_MadeGood, &r10_bio->state); set_bit(R10BIO_MadeGood, &r10_bio->state);
...@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error) ...@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
} }
/* /*
* RAID10 layout manager * RAID10 layout manager
* As well as the chunksize and raid_disks count, there are two * As well as the chunksize and raid_disks count, there are two
...@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q, ...@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q,
* FIXME: possibly should rethink readbalancing and do it differently * FIXME: possibly should rethink readbalancing and do it differently
* depending on near_copies / far_copies geometry. * depending on near_copies / far_copies geometry.
*/ */
static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors) static struct md_rdev *read_balance(struct r10conf *conf,
struct r10bio *r10_bio,
int *max_sectors)
{ {
const sector_t this_sector = r10_bio->sector; const sector_t this_sector = r10_bio->sector;
int disk, slot; int disk, slot;
int sectors = r10_bio->sectors; int sectors = r10_bio->sectors;
int best_good_sectors; int best_good_sectors;
sector_t new_distance, best_dist; sector_t new_distance, best_dist;
struct md_rdev *rdev; struct md_rdev *rdev, *best_rdev;
int do_balance; int do_balance;
int best_slot; int best_slot;
...@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s ...@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
retry: retry:
sectors = r10_bio->sectors; sectors = r10_bio->sectors;
best_slot = -1; best_slot = -1;
best_rdev = NULL;
best_dist = MaxSector; best_dist = MaxSector;
best_good_sectors = 0; best_good_sectors = 0;
do_balance = 1; do_balance = 1;
...@@ -599,10 +652,16 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s ...@@ -599,10 +652,16 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
if (r10_bio->devs[slot].bio == IO_BLOCKED) if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue; continue;
disk = r10_bio->devs[slot].devnum; disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
rdev = rcu_dereference(conf->mirrors[disk].rdev); rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (rdev == NULL) if (rdev == NULL)
continue; continue;
if (!test_bit(In_sync, &rdev->flags)) if (test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
continue; continue;
dev_sector = r10_bio->devs[slot].addr; dev_sector = r10_bio->devs[slot].addr;
...@@ -627,6 +686,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s ...@@ -627,6 +686,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
if (good_sectors > best_good_sectors) { if (good_sectors > best_good_sectors) {
best_good_sectors = good_sectors; best_good_sectors = good_sectors;
best_slot = slot; best_slot = slot;
best_rdev = rdev;
} }
if (!do_balance) if (!do_balance)
/* Must read from here */ /* Must read from here */
...@@ -655,16 +715,15 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s ...@@ -655,16 +715,15 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
if (new_distance < best_dist) { if (new_distance < best_dist) {
best_dist = new_distance; best_dist = new_distance;
best_slot = slot; best_slot = slot;
best_rdev = rdev;
} }
} }
if (slot == conf->copies) if (slot >= conf->copies) {
slot = best_slot; slot = best_slot;
rdev = best_rdev;
}
if (slot >= 0) { if (slot >= 0) {
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
if (test_bit(Faulty, &rdev->flags)) { if (test_bit(Faulty, &rdev->flags)) {
/* Cannot risk returning a device that failed /* Cannot risk returning a device that failed
...@@ -675,11 +734,11 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s ...@@ -675,11 +734,11 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
} }
r10_bio->read_slot = slot; r10_bio->read_slot = slot;
} else } else
disk = -1; rdev = NULL;
rcu_read_unlock(); rcu_read_unlock();
*max_sectors = best_good_sectors; *max_sectors = best_good_sectors;
return disk; return rdev;
} }
static int raid10_congested(void *data, int bits) static int raid10_congested(void *data, int bits)
...@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf) ...@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf)
static void make_request(struct mddev *mddev, struct bio * bio) static void make_request(struct mddev *mddev, struct bio * bio)
{ {
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
struct mirror_info *mirror;
struct r10bio *r10_bio; struct r10bio *r10_bio;
struct bio *read_bio; struct bio *read_bio;
int i; int i;
...@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
/* /*
* read balancing logic: * read balancing logic:
*/ */
int disk; struct md_rdev *rdev;
int slot; int slot;
read_again: read_again:
disk = read_balance(conf, r10_bio, &max_sectors); rdev = read_balance(conf, r10_bio, &max_sectors);
slot = r10_bio->read_slot; if (!rdev) {
if (disk < 0) {
raid_end_bio_io(r10_bio); raid_end_bio_io(r10_bio);
return; return;
} }
mirror = conf->mirrors + disk; slot = r10_bio->read_slot;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
max_sectors); max_sectors);
r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
read_bio->bi_sector = r10_bio->devs[slot].addr + read_bio->bi_sector = r10_bio->devs[slot].addr +
mirror->rdev->data_offset; rdev->data_offset;
read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_bdev = rdev->bdev;
read_bio->bi_end_io = raid10_end_read_request; read_bio->bi_end_io = raid10_end_read_request;
read_bio->bi_rw = READ | do_sync; read_bio->bi_rw = READ | do_sync;
read_bio->bi_private = r10_bio; read_bio->bi_private = r10_bio;
...@@ -1025,6 +1083,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1025,6 +1083,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
*/ */
plugged = mddev_check_plugged(mddev); plugged = mddev_check_plugged(mddev);
r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
raid10_find_phys(conf, r10_bio); raid10_find_phys(conf, r10_bio);
retry_write: retry_write:
blocked_rdev = NULL; blocked_rdev = NULL;
...@@ -1034,12 +1093,25 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1034,12 +1093,25 @@ static void make_request(struct mddev *mddev, struct bio * bio)
for (i = 0; i < conf->copies; i++) { for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum; int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[d].replacement);
if (rdev == rrdev)
rrdev = NULL;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev; blocked_rdev = rdev;
break; break;
} }
if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
atomic_inc(&rrdev->nr_pending);
blocked_rdev = rrdev;
break;
}
if (rrdev && test_bit(Faulty, &rrdev->flags))
rrdev = NULL;
r10_bio->devs[i].bio = NULL; r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) { if (!rdev || test_bit(Faulty, &rdev->flags)) {
set_bit(R10BIO_Degraded, &r10_bio->state); set_bit(R10BIO_Degraded, &r10_bio->state);
continue; continue;
...@@ -1088,6 +1160,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1088,6 +1160,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
} }
r10_bio->devs[i].bio = bio; r10_bio->devs[i].bio = bio;
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
if (rrdev) {
r10_bio->devs[i].repl_bio = bio;
atomic_inc(&rrdev->nr_pending);
}
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -1096,11 +1172,23 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1096,11 +1172,23 @@ static void make_request(struct mddev *mddev, struct bio * bio)
int j; int j;
int d; int d;
for (j = 0; j < i; j++) for (j = 0; j < i; j++) {
if (r10_bio->devs[j].bio) { if (r10_bio->devs[j].bio) {
d = r10_bio->devs[j].devnum; d = r10_bio->devs[j].devnum;
rdev_dec_pending(conf->mirrors[d].rdev, mddev); rdev_dec_pending(conf->mirrors[d].rdev, mddev);
} }
if (r10_bio->devs[j].repl_bio) {
struct md_rdev *rdev;
d = r10_bio->devs[j].devnum;
rdev = conf->mirrors[d].replacement;
if (!rdev) {
/* Race with remove_disk */
smp_mb();
rdev = conf->mirrors[d].rdev;
}
rdev_dec_pending(rdev, mddev);
}
}
allow_barrier(conf); allow_barrier(conf);
md_wait_for_blocked_rdev(blocked_rdev, mddev); md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf); wait_barrier(conf);
...@@ -1147,6 +1235,31 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1147,6 +1235,31 @@ static void make_request(struct mddev *mddev, struct bio * bio)
bio_list_add(&conf->pending_bio_list, mbio); bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++; conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
if (!r10_bio->devs[i].repl_bio)
continue;
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
max_sectors);
r10_bio->devs[i].repl_bio = mbio;
/* We are actively writing to the original device
* so it cannot disappear, so the replacement cannot
* become NULL here
*/
mbio->bi_sector = (r10_bio->devs[i].addr+
conf->mirrors[d].replacement->data_offset);
mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags);
} }
/* Don't remove the bias on 'remaining' (one_write_done) until /* Don't remove the bias on 'remaining' (one_write_done) until
...@@ -1309,7 +1422,25 @@ static int raid10_spare_active(struct mddev *mddev) ...@@ -1309,7 +1422,25 @@ static int raid10_spare_active(struct mddev *mddev)
*/ */
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->mirrors + i; tmp = conf->mirrors + i;
if (tmp->rdev if (tmp->replacement
&& tmp->replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->replacement->flags)
&& !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
/* Replacement has just become active */
if (!tmp->rdev
|| !test_and_clear_bit(In_sync, &tmp->rdev->flags))
count++;
if (tmp->rdev) {
/* Replaced device not technically faulty,
* but we need to be sure it gets removed
* and never re-added.
*/
set_bit(Faulty, &tmp->rdev->flags);
sysfs_notify_dirent_safe(
tmp->rdev->sysfs_state);
}
sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
} else if (tmp->rdev
&& !test_bit(Faulty, &tmp->rdev->flags) && !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) { && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++; count++;
...@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct mirror_info *p = &conf->mirrors[mirror]; struct mirror_info *p = &conf->mirrors[mirror];
if (p->recovery_disabled == mddev->recovery_disabled) if (p->recovery_disabled == mddev->recovery_disabled)
continue; continue;
if (p->rdev) if (p->rdev) {
if (!test_bit(WantReplacement, &p->rdev->flags) ||
p->replacement != NULL)
continue; continue;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
blk_queue_max_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
}
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
...@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
return err; return err;
} }
static int raid10_remove_disk(struct mddev *mddev, int number) static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
int err = 0; int err = 0;
struct md_rdev *rdev; int number = rdev->raid_disk;
struct mirror_info *p = conf->mirrors+ number; struct md_rdev **rdevp;
struct mirror_info *p = conf->mirrors + number;
print_conf(conf); print_conf(conf);
rdev = p->rdev; if (rdev == p->rdev)
if (rdev) { rdevp = &p->rdev;
else if (rdev == p->replacement)
rdevp = &p->replacement;
else
return 0;
if (test_bit(In_sync, &rdev->flags) || if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) { atomic_read(&rdev->nr_pending)) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
/* Only remove faulty devices in recovery /* Only remove faulty devices if recovery
* is not possible. * is not possible.
*/ */
if (!test_bit(Faulty, &rdev->flags) && if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != p->recovery_disabled && mddev->recovery_disabled != p->recovery_disabled &&
(!p->replacement || p->replacement == rdev) &&
enough(conf, -1)) { enough(conf, -1)) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
p->rdev = NULL; *rdevp = NULL;
synchronize_rcu(); synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) { if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */ /* lost the race, try later */
err = -EBUSY; err = -EBUSY;
p->rdev = rdev; *rdevp = rdev;
goto abort; goto abort;
} } else if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
clear_bit(Replacement, &p->replacement->flags);
smp_mb(); /* Make sure other CPUs may see both as identical
* but will never see neither -- if they are careful.
*/
p->replacement = NULL;
clear_bit(WantReplacement, &rdev->flags);
} else
/* We might have just removed the Replacement as faulty
* Clear the flag just in case
*/
clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev); err = md_integrity_register(mddev);
}
abort: abort:
print_conf(conf); print_conf(conf);
...@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error) ...@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error)
struct r10conf *conf = r10_bio->mddev->private; struct r10conf *conf = r10_bio->mddev->private;
int d; int d;
d = find_bio_disk(conf, r10_bio, bio, NULL); d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (test_bit(BIO_UPTODATE, &bio->bi_flags)) if (test_bit(BIO_UPTODATE, &bio->bi_flags))
set_bit(R10BIO_Uptodate, &r10_bio->state); set_bit(R10BIO_Uptodate, &r10_bio->state);
...@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error) ...@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error)
sector_t first_bad; sector_t first_bad;
int bad_sectors; int bad_sectors;
int slot; int slot;
int repl;
d = find_bio_disk(conf, r10_bio, bio, &slot); struct md_rdev *rdev = NULL;
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[d].replacement;
if (!rdev) {
smp_mb();
rdev = conf->mirrors[d].rdev;
}
if (!uptodate) { if (!uptodate) {
set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); if (repl)
md_error(mddev, rdev);
else {
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state); set_bit(R10BIO_WriteError, &r10_bio->state);
} else if (is_badblock(conf->mirrors[d].rdev, }
} else if (is_badblock(rdev,
r10_bio->devs[slot].addr, r10_bio->devs[slot].addr,
r10_bio->sectors, r10_bio->sectors,
&first_bad, &bad_sectors)) &first_bad, &bad_sectors))
set_bit(R10BIO_MadeGood, &r10_bio->state); set_bit(R10BIO_MadeGood, &r10_bio->state);
rdev_dec_pending(conf->mirrors[d].rdev, mddev); rdev_dec_pending(rdev, mddev);
end_sync_request(r10_bio); end_sync_request(r10_bio);
} }
...@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
generic_make_request(tbio); generic_make_request(tbio);
} }
/* Now write out to any replacement devices
* that are active
*/
for (i = 0; i < conf->copies; i++) {
int j, d;
int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
tbio = r10_bio->devs[i].repl_bio;
if (!tbio || !tbio->bi_end_io)
continue;
if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
&& r10_bio->devs[i].bio != fbio)
for (j = 0; j < vcnt; j++)
memcpy(page_address(tbio->bi_io_vec[j].bv_page),
page_address(fbio->bi_io_vec[j].bv_page),
PAGE_SIZE);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].replacement->bdev,
tbio->bi_size >> 9);
generic_make_request(tbio);
}
done: done:
if (atomic_dec_and_test(&r10_bio->remaining)) { if (atomic_dec_and_test(&r10_bio->remaining)) {
md_done_sync(mddev, r10_bio->sectors, 1); md_done_sync(mddev, r10_bio->sectors, 1);
...@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) ...@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
s << 9, s << 9,
bio->bi_io_vec[idx].bv_page, bio->bi_io_vec[idx].bv_page,
WRITE, false); WRITE, false);
if (!ok) if (!ok) {
set_bit(WriteErrorSeen, &rdev->flags); set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement,
&rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
}
} }
if (!ok) { if (!ok) {
/* We don't worry if we cannot set a bad block - /* We don't worry if we cannot set a bad block -
...@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ {
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
int d; int d;
struct bio *wbio; struct bio *wbio, *wbio2;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
fix_recovery_read_error(r10_bio); fix_recovery_read_error(r10_bio);
...@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* share the pages with the first bio * share the pages with the first bio
* and submit the write request * and submit the write request
*/ */
wbio = r10_bio->devs[1].bio;
d = r10_bio->devs[1].devnum; d = r10_bio->devs[1].devnum;
wbio = r10_bio->devs[1].bio;
wbio2 = r10_bio->devs[1].repl_bio;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
generic_make_request(wbio); generic_make_request(wbio);
}
if (wbio2 && wbio2->bi_end_io) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
md_sync_acct(conf->mirrors[d].replacement->bdev,
wbio2->bi_size >> 9);
generic_make_request(wbio2);
}
} }
...@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, ...@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */ /* success */
return 1; return 1;
if (rw == WRITE) if (rw == WRITE) {
set_bit(WriteErrorSeen, &rdev->flags); set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
}
/* need to record an error - either for the block or the device */ /* need to record an error - either for the block or the device */
if (!rdev_set_badblocks(rdev, sector, sectors, 0)) if (!rdev_set_badblocks(rdev, sector, sectors, 0))
md_error(rdev->mddev, rdev); md_error(rdev->mddev, rdev);
...@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) ...@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
{ {
int slot = r10_bio->read_slot; int slot = r10_bio->read_slot;
int mirror = r10_bio->devs[slot].devnum;
struct bio *bio; struct bio *bio;
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
struct md_rdev *rdev; struct md_rdev *rdev = r10_bio->devs[slot].rdev;
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
unsigned long do_sync; unsigned long do_sync;
int max_sectors; int max_sectors;
...@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
fix_read_error(conf, mddev, r10_bio); fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf); unfreeze_array(conf);
} }
rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); rdev_dec_pending(rdev, mddev);
bio = r10_bio->devs[slot].bio; bio = r10_bio->devs[slot].bio;
bdevname(bio->bi_bdev, b); bdevname(bio->bi_bdev, b);
r10_bio->devs[slot].bio = r10_bio->devs[slot].bio =
mddev->ro ? IO_BLOCKED : NULL; mddev->ro ? IO_BLOCKED : NULL;
read_more: read_more:
mirror = read_balance(conf, r10_bio, &max_sectors); rdev = read_balance(conf, r10_bio, &max_sectors);
if (mirror == -1) { if (rdev == NULL) {
printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
" read error for block %llu\n", " read error for block %llu\n",
mdname(mddev), b, mdname(mddev), b,
...@@ -2103,7 +2326,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -2103,7 +2326,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
if (bio) if (bio)
bio_put(bio); bio_put(bio);
slot = r10_bio->read_slot; slot = r10_bio->read_slot;
rdev = conf->mirrors[mirror].rdev;
printk_ratelimited( printk_ratelimited(
KERN_ERR KERN_ERR
"md/raid10:%s: %s: redirecting" "md/raid10:%s: %s: redirecting"
...@@ -2117,6 +2339,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -2117,6 +2339,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
r10_bio->sector - bio->bi_sector, r10_bio->sector - bio->bi_sector,
max_sectors); max_sectors);
r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].bio = bio;
r10_bio->devs[slot].rdev = rdev;
bio->bi_sector = r10_bio->devs[slot].addr bio->bi_sector = r10_bio->devs[slot].addr
+ rdev->data_offset; + rdev->data_offset;
bio->bi_bdev = rdev->bdev; bio->bi_bdev = rdev->bdev;
...@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
r10_bio->sectors, 0)) r10_bio->sectors, 0))
md_error(conf->mddev, rdev); md_error(conf->mddev, rdev);
} }
rdev = conf->mirrors[dev].replacement;
if (r10_bio->devs[m].repl_bio == NULL)
continue;
if (test_bit(BIO_UPTODATE,
&r10_bio->devs[m].repl_bio->bi_flags)) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors);
} else {
if (!rdev_set_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0))
md_error(conf->mddev, rdev);
}
} }
put_buf(r10_bio); put_buf(r10_bio);
} else { } else {
...@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
} }
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} }
bio = r10_bio->devs[m].repl_bio;
rdev = conf->mirrors[dev].replacement;
if (rdev && bio == IO_MADE_GOOD) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors);
rdev_dec_pending(rdev, conf->mddev);
}
} }
if (test_bit(R10BIO_WriteError, if (test_bit(R10BIO_WriteError,
&r10_bio->state)) &r10_bio->state))
...@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev) ...@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev)
static int init_resync(struct r10conf *conf) static int init_resync(struct r10conf *conf)
{ {
int buffs; int buffs;
int i;
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
BUG_ON(conf->r10buf_pool); BUG_ON(conf->r10buf_pool);
conf->have_replacement = 0;
for (i = 0; i < conf->raid_disks; i++)
if (conf->mirrors[i].replacement)
conf->have_replacement = 1;
conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
if (!conf->r10buf_pool) if (!conf->r10buf_pool)
return -ENOMEM; return -ENOMEM;
...@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
bitmap_end_sync(mddev->bitmap, sect, bitmap_end_sync(mddev->bitmap, sect,
&sync_blocks, 1); &sync_blocks, 1);
} }
} else /* completed sync */ } else {
/* completed sync */
if ((!mddev->bitmap || conf->fullsync)
&& conf->have_replacement
&& test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* Completed a full sync so the replacements
* are now fully recovered.
*/
for (i = 0; i < conf->raid_disks; i++)
if (conf->mirrors[i].replacement)
conf->mirrors[i].replacement
->recovery_offset
= MaxSector;
}
conf->fullsync = 0; conf->fullsync = 0;
}
bitmap_close_sync(mddev->bitmap); bitmap_close_sync(mddev->bitmap);
close_sync(conf); close_sync(conf);
*skipped = 1; *skipped = 1;
...@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t sect; sector_t sect;
int must_sync; int must_sync;
int any_working; int any_working;
struct mirror_info *mirror = &conf->mirrors[i];
if (conf->mirrors[i].rdev == NULL ||
test_bit(In_sync, &conf->mirrors[i].rdev->flags)) if ((mirror->rdev == NULL ||
test_bit(In_sync, &mirror->rdev->flags))
&&
(mirror->replacement == NULL ||
test_bit(Faulty,
&mirror->replacement->flags)))
continue; continue;
still_degraded = 0; still_degraded = 0;
/* want to reconstruct this device */ /* want to reconstruct this device */
rb2 = r10_bio; rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i); sect = raid10_find_virt(conf, sector_nr, i);
/* Unless we are doing a full sync, we only need /* Unless we are doing a full sync, or a replacement
* to recover the block if it is set in the bitmap * we only need to recover the block if it is set in
* the bitmap
*/ */
must_sync = bitmap_start_sync(mddev->bitmap, sect, must_sync = bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, 1); &sync_blocks, 1);
if (sync_blocks < max_sync) if (sync_blocks < max_sync)
max_sync = sync_blocks; max_sync = sync_blocks;
if (!must_sync && if (!must_sync &&
mirror->replacement == NULL &&
!conf->fullsync) { !conf->fullsync) {
/* yep, skip the sync_blocks here, but don't assume /* yep, skip the sync_blocks here, but don't assume
* that there will never be anything to do here * that there will never be anything to do here
...@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_end_io = end_sync_read; bio->bi_end_io = end_sync_read;
bio->bi_rw = READ; bio->bi_rw = READ;
from_addr = r10_bio->devs[j].addr; from_addr = r10_bio->devs[j].addr;
bio->bi_sector = from_addr + bio->bi_sector = from_addr + rdev->data_offset;
conf->mirrors[d].rdev->data_offset; bio->bi_bdev = rdev->bdev;
bio->bi_bdev = conf->mirrors[d].rdev->bdev; atomic_inc(&rdev->nr_pending);
atomic_inc(&conf->mirrors[d].rdev->nr_pending); /* and we write to 'i' (if not in_sync) */
atomic_inc(&r10_bio->remaining);
/* and we write to 'i' */
for (k=0; k<conf->copies; k++) for (k=0; k<conf->copies; k++)
if (r10_bio->devs[k].devnum == i) if (r10_bio->devs[k].devnum == i)
break; break;
BUG_ON(k == conf->copies); BUG_ON(k == conf->copies);
to_addr = r10_bio->devs[k].addr;
r10_bio->devs[0].devnum = d;
r10_bio->devs[0].addr = from_addr;
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
rdev = mirror->rdev;
if (!test_bit(In_sync, &rdev->flags)) {
bio = r10_bio->devs[1].bio; bio = r10_bio->devs[1].bio;
bio->bi_next = biolist; bio->bi_next = biolist;
biolist = bio; biolist = bio;
bio->bi_private = r10_bio; bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write; bio->bi_end_io = end_sync_write;
bio->bi_rw = WRITE; bio->bi_rw = WRITE;
to_addr = r10_bio->devs[k].addr; bio->bi_sector = to_addr
bio->bi_sector = to_addr + + rdev->data_offset;
conf->mirrors[i].rdev->data_offset; bio->bi_bdev = rdev->bdev;
bio->bi_bdev = conf->mirrors[i].rdev->bdev; atomic_inc(&r10_bio->remaining);
} else
r10_bio->devs[0].devnum = d; r10_bio->devs[1].bio->bi_end_io = NULL;
r10_bio->devs[0].addr = from_addr;
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
/* and maybe write to replacement */
bio = r10_bio->devs[1].repl_bio;
if (bio)
bio->bi_end_io = NULL;
rdev = mirror->replacement;
/* Note: if rdev != NULL, then bio
* cannot be NULL as r10buf_pool_alloc will
* have allocated it.
* So the second test here is pointless.
* But it keeps semantic-checkers happy, and
* this comment keeps human reviewers
* happy.
*/
if (rdev == NULL || bio == NULL ||
test_bit(Faulty, &rdev->flags))
break;
bio->bi_next = biolist;
biolist = bio;
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio->bi_rw = WRITE;
bio->bi_sector = to_addr + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
atomic_inc(&r10_bio->remaining);
break; break;
} }
if (j == conf->copies) { if (j == conf->copies) {
...@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
for (k = 0; k < conf->copies; k++) for (k = 0; k < conf->copies; k++)
if (r10_bio->devs[k].devnum == i) if (r10_bio->devs[k].devnum == i)
break; break;
if (!rdev_set_badblocks( if (!test_bit(In_sync,
conf->mirrors[i].rdev, &mirror->rdev->flags)
&& !rdev_set_badblocks(
mirror->rdev,
r10_bio->devs[k].addr,
max_sync, 0))
any_working = 0;
if (mirror->replacement &&
!rdev_set_badblocks(
mirror->replacement,
r10_bio->devs[k].addr, r10_bio->devs[k].addr,
max_sync, 0)) max_sync, 0))
any_working = 0; any_working = 0;
...@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
printk(KERN_INFO "md/raid10:%s: insufficient " printk(KERN_INFO "md/raid10:%s: insufficient "
"working devices for recovery.\n", "working devices for recovery.\n",
mdname(mddev)); mdname(mddev));
conf->mirrors[i].recovery_disabled mirror->recovery_disabled
= mddev->recovery_disabled; = mddev->recovery_disabled;
} }
break; break;
...@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t first_bad, sector; sector_t first_bad, sector;
int bad_sectors; int bad_sectors;
if (r10_bio->devs[i].repl_bio)
r10_bio->devs[i].repl_bio->bi_end_io = NULL;
bio = r10_bio->devs[i].bio; bio = r10_bio->devs[i].bio;
bio->bi_end_io = NULL; bio->bi_end_io = NULL;
clear_bit(BIO_UPTODATE, &bio->bi_flags); clear_bit(BIO_UPTODATE, &bio->bi_flags);
...@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
conf->mirrors[d].rdev->data_offset; conf->mirrors[d].rdev->data_offset;
bio->bi_bdev = conf->mirrors[d].rdev->bdev; bio->bi_bdev = conf->mirrors[d].rdev->bdev;
count++; count++;
if (conf->mirrors[d].replacement == NULL ||
test_bit(Faulty,
&conf->mirrors[d].replacement->flags))
continue;
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
clear_bit(BIO_UPTODATE, &bio->bi_flags);
sector = r10_bio->devs[i].addr;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
bio->bi_next = biolist;
biolist = bio;
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio->bi_rw = WRITE;
bio->bi_sector = sector +
conf->mirrors[d].replacement->data_offset;
bio->bi_bdev = conf->mirrors[d].replacement->bdev;
count++;
} }
if (count < 2) { if (count < 2) {
...@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (r10_bio->devs[i].bio->bi_end_io) if (r10_bio->devs[i].bio->bi_end_io)
rdev_dec_pending(conf->mirrors[d].rdev, rdev_dec_pending(conf->mirrors[d].rdev,
mddev); mddev);
if (r10_bio->devs[i].repl_bio &&
r10_bio->devs[i].repl_bio->bi_end_io)
rdev_dec_pending(
conf->mirrors[d].replacement,
mddev);
} }
put_buf(r10_bio); put_buf(r10_bio);
biolist = NULL; biolist = NULL;
...@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev) ...@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev)
continue; continue;
disk = conf->mirrors + disk_idx; disk = conf->mirrors + disk_idx;
if (test_bit(Replacement, &rdev->flags)) {
if (disk->replacement)
goto out_free_conf;
disk->replacement = rdev;
} else {
if (disk->rdev)
goto out_free_conf;
disk->rdev = rdev;
}
disk->rdev = rdev; disk->rdev = rdev;
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
...@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev) ...@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev)
disk = conf->mirrors + i; disk = conf->mirrors + i;
if (!disk->rdev && disk->replacement) {
/* The replacement is all we have - use it */
disk->rdev = disk->replacement;
disk->replacement = NULL;
clear_bit(Replacement, &disk->rdev->flags);
}
if (!disk->rdev || if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) { !test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0; disk->head_position = 0;
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define _RAID10_H #define _RAID10_H
struct mirror_info { struct mirror_info {
struct md_rdev *rdev; struct md_rdev *rdev, *replacement;
sector_t head_position; sector_t head_position;
int recovery_disabled; /* matches int recovery_disabled; /* matches
* mddev->recovery_disabled * mddev->recovery_disabled
...@@ -18,12 +18,13 @@ struct r10conf { ...@@ -18,12 +18,13 @@ struct r10conf {
spinlock_t device_lock; spinlock_t device_lock;
/* geometry */ /* geometry */
int near_copies; /* number of copies laid out raid0 style */ int near_copies; /* number of copies laid out
* raid0 style */
int far_copies; /* number of copies laid out int far_copies; /* number of copies laid out
* at large strides across drives * at large strides across drives
*/ */
int far_offset; /* far_copies are offset by 1 stripe int far_offset; /* far_copies are offset by 1
* instead of many * stripe instead of many
*/ */
int copies; /* near_copies * far_copies. int copies; /* near_copies * far_copies.
* must be <= raid_disks * must be <= raid_disks
...@@ -34,7 +35,8 @@ struct r10conf { ...@@ -34,7 +35,8 @@ struct r10conf {
* 1 stripe. * 1 stripe.
*/ */
sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ sector_t dev_sectors; /* temp copy of
* mddev->dev_sectors */
int chunk_shift; /* shift from chunks to sectors */ int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask; sector_t chunk_mask;
...@@ -54,7 +56,9 @@ struct r10conf { ...@@ -54,7 +56,9 @@ struct r10conf {
* (fresh device added). * (fresh device added).
* Cleared when a sync completes. * Cleared when a sync completes.
*/ */
int have_replacement; /* There is at least one
* replacement device.
*/
wait_queue_head_t wait_barrier; wait_queue_head_t wait_barrier;
mempool_t *r10bio_pool; mempool_t *r10bio_pool;
...@@ -98,9 +102,16 @@ struct r10bio { ...@@ -98,9 +102,16 @@ struct r10bio {
* When resyncing we also use one for each copy. * When resyncing we also use one for each copy.
* When reconstructing, we use 2 bios, one for read, one for write. * When reconstructing, we use 2 bios, one for read, one for write.
* We choose the number when they are allocated. * We choose the number when they are allocated.
* We sometimes need an extra bio to write to the replacement.
*/ */
struct { struct {
struct bio *bio; struct bio *bio;
union {
struct bio *repl_bio; /* used for resync and
* writes */
struct md_rdev *rdev; /* used for reads
* (read_slot >= 0) */
};
sector_t addr; sector_t addr;
int devnum; int devnum;
} devs[0]; } devs[0];
...@@ -121,17 +132,19 @@ struct r10bio { ...@@ -121,17 +132,19 @@ struct r10bio {
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r10bio.state */ /* bits for r10bio.state */
#define R10BIO_Uptodate 0 enum r10bio_state {
#define R10BIO_IsSync 1 R10BIO_Uptodate,
#define R10BIO_IsRecover 2 R10BIO_IsSync,
#define R10BIO_Degraded 3 R10BIO_IsRecover,
R10BIO_Degraded,
/* Set ReadError on bios that experience a read error /* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them. * so that raid10d knows what to do with them.
*/ */
#define R10BIO_ReadError 4 R10BIO_ReadError,
/* If a write for this request means we can clear some /* If a write for this request means we can clear some
* known-bad-block records, we set this flag. * known-bad-block records, we set this flag.
*/ */
#define R10BIO_MadeGood 5 R10BIO_MadeGood,
#define R10BIO_WriteError 6 R10BIO_WriteError,
};
#endif #endif
...@@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, ...@@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
* of the two sections, and some non-in_sync devices may * of the two sections, and some non-in_sync devices may
* be insync in the section most affected by failed devices. * be insync in the section most affected by failed devices.
*/ */
static int has_failed(struct r5conf *conf) static int calc_degraded(struct r5conf *conf)
{ {
int degraded; int degraded, degraded2;
int i; int i;
if (conf->mddev->reshape_position == MaxSector)
return conf->mddev->degraded > conf->max_degraded;
rcu_read_lock(); rcu_read_lock();
degraded = 0; degraded = 0;
...@@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf) ...@@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf)
degraded++; degraded++;
} }
rcu_read_unlock(); rcu_read_unlock();
if (degraded > conf->max_degraded) if (conf->raid_disks == conf->previous_raid_disks)
return 1; return degraded;
rcu_read_lock(); rcu_read_lock();
degraded = 0; degraded2 = 0;
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
if (!rdev || test_bit(Faulty, &rdev->flags)) if (!rdev || test_bit(Faulty, &rdev->flags))
degraded++; degraded2++;
else if (test_bit(In_sync, &rdev->flags)) else if (test_bit(In_sync, &rdev->flags))
; ;
else else
...@@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf) ...@@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf)
* almost certainly hasn't. * almost certainly hasn't.
*/ */
if (conf->raid_disks <= conf->previous_raid_disks) if (conf->raid_disks <= conf->previous_raid_disks)
degraded++; degraded2++;
} }
rcu_read_unlock(); rcu_read_unlock();
if (degraded2 > degraded)
return degraded2;
return degraded;
}
static int has_failed(struct r5conf *conf)
{
int degraded;
if (conf->mddev->reshape_position == MaxSector)
return conf->mddev->degraded > conf->max_degraded;
degraded = calc_degraded(conf);
if (degraded > conf->max_degraded) if (degraded > conf->max_degraded)
return 1; return 1;
return 0; return 0;
...@@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
for (i = disks; i--; ) { for (i = disks; i--; ) {
int rw; int rw;
struct bio *bi; int replace_only = 0;
struct md_rdev *rdev; struct bio *bi, *rbi;
struct md_rdev *rdev, *rrdev = NULL;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
rw = WRITE_FUA; rw = WRITE_FUA;
...@@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rw = WRITE; rw = WRITE;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ; rw = READ;
else else if (test_and_clear_bit(R5_WantReplace,
&sh->dev[i].flags)) {
rw = WRITE;
replace_only = 1;
} else
continue; continue;
bi = &sh->dev[i].req; bi = &sh->dev[i].req;
rbi = &sh->dev[i].rreq; /* For writing to replacement */
bi->bi_rw = rw; bi->bi_rw = rw;
if (rw & WRITE) rbi->bi_rw = rw;
if (rw & WRITE) {
bi->bi_end_io = raid5_end_write_request; bi->bi_end_io = raid5_end_write_request;
else rbi->bi_end_io = raid5_end_write_request;
} else
bi->bi_end_io = raid5_end_read_request; bi->bi_end_io = raid5_end_read_request;
rcu_read_lock(); rcu_read_lock();
rrdev = rcu_dereference(conf->disks[i].replacement);
smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
rdev = rcu_dereference(conf->disks[i].rdev); rdev = rcu_dereference(conf->disks[i].rdev);
if (!rdev) {
rdev = rrdev;
rrdev = NULL;
}
if (rw & WRITE) {
if (replace_only)
rdev = NULL;
if (rdev == rrdev)
/* We raced and saw duplicates */
rrdev = NULL;
} else {
if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
rdev = rrdev;
rrdev = NULL;
}
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL; rdev = NULL;
if (rdev) if (rdev)
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
if (rrdev && test_bit(Faulty, &rrdev->flags))
rrdev = NULL;
if (rrdev)
atomic_inc(&rrdev->nr_pending);
rcu_read_unlock(); rcu_read_unlock();
/* We have already checked bad blocks for reads. Now /* We have already checked bad blocks for reads. Now
* need to check for writes. * need to check for writes. We never accept write errors
* on the replacement, so we don't need to check rrdev.
*/ */
while ((rw & WRITE) && rdev && while ((rw & WRITE) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) { test_bit(WriteErrorSeen, &rdev->flags)) {
...@@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
} }
if (rdev) { if (rdev) {
if (s->syncing || s->expanding || s->expanded) if (s->syncing || s->expanding || s->expanded
|| s->replacing)
md_sync_acct(rdev->bdev, STRIPE_SECTORS); md_sync_acct(rdev->bdev, STRIPE_SECTORS);
set_bit(STRIPE_IO_STARTED, &sh->state); set_bit(STRIPE_IO_STARTED, &sh->state);
...@@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
atomic_inc(&sh->count); atomic_inc(&sh->count);
bi->bi_sector = sh->sector + rdev->data_offset; bi->bi_sector = sh->sector + rdev->data_offset;
bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_vcnt = 1;
bi->bi_max_vecs = 1;
bi->bi_idx = 0; bi->bi_idx = 0;
bi->bi_io_vec = &sh->dev[i].vec;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0; bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE; bi->bi_size = STRIPE_SIZE;
bi->bi_next = NULL; bi->bi_next = NULL;
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
generic_make_request(bi); generic_make_request(bi);
} else { }
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
set_bit(STRIPE_IO_STARTED, &sh->state);
rbi->bi_bdev = rrdev->bdev;
pr_debug("%s: for %llu schedule op %ld on "
"replacement disc %d\n",
__func__, (unsigned long long)sh->sector,
rbi->bi_rw, i);
atomic_inc(&sh->count);
rbi->bi_sector = sh->sector + rrdev->data_offset;
rbi->bi_flags = 1 << BIO_UPTODATE;
rbi->bi_idx = 0;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_size = STRIPE_SIZE;
rbi->bi_next = NULL;
generic_make_request(rbi);
}
if (!rdev && !rrdev) {
if (rw & WRITE) if (rw & WRITE)
set_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %ld on disc %d for sector %llu\n", pr_debug("skip op %ld on disc %d for sector %llu\n",
...@@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
int disks = sh->disks, i; int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
struct md_rdev *rdev; struct md_rdev *rdev = NULL;
for (i=0 ; i<disks; i++) for (i=0 ; i<disks; i++)
...@@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error)
BUG(); BUG();
return; return;
} }
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
/* If replacement finished while this request was outstanding,
* 'replacement' might be NULL already.
* In that case it moved down to 'rdev'.
* rdev is not removed until all requests are finished.
*/
rdev = conf->disks[i].replacement;
if (!rdev)
rdev = conf->disks[i].rdev;
if (uptodate) { if (uptodate) {
set_bit(R5_UPTODATE, &sh->dev[i].flags); set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) { if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
rdev = conf->disks[i].rdev; /* Note that this cannot happen on a
* replacement device. We just fail those on
* any error
*/
printk_ratelimited( printk_ratelimited(
KERN_INFO KERN_INFO
"md/raid:%s: read error corrected" "md/raid:%s: read error corrected"
...@@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags);
} }
if (atomic_read(&conf->disks[i].rdev->read_errors)) if (atomic_read(&rdev->read_errors))
atomic_set(&conf->disks[i].rdev->read_errors, 0); atomic_set(&rdev->read_errors, 0);
} else { } else {
const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); const char *bdn = bdevname(rdev->bdev, b);
int retry = 0; int retry = 0;
rdev = conf->disks[i].rdev;
clear_bit(R5_UPTODATE, &sh->dev[i].flags); clear_bit(R5_UPTODATE, &sh->dev[i].flags);
atomic_inc(&rdev->read_errors); atomic_inc(&rdev->read_errors);
if (conf->mddev->degraded >= conf->max_degraded) if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
printk_ratelimited(
KERN_WARNING
"md/raid:%s: read error on replacement device "
"(sector %llu on %s).\n",
mdname(conf->mddev),
(unsigned long long)(sh->sector
+ rdev->data_offset),
bdn);
else if (conf->mddev->degraded >= conf->max_degraded)
printk_ratelimited( printk_ratelimited(
KERN_WARNING KERN_WARNING
"md/raid:%s: read error not correctable " "md/raid:%s: read error not correctable "
...@@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
md_error(conf->mddev, rdev); md_error(conf->mddev, rdev);
} }
} }
rdev_dec_pending(conf->disks[i].rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
clear_bit(R5_LOCKED, &sh->dev[i].flags); clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); release_stripe(sh);
...@@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error) ...@@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error)
struct stripe_head *sh = bi->bi_private; struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf; struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i; int disks = sh->disks, i;
struct md_rdev *uninitialized_var(rdev);
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
sector_t first_bad; sector_t first_bad;
int bad_sectors; int bad_sectors;
int replacement = 0;
for (i=0 ; i<disks; i++) for (i = 0 ; i < disks; i++) {
if (bi == &sh->dev[i].req) if (bi == &sh->dev[i].req) {
rdev = conf->disks[i].rdev;
break; break;
}
if (bi == &sh->dev[i].rreq) {
rdev = conf->disks[i].replacement;
if (rdev)
replacement = 1;
else
/* rdev was removed and 'replacement'
* replaced it. rdev is not removed
* until all requests are finished.
*/
rdev = conf->disks[i].rdev;
break;
}
}
pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count), (unsigned long long)sh->sector, i, atomic_read(&sh->count),
uptodate); uptodate);
...@@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error) ...@@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error)
return; return;
} }
if (replacement) {
if (!uptodate)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (!uptodate) { if (!uptodate) {
set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags); set_bit(R5_WriteError, &sh->dev[i].flags);
} else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
&first_bad, &bad_sectors)) &first_bad, &bad_sectors))
set_bit(R5_MadeGood, &sh->dev[i].flags); set_bit(R5_MadeGood, &sh->dev[i].flags);
}
rdev_dec_pending(rdev, conf->mddev);
rdev_dec_pending(conf->disks[i].rdev, conf->mddev); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags); clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); release_stripe(sh);
} }
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
static void raid5_build_block(struct stripe_head *sh, int i, int previous) static void raid5_build_block(struct stripe_head *sh, int i, int previous)
...@@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) ...@@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
dev->req.bi_io_vec = &dev->vec; dev->req.bi_io_vec = &dev->vec;
dev->req.bi_vcnt++; dev->req.bi_vcnt++;
dev->req.bi_max_vecs++; dev->req.bi_max_vecs++;
dev->req.bi_private = sh;
dev->vec.bv_page = dev->page; dev->vec.bv_page = dev->page;
dev->vec.bv_len = STRIPE_SIZE;
dev->vec.bv_offset = 0;
dev->req.bi_sector = sh->sector; bio_init(&dev->rreq);
dev->req.bi_private = sh; dev->rreq.bi_io_vec = &dev->rvec;
dev->rreq.bi_vcnt++;
dev->rreq.bi_max_vecs++;
dev->rreq.bi_private = sh;
dev->rvec.bv_page = dev->page;
dev->flags = 0; dev->flags = 0;
dev->sector = compute_blocknr(sh, i, previous); dev->sector = compute_blocknr(sh, i, previous);
...@@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
{ {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
unsigned long flags;
pr_debug("raid456: error called\n"); pr_debug("raid456: error called\n");
if (test_and_clear_bit(In_sync, &rdev->flags)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++; clear_bit(In_sync, &rdev->flags);
mddev->degraded = calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery was running, make sure it aborts.
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
set_bit(Blocked, &rdev->flags); set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags); set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
...@@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, ...@@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
md_done_sync(conf->mddev, STRIPE_SECTORS, 0); md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
clear_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_SYNCING, &sh->state);
s->syncing = 0; s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair. /* There is nothing more to do for sync/check/repair.
* For recover we need to record a bad block on all * For recover/replace we need to record a bad block on all
* non-sync devices, or abort the recovery * non-sync devices, or abort the recovery
*/ */
if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
...@@ -2373,11 +2487,17 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, ...@@ -2373,11 +2487,17 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
*/ */
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->disks[i].rdev; struct md_rdev *rdev = conf->disks[i].rdev;
if (!rdev if (rdev
|| test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
|| test_bit(In_sync, &rdev->flags)) && !test_bit(In_sync, &rdev->flags)
continue; && !rdev_set_badblocks(rdev, sh->sector,
if (!rdev_set_badblocks(rdev, sh->sector, STRIPE_SECTORS, 0))
abort = 1;
rdev = conf->disks[i].replacement;
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0)) STRIPE_SECTORS, 0))
abort = 1; abort = 1;
} }
...@@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, ...@@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
} }
} }
/*
 * want_replace - does this stripe still need copying to the replacement?
 * @sh:       stripe being handled
 * @disk_idx: slot in sh->raid_conf->disks[] to inspect
 *
 * Returns 1 when the slot has a replacement device that is neither
 * Faulty nor In_sync, and for which either the replacement's
 * recovery_offset or the array's recovery_cp is at or before
 * sh->sector.  Returns 0 in every other case, including when the slot
 * has no replacement at all.
 */
static int want_replace(struct stripe_head *sh, int disk_idx)
{
	/* Doing recovery so rcu locking not required */
	struct md_rdev *repl = sh->raid_conf->disks[disk_idx].replacement;

	/* No replacement in this slot: nothing to copy to. */
	if (!repl)
		return 0;

	/* A failed or already fully in-sync replacement needs no data. */
	if (test_bit(Faulty, &repl->flags) || test_bit(In_sync, &repl->flags))
		return 0;

	/* Both progress markers are already past this stripe. */
	if (repl->recovery_offset > sh->sector &&
	    repl->mddev->recovery_cp > sh->sector)
		return 0;

	return 1;
}
/* fetch_block - checks the given member device to see if its data needs /* fetch_block - checks the given member device to see if its data needs
* to be read or computed to satisfy a request. * to be read or computed to satisfy a request.
* *
...@@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, ...@@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(dev->toread || (dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
s->syncing || s->expanding || s->syncing || s->expanding ||
(s->replacing && want_replace(sh, disk_idx)) ||
(s->failed >= 1 && fdev[0]->toread) || (s->failed >= 1 && fdev[0]->toread) ||
(s->failed >= 2 && fdev[1]->toread) || (s->failed >= 2 && fdev[1]->toread) ||
(sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
...@@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) ...@@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
} }
} }
/* /*
* handle_stripe - do things to a stripe. * handle_stripe - do things to a stripe.
* *
* We lock the stripe and then examine the state of various bits * We lock the stripe by setting STRIPE_ACTIVE and then examine the
* to see what needs to be done. * state of various bits to see what needs to be done.
* Possible results: * Possible results:
* return some read request which now have data * return some read requests which now have data
* return some write requests which are safely on disc * return some write requests which are safely on storage
* schedule a read on some buffers * schedule a read on some buffers
* schedule a write of some buffers * schedule a write of some buffers
* return confirmation of parity correctness * return confirmation of parity correctness
* *
* buffers are taken off read_list or write_list, and bh_cache buffers
* get BH_Lock set before the stripe lock is released.
*
*/ */
static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
...@@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
int disks = sh->disks; int disks = sh->disks;
struct r5dev *dev; struct r5dev *dev;
int i; int i;
int do_recovery = 0;
memset(s, 0, sizeof(*s)); memset(s, 0, sizeof(*s));
s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
s->failed_num[0] = -1; s->failed_num[0] = -1;
...@@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
dev = &sh->dev[i]; dev = &sh->dev[i];
pr_debug("check %d: state 0x%lx read %p write %p written %p\n", pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
i, dev->flags, dev->toread, dev->towrite, dev->written); i, dev->flags,
dev->toread, dev->towrite, dev->written);
/* maybe we can reply to a read /* maybe we can reply to a read
* *
* new wantfill requests are only permitted while * new wantfill requests are only permitted while
...@@ -3035,7 +3169,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -3035,7 +3169,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
} }
if (dev->written) if (dev->written)
s->written++; s->written++;
/* Prefer to use the replacement for reads, but only
* if it is recovered enough and has no bad blocks.
*/
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
!is_badblock(rdev, sh->sector, STRIPE_SECTORS,
&first_bad, &bad_sectors))
set_bit(R5_ReadRepl, &dev->flags);
else {
if (rdev)
set_bit(R5_NeedReplace, &dev->flags);
rdev = rcu_dereference(conf->disks[i].rdev); rdev = rcu_dereference(conf->disks[i].rdev);
clear_bit(R5_ReadRepl, &dev->flags);
}
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL; rdev = NULL;
if (rdev) { if (rdev) {
...@@ -3077,20 +3225,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -3077,20 +3225,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
set_bit(R5_Insync, &dev->flags); set_bit(R5_Insync, &dev->flags);
if (rdev && test_bit(R5_WriteError, &dev->flags)) { if (rdev && test_bit(R5_WriteError, &dev->flags)) {
/* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/
struct md_rdev *rdev2 = rcu_dereference(
conf->disks[i].rdev);
if (rdev2 == rdev)
clear_bit(R5_Insync, &dev->flags); clear_bit(R5_Insync, &dev->flags);
if (!test_bit(Faulty, &rdev->flags)) { if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1; s->handle_bad_blocks = 1;
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev2->nr_pending);
} else } else
clear_bit(R5_WriteError, &dev->flags); clear_bit(R5_WriteError, &dev->flags);
} }
if (rdev && test_bit(R5_MadeGood, &dev->flags)) { if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
if (!test_bit(Faulty, &rdev->flags)) { /* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/
struct md_rdev *rdev2 = rcu_dereference(
conf->disks[i].rdev);
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1; s->handle_bad_blocks = 1;
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev2->nr_pending);
} else } else
clear_bit(R5_MadeGood, &dev->flags); clear_bit(R5_MadeGood, &dev->flags);
} }
if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
struct md_rdev *rdev2 = rcu_dereference(
conf->disks[i].replacement);
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1;
atomic_inc(&rdev2->nr_pending);
} else
clear_bit(R5_MadeGoodRepl, &dev->flags);
}
if (!test_bit(R5_Insync, &dev->flags)) { if (!test_bit(R5_Insync, &dev->flags)) {
/* The ReadError flag will just be confusing now */ /* The ReadError flag will just be confusing now */
clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReadError, &dev->flags);
...@@ -3102,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -3102,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (s->failed < 2) if (s->failed < 2)
s->failed_num[s->failed] = i; s->failed_num[s->failed] = i;
s->failed++; s->failed++;
if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1;
} }
} }
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
/* If there is a failed device being replaced,
* we must be recovering.
* else if we are after recovery_cp, we must be syncing
* else we can only be replacing
* sync and recovery both need to read all devices, and so
* use the same flag.
*/
if (do_recovery ||
sh->sector >= conf->mddev->recovery_cp)
s->syncing = 1;
else
s->replacing = 1;
}
rcu_read_unlock(); rcu_read_unlock();
} }
...@@ -3146,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3146,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh)
if (unlikely(s.blocked_rdev)) { if (unlikely(s.blocked_rdev)) {
if (s.syncing || s.expanding || s.expanded || if (s.syncing || s.expanding || s.expanded ||
s.to_write || s.written) { s.replacing || s.to_write || s.written) {
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
goto finish; goto finish;
} }
...@@ -3172,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3172,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh)
sh->reconstruct_state = 0; sh->reconstruct_state = 0;
if (s.to_read+s.to_write+s.written) if (s.to_read+s.to_write+s.written)
handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
if (s.syncing) if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s); handle_failed_sync(conf, sh, &s);
} }
...@@ -3203,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3203,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh)
*/ */
if (s.to_read || s.non_overwrite if (s.to_read || s.non_overwrite
|| (conf->level == 6 && s.to_write && s.failed) || (conf->level == 6 && s.to_write && s.failed)
|| (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) || (s.syncing && (s.uptodate + s.compute < disks))
|| s.replacing
|| s.expanding)
handle_stripe_fill(sh, &s, disks); handle_stripe_fill(sh, &s, disks);
/* Now we check to see if any write operations have recently /* Now we check to see if any write operations have recently
...@@ -3265,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3265,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh)
handle_parity_checks5(conf, sh, &s, disks); handle_parity_checks5(conf, sh, &s, disks);
} }
if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { if (s.replacing && s.locked == 0
&& !test_bit(STRIPE_INSYNC, &sh->state)) {
/* Write out to replacement devices where possible */
for (i = 0; i < conf->raid_disks; i++)
if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
set_bit(R5_WantReplace, &sh->dev[i].flags);
set_bit(R5_LOCKED, &sh->dev[i].flags);
s.locked++;
}
set_bit(STRIPE_INSYNC, &sh->state);
}
if ((s.syncing || s.replacing) && s.locked == 0 &&
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1); md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_SYNCING, &sh->state);
} }
...@@ -3363,6 +3560,15 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3363,6 +3560,15 @@ static void handle_stripe(struct stripe_head *sh)
STRIPE_SECTORS); STRIPE_SECTORS);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} }
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
rdev = conf->disks[i].replacement;
if (!rdev)
/* replacement has been moved down to rdev */
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
STRIPE_SECTORS);
rdev_dec_pending(rdev, conf->mddev);
}
} }
if (s.ops_request) if (s.ops_request)
...@@ -3586,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) ...@@ -3586,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
int dd_idx; int dd_idx;
struct bio* align_bi; struct bio* align_bi;
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t end_sector;
if (!in_chunk_boundary(mddev, raid_bio)) { if (!in_chunk_boundary(mddev, raid_bio)) {
pr_debug("chunk_aligned_read : non aligned\n"); pr_debug("chunk_aligned_read : non aligned\n");
...@@ -3610,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) ...@@ -3610,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
0, 0,
&dd_idx, NULL); &dd_idx, NULL);
end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
rcu_read_lock(); rcu_read_lock();
rdev = rcu_dereference(conf->disks[dd_idx].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags) ||
rdev->recovery_offset < end_sector) {
rdev = rcu_dereference(conf->disks[dd_idx].rdev); rdev = rcu_dereference(conf->disks[dd_idx].rdev);
if (rdev && test_bit(In_sync, &rdev->flags)) { if (rdev &&
(test_bit(Faulty, &rdev->flags) ||
!(test_bit(In_sync, &rdev->flags) ||
rdev->recovery_offset >= end_sector)))
rdev = NULL;
}
if (rdev) {
sector_t first_bad; sector_t first_bad;
int bad_sectors; int bad_sectors;
...@@ -4137,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int ...@@ -4137,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
} }
bitmap_cond_end_sync(mddev->bitmap, sector_nr); bitmap_cond_end_sync(mddev->bitmap, sector_nr);
sh = get_active_stripe(conf, sector_nr, 0, 1, 0); sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
...@@ -4208,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) ...@@ -4208,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
return handled; return handled;
} }
set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
release_stripe(sh); release_stripe(sh);
raid5_set_bi_hw_segments(raid_bio, scnt); raid5_set_bi_hw_segments(raid_bio, scnt);
...@@ -4635,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev) ...@@ -4635,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev)
continue; continue;
disk = conf->disks + raid_disk; disk = conf->disks + raid_disk;
if (test_bit(Replacement, &rdev->flags)) {
if (disk->replacement)
goto abort;
disk->replacement = rdev;
} else {
if (disk->rdev)
goto abort;
disk->rdev = rdev; disk->rdev = rdev;
}
if (test_bit(In_sync, &rdev->flags)) { if (test_bit(In_sync, &rdev->flags)) {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
...@@ -4724,6 +4947,7 @@ static int run(struct mddev *mddev) ...@@ -4724,6 +4947,7 @@ static int run(struct mddev *mddev)
int dirty_parity_disks = 0; int dirty_parity_disks = 0;
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t reshape_offset = 0; sector_t reshape_offset = 0;
int i;
if (mddev->recovery_cp != MaxSector) if (mddev->recovery_cp != MaxSector)
printk(KERN_NOTICE "md/raid:%s: not clean" printk(KERN_NOTICE "md/raid:%s: not clean"
...@@ -4813,12 +5037,25 @@ static int run(struct mddev *mddev) ...@@ -4813,12 +5037,25 @@ static int run(struct mddev *mddev)
conf->thread = NULL; conf->thread = NULL;
mddev->private = conf; mddev->private = conf;
/* for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
* 0 for a fully functional array, 1 or 2 for a degraded array. i++) {
*/ rdev = conf->disks[i].rdev;
list_for_each_entry(rdev, &mddev->disks, same_set) { if (!rdev && conf->disks[i].replacement) {
if (rdev->raid_disk < 0) /* The replacement is all we have yet */
rdev = conf->disks[i].replacement;
conf->disks[i].replacement = NULL;
clear_bit(Replacement, &rdev->flags);
conf->disks[i].rdev = rdev;
}
if (!rdev)
continue; continue;
if (conf->disks[i].replacement &&
conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */
printk(KERN_ERR "md: cannot handle concurrent "
"replacement and reshape.\n");
goto abort;
}
if (test_bit(In_sync, &rdev->flags)) { if (test_bit(In_sync, &rdev->flags)) {
working_disks++; working_disks++;
continue; continue;
...@@ -4852,8 +5089,10 @@ static int run(struct mddev *mddev) ...@@ -4852,8 +5089,10 @@ static int run(struct mddev *mddev)
dirty_parity_disks++; dirty_parity_disks++;
} }
mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) /*
- working_disks); * 0 for a fully functional array, 1 or 2 for a degraded array.
*/
mddev->degraded = calc_degraded(conf);
if (has_failed(conf)) { if (has_failed(conf)) {
printk(KERN_ERR "md/raid:%s: not enough operational devices" printk(KERN_ERR "md/raid:%s: not enough operational devices"
...@@ -5016,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev) ...@@ -5016,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev)
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i; tmp = conf->disks + i;
if (tmp->rdev if (tmp->replacement
&& tmp->replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->replacement->flags)
&& !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
/* Replacement has just become active. */
if (!tmp->rdev
|| !test_and_clear_bit(In_sync, &tmp->rdev->flags))
count++;
if (tmp->rdev) {
/* Replaced device not technically faulty,
* but we need to be sure it gets removed
* and never re-added.
*/
set_bit(Faulty, &tmp->rdev->flags);
sysfs_notify_dirent_safe(
tmp->rdev->sysfs_state);
}
sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
} else if (tmp->rdev
&& tmp->rdev->recovery_offset == MaxSector && tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags) && !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) { && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
...@@ -5025,22 +5282,28 @@ static int raid5_spare_active(struct mddev *mddev) ...@@ -5025,22 +5282,28 @@ static int raid5_spare_active(struct mddev *mddev)
} }
} }
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded -= count; mddev->degraded = calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
print_raid5_conf(conf); print_raid5_conf(conf);
return count; return count;
} }
static int raid5_remove_disk(struct mddev *mddev, int number) static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
int err = 0; int err = 0;
struct md_rdev *rdev; int number = rdev->raid_disk;
struct md_rdev **rdevp;
struct disk_info *p = conf->disks + number; struct disk_info *p = conf->disks + number;
print_raid5_conf(conf); print_raid5_conf(conf);
rdev = p->rdev; if (rdev == p->rdev)
if (rdev) { rdevp = &p->rdev;
else if (rdev == p->replacement)
rdevp = &p->replacement;
else
return 0;
if (number >= conf->raid_disks && if (number >= conf->raid_disks &&
conf->reshape_progress == MaxSector) conf->reshape_progress == MaxSector)
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
...@@ -5056,18 +5319,31 @@ static int raid5_remove_disk(struct mddev *mddev, int number) ...@@ -5056,18 +5319,31 @@ static int raid5_remove_disk(struct mddev *mddev, int number)
if (!test_bit(Faulty, &rdev->flags) && if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled && mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) && !has_failed(conf) &&
(!p->replacement || p->replacement == rdev) &&
number < conf->raid_disks) { number < conf->raid_disks) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
p->rdev = NULL; *rdevp = NULL;
synchronize_rcu(); synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) { if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */ /* lost the race, try later */
err = -EBUSY; err = -EBUSY;
p->rdev = rdev; *rdevp = rdev;
} } else if (p->replacement) {
} /* We must have just cleared 'rdev' */
p->rdev = p->replacement;
clear_bit(Replacement, &p->replacement->flags);
smp_mb(); /* Make sure other CPUs may see both as identical
* but will never see neither - if they are careful
*/
p->replacement = NULL;
clear_bit(WantReplacement, &rdev->flags);
} else
/* We might have just removed the Replacement as faulty-
* clear the bit just in case
*/
clear_bit(WantReplacement, &rdev->flags);
abort: abort:
print_raid5_conf(conf); print_raid5_conf(conf);
...@@ -5103,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -5103,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk = rdev->saved_raid_disk; disk = rdev->saved_raid_disk;
else else
disk = first; disk = first;
for ( ; disk <= last ; disk++) for ( ; disk <= last ; disk++) {
if ((p=conf->disks + disk)->rdev == NULL) { p = conf->disks + disk;
if (p->rdev == NULL) {
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk; rdev->raid_disk = disk;
err = 0; err = 0;
...@@ -5113,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -5113,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev); rcu_assign_pointer(p->rdev, rdev);
break; break;
} }
if (test_bit(WantReplacement, &p->rdev->flags) &&
p->replacement == NULL) {
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = disk;
err = 0;
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
}
}
print_raid5_conf(conf); print_raid5_conf(conf);
return err; return err;
} }
...@@ -5286,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5286,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev)
* pre and post number of devices. * pre and post number of devices.
*/ */
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) mddev->degraded = calc_degraded(conf);
- added_devices;
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
} }
mddev->raid_disks = conf->raid_disks; mddev->raid_disks = conf->raid_disks;
...@@ -5356,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev) ...@@ -5356,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev)
revalidate_disk(mddev->gendisk); revalidate_disk(mddev->gendisk);
} else { } else {
int d; int d;
mddev->degraded = conf->raid_disks; spin_lock_irq(&conf->device_lock);
for (d = 0; d < conf->raid_disks ; d++) mddev->degraded = calc_degraded(conf);
if (conf->disks[d].rdev && spin_unlock_irq(&conf->device_lock);
test_bit(In_sync,
&conf->disks[d].rdev->flags))
mddev->degraded--;
for (d = conf->raid_disks ; for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks; d < conf->raid_disks - mddev->delta_disks;
d++) { d++) {
struct md_rdev *rdev = conf->disks[d].rdev; struct md_rdev *rdev = conf->disks[d].rdev;
if (rdev && raid5_remove_disk(mddev, d) == 0) { if (rdev &&
raid5_remove_disk(mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev); sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1; rdev->raid_disk = -1;
} }
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
* The possible state transitions are: * The possible state transitions are:
* *
* Empty -> Want - on read or write to get old data for parity calc * Empty -> Want - on read or write to get old data for parity calc
* Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) * Empty -> Dirty - on compute_parity to satisfy write/sync request.
* Empty -> Clean - on compute_block when computing a block for failed drive * Empty -> Clean - on compute_block when computing a block for failed drive
* Want -> Empty - on failed read * Want -> Empty - on failed read
* Want -> Clean - on successful completion of read request * Want -> Clean - on successful completion of read request
...@@ -226,8 +226,11 @@ struct stripe_head { ...@@ -226,8 +226,11 @@ struct stripe_head {
#endif #endif
} ops; } ops;
struct r5dev { struct r5dev {
struct bio req; /* rreq and rvec are used for the replacement device when
struct bio_vec vec; * writing data to both devices.
*/
struct bio req, rreq;
struct bio_vec vec, rvec;
struct page *page; struct page *page;
struct bio *toread, *read, *towrite, *written; struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */ sector_t sector; /* sector of this page */
...@@ -239,7 +242,13 @@ struct stripe_head { ...@@ -239,7 +242,13 @@ struct stripe_head {
* for handle_stripe. * for handle_stripe.
*/ */
struct stripe_head_state { struct stripe_head_state {
int syncing, expanding, expanded; /* 'syncing' means that we need to read all devices, either
* to check/correct parity, or to reconstruct a missing device.
* 'replacing' means we are replacing one or more drives and
* the source is valid at this point so we don't need to
* read all devices, just the replacement targets.
*/
int syncing, expanding, expanded, replacing;
int locked, uptodate, to_read, to_write, failed, written; int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite; int to_fill, compute, req_compute, non_overwrite;
int failed_num[2]; int failed_num[2];
...@@ -252,38 +261,41 @@ struct stripe_head_state { ...@@ -252,38 +261,41 @@ struct stripe_head_state {
int handle_bad_blocks; int handle_bad_blocks;
}; };
/* Flags */ /* Flags for struct r5dev.flags */
#define R5_UPTODATE 0 /* page contains current data */ enum r5dev_flags {
#define R5_LOCKED 1 /* IO has been submitted on "req" */ R5_UPTODATE, /* page contains current data */
#define R5_OVERWRITE 2 /* towrite covers whole page */ R5_LOCKED, /* IO has been submitted on "req" */
R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
R5_OVERWRITE, /* towrite covers whole page */
/* and some that are internal to handle_stripe */ /* and some that are internal to handle_stripe */
#define R5_Insync 3 /* rdev && rdev->in_sync at start */ R5_Insync, /* rdev && rdev->in_sync at start */
#define R5_Wantread 4 /* want to schedule a read */ R5_Wantread, /* want to schedule a read */
#define R5_Wantwrite 5 R5_Wantwrite,
#define R5_Overlap 7 /* There is a pending overlapping request on this block */ R5_Overlap, /* There is a pending overlapping request
#define R5_ReadError 8 /* seen a read error here recently */ * on this block */
#define R5_ReWrite 9 /* have tried to over-write the readerror */ R5_ReadError, /* seen a read error here recently */
R5_ReWrite, /* have tried to over-write the readerror */
#define R5_Expanded 10 /* This block now has post-expand data */ R5_Expanded, /* This block now has post-expand data */
#define R5_Wantcompute 11 /* compute_block in progress treat as R5_Wantcompute, /* compute_block in progress treat as
* uptodate * uptodate
*/ */
#define R5_Wantfill 12 /* dev->toread contains a bio that needs R5_Wantfill, /* dev->toread contains a bio that needs
* filling * filling
*/ */
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ R5_Wantdrain, /* dev->towrite needs to be drained */
#define R5_WantFUA 14 /* Write should be FUA */ R5_WantFUA, /* Write should be FUA */
#define R5_WriteError 15 /* got a write error - need to record it */ R5_WriteError, /* got a write error - need to record it */
#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ R5_MadeGood, /* A bad block has been fixed by writing to it */
/* R5_ReadRepl, /* Will/did read from replacement rather than orig */
* Write method R5_MadeGoodRepl,/* A bad block on the replacement device has been
* fixed by writing to it */
R5_NeedReplace, /* This device has a replacement which is not
* up-to-date at this stripe. */
R5_WantReplace, /* We need to update the replacement, we have read
* data in, and now is a good time to write it out.
*/ */
#define RECONSTRUCT_WRITE 1 };
#define READ_MODIFY_WRITE 2
/* not a write method, but a compute_parity mode */
#define CHECK_PARITY 3
/* Additional compute_parity mode -- updates the parity w/o LOCKING */
#define UPDATE_PARITY 4
/* /*
* Stripe state * Stripe state
...@@ -311,13 +323,14 @@ enum { ...@@ -311,13 +323,14 @@ enum {
/* /*
* Operation request flags * Operation request flags
*/ */
#define STRIPE_OP_BIOFILL 0 enum {
#define STRIPE_OP_COMPUTE_BLK 1 STRIPE_OP_BIOFILL,
#define STRIPE_OP_PREXOR 2 STRIPE_OP_COMPUTE_BLK,
#define STRIPE_OP_BIODRAIN 3 STRIPE_OP_PREXOR,
#define STRIPE_OP_RECONSTRUCT 4 STRIPE_OP_BIODRAIN,
#define STRIPE_OP_CHECK 5 STRIPE_OP_RECONSTRUCT,
STRIPE_OP_CHECK,
};
/* /*
* Plugging: * Plugging:
* *
...@@ -344,13 +357,12 @@ enum { ...@@ -344,13 +357,12 @@ enum {
struct disk_info { struct disk_info {
struct md_rdev *rdev; struct md_rdev *rdev, *replacement;
}; };
struct r5conf { struct r5conf {
struct hlist_head *stripe_hashtbl; struct hlist_head *stripe_hashtbl;
struct mddev *mddev; struct mddev *mddev;
struct disk_info *spare;
int chunk_sectors; int chunk_sectors;
int level, algorithm; int level, algorithm;
int max_degraded; int max_degraded;
......
...@@ -277,7 +277,10 @@ struct mdp_superblock_1 { ...@@ -277,7 +277,10 @@ struct mdp_superblock_1 {
*/ */
#define MD_FEATURE_RESHAPE_ACTIVE 4 #define MD_FEATURE_RESHAPE_ACTIVE 4
#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ #define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an
#define MD_FEATURE_ALL (1|2|4|8) * active device with same 'role'.
* 'recovery_offset' is also set.
*/
#define MD_FEATURE_ALL (1|2|4|8|16)
#endif #endif
...@@ -132,7 +132,7 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, ...@@ -132,7 +132,7 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
PROT_READ|PROT_WRITE, \ PROT_READ|PROT_WRITE, \
MAP_PRIVATE|MAP_ANONYMOUS,\ MAP_PRIVATE|MAP_ANONYMOUS,\
0, 0)) 0, 0))
# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) # define free_pages(x, y) munmap((void *)(x), PAGE_SIZE << (y))
static inline void cpu_relax(void) static inline void cpu_relax(void)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment