Commit 2943c833 authored by Linus Torvalds

Merge tag 'md-3.3' of git://neil.brown.name/md

md update for 3.3

Big change is new hot-replacement.
A slot in an array can hold 2 devices - one that
wants-replacement and one that is the replacement.
Once the replacement is built - either from the
original or (in the case of errors) from elsewhere,
the wants-replacement device will be removed.

* tag 'md-3.3' of git://neil.brown.name/md: (36 commits)
  md/raid1: Mark device want_replacement when we see a write error.
  md/raid1: If there is a spare and a want_replacement device, start replacement.
  md/raid1: recognise replacements when assembling arrays.
  md/raid1: handle activation of replacement device when recovery completes.
  md/raid1: Allow a failed replacement device to be removed.
  md/raid1: Allocate spare to store replacement devices and their bios.
  md/raid1:  Replace use of mddev->raid_disks with conf->raid_disks.
  md/raid10: If there is a spare and a want_replacement device, start replacement.
  md/raid10: recognise replacements when assembling array.
  md/raid10: Allow replacement device to be replace old drive.
  md/raid10: handle recovery of replacement devices.
  md/raid10:  Handle replacement devices during resync.
  md/raid10: writes should get directed to replacement as well as original.
  md/raid10: allow removal of failed replacement devices.
  md/raid10: preferentially read from replacement device if possible.
  md/raid10:  change read_balance to return an rdev
  md/raid10: prepare data structures for handling replacement.
  md/raid5: Mark device want_replacement when we see a write error.
  md/raid5: If there is a spare and a want_replacement device, start replacement.
  md/raid5: recognise replacements when assembling array.
  ...
parents 98793265 19d67169
...@@ -357,14 +357,14 @@ Each directory contains: ...@@ -357,14 +357,14 @@ Each directory contains:
written to, that device. written to, that device.
state state
A file recording the current state of the device in the array A file recording the current state of the device in the array
which can be a comma separated list of which can be a comma separated list of
faulty - device has been kicked from active use due to faulty - device has been kicked from active use due to
a detected fault or it has unacknowledged bad a detected fault, or it has unacknowledged bad
blocks blocks
in_sync - device is a fully in-sync member of the array in_sync - device is a fully in-sync member of the array
writemostly - device will only be subject to read writemostly - device will only be subject to read
requests if there are no other options. requests if there are no other options.
This applies only to raid1 arrays. This applies only to raid1 arrays.
blocked - device has failed, and the failure hasn't been blocked - device has failed, and the failure hasn't been
acknowledged yet by the metadata handler. acknowledged yet by the metadata handler.
...@@ -374,6 +374,13 @@ Each directory contains: ...@@ -374,6 +374,13 @@ Each directory contains:
This includes spares that are in the process This includes spares that are in the process
of being recovered to of being recovered to
write_error - device has ever seen a write error. write_error - device has ever seen a write error.
want_replacement - device is (mostly) working but probably
should be replaced, either due to errors or
due to user request.
replacement - device is a replacement for another active
device with the same raid_disk.
This list may grow in future. This list may grow in future.
This can be written to. This can be written to.
Writing "faulty" simulates a failure on the device. Writing "faulty" simulates a failure on the device.
...@@ -386,6 +393,13 @@ Each directory contains: ...@@ -386,6 +393,13 @@ Each directory contains:
Writing "in_sync" sets the in_sync flag. Writing "in_sync" sets the in_sync flag.
Writing "write_error" sets writeerrorseen flag. Writing "write_error" sets writeerrorseen flag.
Writing "-write_error" clears writeerrorseen flag. Writing "-write_error" clears writeerrorseen flag.
Writing "want_replacement" is allowed at any time except to a
replacement device or a spare. It sets the flag.
Writing "-want_replacement" is allowed at any time. It clears
the flag.
Writing "replacement" or "-replacement" is only allowed before
starting the array. It sets or clears the flag.
This file responds to select/poll. Any change to 'faulty' This file responds to select/poll. Any change to 'faulty'
or 'blocked' causes an event. or 'blocked' causes an event.
......
...@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev)
return; return;
} }
if (time_before(jiffies, bitmap->daemon_lastrun if (time_before(jiffies, bitmap->daemon_lastrun
+ bitmap->mddev->bitmap_info.daemon_sleep)) + mddev->bitmap_info.daemon_sleep))
goto done; goto done;
bitmap->daemon_lastrun = jiffies; bitmap->daemon_lastrun = jiffies;
if (bitmap->allclean) { if (bitmap->allclean) {
bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
goto done; goto done;
} }
bitmap->allclean = 1; bitmap->allclean = 1;
...@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev)
* sure that events_cleared is up-to-date. * sure that events_cleared is up-to-date.
*/ */
if (bitmap->need_sync && if (bitmap->need_sync &&
bitmap->mddev->bitmap_info.external == 0) { mddev->bitmap_info.external == 0) {
bitmap_super_t *sb; bitmap_super_t *sb;
bitmap->need_sync = 0; bitmap->need_sync = 0;
sb = kmap_atomic(bitmap->sb_page, KM_USER0); sb = kmap_atomic(bitmap->sb_page, KM_USER0);
...@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev)
done: done:
if (bitmap->allclean == 0) if (bitmap->allclean == 0)
bitmap->mddev->thread->timeout = mddev->thread->timeout =
bitmap->mddev->bitmap_info.daemon_sleep; mddev->bitmap_info.daemon_sleep;
mutex_unlock(&mddev->bitmap_info.mutex); mutex_unlock(&mddev->bitmap_info.mutex);
} }
...@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n ...@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
} }
if (!*bmc) { if (!*bmc) {
struct page *page; struct page *page;
*bmc = 1 | (needed ? NEEDED_MASK : 0); *bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1); bitmap_count_page(bitmap, offset, 1);
page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
......
...@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
} }
if (sb->devflags & WriteMostly1) if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags); set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
} else /* MULTIPATH are always insync */ } else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
...@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->recovery_offset = sb->recovery_offset =
cpu_to_le64(rdev->recovery_offset); cpu_to_le64(rdev->recovery_offset);
} }
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
if (mddev->reshape_position != MaxSector) { if (mddev->reshape_position != MaxSector) {
sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
...@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page) ...@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%swrite_error", sep); len += sprintf(page+len, "%swrite_error", sep);
sep = ","; sep = ",";
} }
if (test_bit(WantReplacement, &rdev->flags)) {
len += sprintf(page+len, "%swant_replacement", sep);
sep = ",";
}
if (test_bit(Replacement, &rdev->flags)) {
len += sprintf(page+len, "%sreplacement", sep);
sep = ",";
}
return len+sprintf(page+len, "\n"); return len+sprintf(page+len, "\n");
} }
...@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "-write_error")) { } else if (cmd_match(buf, "-write_error")) {
clear_bit(WriteErrorSeen, &rdev->flags); clear_bit(WriteErrorSeen, &rdev->flags);
err = 0; err = 0;
} else if (cmd_match(buf, "want_replacement")) {
/* Any non-spare device that is not a replacement can
* become want_replacement at any time, but we then need to
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
err = 0;
} else if (cmd_match(buf, "-want_replacement")) {
/* Clearing 'want_replacement' is always allowed.
* Once replacement starts it is too late, though.
*/
err = 0;
clear_bit(WantReplacement, &rdev->flags);
} else if (cmd_match(buf, "replacement")) {
/* Can only set a device as a replacement when array has not
* yet been started. Once running, replacement is automatic
* from spares, or by assigning 'slot'.
*/
if (rdev->mddev->pers)
err = -EBUSY;
else {
set_bit(Replacement, &rdev->flags);
err = 0;
}
} else if (cmd_match(buf, "-replacement")) {
/* Similarly, can only clear Replacement before start */
if (rdev->mddev->pers)
err = -EBUSY;
else {
clear_bit(Replacement, &rdev->flags);
err = 0;
}
} }
if (!err) if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state); sysfs_notify_dirent_safe(rdev->sysfs_state);
...@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_remove_disk == NULL) if (rdev->mddev->pers->hot_remove_disk == NULL)
return -EINVAL; return -EINVAL;
err = rdev->mddev->pers-> err = rdev->mddev->pers->
hot_remove_disk(rdev->mddev, rdev->raid_disk); hot_remove_disk(rdev->mddev, rdev);
if (err) if (err)
return err; return err;
sysfs_unlink_rdev(rdev->mddev, rdev); sysfs_unlink_rdev(rdev->mddev, rdev);
...@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread); md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) { } else if (rdev->mddev->pers) {
struct md_rdev *rdev2;
/* Activating a spare .. or possibly reactivating /* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here. * if we ever get bitmaps working here.
*/ */
...@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_add_disk == NULL) if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL; return -EINVAL;
list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
if (rdev2->raid_disk == slot)
return -EEXIST;
if (slot >= rdev->mddev->raid_disks && if (slot >= rdev->mddev->raid_disks &&
slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC; return -ENOSPC;
...@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, ...@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
struct mddev *mddev = NULL; struct mddev *mddev = NULL;
int ro; int ro;
if (!capable(CAP_SYS_ADMIN)) switch (cmd) {
return -EACCES; case RAID_VERSION:
case GET_ARRAY_INFO:
case GET_DISK_INFO:
break;
default:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
}
/* /*
* Commands dealing with the RAID driver but not any * Commands dealing with the RAID driver but not any
...@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (test_bit(Faulty, &rdev->flags)) { if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)"); seq_printf(seq, "(F)");
continue; continue;
} else if (rdev->raid_disk < 0) }
if (rdev->raid_disk < 0)
seq_printf(seq, "(S)"); /* spare */ seq_printf(seq, "(S)"); /* spare */
if (test_bit(Replacement, &rdev->flags))
seq_printf(seq, "(R)");
sectors += rdev->sectors; sectors += rdev->sectors;
} }
...@@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev) ...@@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev)
! test_bit(In_sync, &rdev->flags)) && ! test_bit(In_sync, &rdev->flags)) &&
atomic_read(&rdev->nr_pending)==0) { atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk( if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) { mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev); sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1; rdev->raid_disk = -1;
} }
} }
if (mddev->degraded) { list_for_each_entry(rdev, &mddev->disks, same_set) {
list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 &&
if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags))
!test_bit(Faulty, &rdev->flags)) spares++;
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
spares++; spares++;
if (rdev->raid_disk < 0 md_new_event(mddev);
&& !test_bit(Faulty, &rdev->flags)) { set_bit(MD_CHANGE_DEVS, &mddev->flags);
rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
spares++;
md_new_event(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
} }
} }
} }
...@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev) ...@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev)
test_bit(Faulty, &rdev->flags) && test_bit(Faulty, &rdev->flags) &&
atomic_read(&rdev->nr_pending)==0) { atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk( if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) { mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev); sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1; rdev->raid_disk = -1;
} }
......
...@@ -72,34 +72,7 @@ struct md_rdev { ...@@ -72,34 +72,7 @@ struct md_rdev {
* This reduces the burden of testing multiple flags in many cases * This reduces the burden of testing multiple flags in many cases
*/ */
unsigned long flags; unsigned long flags; /* bit set of 'enum flag_bits' bits. */
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
#define AutoDetected 7 /* added by auto-detect */
#define Blocked 8 /* An error occurred but has not yet
* been acknowledged by the metadata
* handler, so don't allow writes
* until it is cleared */
#define WriteErrorSeen 9 /* A write error has been seen on this
* device
*/
#define FaultRecorded 10 /* Intermediate state for clearing
* Blocked. The Fault is/will-be
* recorded in the metadata, but that
* metadata hasn't been stored safely
* on disk yet.
*/
#define BlockedBadBlocks 11 /* A writer is blocked because they
* found an unacknowledged bad-block.
* This can safely be cleared at any
* time, and the writer will re-check.
* It may be set at any time, and at
* worst the writer will timeout and
* re-check. So setting it as
* accurately as possible is good, but
* not absolutely critical.
*/
wait_queue_head_t blocked_wait; wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */ int desc_nr; /* descriptor index in the superblock */
...@@ -152,6 +125,44 @@ struct md_rdev { ...@@ -152,6 +125,44 @@ struct md_rdev {
sector_t size; /* in sectors */ sector_t size; /* in sectors */
} badblocks; } badblocks;
}; };
/*
 * Bit numbers for the 'flags' word of struct md_rdev.
 * Each enumerator is a bit index (not a mask) and is used with the
 * test_bit()/set_bit()/clear_bit() helpers on &rdev->flags.
 * Enumerators take consecutive values starting at 0, so inserting or
 * reordering entries changes the bit each name refers to.
 */
enum flag_bits {
Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */
WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */
Blocked, /* An error occurred but has not yet
* been acknowledged by the metadata
* handler, so don't allow writes
* until it is cleared */
WriteErrorSeen, /* A write error has been seen on this
* device
*/
FaultRecorded, /* Intermediate state for clearing
* Blocked. The Fault is/will-be
* recorded in the metadata, but that
* metadata hasn't been stored safely
* on disk yet.
*/
BlockedBadBlocks, /* A writer is blocked because they
* found an unacknowledged bad-block.
* This can safely be cleared at any
* time, and the writer will re-check.
* It may be set at any time, and at
* worst the writer will timeout and
* re-check. So setting it as
* accurately as possible is good, but
* not absolutely critical.
*/
WantReplacement, /* This device is a candidate to be
* hot-replaced, either because it has
* reported some faults, or because
* of explicit request.
*/
Replacement, /* This device is a replacement for
* a want_replacement device with same
* raid_disk number.
*/
};
#define BB_LEN_MASK (0x00000000000001FFULL) #define BB_LEN_MASK (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
...@@ -428,7 +439,7 @@ struct md_personality ...@@ -428,7 +439,7 @@ struct md_personality
*/ */
void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*hot_remove_disk) (struct mddev *mddev, int number); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*spare_active) (struct mddev *mddev); int (*spare_active) (struct mddev *mddev);
sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
int (*resize) (struct mddev *mddev, sector_t sectors); int (*resize) (struct mddev *mddev, sector_t sectors);
...@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev) ...@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{ {
char nm[20]; char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk); if (!test_bit(Replacement, &rdev->flags)) {
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
return 0;
} }
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{ {
char nm[20]; char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk); if (!test_bit(Replacement, &rdev->flags)) {
sysfs_remove_link(&mddev->kobj, nm); sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
} }
/* /*
......
...@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
return err; return err;
} }
static int multipath_remove_disk(struct mddev *mddev, int number) static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct mpconf *conf = mddev->private; struct mpconf *conf = mddev->private;
int err = 0; int err = 0;
struct md_rdev *rdev; int number = rdev->raid_disk;
struct multipath_info *p = conf->multipaths + number; struct multipath_info *p = conf->multipaths + number;
print_multipath_conf(conf); print_multipath_conf(conf);
rdev = p->rdev; if (rdev == p->rdev) {
if (rdev) {
if (test_bit(In_sync, &rdev->flags) || if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) { atomic_read(&rdev->nr_pending)) {
printk(KERN_ERR "hot-remove-disk, slot %d is identified" printk(KERN_ERR "hot-remove-disk, slot %d is identified"
......
This diff is collapsed.
...@@ -12,6 +12,9 @@ struct mirror_info { ...@@ -12,6 +12,9 @@ struct mirror_info {
* pool was allocated for, so they know how much to allocate and free. * pool was allocated for, so they know how much to allocate and free.
* mddev->raid_disks cannot be used, as it can change while a pool is active * mddev->raid_disks cannot be used, as it can change while a pool is active
* These two datums are stored in a kmalloced struct. * These two datums are stored in a kmalloced struct.
* The 'raid_disks' here is twice the raid_disks in r1conf.
* This allows space so that each 'real' device can have a replacement
* in the second half of the array.
*/ */
struct pool_info { struct pool_info {
...@@ -21,7 +24,9 @@ struct pool_info { ...@@ -21,7 +24,9 @@ struct pool_info {
struct r1conf { struct r1conf {
struct mddev *mddev; struct mddev *mddev;
struct mirror_info *mirrors; struct mirror_info *mirrors; /* twice 'raid_disks' to
* allow for replacements.
*/
int raid_disks; int raid_disks;
/* When choose the best device for a read (read_balance()) /* When choose the best device for a read (read_balance())
......
This diff is collapsed.
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define _RAID10_H #define _RAID10_H
struct mirror_info { struct mirror_info {
struct md_rdev *rdev; struct md_rdev *rdev, *replacement;
sector_t head_position; sector_t head_position;
int recovery_disabled; /* matches int recovery_disabled; /* matches
* mddev->recovery_disabled * mddev->recovery_disabled
...@@ -18,12 +18,13 @@ struct r10conf { ...@@ -18,12 +18,13 @@ struct r10conf {
spinlock_t device_lock; spinlock_t device_lock;
/* geometry */ /* geometry */
int near_copies; /* number of copies laid out raid0 style */ int near_copies; /* number of copies laid out
* raid0 style */
int far_copies; /* number of copies laid out int far_copies; /* number of copies laid out
* at large strides across drives * at large strides across drives
*/ */
int far_offset; /* far_copies are offset by 1 stripe int far_offset; /* far_copies are offset by 1
* instead of many * stripe instead of many
*/ */
int copies; /* near_copies * far_copies. int copies; /* near_copies * far_copies.
* must be <= raid_disks * must be <= raid_disks
...@@ -34,10 +35,11 @@ struct r10conf { ...@@ -34,10 +35,11 @@ struct r10conf {
* 1 stripe. * 1 stripe.
*/ */
sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ sector_t dev_sectors; /* temp copy of
* mddev->dev_sectors */
int chunk_shift; /* shift from chunks to sectors */ int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask; sector_t chunk_mask;
struct list_head retry_list; struct list_head retry_list;
/* queue pending writes and submit them on unplug */ /* queue pending writes and submit them on unplug */
...@@ -45,20 +47,22 @@ struct r10conf { ...@@ -45,20 +47,22 @@ struct r10conf {
int pending_count; int pending_count;
spinlock_t resync_lock; spinlock_t resync_lock;
int nr_pending; int nr_pending;
int nr_waiting; int nr_waiting;
int nr_queued; int nr_queued;
int barrier; int barrier;
sector_t next_resync; sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed, int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added). * (fresh device added).
* Cleared when a sync completes. * Cleared when a sync completes.
*/ */
int have_replacement; /* There is at least one
* replacement device.
*/
wait_queue_head_t wait_barrier; wait_queue_head_t wait_barrier;
mempool_t *r10bio_pool; mempool_t *r10bio_pool;
mempool_t *r10buf_pool; mempool_t *r10buf_pool;
struct page *tmppage; struct page *tmppage;
/* When taking over an array from a different personality, we store /* When taking over an array from a different personality, we store
...@@ -98,11 +102,18 @@ struct r10bio { ...@@ -98,11 +102,18 @@ struct r10bio {
* When resyncing we also use one for each copy. * When resyncing we also use one for each copy.
* When reconstructing, we use 2 bios, one for read, one for write. * When reconstructing, we use 2 bios, one for read, one for write.
* We choose the number when they are allocated. * We choose the number when they are allocated.
* We sometimes need an extra bio to write to the replacement.
*/ */
struct { struct {
struct bio *bio; struct bio *bio;
sector_t addr; union {
int devnum; struct bio *repl_bio; /* used for resync and
* writes */
struct md_rdev *rdev; /* used for reads
* (read_slot >= 0) */
};
sector_t addr;
int devnum;
} devs[0]; } devs[0];
}; };
...@@ -121,17 +132,19 @@ struct r10bio { ...@@ -121,17 +132,19 @@ struct r10bio {
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r10bio.state */ /* bits for r10bio.state */
#define R10BIO_Uptodate 0 enum r10bio_state {
#define R10BIO_IsSync 1 R10BIO_Uptodate,
#define R10BIO_IsRecover 2 R10BIO_IsSync,
#define R10BIO_Degraded 3 R10BIO_IsRecover,
R10BIO_Degraded,
/* Set ReadError on bios that experience a read error /* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them. * so that raid10d knows what to do with them.
*/ */
#define R10BIO_ReadError 4 R10BIO_ReadError,
/* If a write for this request means we can clear some /* If a write for this request means we can clear some
* known-bad-block records, we set this flag. * known-bad-block records, we set this flag.
*/ */
#define R10BIO_MadeGood 5 R10BIO_MadeGood,
#define R10BIO_WriteError 6 R10BIO_WriteError,
};
#endif #endif
This diff is collapsed.
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
* The possible state transitions are: * The possible state transitions are:
* *
* Empty -> Want - on read or write to get old data for parity calc * Empty -> Want - on read or write to get old data for parity calc
* Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) * Empty -> Dirty - on compute_parity to satisfy write/sync request.
* Empty -> Clean - on compute_block when computing a block for failed drive * Empty -> Clean - on compute_block when computing a block for failed drive
* Want -> Empty - on failed read * Want -> Empty - on failed read
* Want -> Clean - on successful completion of read request * Want -> Clean - on successful completion of read request
...@@ -226,8 +226,11 @@ struct stripe_head { ...@@ -226,8 +226,11 @@ struct stripe_head {
#endif #endif
} ops; } ops;
struct r5dev { struct r5dev {
struct bio req; /* rreq and rvec are used for the replacement device when
struct bio_vec vec; * writing data to both devices.
*/
struct bio req, rreq;
struct bio_vec vec, rvec;
struct page *page; struct page *page;
struct bio *toread, *read, *towrite, *written; struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */ sector_t sector; /* sector of this page */
...@@ -239,7 +242,13 @@ struct stripe_head { ...@@ -239,7 +242,13 @@ struct stripe_head {
* for handle_stripe. * for handle_stripe.
*/ */
struct stripe_head_state { struct stripe_head_state {
int syncing, expanding, expanded; /* 'syncing' means that we need to read all devices, either
* to check/correct parity, or to reconstruct a missing device.
* 'replacing' means we are replacing one or more drives and
* the source is valid at this point so we don't need to
* read all devices, just the replacement targets.
*/
int syncing, expanding, expanded, replacing;
int locked, uptodate, to_read, to_write, failed, written; int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite; int to_fill, compute, req_compute, non_overwrite;
int failed_num[2]; int failed_num[2];
...@@ -252,38 +261,41 @@ struct stripe_head_state { ...@@ -252,38 +261,41 @@ struct stripe_head_state {
int handle_bad_blocks; int handle_bad_blocks;
}; };
/* Flags */ /* Flags for struct r5dev.flags */
#define R5_UPTODATE 0 /* page contains current data */ enum r5dev_flags {
#define R5_LOCKED 1 /* IO has been submitted on "req" */ R5_UPTODATE, /* page contains current data */
#define R5_OVERWRITE 2 /* towrite covers whole page */ R5_LOCKED, /* IO has been submitted on "req" */
R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
R5_OVERWRITE, /* towrite covers whole page */
/* and some that are internal to handle_stripe */ /* and some that are internal to handle_stripe */
#define R5_Insync 3 /* rdev && rdev->in_sync at start */ R5_Insync, /* rdev && rdev->in_sync at start */
#define R5_Wantread 4 /* want to schedule a read */ R5_Wantread, /* want to schedule a read */
#define R5_Wantwrite 5 R5_Wantwrite,
#define R5_Overlap 7 /* There is a pending overlapping request on this block */ R5_Overlap, /* There is a pending overlapping request
#define R5_ReadError 8 /* seen a read error here recently */ * on this block */
#define R5_ReWrite 9 /* have tried to over-write the readerror */ R5_ReadError, /* seen a read error here recently */
R5_ReWrite, /* have tried to over-write the readerror */
#define R5_Expanded 10 /* This block now has post-expand data */ R5_Expanded, /* This block now has post-expand data */
#define R5_Wantcompute 11 /* compute_block in progress treat as R5_Wantcompute, /* compute_block in progress treat as
* uptodate * uptodate
*/ */
#define R5_Wantfill 12 /* dev->toread contains a bio that needs R5_Wantfill, /* dev->toread contains a bio that needs
* filling * filling
*/ */
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ R5_Wantdrain, /* dev->towrite needs to be drained */
#define R5_WantFUA 14 /* Write should be FUA */ R5_WantFUA, /* Write should be FUA */
#define R5_WriteError 15 /* got a write error - need to record it */ R5_WriteError, /* got a write error - need to record it */
#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ R5_MadeGood, /* A bad block has been fixed by writing to it */
/* R5_ReadRepl, /* Will/did read from replacement rather than orig */
* Write method R5_MadeGoodRepl,/* A bad block on the replacement device has been
*/ * fixed by writing to it */
#define RECONSTRUCT_WRITE 1 R5_NeedReplace, /* This device has a replacement which is not
#define READ_MODIFY_WRITE 2 * up-to-date at this stripe. */
/* not a write method, but a compute_parity mode */ R5_WantReplace, /* We need to update the replacement, we have read
#define CHECK_PARITY 3 * data in, and now is a good time to write it out.
/* Additional compute_parity mode -- updates the parity w/o LOCKING */ */
#define UPDATE_PARITY 4 };
/* /*
* Stripe state * Stripe state
...@@ -311,13 +323,14 @@ enum { ...@@ -311,13 +323,14 @@ enum {
/* /*
* Operation request flags * Operation request flags
*/ */
#define STRIPE_OP_BIOFILL 0 enum {
#define STRIPE_OP_COMPUTE_BLK 1 STRIPE_OP_BIOFILL,
#define STRIPE_OP_PREXOR 2 STRIPE_OP_COMPUTE_BLK,
#define STRIPE_OP_BIODRAIN 3 STRIPE_OP_PREXOR,
#define STRIPE_OP_RECONSTRUCT 4 STRIPE_OP_BIODRAIN,
#define STRIPE_OP_CHECK 5 STRIPE_OP_RECONSTRUCT,
STRIPE_OP_CHECK,
};
/* /*
* Plugging: * Plugging:
* *
...@@ -344,13 +357,12 @@ enum { ...@@ -344,13 +357,12 @@ enum {
struct disk_info { struct disk_info {
struct md_rdev *rdev; struct md_rdev *rdev, *replacement;
}; };
struct r5conf { struct r5conf {
struct hlist_head *stripe_hashtbl; struct hlist_head *stripe_hashtbl;
struct mddev *mddev; struct mddev *mddev;
struct disk_info *spare;
int chunk_sectors; int chunk_sectors;
int level, algorithm; int level, algorithm;
int max_degraded; int max_degraded;
......
...@@ -277,7 +277,10 @@ struct mdp_superblock_1 { ...@@ -277,7 +277,10 @@ struct mdp_superblock_1 {
*/ */
#define MD_FEATURE_RESHAPE_ACTIVE 4 #define MD_FEATURE_RESHAPE_ACTIVE 4
#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ #define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an
#define MD_FEATURE_ALL (1|2|4|8) * active device with same 'role'.
* 'recovery_offset' is also set.
*/
#define MD_FEATURE_ALL (1|2|4|8|16)
#endif #endif
...@@ -132,7 +132,7 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, ...@@ -132,7 +132,7 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
PROT_READ|PROT_WRITE, \ PROT_READ|PROT_WRITE, \
MAP_PRIVATE|MAP_ANONYMOUS,\ MAP_PRIVATE|MAP_ANONYMOUS,\
0, 0)) 0, 0))
# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) # define free_pages(x, y) munmap((void *)(x), PAGE_SIZE << (y))
static inline void cpu_relax(void) static inline void cpu_relax(void)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment