Commit 867900b5 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:
 - A bunch of patches from Neil Brown to fix RCU usage
 - Two performance improvement patches from Tomasz Majchrzak
 - Alexey Obitotskiy fixes module refcount issue
 - Arnd Bergmann fixes time granularity
 - Cong Wang fixes a list corruption issue
 - Guoqing Jiang fixes a deadlock in md-cluster
 - A null pointer dereference fix from me
 - Song Liu fixes misuse of raid6 rmw
 - Other trivial/cleanup fixes from Guoqing Jiang and Xiao Ni

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (28 commits)
  MD: fix null pointer dereference
  raid10: improve random reads performance
  md: add missing sysfs_notify on array_state update
  Fix kernel module refcount handling
  md: use seconds granularity for error logging
  md: reduce the number of synchronize_rcu() calls when multiple devices fail.
  md: be extra careful not to take a reference to a Faulty device.
  md/multipath: add rcu protection to rdev access in multipath_status.
  md/raid5: add rcu protection to rdev accesses in raid5_status.
  md/raid5: add rcu protection to rdev accesses in want_replace
  md/raid5: add rcu protection to rdev accesses in handle_failed_sync.
  md/raid1: add rcu protection to rdev in fix_read_error
  md/raid1: small code cleanup in end_sync_write
  md/raid1: small cleanup in raid1_end_read/write_request
  md/raid10: simplify print_conf a little.
  md/raid10: minor code improvement in fix_read_error()
  md/raid10: add rcu protection to rdev access during reshape.
  md/raid10: add rcu protection to rdev access in raid10_sync_request.
  md/raid10: add rcu protection in raid10_status.
  md/raid10: fix refcount imbalance when resyncing an array with a replacement device.
  ...
parents f0c98ebc 3f35e210
...@@ -2482,8 +2482,7 @@ static int add_bound_rdev(struct md_rdev *rdev) ...@@ -2482,8 +2482,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
if (add_journal) if (add_journal)
mddev_resume(mddev); mddev_resume(mddev);
if (err) { if (err) {
unbind_rdev_from_array(rdev); md_kick_rdev_from_array(rdev);
export_rdev(rdev);
return err; return err;
} }
} }
...@@ -2600,6 +2599,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2600,6 +2599,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
else else
err = -EBUSY; err = -EBUSY;
} else if (cmd_match(buf, "remove")) { } else if (cmd_match(buf, "remove")) {
if (rdev->mddev->pers) {
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(rdev->mddev, rdev);
}
if (rdev->raid_disk >= 0) if (rdev->raid_disk >= 0)
err = -EBUSY; err = -EBUSY;
else { else {
...@@ -3176,8 +3179,7 @@ int md_rdev_init(struct md_rdev *rdev) ...@@ -3176,8 +3179,7 @@ int md_rdev_init(struct md_rdev *rdev)
rdev->data_offset = 0; rdev->data_offset = 0;
rdev->new_data_offset = 0; rdev->new_data_offset = 0;
rdev->sb_events = 0; rdev->sb_events = 0;
rdev->last_read_error.tv_sec = 0; rdev->last_read_error = 0;
rdev->last_read_error.tv_nsec = 0;
rdev->sb_loaded = 0; rdev->sb_loaded = 0;
rdev->bb_page = NULL; rdev->bb_page = NULL;
atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->nr_pending, 0);
...@@ -3583,6 +3585,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3583,6 +3585,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->to_remove = &md_redundancy_group; mddev->to_remove = &md_redundancy_group;
} }
module_put(oldpers->owner);
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0) if (rdev->raid_disk < 0)
continue; continue;
...@@ -3940,6 +3944,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3940,6 +3944,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
} else } else
err = -EBUSY; err = -EBUSY;
} }
if (!err)
sysfs_notify_dirent_safe(mddev->sysfs_state);
spin_unlock(&mddev->lock); spin_unlock(&mddev->lock);
return err ?: len; return err ?: len;
} }
...@@ -4191,7 +4197,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -4191,7 +4197,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
return err; return err;
if (mddev->pers) { if (mddev->pers) {
err = update_size(mddev, sectors); err = update_size(mddev, sectors);
md_update_sb(mddev, 1); if (err == 0)
md_update_sb(mddev, 1);
} else { } else {
if (mddev->dev_sectors == 0 || if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors) mddev->dev_sectors > sectors)
...@@ -7813,6 +7820,7 @@ void md_do_sync(struct md_thread *thread) ...@@ -7813,6 +7820,7 @@ void md_do_sync(struct md_thread *thread)
if (ret) if (ret)
goto skip; goto skip;
set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
...@@ -8151,18 +8159,11 @@ void md_do_sync(struct md_thread *thread) ...@@ -8151,18 +8159,11 @@ void md_do_sync(struct md_thread *thread)
} }
} }
skip: skip:
if (mddev_is_clustered(mddev) && /* set CHANGE_PENDING here since maybe another update is needed,
ret == 0) { * so other nodes are informed. It should be harmless for normal
/* set CHANGE_PENDING here since maybe another * raid */
* update is needed, so other nodes are informed */ set_mask_bits(&mddev->flags, 0,
set_mask_bits(&mddev->flags, 0, BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS));
BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS));
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
md_cluster_ops->resync_finish(mddev);
} else
set_bit(MD_CHANGE_DEVS, &mddev->flags);
spin_lock(&mddev->lock); spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
...@@ -8188,15 +8189,34 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -8188,15 +8189,34 @@ static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *rdev; struct md_rdev *rdev;
int spares = 0; int spares = 0;
int removed = 0; int removed = 0;
bool remove_some = false;
rdev_for_each(rdev, mddev) rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) &&
rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
test_bit(Faulty, &rdev->flags) &&
atomic_read(&rdev->nr_pending)==0) {
/* Faulty non-Blocked devices with nr_pending == 0
* never get nr_pending incremented,
* never get Faulty cleared, and never get Blocked set.
* So we can synchronize_rcu now rather than once per device
*/
remove_some = true;
set_bit(RemoveSynchronized, &rdev->flags);
}
}
if (remove_some)
synchronize_rcu();
rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) && if ((this == NULL || rdev == this) &&
rdev->raid_disk >= 0 && rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) && !test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) || ((test_bit(RemoveSynchronized, &rdev->flags) ||
(!test_bit(In_sync, &rdev->flags) && (!test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags))) && !test_bit(Journal, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0) { atomic_read(&rdev->nr_pending)==0)) {
if (mddev->pers->hot_remove_disk( if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) { mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev); sysfs_unlink_rdev(mddev, rdev);
...@@ -8204,6 +8224,10 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -8204,6 +8224,10 @@ static int remove_and_add_spares(struct mddev *mddev,
removed++; removed++;
} }
} }
if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
clear_bit(RemoveSynchronized, &rdev->flags);
}
if (removed && mddev->kobj.sd) if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded"); sysfs_notify(&mddev->kobj, NULL, "degraded");
...@@ -8506,6 +8530,11 @@ void md_reap_sync_thread(struct mddev *mddev) ...@@ -8506,6 +8530,11 @@ void md_reap_sync_thread(struct mddev *mddev)
rdev->saved_raid_disk = -1; rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
/* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can
* call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
* clustered raid */
if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
md_cluster_ops->resync_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
...@@ -8803,6 +8832,7 @@ EXPORT_SYMBOL(md_reload_sb); ...@@ -8803,6 +8832,7 @@ EXPORT_SYMBOL(md_reload_sb);
* at boot time. * at boot time.
*/ */
static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices); static LIST_HEAD(all_detected_devices);
struct detected_devices_node { struct detected_devices_node {
struct list_head list; struct list_head list;
...@@ -8816,7 +8846,9 @@ void md_autodetect_dev(dev_t dev) ...@@ -8816,7 +8846,9 @@ void md_autodetect_dev(dev_t dev)
node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
if (node_detected_dev) { if (node_detected_dev) {
node_detected_dev->dev = dev; node_detected_dev->dev = dev;
mutex_lock(&detected_devices_mutex);
list_add_tail(&node_detected_dev->list, &all_detected_devices); list_add_tail(&node_detected_dev->list, &all_detected_devices);
mutex_unlock(&detected_devices_mutex);
} else { } else {
printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
...@@ -8835,6 +8867,7 @@ static void autostart_arrays(int part) ...@@ -8835,6 +8867,7 @@ static void autostart_arrays(int part)
printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
mutex_lock(&detected_devices_mutex);
while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
i_scanned++; i_scanned++;
node_detected_dev = list_entry(all_detected_devices.next, node_detected_dev = list_entry(all_detected_devices.next,
...@@ -8853,6 +8886,7 @@ static void autostart_arrays(int part) ...@@ -8853,6 +8886,7 @@ static void autostart_arrays(int part)
list_add(&rdev->same_set, &pending_raid_disks); list_add(&rdev->same_set, &pending_raid_disks);
i_passed++; i_passed++;
} }
mutex_unlock(&detected_devices_mutex);
printk(KERN_INFO "md: Scanned %d and added %d devices.\n", printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
i_scanned, i_passed); i_scanned, i_passed);
......
...@@ -99,7 +99,7 @@ struct md_rdev { ...@@ -99,7 +99,7 @@ struct md_rdev {
atomic_t read_errors; /* number of consecutive read errors that atomic_t read_errors; /* number of consecutive read errors that
* we have tried to ignore. * we have tried to ignore.
*/ */
struct timespec last_read_error; /* monotonic time since our time64_t last_read_error; /* monotonic time since our
* last read error * last read error
*/ */
atomic_t corrected_errors; /* number of corrected read errors, atomic_t corrected_errors; /* number of corrected read errors,
...@@ -163,6 +163,11 @@ enum flag_bits { ...@@ -163,6 +163,11 @@ enum flag_bits {
* than other devices in the array * than other devices in the array
*/ */
ClusterRemove, ClusterRemove,
RemoveSynchronized, /* synchronize_rcu() was called after
* this device was known to be faulty,
* so it is safe to remove without
* another synchronize_rcu() call.
*/
}; };
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
...@@ -204,6 +209,9 @@ struct mddev { ...@@ -204,6 +209,9 @@ struct mddev {
#define MD_RELOAD_SB 7 /* Reload the superblock because another node #define MD_RELOAD_SB 7 /* Reload the superblock because another node
* updated it. * updated it.
*/ */
#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
* already took resync lock, need to
* release the lock */
int suspended; int suspended;
atomic_t active_io; atomic_t active_io;
......
...@@ -43,7 +43,8 @@ static int multipath_map (struct mpconf *conf) ...@@ -43,7 +43,8 @@ static int multipath_map (struct mpconf *conf)
rcu_read_lock(); rcu_read_lock();
for (i = 0; i < disks; i++) { for (i = 0; i < disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
if (rdev && test_bit(In_sync, &rdev->flags)) { if (rdev && test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
rcu_read_unlock(); rcu_read_unlock();
return i; return i;
...@@ -141,17 +142,19 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) ...@@ -141,17 +142,19 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
return; return;
} }
static void multipath_status (struct seq_file *seq, struct mddev *mddev) static void multipath_status(struct seq_file *seq, struct mddev *mddev)
{ {
struct mpconf *conf = mddev->private; struct mpconf *conf = mddev->private;
int i; int i;
seq_printf (seq, " [%d/%d] [", conf->raid_disks, seq_printf (seq, " [%d/%d] [", conf->raid_disks,
conf->raid_disks - mddev->degraded); conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++) rcu_read_lock();
seq_printf (seq, "%s", for (i = 0; i < conf->raid_disks; i++) {
conf->multipaths[i].rdev && struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
rcu_read_unlock();
seq_printf (seq, "]"); seq_printf (seq, "]");
} }
...@@ -295,12 +298,14 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -295,12 +298,14 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort; goto abort;
} }
p->rdev = NULL; p->rdev = NULL;
synchronize_rcu(); if (!test_bit(RemoveSynchronized, &rdev->flags)) {
if (atomic_read(&rdev->nr_pending)) { synchronize_rcu();
/* lost the race, try later */ if (atomic_read(&rdev->nr_pending)) {
err = -EBUSY; /* lost the race, try later */
p->rdev = rdev; err = -EBUSY;
goto abort; p->rdev = rdev;
goto abort;
}
} }
err = md_integrity_register(mddev); err = md_integrity_register(mddev);
} }
......
...@@ -319,14 +319,13 @@ static void raid1_end_read_request(struct bio *bio) ...@@ -319,14 +319,13 @@ static void raid1_end_read_request(struct bio *bio)
{ {
int uptodate = !bio->bi_error; int uptodate = !bio->bi_error;
struct r1bio *r1_bio = bio->bi_private; struct r1bio *r1_bio = bio->bi_private;
int mirror;
struct r1conf *conf = r1_bio->mddev->private; struct r1conf *conf = r1_bio->mddev->private;
struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
mirror = r1_bio->read_disk;
/* /*
* this branch is our 'one mirror IO has finished' event handler: * this branch is our 'one mirror IO has finished' event handler:
*/ */
update_head_pos(mirror, r1_bio); update_head_pos(r1_bio->read_disk, r1_bio);
if (uptodate) if (uptodate)
set_bit(R1BIO_Uptodate, &r1_bio->state); set_bit(R1BIO_Uptodate, &r1_bio->state);
...@@ -339,14 +338,14 @@ static void raid1_end_read_request(struct bio *bio) ...@@ -339,14 +338,14 @@ static void raid1_end_read_request(struct bio *bio)
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
if (r1_bio->mddev->degraded == conf->raid_disks || if (r1_bio->mddev->degraded == conf->raid_disks ||
(r1_bio->mddev->degraded == conf->raid_disks-1 && (r1_bio->mddev->degraded == conf->raid_disks-1 &&
test_bit(In_sync, &conf->mirrors[mirror].rdev->flags))) test_bit(In_sync, &rdev->flags)))
uptodate = 1; uptodate = 1;
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
} }
if (uptodate) { if (uptodate) {
raid_end_bio_io(r1_bio); raid_end_bio_io(r1_bio);
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} else { } else {
/* /*
* oops, read error: * oops, read error:
...@@ -356,7 +355,7 @@ static void raid1_end_read_request(struct bio *bio) ...@@ -356,7 +355,7 @@ static void raid1_end_read_request(struct bio *bio)
KERN_ERR "md/raid1:%s: %s: " KERN_ERR "md/raid1:%s: %s: "
"rescheduling sector %llu\n", "rescheduling sector %llu\n",
mdname(conf->mddev), mdname(conf->mddev),
bdevname(conf->mirrors[mirror].rdev->bdev, bdevname(rdev->bdev,
b), b),
(unsigned long long)r1_bio->sector); (unsigned long long)r1_bio->sector);
set_bit(R1BIO_ReadError, &r1_bio->state); set_bit(R1BIO_ReadError, &r1_bio->state);
...@@ -403,20 +402,18 @@ static void r1_bio_write_done(struct r1bio *r1_bio) ...@@ -403,20 +402,18 @@ static void r1_bio_write_done(struct r1bio *r1_bio)
static void raid1_end_write_request(struct bio *bio) static void raid1_end_write_request(struct bio *bio)
{ {
struct r1bio *r1_bio = bio->bi_private; struct r1bio *r1_bio = bio->bi_private;
int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
struct r1conf *conf = r1_bio->mddev->private; struct r1conf *conf = r1_bio->mddev->private;
struct bio *to_put = NULL; struct bio *to_put = NULL;
int mirror = find_bio_disk(r1_bio, bio);
mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev;
/* /*
* 'one mirror IO has finished' event handler: * 'one mirror IO has finished' event handler:
*/ */
if (bio->bi_error) { if (bio->bi_error) {
set_bit(WriteErrorSeen, set_bit(WriteErrorSeen, &rdev->flags);
&conf->mirrors[mirror].rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags))
if (!test_and_set_bit(WantReplacement,
&conf->mirrors[mirror].rdev->flags))
set_bit(MD_RECOVERY_NEEDED, & set_bit(MD_RECOVERY_NEEDED, &
conf->mddev->recovery); conf->mddev->recovery);
...@@ -445,13 +442,12 @@ static void raid1_end_write_request(struct bio *bio) ...@@ -445,13 +442,12 @@ static void raid1_end_write_request(struct bio *bio)
* before rdev->recovery_offset, but for simplicity we don't * before rdev->recovery_offset, but for simplicity we don't
* check this here. * check this here.
*/ */
if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) && if (test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)) !test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_Uptodate, &r1_bio->state); set_bit(R1BIO_Uptodate, &r1_bio->state);
/* Maybe we can clear some bad blocks. */ /* Maybe we can clear some bad blocks. */
if (is_badblock(conf->mirrors[mirror].rdev, if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
r1_bio->sector, r1_bio->sectors,
&first_bad, &bad_sectors)) { &first_bad, &bad_sectors)) {
r1_bio->bios[mirror] = IO_MADE_GOOD; r1_bio->bios[mirror] = IO_MADE_GOOD;
set_bit(R1BIO_MadeGood, &r1_bio->state); set_bit(R1BIO_MadeGood, &r1_bio->state);
...@@ -459,7 +455,7 @@ static void raid1_end_write_request(struct bio *bio) ...@@ -459,7 +455,7 @@ static void raid1_end_write_request(struct bio *bio)
} }
if (behind) { if (behind) {
if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining); atomic_dec(&r1_bio->behind_remaining);
/* /*
...@@ -483,8 +479,7 @@ static void raid1_end_write_request(struct bio *bio) ...@@ -483,8 +479,7 @@ static void raid1_end_write_request(struct bio *bio)
} }
} }
if (r1_bio->bios[mirror] == NULL) if (r1_bio->bios[mirror] == NULL)
rdev_dec_pending(conf->mirrors[mirror].rdev, rdev_dec_pending(rdev, conf->mddev);
conf->mddev);
/* /*
* Let's see if all mirrored write operations have finished * Let's see if all mirrored write operations have finished
...@@ -689,13 +684,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect ...@@ -689,13 +684,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if (!rdev) if (!rdev)
goto retry; goto retry;
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
if (test_bit(Faulty, &rdev->flags)) {
/* cannot risk returning a device that failed
* before we inc'ed nr_pending
*/
rdev_dec_pending(rdev, conf->mddev);
goto retry;
}
sectors = best_good_sectors; sectors = best_good_sectors;
if (conf->mirrors[best_disk].next_seq_sect != this_sector) if (conf->mirrors[best_disk].next_seq_sect != this_sector)
...@@ -1666,13 +1654,16 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1666,13 +1654,16 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort; goto abort;
} }
p->rdev = NULL; p->rdev = NULL;
synchronize_rcu(); if (!test_bit(RemoveSynchronized, &rdev->flags)) {
if (atomic_read(&rdev->nr_pending)) { synchronize_rcu();
/* lost the race, try later */ if (atomic_read(&rdev->nr_pending)) {
err = -EBUSY; /* lost the race, try later */
p->rdev = rdev; err = -EBUSY;
goto abort; p->rdev = rdev;
} else if (conf->mirrors[conf->raid_disks + number].rdev) { goto abort;
}
}
if (conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced. /* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before * Move down the replacement. We drain all IO before
* doing this to avoid confusion. * doing this to avoid confusion.
...@@ -1719,11 +1710,9 @@ static void end_sync_write(struct bio *bio) ...@@ -1719,11 +1710,9 @@ static void end_sync_write(struct bio *bio)
struct r1bio *r1_bio = bio->bi_private; struct r1bio *r1_bio = bio->bi_private;
struct mddev *mddev = r1_bio->mddev; struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
int mirror=0;
sector_t first_bad; sector_t first_bad;
int bad_sectors; int bad_sectors;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
mirror = find_bio_disk(r1_bio, bio);
if (!uptodate) { if (!uptodate) {
sector_t sync_blocks = 0; sector_t sync_blocks = 0;
...@@ -1736,16 +1725,12 @@ static void end_sync_write(struct bio *bio) ...@@ -1736,16 +1725,12 @@ static void end_sync_write(struct bio *bio)
s += sync_blocks; s += sync_blocks;
sectors_to_go -= sync_blocks; sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0); } while (sectors_to_go > 0);
set_bit(WriteErrorSeen, set_bit(WriteErrorSeen, &rdev->flags);
&conf->mirrors[mirror].rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags))
if (!test_and_set_bit(WantReplacement,
&conf->mirrors[mirror].rdev->flags))
set_bit(MD_RECOVERY_NEEDED, & set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery); mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state); set_bit(R1BIO_WriteError, &r1_bio->state);
} else if (is_badblock(conf->mirrors[mirror].rdev, } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
r1_bio->sector,
r1_bio->sectors,
&first_bad, &bad_sectors) && &first_bad, &bad_sectors) &&
!is_badblock(conf->mirrors[r1_bio->read_disk].rdev, !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
r1_bio->sector, r1_bio->sector,
...@@ -2072,29 +2057,30 @@ static void fix_read_error(struct r1conf *conf, int read_disk, ...@@ -2072,29 +2057,30 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
s = PAGE_SIZE >> 9; s = PAGE_SIZE >> 9;
do { do {
/* Note: no rcu protection needed here
* as this is synchronous in the raid1d thread
* which is the thread that might remove
* a device. If raid1d ever becomes multi-threaded....
*/
sector_t first_bad; sector_t first_bad;
int bad_sectors; int bad_sectors;
rdev = conf->mirrors[d].rdev; rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev && if (rdev &&
(test_bit(In_sync, &rdev->flags) || (test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) && (!test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sect + s)) && rdev->recovery_offset >= sect + s)) &&
is_badblock(rdev, sect, s, is_badblock(rdev, sect, s,
&first_bad, &bad_sectors) == 0 && &first_bad, &bad_sectors) == 0) {
sync_page_io(rdev, sect, s<<9, atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, 0, false)) conf->tmppage, REQ_OP_READ, 0, false))
success = 1; success = 1;
else { rdev_dec_pending(rdev, mddev);
d++; if (success)
if (d == conf->raid_disks * 2) break;
d = 0; } else
} rcu_read_unlock();
d++;
if (d == conf->raid_disks * 2)
d = 0;
} while (!success && d != read_disk); } while (!success && d != read_disk);
if (!success) { if (!success) {
...@@ -2110,11 +2096,17 @@ static void fix_read_error(struct r1conf *conf, int read_disk, ...@@ -2110,11 +2096,17 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
if (d==0) if (d==0)
d = conf->raid_disks * 2; d = conf->raid_disks * 2;
d--; d--;
rdev = conf->mirrors[d].rdev; rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev && if (rdev &&
!test_bit(Faulty, &rdev->flags)) !test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
r1_sync_page_io(rdev, sect, s, r1_sync_page_io(rdev, sect, s,
conf->tmppage, WRITE); conf->tmppage, WRITE);
rdev_dec_pending(rdev, mddev);
} else
rcu_read_unlock();
} }
d = start; d = start;
while (d != read_disk) { while (d != read_disk) {
...@@ -2122,9 +2114,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk, ...@@ -2122,9 +2114,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
if (d==0) if (d==0)
d = conf->raid_disks * 2; d = conf->raid_disks * 2;
d--; d--;
rdev = conf->mirrors[d].rdev; rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev && if (rdev &&
!test_bit(Faulty, &rdev->flags)) { !test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
if (r1_sync_page_io(rdev, sect, s, if (r1_sync_page_io(rdev, sect, s,
conf->tmppage, READ)) { conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors); atomic_add(s, &rdev->corrected_errors);
...@@ -2133,10 +2128,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk, ...@@ -2133,10 +2128,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
"(%d sectors at %llu on %s)\n", "(%d sectors at %llu on %s)\n",
mdname(mddev), s, mdname(mddev), s,
(unsigned long long)(sect + (unsigned long long)(sect +
rdev->data_offset), rdev->data_offset),
bdevname(rdev->bdev, b)); bdevname(rdev->bdev, b));
} }
} rdev_dec_pending(rdev, mddev);
} else
rcu_read_unlock();
} }
sectors -= s; sectors -= s;
sect += s; sect += s;
...@@ -2534,6 +2531,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2534,6 +2531,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
return sync_blocks; return sync_blocks;
} }
/*
* If there is non-resync activity waiting for a turn, then let it
* though before starting on this new sync request.
*/
if (conf->nr_waiting)
schedule_timeout_uninterruptible(1);
/* we are incrementing sector_nr below. To be safe, we check against /* we are incrementing sector_nr below. To be safe, we check against
* sector_nr + two times RESYNC_SECTORS * sector_nr + two times RESYNC_SECTORS
*/ */
......
This diff is collapsed.
...@@ -64,10 +64,11 @@ struct r10conf { ...@@ -64,10 +64,11 @@ struct r10conf {
int pending_count; int pending_count;
spinlock_t resync_lock; spinlock_t resync_lock;
int nr_pending; atomic_t nr_pending;
int nr_waiting; int nr_waiting;
int nr_queued; int nr_queued;
int barrier; int barrier;
int array_freeze_pending;
sector_t next_resync; sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed, int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added). * (fresh device added).
......
...@@ -3080,7 +3080,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, ...@@ -3080,7 +3080,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct md_rdev *rdev; struct md_rdev *rdev;
rcu_read_lock(); rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev); rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && test_bit(In_sync, &rdev->flags)) if (rdev && test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
else else
rdev = NULL; rdev = NULL;
...@@ -3210,15 +3211,16 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, ...@@ -3210,15 +3211,16 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
/* During recovery devices cannot be removed, so /* During recovery devices cannot be removed, so
* locking and refcounting of rdevs is not needed * locking and refcounting of rdevs is not needed
*/ */
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->disks[i].rdev; struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev if (rdev
&& !test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector, && !rdev_set_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0)) STRIPE_SECTORS, 0))
abort = 1; abort = 1;
rdev = conf->disks[i].replacement; rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev if (rdev
&& !test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags)
...@@ -3226,6 +3228,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, ...@@ -3226,6 +3228,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
STRIPE_SECTORS, 0)) STRIPE_SECTORS, 0))
abort = 1; abort = 1;
} }
rcu_read_unlock();
if (abort) if (abort)
conf->recovery_disabled = conf->recovery_disabled =
conf->mddev->recovery_disabled; conf->mddev->recovery_disabled;
...@@ -3237,15 +3240,16 @@ static int want_replace(struct stripe_head *sh, int disk_idx) ...@@ -3237,15 +3240,16 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
{ {
struct md_rdev *rdev; struct md_rdev *rdev;
int rv = 0; int rv = 0;
/* Doing recovery so rcu locking not required */
rdev = sh->raid_conf->disks[disk_idx].replacement; rcu_read_lock();
rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
if (rdev if (rdev
&& !test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags)
&& (rdev->recovery_offset <= sh->sector && (rdev->recovery_offset <= sh->sector
|| rdev->mddev->recovery_cp <= sh->sector)) || rdev->mddev->recovery_cp <= sh->sector))
rv = 1; rv = 1;
rcu_read_unlock();
return rv; return rv;
} }
...@@ -3600,7 +3604,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, ...@@ -3600,7 +3604,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
pr_debug("for sector %llu, rmw=%d rcw=%d\n", pr_debug("for sector %llu, rmw=%d rcw=%d\n",
(unsigned long long)sh->sector, rmw, rcw); (unsigned long long)sh->sector, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */ /* prefer read-modify-write, but need to get some data */
if (conf->mddev->queue) if (conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue, blk_add_trace_msg(conf->mddev->queue,
...@@ -3627,7 +3631,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, ...@@ -3627,7 +3631,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
} }
} }
} }
if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
/* want reconstruct write, but need to get some data */ /* want reconstruct write, but need to get some data */
int qread =0; int qread =0;
rcw = 0; rcw = 0;
...@@ -7066,10 +7070,12 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) ...@@ -7066,10 +7070,12 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
conf->chunk_sectors / 2, mddev->layout); conf->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++) rcu_read_lock();
seq_printf (seq, "%s", for (i = 0; i < conf->raid_disks; i++) {
conf->disks[i].rdev && struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
rcu_read_unlock();
seq_printf (seq, "]"); seq_printf (seq, "]");
} }
...@@ -7191,12 +7197,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -7191,12 +7197,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort; goto abort;
} }
*rdevp = NULL; *rdevp = NULL;
synchronize_rcu(); if (!test_bit(RemoveSynchronized, &rdev->flags)) {
if (atomic_read(&rdev->nr_pending)) { synchronize_rcu();
/* lost the race, try later */ if (atomic_read(&rdev->nr_pending)) {
err = -EBUSY; /* lost the race, try later */
*rdevp = rdev; err = -EBUSY;
} else if (p->replacement) { *rdevp = rdev;
}
}
if (p->replacement) {
/* We must have just cleared 'rdev' */ /* We must have just cleared 'rdev' */
p->rdev = p->replacement; p->rdev = p->replacement;
clear_bit(Replacement, &p->replacement->flags); clear_bit(Replacement, &p->replacement->flags);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment