Commit c4c1663b authored by NeilBrown

md/raid5: replace sh->lock with an 'active' flag.

sh->lock is now mainly used to ensure that two threads aren't running
in the locked part of handle_stripe[56] at the same time.

That can more neatly be achieved with an 'active' flag which we set
while running handle_stripe.  If we find the flag is set, we simply
requeue the stripe for later by setting STRIPE_HANDLE.

For safety we take ->device_lock while examining the state of the
stripe and creating a summary in 'stripe_head_state / r6_state'.
This possibly isn't needed but as shared fields like ->toread,
->towrite are checked it is safer for now at least.

We leave the label after the old 'unlock' called "unlock" because it
will disappear in a few patches, so renaming seems pointless.

This leaves the stripe 'locked' for longer as we clear STRIPE_ACTIVE
later, but that is not a problem.
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
parent cbe47ec5
...@@ -1020,14 +1020,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) ...@@ -1020,14 +1020,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
struct bio *wbi; struct bio *wbi;
spin_lock(&sh->lock);
spin_lock_irq(&sh->raid_conf->device_lock); spin_lock_irq(&sh->raid_conf->device_lock);
chosen = dev->towrite; chosen = dev->towrite;
dev->towrite = NULL; dev->towrite = NULL;
BUG_ON(dev->written); BUG_ON(dev->written);
wbi = dev->written = chosen; wbi = dev->written = chosen;
spin_unlock_irq(&sh->raid_conf->device_lock); spin_unlock_irq(&sh->raid_conf->device_lock);
spin_unlock(&sh->lock);
while (wbi && wbi->bi_sector < while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) { dev->sector + STRIPE_SECTORS) {
...@@ -1322,7 +1320,6 @@ static int grow_one_stripe(raid5_conf_t *conf) ...@@ -1322,7 +1320,6 @@ static int grow_one_stripe(raid5_conf_t *conf)
return 0; return 0;
sh->raid_conf = conf; sh->raid_conf = conf;
spin_lock_init(&sh->lock);
#ifdef CONFIG_MULTICORE_RAID456 #ifdef CONFIG_MULTICORE_RAID456
init_waitqueue_head(&sh->ops.wait_for_ops); init_waitqueue_head(&sh->ops.wait_for_ops);
#endif #endif
...@@ -1442,7 +1439,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) ...@@ -1442,7 +1439,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
break; break;
nsh->raid_conf = conf; nsh->raid_conf = conf;
spin_lock_init(&nsh->lock);
#ifdef CONFIG_MULTICORE_RAID456 #ifdef CONFIG_MULTICORE_RAID456
init_waitqueue_head(&nsh->ops.wait_for_ops); init_waitqueue_head(&nsh->ops.wait_for_ops);
#endif #endif
...@@ -2148,7 +2144,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in ...@@ -2148,7 +2144,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
(unsigned long long)sh->sector); (unsigned long long)sh->sector);
spin_lock(&sh->lock);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
if (forwrite) { if (forwrite) {
bip = &sh->dev[dd_idx].towrite; bip = &sh->dev[dd_idx].towrite;
...@@ -2184,7 +2179,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in ...@@ -2184,7 +2179,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
} }
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
spin_unlock(&sh->lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
(unsigned long long)(*bip)->bi_sector, (unsigned long long)(*bip)->bi_sector,
...@@ -2201,7 +2195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in ...@@ -2201,7 +2195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
overlap: overlap:
set_bit(R5_Overlap, &sh->dev[dd_idx].flags); set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
spin_unlock(&sh->lock);
return 0; return 0;
} }
...@@ -3023,12 +3016,10 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -3023,12 +3016,10 @@ static void handle_stripe5(struct stripe_head *sh)
atomic_read(&sh->count), sh->pd_idx, sh->check_state, atomic_read(&sh->count), sh->pd_idx, sh->check_state,
sh->reconstruct_state); sh->reconstruct_state);
spin_lock(&sh->lock);
if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state); set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state);
} }
clear_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state);
s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
...@@ -3037,6 +3028,7 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -3037,6 +3028,7 @@ static void handle_stripe5(struct stripe_head *sh)
/* Now to look around and see what can be done */ /* Now to look around and see what can be done */
rcu_read_lock(); rcu_read_lock();
spin_lock_irq(&conf->device_lock);
for (i=disks; i--; ) { for (i=disks; i--; ) {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
...@@ -3099,6 +3091,7 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -3099,6 +3091,7 @@ static void handle_stripe5(struct stripe_head *sh)
s.failed_num = i; s.failed_num = i;
} }
} }
spin_unlock_irq(&conf->device_lock);
rcu_read_unlock(); rcu_read_unlock();
if (unlikely(blocked_rdev)) { if (unlikely(blocked_rdev)) {
...@@ -3275,7 +3268,6 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -3275,7 +3268,6 @@ static void handle_stripe5(struct stripe_head *sh)
handle_stripe_expansion(conf, sh, NULL); handle_stripe_expansion(conf, sh, NULL);
unlock: unlock:
spin_unlock(&sh->lock);
/* wait for this device to become unblocked */ /* wait for this device to become unblocked */
if (unlikely(blocked_rdev)) if (unlikely(blocked_rdev))
...@@ -3318,12 +3310,10 @@ static void handle_stripe6(struct stripe_head *sh) ...@@ -3318,12 +3310,10 @@ static void handle_stripe6(struct stripe_head *sh)
sh->check_state, sh->reconstruct_state); sh->check_state, sh->reconstruct_state);
memset(&s, 0, sizeof(s)); memset(&s, 0, sizeof(s));
spin_lock(&sh->lock);
if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state); set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state);
} }
clear_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state);
s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
...@@ -3332,6 +3322,7 @@ static void handle_stripe6(struct stripe_head *sh) ...@@ -3332,6 +3322,7 @@ static void handle_stripe6(struct stripe_head *sh)
/* Now to look around and see what can be done */ /* Now to look around and see what can be done */
rcu_read_lock(); rcu_read_lock();
spin_lock_irq(&conf->device_lock);
for (i=disks; i--; ) { for (i=disks; i--; ) {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
dev = &sh->dev[i]; dev = &sh->dev[i];
...@@ -3395,6 +3386,7 @@ static void handle_stripe6(struct stripe_head *sh) ...@@ -3395,6 +3386,7 @@ static void handle_stripe6(struct stripe_head *sh)
s.failed++; s.failed++;
} }
} }
spin_unlock_irq(&conf->device_lock);
rcu_read_unlock(); rcu_read_unlock();
if (unlikely(blocked_rdev)) { if (unlikely(blocked_rdev)) {
...@@ -3580,7 +3572,6 @@ static void handle_stripe6(struct stripe_head *sh) ...@@ -3580,7 +3572,6 @@ static void handle_stripe6(struct stripe_head *sh)
handle_stripe_expansion(conf, sh, &r6s); handle_stripe_expansion(conf, sh, &r6s);
unlock: unlock:
spin_unlock(&sh->lock);
/* wait for this device to become unblocked */ /* wait for this device to become unblocked */
if (unlikely(blocked_rdev)) if (unlikely(blocked_rdev))
...@@ -3608,10 +3599,19 @@ static void handle_stripe6(struct stripe_head *sh) ...@@ -3608,10 +3599,19 @@ static void handle_stripe6(struct stripe_head *sh)
static void handle_stripe(struct stripe_head *sh) static void handle_stripe(struct stripe_head *sh)
{ {
clear_bit(STRIPE_HANDLE, &sh->state);
if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
/* already being handled, ensure it gets handled
* again when current action finishes */
set_bit(STRIPE_HANDLE, &sh->state);
return;
}
if (sh->raid_conf->level == 6) if (sh->raid_conf->level == 6)
handle_stripe6(sh); handle_stripe6(sh);
else else
handle_stripe5(sh); handle_stripe5(sh);
clear_bit(STRIPE_ACTIVE, &sh->state);
} }
static void raid5_activate_delayed(raid5_conf_t *conf) static void raid5_activate_delayed(raid5_conf_t *conf)
......
...@@ -6,11 +6,11 @@ ...@@ -6,11 +6,11 @@
/* /*
* *
* Each stripe contains one buffer per disc. Each buffer can be in * Each stripe contains one buffer per device. Each buffer can be in
* one of a number of states stored in "flags". Changes between * one of a number of states stored in "flags". Changes between
* these states happen *almost* exclusively under a per-stripe * these states happen *almost* exclusively under the protection of the
* spinlock. Some very specific changes can happen in bi_end_io, and * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and
* these are not protected by the spin lock. * these are not protected by STRIPE_ACTIVE.
* *
* The flag bits that are used to represent these states are: * The flag bits that are used to represent these states are:
* R5_UPTODATE and R5_LOCKED * R5_UPTODATE and R5_LOCKED
...@@ -76,12 +76,10 @@ ...@@ -76,12 +76,10 @@
* block and the cached buffer are successfully written, any buffer on * block and the cached buffer are successfully written, any buffer on
* a written list can be returned with b_end_io. * a written list can be returned with b_end_io.
* *
* The write list and read list both act as fifos. The read list is * The write list and read list both act as fifos. The read list,
* protected by the device_lock. The write and written lists are * write list and written list are protected by the device_lock.
* protected by the stripe lock. The device_lock, which can be * The device_lock is only for list manipulations and will only be
* claimed while the stipe lock is held, is only for list * held for a very short time. It can be claimed from interrupts.
* manipulations and will only be held for a very short time. It can
* be claimed from interrupts.
* *
* *
* Stripes in the stripe cache can be on one of two lists (or on * Stripes in the stripe cache can be on one of two lists (or on
...@@ -96,7 +94,6 @@ ...@@ -96,7 +94,6 @@
* *
* The inactive_list, handle_list and hash bucket lists are all protected by the * The inactive_list, handle_list and hash bucket lists are all protected by the
* device_lock. * device_lock.
* - stripes on the inactive_list never have their stripe_lock held.
* - stripes have a reference counter. If count==0, they are on a list. * - stripes have a reference counter. If count==0, they are on a list.
* - If a stripe might need handling, STRIPE_HANDLE is set. * - If a stripe might need handling, STRIPE_HANDLE is set.
* - When refcount reaches zero, then if STRIPE_HANDLE it is put on * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
...@@ -116,10 +113,10 @@ ...@@ -116,10 +113,10 @@
* attach a request to an active stripe (add_stripe_bh()) * attach a request to an active stripe (add_stripe_bh())
* lockdev attach-buffer unlockdev * lockdev attach-buffer unlockdev
* handle a stripe (handle_stripe()) * handle a stripe (handle_stripe())
* lockstripe clrSTRIPE_HANDLE ... * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
* (lockdev check-buffers unlockdev) .. * (lockdev check-buffers unlockdev) ..
* change-state .. * change-state ..
* record io/ops needed unlockstripe schedule io/ops * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
* release an active stripe (release_stripe()) * release an active stripe (release_stripe())
* lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
* *
...@@ -128,8 +125,7 @@ ...@@ -128,8 +125,7 @@
* on a cached buffer, and plus one if the stripe is undergoing stripe * on a cached buffer, and plus one if the stripe is undergoing stripe
* operations. * operations.
* *
* Stripe operations are performed outside the stripe lock, * The stripe operations are:
* the stripe operations are:
* -copying data between the stripe cache and user application buffers * -copying data between the stripe cache and user application buffers
* -computing blocks to save a disk access, or to recover a missing block * -computing blocks to save a disk access, or to recover a missing block
* -updating the parity on a write operation (reconstruct write and * -updating the parity on a write operation (reconstruct write and
...@@ -159,7 +155,8 @@ ...@@ -159,7 +155,8 @@
*/ */
/* /*
* Operations state - intermediate states that are visible outside of sh->lock * Operations state - intermediate states that are visible outside of
* STRIPE_ACTIVE.
* In general _idle indicates nothing is running, _run indicates a data * In general _idle indicates nothing is running, _run indicates a data
* processing operation is active, and _result means the data processing result * processing operation is active, and _result means the data processing result
* is stable and can be acted upon. For simple operations like biofill and * is stable and can be acted upon. For simple operations like biofill and
...@@ -209,7 +206,6 @@ struct stripe_head { ...@@ -209,7 +206,6 @@ struct stripe_head {
short ddf_layout;/* use DDF ordering to calculate Q */ short ddf_layout;/* use DDF ordering to calculate Q */
unsigned long state; /* state flags */ unsigned long state; /* state flags */
atomic_t count; /* nr of active thread/requests */ atomic_t count; /* nr of active thread/requests */
spinlock_t lock;
int bm_seq; /* sequence number for bitmap flushes */ int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */ int disks; /* disks in stripe */
enum check_states check_state; enum check_states check_state;
...@@ -240,7 +236,7 @@ struct stripe_head { ...@@ -240,7 +236,7 @@ struct stripe_head {
}; };
/* stripe_head_state - collects and tracks the dynamic state of a stripe_head /* stripe_head_state - collects and tracks the dynamic state of a stripe_head
* for handle_stripe. It is only valid under spin_lock(sh->lock); * for handle_stripe.
*/ */
struct stripe_head_state { struct stripe_head_state {
int syncing, expanding, expanded; int syncing, expanding, expanded;
...@@ -290,6 +286,7 @@ struct r6_state { ...@@ -290,6 +286,7 @@ struct r6_state {
* Stripe state * Stripe state
*/ */
enum { enum {
STRIPE_ACTIVE,
STRIPE_HANDLE, STRIPE_HANDLE,
STRIPE_SYNC_REQUESTED, STRIPE_SYNC_REQUESTED,
STRIPE_SYNCING, STRIPE_SYNCING,
...@@ -339,7 +336,7 @@ enum { ...@@ -339,7 +336,7 @@ enum {
* PREREAD_ACTIVE. * PREREAD_ACTIVE.
* In stripe_handle, if we find pre-reading is necessary, we do it if * In stripe_handle, if we find pre-reading is necessary, we do it if
* PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
* HANDLE gets cleared if stripe_handle leave nothing locked. * HANDLE gets cleared if stripe_handle leaves nothing locked.
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment