Commit b5470dc5 authored by Dan Williams's avatar Dan Williams

md: resolve external metadata handling deadlock in md_allow_write

md_allow_write() marks the metadata dirty while holding mddev->lock and then
waits for the write to complete.  For externally managed metadata this causes a
deadlock as userspace needs to take the lock to communicate that the metadata
update has completed.

Change md_allow_write() in the 'external' case to start the 'mark active'
operation and then return -EAGAIN.  The expected side effects while waiting for
userspace to write 'active' to 'array_state' are holding off reshape (code
currently handles -ENOMEM), cause some 'stripe_cache_size' change requests to
fail, cause some GET_BITMAP_FILE ioctl requests to fall back to GFP_NOIO, and
cause updates to 'raid_disks' to fail.  Except for 'stripe_cache_size' changes
these failures can be mitigated by coordinating with mdmon.

md_write_start() still prevents writes from occurring until the metadata
handler has had a chance to take action as it unconditionally waits for
MD_CHANGE_CLEAN to be cleared.

[neilb@suse.de: return -EAGAIN, try GFP_NOIO]
Signed-off-by: default avatarDan Williams <dan.j.williams@intel.com>
parent 1fe797e6
...@@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) ...@@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
char *ptr, *buf = NULL; char *ptr, *buf = NULL;
int err = -ENOMEM; int err = -ENOMEM;
md_allow_write(mddev); if (md_allow_write(mddev))
file = kmalloc(sizeof(*file), GFP_NOIO);
else
file = kmalloc(sizeof(*file), GFP_KERNEL); file = kmalloc(sizeof(*file), GFP_KERNEL);
if (!file) if (!file)
goto out; goto out;
...@@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev) ...@@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev)
* may proceed without blocking. It is important to call this before * may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock. * attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held. * Must be called with mddev_lock held.
*
* In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
* is dropped, so return -EAGAIN after notifying userspace.
*/ */
void md_allow_write(mddev_t *mddev) int md_allow_write(mddev_t *mddev)
{ {
if (!mddev->pers) if (!mddev->pers)
return; return 0;
if (mddev->ro) if (mddev->ro)
return; return 0;
if (!mddev->pers->sync_request) if (!mddev->pers->sync_request)
return; return 0;
spin_lock_irq(&mddev->write_lock); spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync) { if (mddev->in_sync) {
...@@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev) ...@@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev)
mddev->safemode = 1; mddev->safemode = 1;
spin_unlock_irq(&mddev->write_lock); spin_unlock_irq(&mddev->write_lock);
md_update_sb(mddev, 0); md_update_sb(mddev, 0);
sysfs_notify(&mddev->kobj, NULL, "array_state"); sysfs_notify(&mddev->kobj, NULL, "array_state");
/* wait for the dirty state to be recorded in the metadata */
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
} else } else
spin_unlock_irq(&mddev->write_lock); spin_unlock_irq(&mddev->write_lock);
if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
return -EAGAIN;
else
return 0;
} }
EXPORT_SYMBOL_GPL(md_allow_write); EXPORT_SYMBOL_GPL(md_allow_write);
......
...@@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev) ...@@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
conf_t *conf = mddev_to_conf(mddev); conf_t *conf = mddev_to_conf(mddev);
int cnt, raid_disks; int cnt, raid_disks;
unsigned long flags; unsigned long flags;
int d, d2; int d, d2, err;
/* Cannot change chunk_size, layout, or level */ /* Cannot change chunk_size, layout, or level */
if (mddev->chunk_size != mddev->new_chunk || if (mddev->chunk_size != mddev->new_chunk ||
...@@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev) ...@@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
return -EINVAL; return -EINVAL;
} }
md_allow_write(mddev); err = md_allow_write(mddev);
if (err)
return err;
raid_disks = mddev->raid_disks + mddev->delta_disks; raid_disks = mddev->raid_disks + mddev->delta_disks;
......
...@@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) ...@@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
struct stripe_head *osh, *nsh; struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes); LIST_HEAD(newstripes);
struct disk_info *ndisks; struct disk_info *ndisks;
int err = 0; int err;
struct kmem_cache *sc; struct kmem_cache *sc;
int i; int i;
if (newsize <= conf->pool_size) if (newsize <= conf->pool_size)
return 0; /* never bother to shrink */ return 0; /* never bother to shrink */
md_allow_write(conf->mddev); err = md_allow_write(conf->mddev);
if (err)
return err;
/* Step 1 */ /* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name], sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
...@@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) ...@@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
{ {
raid5_conf_t *conf = mddev_to_conf(mddev); raid5_conf_t *conf = mddev_to_conf(mddev);
unsigned long new; unsigned long new;
int err;
if (len >= PAGE_SIZE) if (len >= PAGE_SIZE)
return -EINVAL; return -EINVAL;
if (!conf) if (!conf)
...@@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) ...@@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
else else
break; break;
} }
md_allow_write(mddev); err = md_allow_write(mddev);
if (err)
return err;
while (new > conf->max_nr_stripes) { while (new > conf->max_nr_stripes) {
if (grow_one_stripe(conf)) if (grow_one_stripe(conf))
conf->max_nr_stripes++; conf->max_nr_stripes++;
......
...@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, ...@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw); struct page *page, int rw);
extern void md_do_sync(mddev_t *mddev); extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev); extern void md_new_event(mddev_t *mddev);
extern void md_allow_write(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
#endif /* CONFIG_MD */ #endif /* CONFIG_MD */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment