Remove deadlock potential in md_open

A recent commit: commit 449aad3e introduced the possibility of an A-B/B-A deadlock between bd_mutex and reconfig_mutex. __blkdev_get holds bd_mutex while calling md_open which takes reconfig_mutex, do_md_run is always called with reconfig_mutex held, and it now takes bd_mutex in the call the revalidate_disk. This potential deadlock was not caught by lockdep due to the use of mutex_lock_interruptible_nexted which was introduced by commit d63a5a74 do avoid a warning of an impossible deadlock. It is quite possible to split reconfig_mutex in to two locks. One protects the array data structures while it is being reconfigured, the other ensures that an array is never even partially open while it is being deactivated. In particular, the second lock prevents an open from completing between the time when do_md_stop checks if there are any active opens, and the time when the array is either set read-only, or when ->pers is set to NULL. So we can be certain that no IO is in flight as the array is being destroyed. So create a new lock, open_mutex, just to ensure exclusion between 'open' and 'stop'. This avoids the deadlock and also avoids the lockdep warning mentioned in commit d63a5a74Reported-by: "Mike Snitzer" <snitzer@gmail.com> Reported-by: "H. Peter Anvin" <hpa@zytor.com> Signed-off-by: NeilBrown <neilb@suse.de>

Remove deadlock potential in md_open
A recent commit: commit 449aad3e introduced the possibility of an A-B/B-A deadlock between bd_mutex and reconfig_mutex. __blkdev_get holds bd_mutex while calling md_open which takes reconfig_mutex, do_md_run is always called with reconfig_mutex held, and it now takes bd_mutex in the call the revalidate_disk. This potential deadlock was not caught by lockdep due to the use of mutex_lock_interruptible_nexted which was introduced by commit d63a5a74 do avoid a warning of an impossible deadlock. It is quite possible to split reconfig_mutex in to two locks. One protects the array data structures while it is being reconfigured, the other ensures that an array is never even partially open while it is being deactivated. In particular, the second lock prevents an open from completing between the time when do_md_stop checks if there are any active opens, and the time when the array is either set read-only, or when ->pers is set to NULL. So we can be certain that no IO is in flight as the array is being destroyed. So create a new lock, open_mutex, just to ensure exclusion between 'open' and 'stop'. This avoids the deadlock and also avoids the lockdep warning mentioned in commit d63a5a74Reported-by: "Mike Snitzer" <snitzer@gmail.com> Reported-by: "H. Peter Anvin" <hpa@zytor.com> Signed-off-by: NeilBrown <neilb@suse.de>
c8c00a69 · NeilBrown · 7b2aa037 · c8c00a69 · c8c00a69
Commit c8c00a69 authored Aug 10, 2009 by NeilBrown
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 8 deletions

drivers/md/md.c drivers/md/md.c +10 -8

drivers/md/md.h drivers/md/md.h +10 -0

No files found.
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit)
 	else
 		new->md_minor = MINOR(unit) >> MdpMinorShift;

+	mutex_init(&new->open_mutex);
 	mutex_init(&new->reconfig_mutex);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
@@ -4304,12 +4305,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	struct gendisk *disk = mddev->gendisk;
 	mdk_rdev_t *rdev;

+	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > is_open) {
 		printk("md: %s still in use.\n",mdname(mddev));
-		return -EBUSY;
-	}
-
-	if (mddev->pers) {
+		err = -EBUSY;
+	} else if (mddev->pers) {

 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4367,7 +4367,10 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			set_disk_ro(disk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
-
+out:
+	mutex_unlock(&mddev->open_mutex);
+	if (err)
+		return err;
 	/*
 	 * Free resources if final stop
 	 */
@@ -4433,7 +4436,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	blk_integrity_unregister(disk);
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
-out:
 	return err;
 }

@@ -5518,12 +5520,12 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	}
 	BUG_ON(mddev != bdev->bd_disk->private_data);

-	if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
+	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
 		goto out;

 	err = 0;
 	atomic_inc(&mddev->openers);
-	mddev_unlock(mddev);
+	mutex_unlock(&mddev->open_mutex);

 	check_disk_change(bdev);
 out:

--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -223,6 +223,16 @@ struct mddev_s
 							    * so we don't loop trying */

 	int				in_sync;	/* know to not need resync */
+	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+	 * that we are never stopping an array while it is open.
+	 * 'reconfig_mutex' protects all other reconfiguration.
+	 * These locks are separate due to conflicting interactions
+	 * with bdev->bd_mutex.
+	 * Lock ordering is:
+	 *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
+	 */
+	struct mutex			open_mutex;
 	struct mutex			reconfig_mutex;
 	atomic_t			active;		/* general refcount */
 	atomic_t			openers;	/* number of active opens */