MD: use per-cpu counter for writes_pending

The 'writes_pending' counter is used to determine when the array is stable so that it can be marked in the superblock as "Clean". Consequently it needs to be updated frequently but only checked for zero occasionally. Recent changes to raid5 cause the count to be updated even more often - once per 4K rather than once per bio. This provided justification for making the updates more efficient. So we replace the atomic counter with a percpu-refcount. This can be incremented and decremented cheaply most of the time, and can be switched to "atomic" mode when more precise counting is needed. As it is possible for multiple threads to want a precise count, we introduce a "sync_checker" counter to count the number of threads in "set_in_sync()", and only switch the refcount back to percpu mode when that is zero. We need to be careful about races between set_in_sync() setting ->in_sync to 1, and md_write_start() setting it to zero. md_write_start() holds the rcu_read_lock() while checking if the refcount is in percpu mode. If it is, then we know a switch to 'atomic' will not happen until after we call rcu_read_unlock(), in which case set_in_sync() will see the elevated count, and not set in_sync to 1. If it is not in percpu mode, we take the mddev->lock to ensure proper synchronization. It is no longer possible to quickly check if the count is zero, which we previously did to update a timer or to schedule the md_thread. So now we do these every time we decrement that counter, but make sure they are fast. mod_timer() already optimizes the case where the timeout value doesn't actually change. We leverage that further by always rounding off the jiffies to the timeout value. This may delay the marking of 'clean' slightly, but ensures we only perform an atomic operation here when absolutely needed. md_wakeup_thread() currently always calls wake_up(), even if THREAD_WAKEUP is already set. That too can be optimised to avoid calls to wake_up(). 
Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>

MD: use per-cpu counter for writes_pending
The 'writes_pending' counter is used to determine when the array is stable so that it can be marked in the superblock as "Clean". Consequently it needs to be updated frequently but only checked for zero occasionally. Recent changes to raid5 cause the count to be updated even more often - once per 4K rather than once per bio. This provided justification for making the updates more efficient. So we replace the atomic counter with a percpu-refcount. This can be incremented and decremented cheaply most of the time, and can be switched to "atomic" mode when more precise counting is needed. As it is possible for multiple threads to want a precise count, we introduce a "sync_checker" counter to count the number of threads in "set_in_sync()", and only switch the refcount back to percpu mode when that is zero. We need to be careful about races between set_in_sync() setting ->in_sync to 1, and md_write_start() setting it to zero. md_write_start() holds the rcu_read_lock() while checking if the refcount is in percpu mode. If it is, then we know a switch to 'atomic' will not happen until after we call rcu_read_unlock(), in which case set_in_sync() will see the elevated count, and not set in_sync to 1. If it is not in percpu mode, we take the mddev->lock to ensure proper synchronization. It is no longer possible to quickly check if the count is zero, which we previously did to update a timer or to schedule the md_thread. So now we do these every time we decrement that counter, but make sure they are fast. mod_timer() already optimizes the case where the timeout value doesn't actually change. We leverage that further by always rounding off the jiffies to the timeout value. This may delay the marking of 'clean' slightly, but ensures we only perform an atomic operation here when absolutely needed. md_wakeup_thread() currently always calls wake_up(), even if THREAD_WAKEUP is already set. That too can be optimised to avoid calls to wake_up(). 
Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
4ad23a97 · NeilBrown · Shaohua Li · 210f7cdc · 4ad23a97 · 4ad23a97
Commit 4ad23a97 authored Mar 15, 2017 by NeilBrown Committed by Shaohua Li Mar 22, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 49 additions and 24 deletions

drivers/md/md.c drivers/md/md.c +47 -23

drivers/md/md.h drivers/md/md.h +2 -1

No files found.
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -65,6 +65,8 @@
 #include <linux/raid/md_p.h>
 #include <linux/raid/md_u.h>
 #include <linux/slab.h>
+#include <linux/percpu-refcount.h>
+
 #include <trace/events/block.h>
 #include "md.h"
 #include "bitmap.h"
@@ -2255,16 +2257,24 @@ static void export_array(struct mddev *mddev)
 static bool set_in_sync(struct mddev *mddev)
 {
 	WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
-	if (atomic_read(&mddev->writes_pending) == 0) {
-		if (mddev->in_sync == 0) {
+	if (!mddev->in_sync) {
+		mddev->sync_checkers++;
+		spin_unlock(&mddev->lock);
+		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
+		spin_lock(&mddev->lock);
+		if (!mddev->in_sync &&
+		    percpu_ref_is_zero(&mddev->writes_pending)) {
 			mddev->in_sync = 1;
+			/*
+			 * Ensure ->in_sync is visible before we clear
+			 * ->sync_checkers.
+			 */
 			smp_mb();
-			if (atomic_read(&mddev->writes_pending))
-				/* lost a race with md_write_start() */
-				mddev->in_sync = 0;
 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
+		if (--mddev->sync_checkers == 0)
+			percpu_ref_switch_to_percpu(&mddev->writes_pending);
 	}
 	if (mddev->safemode == 1)
 		mddev->safemode = 0;
@@ -5120,6 +5130,7 @@ static void md_free(struct kobject *ko)
 		del_gendisk(mddev->gendisk);
 		put_disk(mddev->gendisk);
 	}
+	percpu_ref_exit(&mddev->writes_pending);

 	kfree(mddev);
 }
@@ -5145,6 +5156,8 @@ static void mddev_delayed_delete(struct work_struct *ws)
 	kobject_put(&mddev->kobj);
 }

+static void no_op(struct percpu_ref *r) {}
+
 static int md_alloc(dev_t dev, char *name)
 {
 	static DEFINE_MUTEX(disks_mutex);
@@ -5196,6 +5209,10 @@ static int md_alloc(dev_t dev, char *name)
 	blk_queue_make_request(mddev->queue, md_make_request);
 	blk_set_stacking_limits(&mddev->queue->limits);

+	if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+		goto abort;
+	/* We want to start with the refcount at zero */
+	percpu_ref_put(&mddev->writes_pending);
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
 		blk_cleanup_queue(mddev->queue);
@@ -5279,11 +5296,10 @@ static void md_safemode_timeout(unsigned long data)
 {
 	struct mddev *mddev = (struct mddev *) data;

-	if (!atomic_read(&mddev->writes_pending)) {
-		mddev->safemode = 1;
-		if (mddev->external)
-			sysfs_notify_dirent_safe(mddev->sysfs_state);
-	}
+	mddev->safemode = 1;
+	if (mddev->external)
+		sysfs_notify_dirent_safe(mddev->sysfs_state);
+
 	md_wakeup_thread(mddev->thread);
 }

@@ -5488,7 +5504,6 @@ int md_run(struct mddev *mddev)
 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
 		mddev->ro = 0;

-	atomic_set(&mddev->writes_pending,0);
 	atomic_set(&mddev->max_corr_read_errors,
 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
 	mddev->safemode = 0;
@@ -7342,8 +7357,8 @@ void md_wakeup_thread(struct md_thread *thread)
 {
 	if (thread) {
 		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
-		set_bit(THREAD_WAKEUP, &thread->flags);
-		wake_up(&thread->wqueue);
+		if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags))
+			wake_up(&thread->wqueue);
 	}
 }
 EXPORT_SYMBOL(md_wakeup_thread);
@@ -7890,11 +7905,13 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 		md_wakeup_thread(mddev->sync_thread);
 		did_change = 1;
 	}
-	atomic_inc(&mddev->writes_pending);
+	rcu_read_lock();
+	percpu_ref_get(&mddev->writes_pending);
 	smp_mb(); /* Match smp_mb in set_in_sync() */
 	if (mddev->safemode == 1)
 		mddev->safemode = 0;
-	if (mddev->in_sync) {
+	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
+	if (mddev->in_sync || !mddev->sync_checkers) {
 		spin_lock(&mddev->lock);
 		if (mddev->in_sync) {
 			mddev->in_sync = 0;
@@ -7905,6 +7922,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 		}
 		spin_unlock(&mddev->lock);
 	}
+	rcu_read_unlock();
 	if (did_change)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
@@ -7925,19 +7943,25 @@ void md_write_inc(struct mddev *mddev, struct bio *bi)
 	if (bio_data_dir(bi) != WRITE)
 		return;
 	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
-	atomic_inc(&mddev->writes_pending);
+	percpu_ref_get(&mddev->writes_pending);
 }
 EXPORT_SYMBOL(md_write_inc);

 void md_write_end(struct mddev *mddev)
 {
-	if (atomic_dec_and_test(&mddev->writes_pending)) {
-		if (mddev->safemode == 2)
-			md_wakeup_thread(mddev->thread);
-		else if (mddev->safemode_delay)
-			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
-	}
+	percpu_ref_put(&mddev->writes_pending);
+
+	if (mddev->safemode == 2)
+		md_wakeup_thread(mddev->thread);
+	else if (mddev->safemode_delay)
+		/* The roundup() ensures this only performs locking once
+		 * every ->safemode_delay jiffies
+		 */
+		mod_timer(&mddev->safemode_timer,
+			  roundup(jiffies, mddev->safemode_delay) +
+			  mddev->safemode_delay);
 }
+
 EXPORT_SYMBOL(md_write_end);

 /* md_allow_write(mddev)
@@ -8538,7 +8562,7 @@ void md_check_recovery(struct mddev *mddev)
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
-		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+		(mddev->safemode == 2
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
 		))
 		return;

--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -409,7 +409,8 @@ struct mddev {
 							 */
 	unsigned int			safemode_delay;
 	struct timer_list		safemode_timer;
-	atomic_t			writes_pending;
+	struct percpu_ref		writes_pending;
+	int				sync_checkers;	/* # of threads checking writes_pending */
 	struct request_queue		*queue;	/* for plugging ... */

 	struct bitmap			*bitmap; /* the bitmap for the device */