Commit edbe83ab authored by NeilBrown

md/raid5: allow the stripe_cache to grow and shrink.

The default setting of 256 stripe_heads is probably
much too small for many configurations.  So it is best to make it
auto-configure.

Shrinking the cache under memory pressure is easy.  The only
interesting part here is that we put a fairly high cost
('seeks') on shrinking the cache, because the cost is greater than
just having to read more data: it reduces parallelism.
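
For orientation, the shrinker this refers to boils down to the following (condensed from the hunks further down, with comments added; these are fragments, not buildable in isolation):

    /* In setup_conf(): make each stripe_head look expensive to the VM,
     * so the cache is trimmed reluctantly, only under real pressure.
     */
    conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
    conf->shrinker.count_objects = raid5_cache_count;
    conf->shrinker.scan_objects = raid5_cache_scan;
    conf->shrinker.batch = 128;
    register_shrinker(&conf->shrinker);

    /* Only stripe_heads above the configured minimum are reported as
     * reclaimable, so the shrinker never asks to go below min_nr_stripes.
     */
    static unsigned long raid5_cache_count(struct shrinker *shrink,
                                           struct shrink_control *sc)
    {
        struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);

        if (conf->max_nr_stripes < conf->min_nr_stripes)
            return 0;    /* unlikely, but not impossible */
        return conf->max_nr_stripes - conf->min_nr_stripes;
    }

    /* Free idle stripe_heads one at a time; give up (SHRINK_STOP) as soon
     * as drop_one_stripe() cannot find one to free.
     */
    static unsigned long raid5_cache_scan(struct shrinker *shrink,
                                          struct shrink_control *sc)
    {
        struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
        int ret = 0;

        while (ret < sc->nr_to_scan) {
            if (drop_one_stripe(conf) == 0)
                return SHRINK_STOP;
            ret++;
        }
        return ret;
    }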

Growing the cache on demand needs to be done carefully.  If we allow
fast growth, that can upset memory balance as lots of dirty memory can
quickly turn into lots of memory queued in the stripe_cache.
It is important that the raid5 block device appear congested so
that write-throttling can work.

So we only add stripes slowly. We set a flag when an allocation
fails because all stripes are in use, allocate at a convenient
time when that flag is set, and don't allow it to be set again
until at least one stripe_head has been released for re-use.

This means that a spurt of requests will only cause one stripe_head
to be allocated, but a steady stream of requests will slowly
increase the cache size, until memory pressure pushes it back down again.

It could take hours to reach a steady state.
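
Pieced together from the hunks below, the handshake between the allocation path and raid5d looks roughly like this (fragments only, comments added; see the diff for the real surrounding code):

    /* get_active_stripe(): a cache miss with no free stripe_head, nothing
     * waiting on the released list, and the previous allocation already
     * paid back requests (at most) one more stripe.
     */
    if (!sh && llist_empty(&conf->released_stripes) &&
        !test_bit(R5_DID_ALLOC, &conf->cache_state))
        set_bit(R5_ALLOC_MORE, &conf->cache_state);

    /* raid5d(): at a convenient moment, grow by exactly one stripe_head.
     * R5_DID_ALLOC is set even if the allocation failed, which keeps
     * requests throttled while memory is short.
     */
    if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
        grow_one_stripe(conf, __GFP_NOWARN);
        set_bit(R5_DID_ALLOC, &conf->cache_state);
    }

    /* raid5d(): once some stripe_heads have been released for re-use,
     * allow the next R5_ALLOC_MORE request.
     */
    released = release_stripe_list(conf, conf->temp_inactive_list);
    if (released)
        clear_bit(R5_DID_ALLOC, &conf->cache_state);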

The value written to, and displayed in, stripe_cache_size is
used as a minimum.  The cache can grow above this and shrink back
down to it.  The actual size is not directly visible, though it can
be deduced to some extent by watching stripe_cache_active.
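
In code terms (again condensed from the stripe_cache_size hunks below, comments added; fragments only): reading the attribute reports the floor, and writing it moves the floor and immediately drops any stripe_heads held above the new value.

    /* raid5_show_stripe_cache_size(): the sysfs file now reports the
     * configured minimum, not the current number of stripe_heads.
     */
    ret = sprintf(page, "%d\n", conf->min_nr_stripes);

    /* raid5_set_cache_size(): record the new floor, then drop any
     * stripes the cache currently holds above it.
     */
    conf->min_nr_stripes = size;
    while (size < conf->max_nr_stripes &&
           drop_one_stripe(conf))
        ;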
Signed-off-by: NeilBrown <neilb@suse.de>
parent 5423399a
drivers/md/raid5.c

@@ -672,8 +672,13 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                             *(conf->hash_locks + hash));
         sh = __find_stripe(conf, sector, conf->generation - previous);
         if (!sh) {
-            if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+            if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
                 sh = get_free_stripe(conf, hash);
+                if (!sh && llist_empty(&conf->released_stripes) &&
+                    !test_bit(R5_DID_ALLOC, &conf->cache_state))
+                    set_bit(R5_ALLOC_MORE,
+                            &conf->cache_state);
+            }
             if (noblock && sh == NULL)
                 break;
             if (!sh) {

@@ -5761,6 +5766,8 @@ static void raid5d(struct md_thread *thread)
         int batch_size, released;
 
         released = release_stripe_list(conf, conf->temp_inactive_list);
+        if (released)
+            clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
         if (
             !list_empty(&conf->bitmap_list)) {

@@ -5799,6 +5806,13 @@ static void raid5d(struct md_thread *thread)
     pr_debug("%d stripes handled\n", handled);
 
     spin_unlock_irq(&conf->device_lock);
+    if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+        grow_one_stripe(conf, __GFP_NOWARN);
+        /* Set flag even if allocation failed.  This helps
+         * slow down allocation requests when mem is short
+         */
+        set_bit(R5_DID_ALLOC, &conf->cache_state);
+    }
 
     async_tx_issue_pending_all();
     blk_finish_plug(&plug);

@@ -5814,7 +5828,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
     spin_lock(&mddev->lock);
     conf = mddev->private;
     if (conf)
-        ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+        ret = sprintf(page, "%d\n", conf->min_nr_stripes);
     spin_unlock(&mddev->lock);
     return ret;
 }

@@ -5828,10 +5842,12 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 
     if (size <= 16 || size > 32768)
         return -EINVAL;
+
+    conf->min_nr_stripes = size;
     while (size < conf->max_nr_stripes &&
            drop_one_stripe(conf))
         ;
 
     err = md_allow_write(mddev);
     if (err)
         return err;

@@ -5947,7 +5963,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
     conf = mddev->private;
     if (!conf)
         err = -ENODEV;
-    else if (new > conf->max_nr_stripes)
+    else if (new > conf->min_nr_stripes)
         err = -EINVAL;
     else
         conf->bypass_threshold = new;

@@ -6228,6 +6244,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+    if (conf->shrinker.seeks)
+        unregister_shrinker(&conf->shrinker);
     free_thread_groups(conf);
     shrink_stripes(conf);
     raid5_free_percpu(conf);

@@ -6295,6 +6313,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
     return err;
 }
 
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+                                      struct shrink_control *sc)
+{
+    struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+    int ret = 0;
+    while (ret < sc->nr_to_scan) {
+        if (drop_one_stripe(conf) == 0)
+            return SHRINK_STOP;
+        ret++;
+    }
+    return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+                                       struct shrink_control *sc)
+{
+    struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+    if (conf->max_nr_stripes < conf->min_nr_stripes)
+        /* unlikely, but not impossible */
+        return 0;
+    return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
     struct r5conf *conf;

@@ -6445,10 +6487,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
         conf->prev_algo = mddev->layout;
     }
 
-    memory = NR_STRIPES * (sizeof(struct stripe_head) +
+    conf->min_nr_stripes = NR_STRIPES;
+    memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
          max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
     atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-    if (grow_stripes(conf, NR_STRIPES)) {
+    if (grow_stripes(conf, conf->min_nr_stripes)) {
         printk(KERN_ERR
             "md/raid:%s: couldn't allocate %dkB for buffers\n",
             mdname(mddev), memory);

@@ -6456,6 +6499,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
     } else
         printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
                mdname(mddev), memory);
+    /*
+     * Losing a stripe head costs more than the time to refill it,
+     * it reduces the queue depth and so can hurt throughput.
+     * So set it rather large, scaled by number of devices.
+     */
+    conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+    conf->shrinker.scan_objects = raid5_cache_scan;
+    conf->shrinker.count_objects = raid5_cache_count;
+    conf->shrinker.batch = 128;
+    conf->shrinker.flags = 0;
+    register_shrinker(&conf->shrinker);
 
     sprintf(pers_name, "raid%d", mddev->new_level);
     conf->thread = md_register_thread(raid5d, mddev, pers_name);

@@ -7097,9 +7151,9 @@ static int check_stripe_cache(struct mddev *mddev)
      */
     struct r5conf *conf = mddev->private;
     if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-        > conf->max_nr_stripes ||
+        > conf->min_nr_stripes ||
         ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-        > conf->max_nr_stripes) {
+        > conf->min_nr_stripes) {
         printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
                mdname(mddev),
                ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)

drivers/md/raid5.h

@@ -433,6 +433,7 @@ struct r5conf {
     int                 max_degraded;
     int                 raid_disks;
     int                 max_nr_stripes;
+    int                 min_nr_stripes;
 
     /* reshape_progress is the leading edge of a 'reshape'
      * It has value MaxSector when no reshape is happening

@@ -513,7 +514,15 @@ struct r5conf {
     unsigned long       cache_state;
 #define R5_INACTIVE_BLOCKED 1   /* release of inactive stripes blocked,
                                  * waiting for 25% to be free
                                  */
+#define R5_ALLOC_MORE       2   /* It might help to allocate another
+                                 * stripe.
+                                 */
+#define R5_DID_ALLOC        4   /* A stripe was allocated, don't allocate
+                                 * more until at least one has been
+                                 * released.  This avoids flooding
+                                 * the cache.
+                                 */
+    struct shrinker     shrinker;
     int                 pool_size; /* number of disks in stripeheads in pool */
     spinlock_t          device_lock;
     struct disk_info    *disks;