Commit edbe83ab authored by NeilBrown

md/raid5: allow the stripe_cache to grow and shrink.

The default setting of 256 stripe_heads is probably
much too small for many configurations.  So it is best to make it
auto-configure.

Shrinking the cache under memory pressure is easy.  The only
interesting part here is that we put a fairly high cost
('seeks') on shrinking the cache, because the cost is greater than
just having to read more data: it reduces parallelism.
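
For orientation, the shrinker this refers to boils down to the following (condensed from the hunks further down, with comments added; these are fragments, not buildable in isolation):

    /* In setup_conf(): make each stripe_head look expensive to the VM,
     * so the cache is trimmed reluctantly, only under real pressure.
     */
    conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
    conf->shrinker.count_objects = raid5_cache_count;
    conf->shrinker.scan_objects = raid5_cache_scan;
    conf->shrinker.batch = 128;
    register_shrinker(&conf->shrinker);

    /* Only stripe_heads above the configured minimum are reported as
     * reclaimable, so the shrinker never asks to go below min_nr_stripes.
     */
    static unsigned long raid5_cache_count(struct shrinker *shrink,
                                           struct shrink_control *sc)
    {
        struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);

        if (conf->max_nr_stripes < conf->min_nr_stripes)
            return 0;    /* unlikely, but not impossible */
        return conf->max_nr_stripes - conf->min_nr_stripes;
    }

    /* Free idle stripe_heads one at a time; give up (SHRINK_STOP) as soon
     * as drop_one_stripe() cannot find one to free.
     */
    static unsigned long raid5_cache_scan(struct shrinker *shrink,
                                          struct shrink_control *sc)
    {
        struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
        int ret = 0;

        while (ret < sc->nr_to_scan) {
            if (drop_one_stripe(conf) == 0)
                return SHRINK_STOP;
            ret++;
        }
        return ret;
    }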

Growing the cache on demand needs to be done carefully.  If we allow
fast growth, that can upset memory balance as lots of dirty memory can
quickly turn into lots of memory queued in the stripe_cache.
It is important that the raid5 block device appear congested so
that write-throttling can work.

So we only add stripes slowly. We set a flag when an allocation
fails because all stripes are in use, allocate at a convenient
time when that flag is set, and don't allow it to be set again
until at least one stripe_head has been released for re-use.

This means that a spurt of requests will only cause one stripe_head
to be allocated, but a steady stream of requests will slowly
increase the cache size, until memory pressure pushes it back down again.

It could take hours to reach a steady state.
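
Pieced together from the hunks below, the handshake between the allocation path and raid5d looks roughly like this (fragments only, comments added; see the diff for the real surrounding code):

    /* get_active_stripe(): a cache miss with no free stripe_head, nothing
     * waiting on the released list, and the previous allocation already
     * paid back requests (at most) one more stripe.
     */
    if (!sh && llist_empty(&conf->released_stripes) &&
        !test_bit(R5_DID_ALLOC, &conf->cache_state))
        set_bit(R5_ALLOC_MORE, &conf->cache_state);

    /* raid5d(): at a convenient moment, grow by exactly one stripe_head.
     * R5_DID_ALLOC is set even if the allocation failed, which keeps
     * requests throttled while memory is short.
     */
    if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
        grow_one_stripe(conf, __GFP_NOWARN);
        set_bit(R5_DID_ALLOC, &conf->cache_state);
    }

    /* raid5d(): once some stripe_heads have been released for re-use,
     * allow the next R5_ALLOC_MORE request.
     */
    released = release_stripe_list(conf, conf->temp_inactive_list);
    if (released)
        clear_bit(R5_DID_ALLOC, &conf->cache_state);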

The value written to, and displayed in, stripe_cache_size is
used as a minimum.  The cache can grow above this and shrink back
down to it.  The actual size is not directly visible, though it can
be deduced to some extent by watching stripe_cache_active.
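
In code terms (again condensed from the stripe_cache_size hunks below, comments added; fragments only): reading the attribute reports the floor, and writing it moves the floor and immediately drops any stripe_heads held above the new value.

    /* raid5_show_stripe_cache_size(): the sysfs file now reports the
     * configured minimum, not the current number of stripe_heads.
     */
    ret = sprintf(page, "%d\n", conf->min_nr_stripes);

    /* raid5_set_cache_size(): record the new floor, then drop any
     * stripes the cache currently holds above it.
     */
    conf->min_nr_stripes = size;
    while (size < conf->max_nr_stripes &&
           drop_one_stripe(conf))
        ;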
Signed-off-by: NeilBrown <neilb@suse.de>
parent 5423399a
drivers/md/raid5.c

@@ -672,8 +672,13 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                             *(conf->hash_locks + hash));
         sh = __find_stripe(conf, sector, conf->generation - previous);
         if (!sh) {
-            if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+            if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
                 sh = get_free_stripe(conf, hash);
+                if (!sh && llist_empty(&conf->released_stripes) &&
+                    !test_bit(R5_DID_ALLOC, &conf->cache_state))
+                    set_bit(R5_ALLOC_MORE,
+                            &conf->cache_state);
+            }
             if (noblock && sh == NULL)
                 break;
             if (!sh) {

@@ -5761,6 +5766,8 @@ static void raid5d(struct md_thread *thread)
         int batch_size, released;
 
         released = release_stripe_list(conf, conf->temp_inactive_list);
+        if (released)
+            clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
         if (
             !list_empty(&conf->bitmap_list)) {

@@ -5799,6 +5806,13 @@ static void raid5d(struct md_thread *thread)
     pr_debug("%d stripes handled\n", handled);
 
     spin_unlock_irq(&conf->device_lock);
+    if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+        grow_one_stripe(conf, __GFP_NOWARN);
+        /* Set flag even if allocation failed.  This helps
+         * slow down allocation requests when mem is short
+         */
+        set_bit(R5_DID_ALLOC, &conf->cache_state);
+    }
 
     async_tx_issue_pending_all();
     blk_finish_plug(&plug);

@@ -5814,7 +5828,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
     spin_lock(&mddev->lock);
     conf = mddev->private;
     if (conf)
-        ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+        ret = sprintf(page, "%d\n", conf->min_nr_stripes);
     spin_unlock(&mddev->lock);
     return ret;
 }

@@ -5828,10 +5842,12 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 
     if (size <= 16 || size > 32768)
         return -EINVAL;
+
+    conf->min_nr_stripes = size;
     while (size < conf->max_nr_stripes &&
            drop_one_stripe(conf))
         ;
 
     err = md_allow_write(mddev);
     if (err)
         return err;

@@ -5947,7 +5963,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
     conf = mddev->private;
     if (!conf)
         err = -ENODEV;
-    else if (new > conf->max_nr_stripes)
+    else if (new > conf->min_nr_stripes)
         err = -EINVAL;
     else
         conf->bypass_threshold = new;

@@ -6228,6 +6244,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+    if (conf->shrinker.seeks)
+        unregister_shrinker(&conf->shrinker);
     free_thread_groups(conf);
     shrink_stripes(conf);
     raid5_free_percpu(conf);

@@ -6295,6 +6313,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
     return err;
 }
 
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+                                      struct shrink_control *sc)
+{
+    struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+    int ret = 0;
+    while (ret < sc->nr_to_scan) {
+        if (drop_one_stripe(conf) == 0)
+            return SHRINK_STOP;
+        ret++;
+    }
+    return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+                                       struct shrink_control *sc)
+{
+    struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+    if (conf->max_nr_stripes < conf->min_nr_stripes)
+        /* unlikely, but not impossible */
+        return 0;
+    return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
     struct r5conf *conf;

@@ -6445,10 +6487,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
         conf->prev_algo = mddev->layout;
     }
 
-    memory = NR_STRIPES * (sizeof(struct stripe_head) +
+    conf->min_nr_stripes = NR_STRIPES;
+    memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
          max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
     atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-    if (grow_stripes(conf, NR_STRIPES)) {
+    if (grow_stripes(conf, conf->min_nr_stripes)) {
         printk(KERN_ERR
             "md/raid:%s: couldn't allocate %dkB for buffers\n",
             mdname(mddev), memory);

@@ -6456,6 +6499,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
     } else
         printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
                mdname(mddev), memory);
+    /*
+     * Losing a stripe head costs more than the time to refill it,
+     * it reduces the queue depth and so can hurt throughput.
+     * So set it rather large, scaled by number of devices.
+     */
+    conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+    conf->shrinker.scan_objects = raid5_cache_scan;
+    conf->shrinker.count_objects = raid5_cache_count;
+    conf->shrinker.batch = 128;
+    conf->shrinker.flags = 0;
+    register_shrinker(&conf->shrinker);
 
     sprintf(pers_name, "raid%d", mddev->new_level);
     conf->thread = md_register_thread(raid5d, mddev, pers_name);

@@ -7097,9 +7151,9 @@ static int check_stripe_cache(struct mddev *mddev)
      */
     struct r5conf *conf = mddev->private;
     if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-        > conf->max_nr_stripes ||
+        > conf->min_nr_stripes ||
         ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-        > conf->max_nr_stripes) {
+        > conf->min_nr_stripes) {
         printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
                mdname(mddev),
                ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)

drivers/md/raid5.h

@@ -433,6 +433,7 @@ struct r5conf {
     int                 max_degraded;
     int                 raid_disks;
     int                 max_nr_stripes;
+    int                 min_nr_stripes;
 
     /* reshape_progress is the leading edge of a 'reshape'
      * It has value MaxSector when no reshape is happening

@@ -513,7 +514,15 @@ struct r5conf {
     unsigned long       cache_state;
 #define R5_INACTIVE_BLOCKED 1   /* release of inactive stripes blocked,
                                  * waiting for 25% to be free
                                  */
+#define R5_ALLOC_MORE       2   /* It might help to allocate another
+                                 * stripe.
+                                 */
+#define R5_DID_ALLOC        4   /* A stripe was allocated, don't allocate
+                                 * more until at least one has been
+                                 * released.  This avoids flooding
+                                 * the cache.
+                                 */
+    struct shrinker     shrinker;
     int                 pool_size; /* number of disks in stripeheads in pool */
     spinlock_t          device_lock;
     struct disk_info    *disks;