Commit 1d3d4437, authored by Glauber Costa, committed by Al Viro

vmscan: per-node deferred work

The list_lru infrastructure already keeps per-node LRU lists in its
node-specific list_lru_node arrays and provides us with a per-node API, and
the shrinkers are properly equipped with node information.  This means that
we can now focus our shrinking effort on a single node, but the work that
is deferred from one run to another is kept global at nr_in_batch.  Work
can be deferred, for instance, during direct reclaim under a GFP_NOFS
allocation; in that situation, all the filesystem shrinkers will be
prevented from running and will accumulate in nr_in_batch the amount of
work they should have done, but could not.

This creates an impedance problem, where upon node pressure, work deferred
will accumulate and end up being flushed in other nodes.  The problem we
describe is particularly harmful in big machines, where many nodes can
accumulate deferred work at the same time, all adding to the global counter
nr_in_batch.  As we accumulate more and more, we start to ask for the caches to flush
even bigger numbers.  The result is that the caches are depleted and do
not stabilize.  To achieve stable steady state behavior, we need to tackle
it differently.

In this patch we keep the deferred count per-node, in the new array
nr_deferred[] (the name is also a bit more descriptive), and never let
work deferred on one node accumulate onto other nodes.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
parent 0ce3d744
...@@ -19,6 +19,8 @@ struct shrink_control { ...@@ -19,6 +19,8 @@ struct shrink_control {
/* shrink from these nodes */ /* shrink from these nodes */
nodemask_t nodes_to_scan; nodemask_t nodes_to_scan;
/* current node being shrunk (for NUMA aware shrinkers) */
int nid;
}; };
#define SHRINK_STOP (~0UL) #define SHRINK_STOP (~0UL)
...@@ -44,6 +46,8 @@ struct shrink_control { ...@@ -44,6 +46,8 @@ struct shrink_control {
* due to potential deadlocks. If SHRINK_STOP is returned, then no further * due to potential deadlocks. If SHRINK_STOP is returned, then no further
* attempts to call the @scan_objects will be made from the current reclaim * attempts to call the @scan_objects will be made from the current reclaim
* context. * context.
*
* @flags determine the shrinker abilities, like numa awareness
*/ */
struct shrinker { struct shrinker {
int (*shrink)(struct shrinker *, struct shrink_control *sc); int (*shrink)(struct shrinker *, struct shrink_control *sc);
...@@ -54,12 +58,18 @@ struct shrinker { ...@@ -54,12 +58,18 @@ struct shrinker {
int seeks; /* seeks to recreate an obj */ int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */ long batch; /* reclaim batch size, 0 = default */
unsigned long flags;
/* These are for internal use */ /* These are for internal use */
struct list_head list; struct list_head list;
atomic_long_t nr_in_batch; /* objs pending delete */ /* objs pending delete, per node */
atomic_long_t *nr_deferred;
}; };
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
extern void register_shrinker(struct shrinker *);
/* Flags */
#define SHRINKER_NUMA_AWARE (1 << 0)
extern int register_shrinker(struct shrinker *);
extern void unregister_shrinker(struct shrinker *); extern void unregister_shrinker(struct shrinker *);
#endif #endif
...@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) ...@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
} }
/* /*
* Add a shrinker callback to be called from the vm * Add a shrinker callback to be called from the vm.
*/ */
void register_shrinker(struct shrinker *shrinker) int register_shrinker(struct shrinker *shrinker)
{ {
atomic_long_set(&shrinker->nr_in_batch, 0); size_t size = sizeof(*shrinker->nr_deferred);
/*
* If we only have one possible node in the system anyway, save
* ourselves the trouble and disable NUMA aware behavior. This way we
* will save memory and some small loop time later.
*/
if (nr_node_ids == 1)
shrinker->flags &= ~SHRINKER_NUMA_AWARE;
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
down_write(&shrinker_rwsem); down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list); list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem); up_write(&shrinker_rwsem);
return 0;
} }
EXPORT_SYMBOL(register_shrinker); EXPORT_SYMBOL(register_shrinker);
...@@ -186,52 +203,18 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker, ...@@ -186,52 +203,18 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
} }
#define SHRINK_BATCH 128 #define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches static unsigned long
* shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
* Here we assume it costs one seek to replace a lru page and that it also unsigned long nr_pages_scanned, unsigned long lru_pages)
* takes a seek to recreate a cache object. With this in mind we age equal
* percentages of the lru and ageable caches. This should balance the seeks
* generated by these structures.
*
* If the vm encountered mapped pages on the LRU it increase the pressure on
* slab to avoid swapping.
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
*
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{ {
struct shrinker *shrinker;
unsigned long freed = 0; unsigned long freed = 0;
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
/*
* If we would return 0, our callers would understand that we
* have nothing else to shrink and give up trying. By returning
* 1 we keep it going and assume we'll be able to shrink next
* time.
*/
freed = 1;
goto out;
}
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta; unsigned long long delta;
long total_scan; long total_scan;
long max_pass; long max_pass;
long nr; long nr;
long new_nr; long new_nr;
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH; : SHRINK_BATCH;
...@@ -240,14 +223,14 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, ...@@ -240,14 +223,14 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
else else
max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0); max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
if (max_pass == 0) if (max_pass == 0)
continue; return 0;
/* /*
* copy the current shrinker scan count into a local variable * copy the current shrinker scan count into a local variable
* and zero it so that other concurrent shrinker invocations * and zero it so that other concurrent shrinker invocations
* don't also do this scanning work. * don't also do this scanning work.
*/ */
nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr; total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks; delta = (4 * nr_pages_scanned) / shrinker->seeks;
...@@ -324,11 +307,67 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, ...@@ -324,11 +307,67 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
*/ */
if (total_scan > 0) if (total_scan > 0)
new_nr = atomic_long_add_return(total_scan, new_nr = atomic_long_add_return(total_scan,
&shrinker->nr_in_batch); &shrinker->nr_deferred[nid]);
else else
new_nr = atomic_long_read(&shrinker->nr_in_batch); new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
return freed;
}
/*
* Call the shrink functions to age shrinkable caches
*
* Here we assume it costs one seek to replace a lru page and that it also
* takes a seek to recreate a cache object. With this in mind we age equal
* percentages of the lru and ageable caches. This should balance the seeks
* generated by these structures.
*
* If the vm encountered mapped pages on the LRU it increase the pressure on
* slab to avoid swapping.
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
*
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
unsigned long freed = 0;
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
/*
* If we would return 0, our callers would understand that we
* have nothing else to shrink and give up trying. By returning
* 1 we keep it going and assume we'll be able to shrink next
* time.
*/
freed = 1;
goto out;
}
list_for_each_entry(shrinker, &shrinker_list, list) {
for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
if (!node_online(shrinkctl->nid))
continue;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
(shrinkctl->nid != 0))
break;
freed += shrink_slab_node(shrinkctl, shrinker,
nr_pages_scanned, lru_pages);
}
} }
up_read(&shrinker_rwsem); up_read(&shrinker_rwsem);
out: out:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment