Commit 42a30035 authored by Johannes Weiner's avatar Johannes Weiner Committed by Linus Torvalds

mm: memcontrol: fix recursive statistics correctness & scalabilty

Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.

There are two issues with this:

1. When a cgroup gets deleted, its stats are lost. The state counters
   should all be 0 at that point, of course, but the events are not.
   When this happens, the event counters, which are supposed to be
   monotonic, can go backwards in the parent cgroups.

2. During regular operation, we always have a certain number of lazily
   freed cgroups sitting around that have been deleted, have no tasks,
   but have a few cache pages remaining. These groups' statistics do not
   change until we eventually hit memory pressure, but somebody
   watching, say, memory.stat on an ancestor has to iterate those every
   time.

This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.

Upward propagation happens when the per-cpu caches spill over into the
local atomic counter.  This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate.  In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.

Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.orgSigned-off-by: default avatarJohannes Weiner <hannes@cmpxchg.org>
Reviewed-by: default avatarShakeel Butt <shakeelb@google.com>
Reviewed-by: default avatarRoman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent db9adbcb
......@@ -128,6 +128,7 @@ struct mem_cgroup_per_node {
struct lruvec_stat __percpu *lruvec_stat_cpu;
atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
atomic_long_t lruvec_stat_local[NR_VM_NODE_STAT_ITEMS];
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
......@@ -279,8 +280,12 @@ struct mem_cgroup {
MEMCG_PADDING(_pad2_);
atomic_long_t vmstats[MEMCG_NR_STAT];
atomic_long_t vmstats_local[MEMCG_NR_STAT];
atomic_long_t vmevents[NR_VM_EVENT_ITEMS];
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
atomic_long_t vmevents_local[NR_VM_EVENT_ITEMS];
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
unsigned long socket_pressure;
......@@ -550,6 +555,20 @@ struct mem_cgroup *lock_page_memcg(struct page *page);
void __unlock_page_memcg(struct mem_cgroup *memcg);
void unlock_page_memcg(struct page *page);
/*
* idx can be of type enum memcg_stat_item or node_stat_item.
* Keep in sync with memcg_exact_page_state().
*/
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
long x = atomic_long_read(&memcg->vmstats[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
#endif
return x;
}
/*
* idx can be of type enum memcg_stat_item or node_stat_item.
* Keep in sync with memcg_exact_page_state().
......@@ -557,7 +576,7 @@ void unlock_page_memcg(struct page *page);
static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
int idx)
{
long x = atomic_long_read(&memcg->vmstats[idx]);
long x = atomic_long_read(&memcg->vmstats_local[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
......@@ -609,6 +628,24 @@ static inline void mod_memcg_page_state(struct page *page,
mod_memcg_state(page->mem_cgroup, idx, val);
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
struct mem_cgroup_per_node *pn;
long x;
if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
x = atomic_long_read(&pn->lruvec_stat[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
#endif
return x;
}
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
enum node_stat_item idx)
{
......@@ -619,7 +656,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
x = atomic_long_read(&pn->lruvec_stat[idx]);
x = atomic_long_read(&pn->lruvec_stat_local[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
......@@ -959,6 +996,11 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
return 0;
}
static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
int idx)
{
......@@ -989,6 +1031,12 @@ static inline void mod_memcg_page_state(struct page *page,
{
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
return node_page_state(lruvec_pgdat(lruvec), idx);
}
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
enum node_stat_item idx)
{
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment