Commit 8135be5a authored by Vladimir Davydov, committed by Linus Torvalds

memcg: fix possible use-after-free in memcg_kmem_get_cache()

Suppose task @t that belongs to a memory cgroup @memcg is going to
allocate an object from a kmem cache @c.  The copy of @c corresponding to
@memcg, @mc, is empty.  Then if kmem_cache_alloc races with the memory
cgroup destruction we can access the memory cgroup's copy of the cache
after it was destroyed:

CPU0				CPU1
----				----
[ current=@t
  @mc->memcg_params->nr_pages=0 ]

kmem_cache_alloc(@c):
  call memcg_kmem_get_cache(@c);
  proceed to allocation from @mc:
    alloc a page for @mc:
      ...

				move @t from @memcg
				destroy @memcg:
				  mem_cgroup_css_offline(@memcg):
				    memcg_unregister_all_caches(@memcg):
				      kmem_cache_destroy(@mc)

    add page to @mc

We could fix this issue by taking a reference to a per-memcg cache, but
that would require adding a per-cpu reference counter to per-memcg caches,
which would look cumbersome.

Instead, let's take a reference to a memory cgroup, which already has a
per-cpu reference counter, in the beginning of kmem_cache_alloc to be
dropped in the end, and move per memcg caches destruction from css offline
to css free.  As a side effect, per-memcg caches will be destroyed not one
by one, but all at once when the last page accounted to the memory cgroup
is freed.  This doesn't sound like a high price for code readability though.

Note, this patch does add some overhead to the kmem_cache_alloc hot path,
but it is pretty negligible - it's just a function call plus a per cpu
counter decrement, which is comparable to what we already have in
memcg_kmem_get_cache.  Besides, it's only relevant if there are memory
cgroups with kmem accounting enabled.  I don't think we can find a way to
handle this race w/o it, because alloc_page called from kmem_cache_alloc
may sleep so we can't flush all pending kmallocs w/o reference counting.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent ae6e71d3
...@@ -400,8 +400,8 @@ int memcg_cache_id(struct mem_cgroup *memcg); ...@@ -400,8 +400,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
void memcg_update_array_size(int num_groups); void memcg_update_array_size(int num_groups);
struct kmem_cache * struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep);
int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order);
void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); void __memcg_uncharge_slab(struct kmem_cache *cachep, int order);
...@@ -494,6 +494,12 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) ...@@ -494,6 +494,12 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
return __memcg_kmem_get_cache(cachep); return __memcg_kmem_get_cache(cachep);
} }
static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (memcg_kmem_enabled())
__memcg_kmem_put_cache(cachep);
}
#else #else
#define for_each_memcg_cache_index(_idx) \ #define for_each_memcg_cache_index(_idx) \
for (; NULL; ) for (; NULL; )
...@@ -528,6 +534,10 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) ...@@ -528,6 +534,10 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{ {
return cachep; return cachep;
} }
static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
}
#endif /* CONFIG_MEMCG_KMEM */ #endif /* CONFIG_MEMCG_KMEM */
#endif /* _LINUX_MEMCONTROL_H */ #endif /* _LINUX_MEMCONTROL_H */
...@@ -493,7 +493,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ...@@ -493,7 +493,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
* @memcg: pointer to the memcg this cache belongs to * @memcg: pointer to the memcg this cache belongs to
* @list: list_head for the list of all caches in this memcg * @list: list_head for the list of all caches in this memcg
* @root_cache: pointer to the global, root cache, this cache was derived from * @root_cache: pointer to the global, root cache, this cache was derived from
* @nr_pages: number of pages that belongs to this cache.
*/ */
struct memcg_cache_params { struct memcg_cache_params {
bool is_root_cache; bool is_root_cache;
...@@ -506,7 +505,6 @@ struct memcg_cache_params { ...@@ -506,7 +505,6 @@ struct memcg_cache_params {
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
struct list_head list; struct list_head list;
struct kmem_cache *root_cache; struct kmem_cache *root_cache;
atomic_t nr_pages;
}; };
}; };
}; };
......
...@@ -2635,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg, ...@@ -2635,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
if (!cachep) if (!cachep)
return; return;
css_get(&memcg->css);
list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
/* /*
...@@ -2669,9 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) ...@@ -2669,9 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
list_del(&cachep->memcg_params->list); list_del(&cachep->memcg_params->list);
kmem_cache_destroy(cachep); kmem_cache_destroy(cachep);
/* drop the reference taken in memcg_register_cache */
css_put(&memcg->css);
} }
int __memcg_cleanup_cache_params(struct kmem_cache *s) int __memcg_cleanup_cache_params(struct kmem_cache *s)
...@@ -2705,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg) ...@@ -2705,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
mutex_lock(&memcg_slab_mutex); mutex_lock(&memcg_slab_mutex);
list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
cachep = memcg_params_to_cache(params); cachep = memcg_params_to_cache(params);
kmem_cache_shrink(cachep); memcg_unregister_cache(cachep);
if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
memcg_unregister_cache(cachep);
} }
mutex_unlock(&memcg_slab_mutex); mutex_unlock(&memcg_slab_mutex);
} }
...@@ -2742,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, ...@@ -2742,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
struct memcg_register_cache_work *cw; struct memcg_register_cache_work *cw;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT); cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
if (cw == NULL) { if (!cw)
css_put(&memcg->css);
return; return;
}
css_get(&memcg->css);
cw->memcg = memcg; cw->memcg = memcg;
cw->cachep = cachep; cw->cachep = cachep;
...@@ -2776,12 +2770,8 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, ...@@ -2776,12 +2770,8 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
{ {
unsigned int nr_pages = 1 << order; unsigned int nr_pages = 1 << order;
int res;
res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
if (!res)
atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
return res;
} }
void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
...@@ -2789,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) ...@@ -2789,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
unsigned int nr_pages = 1 << order; unsigned int nr_pages = 1 << order;
memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
} }
/* /*
...@@ -2816,22 +2805,13 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) ...@@ -2816,22 +2805,13 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
if (current->memcg_kmem_skip_account) if (current->memcg_kmem_skip_account)
return cachep; return cachep;
rcu_read_lock(); memcg = get_mem_cgroup_from_mm(current->mm);
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
if (!memcg_kmem_is_active(memcg)) if (!memcg_kmem_is_active(memcg))
goto out; goto out;
memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
if (likely(memcg_cachep)) { if (likely(memcg_cachep))
cachep = memcg_cachep; return memcg_cachep;
goto out;
}
/* The corresponding put will be done in the workqueue. */
if (!css_tryget_online(&memcg->css))
goto out;
rcu_read_unlock();
/* /*
* If we are in a safe context (can wait, and not in interrupt * If we are in a safe context (can wait, and not in interrupt
...@@ -2846,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) ...@@ -2846,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
* defer everything. * defer everything.
*/ */
memcg_schedule_register_cache(memcg, cachep); memcg_schedule_register_cache(memcg, cachep);
return cachep;
out: out:
rcu_read_unlock(); css_put(&memcg->css);
return cachep; return cachep;
} }
void __memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
css_put(&cachep->memcg_params->memcg->css);
}
/* /*
* We need to verify if the allocation against current->mm->owner's memcg is * We need to verify if the allocation against current->mm->owner's memcg is
* possible for the given order. But the page is not allocated yet, so we'll * possible for the given order. But the page is not allocated yet, so we'll
...@@ -2914,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) ...@@ -2914,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
memcg_uncharge_kmem(memcg, 1 << order); memcg_uncharge_kmem(memcg, 1 << order);
page->mem_cgroup = NULL; page->mem_cgroup = NULL;
} }
#else
static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */ #endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE
...@@ -4188,6 +4169,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) ...@@ -4188,6 +4169,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
static void memcg_destroy_kmem(struct mem_cgroup *memcg) static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{ {
memcg_unregister_all_caches(memcg);
mem_cgroup_sockets_destroy(memcg); mem_cgroup_sockets_destroy(memcg);
} }
#else #else
...@@ -4797,7 +4779,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) ...@@ -4797,7 +4779,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
} }
spin_unlock(&memcg->event_list_lock); spin_unlock(&memcg->event_list_lock);
memcg_unregister_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure); vmpressure_cleanup(&memcg->vmpressure);
} }
......
...@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, ...@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
memset(ptr, 0, cachep->object_size); memset(ptr, 0, cachep->object_size);
} }
memcg_kmem_put_cache(cachep);
return ptr; return ptr;
} }
...@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) ...@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
memset(objp, 0, cachep->object_size); memset(objp, 0, cachep->object_size);
} }
memcg_kmem_put_cache(cachep);
return objp; return objp;
} }
......
...@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x) ...@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
kmemleak_free(x); kmemleak_free(x);
} }
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
gfp_t flags)
{ {
flags &= gfp_allowed_mask; flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags); lockdep_trace_alloc(flags);
might_sleep_if(flags & __GFP_WAIT); might_sleep_if(flags & __GFP_WAIT);
return should_failslab(s->object_size, flags, s->flags); if (should_failslab(s->object_size, flags, s->flags))
return NULL;
return memcg_kmem_get_cache(s, flags);
} }
static inline void slab_post_alloc_hook(struct kmem_cache *s, static inline void slab_post_alloc_hook(struct kmem_cache *s,
...@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, ...@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
flags &= gfp_allowed_mask; flags &= gfp_allowed_mask;
kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
memcg_kmem_put_cache(s);
} }
static inline void slab_free_hook(struct kmem_cache *s, void *x) static inline void slab_free_hook(struct kmem_cache *s, void *x)
...@@ -2384,10 +2389,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, ...@@ -2384,10 +2389,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct page *page; struct page *page;
unsigned long tid; unsigned long tid;
if (slab_pre_alloc_hook(s, gfpflags)) s = slab_pre_alloc_hook(s, gfpflags);
if (!s)
return NULL; return NULL;
s = memcg_kmem_get_cache(s, gfpflags);
redo: redo:
/* /*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is * Must read kmem_cache cpu data via this cpu ptr. Preemption is
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment