Commit 2d1c4980 authored by Johannes Weiner, committed by Linus Torvalds

mm: memcontrol: make swap tracking an integral part of memory control

Without swap page tracking, users that are otherwise memory controlled can
easily escape their containment and allocate significant amounts of memory
that they're not being charged for.  That's because swap does readahead,
but without the cgroup records of who owned the page at swapout, readahead
pages don't get charged until somebody actually faults them into their
page table and we can identify an owner task.  This can be maliciously
exploited with MADV_WILLNEED, which triggers arbitrary readahead
allocations without charging the pages.
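
To make the escape concrete, here is a minimal userspace sketch (illustrative
only, not part of this patch; it assumes the task runs in a cgroup whose
memory limit is well below the mapping size). The task populates anonymous
memory, lets reclaim push it to swap, then has the kernel read it all back
via MADV_WILLNEED readahead:

#define _DEFAULT_SOURCE
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1UL << 30;	/* 1G of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xaa, len);	/* charged now; reclaim swaps it out */

	/*
	 * Swap readahead pulls the range back into the swap cache, but
	 * without per-slot cgroup records nobody is charged until a
	 * page fault identifies an owner task.
	 */
	madvise(buf, len, MADV_WILLNEED);
	return 0;
}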

Make swap page tracking an integral part of memcg and remove the
Kconfig options.  In the first place, it was only made configurable to
allow users to save some memory.  But the overhead of tracking cgroup
ownership per swap page is minimal - 2 bytes per page, or 512k per 1G of
swap, or 0.04%.  Saving that at the expense of broken containment
semantics is not something we should present as a coequal option.
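
For reference, the arithmetic behind those numbers, assuming 4k pages (a
standalone illustration; only the 2-byte unsigned short id per swap slot
mirrors the kernel's swap_cgroup map, the rest is made up for the example):

#include <stdio.h>

int main(void)
{
	unsigned long swap_size = 1UL << 30;		/* 1G of swap */
	unsigned long page_size = 4096;			/* assumed PAGE_SIZE */
	unsigned long slots = swap_size / page_size;	/* 262144 swap slots */
	unsigned long bytes = slots * sizeof(unsigned short);	/* one memcg id each */

	/* prints: 262144 slots -> 512 KiB of tracking per 1 GiB of swap */
	printf("%lu slots -> %lu KiB of tracking per 1 GiB of swap\n",
	       slots, bytes / 1024);
	return 0;
}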

The swapaccount=0 boot option will continue to exist, and it will
eliminate the page_counter overhead and hide the swap control files, but
it won't disable swap slot ownership tracking.

This patch makes sure we always have the cgroup records at swapin time;
the next patch will fix the actual bug by charging readahead swap pages at
swapin time rather than at fault time.

v2: fix double swap charge bug in cgroup1/cgroup2 code gating

[hannes@cmpxchg.org: fix crash with cgroup_disable=memory]
  Link: http://lkml.kernel.org/r/20200521215855.GB815153@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Link: http://lkml.kernel.org/r/20200508183105.225460-16-hannes@cmpxchg.org
Debugged-by: Hugh Dickins <hughd@google.com>
Debugged-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent eccb52e7
@@ -819,24 +819,9 @@ config MEMCG
 	  Provides control over the memory footprint of tasks in a cgroup.
 
 config MEMCG_SWAP
-	bool "Swap controller"
+	bool
 	depends on MEMCG && SWAP
-	help
-	  Provides control over the swap space consumed by tasks in a cgroup.
-
-config MEMCG_SWAP_ENABLED
-	bool "Swap controller enabled by default"
-	depends on MEMCG_SWAP
 	default y
-	help
-	  Memory Resource Controller Swap Extension comes with its price in
-	  a bigger memory consumption. General purpose distribution kernels
-	  which want to enable the feature but keep it disabled by default
-	  and let the user enable it by swapaccount=1 boot command line
-	  parameter should have this option unselected.
-	  For those who want to have the feature enabled by default should
-	  select this option (if, for some reason, they need to disable it
-	  then swapaccount=0 does the trick).
 
 config MEMCG_KMEM
 	bool
@@ -83,14 +83,10 @@ static bool cgroup_memory_nokmem;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
 bool cgroup_memory_noswap __read_mostly;
 #else
-bool cgroup_memory_noswap __read_mostly = 1;
-#endif /* CONFIG_MEMCG_SWAP_ENABLED */
-#else
 #define cgroup_memory_noswap		1
-#endif /* CONFIG_MEMCG_SWAP */
+#endif
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
@@ -5360,7 +5356,6 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 	 * we call find_get_page() with swapper_space directly.
 	 */
 	page = find_get_page(swap_address_space(ent), swp_offset(ent));
-	if (do_memsw_account())
-		entry->val = ent.val;
+	entry->val = ent.val;
 
 	return page;
@@ -5395,7 +5390,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 	page = find_get_entry(mapping, pgoff);
 	if (xa_is_value(page)) {
 		swp_entry_t swp = radix_to_swp_entry(page);
-		if (do_memsw_account())
-			*entry = swp;
+		*entry = swp;
 		page = find_get_page(swap_address_space(swp),
 				     swp_offset(swp));
@@ -6529,6 +6523,9 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask,
 		goto out;
 
 	if (PageSwapCache(page)) {
+		swp_entry_t ent = { .val = page_private(page), };
+		unsigned short id;
+
 		/*
 		 * Every swap fault against a single page tries to charge the
 		 * page, bail as early as possible.  shmem_unuse() encounters
@@ -6540,10 +6537,6 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask,
 		if (compound_head(page)->mem_cgroup)
 			goto out;
 
-		if (!cgroup_memory_noswap) {
-			swp_entry_t ent = { .val = page_private(page), };
-			unsigned short id;
-
-			id = lookup_swap_cgroup_id(ent);
-			rcu_read_lock();
-			memcg = mem_cgroup_from_id(id);
+		id = lookup_swap_cgroup_id(ent);
+		rcu_read_lock();
+		memcg = mem_cgroup_from_id(id);
@@ -6551,7 +6544,6 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask,
-				memcg = NULL;
-			rcu_read_unlock();
-		}
+			memcg = NULL;
+		rcu_read_unlock();
 	}
 
 	if (!memcg)
 		memcg = get_mem_cgroup_from_mm(mm);
@@ -6567,7 +6559,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask,
 	memcg_check_events(memcg, page);
 	local_irq_enable();
 
-	if (do_memsw_account() && PageSwapCache(page)) {
+	if (PageSwapCache(page)) {
 		swp_entry_t entry = { .val = page_private(page) };
 		/*
 		 * The swap entry might not get freed for a long time,
@@ -6952,7 +6944,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 	VM_BUG_ON_PAGE(page_count(page), page);
 
-	if (!do_memsw_account())
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return;
 
 	memcg = page->mem_cgroup;
@@ -6981,7 +6973,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	if (!mem_cgroup_is_root(memcg))
 		page_counter_uncharge(&memcg->memory, nr_entries);
 
-	if (memcg != swap_memcg) {
+	if (!cgroup_memory_noswap && memcg != swap_memcg) {
 		if (!mem_cgroup_is_root(swap_memcg))
 			page_counter_charge(&swap_memcg->memsw, nr_entries);
 		page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -7017,7 +7009,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 	struct mem_cgroup *memcg;
 	unsigned short oldid;
 
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || cgroup_memory_noswap)
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return 0;
 
 	memcg = page->mem_cgroup;
@@ -7033,7 +7025,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 
 	memcg = mem_cgroup_id_get_online(memcg);
 
-	if (!mem_cgroup_is_root(memcg) &&
+	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
 	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
 		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -7061,14 +7053,11 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 	struct mem_cgroup *memcg;
 	unsigned short id;
 
-	if (cgroup_memory_noswap)
-		return;
-
 	id = swap_cgroup_record(entry, 0, nr_pages);
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (memcg) {
-		if (!mem_cgroup_is_root(memcg)) {
+		if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
 			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 				page_counter_uncharge(&memcg->swap, nr_pages);
 			else
@@ -7253,7 +7242,11 @@ static struct cftype memsw_files[] = {
 
 static int __init mem_cgroup_swap_init(void)
 {
-	if (mem_cgroup_disabled() || cgroup_memory_noswap)
+	/* No memory control -> no swap control */
+	if (mem_cgroup_disabled())
+		cgroup_memory_noswap = true;
+
+	if (cgroup_memory_noswap)
 		return 0;
 
 	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
@@ -171,9 +171,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
 	unsigned long length;
 	struct swap_cgroup_ctrl *ctrl;
 
-	if (cgroup_memory_noswap)
-		return 0;
-
 	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
 	array_size = length * sizeof(void *);
@@ -209,9 +206,6 @@ void swap_cgroup_swapoff(int type)
 	unsigned long i, length;
 	struct swap_cgroup_ctrl *ctrl;
 
-	if (cgroup_memory_noswap)
-		return;
-
 	mutex_lock(&swap_cgroup_mutex);
 	ctrl = &swap_cgroup_ctrl[type];
 	map = ctrl->map;