Commit 1306a85a authored by Johannes Weiner, committed by Linus Torvalds

mm: embed the memcg pointer directly into struct page

Memory cgroups used to have 5 per-page pointers.  To allow users to
disable that amount of overhead during runtime, those pointers were
allocated in a separate array, with a translation layer between them and
struct page.

There is now only one page pointer remaining: the memcg pointer,
indicating which cgroup the page is associated with when charged.  The
complexity of runtime allocation and the runtime translation overhead is
no longer justified to save that *potential* 0.19% of memory.  With
CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
after the page->private member and doesn't even increase struct page,
and then this patch actually saves space.  Remaining users that care can
still compile their kernels without CONFIG_MEMCG.

     text    data     bss     dec     hex     filename
  8828345 1725264  983040 11536649 b00909  vmlinux.old
  8827425 1725264  966656 11519345 afc571  vmlinux.new
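
For illustration only (this sketch is not part of the commit), here is a minimal,
self-contained userspace model of the lookup change: the old scheme translates a
page to its mem_cgroup through a separate page_cgroup array, while the new scheme
simply dereferences the pointer embedded in struct page.  The struct layouts and
helper names below are simplified stand-ins, not the actual kernel definitions.

#include <stdio.h>

struct mem_cgroup { const char *name; };

/* Old scheme: a separate page_cgroup array, parallel to the page array. */
struct page_cgroup { struct mem_cgroup *mem_cgroup; };

/* New scheme: the pointer lives directly in (a toy model of) struct page. */
struct page { unsigned long flags; struct mem_cgroup *mem_cgroup; };

static struct mem_cgroup root_cg = { "root" };

#define NR_PAGES 4
static struct page_cgroup pc_array[NR_PAGES];   /* old: translation array */
static struct page page_array[NR_PAGES];        /* new: embedded field    */

/* Old path: pfn -> page_cgroup -> mem_cgroup (one extra translation step). */
static struct mem_cgroup *old_page_memcg(unsigned long pfn)
{
	return pc_array[pfn].mem_cgroup;
}

/* New path: dereference the page itself, no lookup table needed. */
static struct mem_cgroup *new_page_memcg(struct page *page)
{
	return page->mem_cgroup;
}

int main(void)
{
	pc_array[1].mem_cgroup = &root_cg;     /* old: charge recorded in the array */
	page_array[1].mem_cgroup = &root_cg;   /* new: charge recorded in the page  */

	printf("old lookup: %s\n", old_page_memcg(1)->name);
	printf("new lookup: %s\n", new_page_memcg(&page_array[1])->name);
	return 0;
}

In the kernel itself, the same simplification is what lets callers read
page->mem_cgroup directly where they previously had to go through
lookup_page_cgroup().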

[mhocko@suse.cz: update Documentation/cgroups/memory.txt]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 22811c6b
Memory Resource Controller
NOTE: This document is hopelessly outdated and it asks for a complete
rewrite. It still contains a useful information so we are keeping it
here but make sure to check the current code if you need a deeper
understanding.
NOTE: The Memory Resource Controller has generically been referred to as the
memory controller in this document. Do not confuse memory controller
used here with the memory controller that is used in hardware.
...
@@ -25,7 +25,6 @@
#include <linux/jump_label.h>
struct mem_cgroup;
struct page_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
@@ -466,8 +465,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 * memcg_kmem_uncharge_pages: uncharge pages from memcg
 * @page: pointer to struct page being freed
 * @order: allocation order.
*
* there is no need to specify memcg here, since it is embedded in page_cgroup
 */
static inline void
memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -484,8 +481,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order)
 *
 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
 * failure of the allocation. if @page is NULL, this function will revert the
 * charges. Otherwise, it will commit the memcg given by @memcg to the
 * corresponding page_cgroup.
 * charges. Otherwise, it will commit @page to @memcg.
 */
static inline void
memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
...
@@ -22,6 +22,7 @@
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
struct address_space;
struct mem_cgroup;
#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
@@ -167,6 +168,10 @@ struct page {
struct page *first_page; /* Compound tail pages */
};
#ifdef CONFIG_MEMCG
struct mem_cgroup *mem_cgroup;
#endif
/*
 * On machines where all RAM is mapped into kernel address space,
 * we can simply calculate the virtual address. On machines with
...
@@ -722,9 +722,6 @@ typedef struct pglist_data {
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_MEMCG
struct page_cgroup *node_page_cgroup;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
struct bootmem_data *bdata;
@@ -1078,7 +1075,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
struct page;
struct page_cgroup;
struct mem_section {
/*
 * This is, logically, a pointer to an array of struct
@@ -1096,14 +1092,6 @@ struct mem_section {
/* See declaration of similar field in struct zone */
unsigned long *pageblock_flags;
#ifdef CONFIG_MEMCG
/*
* If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
* section. (see memcontrol.h/page_cgroup.h about this.)
*/
struct page_cgroup *page_cgroup;
unsigned long pad;
#endif
/*
 * WARNING: mem_section must be a power-of-2 in size for the
 * calculation and use of SECTION_ROOT_MASK to make sense.
...
#ifndef __LINUX_PAGE_CGROUP_H
#define __LINUX_PAGE_CGROUP_H
struct pglist_data;
#ifdef CONFIG_MEMCG
struct mem_cgroup;
/*
* Page Cgroup can be considered as an extended mem_map.
* A page_cgroup page is associated with every page descriptor. The
* page_cgroup helps us identify information about the cgroup
* All page cgroups are allocated at boot or memory hotplug event,
* then the page cgroup for pfn always exists.
*/
struct page_cgroup {
struct mem_cgroup *mem_cgroup;
};
extern void pgdat_page_cgroup_init(struct pglist_data *pgdat);
#ifdef CONFIG_SPARSEMEM
static inline void page_cgroup_init_flatmem(void)
{
}
extern void page_cgroup_init(void);
#else
extern void page_cgroup_init_flatmem(void);
static inline void page_cgroup_init(void)
{
}
#endif
struct page_cgroup *lookup_page_cgroup(struct page *page);
#else /* !CONFIG_MEMCG */
struct page_cgroup;
static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
}
static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
{
return NULL;
}
static inline void page_cgroup_init(void)
{
}
static inline void page_cgroup_init_flatmem(void)
{
}
#endif /* CONFIG_MEMCG */
#include <linux/swap.h>
#ifdef CONFIG_MEMCG_SWAP
...
@@ -51,7 +51,6 @@
#include <linux/mempolicy.h>
#include <linux/key.h>
#include <linux/buffer_head.h>
#include <linux/page_cgroup.h>
#include <linux/debug_locks.h>
#include <linux/debugobjects.h>
#include <linux/lockdep.h>
@@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void)
 */
static void __init mm_init(void)
{
/*
* page_cgroup requires contiguous pages,
* bigger than MAX_ORDER unless SPARSEMEM.
*/
page_cgroup_init_flatmem();
mem_init();
kmem_cache_init();
percpu_init_late();
@@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void)
initrd_start = 0;
}
#endif
page_cgroup_init();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
...
This diff is collapsed.
@@ -48,7 +48,6 @@
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
@@ -4853,7 +4852,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
...
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
static unsigned long total_usage;
#if !defined(CONFIG_SPARSEMEM)
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
pgdat->node_page_cgroup = NULL;
}
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long offset;
struct page_cgroup *base;
base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
#ifdef CONFIG_DEBUG_VM
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_cgroup arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
*/
if (unlikely(!base))
return NULL;
#endif
offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
return base + offset;
}
static int __init alloc_node_page_cgroup(int nid)
{
struct page_cgroup *base;
unsigned long table_size;
unsigned long nr_pages;
nr_pages = NODE_DATA(nid)->node_spanned_pages;
if (!nr_pages)
return 0;
table_size = sizeof(struct page_cgroup) * nr_pages;
base = memblock_virt_alloc_try_nid_nopanic(
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE, nid);
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;
total_usage += table_size;
return 0;
}
void __init page_cgroup_init_flatmem(void)
{
int nid, fail;
if (mem_cgroup_disabled())
return;
for_each_online_node(nid) {
fail = alloc_node_page_cgroup(nid);
if (fail)
goto fail;
}
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
" don't want memory cgroups\n");
return;
fail:
printk(KERN_CRIT "allocation of page_cgroup failed.\n");
printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
panic("Out of memory");
}
#else /* CONFIG_FLAT_NODE_MEM_MAP */
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
#ifdef CONFIG_DEBUG_VM
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_cgroup arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
*/
if (!section->page_cgroup)
return NULL;
#endif
return section->page_cgroup + pfn;
}
static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
void *addr = NULL;
addr = alloc_pages_exact_nid(nid, size, flags);
if (addr) {
kmemleak_alloc(addr, size, 1, flags);
return addr;
}
if (node_state(nid, N_HIGH_MEMORY))
addr = vzalloc_node(size, nid);
else
addr = vzalloc(size);
return addr;
}
static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
struct mem_section *section;
struct page_cgroup *base;
unsigned long table_size;
section = __pfn_to_section(pfn);
if (section->page_cgroup)
return 0;
table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
base = alloc_page_cgroup(table_size, nid);
/*
* The value stored in section->page_cgroup is (base - pfn)
* and it does not point to the memory block allocated above,
* causing kmemleak false positives.
*/
kmemleak_not_leak(base);
if (!base) {
printk(KERN_ERR "page cgroup allocation failure\n");
return -ENOMEM;
}
/*
* The passed "pfn" may not be aligned to SECTION. For the calculation
* we need to apply a mask.
*/
pfn &= PAGE_SECTION_MASK;
section->page_cgroup = base - pfn;
total_usage += table_size;
return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_cgroup(void *addr)
{
if (is_vmalloc_addr(addr)) {
vfree(addr);
} else {
struct page *page = virt_to_page(addr);
size_t table_size =
sizeof(struct page_cgroup) * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
static void __free_page_cgroup(unsigned long pfn)
{
struct mem_section *ms;
struct page_cgroup *base;
ms = __pfn_to_section(pfn);
if (!ms || !ms->page_cgroup)
return;
base = ms->page_cgroup + pfn;
free_page_cgroup(base);
ms->page_cgroup = NULL;
}
static int __meminit online_page_cgroup(unsigned long start_pfn,
unsigned long nr_pages,
int nid)
{
unsigned long start, end, pfn;
int fail = 0;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
if (nid == -1) {
/*
* In this case, "nid" already exists and contains valid memory.
* "start_pfn" passed to us is a pfn which is an arg for
* online__pages(), and start_pfn should exist.
*/
nid = pfn_to_nid(start_pfn);
VM_BUG_ON(!node_state(nid, N_ONLINE));
}
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
if (!pfn_present(pfn))
continue;
fail = init_section_page_cgroup(pfn, nid);
}
if (!fail)
return 0;
/* rollback */
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_cgroup(pfn);
return -ENOMEM;
}
static int __meminit offline_page_cgroup(unsigned long start_pfn,
unsigned long nr_pages, int nid)
{
unsigned long start, end, pfn;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_cgroup(pfn);
return 0;
}
static int __meminit page_cgroup_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
struct memory_notify *mn = arg;
int ret = 0;
switch (action) {
case MEM_GOING_ONLINE:
ret = online_page_cgroup(mn->start_pfn,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_OFFLINE:
offline_page_cgroup(mn->start_pfn,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
offline_page_cgroup(mn->start_pfn,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
case MEM_CANCEL_OFFLINE:
break;
}
return notifier_from_errno(ret);
}
#endif
void __init page_cgroup_init(void)
{
unsigned long pfn;
int nid;
if (mem_cgroup_disabled())
return;
for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
start_pfn = node_start_pfn(nid);
end_pfn = node_end_pfn(nid);
/*
* start_pfn and end_pfn may not be aligned to SECTION and the
* page->flags of out of node pages are not initialized. So we
* scan [start_pfn, the biggest section's pfn < end_pfn) here.
*/
for (pfn = start_pfn;
pfn < end_pfn;
pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
if (!pfn_valid(pfn))
continue;
/*
* Nodes's pfns can be overlapping.
* We know some arch can have a nodes layout such as
* -------------pfn-------------->
* N0 | N1 | N2 | N0 | N1 | N2|....
*/
if (pfn_to_nid(pfn) != nid)
continue;
if (init_section_page_cgroup(pfn, nid))
goto oom;
}
}
hotplug_memory_notifier(page_cgroup_callback, 0);
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
"don't want memory cgroups\n");
return;
oom:
printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
panic("Out of memory");
}
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
return;
}
#endif
#ifdef CONFIG_MEMCG_SWAP
...