Commit 456f998e authored by Ying Han, committed by Linus Torvalds

memcg: add the pagefault count into memcg stats

Add two new stats to the per-memcg memory.stat file which track the number
of page faults and the number of major page faults:

  "pgfault"
  "pgmajfault"

They are different from the "pgpgin"/"pgpgout" stats, which count the number
of pages charged to/uncharged from the cgroup and say nothing about reading
or writing pages to disk.

It is valuable to track the two stats both for measuring an application's
performance and for measuring the efficiency of the kernel page reclaim
path.  Counting page faults per process is useful, but we also need the
aggregated value, since processes are monitored and controlled on a
per-cgroup basis in memcg.

Functional test: check the total number of pgfault/pgmajfault of all
memcgs and compare with global vmstat value:

  $ cat /proc/vmstat | grep fault
  pgfault 1070751
  pgmajfault 553

  $ cat /dev/cgroup/memory.stat | grep fault
  pgfault 1071138
  pgmajfault 553
  total_pgfault 1071142
  total_pgmajfault 553

  $ cat /dev/cgroup/A/memory.stat | grep fault
  pgfault 199
  pgmajfault 0
  total_pgfault 199
  total_pgmajfault 0

Performance test: run the page fault test (pft) with 16 threads faulting in
15G of anon pages in a 16G container.  No regression was noticed in
"flt/cpu/s".

Sample output from pft:

  TAG pft:anon-sys-default:
    Gb  Thr CLine   User     System     Wall    flt/cpu/s fault/wsec
    15   16   1     0.67s   233.41s    14.76s   16798.546 266356.260

  +-------------------------------------------------------------------------+
      N           Min           Max        Median           Avg        Stddev
  x  10     16682.962     17344.027     16913.524     16928.812      166.5362
  +  10     16695.568     16923.896     16820.604     16824.652     84.816568
  No difference proven at 95.0% confidence

[akpm@linux-foundation.org: fix build]
[hughd@google.com: shmem fix]
Signed-off-by: Ying Han <yinghan@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 406eb0c9
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <linux/mman.h> #include <linux/mman.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/fcntl.h> #include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/system.h> #include <asm/system.h>
...@@ -92,6 +93,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, ...@@ -92,6 +93,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
* -- wli * -- wli
*/ */
count_vm_event(PGMAJFAULT); count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
return VM_FAULT_MAJOR; return VM_FAULT_MAJOR;
} }
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
#ifndef _LINUX_MEMCONTROL_H #ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/vm_event_item.h>
struct mem_cgroup; struct mem_cgroup;
struct page_cgroup; struct page_cgroup;
struct page; struct page;
...@@ -149,6 +151,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, ...@@ -149,6 +151,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
unsigned long *total_scanned); unsigned long *total_scanned);
u64 mem_cgroup_get_limit(struct mem_cgroup *mem); u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE
void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
#endif #endif
...@@ -357,6 +360,10 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head, ...@@ -357,6 +360,10 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head,
{ {
} }
/*
 * No-op stand-in for mem_cgroup_count_vm_event(): both arguments are
 * ignored and nothing is counted.
 */
static inline void
mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
}
#endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* CONFIG_CGROUP_MEM_CONT */
#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
......
...@@ -1661,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -1661,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/* No page in the page cache at all */ /* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset); do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT); count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR; ret = VM_FAULT_MAJOR;
retry_find: retry_find:
page = find_get_page(mapping, offset); page = find_get_page(mapping, offset);
......
...@@ -94,6 +94,8 @@ enum mem_cgroup_events_index { ...@@ -94,6 +94,8 @@ enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
MEM_CGROUP_EVENTS_NSTATS, MEM_CGROUP_EVENTS_NSTATS,
}; };
/* /*
...@@ -590,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, ...@@ -590,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
} }
/*
 * Add @val to the PGFAULT event counter of @mem using this_cpu_add().
 *
 * Kept file-local (static): no prototype is exported for this helper and
 * its only visible caller, mem_cgroup_count_vm_event(), lives in this file.
 */
static void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
{
	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
}
/*
 * Add @val to the PGMAJFAULT event counter of @mem using this_cpu_add().
 *
 * Kept file-local (static): no prototype is exported for this helper and
 * its only visible caller, mem_cgroup_count_vm_event(), lives in this file.
 */
static void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
{
	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
}
static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
enum mem_cgroup_events_index idx) enum mem_cgroup_events_index idx)
{ {
...@@ -827,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) ...@@ -827,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
return (mem == root_mem_cgroup); return (mem == root_mem_cgroup);
} }
/*
 * mem_cgroup_count_vm_event - account a page-fault event to a memcg
 * @mm:  address space the event belongs to; a NULL @mm is silently ignored
 * @idx: event to count; only PGFAULT and PGMAJFAULT are accepted, any
 *       other value ends in BUG()
 *
 * The memcg is looked up from @mm->owner inside an rcu_read_lock()
 * section and the matching per-memcg event counter is bumped by one.
 * If the lookup yields no memcg, nothing is counted.
 */
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *memcg;

	if (!mm)
		return;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!memcg)) {
		rcu_read_unlock();
		return;
	}

	if (idx == PGMAJFAULT)
		mem_cgroup_pgmajfault(memcg, 1);
	else if (idx == PGFAULT)
		mem_cgroup_pgfault(memcg, 1);
	else
		BUG();	/* callers must only pass the two fault events */

	rcu_read_unlock();
}
EXPORT_SYMBOL(mem_cgroup_count_vm_event);
/* /*
* Following LRU functions are allowed to be used without PCG_LOCK. * Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg. * Operations are called by routine of global LRU independently from memcg.
...@@ -3958,6 +3997,8 @@ enum { ...@@ -3958,6 +3997,8 @@ enum {
MCS_PGPGIN, MCS_PGPGIN,
MCS_PGPGOUT, MCS_PGPGOUT,
MCS_SWAP, MCS_SWAP,
MCS_PGFAULT,
MCS_PGMAJFAULT,
MCS_INACTIVE_ANON, MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON, MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE, MCS_INACTIVE_FILE,
...@@ -3980,6 +4021,8 @@ struct { ...@@ -3980,6 +4021,8 @@ struct {
{"pgpgin", "total_pgpgin"}, {"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"}, {"pgpgout", "total_pgpgout"},
{"swap", "total_swap"}, {"swap", "total_swap"},
{"pgfault", "total_pgfault"},
{"pgmajfault", "total_pgmajfault"},
{"inactive_anon", "total_inactive_anon"}, {"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"}, {"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"}, {"inactive_file", "total_inactive_file"},
...@@ -4008,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) ...@@ -4008,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE; s->stat[MCS_SWAP] += val * PAGE_SIZE;
} }
val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
s->stat[MCS_PGFAULT] += val;
val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
s->stat[MCS_PGMAJFAULT] += val;
/* per zone stat */ /* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
......
...@@ -2874,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -2874,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */ /* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR; ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT); count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(mm, PGMAJFAULT);
} else if (PageHWPoison(page)) { } else if (PageHWPoison(page)) {
/* /*
* hwpoisoned dirty swapcache pages are kept for killing * hwpoisoned dirty swapcache pages are kept for killing
...@@ -3413,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -3413,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT); count_vm_event(PGFAULT);
mem_cgroup_count_vm_event(mm, PGFAULT);
/* do counter updates before entering really critical section. */ /* do counter updates before entering really critical section. */
check_sync_rss_stat(current); check_sync_rss_stat(current);
......
...@@ -1305,12 +1305,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, ...@@ -1305,12 +1305,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
swappage = lookup_swap_cache(swap); swappage = lookup_swap_cache(swap);
if (!swappage) { if (!swappage) {
shmem_swp_unmap(entry); shmem_swp_unmap(entry);
spin_unlock(&info->lock);
/* here we actually do the io */ /* here we actually do the io */
if (type && !(*type & VM_FAULT_MAJOR)) { if (type)
__count_vm_event(PGMAJFAULT);
*type |= VM_FAULT_MAJOR; *type |= VM_FAULT_MAJOR;
}
spin_unlock(&info->lock);
swappage = shmem_swapin(swap, gfp, info, idx); swappage = shmem_swapin(swap, gfp, info, idx);
if (!swappage) { if (!swappage) {
spin_lock(&info->lock); spin_lock(&info->lock);
...@@ -1549,7 +1547,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -1549,7 +1547,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error) if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
if (ret & VM_FAULT_MAJOR) {
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
}
return ret | VM_FAULT_LOCKED; return ret | VM_FAULT_LOCKED;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment