Commit 365e9c87 authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] mm: update_hiwaters just in time

update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability.  Originally it was called whenever rss or
total_vm got raised.  Then many of those callsites were replaced by a timer
tick call from account_system_time.  Now Frank van Maarseveen reports that to
be found inadequate.  How about this?  Works for Frank.

Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm.  Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths.  Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit.  Handle
mm->hiwater_vm in the same way, though it's much less of an issue.  Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.

And there has been no collector of these hiwater statistics in the tree.  The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).

There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high.  A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.

What locking?  None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact.  But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 861f2fb8
...@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename, ...@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
/* execve success */ /* execve success */
security_bprm_free(bprm); security_bprm_free(bprm);
acct_update_integrals(current); acct_update_integrals(current);
update_mem_hiwater(current);
kfree(bprm); kfree(bprm);
return retval; return retval;
} }
......
...@@ -1207,7 +1207,6 @@ int do_execve(char * filename, ...@@ -1207,7 +1207,6 @@ int do_execve(char * filename,
/* execve success */ /* execve success */
security_bprm_free(bprm); security_bprm_free(bprm);
acct_update_integrals(current); acct_update_integrals(current);
update_mem_hiwater(current);
kfree(bprm); kfree(bprm);
return retval; return retval;
} }
......
...@@ -14,22 +14,41 @@ ...@@ -14,22 +14,41 @@
char *task_mem(struct mm_struct *mm, char *buffer) char *task_mem(struct mm_struct *mm, char *buffer)
{ {
unsigned long data, text, lib; unsigned long data, text, lib;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
* hiwater_rss only when about to *lower* total_vm or rss. Any
* collector of these hiwater stats must therefore get total_vm
* and rss too, which will usually be the higher. Barriers? not
* worth the effort, such snapshots can always be inconsistent.
*/
hiwater_vm = total_vm = mm->total_vm;
if (hiwater_vm < mm->hiwater_vm)
hiwater_vm = mm->hiwater_vm;
hiwater_rss = total_rss = get_mm_rss(mm);
if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
data = mm->total_vm - mm->shared_vm - mm->stack_vm; data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
buffer += sprintf(buffer, buffer += sprintf(buffer,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n" "VmSize:\t%8lu kB\n"
"VmLck:\t%8lu kB\n" "VmLck:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n" "VmRSS:\t%8lu kB\n"
"VmData:\t%8lu kB\n" "VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n" "VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n" "VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n" "VmLib:\t%8lu kB\n"
"VmPTE:\t%8lu kB\n", "VmPTE:\t%8lu kB\n",
(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), hiwater_vm << (PAGE_SHIFT-10),
(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10),
get_mm_rss(mm) << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib, mm->stack_vm << (PAGE_SHIFT-10), text, lib,
(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
......
...@@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm, ...@@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm,
} }
#endif /* CONFIG_PROC_FS */ #endif /* CONFIG_PROC_FS */
/* update per process rss and vm hiwater data */
extern void update_mem_hiwater(struct task_struct *tsk);
#ifndef CONFIG_DEBUG_PAGEALLOC #ifndef CONFIG_DEBUG_PAGEALLOC
static inline void static inline void
kernel_map_pages(struct page *page, int numpages, int enable) kernel_map_pages(struct page *page, int numpages, int enable)
......
...@@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); ...@@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
#define dec_mm_counter(mm, member) (mm)->_##member-- #define dec_mm_counter(mm, member) (mm)->_##member--
#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
/*
 * update_hiwater_rss - raise mm->hiwater_rss to the current rss if higher.
 *
 * Samples get_mm_rss(mm) once into a local and stores it in
 * (mm)->hiwater_rss only when it exceeds the recorded value; the mark is
 * never lowered here.  Meant to be invoked just before rss is about to be
 * decreased, so the peak is captured before it disappears.
 *
 * The macro takes no lock itself.  Note that (mm) is expanded more than
 * once, so the argument should be free of side effects.
 */
#define update_hiwater_rss(mm) do { \
unsigned long _rss = get_mm_rss(mm); \
if ((mm)->hiwater_rss < _rss) \
(mm)->hiwater_rss = _rss; \
} while (0)
/*
 * update_hiwater_vm - raise mm->hiwater_vm to the current total_vm if higher.
 *
 * Stores (mm)->total_vm in (mm)->hiwater_vm only when total_vm exceeds the
 * recorded value; the mark is never lowered here.  Meant to be invoked just
 * before total_vm is about to be decreased.
 *
 * The macro takes no lock itself, and total_vm is read twice (once for the
 * comparison, once for the store).  (mm) is expanded more than once, so the
 * argument should be free of side effects.
 */
#define update_hiwater_vm(mm) do { \
if ((mm)->hiwater_vm < (mm)->total_vm) \
(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)
typedef unsigned long mm_counter_t; typedef unsigned long mm_counter_t;
struct mm_struct { struct mm_struct {
......
...@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code) ...@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code)
preempt_count()); preempt_count());
acct_update_integrals(tsk); acct_update_integrals(tsk);
update_mem_hiwater(tsk); if (tsk->mm) {
update_hiwater_rss(tsk->mm);
update_hiwater_vm(tsk->mm);
}
group_dead = atomic_dec_and_test(&tsk->signal->live); group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) { if (group_dead) {
del_timer_sync(&tsk->signal->real_timer); del_timer_sync(&tsk->signal->real_timer);
......
...@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, ...@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
cpustat->idle = cputime64_add(cpustat->idle, tmp); cpustat->idle = cputime64_add(cpustat->idle, tmp);
/* Account for system time used */ /* Account for system time used */
acct_update_integrals(p); acct_update_integrals(p);
/* Update rss highwater mark */
update_mem_hiwater(p);
} }
/* /*
......
...@@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte) if (!pte)
goto err_unlock; goto err_unlock;
if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
update_hiwater_rss(mm);
dec_mm_counter(mm, file_rss); dec_mm_counter(mm, file_rss);
}
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
pte_val = *pte; pte_val = *pte;
......
...@@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, ...@@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
BUG_ON(start & ~HPAGE_MASK); BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
for (address = start; address < end; address += HPAGE_SIZE) { for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address); ptep = huge_pte_offset(mm, address);
if (! ptep) if (! ptep)
......
...@@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, ...@@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
lru_add_drain(); lru_add_drain();
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0); tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
tlb_finish_mmu(tlb, address, end); tlb_finish_mmu(tlb, address, end);
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
...@@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) ...@@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
EXPORT_SYMBOL(vmalloc_to_pfn); EXPORT_SYMBOL(vmalloc_to_pfn);
/*
* update_mem_hiwater
* - update per process rss and vm high water data
*/
void update_mem_hiwater(struct task_struct *tsk)
{
if (tsk->mm) {
unsigned long rss = get_mm_rss(tsk->mm);
if (tsk->mm->hiwater_rss < rss)
tsk->mm->hiwater_rss = rss;
if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
tsk->mm->hiwater_vm = tsk->mm->total_vm;
}
}
#if !defined(__HAVE_ARCH_GATE_AREA) #if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR) #if defined(AT_SYSINFO_EHDR)
......
...@@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) ...@@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
*/ */
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{ {
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
do { do {
long nrpages = vma_pages(vma); long nrpages = vma_pages(vma);
...@@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm, ...@@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm,
lru_add_drain(); lru_add_drain();
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0); tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted); vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
...@@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm) ...@@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm)
flush_cache_mm(mm); flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1); tlb = tlb_gather_mmu(mm, 1);
/* Don't update_hiwater_rss(mm) here, do_exit already did */
/* Use -1 here to ensure all VMAs in the mm are unmapped */ /* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted); vm_unacct_memory(nr_accounted);
......
...@@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, ...@@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long new_pgoff; unsigned long new_pgoff;
unsigned long moved_len; unsigned long moved_len;
unsigned long excess = 0; unsigned long excess = 0;
unsigned long hiwater_vm;
int split = 0; int split = 0;
/* /*
...@@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, ...@@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
} }
/* /*
* if we failed to move page tables we still do total_vm increment * If we failed to move page tables we still do total_vm increment
* since do_munmap() will decrement it by old_len == new_len * since do_munmap() will decrement it by old_len == new_len.
*
* Since total_vm is about to be raised artificially high for a
* moment, we need to restore high watermark afterwards: if stats
* are taken meanwhile, total_vm and hiwater_vm appear too high.
* If this were a serious issue, we'd add a flag to do_munmap().
*/ */
hiwater_vm = mm->hiwater_vm;
mm->total_vm += new_len >> PAGE_SHIFT; mm->total_vm += new_len >> PAGE_SHIFT;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
...@@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, ...@@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vm_unacct_memory(excess >> PAGE_SHIFT); vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0; excess = 0;
} }
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */ /* Restore VM_ACCOUNT if one or two pieces of vma left */
if (excess) { if (excess) {
......
...@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) ...@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
realalloc -= kobjsize(vml); realalloc -= kobjsize(vml);
askedalloc -= sizeof(*vml); askedalloc -= sizeof(*vml);
kfree(vml); kfree(vml);
update_hiwater_vm(mm);
mm->total_vm -= len >> PAGE_SHIFT; mm->total_vm -= len >> PAGE_SHIFT;
#ifdef DEBUG #ifdef DEBUG
...@@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) ...@@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{ {
} }
void update_mem_hiwater(struct task_struct *tsk)
{
unsigned long rss;
if (likely(tsk->mm)) {
rss = get_mm_rss(tsk->mm);
if (tsk->mm->hiwater_rss < rss)
tsk->mm->hiwater_rss = rss;
if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
tsk->mm->hiwater_vm = tsk->mm->total_vm;
}
}
void unmap_mapping_range(struct address_space *mapping, void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, loff_t const holebegin, loff_t const holelen,
int even_cows) int even_cows)
......
...@@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) ...@@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
if (pte_dirty(pteval)) if (pte_dirty(pteval))
set_page_dirty(page); set_page_dirty(page);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
if (PageAnon(page)) { if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private }; swp_entry_t entry = { .val = page->private };
/* /*
...@@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor, ...@@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor,
if (!pmd_present(*pmd)) if (!pmd_present(*pmd))
goto out_unlock; goto out_unlock;
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
for (original_pte = pte = pte_offset_map(pmd, address); for (original_pte = pte = pte_offset_map(pmd, address);
address < end; pte++, address += PAGE_SIZE) { address < end; pte++, address += PAGE_SIZE) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment