Commit b4adddd6 authored by Andrew Morton, committed by Richard Henderson

[PATCH] low-latency pagetable teardown

Pagetable teardown can hold page_table_lock for extremely long periods -
hundreds of milliseconds.  This is pretty much the final source of high
scheduling latency in the core kernel.

We fixed it for zap_page_range() by chunking the work up and dropping the
lock occasionally if needed.  But that did not fix exit_mmap() and
unmap_region().

So this patch creates an uber-zapper, "unmap_vmas()", which provides all the
vma-walking, page unmapping and low-latency lock-dropping that
zap_page_range(), exit_mmap() and unmap_region() require.  Those three
functions are updated to call unmap_vmas().

It's actually a bit of a cleanup...
parent 670fe925
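For reference, here is a condensed sketch (not part of the patch) of the caller pattern that the three updated functions now share, using zap_page_range() as the model; the hypothetical zap_range_sketch() below simply mirrors the new code in the diff that follows:

/*
 * Sketch only: mirrors the new zap_page_range() in this patch.  unmap_vmas()
 * walks the vma list and zaps in ZAP_BLOCK_SIZE chunks, dropping and
 * retaking page_table_lock (and restarting the mmu_gather through *tlbp)
 * when a reschedule is needed.
 */
static void zap_range_sketch(struct vm_area_struct *vma,
				unsigned long address, unsigned long size)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather *tlb;
	unsigned long end = address + size;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	spin_lock(&mm->page_table_lock);
	flush_cache_range(vma, address, end);
	tlb = tlb_gather_mmu(mm, 0);
	unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted);
	tlb_finish_mmu(tlb, address, end);
	spin_unlock(&mm->page_table_lock);
}

exit_mmap() and unmap_region() follow the same shape, additionally feeding nr_accounted into vm_unacct_memory() for VM_ACCOUNT vmas.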
@@ -141,6 +141,7 @@ struct vm_operations_struct {
 /* forward declaration; pte_chain is meant to be internal to rmap.c */
 struct pte_chain;
+struct mmu_gather;

 /*
  * Each physical page in the system has a struct page associated with
@@ -357,15 +358,26 @@ extern struct page *mem_map;
 extern void show_free_areas(void);

-struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused);
+struct page *shmem_nopage(struct vm_area_struct * vma,
+			unsigned long address, int unused);
 struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags);
-extern void shmem_lock(struct file * file, int lock);
-extern int shmem_zero_setup(struct vm_area_struct *);
-
-extern void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size);
-extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
-extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
-extern int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long size, pgprot_t prot);
+void shmem_lock(struct file * file, int lock);
+int shmem_zero_setup(struct vm_area_struct *);
+
+void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+			unsigned long size);
+int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+		struct vm_area_struct *start_vma, unsigned long start_addr,
+		unsigned long end_addr, unsigned long *nr_accounted);
+void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			unsigned long address, unsigned long size);
+void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
+			struct vm_area_struct *vma);
+int remap_page_range(struct vm_area_struct *vma, unsigned long from,
+		unsigned long to, unsigned long size, pgprot_t prot);
+int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
+			unsigned long size, pgprot_t prot);

 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
@@ -454,8 +454,6 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	BUG_ON(address >= end);

-	lru_add_drain();
-
 	dir = pgd_offset(vma->vm_mm, address);
 	tlb_start_vma(tlb, vma);
 	do {
@@ -481,17 +479,106 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 #define ZAP_BLOCK_SIZE	(~(0UL))
 #endif

+/**
+ * unmap_vmas - unmap a range of memory covered by a list of vma's
+ * @tlbp: address of the caller's struct mmu_gather
+ * @mm: the controlling mm_struct
+ * @vma: the starting vma
+ * @start_addr: virtual address at which to start unmapping
+ * @end_addr: virtual address at which to end unmapping
+ * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
+ *
+ * Returns the number of vma's which were covered by the unmapping.
+ *
+ * Unmap all pages in the vma list.  Called under page_table_lock.
+ *
+ * We aim to not hold page_table_lock for too long (for scheduling latency
+ * reasons).  So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
+ * return the ending mmu_gather to the caller.
+ *
+ * Only addresses between `start' and `end' will be unmapped.
+ *
+ * The VMA list must be sorted in ascending virtual address order.
+ *
+ * unmap_vmas() assumes that the caller will flush the whole unmapped address
+ * range after unmap_vmas() returns.  So the only responsibility here is to
+ * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
+ * drops the lock and schedules.
+ */
+int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long start_addr,
+		unsigned long end_addr, unsigned long *nr_accounted)
+{
+	unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+	unsigned long tlb_start;	/* For tlb_finish_mmu */
+	int tlb_start_valid = 0;
+	int ret = 0;
+
+	if (vma) {	/* debug.  killme. */
+		if (end_addr <= vma->vm_start)
+			printk("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n",
+				__FUNCTION__, end_addr, vma->vm_start);
+		if (start_addr >= vma->vm_end)
+			printk("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n",
+				__FUNCTION__, start_addr, vma->vm_end);
+	}
+
+	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
+		unsigned long start;
+		unsigned long end;
+
+		start = max(vma->vm_start, start_addr);
+		if (start >= vma->vm_end)
+			continue;
+		end = min(vma->vm_end, end_addr);
+		if (end <= vma->vm_start)
+			continue;
+
+		if (vma->vm_flags & VM_ACCOUNT)
+			*nr_accounted += (end - start) >> PAGE_SHIFT;
+
+		ret++;
+		while (start != end) {
+			unsigned long block = min(zap_bytes, end - start);
+
+			if (!tlb_start_valid) {
+				tlb_start = start;
+				tlb_start_valid = 1;
+			}
+
+			unmap_page_range(*tlbp, vma, start, start + block);
+			start += block;
+			zap_bytes -= block;
+			if (zap_bytes != 0)
+				continue;
+			if (need_resched()) {
+				tlb_finish_mmu(*tlbp, tlb_start, start);
+				cond_resched_lock(&mm->page_table_lock);
+				*tlbp = tlb_gather_mmu(mm, 0);
+				tlb_start_valid = 0;
+			}
+			zap_bytes = ZAP_BLOCK_SIZE;
+		}
+		if (vma->vm_next && vma->vm_next->vm_start < vma->vm_end)
+			printk("%s: VMA list is not sorted correctly!\n",
+				__FUNCTION__);
+	}
+	return ret;
+}
+
 /**
  * zap_page_range - remove user pages in a given range
  * @vma: vm_area_struct holding the applicable pages
  * @address: starting address of pages to zap
  * @size: number of bytes to zap
  */
-void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size)
+void zap_page_range(struct vm_area_struct *vma,
+			unsigned long address, unsigned long size)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather *tlb;
-	unsigned long end, block;
+	unsigned long end = address + size;
+	unsigned long nr_accounted = 0;

 	might_sleep();
@@ -501,30 +588,11 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned
 	}

 	lru_add_drain();
 	spin_lock(&mm->page_table_lock);
-
-	/*
-	 * This was once a long-held spinlock. Now we break the
-	 * work up into ZAP_BLOCK_SIZE units and relinquish the
-	 * lock after each interation. This drastically lowers
-	 * lock contention and allows for a preemption point.
-	 */
-	while (size) {
-		block = (size > ZAP_BLOCK_SIZE) ? ZAP_BLOCK_SIZE : size;
-		end = address + block;
-
-		flush_cache_range(vma, address, end);
-		tlb = tlb_gather_mmu(mm, 0);
-		unmap_page_range(tlb, vma, address, end);
-		tlb_finish_mmu(tlb, address, end);
-
-		cond_resched_lock(&mm->page_table_lock);
-
-		address += block;
-		size -= block;
-	}
-
+	flush_cache_range(vma, address, end);
+	tlb = tlb_gather_mmu(mm, 0);
+	unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted);
+	tlb_finish_mmu(tlb, address, end);
 	spin_unlock(&mm->page_table_lock);
 }
@@ -23,9 +23,6 @@
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>

-extern void unmap_page_range(struct mmu_gather *,struct vm_area_struct *vma, unsigned long address, unsigned long size);
-extern void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
-
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -1003,29 +1000,18 @@ static void unmap_vma_list(struct mm_struct *mm,
  * Called with the page table lock held.
  */
 static void unmap_region(struct mm_struct *mm,
-	struct vm_area_struct *mpnt,
+	struct vm_area_struct *vma,
 	struct vm_area_struct *prev,
 	unsigned long start,
 	unsigned long end)
 {
 	struct mmu_gather *tlb;
+	unsigned long nr_accounted = 0;

+	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
-
-	do {
-		unsigned long from, to, len;
-
-		from = start < mpnt->vm_start ? mpnt->vm_start : start;
-		to = end > mpnt->vm_end ? mpnt->vm_end : end;
-
-		unmap_page_range(tlb, mpnt, from, to);
-
-		if (mpnt->vm_flags & VM_ACCOUNT) {
-			len = to - from;
-			vm_unacct_memory(len >> PAGE_SHIFT);
-		}
-	} while ((mpnt = mpnt->vm_next) != NULL);
-
+	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
+	vm_unacct_memory(nr_accounted);
 	free_pgtables(tlb, prev, start, end);
 	tlb_finish_mmu(tlb, start, end);
 }
@@ -1271,43 +1257,28 @@ void build_mmap_rb(struct mm_struct * mm)
 }

 /* Release all mmaps. */
-void exit_mmap(struct mm_struct * mm)
+void exit_mmap(struct mm_struct *mm)
 {
 	struct mmu_gather *tlb;
-	struct vm_area_struct * mpnt;
+	struct vm_area_struct *vma;
+	unsigned long nr_accounted = 0;

 	profile_exit_mmap(mm);

+	lru_add_drain();
+
 	spin_lock(&mm->page_table_lock);

 	tlb = tlb_gather_mmu(mm, 1);
 	flush_cache_mm(mm);
-	mpnt = mm->mmap;
-	while (mpnt) {
-		unsigned long start = mpnt->vm_start;
-		unsigned long end = mpnt->vm_end;
-
-		/*
-		 * If the VMA has been charged for, account for its
-		 * removal
-		 */
-		if (mpnt->vm_flags & VM_ACCOUNT)
-			vm_unacct_memory((end - start) >> PAGE_SHIFT);
-
-		mm->map_count--;
-		unmap_page_range(tlb, mpnt, start, end);
-		mpnt = mpnt->vm_next;
-	}
-
-	/* This is just debugging */
-	if (mm->map_count)
-		BUG();
-
+	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
+					TASK_SIZE, &nr_accounted);
+	vm_unacct_memory(nr_accounted);
+	BUG_ON(mm->map_count);	/* This is just debugging */
 	clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
 	tlb_finish_mmu(tlb, 0, TASK_SIZE);

-	mpnt = mm->mmap;
+	vma = mm->mmap;
 	mm->mmap = mm->mmap_cache = NULL;
 	mm->mm_rb = RB_ROOT;
 	mm->rss = 0;
@@ -1320,17 +1291,17 @@ void exit_mmap(struct mm_struct * mm)
 	 * Walk the list again, actually closing and freeing it
 	 * without holding any MM locks.
 	 */
-	while (mpnt) {
-		struct vm_area_struct * next = mpnt->vm_next;
-		remove_shared_vm_struct(mpnt);
-		if (mpnt->vm_ops) {
-			if (mpnt->vm_ops->close)
-				mpnt->vm_ops->close(mpnt);
+	while (vma) {
+		struct vm_area_struct *next = vma->vm_next;
+		remove_shared_vm_struct(vma);
+		if (vma->vm_ops) {
+			if (vma->vm_ops->close)
+				vma->vm_ops->close(vma);
 		}
-		if (mpnt->vm_file)
-			fput(mpnt->vm_file);
-		kmem_cache_free(vm_area_cachep, mpnt);
-		mpnt = next;
+		if (vma->vm_file)
+			fput(vma->vm_file);
+		kmem_cache_free(vm_area_cachep, vma);
+		vma = next;
 	}
 }