Commit 631709da authored by Andrew Morton, committed by Linus Torvalds

[PATCH] speedup heuristic for get_unmapped_area

[I was going to send shared pagetables today, but it failed in
 my testing under X :( ]

The first one is an mmap() inefficiency that was reported by Saurabh Desai.
The test_str02 NPTL test utility does the following: it tests the maximum
number of threads by creating a new thread, which in turn creates another
thread, and so on. It basically creates thousands of parallel threads,
which means thousands of thread stacks.
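
For reference, the pattern the test exercises looks roughly like this (a
minimal sketch, not the actual test_str02 source; the depth limit is made up):

#include <pthread.h>

#define MAX_THREADS	10000	/* made-up limit */

static void *spawn_next(void *arg)
{
	long depth = (long)arg;
	pthread_t next;

	if (depth >= MAX_THREADS)
		return NULL;
	/* every thread creates the next one, so all stacks stay mapped */
	if (pthread_create(&next, NULL, spawn_next, (void *)(depth + 1)) == 0)
		pthread_join(next, NULL);
	return NULL;
}

int main(void)
{
	spawn_next((void *)0L);
	return 0;
}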

NPTL uses mmap() to allocate new default thread stacks - and POSIX
requires us to install a 'guard page' as well, which is done via
mprotect(PROT_NONE) on the first page of the stack. This means that a large
number of NPTL threads results in twice as many vmas per MM, all allocated
in a forward fashion starting at the 1 GB virtual address
(TASK_UNMAPPED_BASE).
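
Roughly, each per-thread stack allocation looks like this (a sketch of the
scheme, not glibc's actual code):

#include <sys/mman.h>
#include <unistd.h>

/* one anonymous mapping plus an mprotect()ed guard page: two vmas per thread */
static void *alloc_thread_stack(size_t size)
{
	long page = sysconf(_SC_PAGESIZE);
	void *stack = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (stack == MAP_FAILED)
		return NULL;
	/* the POSIX-mandated guard page at the low end of the stack */
	if (mprotect(stack, page, PROT_NONE) != 0) {
		munmap(stack, size);
		return NULL;
	}
	return stack;
}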

Saurabh reported a slowdown after the first couple of thousand threads,
which I can reproduce as well. The reason for this slowdown is the
get_unmapped_area() implementation, which tries to achieve the most compact
virtual memory allocation by looking up the vma at TASK_UNMAPPED_BASE and
then linearly searching for a hole. With thousands of linearly allocated
vmas this becomes increasingly painful.

Obviously, high-performance threaded applications will create stacks
without the guard page, which triggers the anon-vma merging code, so we end
up with one large vma instead of tons of small vmas.

It is also possible for userspace to be smarter, by setting aside a stack
area, keeping a bitmap of allocated stacks and mapping them with MAP_FIXED
(this also lets it implement the guard page not via mprotect() but by
keeping the stacks one page apart - i.e. half the number of vmas) - but
this also decreases flexibility.
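
Such a scheme could look something like this (purely hypothetical sketch;
STACK_BASE, STACK_SIZE and NR_STACKS are made-up parameters, and picking a
safe base address is the application's problem):

#include <sys/mman.h>

#define STACK_BASE	((char *)0x50000000UL)	/* assumed to be unused */
#define STACK_SIZE	(1UL << 20)
#define GUARD		4096UL			/* unmapped gap == guard page */
#define NR_STACKS	1024

static unsigned char stack_bitmap[NR_STACKS / 8];

static void *stack_alloc(void)
{
	unsigned int i;
	void *stack;

	for (i = 0; i < NR_STACKS; i++) {
		if (stack_bitmap[i / 8] & (1 << (i % 8)))
			continue;
		/* one vma per stack; the page between slots stays unmapped */
		stack = mmap(STACK_BASE + i * (STACK_SIZE + GUARD), STACK_SIZE,
			     PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
		if (stack == MAP_FAILED)
			return NULL;
		stack_bitmap[i / 8] |= 1 << (i % 8);
		return stack;
	}
	return NULL;
}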

So I think that the default behavior nevertheless makes sense as well, and
IMO we should optimize it in the kernel.

There are various solutions to this problem, none of which solves it in a
100% sufficient way, so I went for the simplest approach: I added code to
cache the 'last known hole' address in mm->free_area_cache, which is then
used as a hint by get_unmapped_area().

This fixes the test_str02 testcase wonderfully: thread-creation performance
for this testcase is O(1) again. This simpler solution obviously has a
number of weak spots, and the (unlikely but possible) worst case is quite
close to the current situation. In any case, this approach does not
sacrifice the perfect VM compactness our mmap() implementation achieves, so
it's a performance optimization with no externally visible consequences.

The most generic and still perfectly compact VM allocation solution would
be to have a vma tree for the 'inverse virtual memory space', i.e. a tree
of free virtual memory ranges, which could be searched and iterated over
like the space of allocated vmas. I think we could do this by extending
vmas, but the drawback is larger vmas. Even this does not save us from
scanning vmas linearly, because the size constraint is still present, but
at least most anon-mmap activity is constant sized (both malloc() and the
thread-stack allocator use mostly fixed sizes).
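
Purely as an illustration of the idea, such an 'inverse' structure might be
a node per free range kept in its own rbtree, something like the following
(hypothetical; nothing of this sort is in the patch below):

#include <linux/rbtree.h>

struct free_range {
	struct rb_node	fr_rb;		/* ordered by fr_start */
	unsigned long	fr_start;	/* first free address */
	unsigned long	fr_end;		/* first address past the hole */
};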

This patch contains some fixes from Dave Miller - on some architectures
it is not possible to evaluate TASK_UNMAPPED_BASE at compile time.
parent b2205dc0
@@ -307,6 +307,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
+	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->rss = 0;
 	current->mm->mmap = NULL;
...
@@ -619,6 +619,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
 	current->mm->rss = 0;
+	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	retval = setup_arg_pages(bprm);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
...
@@ -270,7 +270,7 @@ extern unsigned int mca_pentium_flag;
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
-#define TASK_UNMAPPED_BASE	(TASK_SIZE / 3)
+#define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))

 /*
  * Size of io_bitmap in longwords: 32 is ports 0-0x3ff.
...
@@ -173,6 +173,7 @@ struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
 	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	unsigned long free_area_cache;		/* first hole */
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
...
@@ -215,6 +215,7 @@ static inline int dup_mmap(struct mm_struct * mm)
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->map_count = 0;
 	mm->rss = 0;
 	mm->cpu_vm_mask = 0;
@@ -308,6 +309,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	mm->page_table_lock = SPIN_LOCK_UNLOCKED;
 	mm->ioctx_list_lock = RW_LOCK_UNLOCKED;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->pgd = pgd_alloc(mm);
 	if (mm->pgd)
 		return mm;
...
@@ -639,24 +639,33 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 #ifndef HAVE_ARCH_UNMAPPED_AREA
 static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
+	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
+	int found_hole = 0;

 	if (len > TASK_SIZE)
 		return -ENOMEM;

 	if (addr) {
 		addr = PAGE_ALIGN(addr);
-		vma = find_vma(current->mm, addr);
+		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	addr = PAGE_ALIGN(TASK_UNMAPPED_BASE);
+	addr = mm->free_area_cache;

-	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr)
 			return -ENOMEM;
+		/*
+		 * Record the first available hole.
+		 */
+		if (!found_hole && (!vma || addr < vma->vm_start)) {
+			mm->free_area_cache = addr;
+			found_hole = 1;
+		}
 		if (!vma || addr + len <= vma->vm_start)
 			return addr;
 		addr = vma->vm_end;
@@ -947,6 +956,12 @@ void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
 	if (area->vm_flags & VM_LOCKED)
 		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
+	/*
+	 * Is this a new hole at the lowest possible address?
+	 */
+	if (area->vm_start >= TASK_UNMAPPED_BASE &&
+	    area->vm_start < area->vm_mm->free_area_cache)
+		area->vm_mm->free_area_cache = area->vm_start;
 	remove_shared_vm_struct(area);
...