Commit 051bd3c6 authored by Andrea Arcangeli

mm: gup: make get_user_pages_fast and __get_user_pages_fast latency conscious

This teaches gup_fast and __gup_fast to re-enable irqs and, if possible,
call cond_resched() every BATCH_PAGES pages.

This must be implemented by the other archs as well; it is a requirement
before converting more get_user_pages() callers to get_user_pages_fast()
as an optimization (instead of using get_user_pages_unlocked(), which
would be slower).
parent 123cb69c
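
Before the diff, a small illustration of the callers this change is aimed at. The helper below is hypothetical (it is not part of this patch or of the tree); it only shows that gup_fast callers may legitimately pass a large nr_pages, which is exactly the case where the pre-patch code kept irqs disabled for the whole page-table walk.

/* Hypothetical caller, for illustration only: pin a large user buffer. */
#include <linux/mm.h>
#include <linux/errno.h>

static int pin_user_buffer(unsigned long uaddr, int nr_pages,
                           struct page **pages)
{
        int got, i;

        /* May pin fewer pages than requested, never more. */
        got = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
        if (got == nr_pages)
                return 0;

        /* Partial pin: drop the references we did take and give up. */
        for (i = 0; i < got; i++)
                put_page(pages[i]);
        return got < 0 ? got : -EFAULT;
}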
@@ -12,6 +12,12 @@
 #include <asm/pgtable.h>
 
+/*
+ * Keep irq disabled for no more than BATCH_PAGES pages.
+ * Matches PTRS_PER_PTE (or half in non-PAE kernels).
+ */
+#define BATCH_PAGES 512
+
 static inline pte_t gup_get_pte(pte_t *ptep)
 {
 #ifndef CONFIG_X86_PAE
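
A quick sanity check on the constant (illustrative arithmetic, assuming 4 KiB base pages): one batch spans

        512 pages * 4096 bytes/page = 2 MiB

of user address space, i.e. exactly the range covered by one fully populated page table on x86-64 and PAE, where PTRS_PER_PTE is 512. On non-PAE 32-bit, where PTRS_PER_PTE is 1024, it is half of one, which is what the comment above means by "or half in non-PAE kernels".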
@@ -250,6 +256,40 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
         return 1;
 }
 
+static inline int __get_user_pages_fast_batch(unsigned long start,
+                                              unsigned long end,
+                                              int write, struct page **pages)
+{
+        struct mm_struct *mm = current->mm;
+        unsigned long next;
+        unsigned long flags;
+        pgd_t *pgdp;
+        int nr = 0;
+
+        /*
+         * This doesn't prevent pagetable teardown, but does prevent
+         * the pagetables and pages from being freed on x86.
+         *
+         * So long as we atomically load page table pointers versus teardown
+         * (which we do on x86, with the above PAE exception), we can follow the
+         * address down to the the page and take a ref on it.
+         */
+        local_irq_save(flags);
+        pgdp = pgd_offset(mm, start);
+        do {
+                pgd_t pgd = *pgdp;
+
+                next = pgd_addr_end(start, end);
+                if (pgd_none(pgd))
+                        break;
+                if (!gup_pud_range(pgd, start, next, write, pages, &nr))
+                        break;
+        } while (pgdp++, start = next, start != end);
+        local_irq_restore(flags);
+
+        return nr;
+}
+
 /*
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
@@ -257,31 +297,55 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                           struct page **pages)
 {
-        struct mm_struct *mm = current->mm;
-        unsigned long addr, len, end;
-        unsigned long next;
-        unsigned long flags;
-        pgd_t *pgdp;
-        int nr = 0;
+        unsigned long len, end, batch_pages;
+        int nr, ret;
 
         start &= PAGE_MASK;
-        addr = start;
         len = (unsigned long) nr_pages << PAGE_SHIFT;
         end = start + len;
+        /*
+         * get_user_pages() handles nr_pages == 0 gracefully, but
+         * gup_fast starts walking the first pagetable in a do {}
+         * while() fashion so it's not robust to handle nr_pages ==
+         * 0. There's no point in being permissive about end < start
+         * either. So this check verifies both nr_pages being non
+         * zero, and that "end" didn't overflow.
+         */
+        VM_BUG_ON(end <= start);
         if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                 (void __user *)start, len)))
                 return 0;
 
-        /*
-         * XXX: batch / limit 'nr', to avoid large irq off latency
-         * needs some instrumenting to determine the common sizes used by
-         * important workloads (eg. DB2), and whether limiting the batch size
-         * will decrease performance.
-         *
-         * It seems like we're in the clear for the moment. Direct-IO is
-         * the main guy that batches up lots of get_user_pages, and even
-         * they are limited to 64-at-a-time which is not so many.
-         */
+        ret = 0;
+        for (;;) {
+                batch_pages = nr_pages;
+                if (batch_pages > BATCH_PAGES && !irqs_disabled())
+                        batch_pages = BATCH_PAGES;
+                len = (unsigned long) batch_pages << PAGE_SHIFT;
+                end = start + len;
+                nr = __get_user_pages_fast_batch(start, end, write, pages);
+                VM_BUG_ON(nr > batch_pages);
+                nr_pages -= nr;
+                ret += nr;
+                if (!nr_pages || nr != batch_pages)
+                        break;
+                start += len;
+                pages += batch_pages;
+        }
+
+        return ret;
+}
+
+static inline int get_user_pages_fast_batch(unsigned long start,
+                                            unsigned long end,
+                                            int write, struct page **pages)
+{
+        struct mm_struct *mm = current->mm;
+        unsigned long next;
+        pgd_t *pgdp;
+        int nr = 0;
+        unsigned long orig_start = start;
+
         /*
          * This doesn't prevent pagetable teardown, but does prevent
          * the pagetables and pages from being freed on x86.
@@ -290,18 +354,24 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
          * (which we do on x86, with the above PAE exception), we can follow the
          * address down to the the page and take a ref on it.
          */
-        local_irq_save(flags);
-        pgdp = pgd_offset(mm, addr);
+        local_irq_disable();
+        pgdp = pgd_offset(mm, start);
         do {
                 pgd_t pgd = *pgdp;
 
-                next = pgd_addr_end(addr, end);
-                if (pgd_none(pgd))
+                next = pgd_addr_end(start, end);
+                if (pgd_none(pgd)) {
+                        VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT);
                         break;
-                if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+                }
+                if (!gup_pud_range(pgd, start, next, write, pages, &nr)) {
+                        VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT);
                         break;
-        } while (pgdp++, addr = next, addr != end);
-        local_irq_restore(flags);
+                }
+        } while (pgdp++, start = next, start != end);
+        local_irq_enable();
 
+        cond_resched();
         return nr;
 }
@@ -326,68 +396,63 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                         struct page **pages)
 {
         struct mm_struct *mm = current->mm;
-        unsigned long addr, len, end;
-        unsigned long next;
-        pgd_t *pgdp;
-        int nr = 0;
+        unsigned long len, end, batch_pages;
+        int nr, ret;
+        unsigned long orig_start;
 
         start &= PAGE_MASK;
-        addr = start;
+        orig_start = start;
         len = (unsigned long) nr_pages << PAGE_SHIFT;
 
         end = start + len;
-        if (end < start)
-                goto slow_irqon;
+        /*
+         * get_user_pages() handles nr_pages == 0 gracefully, but
+         * gup_fast starts walking the first pagetable in a do {}
+         * while() fashion so it's not robust to handle nr_pages ==
+         * 0. There's no point in being permissive about end < start
+         * either. So this check verifies both nr_pages being non
+         * zero, and that "end" didn't overflow.
+         */
+        VM_BUG_ON(end <= start);
+
+        nr = ret = 0;
 
 #ifdef CONFIG_X86_64
         if (end >> __VIRTUAL_MASK_SHIFT)
                 goto slow_irqon;
 #endif
 
-        /*
-         * XXX: batch / limit 'nr', to avoid large irq off latency
-         * needs some instrumenting to determine the common sizes used by
-         * important workloads (eg. DB2), and whether limiting the batch size
-         * will decrease performance.
-         *
-         * It seems like we're in the clear for the moment. Direct-IO is
-         * the main guy that batches up lots of get_user_pages, and even
-         * they are limited to 64-at-a-time which is not so many.
-         */
-        /*
-         * This doesn't prevent pagetable teardown, but does prevent
-         * the pagetables and pages from being freed on x86.
-         *
-         * So long as we atomically load page table pointers versus teardown
-         * (which we do on x86, with the above PAE exception), we can follow the
-         * address down to the the page and take a ref on it.
-         */
-        local_irq_disable();
-        pgdp = pgd_offset(mm, addr);
-        do {
-                pgd_t pgd = *pgdp;
-
-                next = pgd_addr_end(addr, end);
-                if (pgd_none(pgd))
-                        goto slow;
-                if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
-                        goto slow;
-        } while (pgdp++, addr = next, addr != end);
-        local_irq_enable();
-
-        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
-        return nr;
-
-        {
-                int ret;
-slow:
-                local_irq_enable();
+        for (;;) {
+                batch_pages = min(nr_pages, BATCH_PAGES);
+                len = (unsigned long) batch_pages << PAGE_SHIFT;
+                end = start + len;
+                nr = get_user_pages_fast_batch(start, end, write, pages);
+                VM_BUG_ON(nr > batch_pages);
+                nr_pages -= nr;
+                ret += nr;
+                if (!nr_pages)
+                        break;
+                if (nr < batch_pages)
+                        goto slow_irqon;
+                start += len;
+                pages += batch_pages;
+        }
+        VM_BUG_ON(ret != (end - orig_start) >> PAGE_SHIFT);
+        return ret;
+
 slow_irqon:
         /* Try to get the remaining pages with get_user_pages */
         start += nr << PAGE_SHIFT;
         pages += nr;
 
+        /*
+         * "nr" was the get_user_pages_fast_batch last retval, "ret"
+         * was the sum of all get_user_pages_fast_batch retvals, now
+         * "nr" becomes the sum of all get_user_pages_fast_batch
+         * retvals and "ret" will become the get_user_pages_unlocked
+         * retval.
+         */
+        nr = ret;
+
         ret = get_user_pages_unlocked(current, mm, start,
                                       (end - start) >> PAGE_SHIFT,
                                       write, 0, pages);
@@ -401,5 +466,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
         }
 
         return ret;
-        }
 }
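
To make the return-value bookkeeping in the slow path concrete, a worked example with invented numbers: suppose the caller asked for 1024 pages, the fast batches pinned 700 before hitting an unpopulated range, and get_user_pages_unlocked() then pinned 200 of the remaining 324.

        nr  = 700              (sum of the fast-batch return values)
        ret = 200              (get_user_pages_unlocked() return value)
        returned value = nr + ret = 900

If get_user_pages_unlocked() instead returned an error, get_user_pages_fast() would still return 700, so the caller knows how many pages it holds references to and must release with put_page().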