Commit deceb6cd authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] mm: follow_page with inner ptlock

Final step in pushing down common core's page_table_lock.  follow_page no
longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself;
and so no page_table_lock is taken in get_user_pages itself.

But get_user_pages (and get_futex_key) do then need follow_page to pin the
page for them: take Daniel's suggestion of bitflags to follow_page.

Need one for WRITE, another for TOUCH (it was the accessed flag before:
vanished along with check_user_page_readable, but surely get_numa_maps is
wrong to mark every page it finds as accessed), another for GET.

And another, ANON to dispose of untouched_anonymous_page: it seems silly for
that to descend a second time, let follow_page observe if there was no page
table and return ZERO_PAGE if so.  Fix minor bug in that: check VM_LOCKED -
make_pages_present ought to make readonly anonymous present.

Give get_numa_maps a cond_resched while we're there.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent c34d1b4d
...@@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) ...@@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
for_each_node(i) for_each_node(i)
md->node[i] =0; md->node[i] =0;
spin_lock(&mm->page_table_lock);
for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
page = follow_page(mm, vaddr, 0); page = follow_page(mm, vaddr, 0);
if (page) { if (page) {
...@@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) ...@@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
md->anon++; md->anon++;
md->node[page_to_nid(page)]++; md->node[page_to_nid(page)]++;
} }
cond_resched();
} }
spin_unlock(&mm->page_table_lock);
return md; return md;
} }
......
...@@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) ...@@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
} }
extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
struct page *vmalloc_to_page(void *addr);
extern struct page * vmalloc_to_page(void *addr); unsigned long vmalloc_to_pfn(void *addr);
extern unsigned long vmalloc_to_pfn(void *addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
extern struct page * follow_page(struct mm_struct *mm, unsigned long address, unsigned long pfn, unsigned long size, pgprot_t);
int write);
int remap_pfn_range(struct vm_area_struct *, unsigned long, struct page *follow_page(struct mm_struct *, unsigned long address,
unsigned long, unsigned long, pgprot_t); unsigned int foll_flags);
#define FOLL_WRITE 0x01 /* check pte is writable */
#define FOLL_TOUCH 0x02 /* mark page accessed */
#define FOLL_GET 0x04 /* do get_page on page */
#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
......
...@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) ...@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
/* /*
* Do a quick atomic lookup first - this is the fastpath. * Do a quick atomic lookup first - this is the fastpath.
*/ */
spin_lock(&current->mm->page_table_lock); page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
page = follow_page(mm, uaddr, 0);
if (likely(page != NULL)) { if (likely(page != NULL)) {
key->shared.pgoff = key->shared.pgoff =
page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
spin_unlock(&current->mm->page_table_lock); put_page(page);
return 0; return 0;
} }
spin_unlock(&current->mm->page_table_lock);
/* /*
* Do it the general way. * Do it the general way.
......
...@@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, ...@@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
/* /*
* Do a quick page-table lookup for a single page. * Do a quick page-table lookup for a single page.
* mm->page_table_lock must be held.
*/ */
struct page *follow_page(struct mm_struct *mm, unsigned long address, int write) struct page *follow_page(struct mm_struct *mm, unsigned long address,
unsigned int flags)
{ {
pgd_t *pgd; pgd_t *pgd;
pud_t *pud; pud_t *pud;
pmd_t *pmd; pmd_t *pmd;
pte_t *ptep, pte; pte_t *ptep, pte;
spinlock_t *ptl;
unsigned long pfn; unsigned long pfn;
struct page *page; struct page *page;
page = follow_huge_addr(mm, address, write); page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
if (! IS_ERR(page)) if (!IS_ERR(page)) {
return page; BUG_ON(flags & FOLL_GET);
goto out;
}
page = NULL;
pgd = pgd_offset(mm, address); pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
goto out; goto no_page_table;
pud = pud_offset(pgd, address); pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud))) if (pud_none(*pud) || unlikely(pud_bad(*pud)))
goto out; goto no_page_table;
pmd = pmd_offset(pud, address); pmd = pmd_offset(pud, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto no_page_table;
if (pmd_huge(*pmd)) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out; goto out;
if (pmd_huge(*pmd)) }
return follow_huge_pmd(mm, address, pmd, write);
ptep = pte_offset_map(pmd, address); ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep) if (!ptep)
goto out; goto out;
pte = *ptep; pte = *ptep;
pte_unmap(ptep); if (!pte_present(pte))
if (pte_present(pte)) { goto unlock;
if (write && !pte_write(pte)) if ((flags & FOLL_WRITE) && !pte_write(pte))
goto out; goto unlock;
pfn = pte_pfn(pte); pfn = pte_pfn(pte);
if (pfn_valid(pfn)) { if (!pfn_valid(pfn))
goto unlock;
page = pfn_to_page(pfn); page = pfn_to_page(pfn);
if (write && !pte_dirty(pte) &&!PageDirty(page)) if (flags & FOLL_GET)
get_page(page);
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
set_page_dirty(page); set_page_dirty(page);
mark_page_accessed(page); mark_page_accessed(page);
return page;
}
} }
unlock:
pte_unmap_unlock(ptep, ptl);
out: out:
return NULL; return page;
}
static inline int
untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
unsigned long address)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
/* Check if the vma is for an anonymous mapping. */
if (vma->vm_ops && vma->vm_ops->nopage)
return 0;
/* Check if page directory entry exists. */
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return 1;
pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
return 1;
/* Check if page middle directory entry exists. */
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
return 1;
/* There is a pte slot for 'address' in 'mm'. */ no_page_table:
return 0; /*
* When core dumping an enormous anonymous area that nobody
* has touched so far, we don't want to allocate page tables.
*/
if (flags & FOLL_ANON) {
page = ZERO_PAGE(address);
if (flags & FOLL_GET)
get_page(page);
BUG_ON(flags & FOLL_WRITE);
}
return page;
} }
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
...@@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, ...@@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
struct page **pages, struct vm_area_struct **vmas) struct page **pages, struct vm_area_struct **vmas)
{ {
int i; int i;
unsigned int flags; unsigned int vm_flags;
/* /*
* Require read or write permissions. * Require read or write permissions.
* If 'force' is set, we only require the "MAY" flags. * If 'force' is set, we only require the "MAY" flags.
*/ */
flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
i = 0; i = 0;
do { do {
struct vm_area_struct * vma; struct vm_area_struct *vma;
unsigned int foll_flags;
vma = find_extend_vma(mm, start); vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(tsk, start)) { if (!vma && in_gate_area(tsk, start)) {
...@@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, ...@@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
} }
if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
|| !(flags & vma->vm_flags)) || !(vm_flags & vma->vm_flags))
return i ? : -EFAULT; return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) { if (is_vm_hugetlb_page(vma)) {
...@@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, ...@@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
&start, &len, i); &start, &len, i);
continue; continue;
} }
spin_lock(&mm->page_table_lock);
foll_flags = FOLL_TOUCH;
if (pages)
foll_flags |= FOLL_GET;
if (!write && !(vma->vm_flags & VM_LOCKED) &&
(!vma->vm_ops || !vma->vm_ops->nopage))
foll_flags |= FOLL_ANON;
do { do {
int write_access = write;
struct page *page; struct page *page;
cond_resched_lock(&mm->page_table_lock); if (write)
while (!(page = follow_page(mm, start, write_access))) { foll_flags |= FOLL_WRITE;
int ret;
/*
* Shortcut for anonymous pages. We don't want
* to force the creation of pages tables for
* insanely big anonymously mapped areas that
* nobody touched so far. This is important
* for doing a core dump for these mappings.
*/
if (!write && untouched_anonymous_page(mm,vma,start)) {
page = ZERO_PAGE(start);
break;
}
spin_unlock(&mm->page_table_lock);
ret = __handle_mm_fault(mm, vma, start, write_access);
cond_resched();
while (!(page = follow_page(mm, start, foll_flags))) {
int ret;
ret = __handle_mm_fault(mm, vma, start,
foll_flags & FOLL_WRITE);
/* /*
* The VM_FAULT_WRITE bit tells us that do_wp_page has * The VM_FAULT_WRITE bit tells us that do_wp_page has
* broken COW when necessary, even if maybe_mkwrite * broken COW when necessary, even if maybe_mkwrite
...@@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, ...@@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
* subsequent page lookups as if they were reads. * subsequent page lookups as if they were reads.
*/ */
if (ret & VM_FAULT_WRITE) if (ret & VM_FAULT_WRITE)
write_access = 0; foll_flags &= ~FOLL_WRITE;
switch (ret & ~VM_FAULT_WRITE) { switch (ret & ~VM_FAULT_WRITE) {
case VM_FAULT_MINOR: case VM_FAULT_MINOR:
...@@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, ...@@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
default: default:
BUG(); BUG();
} }
spin_lock(&mm->page_table_lock);
} }
if (pages) { if (pages) {
pages[i] = page; pages[i] = page;
flush_dcache_page(page); flush_dcache_page(page);
page_cache_get(page);
} }
if (vmas) if (vmas)
vmas[i] = vma; vmas[i] = vma;
...@@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, ...@@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
start += PAGE_SIZE; start += PAGE_SIZE;
len--; len--;
} while (len && start < vma->vm_end); } while (len && start < vma->vm_end);
spin_unlock(&mm->page_table_lock);
} while (len); } while (len);
return i; return i;
} }
......
...@@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) ...@@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma); EXPORT_SYMBOL(find_vma);
struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) struct page *follow_page(struct mm_struct *mm, unsigned long address,
unsigned int foll_flags)
{ {
return NULL; return NULL;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment