Commit 8aa3448c, authored by Andrew Morton, committed by Linus Torvalds

[PATCH] rmap 39 add anon_vma rmap

From: Hugh Dickins <hugh@veritas.com>

Andrea Arcangeli's anon_vma object-based reverse mapping scheme for anonymous
pages.  Instead of tracking anonymous pages by pte_chains or by mm, this
tracks them by vma.  But because vmas are frequently split and merged
(particularly by mprotect), a page cannot point directly to its vma(s), but
instead to an anon_vma list of those vmas likely to contain the page - a list
on which vmas can easily be linked and unlinked as they come and go.  The vmas
on one list are all related, either by forking or by splitting.

This has three particular advantages over anonmm: that it can cope
effortlessly with mremap moves; and no longer needs page_table_lock to protect
an mm's vma tree, since try_to_unmap finds vmas via page -> anon_vma -> vma
instead of using find_vma; and should use less cpu for swapout since it can
locate its anonymous vmas more quickly.

It does have disadvantages too: a lot more change in mmap.c to deal with
anon_vmas, though small straightforward additions now that the vma merging has
been refactored there; more lowmem needed for each anon_vma and vma structure;
an additional restriction on the merging of vmas (cannot be merged if already
assigned different anon_vmas, since then their pages will be pointing to
different heads).

(There would be no need to enlarge the vma structure if anonymous pages
belonged only to anonymous vmas; but private file mappings accumulate
anonymous pages by copy-on-write, so need to be listed in both anon_vma and
prio_tree at the same time.  A different implementation could avoid that by
using anon_vmas only for purely anonymous vmas, and use the existing prio_tree
to locate cow pages - but that would involve a long search for each single
private copy, probably not a good idea.)

Where before the vm_pgoff of a purely anonymous (not file-backed) vma was
meaningless, now it represents the virtual start address at which that vma is
mapped - which the standard file pgoff manipulations treat linearly as vmas
are split and merged.  But if mremap moves the vma, then it generally carries
its original vm_pgoff to the new location, so pages shared with the old
location can still be found.  Magic.

Hugh has massaged it somewhat: building on the earlier rmap patches, this
patch is a fifth of the size of Andrea's original anon_vma patch.  Please note
that this posting will be Andrea's first sight of this patch, which he may or
may not approve.
parent a89cd0f0
...@@ -303,6 +303,9 @@ void install_arg_page(struct vm_area_struct *vma, ...@@ -303,6 +303,9 @@ void install_arg_page(struct vm_area_struct *vma,
pmd_t * pmd; pmd_t * pmd;
pte_t * pte; pte_t * pte;
if (unlikely(anon_vma_prepare(vma)))
goto out_sig;
flush_dcache_page(page); flush_dcache_page(page);
pgd = pgd_offset(mm, address); pgd = pgd_offset(mm, address);
...@@ -329,6 +332,7 @@ void install_arg_page(struct vm_area_struct *vma, ...@@ -329,6 +332,7 @@ void install_arg_page(struct vm_area_struct *vma,
return; return;
out: out:
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
out_sig:
__free_page(page); __free_page(page);
force_sig(SIGKILL, current); force_sig(SIGKILL, current);
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <linux/fs.h> #include <linux/fs.h>
struct mempolicy; struct mempolicy;
struct anon_vma;
#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr; extern unsigned long max_mapnr;
...@@ -78,6 +79,15 @@ struct vm_area_struct { ...@@ -78,6 +79,15 @@ struct vm_area_struct {
struct prio_tree_node prio_tree_node; struct prio_tree_node prio_tree_node;
} shared; } shared;
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
* or brk vma (with NULL file) can only be in an anon_vma list.
*/
struct list_head anon_vma_node; /* Serialized by anon_vma->lock */
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
/* Function pointers to deal with this struct. */ /* Function pointers to deal with this struct. */
struct vm_operations_struct * vm_ops; struct vm_operations_struct * vm_ops;
...@@ -201,7 +211,12 @@ struct page { ...@@ -201,7 +211,12 @@ struct page {
* if PagePrivate set; used for * if PagePrivate set; used for
* swp_entry_t if PageSwapCache * swp_entry_t if PageSwapCache
*/ */
struct address_space *mapping; /* The inode (or ...) we belong to. */ struct address_space *mapping; /* If PG_anon clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, PG_anon is set, and
* it points to anon_vma object.
*/
pgoff_t index; /* Our offset within mapping. */ pgoff_t index; /* Our offset within mapping. */
struct list_head lru; /* Pageout list, eg. active_list struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock ! * protected by zone->lru_lock !
...@@ -610,7 +625,8 @@ extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, ...@@ -610,7 +625,8 @@ extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
extern struct vm_area_struct *vma_merge(struct mm_struct *, extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end, struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct file *, pgoff_t, struct mempolicy *); unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
struct mempolicy *);
extern int split_vma(struct mm_struct *, extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below); struct vm_area_struct *, unsigned long addr, int new_below);
extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
......
...@@ -2,18 +2,75 @@ ...@@ -2,18 +2,75 @@
#define _LINUX_RMAP_H #define _LINUX_RMAP_H
/* /*
* Declarations for Reverse Mapping functions in mm/rmap.c * Declarations for Reverse Mapping functions in mm/rmap.c
* Its structures are declared within that file.
*/ */
#include <linux/config.h> #include <linux/config.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#define page_map_lock(page) \ #define page_map_lock(page) \
bit_spin_lock(PG_maplock, (unsigned long *)&(page)->flags) bit_spin_lock(PG_maplock, (unsigned long *)&(page)->flags)
#define page_map_unlock(page) \ #define page_map_unlock(page) \
bit_spin_unlock(PG_maplock, (unsigned long *)&(page)->flags) bit_spin_unlock(PG_maplock, (unsigned long *)&(page)->flags)
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
* the vmas on the list will be related by forking, or by splitting.
*
* Since vmas come and go as they are split and merged (particularly
* in mprotect), the mapping field of an anonymous page cannot point
* directly to a vma: instead it points to an anon_vma, on whose list
* the related vmas can be easily linked or unlinked.
*
* After unlinking the last vma on the list, we must garbage collect
* the anon_vma object itself: we're guaranteed no page can be
* pointing to this anon_vma once its vma list is empty.
*/
struct anon_vma {
spinlock_t lock; /* Serializes access to the vma list headed by 'head' */
struct list_head head; /* List of private "related" vmas (related by fork
			* or split); presumably linked through each vma's
			* anon_vma_node field - confirm in mm/rmap.c */
};
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
extern kmem_cache_t *anon_vma_cachep;
/*
 * Allocate one anon_vma object from anon_vma_cachep using SLAB_KERNEL.
 * Returns whatever kmem_cache_alloc() returns; the caller must check
 * the result before use.
 */
static inline struct anon_vma *anon_vma_alloc(void)
{
	struct anon_vma *fresh;

	fresh = kmem_cache_alloc(anon_vma_cachep, SLAB_KERNEL);
	return fresh;
}
/*
 * Hand an anon_vma object back to anon_vma_cachep.
 * NOTE(review): presumably only to be called once the anon_vma's vma
 * list is empty, so that no page can still point at it - confirm
 * against the callers in mm/rmap.c.
 */
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
kmem_cache_free(anon_vma_cachep, anon_vma);
}
/*
 * Take the lock of the anon_vma attached to @vma.
 * Does nothing when the vma has no anon_vma assigned.
 */
static inline void anon_vma_lock(struct vm_area_struct *vma)
{
	/* Read the pointer once so the NULL test and the lock agree. */
	struct anon_vma *av = vma->anon_vma;

	if (!av)
		return;
	spin_lock(&av->lock);
}
/*
 * Release the lock of the anon_vma attached to @vma.
 * Does nothing when the vma has no anon_vma assigned.
 */
static inline void anon_vma_unlock(struct vm_area_struct *vma)
{
	/* Read the pointer once so the NULL test and the unlock agree. */
	struct anon_vma *av = vma->anon_vma;

	if (!av)
		return;
	spin_unlock(&av->lock);
}
/*
* anon_vma helper functions.
*/
void anon_vma_init(void); /* create anon_vma_cachep */
int anon_vma_prepare(struct vm_area_struct *);
void __anon_vma_merge(struct vm_area_struct *, struct vm_area_struct *);
void anon_vma_unlink(struct vm_area_struct *);
void anon_vma_link(struct vm_area_struct *);
void __anon_vma_link(struct vm_area_struct *);
/* /*
* rmap interfaces called when adding or removing pte of page * rmap interfaces called when adding or removing pte of page
*/ */
...@@ -43,6 +100,10 @@ int try_to_unmap(struct page *); ...@@ -43,6 +100,10 @@ int try_to_unmap(struct page *);
#else /* !CONFIG_MMU */ #else /* !CONFIG_MMU */
#define anon_vma_init() do {} while (0)
#define anon_vma_prepare(vma) (0)
#define anon_vma_link(vma) do {} while (0)
#define page_referenced(page) TestClearPageReferenced(page) #define page_referenced(page) TestClearPageReferenced(page)
#define try_to_unmap(page) SWAP_FAIL #define try_to_unmap(page) SWAP_FAIL
......
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/efi.h> #include <linux/efi.h>
#include <linux/unistd.h> #include <linux/unistd.h>
#include <linux/rmap.h>
#include <asm/io.h> #include <asm/io.h>
#include <asm/bugs.h> #include <asm/bugs.h>
...@@ -461,6 +462,7 @@ asmlinkage void __init start_kernel(void) ...@@ -461,6 +462,7 @@ asmlinkage void __init start_kernel(void)
pidmap_init(); pidmap_init();
pgtable_cache_init(); pgtable_cache_init();
prio_tree_init(); prio_tree_init();
anon_vma_init();
#ifdef CONFIG_X86 #ifdef CONFIG_X86
if (efi_enabled) if (efi_enabled)
efi_enter_virtual_mode(); efi_enter_virtual_mode();
......
...@@ -322,8 +322,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) ...@@ -322,8 +322,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
tmp->vm_flags &= ~VM_LOCKED; tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm; tmp->vm_mm = mm;
tmp->vm_next = NULL; tmp->vm_next = NULL;
file = tmp->vm_file; anon_vma_link(tmp);
vma_prio_tree_init(tmp); vma_prio_tree_init(tmp);
file = tmp->vm_file;
if (file) { if (file) {
struct inode *inode = file->f_dentry->d_inode; struct inode *inode = file->f_dentry->d_inode;
get_file(file); get_file(file);
......
...@@ -1071,6 +1071,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, ...@@ -1071,6 +1071,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
page_cache_get(old_page); page_cache_get(old_page);
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
if (unlikely(anon_vma_prepare(vma)))
goto no_new_page;
new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!new_page) if (!new_page)
goto no_new_page; goto no_new_page;
...@@ -1405,6 +1407,8 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1405,6 +1407,8 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap(page_table); pte_unmap(page_table);
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
page = alloc_page_vma(GFP_HIGHUSER, vma, addr); page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
if (!page) if (!page)
goto no_mem; goto no_mem;
...@@ -1487,7 +1491,11 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1487,7 +1491,11 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Should we do an early C-O-W break? * Should we do an early C-O-W break?
*/ */
if (write_access && !(vma->vm_flags & VM_SHARED)) { if (write_access && !(vma->vm_flags & VM_SHARED)) {
struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address); struct page *page;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!page) if (!page)
goto oom; goto oom;
copy_user_highpage(page, new_page, address); copy_user_highpage(page, new_page, address);
......
This diff is collapsed.
...@@ -147,7 +147,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, ...@@ -147,7 +147,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
*/ */
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags, *pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->vm_file, pgoff, vma_policy(vma)); vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
if (*pprev) { if (*pprev) {
vma = *pprev; vma = *pprev;
goto success; goto success;
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment