Commit 4fc3f1d6 authored by Ingo Molnar's avatar Ingo Molnar Committed by Mel Gorman

mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable

rmap_walk_anon() and try_to_unmap_anon() appears to be too
careful about locking the anon vma: while it needs protection
against anon vma list modifications, it does not need exclusive
access to the list itself.

Transforming this exclusive lock to a read-locked rwsem removes
a global lock from the hot path of page-migration intense
threaded workloads which can cause pathological performance like
this:

    96.43%        process 0  [kernel.kallsyms]  [k] perf_trace_sched_switch
                  |
                  --- perf_trace_sched_switch
                      __schedule
                      schedule
                      schedule_preempt_disabled
                      __mutex_lock_common.isra.6
                      __mutex_lock_slowpath
                      mutex_lock
                     |
                     |--50.61%-- rmap_walk
                     |          move_to_new_page
                     |          migrate_pages
                     |          migrate_misplaced_page
                     |          __do_numa_page.isra.69
                     |          handle_pte_fault
                     |          handle_mm_fault
                     |          __do_page_fault
                     |          do_page_fault
                     |          page_fault
                     |          __memset_sse2
                     |          |
                     |           --100.00%-- worker_thread
                     |                     |
                     |                      --100.00%-- start_thread
                     |
                      --49.39%-- page_lock_anon_vma
                                try_to_unmap_anon
                                try_to_unmap
                                migrate_pages
                                migrate_misplaced_page
                                __do_numa_page.isra.69
                                handle_pte_fault
                                handle_mm_fault
                                __do_page_fault
                                do_page_fault
                                page_fault
                                __memset_sse2
                                |
                                 --100.00%-- worker_thread
                                           start_thread

With this change applied the profile is now nicely flat
and there's no anon-vma related scheduling/blocking.

Rename anon_vma_[un]lock() => anon_vma_[un]lock_write(),
to make it clearer that it's an exclusive write-lock in
that case - suggested by Rik van Riel.
Suggested-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
Signed-off-by: default avatarMel Gorman <mgorman@suse.de>
parent 5a505085
...@@ -102,7 +102,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); ...@@ -102,7 +102,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
#define wait_split_huge_page(__anon_vma, __pmd) \ #define wait_split_huge_page(__anon_vma, __pmd) \
do { \ do { \
pmd_t *____pmd = (__pmd); \ pmd_t *____pmd = (__pmd); \
anon_vma_lock(__anon_vma); \ anon_vma_lock_write(__anon_vma); \
anon_vma_unlock(__anon_vma); \ anon_vma_unlock(__anon_vma); \
BUG_ON(pmd_trans_splitting(*____pmd) || \ BUG_ON(pmd_trans_splitting(*____pmd) || \
pmd_trans_huge(*____pmd)); \ pmd_trans_huge(*____pmd)); \
......
...@@ -118,7 +118,7 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) ...@@ -118,7 +118,7 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
up_write(&anon_vma->root->rwsem); up_write(&anon_vma->root->rwsem);
} }
static inline void anon_vma_lock(struct anon_vma *anon_vma) static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{ {
down_write(&anon_vma->root->rwsem); down_write(&anon_vma->root->rwsem);
} }
...@@ -128,6 +128,17 @@ static inline void anon_vma_unlock(struct anon_vma *anon_vma) ...@@ -128,6 +128,17 @@ static inline void anon_vma_unlock(struct anon_vma *anon_vma)
up_write(&anon_vma->root->rwsem); up_write(&anon_vma->root->rwsem);
} }
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
down_read(&anon_vma->root->rwsem);
}
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
up_read(&anon_vma->root->rwsem);
}
/* /*
* anon_vma helper functions. * anon_vma helper functions.
*/ */
...@@ -220,8 +231,8 @@ int try_to_munlock(struct page *); ...@@ -220,8 +231,8 @@ int try_to_munlock(struct page *);
/* /*
* Called by memory-failure.c to kill processes. * Called by memory-failure.c to kill processes.
*/ */
struct anon_vma *page_lock_anon_vma(struct page *page); struct anon_vma *page_lock_anon_vma_read(struct page *page);
void page_unlock_anon_vma(struct anon_vma *anon_vma); void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
/* /*
......
...@@ -1549,7 +1549,7 @@ int split_huge_page(struct page *page) ...@@ -1549,7 +1549,7 @@ int split_huge_page(struct page *page)
int ret = 1; int ret = 1;
BUG_ON(!PageAnon(page)); BUG_ON(!PageAnon(page));
anon_vma = page_lock_anon_vma(page); anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma) if (!anon_vma)
goto out; goto out;
ret = 0; ret = 0;
...@@ -1562,7 +1562,7 @@ int split_huge_page(struct page *page) ...@@ -1562,7 +1562,7 @@ int split_huge_page(struct page *page)
BUG_ON(PageCompound(page)); BUG_ON(PageCompound(page));
out_unlock: out_unlock:
page_unlock_anon_vma(anon_vma); page_unlock_anon_vma_read(anon_vma);
out: out:
return ret; return ret;
} }
...@@ -2074,7 +2074,7 @@ static void collapse_huge_page(struct mm_struct *mm, ...@@ -2074,7 +2074,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
goto out; goto out;
anon_vma_lock(vma->anon_vma); anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address); pte = pte_offset_map(pmd, address);
ptl = pte_lockptr(mm, pmd); ptl = pte_lockptr(mm, pmd);
......
...@@ -1634,7 +1634,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, ...@@ -1634,7 +1634,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
struct anon_vma_chain *vmac; struct anon_vma_chain *vmac;
struct vm_area_struct *vma; struct vm_area_struct *vma;
anon_vma_lock(anon_vma); anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) { 0, ULONG_MAX) {
vma = vmac->vma; vma = vmac->vma;
...@@ -1688,7 +1688,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) ...@@ -1688,7 +1688,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
struct anon_vma_chain *vmac; struct anon_vma_chain *vmac;
struct vm_area_struct *vma; struct vm_area_struct *vma;
anon_vma_lock(anon_vma); anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) { 0, ULONG_MAX) {
vma = vmac->vma; vma = vmac->vma;
...@@ -1741,7 +1741,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, ...@@ -1741,7 +1741,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
struct anon_vma_chain *vmac; struct anon_vma_chain *vmac;
struct vm_area_struct *vma; struct vm_area_struct *vma;
anon_vma_lock(anon_vma); anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) { 0, ULONG_MAX) {
vma = vmac->vma; vma = vmac->vma;
......
...@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, ...@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct anon_vma *av; struct anon_vma *av;
pgoff_t pgoff; pgoff_t pgoff;
av = page_lock_anon_vma(page); av = page_lock_anon_vma_read(page);
if (av == NULL) /* Not actually mapped anymore */ if (av == NULL) /* Not actually mapped anymore */
return; return;
...@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, ...@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
} }
} }
read_unlock(&tasklist_lock); read_unlock(&tasklist_lock);
page_unlock_anon_vma(av); page_unlock_anon_vma_read(av);
} }
/* /*
......
...@@ -754,7 +754,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, ...@@ -754,7 +754,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/ */
if (PageAnon(page)) { if (PageAnon(page)) {
/* /*
* Only page_lock_anon_vma() understands the subtleties of * Only page_lock_anon_vma_read() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms. * getting a hold on an anon_vma from outside one of its mms.
*/ */
anon_vma = page_get_anon_vma(page); anon_vma = page_get_anon_vma(page);
......
...@@ -602,7 +602,7 @@ again: remove_next = 1 + (end > next->vm_end); ...@@ -602,7 +602,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (anon_vma) { if (anon_vma) {
VM_BUG_ON(adjust_next && next->anon_vma && VM_BUG_ON(adjust_next && next->anon_vma &&
anon_vma != next->anon_vma); anon_vma != next->anon_vma);
anon_vma_lock(anon_vma); anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma); anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next) if (adjust_next)
anon_vma_interval_tree_pre_update_vma(next); anon_vma_interval_tree_pre_update_vma(next);
......
...@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, ...@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
} }
if (vma->anon_vma) { if (vma->anon_vma) {
anon_vma = vma->anon_vma; anon_vma = vma->anon_vma;
anon_vma_lock(anon_vma); anon_vma_lock_write(anon_vma);
} }
} }
......
...@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) ...@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
VM_BUG_ON(atomic_read(&anon_vma->refcount)); VM_BUG_ON(atomic_read(&anon_vma->refcount));
/* /*
* Synchronize against page_lock_anon_vma() such that * Synchronize against page_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting * we can safely hold the lock without the anon_vma getting
* freed. * freed.
* *
* Relies on the full mb implied by the atomic_dec_and_test() from * Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by * put_anon_vma() against the acquire barrier implied by
* mutex_trylock() from page_lock_anon_vma(). This orders: * down_read_trylock() from page_lock_anon_vma_read(). This orders:
* *
* page_lock_anon_vma() VS put_anon_vma() * page_lock_anon_vma_read() VS put_anon_vma()
* mutex_trylock() atomic_dec_and_test() * down_read_trylock() atomic_dec_and_test()
* LOCK MB * LOCK MB
* atomic_read() mutex_is_locked() * atomic_read() rwsem_is_locked()
* *
* LOCK should suffice since the actual taking of the lock must * LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows. * happen _before_ what follows.
*/ */
if (rwsem_is_locked(&anon_vma->root->rwsem)) { if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock(anon_vma); anon_vma_lock_write(anon_vma);
anon_vma_unlock(anon_vma); anon_vma_unlock(anon_vma);
} }
...@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, ...@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* allocate a new one. * allocate a new one.
* *
* Anon-vma allocations are very subtle, because we may have * Anon-vma allocations are very subtle, because we may have
* optimistically looked up an anon_vma in page_lock_anon_vma() * optimistically looked up an anon_vma in page_lock_anon_vma_read()
* and that may actually touch the spinlock even in the newly * and that may actually touch the spinlock even in the newly
* allocated vma (it depends on RCU to make sure that the * allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed). * anon_vma isn't actually destroyed).
...@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) ...@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
allocated = anon_vma; allocated = anon_vma;
} }
anon_vma_lock(anon_vma); anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */ /* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) { if (likely(!vma->anon_vma)) {
...@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) ...@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root); get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */ /* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma; vma->anon_vma = anon_vma;
anon_vma_lock(anon_vma); anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma); anon_vma_chain_link(vma, avc, anon_vma);
anon_vma_unlock(anon_vma); anon_vma_unlock(anon_vma);
...@@ -442,7 +442,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) ...@@ -442,7 +442,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex. * reference like with page_get_anon_vma() and then block on the mutex.
*/ */
struct anon_vma *page_lock_anon_vma(struct page *page) struct anon_vma *page_lock_anon_vma_read(struct page *page)
{ {
struct anon_vma *anon_vma = NULL; struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma; struct anon_vma *root_anon_vma;
...@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) ...@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = ACCESS_ONCE(anon_vma->root); root_anon_vma = ACCESS_ONCE(anon_vma->root);
if (down_write_trylock(&root_anon_vma->rwsem)) { if (down_read_trylock(&root_anon_vma->rwsem)) {
/* /*
* If the page is still mapped, then this anon_vma is still * If the page is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will * its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free(). * not go away, see anon_vma_free().
*/ */
if (!page_mapped(page)) { if (!page_mapped(page)) {
up_write(&root_anon_vma->rwsem); up_read(&root_anon_vma->rwsem);
anon_vma = NULL; anon_vma = NULL;
} }
goto out; goto out;
...@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) ...@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
/* we pinned the anon_vma, its safe to sleep */ /* we pinned the anon_vma, its safe to sleep */
rcu_read_unlock(); rcu_read_unlock();
anon_vma_lock(anon_vma); anon_vma_lock_read(anon_vma);
if (atomic_dec_and_test(&anon_vma->refcount)) { if (atomic_dec_and_test(&anon_vma->refcount)) {
/* /*
* Oops, we held the last refcount, release the lock * Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because * and bail -- can't simply use put_anon_vma() because
* we'll deadlock on the anon_vma_lock() recursion. * we'll deadlock on the anon_vma_lock_write() recursion.
*/ */
anon_vma_unlock(anon_vma); anon_vma_unlock_read(anon_vma);
__put_anon_vma(anon_vma); __put_anon_vma(anon_vma);
anon_vma = NULL; anon_vma = NULL;
} }
...@@ -504,9 +504,9 @@ struct anon_vma *page_lock_anon_vma(struct page *page) ...@@ -504,9 +504,9 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
return anon_vma; return anon_vma;
} }
void page_unlock_anon_vma(struct anon_vma *anon_vma) void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{ {
anon_vma_unlock(anon_vma); anon_vma_unlock_read(anon_vma);
} }
/* /*
...@@ -732,7 +732,7 @@ static int page_referenced_anon(struct page *page, ...@@ -732,7 +732,7 @@ static int page_referenced_anon(struct page *page,
struct anon_vma_chain *avc; struct anon_vma_chain *avc;
int referenced = 0; int referenced = 0;
anon_vma = page_lock_anon_vma(page); anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma) if (!anon_vma)
return referenced; return referenced;
...@@ -754,7 +754,7 @@ static int page_referenced_anon(struct page *page, ...@@ -754,7 +754,7 @@ static int page_referenced_anon(struct page *page,
break; break;
} }
page_unlock_anon_vma(anon_vma); page_unlock_anon_vma_read(anon_vma);
return referenced; return referenced;
} }
...@@ -1474,7 +1474,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) ...@@ -1474,7 +1474,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
struct anon_vma_chain *avc; struct anon_vma_chain *avc;
int ret = SWAP_AGAIN; int ret = SWAP_AGAIN;
anon_vma = page_lock_anon_vma(page); anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma) if (!anon_vma)
return ret; return ret;
...@@ -1501,7 +1501,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) ...@@ -1501,7 +1501,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
break; break;
} }
page_unlock_anon_vma(anon_vma); page_unlock_anon_vma_read(anon_vma);
return ret; return ret;
} }
...@@ -1696,7 +1696,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, ...@@ -1696,7 +1696,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
int ret = SWAP_AGAIN; int ret = SWAP_AGAIN;
/* /*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma() * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages * because that depends on page_mapped(); but not all its usages
* are holding mmap_sem. Users without mmap_sem are required to * are holding mmap_sem. Users without mmap_sem are required to
* take a reference count to prevent the anon_vma disappearing * take a reference count to prevent the anon_vma disappearing
...@@ -1704,7 +1704,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, ...@@ -1704,7 +1704,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
anon_vma = page_anon_vma(page); anon_vma = page_anon_vma(page);
if (!anon_vma) if (!anon_vma)
return ret; return ret;
anon_vma_lock(anon_vma); anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma; struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma); unsigned long address = vma_address(page, vma);
...@@ -1712,7 +1712,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, ...@@ -1712,7 +1712,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (ret != SWAP_AGAIN) if (ret != SWAP_AGAIN)
break; break;
} }
anon_vma_unlock(anon_vma); anon_vma_unlock_read(anon_vma);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment