Commit ebcbc6ea, authored by Hugh Dickins, committed by Matthew Wilcox (Oracle)

mm/munlock: delete page_mlock() and all its works

We have recommended that some applications mlock their userspace, but that
turns out to be counter-productive: when many processes mlock the same
file, contention on rmap's i_mmap_rwsem can become intolerable at exit: it
is needed for write, to remove any vma mapping that file from rmap's tree;
but hogged for read by those with mlocks calling page_mlock() (formerly
known as try_to_munlock()) on *each* page mapped from the file (the
purpose being to find out whether another process has the page mlocked,
so therefore it should not be unmlocked yet).

Several optimizations have been made in the past: one is to skip
page_mlock() when mapcount tells that nothing else has this page
mapped; but that doesn't help at all when others do have it mapped.
This time around, I initially intended to add a preliminary search
of the rmap tree for overlapping VM_LOCKED ranges; but that gets
messy with locking order, when in doubt whether a page is actually
present; and risks adding even more contention on the i_mmap_rwsem.

A solution would be much easier, if only there were space in struct page
for an mlock_count... but actually, most of the time, there is space for
it - an mlocked page spends most of its life on an unevictable LRU, but
since 3.18 removed the scan_unevictable_pages sysctl, that "LRU" has
been redundant.  Let's try to reuse its page->lru.

But leave that until a later patch: in this patch, clear the ground by
removing page_mlock(), and all the infrastructure that has gathered
around it - which mostly hinders understanding, and will make reviewing
new additions harder.  Don't mind those old comments about THPs, they
date from before 4.5's refcounting rework: splitting is not a risk here.

Just keep a minimal version of munlock_vma_page(), as reminder of what it
should attend to (in particular, the odd way PGSTRANDED is counted out of
PGMUNLOCKED), and likewise a stub for munlock_vma_pages_range().  Move
unchanged __mlock_posix_error_return() out of the way, down to above its
caller: this series then makes no further change after mlock_fixup().

After this and each following commit, the kernel builds, boots and runs;
but with deficiencies which may show up in testing of mlock and munlock.
The system calls succeed or fail as before, and mlock remains effective
in preventing page reclaim; but meminfo's Unevictable and Mlocked amounts
may be shown too low after mlock, grow, then stay too high after munlock:
with previously mlocked pages remaining unevictable for too long, until
finally unmapped and freed and counts corrected. Normal service will be
resumed in "mm/munlock: mlock_pte_range() when mlocking or munlocking".
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
parent f71077a4
...@@ -237,12 +237,6 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); ...@@ -237,12 +237,6 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
*/ */
int folio_mkclean(struct folio *); int folio_mkclean(struct folio *);
/*
* called in munlock()/munmap() path to check for other vmas holding
* the page mlocked.
*/
void page_mlock(struct page *page);
void remove_migration_ptes(struct page *old, struct page *new, bool locked); void remove_migration_ptes(struct page *old, struct page *new, bool locked);
/* /*
......
...@@ -409,7 +409,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) ...@@ -409,7 +409,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
* must be called with vma's mmap_lock held for read or write, and page locked. * must be called with vma's mmap_lock held for read or write, and page locked.
*/ */
extern void mlock_vma_page(struct page *page); extern void mlock_vma_page(struct page *page);
extern unsigned int munlock_vma_page(struct page *page); extern void munlock_vma_page(struct page *page);
extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
unsigned long len); unsigned long len);
......
This diff is collapsed.
...@@ -1996,76 +1996,6 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) ...@@ -1996,76 +1996,6 @@ void try_to_migrate(struct page *page, enum ttu_flags flags)
rmap_walk(page, &rwc); rmap_walk(page, &rwc);
} }
/*
 * rmap_one callback used by page_mlock(): inspect one vma that maps @page
 * and mlock the page as soon as a VM_LOCKED mapping of it is seen.
 *
 * Returns true to let the scan continue with the next vma, or false to
 * terminate the scan once the page has been passed to mlock_vma_page().
 */
static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
			   unsigned long address, void *unused)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};

	/* A vma without VM_LOCKED has no pages to lock: move on to the next */
	if (!(vma->vm_flags & VM_LOCKED))
		return true;

	while (page_vma_mapped_walk(&pvmw)) {
		/*
		 * VM_LOCKED has to be rechecked under the ptl, to serialise
		 * with __munlock_pagevec_fill() after VM_LOCKED is cleared
		 * in munlock_vma_pages_range().
		 */
		if (!(vma->vm_flags & VM_LOCKED))
			continue;

		/*
		 * PTE-mapped THP are never marked as mlocked; but this
		 * function is never called on a DoubleMap THP, nor on an
		 * Anon THP (which may still be PTE-mapped after DoubleMap
		 * was cleared).
		 */
		mlock_vma_page(page);

		/* The page is now marked mlocked: no need to scan further */
		page_vma_mapped_walk_done(&pvmw);
		return false;
	}

	return true;
}
/**
 * page_mlock - try to mlock a page
 * @page: the page to be mlocked
 *
 * Called from munlock code.  Runs an rmap walk over the VMAs mapping @page,
 * using page_mlock_one() to mlock the page if a VM_LOCKED vma maps it.  If
 * the page is not mapped by any locked vma, it is returned with PG_mlocked
 * cleared.
 */
void page_mlock(struct page *page)
{
	struct rmap_walk_control rwc = {
		.rmap_one = page_mlock_one,
		.done = page_not_mapped,
		.anon_lock = page_lock_anon_vma_read,
	};

	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	/*
	 * Anon THP are only marked as mlocked when singly mapped, so the
	 * rmap walk is skipped for them.
	 */
	if (!PageTransCompound(page) || !PageAnon(page))
		rmap_walk(page, &rwc);
}
#ifdef CONFIG_DEVICE_PRIVATE #ifdef CONFIG_DEVICE_PRIVATE
struct make_exclusive_args { struct make_exclusive_args {
struct mm_struct *mm; struct mm_struct *mm;
...@@ -2291,11 +2221,6 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, ...@@ -2291,11 +2221,6 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* *
* Find all the mappings of a page using the mapping pointer and the vma chains * Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to. * contained in the anon_vma struct it points to.
*
* When called from page_mlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/ */
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
bool locked) bool locked)
...@@ -2344,11 +2269,6 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, ...@@ -2344,11 +2269,6 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
* *
* Find all the mappings of a page using the mapping pointer and the vma chains * Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to. * contained in the address_space struct it points to.
*
* When called from page_mlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/ */
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
bool locked) bool locked)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment