Commit 7a8010cd authored by Vlastimil Babka, committed by Linus Torvalds

mm: munlock: manual pte walk in fast path instead of follow_page_mask()

Currently munlock_vma_pages_range() calls follow_page_mask() to obtain
each individual struct page.  This entails repeated full page table
translations and page table lock taken for each page separately.

This patch avoids the costly follow_page_mask() where possible, by
iterating over ptes within single pmd under single page table lock.  The
first pte is obtained by get_locked_pte() for non-THP page acquired by the
initial follow_page_mask().  The rest of the on-stack pagevec for munlock
is filled up using pte_walk as long as pte_present() and vm_normal_page()
are sufficient to obtain the struct page.

After this patch, a 14% speedup was measured for munlocking a 56GB large
memory area with THP disabled.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jörn Engel <joern@logfs.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 5b40998a
...@@ -643,12 +643,12 @@ static inline enum zone_type page_zonenum(const struct page *page) ...@@ -643,12 +643,12 @@ static inline enum zone_type page_zonenum(const struct page *page)
#endif #endif
/* /*
* The identification function is only used by the buddy allocator for * The identification function is mainly used by the buddy allocator for
* determining if two pages could be buddies. We are not really * determining if two pages could be buddies. We are not really identifying
* identifying a zone since we could be using a the section number * the zone since we could be using the section number id if we do not have
* id if we have not node id available in page flags. * node id available in page flags.
* We guarantee only that it will return the same value for two * We only guarantee that it will return the same value for two combinable
* combinable pages in a zone. * pages in a zone.
*/ */
static inline int page_zone_id(struct page *page) static inline int page_zone_id(struct page *page)
{ {
......
...@@ -280,8 +280,7 @@ static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) ...@@ -280,8 +280,7 @@ static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
* The second phase finishes the munlock only for pages where isolation * The second phase finishes the munlock only for pages where isolation
* succeeded. * succeeded.
* *
* Note that pvec is modified during the process. Before returning * Note that the pagevec may be modified during the process.
* pagevec_reinit() is called on it.
*/ */
static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{ {
...@@ -356,8 +355,60 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) ...@@ -356,8 +355,60 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
*/ */
if (pagevec_count(&pvec_putback)) if (pagevec_count(&pvec_putback))
__putback_lru_fast(&pvec_putback, pgrescued); __putback_lru_fast(&pvec_putback, pgrescued);
}
/*
 * Fill up pagevec for __munlock_pagevec using pte walk
 *
 * The function expects that the struct page corresponding to @start address is
 * a non-THP page already pinned and in the @pvec, and that its page_zone_id()
 * equals @zoneid.
 *
 * The rest of @pvec is filled by subsequent pages within the same pmd and with
 * the same zone id, as long as the pte's are present and vm_normal_page()
 * succeeds. These pages also get pinned (get_page()).
 *
 * @pvec:   pagevec that already holds the page mapped at @start; it is only
 *          ever appended to here, never emptied.
 * @vma:    vma covering the range being walked.
 * @zoneid: page_zone_id() of the page already in @pvec.
 * @start:  address of the page already in @pvec.
 * @end:    end of the range to munlock; the walk is additionally clamped so
 *          that it does not leave the pmd that maps @start.
 *
 * Returns the address of the next page that should be scanned. This equals
 * @start + PAGE_SIZE when no page could be added by the pte walk.
 */
static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
		struct vm_area_struct *vma, int zoneid, unsigned long start,
		unsigned long end)
{
	pte_t *pte;
	spinlock_t *ptl;

	/*
	 * Initialize pte walk starting at the already pinned page where we
	 * are sure that there is a pte.
	 */
	pte = get_locked_pte(vma->vm_mm, start, &ptl);
	end = min(end, pmd_addr_end(start, end));

	/*
	 * @pvec must NOT be reinitialized here: it already carries the page
	 * for @start that the caller pinned and added. Emptying it would drop
	 * that page from the batch, so it would never be munlocked and its
	 * reference would never be released.
	 */

	/* The page next to the pinned page is the first we will try to get */
	start += PAGE_SIZE;
	while (start < end) {
		struct page *page = NULL;

		/* Keep pte in step with start: advance to the pte for start. */
		pte++;
		if (pte_present(*pte))
			page = vm_normal_page(vma, start, *pte);
		/*
		 * Break if page could not be obtained or the page's node+zone
		 * does not match
		 */
		if (!page || page_zone_id(page) != zoneid)
			break;

		get_page(page);
		/*
		 * Increase the address that will be returned *before* the
		 * eventual break due to pvec becoming full by adding the page
		 */
		start += PAGE_SIZE;
		if (pagevec_add(pvec, page) == 0)
			break;
	}
	pte_unmap_unlock(pte, ptl);
	return start;
}
/* /*
...@@ -381,17 +432,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) ...@@ -381,17 +432,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
void munlock_vma_pages_range(struct vm_area_struct *vma, void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end) unsigned long start, unsigned long end)
{ {
struct pagevec pvec;
struct zone *zone = NULL;
pagevec_init(&pvec, 0);
vma->vm_flags &= ~VM_LOCKED; vma->vm_flags &= ~VM_LOCKED;
while (start < end) { while (start < end) {
struct page *page; struct page *page = NULL;
unsigned int page_mask, page_increm; unsigned int page_mask, page_increm;
struct zone *pagezone; struct pagevec pvec;
struct zone *zone;
int zoneid;
pagevec_init(&pvec, 0);
/* /*
* Although FOLL_DUMP is intended for get_dump_page(), * Although FOLL_DUMP is intended for get_dump_page(),
* it just so happens that its special treatment of the * it just so happens that its special treatment of the
...@@ -401,21 +451,9 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, ...@@ -401,21 +451,9 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
*/ */
page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
&page_mask); &page_mask);
if (page && !IS_ERR(page)) { if (page && !IS_ERR(page)) {
pagezone = page_zone(page);
/* The whole pagevec must be in the same zone */
if (pagezone != zone) {
if (pagevec_count(&pvec))
__munlock_pagevec(&pvec, zone);
zone = pagezone;
}
if (PageTransHuge(page)) { if (PageTransHuge(page)) {
/*
* THP pages are not handled by pagevec due
* to their possible split (see below).
*/
if (pagevec_count(&pvec))
__munlock_pagevec(&pvec, zone);
lock_page(page); lock_page(page);
/* /*
* Any THP page found by follow_page_mask() may * Any THP page found by follow_page_mask() may
...@@ -428,21 +466,31 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, ...@@ -428,21 +466,31 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
put_page(page); /* follow_page_mask() */ put_page(page); /* follow_page_mask() */
} else { } else {
/* /*
* Non-huge pages are handled in batches * Non-huge pages are handled in batches via
* via pagevec. The pin from * pagevec. The pin from follow_page_mask()
* follow_page_mask() prevents them from * prevents them from collapsing by THP.
* collapsing by THP.
*/ */
if (pagevec_add(&pvec, page) == 0) pagevec_add(&pvec, page);
zone = page_zone(page);
zoneid = page_zone_id(page);
/*
* Try to fill the rest of pagevec using fast
* pte walk. This will also update start to
* the next page to process. Then munlock the
* pagevec.
*/
start = __munlock_pagevec_fill(&pvec, vma,
zoneid, start, end);
__munlock_pagevec(&pvec, zone); __munlock_pagevec(&pvec, zone);
goto next;
} }
} }
page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
start += page_increm * PAGE_SIZE; start += page_increm * PAGE_SIZE;
next:
cond_resched(); cond_resched();
} }
if (pagevec_count(&pvec))
__munlock_pagevec(&pvec, zone);
} }
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment