Commit 37a1c49a, authored by Andrea Arcangeli, committed by Linus Torvalds

thp: mremap support and TLB optimization

This adds THP support to mremap (decreases the number of split_huge_page()
calls).

Here are also some benchmarks with a proggy like this:

===
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>

#define SIZE (5UL*1024*1024*1024)

/*
 * Allocate a 2M-aligned buffer of the given size, or print the reason
 * and exit.  posix_memalign() reports failure through its return value
 * and does not set errno, so strerror() is applied to that value
 * instead of calling perror().
 */
static char *alloc_2m_aligned(size_t size)
{
	void *buf = NULL;
	int err = posix_memalign(&buf, 2*1024*1024, size);

	if (err) {
		fprintf(stderr, "memalign: %s\n", strerror(err));
		exit(1);
	}
	return buf;
}

/*
 * Benchmark: time one mremap() that moves the SIZE-byte region at p
 * onto the 2M-aligned address p3, print the elapsed microseconds, then
 * compare the moved data with the identically filled buffer p2.
 *
 * Exits with status 1 on allocation failure, mremap failure, or a
 * content mismatch; returns 0 after printing "ok" otherwise.
 */
int main(void)
{
	static struct timeval oldstamp, newstamp;
	long diffsec;
	char *p, *p2, *p3, *p4;

	p = alloc_2m_aligned(SIZE);
	p2 = alloc_2m_aligned(SIZE);
	p3 = alloc_2m_aligned(4096);

	/* Touch every page so the regions are populated before timing. */
	memset(p, 0xff, SIZE);
	memset(p2, 0xff, SIZE);
	memset(p3, 0x77, 4096);

	gettimeofday(&oldstamp, NULL);
	p4 = mremap(p, SIZE, SIZE, MREMAP_FIXED|MREMAP_MAYMOVE, p3);
	gettimeofday(&newstamp, NULL);

	diffsec = newstamp.tv_sec - oldstamp.tv_sec;
	diffsec = newstamp.tv_usec - oldstamp.tv_usec + 1000000 * diffsec;
	printf("usec %ld\n", diffsec);

	/* Test the value mremap() returned, not the source pointer. */
	if (p4 == MAP_FAILED || p4 != p3) {
		perror("mremap");
		exit(1);
	}

	/* p and p2 were filled identically, so the moved data must match p2. */
	if (memcmp(p4, p2, SIZE)) {
		printf("mremap bug\n");
		exit(1);
	}
	printf("ok\n");

	return 0;
}
===

THP on

 Performance counter stats for './largepage13' (3 runs):

          69195836 dTLB-loads                 ( +-   3.546% )  (scaled from 50.30%)
             60708 dTLB-load-misses           ( +-  11.776% )  (scaled from 52.62%)
         676266476 dTLB-stores                ( +-   5.654% )  (scaled from 69.54%)
             29856 dTLB-store-misses          ( +-   4.081% )  (scaled from 89.22%)
        1055848782 iTLB-loads                 ( +-   4.526% )  (scaled from 80.18%)
              8689 iTLB-load-misses           ( +-   2.987% )  (scaled from 58.20%)

        7.314454164  seconds time elapsed   ( +-   0.023% )

THP off

 Performance counter stats for './largepage13' (3 runs):

        1967379311 dTLB-loads                 ( +-   0.506% )  (scaled from 60.59%)
           9238687 dTLB-load-misses           ( +-  22.547% )  (scaled from 61.87%)
        2014239444 dTLB-stores                ( +-   0.692% )  (scaled from 60.40%)
           3312335 dTLB-store-misses          ( +-   7.304% )  (scaled from 67.60%)
        6764372065 iTLB-loads                 ( +-   0.925% )  (scaled from 79.00%)
              8202 iTLB-load-misses           ( +-   0.475% )  (scaled from 70.55%)

        9.693655243  seconds time elapsed   ( +-   0.069% )

grep thp /proc/vmstat
thp_fault_alloc 35849
thp_fault_fallback 0
thp_collapse_alloc 3
thp_collapse_alloc_failed 0
thp_split 0

thp_split 0 confirms no thp split despite plenty of hugepages allocated.

Measuring only the mremap time (i.e. excluding the three long memsets
and the final long memcmp that accesses 10GB of memory):

THP on

usec 14824
usec 14862
usec 14859

THP off

usec 256416
usec 255981
usec 255847

With an older kernel without the mremap optimizations (the below patch
optimizes the non THP version too).

THP on

usec 392107
usec 390237
usec 404124

THP off

usec 444294
usec 445237
usec 445820

I guess with a threaded program that sends more IPI on large SMP it'd
create an even larger difference.

All debug options are off except DEBUG_VM to avoid skewing the
results.

The only problem for a native 2M mremap, like the one above, is that both
the source and destination addresses must be 2M aligned, or the hugepmd
can't be moved without a split; but that is a hardware limitation.

[akpm@linux-foundation.org: coding-style nitpicking]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 7b6efc2b
...@@ -22,6 +22,11 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, ...@@ -22,6 +22,11 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, unsigned long addr, unsigned long end,
unsigned char *vec); unsigned char *vec);
/*
 * Try to move a transparent huge pmd from old_pmd to new_pmd.
 * Returns 1 if the pmd was moved, -1 if the old pmd was found in the
 * splitting state, and 0 if nothing was done.
 */
extern int move_huge_pmd(struct vm_area_struct *vma,
struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot); unsigned long addr, pgprot_t newprot);
......
...@@ -1052,6 +1052,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -1052,6 +1052,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
return ret; return ret;
} }
/*
 * move_huge_pmd - try to relocate a transparent huge pmd as a whole.
 *
 * Returns:
 *   1  the huge pmd was cleared from old_pmd and installed at new_pmd;
 *  -1  old_pmd was in the splitting state; the split has been waited for;
 *   0  nothing was done (misaligned address, less than one huge page left
 *      in the range, VM_NOHUGEPAGE destination, destination pmd already
 *      populated, or old_pmd no longer huge once the lock was taken).
 */
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
		  unsigned long old_addr,
		  unsigned long new_addr, unsigned long old_end,
		  pmd_t *old_pmd, pmd_t *new_pmd)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;

	/* Source and destination must both sit on a huge page boundary. */
	if (old_addr & ~HPAGE_PMD_MASK)
		return 0;
	if (new_addr & ~HPAGE_PMD_MASK)
		return 0;
	/* A whole huge page must remain in the range being moved. */
	if (old_end - old_addr < HPAGE_PMD_SIZE)
		return 0;
	if (new_vma->vm_flags & VM_NOHUGEPAGE)
		return 0;

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON(!pmd_none(*new_pmd))) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return 0;
	}

	spin_lock(&mm->page_table_lock);

	/* Re-check under the lock: the pmd may no longer be huge. */
	if (!likely(pmd_trans_huge(*old_pmd))) {
		spin_unlock(&mm->page_table_lock);
		return 0;
	}

	/* Drop the lock before waiting for the in-progress split. */
	if (pmd_trans_splitting(*old_pmd)) {
		spin_unlock(&mm->page_table_lock);
		wait_split_huge_page(vma->anon_vma, old_pmd);
		return -1;
	}

	/* Move the huge pmd entry from the old slot to the new one. */
	entry = pmdp_get_and_clear(mm, old_addr, old_pmd);
	VM_BUG_ON(!pmd_none(*new_pmd));
	set_pmd_at(mm, new_addr, new_pmd, entry);
	spin_unlock(&mm->page_table_lock);

	return 1;
}
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot) unsigned long addr, pgprot_t newprot)
{ {
......
...@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) ...@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return NULL; return NULL;
pmd = pmd_offset(pud, addr); pmd = pmd_offset(pud, addr);
split_huge_page_pmd(mm, pmd); if (pmd_none(*pmd))
if (pmd_none_or_clear_bad(pmd))
return NULL; return NULL;
return pmd; return pmd;
...@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL; return NULL;
VM_BUG_ON(pmd_trans_huge(*pmd)); VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
return NULL;
return pmd; return pmd;
} }
...@@ -149,6 +146,23 @@ unsigned long move_page_tables(struct vm_area_struct *vma, ...@@ -149,6 +146,23 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd) if (!new_pmd)
break; break;
if (pmd_trans_huge(*old_pmd)) {
int err = 0;
if (extent == HPAGE_PMD_SIZE)
err = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
if (err > 0) {
need_flush = true;
continue;
} else if (!err) {
split_huge_page_pmd(vma->vm_mm, old_pmd);
}
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
new_pmd, new_addr))
break;
next = (new_addr + PMD_SIZE) & PMD_MASK; next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr) if (extent > next - new_addr)
extent = next - new_addr; extent = next - new_addr;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment