Commit b24f53a0, authored by Lee Schermerhorn, committed by Mel Gorman

mm: mempolicy: Add MPOL_MF_LAZY

NOTE: Once again there is a lot of patch stealing and the end result
	is sufficiently different that I had to drop the signed-offs.
	Will re-add if the original authors are ok with that.

This patch adds another mbind() flag to request "lazy migration".  The
flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected
pages are marked PROT_NONE. The pages will be migrated in the fault
path on "first touch", if the policy dictates at that time.

"Lazy Migration" will allow testing of migrate-on-fault via mbind().
Also allows applications to specify that only subsequently touched
pages be migrated to obey new policy, instead of all pages in range.
This can be useful for multi-threaded applications working on a
large shared data area that is initialized by an initial thread
resulting in all pages on one [or a few, if overflowed] nodes.
Once the pages have been marked PROT_NONE, those in the regions assigned
to the worker threads will be automatically migrated local to the threads
on first touch.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
parent 4daae3b4
...@@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) ...@@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
} }
#endif #endif
#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
 * Mark the mappings of @vma in [@start, @end) so that the next access
 * raises a NUMA hinting fault.  Only declared when the architecture
 * selects CONFIG_ARCH_USES_NUMA_PROT_NONE.
 */
void change_prot_numa(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
#endif
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t); unsigned long pfn, unsigned long size, pgprot_t);
......
...@@ -49,9 +49,16 @@ enum mpol_rebind_step { ...@@ -49,9 +49,16 @@ enum mpol_rebind_step {
/* Flags for mbind */ /* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ to policy */
#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
#define MPOL_MF_LAZY (1<<3) /* Modifies MPOL_MF_MOVE*: lazy migrate on fault */
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
/*
 * Every flag that callers of mbind() may pass; do_mbind() rejects any
 * bit outside this mask with -EINVAL.
 */
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
MPOL_MF_MOVE | \
MPOL_MF_MOVE_ALL | \
MPOL_MF_LAZY)
/* /*
* Internal flags that share the struct mempolicy flags word with * Internal flags that share the struct mempolicy flags word with
......
...@@ -90,6 +90,7 @@ ...@@ -90,6 +90,7 @@
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
...@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma, ...@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
return 0; return 0;
} }
#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
 * change_prot_numa_range - arm NUMA hinting faults under one pmd
 * @mm:      address space that owns the page tables
 * @vma:     vma containing @address
 * @address: page-aligned start address (huge-page aligned when the pmd
 *           maps a transparent huge page)
 *
 * Here we search for not shared page mappings (mapcount == 1) and we
 * set up the pmd/pte_numa on those mappings so the very next access
 * will fire a NUMA hinting page fault.
 *
 * Returns the number of base pages that were newly marked.  Zero means
 * nothing changed, so the caller does not need to flush the TLB on
 * behalf of this call.  No TLB flush is done here; it is deferred to
 * the caller.
 *
 * NOTE(review): the pte walk stops at the end of the pmd or of @vma,
 * whichever comes first; it is not bounded by the end of the range the
 * caller was asked to process -- confirm this is intended.
 */
static int
change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, *_pte;
	struct page *page;
	unsigned long _address, end;
	spinlock_t *ptl;
	int ret = 0;

	VM_BUG_ON(address & ~PAGE_MASK);

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto out;

	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		/* page_table_lock is held from here until the unlock below */
		VM_BUG_ON(address & ~HPAGE_PMD_MASK);

		/*
		 * Skip huge pages that are already marked or that are
		 * shared.  Only report progress when the pmd was really
		 * changed, so the caller does not flush for nothing.
		 */
		if (!pmd_numa(*pmd) && page_mapcount(pmd_page(*pmd)) == 1) {
			set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
			ret = HPAGE_PMD_NR;
			/* defer TLB flush to lower the overhead */
		}
		spin_unlock(&mm->page_table_lock);
		goto out;
	}

	if (pmd_trans_unstable(pmd))
		goto out;
	VM_BUG_ON(!pmd_present(*pmd));

	end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	for (_address = address, _pte = pte; _address < end;
	     _pte++, _address += PAGE_SIZE) {
		pte_t pteval = *_pte;

		if (!pte_present(pteval))
			continue;
		if (pte_numa(pteval))
			continue;
		page = vm_normal_page(vma, _address, pteval);
		if (unlikely(!page))
			continue;
		/* only check non-shared pages */
		if (page_mapcount(page) != 1)
			continue;

		set_pte_at(mm, _address, _pte, pte_mknuma(pteval));

		/* defer TLB flush to lower the overhead */
		ret++;
	}
	pte_unmap_unlock(pte, ptl);

	/* At least one pte was marked: mark the pmd that covers it as well */
	if (ret && !pmd_numa(*pmd)) {
		spin_lock(&mm->page_table_lock);
		set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
		spin_unlock(&mm->page_table_lock);
		/* defer TLB flush to lower the overhead */
	}

out:
	return ret;
}
/*
 * change_prot_numa - arm NUMA hinting faults on part of a vma
 * @vma:     vma that contains the whole range
 * @address: start of the range
 * @end:     end of the range (exclusive)
 *
 * Walks the range one pmd at a time and lets change_prot_numa_range()
 * mark the mappings.  If anything was marked, the TLB is flushed once
 * for the whole range so that the hinting faults start to fire.
 *
 * Assumes mmap_sem is held.
 */
void
change_prot_numa(struct vm_area_struct *vma,
		 unsigned long address, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	/*
	 * The loop below advances @address up to (or past) @end, so keep
	 * the original start for the flush; flushing [address, end) after
	 * the loop would cover nothing.
	 */
	const unsigned long start = address;
	int progress = 0;

	while (address < end) {
		VM_BUG_ON(address < vma->vm_start ||
			  address + PAGE_SIZE > vma->vm_end);

		progress += change_prot_numa_range(mm, vma, address);
		/* step to the start of the next pmd */
		address = (address + PMD_SIZE) & PMD_MASK;
	}

	/*
	 * Flush the TLB for the mm to start the NUMA hinting
	 * page faults after we finish scanning this vma part
	 * if there were any PTE updates
	 */
	if (progress) {
		mmu_notifier_invalidate_range_start(mm, start, end);
		flush_tlb_range(vma, start, end);
		mmu_notifier_invalidate_range_end(mm, start, end);
	}
}
#else
/*
 * Fallback for builds without CONFIG_ARCH_USES_NUMA_PROT_NONE: no
 * mapping is touched and zero is always returned.
 */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	const unsigned long nr_updated = 0;	/* nothing is ever marked */

	return nr_updated;
}
#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
/* /*
* Check if all pages in a range are on a set of nodes. * Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and * If pagelist != NULL then isolate pages from the LRU and
...@@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, ...@@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return ERR_PTR(-EFAULT); return ERR_PTR(-EFAULT);
prev = NULL; prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
unsigned long endvma = vma->vm_end;
if (endvma > end)
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end) if (!vma->vm_next && vma->vm_end < end)
return ERR_PTR(-EFAULT); return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start) if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT); return ERR_PTR(-EFAULT);
} }
if (!is_vm_hugetlb_page(vma) &&
((flags & MPOL_MF_STRICT) || if (is_vm_hugetlb_page(vma))
goto next;
if (flags & MPOL_MF_LAZY) {
change_prot_numa(vma, start, endvma);
goto next;
}
if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
vma_migratable(vma)))) { vma_migratable(vma))) {
unsigned long endvma = vma->vm_end;
if (endvma > end)
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
err = check_pgd_range(vma, start, endvma, nodes, err = check_pgd_range(vma, start, endvma, nodes,
flags, private); flags, private);
if (err) { if (err) {
...@@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, ...@@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
break; break;
} }
} }
next:
prev = vma; prev = vma;
} }
return first; return first;
...@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len, ...@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len,
int err; int err;
LIST_HEAD(pagelist); LIST_HEAD(pagelist);
if (flags & ~(unsigned long)(MPOL_MF_STRICT | if (flags & ~(unsigned long)MPOL_MF_VALID)
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return -EINVAL; return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM; return -EPERM;
...@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len, ...@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (IS_ERR(new)) if (IS_ERR(new))
return PTR_ERR(new); return PTR_ERR(new);
if (flags & MPOL_MF_LAZY)
new->flags |= MPOL_F_MOF;
/* /*
* If we are using the default policy then operation * If we are using the default policy then operation
* on discontinuous address spaces is okay after all * on discontinuous address spaces is okay after all
...@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len, ...@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len,
vma = check_range(mm, start, end, nmask, vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist); flags | MPOL_MF_INVERT, &pagelist);
err = PTR_ERR(vma); err = PTR_ERR(vma); /* maybe ... */
if (!IS_ERR(vma)) { if (!IS_ERR(vma) && mode != MPOL_NOOP)
int nr_failed = 0;
err = mbind_range(mm, start, end, new); err = mbind_range(mm, start, end, new);
if (!err) {
int nr_failed = 0;
if (!list_empty(&pagelist)) { if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_vma_page, nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma, (unsigned long)vma,
false, MIGRATE_SYNC, false, MIGRATE_SYNC,
...@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len, ...@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len,
putback_lru_pages(&pagelist); putback_lru_pages(&pagelist);
} }
if (!err && nr_failed && (flags & MPOL_MF_STRICT)) if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO; err = -EIO;
} else } else
putback_lru_pages(&pagelist); putback_lru_pages(&pagelist);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment