Commit 9532fec1 authored by Mel Gorman

mm: numa: Migrate pages handled during a pmd_numa hinting fault

To say that the PMD handling code was incorrectly transferred from autonuma
is an understatement. The intention was to handle a PMD's worth of pages
in the same fault and effectively batch the taking of the PTL and the page
migrations. The copied version instead merely clears a number of pte_numa
PTE entries, and whether any page migration takes place depends on racing;
this just happens to work in some cases.

This patch handles pte_numa faults in batch when a pmd_numa fault is
handled, and migrates the pages if they are currently misplaced (a
simplified sketch of the batching idea follows the commit metadata below).
Essentially this assumes that NUMA locality occurs on a PMD boundary, but
if necessary that could be addressed by setting pmd_numa only when all the
pages within that PMD are on the same node.
Signed-off-by: Mel Gorman <mgorman@suse.de>
parent 5606e387
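
To make the batching rationale concrete before reading the diff: the point of
handling a whole PMD in one fault is to pay for the page table lock once per
PMD's worth of PTEs rather than once per page. The following stand-alone
sketch is a hypothetical user-space analogue only, not kernel code: a pthread
mutex stands in for the PTL, and 512 entries stand in for the 4K PTEs under
one 2MB PMD. Note that the real code below still drops and retakes the PTL
around each migration.

#include <pthread.h>
#include <stdio.h>

#define PTES_PER_PMD 512	/* 4K pages under one 2MB PMD on x86-64 */

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER; /* stand-in for the PTL */
static int handled;

/* Unbatched: one lock/unlock round trip per page fault handled. */
static void handle_unbatched(void)
{
	for (int i = 0; i < PTES_PER_PMD; i++) {
		pthread_mutex_lock(&ptl);
		handled++;		/* stand-in for clearing pte_numa */
		pthread_mutex_unlock(&ptl);
	}
}

/* Batched: take the lock once and handle the whole PMD's worth of PTEs. */
static void handle_batched(void)
{
	pthread_mutex_lock(&ptl);
	for (int i = 0; i < PTES_PER_PMD; i++)
		handled++;
	pthread_mutex_unlock(&ptl);
}

int main(void)
{
	handle_unbatched();
	handle_batched();
	printf("handled %d PTEs\n", handled);
	return 0;
}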
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3449,6 +3449,18 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+				unsigned long addr, int current_nid)
+{
+	get_page(page);
+
+	count_vm_numa_event(NUMA_HINT_FAULTS);
+	if (current_nid == numa_node_id())
+		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+	return mpol_misplaced(page, vma, addr);
+}
+
 int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
 {
@@ -3477,18 +3489,14 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	set_pte_at(mm, addr, ptep, pte);
 	update_mmu_cache(vma, addr, ptep);
 
-	count_vm_numa_event(NUMA_HINT_FAULTS);
 	page = vm_normal_page(vma, addr, pte);
 	if (!page) {
 		pte_unmap_unlock(ptep, ptl);
 		return 0;
 	}
 
-	get_page(page);
 	current_nid = page_to_nid(page);
-	if (current_nid == numa_node_id())
-		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
-	target_nid = mpol_misplaced(page, vma, addr);
+	target_nid = numa_migrate_prep(page, vma, addr, current_nid);
 	pte_unmap_unlock(ptep, ptl);
 	if (target_nid == -1) {
 		/*
@@ -3505,6 +3513,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		current_nid = target_nid;
 
 out:
+	if (current_nid != -1)
 	task_numa_fault(current_nid, 1);
 	return 0;
 }
@@ -3521,8 +3530,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	bool numa = false;
 	int local_nid = numa_node_id();
-	unsigned long nr_faults = 0;
-	unsigned long nr_faults_local = 0;
 
 	spin_lock(&mm->page_table_lock);
 	pmd = *pmdp;
@@ -3545,7 +3552,8 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
 		pte_t pteval = *pte;
 		struct page *page;
-		int curr_nid;
+		int curr_nid = local_nid;
+		int target_nid;
 		if (!pte_present(pteval))
 			continue;
 		if (!pte_numa(pteval))
@@ -3566,21 +3574,30 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* only check non-shared pages */
 		if (unlikely(page_mapcount(page) != 1))
 			continue;
-		pte_unmap_unlock(pte, ptl);
 
-		curr_nid = page_to_nid(page);
-		task_numa_fault(curr_nid, 1);
+		/*
+		 * Note that the NUMA fault is later accounted to either
+		 * the node that is currently running or where the page is
+		 * migrated to.
+		 */
+		curr_nid = local_nid;
+		target_nid = numa_migrate_prep(page, vma, addr,
+					       page_to_nid(page));
+		if (target_nid == -1) {
+			put_page(page);
+			continue;
+		}
 
-		nr_faults++;
-		if (curr_nid == local_nid)
-			nr_faults_local++;
+		/* Migrate to the requested node */
+		pte_unmap_unlock(pte, ptl);
+		if (migrate_misplaced_page(page, target_nid))
+			curr_nid = target_nid;
+		task_numa_fault(curr_nid, 1);
 
 		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
 	pte_unmap_unlock(orig_pte, ptl);
 
-	count_vm_numa_events(NUMA_HINT_FAULTS, nr_faults);
-	count_vm_numa_events(NUMA_HINT_FAULTS_LOCAL, nr_faults_local);
 	return 0;
 }
 #else
diff --git a/mm/mprotect.c b/mm/mprotect.c
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,12 +37,14 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		int dirty_accountable, int prot_numa, bool *ret_all_same_node)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	unsigned long pages = 0;
+	bool all_same_node = true;
+	int last_nid = -1;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -61,6 +63,12 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 				page = vm_normal_page(vma, addr, oldpte);
 				if (page) {
+					int this_nid = page_to_nid(page);
+					if (last_nid == -1)
+						last_nid = this_nid;
+					if (last_nid != this_nid)
+						all_same_node = false;
+
 					/* only check non-shared pages */
 					if (!pte_numa(oldpte) &&
 					    page_mapcount(page) == 1) {
@@ -81,7 +89,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 			if (updated)
 				pages++;
-
 			ptep_modify_prot_commit(mm, addr, pte, ptent);
 		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -101,6 +108,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
+	*ret_all_same_node = all_same_node;
 	return pages;
 }
 
@@ -127,6 +135,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *
 	pmd_t *pmd;
 	unsigned long next;
 	unsigned long pages = 0;
+	bool all_same_node;
 
 	pmd = pmd_offset(pud, addr);
 	do {
@@ -143,9 +152,15 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
 		pages += change_pte_range(vma, pmd, addr, next, newprot,
-				 dirty_accountable, prot_numa);
-
-		if (prot_numa)
+				 dirty_accountable, prot_numa, &all_same_node);
+
+		/*
+		 * If we are changing protections for NUMA hinting faults then
+		 * set pmd_numa if the examined pages were all on the same
+		 * node. This allows a regular PMD to be handled as one fault
+		 * and effectively batches the taking of the PTL
+		 */
+		if (prot_numa && all_same_node)
 			change_pmd_protnuma(vma->vm_mm, addr, pmd);
 	} while (pmd++, addr = next, addr != end);
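
The mprotect.c side feeds the PMD batching above: change_pte_range() now
reports whether every page it examined sat on the same node, and
change_pmd_range() marks the PMD pmd_numa only in that case. In isolation the
scan reduces to the pattern below; this is a hypothetical stand-alone C
rendering for illustration, not the kernel code (node IDs are non-negative,
so -1 serves as the "nothing seen yet" sentinel, exactly as last_nid does in
the patch).

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirror of the last_nid/all_same_node scan added to change_pte_range():
 * remember the first node ID seen and flag the range as mixed as soon as
 * any later entry disagrees.
 */
static bool all_on_same_node(const int *nids, int n)
{
	int last_nid = -1;
	bool all_same_node = true;

	for (int i = 0; i < n; i++) {
		if (last_nid == -1)
			last_nid = nids[i];
		if (last_nid != nids[i])
			all_same_node = false;
	}
	return all_same_node;
}

int main(void)
{
	int same[]  = { 1, 1, 1, 1 };	/* all pages on node 1 -> set pmd_numa */
	int mixed[] = { 1, 0, 1, 1 };	/* spans nodes -> leave PTEs individual */

	printf("same: %d, mixed: %d\n",
	       all_on_same_node(same, 4), all_on_same_node(mixed, 4));
	return 0;
}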