Commit 0bbbc0b3 authored by Andrea Arcangeli, committed by Linus Torvalds

thp: add numa awareness to hugepage allocations

This is mostly a matter of introducing alloc_pages_vma and replacing alloc_pages
with it.  khugepaged needs special handling: the allocation has to happen inside
collapse_huge_page, where the vma is known, and an error has to be returned to
the outer loop so it can sleep alloc_sleep_millisecs on failure.  The more
efficient logic for handling allocation failures in khugepaged is retained for
the CONFIG_NUMA=n case.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent d39d33c3
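
As a quick orientation before the diff: the patch routes transparent hugepage
allocations through the new alloc_pages_vma(), which takes the allocation order
plus the faulting vma and address so the VMA (or task) mempolicy can pick the
node. A minimal sketch of the two THP helpers added in mm/huge_memory.c,
condensed from the hunks below (the code is the patch's own; the comments are
editorial):

        /* Condensed from the mm/huge_memory.c hunks below. */
        static inline gfp_t alloc_hugepage_gfpmask(int defrag)
        {
                /* Drop __GFP_WAIT unless defrag is enabled, so the fault
                 * path does not stall in reclaim/compaction otherwise. */
                return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
        }

        static inline struct page *alloc_hugepage_vma(int defrag,
                                                      struct vm_area_struct *vma,
                                                      unsigned long haddr)
        {
                /* NUMA-aware: honours the mempolicy of @vma at @haddr. */
                return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
                                       HPAGE_PMD_ORDER, vma, haddr);
        }
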
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -331,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	return alloc_pages_current(gfp_mask, order);
 }
-extern struct page *alloc_page_vma(gfp_t gfp_mask,
+extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
+#define alloc_pages_vma(gfp_mask, order, vma, addr)	\
+	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
+#define alloc_page_vma(gfp_mask, vma, addr)		\
+	alloc_pages_vma(gfp_mask, 0, vma, addr)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -620,11 +620,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	return ret;
 }
 
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+	return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+					      struct vm_area_struct *vma,
+					      unsigned long haddr)
+{
+	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+			       HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
 static inline struct page *alloc_hugepage(int defrag)
 {
-	return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
+	return alloc_pages(alloc_hugepage_gfpmask(defrag),
 			   HPAGE_PMD_ORDER);
 }
+#endif
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			       unsigned long address, pmd_t *pmd,
@@ -639,7 +654,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			return VM_FAULT_OOM;
 		if (unlikely(khugepaged_enter(vma)))
 			return VM_FAULT_OOM;
-		page = alloc_hugepage(transparent_hugepage_defrag(vma));
+		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+					  vma, haddr);
 		if (unlikely(!page))
 			goto out;
 		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -862,7 +878,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
-		new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
+		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+					      vma, haddr);
 	else
 		new_page = NULL;
 
@@ -1661,7 +1678,11 @@ static void collapse_huge_page(struct mm_struct *mm,
 	unsigned long hstart, hend;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
 	VM_BUG_ON(!*hpage);
+#else
+	VM_BUG_ON(*hpage);
+#endif
 
 	/*
 	 * Prevent all access to pagetables with the exception of
@@ -1699,9 +1720,17 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
 		goto out;
 
+#ifndef CONFIG_NUMA
 	new_page = *hpage;
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+#else
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	if (unlikely(!new_page)) {
+		*hpage = ERR_PTR(-ENOMEM);
 		goto out;
+	}
+#endif
+	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+		goto out_put_page;
 
 	anon_vma_lock(vma->anon_vma);
@@ -1730,7 +1759,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 		spin_unlock(&mm->page_table_lock);
 		anon_vma_unlock(vma->anon_vma);
 		mem_cgroup_uncharge_page(new_page);
-		goto out;
+		goto out_put_page;
 	}
 
 	/*
@@ -1765,10 +1794,19 @@ static void collapse_huge_page(struct mm_struct *mm,
 	mm->nr_ptes--;
 	spin_unlock(&mm->page_table_lock);
 
+#ifndef CONFIG_NUMA
 	*hpage = NULL;
+#endif
 	khugepaged_pages_collapsed++;
 out:
 	up_write(&mm->mmap_sem);
+	return;
+
+out_put_page:
+#ifdef CONFIG_NUMA
+	put_page(new_page);
+#endif
+	goto out;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -2001,11 +2039,16 @@ static void khugepaged_do_scan(struct page **hpage)
 	while (progress < pages) {
 		cond_resched();
 
+#ifndef CONFIG_NUMA
 		if (!*hpage) {
 			*hpage = alloc_hugepage(khugepaged_defrag());
 			if (unlikely(!*hpage))
 				break;
 		}
+#else
+		if (IS_ERR(*hpage))
+			break;
+#endif
 
 		spin_lock(&khugepaged_mm_lock);
 		if (!khugepaged_scan.mm_slot)
@@ -2020,37 +2063,55 @@ static void khugepaged_do_scan(struct page **hpage)
 	}
 }
 
-static struct page *khugepaged_alloc_hugepage(void)
+static void khugepaged_alloc_sleep(void)
 {
-	struct page *hpage;
-
-	do {
-		hpage = alloc_hugepage(khugepaged_defrag());
-		if (!hpage) {
-			DEFINE_WAIT(wait);
-
-			add_wait_queue(&khugepaged_wait, &wait);
-			schedule_timeout_interruptible(
-				msecs_to_jiffies(
-					khugepaged_alloc_sleep_millisecs));
-			remove_wait_queue(&khugepaged_wait, &wait);
-		}
+	DEFINE_WAIT(wait);
+	add_wait_queue(&khugepaged_wait, &wait);
+	schedule_timeout_interruptible(
+		msecs_to_jiffies(
+			khugepaged_alloc_sleep_millisecs));
+	remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
+static struct page *khugepaged_alloc_hugepage(void)
+{
+	struct page *hpage;
+
+	do {
+		hpage = alloc_hugepage(khugepaged_defrag());
+		if (!hpage)
+			khugepaged_alloc_sleep();
 	} while (unlikely(!hpage) &&
 		 likely(khugepaged_enabled()));
 	return hpage;
 }
+#endif
 
 static void khugepaged_loop(void)
 {
 	struct page *hpage;
 
+#ifdef CONFIG_NUMA
+	hpage = NULL;
+#endif
 	while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
 		hpage = khugepaged_alloc_hugepage();
 		if (unlikely(!hpage))
 			break;
+#else
+		if (IS_ERR(hpage)) {
+			khugepaged_alloc_sleep();
+			hpage = NULL;
+		}
+#endif
 
 		khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
 		if (hpage)
 			put_page(hpage);
+#endif
 		if (khugepaged_has_work()) {
 			DEFINE_WAIT(wait);
 
 			if (!khugepaged_scan_sleep_millisecs)
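
To make the khugepaged #ifdefs above easier to follow: with CONFIG_NUMA=n the
daemon still preallocates one hugepage per scan pass and hands it to
collapse_huge_page; with CONFIG_NUMA=y the allocation moves into
collapse_huge_page, where the vma is known, and a failure is reported back
through *hpage as ERR_PTR(-ENOMEM) so the outer loop sleeps
alloc_sleep_millisecs before retrying. A rough, schematic-only view of the
CONFIG_NUMA=y flow, condensed from the hunks above (these are fragments, not
the literal function bodies; locking, mem-cgroup charging and the actual pmd
collapse are omitted):

        /* Inside collapse_huge_page(), once the vma for the address is known: */
        new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
        if (unlikely(!new_page)) {
                *hpage = ERR_PTR(-ENOMEM);      /* signal the outer loop */
                goto out;
        }
        /* ... collapse into new_page; on failure, put_page(new_page) ... */

        /* In khugepaged_loop(), the error pointer triggers the alloc sleep: */
        hpage = NULL;
        while (likely(khugepaged_enabled())) {
                if (IS_ERR(hpage)) {
                        khugepaged_alloc_sleep();       /* alloc_sleep_millisecs */
                        hpage = NULL;
                }
                khugepaged_do_scan(&hpage);     /* may set ERR_PTR(-ENOMEM) */
                /* ... scan-sleep / has-work handling ... */
        }
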
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 }
 
 /**
- *	alloc_page_vma	- Allocate a page for a VMA.
+ *	alloc_pages_vma	- Allocate a page for a VMA.
  *
  *	@gfp:
  *	%GFP_USER    user allocation.
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 *	%GFP_FS     allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
+*	@order:Order of the GFP allocation.
 *	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 *	Should be called with the mm_sem of the vma hold.
 */
 struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		mpol_cond_put(pol);
-		page = alloc_page_interleave(gfp, 0, nid);
+		page = alloc_page_interleave(gfp, order, nid);
 		put_mems_allowed();
 		return page;
 	}
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		/*
 		 * slow path: ref counted shared policy
 		 */
-		struct page *page =  __alloc_pages_nodemask(gfp, 0,
+		struct page *page =  __alloc_pages_nodemask(gfp, order,
 						zl, policy_nodemask(gfp, pol));
 		__mpol_put(pol);
 		put_mems_allowed();
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 	/*
 	 * fast path:  default or task policy
 	 */
-	page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+	page = __alloc_pages_nodemask(gfp, order, zl,
				      policy_nodemask(gfp, pol));
 	put_mems_allowed();
 	return page;
 }