Commit 5da7ca86, authored by Christoph Lameter, committed by Linus Torvalds

[PATCH] Add NUMA policy support for huge pages.

The huge_zonelist() function in the memory policy layer provides a list of
zones ordered by NUMA distance.  The hugetlb layer will walk that list looking
for a zone that has available huge pages but is also in the nodeset of the
current cpuset.

This patch does not contain the folding of find_or_alloc_huge_page() that was
controversial in the earlier discussion.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 96df9333
...@@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *); ...@@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *);
int hugetlb_report_node_meminfo(int, char *); int hugetlb_report_node_meminfo(int, char *);
int is_hugepage_mem_enough(size_t); int is_hugepage_mem_enough(size_t);
unsigned long hugetlb_total_pages(void); unsigned long hugetlb_total_pages(void);
struct page *alloc_huge_page(void); struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
void free_huge_page(struct page *); void free_huge_page(struct page *);
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access); unsigned long address, int write_access);
...@@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void) ...@@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void)
#define is_hugepage_only_range(mm, addr, len) 0 #define is_hugepage_only_range(mm, addr, len) 0
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
do { } while (0) do { } while (0)
#define alloc_huge_page() ({ NULL; }) #define alloc_huge_page(vma, addr) ({ NULL; })
#define free_huge_page(p) ({ (void)(p); BUG(); }) #define free_huge_page(p) ({ (void)(p); BUG(); })
#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
......
...@@ -156,6 +156,8 @@ extern void numa_default_policy(void); ...@@ -156,6 +156,8 @@ extern void numa_default_policy(void);
extern void numa_policy_init(void); extern void numa_policy_init(void);
extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
extern struct mempolicy default_policy; extern struct mempolicy default_policy;
/* Return a zonelist suitable for a huge page allocation at addr in vma. */
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr);
#else #else
...@@ -232,6 +234,12 @@ static inline void numa_policy_rebind(const nodemask_t *old, ...@@ -232,6 +234,12 @@ static inline void numa_policy_rebind(const nodemask_t *old,
{ {
} }
/*
 * Non-NUMA variant: there is no memory policy to consult, so vma and
 * addr are ignored and the GFP_HIGHUSER zonelist of node 0 is returned.
 */
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
		unsigned long addr)
{
	struct zonelist *node0_lists = NODE_DATA(0)->node_zonelists;

	return &node0_lists[gfp_zone(GFP_HIGHUSER)];
}
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
......
...@@ -11,6 +11,8 @@ ...@@ -11,6 +11,8 @@
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/nodemask.h> #include <linux/nodemask.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <asm/page.h> #include <asm/page.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
...@@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page) ...@@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page)
free_huge_pages_node[nid]++; free_huge_pages_node[nid]++;
} }
static struct page *dequeue_huge_page(void) static struct page *dequeue_huge_page(struct vm_area_struct *vma,
unsigned long address)
{ {
int nid = numa_node_id(); int nid = numa_node_id();
struct page *page = NULL; struct page *page = NULL;
struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists; struct zonelist *zonelist = huge_zonelist(vma, address);
struct zone **z; struct zone **z;
for (z = zonelist->zones; *z; z++) { for (z = zonelist->zones; *z; z++) {
...@@ -87,13 +90,13 @@ void free_huge_page(struct page *page) ...@@ -87,13 +90,13 @@ void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock); spin_unlock(&hugetlb_lock);
} }
struct page *alloc_huge_page(void) struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{ {
struct page *page; struct page *page;
int i; int i;
spin_lock(&hugetlb_lock); spin_lock(&hugetlb_lock);
page = dequeue_huge_page(); page = dequeue_huge_page(vma, addr);
if (!page) { if (!page) {
spin_unlock(&hugetlb_lock); spin_unlock(&hugetlb_lock);
return NULL; return NULL;
...@@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count) ...@@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
spin_lock(&hugetlb_lock); spin_lock(&hugetlb_lock);
try_to_free_low(count); try_to_free_low(count);
while (count < nr_huge_pages) { while (count < nr_huge_pages) {
struct page *page = dequeue_huge_page(); struct page *page = dequeue_huge_page(NULL, 0);
if (!page) if (!page)
break; break;
update_and_free_page(page); update_and_free_page(page);
...@@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, ...@@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
flush_tlb_range(vma, start, end); flush_tlb_range(vma, start, end);
} }
static struct page *find_or_alloc_huge_page(struct address_space *mapping, static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
unsigned long idx, int shared) unsigned long addr, struct address_space *mapping,
unsigned long idx, int shared)
{ {
struct page *page; struct page *page;
int err; int err;
...@@ -378,7 +382,7 @@ static struct page *find_or_alloc_huge_page(struct address_space *mapping, ...@@ -378,7 +382,7 @@ static struct page *find_or_alloc_huge_page(struct address_space *mapping,
if (hugetlb_get_quota(mapping)) if (hugetlb_get_quota(mapping))
goto out; goto out;
page = alloc_huge_page(); page = alloc_huge_page(vma, addr);
if (!page) { if (!page) {
hugetlb_put_quota(mapping); hugetlb_put_quota(mapping);
goto out; goto out;
...@@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
} }
page_cache_get(old_page); page_cache_get(old_page);
new_page = alloc_huge_page(); new_page = alloc_huge_page(vma, address);
if (!new_page) { if (!new_page) {
page_cache_release(old_page); page_cache_release(old_page);
...@@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Use page lock to guard against racing truncation * Use page lock to guard against racing truncation
* before we get page_table_lock. * before we get page_table_lock.
*/ */
page = find_or_alloc_huge_page(mapping, idx, page = find_or_alloc_huge_page(vma, address, mapping, idx,
vma->vm_flags & VM_SHARED); vma->vm_flags & VM_SHARED);
if (!page) if (!page)
goto out; goto out;
......
...@@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol, ...@@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
return nid; return nid;
} }
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long addr, int shift)
{
if (vma) {
unsigned long off;
off = vma->vm_pgoff;
off += (addr - vma->vm_start) >> shift;
return offset_il_node(pol, vma, off);
} else
return interleave_nodes(pol);
}
/*
 * Return a zonelist suitable for a huge page allocation.
 *
 * The policy is looked up with get_vma_policy() for the current task.
 * For MPOL_INTERLEAVE a node is picked with interleave_nid() using
 * HPAGE_SHIFT granularity and that node's GFP_HIGHUSER zonelist is
 * returned.  Every other policy is resolved by zonelist_policy().
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *policy = get_vma_policy(current, vma, addr);
	unsigned node;

	if (policy->policy != MPOL_INTERLEAVE)
		return zonelist_policy(GFP_HIGHUSER, policy);

	node = interleave_nid(policy, vma, addr, HPAGE_SHIFT);
	return NODE_DATA(node)->node_zonelists + gfp_zone(GFP_HIGHUSER);
}
/* Allocate a page in interleaved policy. /* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */ Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
...@@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) ...@@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
if (unlikely(pol->policy == MPOL_INTERLEAVE)) { if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
unsigned nid; unsigned nid;
if (vma) {
unsigned long off; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
off = vma->vm_pgoff;
off += (addr - vma->vm_start) >> PAGE_SHIFT;
nid = offset_il_node(pol, vma, off);
} else {
/* fall back to process interleaving */
nid = interleave_nodes(pol);
}
return alloc_page_interleave(gfp, 0, nid); return alloc_page_interleave(gfp, 0, nid);
} }
return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment