Commit ffb29b1c authored by Chen Wandun, committed by Linus Torvalds

mm/vmalloc: fix numa spreading for large hash tables

Eric Dumazet reported strange NUMA spreading behaviour in [1], and found
that commit 121e6f32 ("mm/vmalloc: hugepage vmalloc mappings") introduced
the issue [2].

Digging into the difference before and after that commit, the page
allocation call paths differ:

before:
  alloc_large_system_hash
    __vmalloc
      __vmalloc_node(..., NUMA_NO_NODE, ...)
        __vmalloc_node_range
          __vmalloc_area_node
            alloc_page /* NUMA_NO_NODE, so the alloc_page branch is chosen */
              alloc_pages_current
                alloc_page_interleave /* verified by printing the policy mode */

after:
  alloc_large_system_hash
    __vmalloc
      __vmalloc_node(..., NUMA_NO_NODE, ...)
        __vmalloc_node_range
          __vmalloc_area_node
            alloc_pages_node /* choose nid by numa_mem_id() */
              __alloc_pages_node(nid, ....)

So after commit 121e6f32 ("mm/vmalloc: hugepage vmalloc mappings"),
memory is allocated on the current node instead of being interleaved
across nodes; the sketch below illustrates why.
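
To make the difference concrete, here is a minimal C sketch of the two
allocation paths, simplified from the kernel sources of this era; the
function names are real, but the bodies are reduced to the node-selection
logic relevant here, so treat it as an illustration rather than the
verbatim implementation:

  /*
   * alloc_pages_node() never consults the task mempolicy: a
   * NUMA_NO_NODE hint is resolved to the local node up front,
   * which is why every page lands on the current node.
   */
  static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                              unsigned int order)
  {
          if (nid == NUMA_NO_NODE)
                  nid = numa_mem_id();    /* current node */

          return __alloc_pages_node(nid, gfp_mask, order);
  }

  /*
   * alloc_pages() on a NUMA kernel goes through the task mempolicy,
   * so an MPOL_INTERLEAVE policy spreads pages across nodes -- the
   * behaviour alloc_large_system_hash() relied on.
   */
  struct page *alloc_pages(gfp_t gfp, unsigned int order)
  {
          struct mempolicy *pol = get_task_policy(current);

          if (pol->mode == MPOL_INTERLEAVE)
                  return alloc_page_interleave(gfp, order,
                                               interleave_nodes(pol));

          return __alloc_pages(gfp, order,
                               policy_node(gfp, pol, numa_node_id()),
                               policy_nodemask(gfp, pol));
  }

Hence the fix in the diff below: take the node-affine paths only when the
caller asked for an explicit node, and fall back to the mempolicy-aware
alloc_pages() when nid == NUMA_NO_NODE.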

Link: https://lore.kernel.org/linux-mm/CANn89iL6AAyWhfxdHO+jaT075iOa3XcYn9k6JJc7JR2XYn6k_Q@mail.gmail.com/ [1]
Link: https://lore.kernel.org/linux-mm/CANn89iLofTR=AK-QOZY87RdUZENCZUT4O6a0hvhu3_EwRMerOg@mail.gmail.com/ [2]
Link: https://lkml.kernel.org/r/20211021080744.874701-2-chenwandun@huawei.com
Fixes: 121e6f32 ("mm/vmalloc: hugepage vmalloc mappings")
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reported-by: Eric Dumazet <edumazet@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 855d4443
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2816,6 +2816,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                 unsigned int order, unsigned int nr_pages, struct page **pages)
 {
         unsigned int nr_allocated = 0;
+        struct page *page;
+        int i;
 
         /*
          * For order-0 pages we make use of bulk allocator, if
@@ -2823,7 +2825,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
          * to fails, fallback to a single page allocator that is
          * more permissive.
          */
-        if (!order) {
+        if (!order && nid != NUMA_NO_NODE) {
                 while (nr_allocated < nr_pages) {
                         unsigned int nr, nr_pages_request;
 
@@ -2848,7 +2850,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                         if (nr != nr_pages_request)
                                 break;
                 }
-        } else
+        } else if (order)
                 /*
                  * Compound pages required for remap_vmalloc_page if
                  * high-order pages.
@@ -2856,11 +2858,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                 gfp |= __GFP_COMP;
 
         /* High-order pages or fallback path if "bulk" fails. */
-        while (nr_allocated < nr_pages) {
-                struct page *page;
-                int i;
 
-                page = alloc_pages_node(nid, gfp, order);
+        while (nr_allocated < nr_pages) {
+                if (nid == NUMA_NO_NODE)
+                        page = alloc_pages(gfp, order);
+                else
+                        page = alloc_pages_node(nid, gfp, order);
                 if (unlikely(!page))
                         break;
...