Commit 6c542ab7, authored by Aneesh Kumar K.V, committed by Andrew Morton

mm/demotion: build demotion targets based on explicit memory tiers

This patch switches the demotion target building logic to use memory tiers
instead of NUMA distance.  All N_MEMORY NUMA nodes will be placed in the
default memory tier and additional memory tiers will be added by drivers
like dax kmem.

This patch builds the demotion target for a NUMA node by looking at all
memory tiers below the tier to which the NUMA node belongs.  The closest
node in the immediately following memory tier is used as a demotion
target.

Since we are now only building demotion targets for N_MEMORY NUMA nodes, the
CPU hotplug calls are removed in this patch.

Link: https://lkml.kernel.org/r/20220818131042.113280-6-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Wei Xu <weixugc@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hesham Almatary <hesham.almatary@huawei.com>
Cc: Jagdish Gediya <jvgediya.oss@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 7b88bda3
......@@ -37,6 +37,14 @@ struct memory_dev_type *alloc_memory_type(int adistance);
void destroy_memory_type(struct memory_dev_type *memtype);
void init_node_memory_type(int node, struct memory_dev_type *default_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
#else
/* CONFIG_MIGRATION is off: there is never a demotion target to report. */
static inline int next_demotion_node(int node)
{
	return NUMA_NO_NODE;
}
#endif
#else
......@@ -63,5 +71,10 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt
{
}
/* CONFIG_NUMA is off: no node can be a demotion target. */
static inline int next_demotion_node(int node)
{
	return NUMA_NO_NODE;
}
#endif /* CONFIG_NUMA */
#endif /* _LINUX_MEMORY_TIERS_H */
......@@ -100,19 +100,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
#endif /* CONFIG_MIGRATION */
/*
 * Demotion-target interface: real implementations are declared when both
 * migration and NUMA are configured; otherwise callers get no-op stubs and
 * next_demotion_node() always reports NUMA_NO_NODE.
 */
#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA)
void set_migration_target_nodes(void);
void migrate_on_reclaim_init(void);
int next_demotion_node(int node);
#else
static inline void set_migration_target_nodes(void)
{
}

static inline void migrate_on_reclaim_init(void)
{
}

static inline int next_demotion_node(int node)
{
	return NUMA_NO_NODE;
}
#endif
#ifdef CONFIG_COMPACTION
bool PageMovable(struct page *page);
void __SetPageMovable(struct page *page, const struct movable_operations *ops);
......
......@@ -6,6 +6,8 @@
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include "internal.h"
struct memory_tier {
/* hierarchy of memory tiers */
struct list_head list;
......@@ -19,6 +21,10 @@ struct memory_tier {
int adistance_start;
};
/* Demotion targets of one node; node_demotion[] holds one entry per node. */
struct demotion_nodes {
	/* candidate target nodes; next_demotion_node() picks one at random */
	nodemask_t preferred;
};
struct node_memory_type_map {
struct memory_dev_type *memtype;
int map_count;
......@@ -28,6 +34,66 @@ static DEFINE_MUTEX(memory_tier_lock);
/*
 * All memory tiers, linked through memory_tier::list. The entry following a
 * tier is treated as the next lower tier when demotion targets are chosen.
 */
static LIST_HEAD(memory_tiers);
/* Per-node memory type bookkeeping, indexed by node id. */
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
/* NOTE(review): presumably the memory type for regular DRAM nodes; its users are not visible here -- confirm. */
static struct memory_dev_type *default_dram_type;
#ifdef CONFIG_MIGRATION
/*
* node_demotion[] examples:
*
* Example 1:
*
* Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
*
* node distances:
* node 0 1 2 3
* 0 10 20 30 40
* 1 20 10 40 30
* 2 30 40 10 40
* 3 40 30 40 10
*
* memory_tiers0 = 0-1
* memory_tiers1 = 2-3
*
* node_demotion[0].preferred = 2
* node_demotion[1].preferred = 3
* node_demotion[2].preferred = <empty>
* node_demotion[3].preferred = <empty>
*
* Example 2:
*
* Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
*
* node distances:
* node 0 1 2
* 0 10 20 30
* 1 20 10 30
* 2 30 30 10
*
* memory_tiers0 = 0-2
*
* node_demotion[0].preferred = <empty>
* node_demotion[1].preferred = <empty>
* node_demotion[2].preferred = <empty>
*
* Example 3:
*
* Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
*
* node distances:
* node 0 1 2
* 0 10 20 30
* 1 20 10 40
* 2 30 40 10
*
* memory_tiers0 = 1
* memory_tiers1 = 0
* memory_tiers2 = 2
*
* node_demotion[0].preferred = 2
* node_demotion[1].preferred = 0
* node_demotion[2].preferred = <empty>
*
*/
/*
 * Allocated in memory_tier_init() with nr_node_ids entries. Stays NULL if
 * that allocation fails, in which case no demotion targets are built and
 * next_demotion_node() returns NUMA_NO_NODE.
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
......@@ -73,6 +139,154 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
return new_memtier;
}
/*
 * __node_get_memory_tier() - look up the memory tier a node currently sits in
 * @node: NUMA node id, used as an index into node_memory_types[]
 *
 * Return: the tier of the node's memory type, or NULL when the node has no
 * memory type recorded or is not set in that type's node mask.
 */
static struct memory_tier *__node_get_memory_tier(int node)
{
	struct memory_dev_type *memtype;

	/*
	 * node_memory_types[] holds struct node_memory_type_map entries, so
	 * the type pointer has to be read from the ->memtype member rather
	 * than taken from the array element itself.
	 */
	memtype = node_memory_types[node].memtype;
	if (memtype && node_isset(node, memtype->nodes))
		return memtype->memtier;

	return NULL;
}
#ifdef CONFIG_MIGRATION
/**
 * next_demotion_node() - Pick the node that pages from @node are demoted to
 * @node: The node whose demotion target is wanted
 *
 * Return: id of a node from @node's preferred demotion targets, or
 * NUMA_NO_NODE when @node has no target (or no demotion table exists).
 * Nothing here keeps @node online, and the answer may already be stale by
 * the time the caller uses it.
 */
int next_demotion_node(int node)
{
	int picked;

	/* The table is absent if its allocation failed at init time. */
	if (!node_demotion)
		return NUMA_NO_NODE;

	/*
	 * Writers update node_demotion[] without excluding this function.
	 * Callers that need a consistent view across several reads must
	 * wrap the whole sequence in an RCU read-side section themselves.
	 *
	 * With several equally good targets, one is chosen at random.
	 * Round-robin would need a "last target" field per entry, which
	 * would bounce between CPU caches, or per-cpu state, which is more
	 * complicated; random selection avoids both.
	 */
	rcu_read_lock();
	picked = node_random(&node_demotion[node].preferred);
	rcu_read_unlock();

	return picked;
}
static void disable_all_demotion_targets(void)
{
int node;
for_each_node_state(node, N_MEMORY)
node_demotion[node].preferred = NODE_MASK_NONE;
/*
* Ensure that the "disable" is visible across the system.
* Readers will see either a combination of before+disable
* state or disable+after. They will never see before and
* after state together.
*/
synchronize_rcu();
}
/*
 * Return the union of the node masks of every memory type attached to
 * @memtier.
 */
static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	struct memory_dev_type *type;
	nodemask_t mask = NODE_MASK_NONE;

	list_for_each_entry(type, &memtier->memory_types, tier_sibiling)
		nodes_or(mask, mask, type->nodes);

	return mask;
}
/*
* Find an automatic demotion target for all memory
* nodes. Failing here is OK. It might just indicate
* being at the end of a chain.
*/
static void establish_demotion_targets(void)
{
struct memory_tier *memtier;
struct demotion_nodes *nd;
int target = NUMA_NO_NODE, node;
int distance, best_distance;
nodemask_t tier_nodes;
lockdep_assert_held_once(&memory_tier_lock);
if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
return;
disable_all_demotion_targets();
for_each_node_state(node, N_MEMORY) {
best_distance = -1;
nd = &node_demotion[node];
memtier = __node_get_memory_tier(node);
if (!memtier || list_is_last(&memtier->list, &memory_tiers))
continue;
/*
* Get the lower memtier to find the demotion node list.
*/
memtier = list_next_entry(memtier, list);
tier_nodes = get_memtier_nodemask(memtier);
/*
* find_next_best_node, use 'used' nodemask as a skip list.
* Add all memory nodes except the selected memory tier
* nodelist to skip list so that we find the best node from the
* memtier nodelist.
*/
nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
/*
* Find all the nodes in the memory tier node list of same best distance.
* add them to the preferred mask. We randomly select between nodes
* in the preferred mask when allocating pages during demotion.
*/
do {
target = find_next_best_node(node, &tier_nodes);
if (target == NUMA_NO_NODE)
break;
distance = node_distance(node, target);
if (distance == best_distance || best_distance == -1) {
best_distance = distance;
node_set(target, nd->preferred);
} else {
break;
}
} while (1);
}
}
#else
/* CONFIG_MIGRATION is off: demotion targets do not exist, so these do nothing. */
static inline void disable_all_demotion_targets(void)
{
}

static inline void establish_demotion_targets(void)
{
}
#endif /* CONFIG_MIGRATION */
static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
if (!node_memory_types[node].memtype)
......@@ -109,16 +323,6 @@ static struct memory_tier *set_node_memory_tier(int node)
return memtier;
}
/*
 * __node_get_memory_tier() - look up the memory tier a node currently sits in
 * @node: NUMA node id, used as an index into node_memory_types[]
 *
 * Return: the tier of the node's memory type, or NULL when the node has no
 * memory type recorded or is not set in that type's node mask.
 */
static struct memory_tier *__node_get_memory_tier(int node)
{
	struct memory_dev_type *memtype;

	/*
	 * node_memory_types[] holds struct node_memory_type_map entries, so
	 * the type pointer has to be read from the ->memtype member rather
	 * than taken from the array element itself.
	 */
	memtype = node_memory_types[node].memtype;
	if (memtype && node_isset(node, memtype->nodes))
		return memtype->memtier;

	return NULL;
}
static void destroy_memory_tier(struct memory_tier *memtier)
{
list_del(&memtier->list);
......@@ -207,6 +411,7 @@ EXPORT_SYMBOL_GPL(clear_node_memory_type);
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
struct memory_tier *memtier;
struct memory_notify *arg = _arg;
/*
......@@ -219,12 +424,15 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
switch (action) {
case MEM_OFFLINE:
mutex_lock(&memory_tier_lock);
clear_node_memory_tier(arg->status_change_nid);
if (clear_node_memory_tier(arg->status_change_nid))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
case MEM_ONLINE:
mutex_lock(&memory_tier_lock);
set_node_memory_tier(arg->status_change_nid);
memtier = set_node_memory_tier(arg->status_change_nid);
if (!IS_ERR(memtier))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
}
......@@ -237,6 +445,11 @@ static int __init memory_tier_init(void)
int node;
struct memory_tier *memtier;
#ifdef CONFIG_MIGRATION
node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
GFP_KERNEL);
WARN_ON(!node_demotion);
#endif
mutex_lock(&memory_tier_lock);
/*
* For now we can have 4 faster memory tiers with smaller adistance
......@@ -259,6 +472,7 @@ static int __init memory_tier_init(void)
*/
break;
}
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
......
This diff is collapsed.
......@@ -28,7 +28,6 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
#include <linux/migrate.h>
#include "internal.h"
......@@ -2068,7 +2067,6 @@ static int vmstat_cpu_online(unsigned int cpu)
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
set_migration_target_nodes();
}
return 0;
......@@ -2093,7 +2091,6 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
node_clear_state(node, N_CPU);
set_migration_target_nodes();
return 0;
}
......@@ -2126,7 +2123,6 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
migrate_on_reclaim_init();
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment