Commit 4146d2d6 authored by Hugh Dickins's avatar Hugh Dickins Committed by Linus Torvalds

ksm: make !merge_across_nodes migration safe

The new KSM NUMA merge_across_nodes knob introduces a problem, when it's
set to non-default 0: if a KSM page is migrated to a different NUMA node,
how do we migrate its stable node to the right tree?  And what if that
collides with an existing stable node?

ksm_migrate_page() can do no more than it's already doing, updating
stable_node->kpfn: the stable tree itself cannot be manipulated without
holding ksm_thread_mutex.  So accept that a stable tree may temporarily
indicate a page belonging to the wrong NUMA node, leave updating until the
next pass of ksmd, just be careful not to merge other pages on to a
misplaced page.  Note nid of holding tree in stable_node, and recognize
that it will not always match nid of kpfn.

A misplaced KSM page is discovered, either when ksm_do_scan() next comes
around to one of its rmap_items (we now have to go to cmp_and_merge_page
even on pages in a stable tree), or when stable_tree_search() arrives at a
matching node for another page, and this node page is found misplaced.

In each case, move the misplaced stable_node to a list of migrate_nodes
(and use the address of migrate_nodes as magic by which to identify them):
we don't need them in a tree.  If stable_tree_search() finds no match for
a page, but it's currently exiled to this list, then slot its stable_node
right there into the tree, bringing all of its mappings with it; otherwise
they get migrated one by one to the original page of the colliding node.
stable_tree_search() is now modelled more like stable_tree_insert(), in
order to handle these insertions of migrated nodes.

remove_node_from_stable_tree(), remove_all_stable_nodes() and
ksm_check_stable_tree() have to handle the migrate_nodes list as well as
the stable tree itself.  Less obviously, we do need to prune the list of
stale entries from time to time (scan_get_next_rmap_item() does it once
each full scan): whereas stale nodes in the stable tree get naturally
pruned as searches try to brush past them, these migrate_nodes may get
forgotten and accumulate.
Signed-off-by: default avatarHugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Petr Holasek <pholasek@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Izik Eidus <izik.eidus@ravellosystems.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent c8d6553b
...@@ -122,13 +122,25 @@ struct ksm_scan { ...@@ -122,13 +122,25 @@ struct ksm_scan {
/** /**
* struct stable_node - node of the stable rbtree * struct stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree * @node: rb node of this ksm page in the stable tree
* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
* @list: linked into migrate_nodes, pending placement in the proper node tree
* @hlist: hlist head of rmap_items using this ksm page * @hlist: hlist head of rmap_items using this ksm page
* @kpfn: page frame number of this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/ */
struct stable_node { struct stable_node {
struct rb_node node; union {
struct rb_node node; /* when node of stable tree */
struct { /* when listed for migration */
struct list_head *head;
struct list_head list;
};
};
struct hlist_head hlist; struct hlist_head hlist;
unsigned long kpfn; unsigned long kpfn;
#ifdef CONFIG_NUMA
int nid;
#endif
}; };
/** /**
...@@ -169,6 +181,9 @@ struct rmap_item { ...@@ -169,6 +181,9 @@ struct rmap_item {
static struct rb_root root_unstable_tree[MAX_NUMNODES]; static struct rb_root root_unstable_tree[MAX_NUMNODES];
static struct rb_root root_stable_tree[MAX_NUMNODES]; static struct rb_root root_stable_tree[MAX_NUMNODES];
/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define MM_SLOTS_HASH_BITS 10 #define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
...@@ -311,11 +326,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, ...@@ -311,11 +326,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
} }
static inline int in_stable_tree(struct rmap_item *rmap_item)
{
return rmap_item->address & STABLE_FLAG;
}
/* /*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary, * page tables after it has passed through ksm_exit() - which, if necessary,
...@@ -476,7 +486,6 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ...@@ -476,7 +486,6 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
{ {
struct rmap_item *rmap_item; struct rmap_item *rmap_item;
struct hlist_node *hlist; struct hlist_node *hlist;
int nid;
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
if (rmap_item->hlist.next) if (rmap_item->hlist.next)
...@@ -488,8 +497,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ...@@ -488,8 +497,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
cond_resched(); cond_resched();
} }
nid = get_kpfn_nid(stable_node->kpfn); if (stable_node->head == &migrate_nodes)
rb_erase(&stable_node->node, &root_stable_tree[nid]); list_del(&stable_node->list);
else
rb_erase(&stable_node->node,
&root_stable_tree[NUMA(stable_node->nid)]);
free_stable_node(stable_node); free_stable_node(stable_node);
} }
...@@ -712,6 +724,7 @@ static int remove_stable_node(struct stable_node *stable_node) ...@@ -712,6 +724,7 @@ static int remove_stable_node(struct stable_node *stable_node)
static int remove_all_stable_nodes(void) static int remove_all_stable_nodes(void)
{ {
struct stable_node *stable_node; struct stable_node *stable_node;
struct list_head *this, *next;
int nid; int nid;
int err = 0; int err = 0;
...@@ -726,6 +739,12 @@ static int remove_all_stable_nodes(void) ...@@ -726,6 +739,12 @@ static int remove_all_stable_nodes(void)
cond_resched(); cond_resched();
} }
} }
list_for_each_safe(this, next, &migrate_nodes) {
stable_node = list_entry(this, struct stable_node, list);
if (remove_stable_node(stable_node))
err = -EBUSY;
cond_resched();
}
return err; return err;
} }
...@@ -1113,25 +1132,30 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, ...@@ -1113,25 +1132,30 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
*/ */
static struct page *stable_tree_search(struct page *page) static struct page *stable_tree_search(struct page *page)
{ {
struct rb_node *node;
struct stable_node *stable_node;
int nid; int nid;
struct rb_node **new;
struct rb_node *parent;
struct stable_node *stable_node;
struct stable_node *page_node;
stable_node = page_stable_node(page); page_node = page_stable_node(page);
if (stable_node) { /* ksm page forked */ if (page_node && page_node->head != &migrate_nodes) {
/* ksm page forked */
get_page(page); get_page(page);
return page; return page;
} }
nid = get_kpfn_nid(page_to_pfn(page)); nid = get_kpfn_nid(page_to_pfn(page));
node = root_stable_tree[nid].rb_node; again:
new = &root_stable_tree[nid].rb_node;
parent = NULL;
while (node) { while (*new) {
struct page *tree_page; struct page *tree_page;
int ret; int ret;
cond_resched(); cond_resched();
stable_node = rb_entry(node, struct stable_node, node); stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false); tree_page = get_ksm_page(stable_node, false);
if (!tree_page) if (!tree_page)
return NULL; return NULL;
...@@ -1139,10 +1163,11 @@ static struct page *stable_tree_search(struct page *page) ...@@ -1139,10 +1163,11 @@ static struct page *stable_tree_search(struct page *page)
ret = memcmp_pages(page, tree_page); ret = memcmp_pages(page, tree_page);
put_page(tree_page); put_page(tree_page);
parent = *new;
if (ret < 0) if (ret < 0)
node = node->rb_left; new = &parent->rb_left;
else if (ret > 0) else if (ret > 0)
node = node->rb_right; new = &parent->rb_right;
else { else {
/* /*
* Lock and unlock the stable_node's page (which * Lock and unlock the stable_node's page (which
...@@ -1152,13 +1177,49 @@ static struct page *stable_tree_search(struct page *page) ...@@ -1152,13 +1177,49 @@ static struct page *stable_tree_search(struct page *page)
* than kpage, but that involves more changes. * than kpage, but that involves more changes.
*/ */
tree_page = get_ksm_page(stable_node, true); tree_page = get_ksm_page(stable_node, true);
if (tree_page) if (tree_page) {
unlock_page(tree_page); unlock_page(tree_page);
return tree_page; if (get_kpfn_nid(stable_node->kpfn) !=
NUMA(stable_node->nid)) {
put_page(tree_page);
goto replace;
}
return tree_page;
}
/*
* There is now a place for page_node, but the tree may
* have been rebalanced, so re-evaluate parent and new.
*/
if (page_node)
goto again;
return NULL;
} }
} }
return NULL; if (!page_node)
return NULL;
list_del(&page_node->list);
DO_NUMA(page_node->nid = nid);
rb_link_node(&page_node->node, parent, new);
rb_insert_color(&page_node->node, &root_stable_tree[nid]);
get_page(page);
return page;
replace:
if (page_node) {
list_del(&page_node->list);
DO_NUMA(page_node->nid = nid);
rb_replace_node(&stable_node->node,
&page_node->node, &root_stable_tree[nid]);
get_page(page);
} else {
rb_erase(&stable_node->node, &root_stable_tree[nid]);
page = NULL;
}
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
return page;
} }
/* /*
...@@ -1215,6 +1276,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage) ...@@ -1215,6 +1276,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
INIT_HLIST_HEAD(&stable_node->hlist); INIT_HLIST_HEAD(&stable_node->hlist);
stable_node->kpfn = kpfn; stable_node->kpfn = kpfn;
set_page_stable_node(kpage, stable_node); set_page_stable_node(kpage, stable_node);
DO_NUMA(stable_node->nid = nid);
rb_link_node(&stable_node->node, parent, new); rb_link_node(&stable_node->node, parent, new);
rb_insert_color(&stable_node->node, &root_stable_tree[nid]); rb_insert_color(&stable_node->node, &root_stable_tree[nid]);
...@@ -1311,11 +1373,6 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, ...@@ -1311,11 +1373,6 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
static void stable_tree_append(struct rmap_item *rmap_item, static void stable_tree_append(struct rmap_item *rmap_item,
struct stable_node *stable_node) struct stable_node *stable_node)
{ {
/*
* Usually rmap_item->nid is already set correctly,
* but it may be wrong after switching merge_across_nodes.
*/
DO_NUMA(rmap_item->nid = get_kpfn_nid(stable_node->kpfn));
rmap_item->head = stable_node; rmap_item->head = stable_node;
rmap_item->address |= STABLE_FLAG; rmap_item->address |= STABLE_FLAG;
hlist_add_head(&rmap_item->hlist, &stable_node->hlist); hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
...@@ -1344,10 +1401,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) ...@@ -1344,10 +1401,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
unsigned int checksum; unsigned int checksum;
int err; int err;
remove_rmap_item_from_tree(rmap_item); stable_node = page_stable_node(page);
if (stable_node) {
if (stable_node->head != &migrate_nodes &&
get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
rb_erase(&stable_node->node,
&root_stable_tree[NUMA(stable_node->nid)]);
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
}
if (stable_node->head != &migrate_nodes &&
rmap_item->head == stable_node)
return;
}
/* We first start with searching the page inside the stable tree */ /* We first start with searching the page inside the stable tree */
kpage = stable_tree_search(page); kpage = stable_tree_search(page);
if (kpage == page && rmap_item->head == stable_node) {
put_page(kpage);
return;
}
remove_rmap_item_from_tree(rmap_item);
if (kpage) { if (kpage) {
err = try_to_merge_with_ksm_page(rmap_item, page, kpage); err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) { if (!err) {
...@@ -1464,6 +1540,27 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) ...@@ -1464,6 +1540,27 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
*/ */
lru_add_drain_all(); lru_add_drain_all();
/*
* Whereas stale stable_nodes on the stable_tree itself
* get pruned in the regular course of stable_tree_search(),
* those moved out to the migrate_nodes list can accumulate:
* so prune them once before each full scan.
*/
if (!ksm_merge_across_nodes) {
struct stable_node *stable_node;
struct list_head *this, *next;
struct page *page;
list_for_each_safe(this, next, &migrate_nodes) {
stable_node = list_entry(this,
struct stable_node, list);
page = get_ksm_page(stable_node, false);
if (page)
put_page(page);
cond_resched();
}
}
for (nid = 0; nid < nr_node_ids; nid++) for (nid = 0; nid < nr_node_ids; nid++)
root_unstable_tree[nid] = RB_ROOT; root_unstable_tree[nid] = RB_ROOT;
...@@ -1586,8 +1683,7 @@ static void ksm_do_scan(unsigned int scan_npages) ...@@ -1586,8 +1683,7 @@ static void ksm_do_scan(unsigned int scan_npages)
rmap_item = scan_get_next_rmap_item(&page); rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item) if (!rmap_item)
return; return;
if (!PageKsm(page) || !in_stable_tree(rmap_item)) cmp_and_merge_page(page, rmap_item);
cmp_and_merge_page(page, rmap_item);
put_page(page); put_page(page);
} }
} }
...@@ -1964,6 +2060,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn, ...@@ -1964,6 +2060,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn) unsigned long end_pfn)
{ {
struct stable_node *stable_node; struct stable_node *stable_node;
struct list_head *this, *next;
struct rb_node *node; struct rb_node *node;
int nid; int nid;
...@@ -1984,6 +2081,13 @@ static void ksm_check_stable_tree(unsigned long start_pfn, ...@@ -1984,6 +2081,13 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
cond_resched(); cond_resched();
} }
} }
list_for_each_safe(this, next, &migrate_nodes) {
stable_node = list_entry(this, struct stable_node, list);
if (stable_node->kpfn >= start_pfn &&
stable_node->kpfn < end_pfn)
remove_node_from_stable_tree(stable_node);
cond_resched();
}
} }
static int ksm_memory_callback(struct notifier_block *self, static int ksm_memory_callback(struct notifier_block *self,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment