Commit d72da4a4, authored by Peter Zijlstra, committed by Rusty Russell

rbtree: Make lockless searches non-fatal

Change the insert and erase code such that lockless searches are
non-fatal.

In and of itself, an rbtree cannot be correctly searched while it is
being modified. We can, however, provide weaker guarantees that will
allow the rbtree to be used in conjunction with other techniques, such
as latches; see 9b0fd802 ("seqcount: Add raw_write_seqcount_latch()").

For this to work we need the following guarantees from the rbtree
code:

 1) a lockless reader must not see partial stores; this would allow it
    to observe nodes that are invalid memory.

 2) there must not be (temporary) loops in the tree structure in the
    modifier's program order; this would cause a lookup which
    interrupts the modifier to get stuck indefinitely.

For 1) we must use WRITE_ONCE() for all updates to the tree structure;
in particular, this patch only does rb_{left,right}, as those are the
only elements required for simple searches.

It generates slightly worse code, probably because of volatile. But in
pointer-chasing-heavy code a few more instructions should not matter.

For 2) I have carefully audited the code and drawn every intermediate
link state and not found a loop.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: Michel Lespinasse <walken@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
parent 0be964be
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/stddef.h> #include <linux/stddef.h>
#include <linux/rcupdate.h>
struct rb_node { struct rb_node {
unsigned long __rb_parent_color; unsigned long __rb_parent_color;
...@@ -73,11 +74,11 @@ extern struct rb_node *rb_first_postorder(const struct rb_root *); ...@@ -73,11 +74,11 @@ extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *); extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */ /* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root); struct rb_root *root);
static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node ** rb_link) struct rb_node **rb_link)
{ {
node->__rb_parent_color = (unsigned long)parent; node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL; node->rb_left = node->rb_right = NULL;
...@@ -85,6 +86,15 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, ...@@ -85,6 +86,15 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
*rb_link = node; *rb_link = node;
} }
/*
 * Attach @node as a fresh leaf under @parent, storing it into the child
 * slot that @rb_link points at.
 *
 * The node's own fields (parent/color word and both child pointers) are
 * filled in first; the store into *@rb_link is done last, through
 * rcu_assign_pointer(), so the node is fully set up before it is linked
 * into the tree.
 */
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
				    struct rb_node **rb_link)
{
	/* Only the parent pointer is stored here; no color bit is OR-ed in. */
	node->__rb_parent_color = (unsigned long)parent;

	/* A newly linked node has no children. */
	node->rb_right = NULL;
	node->rb_left = NULL;

	/* Publish the node last, after its fields have been written. */
	rcu_assign_pointer(*rb_link, node);
}
#define rb_entry_safe(ptr, type, member) \ #define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \ ({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \
......
...@@ -123,11 +123,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new, ...@@ -123,11 +123,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new,
{ {
if (parent) { if (parent) {
if (parent->rb_left == old) if (parent->rb_left == old)
parent->rb_left = new; WRITE_ONCE(parent->rb_left, new);
else else
parent->rb_right = new; WRITE_ONCE(parent->rb_right, new);
} else } else
root->rb_node = new; WRITE_ONCE(root->rb_node, new);
} }
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
...@@ -137,7 +137,8 @@ static __always_inline struct rb_node * ...@@ -137,7 +137,8 @@ static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root, __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment) const struct rb_augment_callbacks *augment)
{ {
struct rb_node *child = node->rb_right, *tmp = node->rb_left; struct rb_node *child = node->rb_right;
struct rb_node *tmp = node->rb_left;
struct rb_node *parent, *rebalance; struct rb_node *parent, *rebalance;
unsigned long pc; unsigned long pc;
...@@ -167,6 +168,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, ...@@ -167,6 +168,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
tmp = parent; tmp = parent;
} else { } else {
struct rb_node *successor = child, *child2; struct rb_node *successor = child, *child2;
tmp = child->rb_left; tmp = child->rb_left;
if (!tmp) { if (!tmp) {
/* /*
...@@ -180,6 +182,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, ...@@ -180,6 +182,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
*/ */
parent = successor; parent = successor;
child2 = successor->rb_right; child2 = successor->rb_right;
augment->copy(node, successor); augment->copy(node, successor);
} else { } else {
/* /*
...@@ -201,19 +204,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, ...@@ -201,19 +204,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
successor = tmp; successor = tmp;
tmp = tmp->rb_left; tmp = tmp->rb_left;
} while (tmp); } while (tmp);
parent->rb_left = child2 = successor->rb_right; child2 = successor->rb_right;
successor->rb_right = child; WRITE_ONCE(parent->rb_left, child2);
WRITE_ONCE(successor->rb_right, child);
rb_set_parent(child, successor); rb_set_parent(child, successor);
augment->copy(node, successor); augment->copy(node, successor);
augment->propagate(parent, successor); augment->propagate(parent, successor);
} }
successor->rb_left = tmp = node->rb_left; tmp = node->rb_left;
WRITE_ONCE(successor->rb_left, tmp);
rb_set_parent(tmp, successor); rb_set_parent(tmp, successor);
pc = node->__rb_parent_color; pc = node->__rb_parent_color;
tmp = __rb_parent(pc); tmp = __rb_parent(pc);
__rb_change_child(node, successor, tmp, root); __rb_change_child(node, successor, tmp, root);
if (child2) { if (child2) {
successor->__rb_parent_color = pc; successor->__rb_parent_color = pc;
rb_set_parent_color(child2, parent, RB_BLACK); rb_set_parent_color(child2, parent, RB_BLACK);
......
...@@ -44,6 +44,30 @@ ...@@ -44,6 +44,30 @@
* parentheses and have some accompanying text comment. * parentheses and have some accompanying text comment.
*/ */
/*
* Notes on lockless lookups:
*
* All stores to the tree structure (rb_left and rb_right) must be done using
* WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
* tree structure as seen in program order.
*
* These two requirements will allow lockless iteration of the tree -- not
* correct iteration mind you, tree rotations are not atomic so a lookup might
* miss entire subtrees.
*
* But they do guarantee that any such traversal will only see valid elements
* and that it will indeed complete -- does not get stuck in a loop.
*
* It also guarantees that if the lookup returns an element it is the 'correct'
* one. But not returning an element does _NOT_ mean it's not present.
*
* NOTE:
*
* Stores to __rb_parent_color are not important for simple lookups so those
* are left undone as of now. Nor did I check for loops involving parent
* pointers.
*/
static inline void rb_set_black(struct rb_node *rb) static inline void rb_set_black(struct rb_node *rb)
{ {
rb->__rb_parent_color |= RB_BLACK; rb->__rb_parent_color |= RB_BLACK;
...@@ -129,8 +153,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, ...@@ -129,8 +153,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
* This still leaves us in violation of 4), the * This still leaves us in violation of 4), the
* continuation into Case 3 will fix that. * continuation into Case 3 will fix that.
*/ */
parent->rb_right = tmp = node->rb_left; tmp = node->rb_left;
node->rb_left = parent; WRITE_ONCE(parent->rb_right, tmp);
WRITE_ONCE(node->rb_left, parent);
if (tmp) if (tmp)
rb_set_parent_color(tmp, parent, rb_set_parent_color(tmp, parent,
RB_BLACK); RB_BLACK);
...@@ -149,8 +174,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, ...@@ -149,8 +174,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
* / \ * / \
* n U * n U
*/ */
gparent->rb_left = tmp; /* == parent->rb_right */ WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
parent->rb_right = gparent; WRITE_ONCE(parent->rb_right, gparent);
if (tmp) if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK); rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED); __rb_rotate_set_parents(gparent, parent, root, RB_RED);
...@@ -171,8 +196,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, ...@@ -171,8 +196,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
tmp = parent->rb_left; tmp = parent->rb_left;
if (node == tmp) { if (node == tmp) {
/* Case 2 - right rotate at parent */ /* Case 2 - right rotate at parent */
parent->rb_left = tmp = node->rb_right; tmp = node->rb_right;
node->rb_right = parent; WRITE_ONCE(parent->rb_left, tmp);
WRITE_ONCE(node->rb_right, parent);
if (tmp) if (tmp)
rb_set_parent_color(tmp, parent, rb_set_parent_color(tmp, parent,
RB_BLACK); RB_BLACK);
...@@ -183,8 +209,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, ...@@ -183,8 +209,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
} }
/* Case 3 - left rotate at gparent */ /* Case 3 - left rotate at gparent */
gparent->rb_right = tmp; /* == parent->rb_left */ WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
parent->rb_left = gparent; WRITE_ONCE(parent->rb_left, gparent);
if (tmp) if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK); rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED); __rb_rotate_set_parents(gparent, parent, root, RB_RED);
...@@ -224,8 +250,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, ...@@ -224,8 +250,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
* / \ / \ * / \ / \
* Sl Sr N Sl * Sl Sr N Sl
*/ */
parent->rb_right = tmp1 = sibling->rb_left; tmp1 = sibling->rb_left;
sibling->rb_left = parent; WRITE_ONCE(parent->rb_right, tmp1);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK); rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root, __rb_rotate_set_parents(parent, sibling, root,
RB_RED); RB_RED);
...@@ -275,9 +302,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, ...@@ -275,9 +302,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
* \ * \
* Sr * Sr
*/ */
sibling->rb_left = tmp1 = tmp2->rb_right; tmp1 = tmp2->rb_right;
tmp2->rb_right = sibling; WRITE_ONCE(sibling->rb_left, tmp1);
parent->rb_right = tmp2; WRITE_ONCE(tmp2->rb_right, sibling);
WRITE_ONCE(parent->rb_right, tmp2);
if (tmp1) if (tmp1)
rb_set_parent_color(tmp1, sibling, rb_set_parent_color(tmp1, sibling,
RB_BLACK); RB_BLACK);
...@@ -297,8 +325,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, ...@@ -297,8 +325,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
* / \ / \ * / \ / \
* (sl) sr N (sl) * (sl) sr N (sl)
*/ */
parent->rb_right = tmp2 = sibling->rb_left; tmp2 = sibling->rb_left;
sibling->rb_left = parent; WRITE_ONCE(parent->rb_right, tmp2);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK); rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2) if (tmp2)
rb_set_parent(tmp2, parent); rb_set_parent(tmp2, parent);
...@@ -310,8 +339,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, ...@@ -310,8 +339,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
sibling = parent->rb_left; sibling = parent->rb_left;
if (rb_is_red(sibling)) { if (rb_is_red(sibling)) {
/* Case 1 - right rotate at parent */ /* Case 1 - right rotate at parent */
parent->rb_left = tmp1 = sibling->rb_right; tmp1 = sibling->rb_right;
sibling->rb_right = parent; WRITE_ONCE(parent->rb_left, tmp1);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK); rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root, __rb_rotate_set_parents(parent, sibling, root,
RB_RED); RB_RED);
...@@ -336,9 +366,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, ...@@ -336,9 +366,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
break; break;
} }
/* Case 3 - right rotate at sibling */ /* Case 3 - right rotate at sibling */
sibling->rb_right = tmp1 = tmp2->rb_left; tmp1 = tmp2->rb_left;
tmp2->rb_left = sibling; WRITE_ONCE(sibling->rb_right, tmp1);
parent->rb_left = tmp2; WRITE_ONCE(tmp2->rb_left, sibling);
WRITE_ONCE(parent->rb_left, tmp2);
if (tmp1) if (tmp1)
rb_set_parent_color(tmp1, sibling, rb_set_parent_color(tmp1, sibling,
RB_BLACK); RB_BLACK);
...@@ -347,8 +378,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, ...@@ -347,8 +378,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
sibling = tmp2; sibling = tmp2;
} }
/* Case 4 - left rotate at parent + color flips */ /* Case 4 - left rotate at parent + color flips */
parent->rb_left = tmp2 = sibling->rb_right; tmp2 = sibling->rb_right;
sibling->rb_right = parent; WRITE_ONCE(parent->rb_left, tmp2);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK); rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2) if (tmp2)
rb_set_parent(tmp2, parent); rb_set_parent(tmp2, parent);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment