Commit 4fcdd6ec authored by Kent Overstreet

bcachefs: Btree split improvement

This improves the bkey_format calculation when splitting btree nodes.
Previously, the lower of the two new nodes would reuse a format that had
been calculated for the original node.

This was particularly bad with sequential insertions, where we iteratively
split the last btree node, whose format has to include KEY_MAX.
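
To make the cost concrete, here is a minimal standalone sketch (illustrative
only, not bcachefs's actual bkey_format code; bits_for_range is a hypothetical
helper): a packed key format records, per field, an offset and a bit width
covering the range of values it must represent, so a single all-ones sentinel
like KEY_MAX forces full-width fields no matter how narrow the real keys are.

#include <stdint.h>
#include <stdio.h>

/* Bits needed to store values in [lo, hi] as a delta from lo. */
static unsigned bits_for_range(uint64_t lo, uint64_t hi)
{
	unsigned bits = 0;

	for (uint64_t range = hi - lo; range; range >>= 1)
		bits++;
	return bits;
}

int main(void)
{
	uint64_t lo = 4096, hi = 8192;	/* offsets actually present in the node */

	printf("field width from keys alone: %u bits\n",
	       bits_for_range(lo, hi));			/* 13 bits */
	printf("field width with KEY_MAX:    %u bits\n",
	       bits_for_range(lo, UINT64_MAX));		/* 64 bits */
	return 0;
}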

Now, we calculate formats precisely for the keys each of the two new nodes
will contain. This should also make splitting a bit more efficient, since
each key is now copied only once (straight from the original node into the
appropriate new node, rather than from the original node to the replacement
node and then to the upper split).
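
The overall shape of the new split, reduced to a standalone sketch
(hypothetical struct half and split_node, none of bcachefs's packing
machinery): two passes over the source keys, the first choosing the pivot
and accumulating per-side range state (standing in for struct
bkey_format_state), the second copying each key exactly once into its
destination.

#include <stdint.h>
#include <stddef.h>

struct half {
	uint64_t min, max;	/* stand-in for struct bkey_format_state */
	size_t nr;
	uint64_t keys[64];	/* sketch assumes nr <= 64 */
};

static void split_node(const uint64_t *keys, size_t nr, struct half out[2])
{
	size_t pivot = (nr * 3) / 5;	/* same 3/5 split point; the patch
					 * weights by u64s, not key count */

	for (int i = 0; i < 2; i++) {
		out[i].min = UINT64_MAX;
		out[i].max = 0;
		out[i].nr = 0;
	}

	/* pass 1: compute each half's "format" from the keys it will hold */
	for (size_t j = 0; j < nr; j++) {
		int i = j >= pivot;

		if (keys[j] < out[i].min)
			out[i].min = keys[j];
		if (keys[j] > out[i].max)
			out[i].max = keys[j];
	}

	/* pass 2: copy each key exactly once, straight to its destination */
	for (size_t j = 0; j < nr; j++) {
		int i = j >= pivot;

		out[i].keys[out[i].nr++] = keys[j];
	}
}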
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 0f35e086
@@ -144,6 +144,8 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
 		else
 			bch2_bkey_unpack(src, (void *) out, in);
 
+		out->needs_whiteout = false;
+
 		btree_keys_account_key_add(&nr, 0, out);
 		out = bkey_next(out);
 	}
...
@@ -447,6 +447,11 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
 						 struct btree *,
 						 struct bkey *);
 
+#define for_each_btree_node_key(b, k, iter)				\
+	for (bch2_btree_node_iter_init_from_start((iter), (b));	\
+	     (k = bch2_btree_node_iter_peek((iter), (b)));		\
+	     bch2_btree_node_iter_advance(iter, b))
+
 #define for_each_btree_node_key_unpack(b, k, iter, unpacked)		\
 	for (bch2_btree_node_iter_init_from_start((iter), (b));	\
 	     (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
...
@@ -377,14 +377,19 @@ static void btree_set_max(struct btree *b, struct bpos pos)
 	b->data->max_key = pos;
 }
 
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
-						  struct btree_trans *trans,
-						  struct btree *b,
-						  struct bkey_format format)
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+						       struct btree_trans *trans,
+						       struct btree *b)
 {
-	struct btree *n;
-
-	n = bch2_btree_node_alloc(as, trans, b->c.level);
+	struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
+	struct bkey_format format = bch2_btree_calc_format(b);
+
+	/*
+	 * The keys might expand with the new format - if they wouldn't fit in
+	 * the btree node anymore, use the old format for now:
+	 */
+	if (!bch2_btree_node_format_fits(as->c, b, &format))
+		format = b->format;
 
 	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
@@ -397,27 +402,9 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
 	bch2_btree_sort_into(as->c, n, b);
 
 	btree_node_reset_sib_u64s(n);
 
-	n->key.k.p = b->key.k.p;
 	return n;
 }
 
-static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
-						       struct btree_trans *trans,
-						       struct btree *b)
-{
-	struct bkey_format new_f = bch2_btree_calc_format(b);
-
-	/*
-	 * The keys might expand with the new format - if they wouldn't fit in
-	 * the btree node anymore, use the old format for now:
-	 */
-	if (!bch2_btree_node_format_fits(as->c, b, &new_f))
-		new_f = b->format;
-
-	return __bch2_btree_node_alloc_replacement(as, trans, b, new_f);
-}
-
 static struct btree *__btree_root_alloc(struct btree_update *as,
 					struct btree_trans *trans, unsigned level)
 {
@@ -1331,8 +1318,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
 		;
 
 	while (!bch2_keylist_empty(keys)) {
-		bch2_insert_fixup_btree_ptr(as, trans, path, b,
-					    &node_iter, bch2_keylist_front(keys));
+		struct bkey_i *k = bch2_keylist_front(keys);
+
+		if (bpos_cmp(k->k.p, b->key.k.p) > 0)
+			break;
+
+		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k);
 		bch2_keylist_pop_front(keys);
 	}
 }
@@ -1341,109 +1332,91 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
 /*
  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
  * node)
  */
-static struct btree *__btree_split_node(struct btree_update *as,
-					struct btree_trans *trans,
-					struct btree *n1)
-{
-	struct bkey_format_state s;
-	size_t nr_packed = 0, nr_unpacked = 0;
-	struct btree *n2;
-	struct bset *set1, *set2;
-	struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
-	struct bpos n1_pos;
-
-	n2 = bch2_btree_node_alloc(as, trans, n1->c.level);
-
-	n2->data->max_key	= n1->data->max_key;
-	n2->data->format	= n1->format;
-	SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
-	n2->key.k.p = n1->key.k.p;
-
-	set1 = btree_bset_first(n1);
-	set2 = btree_bset_first(n2);
-
-	/*
-	 * Has to be a linear search because we don't have an auxiliary
-	 * search tree yet
-	 */
-	k = set1->start;
-	while (1) {
-		struct bkey_packed *n = bkey_next(k);
-
-		if (n == vstruct_last(set1))
-			break;
-		if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
-			break;
-
-		if (bkey_packed(k))
-			nr_packed++;
-		else
-			nr_unpacked++;
-
-		prev = k;
-		k = n;
-	}
-
-	BUG_ON(!prev);
-	set2_start	= k;
-	set2_end	= vstruct_last(set1);
-
-	set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
-	set_btree_bset_end(n1, n1->set);
-
-	n1->nr.live_u64s	= le16_to_cpu(set1->u64s);
-	n1->nr.bset_u64s[0]	= le16_to_cpu(set1->u64s);
-	n1->nr.packed_keys	= nr_packed;
-	n1->nr.unpacked_keys	= nr_unpacked;
-
-	n1_pos = bkey_unpack_pos(n1, prev);
-	if (as->c->sb.version < bcachefs_metadata_version_snapshot)
-		n1_pos.snapshot = U32_MAX;
-
-	btree_set_max(n1, n1_pos);
-	btree_set_min(n2, bpos_successor(n1->key.k.p));
-
-	bch2_bkey_format_init(&s);
-	bch2_bkey_format_add_pos(&s, n2->data->min_key);
-	bch2_bkey_format_add_pos(&s, n2->data->max_key);
-
-	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
-		struct bkey uk = bkey_unpack_key(n1, k);
-		bch2_bkey_format_add_key(&s, &uk);
-	}
-
-	n2->data->format = bch2_bkey_format_done(&s);
-	btree_node_set_format(n2, n2->data->format);
-
-	out = set2->start;
-	memset(&n2->nr, 0, sizeof(n2->nr));
-
-	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
-		BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
-				? &n1->format : &bch2_bkey_format_current, k));
-		out->format = KEY_FORMAT_LOCAL_BTREE;
-		btree_keys_account_key_add(&n2->nr, 0, out);
-		out = bkey_next(out);
-	}
-
-	set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
-	set_btree_bset_end(n2, n2->set);
-
-	BUG_ON(!set1->u64s);
-	BUG_ON(!set2->u64s);
-
-	btree_node_reset_sib_u64s(n1);
-	btree_node_reset_sib_u64s(n2);
-
-	bch2_verify_btree_nr_keys(n1);
-	bch2_verify_btree_nr_keys(n2);
-
-	if (n1->c.level) {
-		btree_node_interior_verify(as->c, n1);
-		btree_node_interior_verify(as->c, n2);
-	}
-
-	return n2;
-}
+static void __btree_split_node(struct btree_update *as,
+			       struct btree_trans *trans,
+			       struct btree *b,
+			       struct btree *n[2])
+{
+	struct bkey_packed *k;
+	struct bpos n1_pos = POS_MIN;
+	struct btree_node_iter iter;
+	struct bset *bsets[2];
+	struct bkey_format_state format[2];
+	struct bkey_packed *out[2];
+	struct bkey uk;
+	unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		BUG_ON(n[i]->nsets != 1);
+
+		bsets[i] = btree_bset_first(n[i]);
+		out[i] = bsets[i]->start;
+
+		SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
+		bch2_bkey_format_init(&format[i]);
+	}
+
+	u64s = 0;
+	for_each_btree_node_key(b, k, &iter) {
+		if (bkey_deleted(k))
+			continue;
+
+		i = u64s >= n1_u64s;
+		u64s += k->u64s;
+		uk = bkey_unpack_key(b, k);
+		if (!i)
+			n1_pos = uk.p;
+		bch2_bkey_format_add_key(&format[i], &uk);
+	}
+
+	btree_set_min(n[0], b->data->min_key);
+	btree_set_max(n[0], n1_pos);
+	btree_set_min(n[1], bpos_successor(n1_pos));
+	btree_set_max(n[1], b->data->max_key);
+
+	for (i = 0; i < 2; i++) {
+		bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
+		bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
+
+		n[i]->data->format = bch2_bkey_format_done(&format[i]);
+		btree_node_set_format(n[i], n[i]->data->format);
+	}
+
+	u64s = 0;
+	for_each_btree_node_key(b, k, &iter) {
+		if (bkey_deleted(k))
+			continue;
+
+		i = u64s >= n1_u64s;
+		u64s += k->u64s;
+
+		if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
+					? &b->format: &bch2_bkey_format_current, k))
+			out[i]->format = KEY_FORMAT_LOCAL_BTREE;
+		else
+			bch2_bkey_unpack(b, (void *) out[i], k);
+
+		out[i]->needs_whiteout = false;
+
+		btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
+		out[i] = bkey_next(out[i]);
+	}
+
+	for (i = 0; i < 2; i++) {
+		bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
+
+		BUG_ON(!bsets[i]->u64s);
+
+		set_btree_bset_end(n[i], n[i]->set);
+
+		btree_node_reset_sib_u64s(n[i]);
+
+		bch2_verify_btree_nr_keys(n[i]);
+
+		if (b->c.level)
+			btree_node_interior_verify(as->c, n[i]);
+	}
+}
 
 /*
@@ -1463,41 +1436,17 @@ static void btree_split_insert_keys(struct btree_update *as,
 				    struct btree *b,
 				    struct keylist *keys)
 {
-	struct btree_node_iter node_iter;
-	struct bkey_i *k = bch2_keylist_front(keys);
-	struct bkey_packed *src, *dst, *n;
-	struct bset *i;
-
-	bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
-
-	__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
-
-	/*
-	 * We can't tolerate whiteouts here - with whiteouts there can be
-	 * duplicate keys, and it would be rather bad if we picked a duplicate
-	 * for the pivot:
-	 */
-	i = btree_bset_first(b);
-	src = dst = i->start;
-	while (src != vstruct_last(i)) {
-		n = bkey_next(src);
-
-		if (!bkey_deleted(src)) {
-			memmove_u64s_down(dst, src, src->u64s);
-			dst = bkey_next(dst);
-		}
-		src = n;
-	}
-
-	/* Also clear out the unwritten whiteouts area: */
-	b->whiteout_u64s = 0;
-
-	i->u64s = cpu_to_le16((u64 *) dst - i->_data);
-	set_btree_bset_end(b, b->set);
-
-	BUG_ON(b->nsets != 1 ||
-	       b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
-
-	btree_node_interior_verify(as->c, b);
+	if (!bch2_keylist_empty(keys) &&
+	    bpos_cmp(bch2_keylist_front(keys)->k.p,
+		     b->data->max_key) <= 0) {
+		struct btree_node_iter node_iter;
+
+		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
+
+		__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+
+		btree_node_interior_verify(as->c, b);
+	}
 }
 
 static int btree_split(struct btree_update *as, struct btree_trans *trans,
@@ -1516,15 +1465,21 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
 	bch2_btree_interior_update_will_free_node(as, b);
 
-	n1 = bch2_btree_node_alloc_replacement(as, trans, b);
-
-	if (keys)
-		btree_split_insert_keys(as, trans, path, n1, keys);
-
-	if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
+	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
+		struct btree *n[2];
+
 		trace_and_count(c, btree_node_split, c, b);
 
-		n2 = __btree_split_node(as, trans, n1);
+		n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
+		n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
+
+		__btree_split_node(as, trans, b, n);
+
+		if (keys) {
+			btree_split_insert_keys(as, trans, path, n1, keys);
+			btree_split_insert_keys(as, trans, path, n2, keys);
+			BUG_ON(!bch2_keylist_empty(keys));
+		}
 
 		bch2_btree_build_aux_trees(n2);
 		bch2_btree_build_aux_trees(n1);
@@ -1573,6 +1528,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 	} else {
 		trace_and_count(c, btree_node_compact, c, b);
 
+		n1 = bch2_btree_node_alloc_replacement(as, trans, b);
+
+		if (keys) {
+			btree_split_insert_keys(as, trans, path, n1, keys);
+			BUG_ON(!bch2_keylist_empty(keys));
+		}
+
 		bch2_btree_build_aux_trees(n1);
 		bch2_btree_update_add_new_node(as, n1);
 		six_unlock_write(&n1->c.lock);
...