Commit 4fcdd6ec authored by Kent Overstreet's avatar Kent Overstreet

bcachefs: Btree split improvement

This improves the bkey_format calculation when splitting btree nodes.
Previously, we'd use a format calculated for the original node for the
lower of the two new nodes.

This was particularly bad on sequential insertions, where we iteratively
split the last btree node, whos format has to include KEY_MAX.

Now, we calculate formats precisely for the keys the two new nodes will
contain. This also should make splitting a bit more efficient, since
we're only copying keys once (from the original node to the new node,
instead of new node, replacement node, then upper split).
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent 0f35e086
......@@ -144,6 +144,8 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
else
bch2_bkey_unpack(src, (void *) out, in);
out->needs_whiteout = false;
btree_keys_account_key_add(&nr, 0, out);
out = bkey_next(out);
}
......
......@@ -447,6 +447,11 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
struct btree *,
struct bkey *);
#define for_each_btree_node_key(b, k, iter) \
for (bch2_btree_node_iter_init_from_start((iter), (b)); \
(k = bch2_btree_node_iter_peek((iter), (b))); \
bch2_btree_node_iter_advance(iter, b))
#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
for (bch2_btree_node_iter_init_from_start((iter), (b)); \
(k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
......
......@@ -377,14 +377,19 @@ static void btree_set_max(struct btree *b, struct bpos pos)
b->data->max_key = pos;
}
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
struct btree_trans *trans,
struct btree *b,
struct bkey_format format)
struct btree *b)
{
struct btree *n;
struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
struct bkey_format format = bch2_btree_calc_format(b);
n = bch2_btree_node_alloc(as, trans, b->c.level);
/*
* The keys might expand with the new format - if they wouldn't fit in
* the btree node anymore, use the old format for now:
*/
if (!bch2_btree_node_format_fits(as->c, b, &format))
format = b->format;
SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
......@@ -397,27 +402,9 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
bch2_btree_sort_into(as->c, n, b);
btree_node_reset_sib_u64s(n);
n->key.k.p = b->key.k.p;
return n;
}
static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
struct btree_trans *trans,
struct btree *b)
{
struct bkey_format new_f = bch2_btree_calc_format(b);
/*
* The keys might expand with the new format - if they wouldn't fit in
* the btree node anymore, use the old format for now:
*/
if (!bch2_btree_node_format_fits(as->c, b, &new_f))
new_f = b->format;
return __bch2_btree_node_alloc_replacement(as, trans, b, new_f);
}
static struct btree *__btree_root_alloc(struct btree_update *as,
struct btree_trans *trans, unsigned level)
{
......@@ -1331,8 +1318,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
;
while (!bch2_keylist_empty(keys)) {
bch2_insert_fixup_btree_ptr(as, trans, path, b,
&node_iter, bch2_keylist_front(keys));
struct bkey_i *k = bch2_keylist_front(keys);
if (bpos_cmp(k->k.p, b->key.k.p) > 0)
break;
bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k);
bch2_keylist_pop_front(keys);
}
}
......@@ -1341,109 +1332,91 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
* Move keys from n1 (original replacement node, now lower node) to n2 (higher
* node)
*/
static struct btree *__btree_split_node(struct btree_update *as,
static void __btree_split_node(struct btree_update *as,
struct btree_trans *trans,
struct btree *n1)
struct btree *b,
struct btree *n[2])
{
struct bkey_format_state s;
size_t nr_packed = 0, nr_unpacked = 0;
struct btree *n2;
struct bset *set1, *set2;
struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
struct bpos n1_pos;
n2 = bch2_btree_node_alloc(as, trans, n1->c.level);
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
n2->key.k.p = n1->key.k.p;
set1 = btree_bset_first(n1);
set2 = btree_bset_first(n2);
/*
* Has to be a linear search because we don't have an auxiliary
* search tree yet
*/
k = set1->start;
while (1) {
struct bkey_packed *n = bkey_next(k);
struct bkey_packed *k;
struct bpos n1_pos = POS_MIN;
struct btree_node_iter iter;
struct bset *bsets[2];
struct bkey_format_state format[2];
struct bkey_packed *out[2];
struct bkey uk;
unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
int i;
if (n == vstruct_last(set1))
break;
if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
break;
for (i = 0; i < 2; i++) {
BUG_ON(n[i]->nsets != 1);
if (bkey_packed(k))
nr_packed++;
else
nr_unpacked++;
bsets[i] = btree_bset_first(n[i]);
out[i] = bsets[i]->start;
prev = k;
k = n;
SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
bch2_bkey_format_init(&format[i]);
}
BUG_ON(!prev);
set2_start = k;
set2_end = vstruct_last(set1);
u64s = 0;
for_each_btree_node_key(b, k, &iter) {
if (bkey_deleted(k))
continue;
set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
set_btree_bset_end(n1, n1->set);
i = u64s >= n1_u64s;
u64s += k->u64s;
uk = bkey_unpack_key(b, k);
if (!i)
n1_pos = uk.p;
bch2_bkey_format_add_key(&format[i], &uk);
}
n1->nr.live_u64s = le16_to_cpu(set1->u64s);
n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
n1->nr.packed_keys = nr_packed;
n1->nr.unpacked_keys = nr_unpacked;
btree_set_min(n[0], b->data->min_key);
btree_set_max(n[0], n1_pos);
btree_set_min(n[1], bpos_successor(n1_pos));
btree_set_max(n[1], b->data->max_key);
n1_pos = bkey_unpack_pos(n1, prev);
if (as->c->sb.version < bcachefs_metadata_version_snapshot)
n1_pos.snapshot = U32_MAX;
for (i = 0; i < 2; i++) {
bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
btree_set_max(n1, n1_pos);
btree_set_min(n2, bpos_successor(n1->key.k.p));
n[i]->data->format = bch2_bkey_format_done(&format[i]);
btree_node_set_format(n[i], n[i]->data->format);
}
bch2_bkey_format_init(&s);
bch2_bkey_format_add_pos(&s, n2->data->min_key);
bch2_bkey_format_add_pos(&s, n2->data->max_key);
u64s = 0;
for_each_btree_node_key(b, k, &iter) {
if (bkey_deleted(k))
continue;
for (k = set2_start; k != set2_end; k = bkey_next(k)) {
struct bkey uk = bkey_unpack_key(n1, k);
bch2_bkey_format_add_key(&s, &uk);
}
i = u64s >= n1_u64s;
u64s += k->u64s;
n2->data->format = bch2_bkey_format_done(&s);
btree_node_set_format(n2, n2->data->format);
if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
? &b->format: &bch2_bkey_format_current, k))
out[i]->format = KEY_FORMAT_LOCAL_BTREE;
else
bch2_bkey_unpack(b, (void *) out[i], k);
out = set2->start;
memset(&n2->nr, 0, sizeof(n2->nr));
out[i]->needs_whiteout = false;
for (k = set2_start; k != set2_end; k = bkey_next(k)) {
BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
? &n1->format : &bch2_bkey_format_current, k));
out->format = KEY_FORMAT_LOCAL_BTREE;
btree_keys_account_key_add(&n2->nr, 0, out);
out = bkey_next(out);
btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
out[i] = bkey_next(out[i]);
}
set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
set_btree_bset_end(n2, n2->set);
for (i = 0; i < 2; i++) {
bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
BUG_ON(!set1->u64s);
BUG_ON(!set2->u64s);
BUG_ON(!bsets[i]->u64s);
btree_node_reset_sib_u64s(n1);
btree_node_reset_sib_u64s(n2);
set_btree_bset_end(n[i], n[i]->set);
bch2_verify_btree_nr_keys(n1);
bch2_verify_btree_nr_keys(n2);
btree_node_reset_sib_u64s(n[i]);
if (n1->c.level) {
btree_node_interior_verify(as->c, n1);
btree_node_interior_verify(as->c, n2);
}
bch2_verify_btree_nr_keys(n[i]);
return n2;
if (b->c.level)
btree_node_interior_verify(as->c, n[i]);
}
}
/*
......@@ -1463,41 +1436,17 @@ static void btree_split_insert_keys(struct btree_update *as,
struct btree *b,
struct keylist *keys)
{
if (!bch2_keylist_empty(keys) &&
bpos_cmp(bch2_keylist_front(keys)->k.p,
b->data->max_key) <= 0) {
struct btree_node_iter node_iter;
struct bkey_i *k = bch2_keylist_front(keys);
struct bkey_packed *src, *dst, *n;
struct bset *i;
bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
/*
* We can't tolerate whiteouts here - with whiteouts there can be
* duplicate keys, and it would be rather bad if we picked a duplicate
* for the pivot:
*/
i = btree_bset_first(b);
src = dst = i->start;
while (src != vstruct_last(i)) {
n = bkey_next(src);
if (!bkey_deleted(src)) {
memmove_u64s_down(dst, src, src->u64s);
dst = bkey_next(dst);
}
src = n;
}
/* Also clear out the unwritten whiteouts area: */
b->whiteout_u64s = 0;
i->u64s = cpu_to_le16((u64 *) dst - i->_data);
set_btree_bset_end(b, b->set);
BUG_ON(b->nsets != 1 ||
b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
btree_node_interior_verify(as->c, b);
}
}
static int btree_split(struct btree_update *as, struct btree_trans *trans,
......@@ -1516,15 +1465,21 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_interior_update_will_free_node(as, b);
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
if (keys)
btree_split_insert_keys(as, trans, path, n1, keys);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
struct btree *n[2];
if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
trace_and_count(c, btree_node_split, c, b);
n2 = __btree_split_node(as, trans, n1);
n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
__btree_split_node(as, trans, b, n);
if (keys) {
btree_split_insert_keys(as, trans, path, n1, keys);
btree_split_insert_keys(as, trans, path, n2, keys);
BUG_ON(!bch2_keylist_empty(keys));
}
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
......@@ -1573,6 +1528,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
} else {
trace_and_count(c, btree_node_compact, c, b);
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
if (keys) {
btree_split_insert_keys(as, trans, path, n1, keys);
BUG_ON(!bch2_keylist_empty(keys));
}
bch2_btree_build_aux_trees(n1);
bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment