Commit 8479938d authored by Kent Overstreet

bcachefs: Convert snapshot table to RCU array

This switches the generic radix tree for the in-memory table of snapshot
nodes to a simple rcu array. This means we have to add new locking to
deal with reallocations, but is faster than traversing the radix tree.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent d82978ca
...@@ -774,9 +774,10 @@ struct bch_fs { ...@@ -774,9 +774,10 @@ struct bch_fs {
struct mutex sb_lock; struct mutex sb_lock;
/* snapshot.c: */ /* snapshot.c: */
GENRADIX(struct snapshot_t) snapshots; struct snapshot_table __rcu *snapshots;
struct bch_snapshot_table __rcu *snapshot_table; size_t snapshot_table_size;
struct mutex snapshot_table_lock; struct mutex snapshot_table_lock;
struct work_struct snapshot_delete_work; struct work_struct snapshot_delete_work;
struct work_struct snapshot_wait_for_pagecache_and_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work;
snapshot_id_list snapshots_unlinked; snapshot_id_list snapshots_unlinked;
......
...@@ -311,7 +311,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, ...@@ -311,7 +311,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot && i->k->k.p.snapshot &&
bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
} }
static noinline int static noinline int
...@@ -1229,7 +1229,7 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, ...@@ -1229,7 +1229,7 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
struct bpos pos) struct bpos pos)
{ {
if (!btree_type_has_snapshots(id) || if (!btree_type_has_snapshots(id) ||
!snapshot_t(trans->c, pos.snapshot)->children[0]) bch2_snapshot_is_leaf(trans->c, pos.snapshot))
return 0; return 0;
return __check_pos_snapshot_overwritten(trans, id, pos); return __check_pos_snapshot_overwritten(trans, id, pos);
......
...@@ -894,7 +894,7 @@ static int check_inode(struct btree_trans *trans, ...@@ -894,7 +894,7 @@ static int check_inode(struct btree_trans *trans,
* particular is not atomic, so on the internal snapshot nodes * particular is not atomic, so on the internal snapshot nodes
* we can see inodes marked for deletion after a clean shutdown * we can see inodes marked for deletion after a clean shutdown
*/ */
if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot))
return 0; return 0;
if (!bkey_is_inode(k.k)) if (!bkey_is_inode(k.k))
......
...@@ -562,7 +562,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, ...@@ -562,7 +562,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
int ret; int ret;
ret = bch2_snapshot_tree_lookup(trans, ret = bch2_snapshot_tree_lookup(trans,
snapshot_t(c, k.k->p.snapshot)->tree, &s_t); bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
"%s: snapshot tree %u not found", __func__, "%s: snapshot tree %u not found", __func__,
snapshot_t(c, k.k->p.snapshot)->tree); snapshot_t(c, k.k->p.snapshot)->tree);
......
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
static int bch2_subvolume_delete(struct btree_trans *, u32); static int bch2_subvolume_delete(struct btree_trans *, u32);
static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor) static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{ {
struct snapshot_t *s = snapshot_t(c, id); const struct snapshot_t *s = __snapshot_t(t, id);
if (s->skip[2] <= ancestor) if (s->skip[2] <= ancestor)
return s->skip[2]; return s->skip[2];
...@@ -27,22 +27,102 @@ static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor) ...@@ -27,22 +27,102 @@ static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor)
bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{ {
struct snapshot_table *t;
EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
rcu_read_lock();
t = rcu_dereference(c->snapshots);
while (id && id < ancestor) while (id && id < ancestor)
id = get_ancestor_below(c, id, ancestor); id = get_ancestor_below(t, id, ancestor);
rcu_read_unlock();
return id == ancestor; return id == ancestor;
} }
static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{ {
struct snapshot_table *t;
rcu_read_lock();
t = rcu_dereference(c->snapshots);
while (id && id < ancestor) while (id && id < ancestor)
id = snapshot_t(c, id)->parent; id = __snapshot_t(t, id)->parent;
rcu_read_unlock();
return id == ancestor; return id == ancestor;
} }
/*
 * Depth of a snapshot node whose parent is @parent: one more than the
 * parent's recorded depth, or 0 when @parent is 0.
 *
 * The table lookup happens inside an RCU read-side critical section.
 */
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
{
	u32 ret = 0;

	rcu_read_lock();
	if (parent)
		ret = snapshot_t(c, parent)->depth + 1;
	rcu_read_unlock();

	return ret;
}
/*
 * Bookkeeping for releasing a replaced snapshot table through call_rcu():
 * bundles the rcu_head given to call_rcu() with the table to be freed.
 */
struct snapshot_t_free_rcu {
/* Handed to call_rcu(); the callback recovers this struct via container_of() */
struct rcu_head rcu;
/* Table released by the callback with kvfree() */
struct snapshot_table *t;
};
/*
 * RCU callback: frees the retired snapshot table, then the
 * snapshot_t_free_rcu wrapper that carried it.
 */
static void snapshot_t_free_rcu(struct rcu_head *rcu)
{
	struct snapshot_t_free_rcu *pending;

	pending = container_of(rcu, struct snapshot_t_free_rcu, rcu);

	kvfree(pending->t);
	kfree(pending);
}
/*
 * Slow path of snapshot_t_mut(): replace the snapshot table with a larger one
 * that has room for @id, and return a mutable pointer to @id's slot.
 *
 * Slots are indexed by U32_MAX - id. The replacement table is zero-allocated
 * with a power-of-two entry count (at least 16); existing entries are copied
 * across before the new table is published with rcu_assign_pointer(). The
 * previous table, if there was one, is handed to call_rcu() to be freed.
 *
 * Returns NULL if the replacement table cannot be allocated; the current
 * table and its recorded size are left untouched in that case.
 */
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	size_t nr = max(16UL, roundup_pow_of_two(idx + 1));
	struct snapshot_table *cur, *grown;

	grown = kvzalloc(struct_size(grown, s, nr), GFP_KERNEL);
	if (!grown)
		return NULL;

	cur = rcu_dereference_protected(c->snapshots, true);
	if (cur)
		memcpy(grown->s, cur->s,
		       sizeof(grown->s[0]) * c->snapshot_table_size);

	/* Publish the new table first, then record its size */
	rcu_assign_pointer(c->snapshots, grown);
	c->snapshot_table_size = nr;

	if (cur) {
		struct snapshot_t_free_rcu *pending =
			kmalloc(sizeof(*pending), GFP_KERNEL|__GFP_NOFAIL);

		pending->t = cur;
		call_rcu(&pending->rcu, snapshot_t_free_rcu);
	}

	/* grown is the table that was just published */
	return &grown->s[idx];
}
/*
 * Mutable pointer to the table slot for snapshot @id (index U32_MAX - id).
 *
 * Asserts that c->snapshot_table_lock is held. When the slot lies within the
 * current table it is returned directly; otherwise __snapshot_t_mut() is
 * called, and its result (possibly NULL) is returned.
 */
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(idx < c->snapshot_table_size)) {
		struct snapshot_table *t =
			rcu_dereference_protected(c->snapshots, true);

		return &t->s[idx];
	}

	return __snapshot_t_mut(c, id);
}
/* Snapshot tree: */ /* Snapshot tree: */
void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
...@@ -209,12 +289,15 @@ int bch2_mark_snapshot(struct btree_trans *trans, ...@@ -209,12 +289,15 @@ int bch2_mark_snapshot(struct btree_trans *trans,
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct snapshot_t *t; struct snapshot_t *t;
int ret = 0;
t = genradix_ptr_alloc(&c->snapshots, mutex_lock(&c->snapshot_table_lock);
U32_MAX - new.k->p.offset,
GFP_KERNEL); t = snapshot_t_mut(c, new.k->p.offset);
if (!t) if (!t) {
return -BCH_ERR_ENOMEM_mark_snapshot; ret = -BCH_ERR_ENOMEM_mark_snapshot;
goto err;
}
if (new.k->type == KEY_TYPE_snapshot) { if (new.k->type == KEY_TYPE_snapshot) {
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
...@@ -246,8 +329,9 @@ int bch2_mark_snapshot(struct btree_trans *trans, ...@@ -246,8 +329,9 @@ int bch2_mark_snapshot(struct btree_trans *trans,
t->subvol = 0; t->subvol = 0;
t->tree = 0; t->tree = 0;
} }
err:
return 0; mutex_unlock(&c->snapshot_table_lock);
return ret;
} }
static int snapshot_lookup(struct btree_trans *trans, u32 id, static int snapshot_lookup(struct btree_trans *trans, u32 id,
...@@ -300,9 +384,14 @@ static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) ...@@ -300,9 +384,14 @@ static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
nr_live += ret; nr_live += ret;
} }
snapshot_t(c, id)->equiv = nr_live == 1 mutex_lock(&c->snapshot_table_lock);
? snapshot_t(c, child[live_idx])->equiv
snapshot_t_mut(c, id)->equiv = nr_live == 1
? snapshot_t_mut(c, child[live_idx])->equiv
: id; : id;
mutex_unlock(&c->snapshot_table_lock);
return 0; return 0;
} }
...@@ -520,16 +609,18 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, ...@@ -520,16 +609,18 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans,
static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id)
{ {
struct snapshot_t *s; const struct snapshot_t *s;
if (!id) if (!id)
return 0; return 0;
rcu_read_lock();
s = snapshot_t(c, id); s = snapshot_t(c, id);
if (!s->parent) if (s->parent)
return id; id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
rcu_read_unlock();
return bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); return id;
} }
static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s)
...@@ -633,9 +724,6 @@ static int check_snapshot(struct btree_trans *trans, ...@@ -633,9 +724,6 @@ static int check_snapshot(struct btree_trans *trans,
struct bkey_i_snapshot *u; struct bkey_i_snapshot *u;
u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
u32 real_depth; u32 real_depth;
struct snapshot_t *parent = parent_id
? snapshot_t(c, parent_id)
: NULL;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
bool should_have_subvol; bool should_have_subvol;
u32 i, id; u32 i, id;
...@@ -726,7 +814,7 @@ static int check_snapshot(struct btree_trans *trans, ...@@ -726,7 +814,7 @@ static int check_snapshot(struct btree_trans *trans,
} }
ret = 0; ret = 0;
real_depth = parent ? parent->depth + 1 : 0; real_depth = bch2_snapshot_depth(c, parent_id);
if (le32_to_cpu(s.depth) != real_depth && if (le32_to_cpu(s.depth) != real_depth &&
(c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
...@@ -823,9 +911,13 @@ static int check_subvol(struct btree_trans *trans, ...@@ -823,9 +911,13 @@ static int check_subvol(struct btree_trans *trans,
if (!BCH_SUBVOLUME_SNAP(subvol.v)) { if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
u32 snapshot_tree = snapshot_t(c, snapshot_root)->tree; u32 snapshot_tree;
struct bch_snapshot_tree st; struct bch_snapshot_tree st;
rcu_read_lock();
snapshot_tree = snapshot_t(c, snapshot_root)->tree;
rcu_read_unlock();
ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
...@@ -869,7 +961,7 @@ int bch2_check_subvols(struct bch_fs *c) ...@@ -869,7 +961,7 @@ int bch2_check_subvols(struct bch_fs *c)
void bch2_fs_snapshots_exit(struct bch_fs *c) void bch2_fs_snapshots_exit(struct bch_fs *c)
{ {
genradix_free(&c->snapshots); kfree(rcu_dereference_protected(c->snapshots, true));
} }
int bch2_snapshots_read(struct bch_fs *c) int bch2_snapshots_read(struct bch_fs *c)
...@@ -1011,7 +1103,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, ...@@ -1011,7 +1103,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
struct bkey_i_snapshot *n; struct bkey_i_snapshot *n;
struct bkey_s_c k; struct bkey_s_c k;
unsigned i, j; unsigned i, j;
u32 depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; u32 depth = bch2_snapshot_depth(c, parent);
int ret; int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
...@@ -1150,7 +1242,7 @@ static int snapshot_delete_key(struct btree_trans *trans, ...@@ -1150,7 +1242,7 @@ static int snapshot_delete_key(struct btree_trans *trans,
struct bpos *last_pos) struct bpos *last_pos)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
if (!bkey_eq(k.k->p, *last_pos)) if (!bkey_eq(k.k->p, *last_pos))
equiv_seen->nr = 0; equiv_seen->nr = 0;
......
...@@ -32,17 +32,40 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, ...@@ -32,17 +32,40 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
.min_val_size = 24, \ .min_val_size = 24, \
}) })
static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
{ {
return genradix_ptr(&c->snapshots, U32_MAX - id); return &t->s[U32_MAX - id];
} }
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
{
return __snapshot_t(rcu_dereference(c->snapshots), id);
}
/*
 * Return the ->tree field recorded for snapshot @id, read inside an RCU
 * read-side critical section.
 */
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
	u32 tree;

	rcu_read_lock();
	tree = snapshot_t(c, id)->tree;
	rcu_read_unlock();

	return tree;
}
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{ {
return snapshot_t(c, id)->parent; return snapshot_t(c, id)->parent;
} }
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
rcu_read_lock();
id = __bch2_snapshot_parent_early(c, id);
rcu_read_unlock();
return id;
}
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
{ {
#ifdef CONFIG_BCACHEFS_DEBUG #ifdef CONFIG_BCACHEFS_DEBUG
u32 parent = snapshot_t(c, id)->parent; u32 parent = snapshot_t(c, id)->parent;
...@@ -59,10 +82,21 @@ static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) ...@@ -59,10 +82,21 @@ static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
#endif #endif
} }
/*
 * Locked wrapper around __bch2_snapshot_parent(): performs the lookup for
 * @id inside an RCU read-side critical section and returns its result.
 */
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
	u32 parent;

	rcu_read_lock();
	parent = __bch2_snapshot_parent(c, id);
	rcu_read_unlock();

	return parent;
}
static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
{ {
rcu_read_lock();
while (n--) while (n--)
id = bch2_snapshot_parent(c, id); id = __bch2_snapshot_parent(c, id);
rcu_read_unlock();
return id; return id;
} }
...@@ -71,37 +105,60 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) ...@@ -71,37 +105,60 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
{ {
u32 parent; u32 parent;
while ((parent = bch2_snapshot_parent(c, id))) rcu_read_lock();
while ((parent = __bch2_snapshot_parent(c, id)))
id = parent; id = parent;
rcu_read_unlock();
return id; return id;
} }
static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
{ {
return snapshot_t(c, id)->equiv; return snapshot_t(c, id)->equiv;
} }
/*
 * Locked wrapper around __bch2_snapshot_equiv(): performs the lookup for
 * @id inside an RCU read-side critical section and returns its result.
 */
static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
{
	u32 equiv;

	rcu_read_lock();
	equiv = __bch2_snapshot_equiv(c, id);
	rcu_read_unlock();

	return equiv;
}
static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
{ {
return id == snapshot_t(c, id)->equiv; return id == bch2_snapshot_equiv(c, id);
} }
static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{ {
struct snapshot_t *s = snapshot_t(c, id); const struct snapshot_t *s;
bool ret;
rcu_read_lock();
s = snapshot_t(c, id);
ret = s->children[0];
rcu_read_unlock();
return s->children[0] || s->children[1]; return ret;
}
static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
{
return !bch2_snapshot_is_internal_node(c, id);
} }
static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
{ {
struct snapshot_t *s; const struct snapshot_t *s;
u32 parent = bch2_snapshot_parent(c, id); u32 parent = __bch2_snapshot_parent(c, id);
if (!parent) if (!parent)
return 0; return 0;
s = snapshot_t(c, bch2_snapshot_parent(c, id)); s = snapshot_t(c, __bch2_snapshot_parent(c, id));
if (id == s->children[0]) if (id == s->children[0])
return s->children[1]; return s->children[1];
if (id == s->children[1]) if (id == s->children[1])
...@@ -113,9 +170,15 @@ bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); ...@@ -113,9 +170,15 @@ bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
{ {
struct snapshot_t *t = snapshot_t(c, id); const struct snapshot_t *t;
bool ret;
return (t->children[0]|t->children[1]) != 0; rcu_read_lock();
t = snapshot_t(c, id);
ret = (t->children[0]|t->children[1]) != 0;
rcu_read_unlock();
return ret;
} }
static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
......
...@@ -16,6 +16,10 @@ struct snapshot_t { ...@@ -16,6 +16,10 @@ struct snapshot_t {
u32 equiv; u32 equiv;
}; };
/*
 * In-memory table of snapshot nodes: nothing but a trailing array of
 * struct snapshot_t. The number of entries is not stored in this struct.
 *
 * NOTE(review): s[0] is a zero-length array; kernel style prefers flexible
 * array members. Since this struct has no other member, a plain s[] would
 * presumably need a helper such as DECLARE_FLEX_ARRAY() — confirm before
 * changing.
 */
struct snapshot_table {
struct snapshot_t s[0];
};
typedef struct { typedef struct {
u32 subvol; u32 subvol;
u64 inum; u64 inum;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment