Commit 7a51608d authored by Kent Overstreet

bcachefs: Rework btree node pinning

In backpointers fsck, we do a sequential scan of one btree and check
references to another: extents <-> backpointers

Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in RAM).

Previously, this was done with a simple check in the shrinker - "if the
btree node is in the range being pinned, don't free it" - but this
generated OOMs, as our shrinker wasn't well behaved when there was less
memory available than expected.

Instead, we now have two different shrinkers and LRU lists; the second
shrinker is for pinned nodes, with seeks set much higher than normal -
so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 91ddd715
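The "seeks set much higher than normal" above is the shrinker's cost hint: objects on the pinned list are reported as much more expensive to recreate, so the VM reclaims them last. Below is a minimal sketch of how a second shrinker could be registered for the pinned list using the kernel's shrinker_alloc()/shrinker_register() API. The helper names, the seeks multiplier, and the stubbed scan callback are illustrative assumptions, not the code from this commit (the real registration is presumably in the collapsed portion of the diff); struct btree_cache_list is the one added in btree_types.h below.

#include <linux/shrinker.h>
#include <linux/list.h>

/*
 * Sketch only: a second shrinker dedicated to the pinned LRU, with
 * ->seeks raised well above DEFAULT_SEEKS so its objects are treated as
 * expensive to recreate and reclaimed only under real memory pressure.
 */
static unsigned long bc_pinned_count(struct shrinker *shrink,
				     struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	return list->nr;
}

static unsigned long bc_pinned_scan(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* walk list->list and free up to sc->nr_to_scan nodes here */
	return SHRINK_STOP;
}

static int bc_register_pinned_shrinker(struct btree_cache_list *list)
{
	struct shrinker *shrink = shrinker_alloc(0, "btree_cache-pinned");

	if (!shrink)
		return -ENOMEM;

	shrink->count_objects	= bc_pinned_count;
	shrink->scan_objects	= bc_pinned_scan;
	shrink->seeks		= 8 * DEFAULT_SEEKS;	/* multiplier is a guess */
	shrink->private_data	= list;
	list->shrink		= shrink;
	shrinker_register(shrink);
	return 0;
}

Because the pinned list has its own shrinker, reclaim pressure is split between the two LRUs instead of being applied to one badly behaved list, which is what avoids the OOMs described above.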
@@ -752,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
bch2_btree_cache_unpin(c);
btree_interior_mask |= btree_leaf_mask;
c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
c->btree_cache.pinned_nodes_start = start;
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
@@ -777,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
BBPOS(btree, b->key.k.p);
break;
}
bch2_node_pin(c, b);
0;
}));
}
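bch2_node_pin() is called here as the sequential scan walks the btree, but its body is not shown (it is in the collapsed portion of the diff). A plausible sketch of its shape under the new scheme - assuming the pinned btree flag and the live[1] pinned list added elsewhere in this commit - and not the committed implementation:

void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	if (!btree_node_pinned(b)) {
		/* move from the normal LRU (live[0]) to the pinned LRU (live[1]) */
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}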
@@ -936,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
c->btree_cache.pinned_nodes_leaf_mask = 0;
c->btree_cache.pinned_nodes_interior_mask = 0;
bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
@@ -1053,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
c->btree_cache.pinned_nodes_leaf_mask = 0;
c->btree_cache.pinned_nodes_interior_mask = 0;
bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
This diff is collapsed.
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
void bch2_node_pin(struct bch_fs *, struct btree *);
void bch2_btree_cache_unpin(struct bch_fs *);
void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_i *);
@@ -147,8 +147,7 @@ struct btree {
x(noevict) \
x(write_blocked) \
x(will_make_reachable) \
x(access_bit) \
x(pinned) \
x(access_bit)
enum bch_btree_cache_not_freed_reasons {
#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
@@ -157,6 +156,13 @@ enum bch_btree_cache_not_freed_reasons {
BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
};
struct btree_cache_list {
unsigned idx;
struct shrinker *shrink;
struct list_head list;
size_t nr;
};
struct btree_cache {
struct rhashtable table;
bool table_init_done;
@@ -174,12 +180,11 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
struct list_head live;
struct list_head freeable;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
struct btree_cache_list live[2];
size_t nr_live;
size_t nr_freeable;
size_t nr_reserve;
size_t nr_by_btree[BTREE_ID_NR];
@@ -188,7 +193,6 @@ struct btree_cache {
/* shrinker stats */
size_t nr_freed;
u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
struct shrinker *shrink;
/*
* If we need to allocate memory for a new btree node and that
@@ -201,8 +205,8 @@ struct btree_cache {
struct bbpos pinned_nodes_start;
struct bbpos pinned_nodes_end;
u64 pinned_nodes_leaf_mask;
u64 pinned_nodes_interior_mask;
/* btree id mask: 0 for leaves, 1 for interior */
u64 pinned_nodes_mask[2];
};
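With the two-entry mask indexed by node level (0 = leaf, 1 = interior), deciding whether a node falls in the pinned set reduces to one mask test plus a range check. A sketch of what that test could look like; the helper name and the exact inclusive/exclusive bounds are assumptions, not code from this commit:

static bool btree_node_wants_pin(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
	/* leaves use mask[0], interior nodes use mask[1] */
	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return (mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) <= 0 &&
		bbpos_cmp(pos, bc->pinned_nodes_end) <= 0;
}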
struct btree_node_iter {
@@ -594,7 +598,8 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
x(never_write)
x(never_write) \
x(pinned)
enum btree_flags {
/* First bits for btree node write type */
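Adding x(pinned) to the btree flags list above (and dropping it from the not-freed-reasons list earlier) is what provides btree_node_pinned()/set_btree_node_pinned(), used below when choosing which live list a node belongs on. A generic sketch of the x-macro pattern these lists rely on; the EXAMPLE_ names are placeholders, not the macros actually used in btree_types.h:

/*
 * Each x(flag) entry expands both into an enum bit and into
 * btree_node_<flag>() / set_btree_node_<flag>() helpers on struct btree.
 */
#define EXAMPLE_BTREE_FLAGS()	\
	x(dirty)		\
	x(pinned)

enum example_btree_flags {
#define x(flag)	EXAMPLE_BTREE_NODE_##flag,
	EXAMPLE_BTREE_FLAGS()
#undef x
};

#define x(flag)								\
static inline bool btree_node_ ## flag(struct btree *b)		\
{	return test_bit(EXAMPLE_BTREE_NODE_ ## flag, &b->flags); }	\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{	set_bit(EXAMPLE_BTREE_NODE_ ## flag, &b->flags); }
EXAMPLE_BTREE_FLAGS()
#undef x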
@@ -1904,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
list_add_tail(&b->list, &c->btree_cache.live);
list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);
@@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_long_read(&c->btree_cache.nr_dirty) * 2 > c->btree_cache.nr_live)
size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
atomic_long_read(&c->btree_cache.nr_dirty),
c->btree_cache.nr_live,
atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
@@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = {
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
mutex_lock(&bc->lock);
list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
list_for_each_entry(b, &bc->live[1].list, list)
ret += btree_buf_bytes(b);
list_for_each_entry(b, &bc->freeable, list)
ret += btree_buf_bytes(b);
mutex_unlock(&bc->lock);
return ret;
}
@@ -444,11 +448,12 @@ STORE(bch2_fs)
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}
if (attr == &sysfs_trigger_btree_key_cache_shrink) {