Commit 9ca53b55 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: gc now operates on second set of bucket marks

This means we can now use gc to verify the allocation information -
important for testing persistent alloc info

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent e6473691
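The core of the change: each device now carries two sets of bucket marks, buckets[0] (the live set) and buckets[1] (the set gc rebuilds), selected by the bool argument to __bucket_array() further down in this diff. That is what lets gc verify the live allocation information. A minimal sketch of the comparison this enables, assuming struct bucket's mark field can be compared bytewise; verify_bucket_marks() is a hypothetical name (the real check is in the collapsed btree_gc.c diff), and the caller is assumed to hold one of the locks __bucket_array() checks for:

static void verify_bucket_marks(struct bch_dev *ca)
{
	struct bucket_array *live = __bucket_array(ca, false);
	struct bucket_array *gc   = __bucket_array(ca, true);
	size_t b;

	for (b = live->first_bucket; b < live->nbuckets; b++) {
		/* compare the live mark against the mark gc computed: */
		struct bucket_mark l = live->b[b].mark;
		struct bucket_mark g = gc->b[b].mark;

		if (memcmp(&l, &g, sizeof(l)))
			bch_err(ca, "bucket %zu: gc marks != live marks", b);
	}
}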
......@@ -930,12 +930,6 @@ static int bch2_allocator_thread(void *arg)
pr_debug("free_inc now empty");
do {
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
bch_err(ca, "gc failure");
goto stop;
}
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
......@@ -1293,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
bool invalidating_data = false;
int ret = 0;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return -1;
if (test_alloc_startup(c)) {
invalidating_data = true;
goto not_enough;
......@@ -1321,9 +1312,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
continue;
bch2_mark_alloc_bucket(c, ca, bu, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
gc_pos_alloc(c, NULL), 0);
fifo_push(&ca->free_inc, bu);
......
......@@ -347,7 +347,6 @@ enum gc_phase {
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE
};
struct gc_pos {
......@@ -392,15 +391,14 @@ struct bch_dev {
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
struct bucket_array __rcu *buckets;
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_dirty;
unsigned long *buckets_written;
/* most out of date gen in the btree */
u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage_percpu;
struct bch_dev_usage usage_cached;
struct bch_dev_usage __percpu *usage[2];
/* Allocator: */
struct task_struct __rcu *alloc_thread;
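struct bch_dev's usage counters are split the same way: usage[0] is the live percpu set, usage[1] gc's copy, and readers pick one via the new bool parameter on __bch2_dev_usage_read() (declared later in this diff). A sketch of the read side, under the assumption that bch_dev_usage is a flat array of u64 counters summed across CPUs; the literal implementation may differ:

struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
{
	struct bch_dev_usage ret;
	u64 *dst = (u64 *) &ret;
	unsigned i;
	int cpu;

	memset(&ret, 0, sizeof(ret));

	/* sum the requested set's per-cpu counters (0 = live, 1 = gc): */
	for_each_possible_cpu(cpu) {
		u64 *src = (u64 *) per_cpu_ptr(ca->usage[gc], cpu);

		for (i = 0; i < sizeof(ret) / sizeof(u64); i++)
			dst[i] += src[i];
	}

	return ret;
}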
......@@ -478,7 +476,6 @@ enum {
/* errors: */
BCH_FS_ERROR,
BCH_FS_GC_FAILURE,
/* misc: */
BCH_FS_BDEV_MOUNTED,
......@@ -614,8 +611,8 @@ struct bch_fs {
atomic64_t sectors_available;
struct bch_fs_usage __percpu *usage_percpu;
struct bch_fs_usage usage_cached;
struct bch_fs_usage __percpu *usage[2];
struct percpu_rw_semaphore usage_lock;
struct closure_waitlist freelist_wait;
......@@ -656,9 +653,6 @@ struct bch_fs {
*
* gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
*
* gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
* currently running, and gc marks are currently valid
*
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
* can read without a lock.
*/
......
This diff is collapsed.
......@@ -7,7 +7,7 @@
enum bkey_type;
void bch2_coalesce(struct bch_fs *);
void bch2_gc(struct bch_fs *);
int bch2_gc(struct bch_fs *, struct list_head *, bool);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
int bch2_initial_gc(struct bch_fs *, struct list_head *);
......@@ -105,14 +105,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o
};
}
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
bool ret;
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
ret = gc_pos_cmp(c->gc_pos, pos) < 0;
ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;
......
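The rename flips the predicate's sense: gc_will_visit() answered "will gc get to this position later?", while gc_visited() answers "has gc already passed it?" (note pos and c->gc_pos swap places and the comparison becomes <=). When it returns true, a caller updating bucket marks must mirror the update into the gc copy, since gc will not see it again. A sketch of that caller pattern, with c, pos, type, k, inserting, sectors and flags as placeholder variables:

	/* gc already walked past @pos: apply the same mark to the gc set
	 * so the end-of-gc comparison stays valid */
	if (gc_visited(c, pos))
		bch2_mark_key(c, type, k, inserting, sectors, pos,
			      NULL, 0, flags | BCH_BUCKET_MARK_GC);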
......@@ -160,7 +160,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
{
struct bch_fs *c = as->c;
struct pending_btree_node_free *d;
unsigned replicas;
/*
* btree_update lock is only needed here to avoid racing with
......@@ -178,15 +177,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
BUG_ON(d->index_update_done);
d->index_update_done = true;
/*
* Btree nodes are accounted as freed in bch_alloc_stats when they're
* freed from the index:
*/
replicas = bch2_extent_nr_dirty_ptrs(k);
if (replicas)
stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
c->opts.btree_node_size * replicas;
/*
* We're dropping @k from the btree, but it's still live until the
* index update is persistent so we need to keep a reference around for
......@@ -208,15 +198,16 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
* bch2_mark_key() compares the current gc pos to the pos we're
* moving this reference from, hence one comparison here:
*/
if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct bch_fs_usage tmp = { 0 };
if (gc_pos_cmp(c->gc_pos, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id)) >= 0 &&
gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct gc_pos pos = { 0 };
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
false, 0, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id),
&tmp, 0, 0);
false, 0, pos,
NULL, 0, BCH_BUCKET_MARK_GC);
/*
* Don't apply tmp - pending deletes aren't tracked in
* bch_alloc_stats:
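The widened condition above is easy to misread. One plausible reading: when gc has already walked past the node, the key is present in gc's marks from the btree walk, and gc's upcoming GC_PHASE_PENDING_DELETE pass will mark the pending delete again, so the walk-time mark is dropped here to avoid double counting; once gc is past that phase no compensation is needed. Mechanically, the condition restated with hypothetical helper variables:

	struct gc_pos node_pos = b
		? gc_pos_btree_node(b)
		: gc_pos_btree_root(as->btree_id);
	bool gc_saw_old_key  = gc_pos_cmp(c->gc_pos, node_pos) >= 0;
	bool gc_past_pending = gc_pos_cmp(c->gc_pos,
				gc_phase(GC_PHASE_PENDING_DELETE)) >= 0;

	if (gc_saw_old_key && !gc_past_pending) {
		/* drop @d->key from the gc marks via BCH_BUCKET_MARK_GC */
	}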
......@@ -287,19 +278,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
struct pending_btree_node_free *pending)
{
struct bch_fs_usage stats = { 0 };
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&pending->key),
false, 0,
gc_phase(GC_PHASE_PENDING_DELETE),
&stats, 0, 0);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
*/
NULL, 0, 0);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
......@@ -1939,6 +1924,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
btree_interior_update_add_node_reference(as, b);
/*
* XXX: the rest of the update path treats this like we're actually
* inserting a new node and deleting the existing node, so the
* reservation needs to include enough space for @b
*
* that is actually sketch as fuck though and I am surprised the code
* seems to work like that, definitely need to go back and rework it
* into something saner.
*
* (I think @b is just getting double counted until the btree update
* finishes and "deletes" @b on disk)
*/
ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
c->opts.btree_node_size *
bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)),
BCH_DISK_RESERVATION_NOFAIL|
BCH_DISK_RESERVATION_GC_LOCK_HELD);
BUG_ON(ret);
parent = btree_node_parent(iter, b);
if (parent) {
if (new_hash) {
......
This diff is collapsed.
......@@ -29,23 +29,34 @@
_old; \
})
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
bool gc)
{
return rcu_dereference_check(ca->buckets,
return rcu_dereference_check(ca->buckets[gc],
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
return __bucket_array(ca, false);
}
static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
{
struct bucket_array *buckets = bucket_array(ca);
struct bucket_array *buckets = __bucket_array(ca, gc);
BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
return buckets->b + b;
}
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
{
return __bucket(ca, b, false);
}
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
......@@ -129,7 +140,7 @@ static inline bool bucket_unused(struct bucket_mark mark)
/* Device usage: */
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool);
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
......@@ -168,7 +179,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
/* Filesystem usage: */
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool);
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
......@@ -207,17 +218,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
#define BCH_BUCKET_MARK_GC (1 << 1)
void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
void bch2_recalc_sectors_available(struct bch_fs *);
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,
......
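Net effect on the flag namespace: BCH_BUCKET_MARK_NOATOMIC survives, and MAY_MAKE_UNAVAILABLE / GC_WILL_VISIT / GC_LOCK_HELD collapse into a single BCH_BUCKET_MARK_GC that routes the update into the second set of marks. Before/after at a call site, taken from the btree_update_interior.c hunk earlier in this diff:

	/* old: per-call stats struct, gc behavior via extra flags */
	bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key),
		      false, 0, gc_pos_btree_node(b),
		      &tmp, 0, 0);

	/* new: stats may be NULL; BCH_BUCKET_MARK_GC selects
	 * buckets[1] / usage[1] instead of the live set */
	bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key),
		      false, 0, pos,
		      NULL, 0, BCH_BUCKET_MARK_GC);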
......@@ -64,8 +64,6 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
u64 online_reserved;
u64 available_cache;
struct {
u64 data[BCH_DATA_NR];
......@@ -74,6 +72,10 @@ struct bch_fs_usage {
} replicas[BCH_REPLICAS_MAX];
u64 buckets[BCH_DATA_NR];
/* fields starting here aren't touched by gc: */
u64 online_reserved;
u64 available_cache;
};
/*
......
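Moving online_reserved and available_cache below the "fields starting here aren't touched by gc" marker means the gc-tracked counters form a contiguous prefix of struct bch_fs_usage, so verification can compare that prefix and ignore fields gc doesn't recompute. A hypothetical sketch of the check this layout enables:

	struct bch_fs_usage live = __bch2_fs_usage_read(c, false);
	struct bch_fs_usage gc   = __bch2_fs_usage_read(c, true);

	/* compare only up to the first field gc doesn't touch: */
	if (memcmp(&live, &gc, offsetof(struct bch_fs_usage, online_reserved)))
		bch_err(c, "fs usage does not match after gc");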
......@@ -782,9 +782,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
new_fs
? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
: 0);
0);
if (c) {
spin_unlock(&c->journal.lock);
......
......@@ -374,7 +374,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
percpu_free_rwsem(&c->usage_lock);
free_percpu(c->usage_percpu);
free_percpu(c->usage[0]);
mempool_exit(&c->btree_iters_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
......@@ -606,7 +606,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
!(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
percpu_init_rwsem(&c->usage_lock) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
......@@ -1028,8 +1028,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
return ret;
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(ca->fs, ca, 0);
mutex_unlock(&c->sb_lock);
bch2_dev_sysfs_online(c, ca);
......@@ -1314,7 +1313,7 @@ static void dev_usage_clear(struct bch_dev *ca)
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
per_cpu_ptr(ca->usage_percpu, cpu);
per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
......@@ -1375,8 +1374,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(ca->fs, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
......@@ -1435,8 +1433,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(c, ca, 0);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
......
......@@ -478,7 +478,7 @@ STORE(__bch2_fs)
bch2_coalesce(c);
if (attr == &sysfs_trigger_gc)
bch2_gc(c);
bch2_gc(c, NULL, false);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
......
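bch2_gc() now returns an int (per the btree_gc.h hunk) instead of flagging failure through BCH_FS_GC_FAILURE, which this commit removes along with its checks in the allocator paths. The extra arguments are passed as NULL and false here; from the call sites they appear to be a journal-keys list and an "initial gc" flag, though the collapsed btree_gc.c diff is where that is defined. Callers can now handle failure directly, e.g.:

	int ret = bch2_gc(c, NULL, false);
	if (ret)
		bch_err(c, "gc failed: %i", ret);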