bcachefs: Change how replicated data is accounted

Due to compression, the different replicas of a replicated extent don't necessarily take up the same amount of space - so replicated data sector counts shouldn't be stored divided by the number of replicas.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

bcachefs: Change how replicated data is accounted
Due to compression, the different replicas of a replicated extent don't necessarily take up the same amount of space - so replicated data sector counts shouldn't be stored divided by the number of replicas.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
6eac2c2e · Kent Overstreet · Kent Overstreet · 5b650fd1 · 6eac2c2e · 6eac2c2e
Commit 6eac2c2e authored Jul 24, 2018 by Kent Overstreet Committed by Kent Overstreet Oct 22, 2023
5 changed files
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -185,7 +185,7 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
 	replicas = bch2_extent_nr_dirty_ptrs(k);
 	if (replicas)
 		stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
-			c->opts.btree_node_size;
+			c->opts.btree_node_size * replicas;

 	/*
 	 * We're dropping @k from the btree, but it's still live until the

--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -248,29 +248,28 @@ bch2_fs_usage_read(struct bch_fs *c)
 struct fs_usage_sum {
 	u64	hidden;
 	u64	data;
+	u64	cached;
 	u64	reserved;
 };

 static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
 {
 	struct fs_usage_sum sum = { 0 };
-	unsigned i, j;
+	unsigned i;

 	/*
 	 * For superblock and journal we count bucket usage, not sector usage,
 	 * because any internal fragmentation should _not_ be counted as
 	 * free space:
 	 */
-	for (j = 1; j < BCH_DATA_BTREE; j++)
-		sum.hidden += stats.buckets[j];
+	sum.hidden += stats.buckets[BCH_DATA_SB];
+	sum.hidden += stats.buckets[BCH_DATA_JOURNAL];

 	for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
-		for (j = BCH_DATA_BTREE;
-		     j < ARRAY_SIZE(stats.replicas[i].data);
-		     j++)
-			sum.data += stats.replicas[i].data[j] * (i + 1);
-
-		sum.reserved += stats.replicas[i].persistent_reserved * (i + 1);
+		sum.data	+= stats.replicas[i].data[BCH_DATA_BTREE];
+		sum.data	+= stats.replicas[i].data[BCH_DATA_USER];
+		sum.cached	+= stats.replicas[i].data[BCH_DATA_CACHED];
+		sum.reserved	+= stats.replicas[i].persistent_reserved;
 	}

 	sum.reserved += stats.online_reserved;
@@ -379,17 +378,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 {
 	struct bch_dev_usage *dev_usage;

-	if (c)
-		percpu_rwsem_assert_held(&c->usage_lock);
+	percpu_rwsem_assert_held(&c->usage_lock);

-	if (old.data_type && new.data_type &&
-	    old.data_type != new.data_type) {
-		BUG_ON(!c);
-		bch2_fs_inconsistent(c,
-			"different types of data in same bucket: %s, %s",
-			bch2_data_types[old.data_type],
-			bch2_data_types[new.data_type]);
-	}
+	bch2_fs_inconsistent_on(old.data_type && new.data_type &&
+				old.data_type != new.data_type, c,
+		"different types of data in same bucket: %s, %s",
+		bch2_data_types[old.data_type],
+		bch2_data_types[new.data_type]);

 	stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
 	stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
@@ -448,6 +443,12 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.gen++;
 	}));

+	/*
+	 * This isn't actually correct yet, since fs usage is still
+	 * uncompressed sectors:
+	 */
+	stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
+
 	if (!old->owned_by_allocator && old->cached_sectors)
 		trace_invalidate(ca, bucket_to_sector(ca, b),
 				 old->cached_sectors);
@@ -501,26 +502,34 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 		if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
 		    gc_will_visit(c, pos))
 			return;
-	}

-	preempt_disable();
-	stats = this_cpu_ptr(c->usage_percpu);
+		preempt_disable();
+		stats = this_cpu_ptr(c->usage_percpu);

-	g = bucket(ca, b);
-	old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
-		new.data_type = type;
-		checked_add(new.dirty_sectors, sectors);
-	}));
+		g = bucket(ca, b);
+		old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
+			new.data_type = type;
+			checked_add(new.dirty_sectors, sectors);
+		}));

-	stats->replicas[0].data[type] += sectors;
-	preempt_enable();
+		stats->replicas[0].data[type] += sectors;
+		preempt_enable();
+	} else {
+		rcu_read_lock();
+
+		g = bucket(ca, b);
+		old = bucket_cmpxchg(g, new, ({
+			new.data_type = type;
+			checked_add(new.dirty_sectors, sectors);
+		}));
+
+		rcu_read_unlock();
+	}

 	BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
 	       bucket_became_unavailable(c, old, new));
 }

-/* Reverting this until the copygc + compression issue is fixed: */
-
 static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
 {
 	if (!sectors)
@@ -540,12 +549,14 @@ static void bch2_mark_pointer(struct bch_fs *c,
 			      const struct bch_extent_ptr *ptr,
 			      struct bch_extent_crc_unpacked crc,
 			      s64 sectors, enum bch_data_type data_type,
-			      struct bch_fs_usage *stats,
+			      unsigned replicas,
+			      struct bch_fs_usage *fs_usage,
 			      u64 journal_seq, unsigned flags)
 {
 	struct bucket_mark old, new;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 	struct bucket *g = PTR_BUCKET(ca, ptr);
+	s64 uncompressed_sectors = sectors;
 	u64 v;

 	if (crc.compression_type) {
@@ -563,6 +574,20 @@ static void bch2_mark_pointer(struct bch_fs *c,
 			  +__disk_sectors(crc, new_sectors);
 	}

+	/*
+	 * fs level usage (which determines free space) is in uncompressed
+	 * sectors, until copygc + compression is sorted out:
+	 *
+	 * note also that we always update @fs_usage, even when we otherwise
+	 * wouldn't do anything because gc is running - this is because the
+	 * caller still needs to account w.r.t. its disk reservation. It is
+	 * caller's responsibility to not apply @fs_usage if gc is in progress.
+	 */
+	fs_usage->replicas
+		[!ptr->cached && replicas ? replicas - 1 : 0].data
+		[!ptr->cached ? data_type : BCH_DATA_CACHED] +=
+			uncompressed_sectors;
+
 	if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
 		if (journal_seq)
 			bucket_cmpxchg(g, new, ({
@@ -614,7 +639,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
 			      old.v.counter,
 			      new.v.counter)) != old.v.counter);

-	bch2_dev_usage_update(c, ca, stats, old, new);
+	bch2_dev_usage_update(c, ca, fs_usage, old, new);

 	BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
 	       bucket_became_unavailable(c, old, new));
@@ -677,15 +702,13 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,

 		extent_for_each_ptr_crc(e, ptr, crc)
 			bch2_mark_pointer(c, e, ptr, crc, sectors, data_type,
-					  stats, journal_seq, flags);
-
-		if (replicas)
-			stats->replicas[replicas - 1].data[data_type] += sectors;
+					  replicas, stats, journal_seq, flags);
 		break;
 	}
 	case BCH_RESERVATION:
 		if (replicas)
-			stats->replicas[replicas - 1].persistent_reserved += sectors;
+			stats->replicas[replicas - 1].persistent_reserved +=
+				sectors * replicas;
 		break;
 	}
 	percpu_up_read(&c->usage_lock);

--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -62,7 +62,6 @@ struct bch_dev_usage {

 struct bch_fs_usage {
 	/* all fields are in units of 512 byte sectors: */
-	/* _uncompressed_ sectors: */
 	u64			online_reserved;
 	u64			available_cache;


--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -985,14 +985,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 	ca->disk_sb = *sb;
 	memset(sb, 0, sizeof(*sb));

-	if (ca->fs)
-		mutex_lock(&ca->fs->sb_lock);
-
-	bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
-
-	if (ca->fs)
-		mutex_unlock(&ca->fs->sb_lock);
-
 	percpu_ref_reinit(&ca->io_ref);

 	return 0;
@@ -1018,6 +1010,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
 	if (ret)
 		return ret;

+	mutex_lock(&c->sb_lock);
+	bch2_mark_dev_superblock(ca->fs, ca,
+			BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+	mutex_unlock(&c->sb_lock);
+
 	bch2_dev_sysfs_online(c, ca);

 	if (c->sb.nr_devices == 1)
@@ -1295,6 +1292,24 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 	return ret;
 }

+static void dev_usage_clear(struct bch_dev *ca)
+{
+	struct bucket_array *buckets;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct bch_dev_usage *p =
+			per_cpu_ptr(ca->usage_percpu, cpu);
+		memset(p, 0, sizeof(*p));
+	}
+
+	down_read(&ca->bucket_lock);
+	buckets = bucket_array(ca);
+
+	memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
+	up_read(&ca->bucket_lock);
+}
+
 /* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
@@ -1333,11 +1348,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 		return ret;
 	}

+	/*
+	 * We want to allocate journal on the new device before adding the new
+	 * device to the filesystem because allocating after we attach requires
+	 * spinning up the allocator thread, and the allocator thread requires
+	 * doing btree writes, which if the existing devices are RO isn't going
+	 * to work
+	 *
+	 * So we have to mark where the superblocks are, but marking allocated
+	 * data normally updates the filesystem usage too, so we have to mark,
+	 * allocate the journal, reset all the marks, then remark after we
+	 * attach...
+	 */
+	bch2_mark_dev_superblock(ca->fs, ca,
+			BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+
 	err = "journal alloc failed";
 	ret = bch2_dev_journal_alloc(ca);
 	if (ret)
 		goto err;

+	dev_usage_clear(ca);
+
 	mutex_lock(&c->state_lock);
 	mutex_lock(&c->sb_lock);

@@ -1388,6 +1420,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	ca->disk_sb.sb->dev_idx	= dev_idx;
 	bch2_dev_attach(c, ca, dev_idx);

+	bch2_mark_dev_superblock(c, ca,
+			BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);


--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -781,7 +781,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		"    meta:               %llu\n"
 		"    user:               %llu\n"
 		"    cached:             %llu\n"
-		"    available:          %llu\n"
+		"    available:          %lli\n"
 		"sectors:\n"
 		"    sb:                 %llu\n"
 		"    journal:            %llu\n"
@@ -802,7 +802,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		stats.buckets[BCH_DATA_BTREE],
 		stats.buckets[BCH_DATA_USER],
 		stats.buckets[BCH_DATA_CACHED],
-		__dev_buckets_available(ca, stats),
+		ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
 		stats.sectors[BCH_DATA_SB],
 		stats.sectors[BCH_DATA_JOURNAL],
 		stats.sectors[BCH_DATA_BTREE],