Commit 180fb49d authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Journal updates to dev usage

This eliminates the need to scan every bucket to regenerate dev_usage at
mount time.
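
Device usage is now kept as a base copy (ca->usage_base) plus one set of
percpu deltas per journal buffer (ca->usage[JOURNAL_BUF_NR]). Updates are
accounted against the journal buffer they will be written in; when that
buffer is flushed, bch2_fs_usage_acc_to_base() folds its deltas into the
base, and the base is what gets journalled as a BCH_JSET_ENTRY_dev_usage
entry and restored directly by journal replay at the next mount.

A minimal userspace model of the scheme (a sketch, not the kernel code:
plain counters stand in for percpu variables, the function names are
invented for illustration, and JOURNAL_BUF_NR is assumed to be 2):

	#include <stdio.h>

	#define JOURNAL_BUF_NR		2
	#define JOURNAL_BUF_MASK	(JOURNAL_BUF_NR - 1)

	struct dev_usage { unsigned long long buckets; };

	static struct dev_usage usage_base;		/* last value folded/journalled */
	static struct dev_usage usage[JOURNAL_BUF_NR];	/* deltas, per journal buffer */

	/* update path: account against the journal buffer the update is in */
	static void usage_update(unsigned long long journal_seq, long long delta)
	{
		usage[journal_seq & JOURNAL_BUF_MASK].buckets += delta;
	}

	/* journal buffer flush: fold that buffer's deltas into the base */
	static void usage_acc_to_base(unsigned idx)
	{
		usage_base.buckets += usage[idx].buckets;
		usage[idx].buckets = 0;
	}

	/* read path: base plus all in-flight deltas */
	static unsigned long long usage_read(void)
	{
		unsigned long long ret = usage_base.buckets;

		for (unsigned i = 0; i < JOURNAL_BUF_NR; i++)
			ret += usage[i].buckets;
		return ret;
	}

	int main(void)
	{
		usage_update(0, 10);	/* update written in journal seq 0 */
		usage_update(1, 5);	/* update written in journal seq 1 */
		printf("live usage: %llu\n", usage_read());	/* 15 */

		usage_acc_to_base(0 & JOURNAL_BUF_MASK);	/* seq 0 flushed */
		printf("base to journal: %llu\n", usage_base.buckets);	/* 10 */
		return 0;
	}

The kernel's read side (bch2_dev_usage_read() in this commit) does the
same base-plus-deltas sum, wrapped in the c->usage_lock seqcount so it
can't tear against the fold into usage_base.
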
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 2abe5420
@@ -350,10 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
return ret;
}
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
return 0;
}
......
@@ -429,7 +429,9 @@ struct bch_dev {
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];
struct bch_dev_usage *usage_base;
struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@@ -582,6 +584,8 @@ struct bch_fs {
struct journal_entry_res replicas_journal_res;
struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
......
@@ -1512,7 +1512,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist_v2, 4) \
x(usage, 5) \
x(data_usage, 6) \
x(clock, 7)
x(clock, 7) \
x(dev_usage, 8)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -1567,6 +1568,23 @@ struct jset_entry_clock {
__le64 time;
} __attribute__((packed));
struct jset_entry_dev_usage_type {
__le64 buckets;
__le64 sectors;
__le64 fragmented;
} __attribute__((packed));
struct jset_entry_dev_usage {
struct jset_entry entry;
__le32 dev;
__u32 pad;
__le64 buckets_ec;
__le64 buckets_unavailable;
struct jset_entry_dev_usage_type d[];
} __attribute__((packed));
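/*
 * The length of d[] is implicit: readers recover the number of data types
 * carried by an entry from its size,
 *
 *   nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
 *              sizeof(struct jset_entry_dev_usage_type)
 *
 * as journal replay does.
 */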
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
......
@@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c)
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
free_percpu(ca->usage[1]);
ca->usage[1] = NULL;
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
}
free_percpu(c->usage_gc);
@@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c,
struct bch_dev *ca;
bool verify = (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
unsigned i;
unsigned i, dev;
int ret = 0;
#define copy_field(_f, _msg, ...) \
@@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c,
}
}
for_each_member_device(ca, c, i) {
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
@@ -801,12 +804,23 @@ static int bch2_gc_done(struct bch_fs *c,
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
};
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
{
struct bch_dev_usage *dst = ca->usage_base;
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((void *) ca->usage_gc,
dev_usage_u64s());
copy_dev_field(buckets_ec, "buckets_ec");
copy_dev_field(buckets_unavailable, "buckets_unavailable");
bch2_dev_usage_from_buckets(c);
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
}
};
{
unsigned nr = fs_usage_u64s(c);
@@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c)
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage[1]);
BUG_ON(ca->usage_gc);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
@@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c)
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
bch_err(c, "error allocating ca->usage[gc]");
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
......
@@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
@@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
for_each_member_device(ca, c, i) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
usage->hidden += (dev.d[BCH_DATA_sb].buckets +
dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
percpu_up_write(&c->mark_lock);
}
@@ -189,14 +198,27 @@ struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *c)
return ret;
}
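/*
 * Dev usage is tracked as usage_base plus percpu deltas: gc accumulates
 * into its own set (usage_gc), while normal updates go to the counters
 * for the journal buffer they will be written in; those are folded back
 * into usage_base by bch2_fs_usage_acc_to_base().
 */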
static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
unsigned journal_seq,
bool gc)
{
return this_cpu_ptr(gc
? ca->usage_gc
: ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage ret;
unsigned seq, i, u64s = dev_usage_u64s();
memset(&ret, 0, sizeof(ret));
acc_u64s_percpu((u64 *) &ret,
(u64 __percpu *) ca->usage[0],
sizeof(ret) / sizeof(u64));
do {
seq = read_seqcount_begin(&c->usage_lock);
memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
@@ -264,7 +286,8 @@ struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
unsigned u64s = fs_usage_u64s(c);
struct bch_dev *ca;
unsigned i, u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@@ -275,6 +298,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
(u64 __percpu *) c->usage[idx], u64s);
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL) {
u64s = dev_usage_u64s();
acc_u64s_percpu((u64 *) ca->usage_base,
(u64 __percpu *) ca->usage[idx], u64s);
percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
}
rcu_read_unlock();
write_seqcount_end(&c->usage_lock);
preempt_enable();
}
@@ -459,14 +492,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
bool gc)
u64 journal_seq, bool gc)
{
struct bch_dev_usage *u;
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
u = this_cpu_ptr(ca->usage[gc]);
u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
account_bucket(fs_usage, u, bucket_type(old),
@@ -493,31 +526,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
__flatten
void bch2_dev_usage_from_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket_mark old = { .v.counter = 0 };
struct bucket_array *buckets;
struct bucket *g;
unsigned i;
int cpu;
c->usage_base->hidden = 0;
for_each_member_device(ca, c, i) {
for_each_possible_cpu(cpu)
memset(per_cpu_ptr(ca->usage[0], cpu), 0,
sizeof(*ca->usage[0]));
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
bch2_dev_usage_update(c, ca, c->usage_base,
old, g->mark, false);
}
}
static inline int update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
@@ -656,7 +664,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
new.owned_by_allocator = owned_by_allocator;
}));
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
/*
* XXX: this is wrong, this means we'll be doing updates to the percpu
* buckets_alloc counter that don't have an open journal buffer and
* we'll race with the machinery that accumulates that to ca->usage_base
*/
bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
@@ -720,7 +733,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
}
}));
bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
@@ -785,7 +798,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (c)
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
old, new, gc);
old, new, 0, gc);
return 0;
}
@@ -966,7 +979,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
return 0;
}
@@ -1033,7 +1046,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
@@ -2389,13 +2402,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage[0]);
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
unsigned i;
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -ENOMEM;
for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -ENOMEM;
}
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
@@ -162,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
void bch2_dev_usage_from_buckets(struct bch_fs *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@@ -207,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
READ_ONCE(c->replicas.nr);
}
static inline unsigned dev_usage_u64s(void)
{
return sizeof(struct bch_dev_usage) / sizeof(u64);
}
void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *);
struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *);
......
@@ -452,6 +452,43 @@ static int journal_entry_validate_clock(struct bch_fs *c,
return ret;
}
static int journal_entry_validate_dev_usage(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
int write)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
unsigned dev;
int ret = 0;
if (journal_entry_err_on(bytes < expected,
c, "invalid journal entry dev usage: bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
c, "invalid journal entry dev usage: bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(u->pad,
c, "invalid journal entry dev usage: bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
......
@@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
case BCH_JSET_ENTRY_dev_usage: {
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
sizeof(struct jset_entry_dev_usage_type);
unsigned i;
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
for (i = 0; i < nr_types; i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
}
break;
}
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
......
@@ -986,7 +986,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
unsigned i;
struct bch_dev *ca;
unsigned i, dev;
percpu_down_read(&c->mark_lock);
@@ -1041,6 +1042,25 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
"embedded variable length struct");
}
for_each_member_device(ca, c, dev) {
unsigned b = sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
struct jset_entry_dev_usage *u =
container_of(jset_entry_init(end, b),
struct jset_entry_dev_usage, entry);
u->entry.type = BCH_JSET_ENTRY_dev_usage;
u->dev = cpu_to_le32(dev);
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
for (i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
}
}
percpu_up_read(&c->mark_lock);
for (i = 0; i < 2; i++) {
......
@@ -155,6 +155,22 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
return c;
}
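/*
 * One dev_usage journal entry is written per device, so the journal
 * entry reservation must be recomputed whenever a device is added or
 * removed.
 */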
static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i, nr = 0, u64s =
(sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR) /
sizeof(u64);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL)
nr++;
rcu_read_unlock();
bch2_journal_entry_res_resize(&c->journal,
&c->dev_usage_journal_res, u64s * nr);
}
/* Filesystem RO/RW: */
/*
@@ -780,6 +796,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
bch2_dev_usage_journal_reserve(c);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@@ -1516,6 +1534,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@@ -1525,19 +1545,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
return ret;
}
static void dev_usage_clear(struct bch_dev *ca)
{
struct bucket_array *buckets;
percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
up_read(&ca->bucket_lock);
}
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
@@ -1595,8 +1602,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
dev_usage_clear(ca);
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1650,6 +1655,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
bch2_dev_usage_journal_reserve(c);
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)
......