bcachefs: Disk space accounting rewrite

Main part of the disk accounting rewrite. This is a wholesale rewrite of the existing disk space accounting, which relies on percepu counters that are sharded by journal buffer, and rolled up and added to each journal write. With the new scheme, every set of counters is a distinct key in the accounting btree; this fixes scaling limitations of the old scheme, where counters took up space in each journal entry and required multiple percpu counters. Now, in memory accounting requires a single set of percpu counters - not multiple for each in flight journal buffer - and in the future we'll probably also have counters that don't use in memory percpu counters, they're not strictly required. An accounting update is now a normal btree update, using the btree write buffer path. At transaction commit time, we apply accounting updates to the in memory counters, which are percpu counters indexed in an eytzinger tree by the accounting key. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

bcachefs: Disk space accounting rewrite
Main part of the disk accounting rewrite. This is a wholesale rewrite of the existing disk space accounting, which relies on percepu counters that are sharded by journal buffer, and rolled up and added to each journal write. With the new scheme, every set of counters is a distinct key in the accounting btree; this fixes scaling limitations of the old scheme, where counters took up space in each journal entry and required multiple percpu counters. Now, in memory accounting requires a single set of percpu counters - not multiple for each in flight journal buffer - and in the future we'll probably also have counters that don't use in memory percpu counters, they're not strictly required. An accounting update is now a normal btree update, using the btree write buffer path. At transaction commit time, we apply accounting updates to the in memory counters, which are percpu counters indexed in an eytzinger tree by the accounting key. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
1d16c605 · Kent Overstreet · 5d9667d1 · 1d16c605 · 1d16c605 · 1d16c605
Commit 1d16c605 authored Nov 09, 2023 by Kent Overstreet
25 changed files
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -15,6 +15,7 @@
 #include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
+#include "disk_accounting.h"
 #include "ec.h"
 #include "error.h"
 #include "lru.h"
@@ -760,6 +761,61 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
 	return ret;
 }

+static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
+						    enum bch_data_type data_type,
+						    s64 delta_buckets,
+						    s64 delta_sectors,
+						    s64 delta_fragmented, unsigned flags)
+{
+	struct disk_accounting_pos acc = {
+		.type = BCH_DISK_ACCOUNTING_dev_data_type,
+		.dev_data_type.dev		= ca->dev_idx,
+		.dev_data_type.data_type	= data_type,
+	};
+	s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
+
+	return bch2_disk_accounting_mod(trans, &acc, d, 3);
+}
+
+int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
+				   const struct bch_alloc_v4 *old,
+				   const struct bch_alloc_v4 *new,
+				   unsigned flags)
+{
+	s64 old_sectors = bch2_bucket_sectors(*old);
+	s64 new_sectors = bch2_bucket_sectors(*new);
+	if (old->data_type != new->data_type) {
+		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+				 1,  new_sectors,  bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
+			  bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
+				-1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
+		if (ret)
+			return ret;
+	} else if (old_sectors != new_sectors) {
+		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+					 0,
+					 new_sectors - old_sectors,
+					 bch2_bucket_sectors_fragmented(ca, *new) -
+					 bch2_bucket_sectors_fragmented(ca, *old), flags);
+		if (ret)
+			return ret;
+	}
+
+	s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
+	s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
+	if (old_unstriped != new_unstriped) {
+		int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
+					 !!new_unstriped - !!old_unstriped,
+					 new_unstriped - old_unstriped,
+					 0,
+					 flags);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 int bch2_trigger_alloc(struct btree_trans *trans,
 		       enum btree_id btree, unsigned level,
 		       struct bkey_s_c old, struct bkey_s new,
@@ -835,18 +891,17 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 				goto err;
 		}

-		/*
-		 * need to know if we're getting called from the invalidate path or
-		 * not:
-		 */
-
 		if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
 		    old_a->cached_sectors) {
-			ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
+			ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
 					 -((s64) old_a->cached_sectors));
 			if (ret)
 				goto err;
 		}
+
+		ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
+		if (ret)
+			goto err;
 	}

 	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
@@ -886,19 +941,17 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 			}
 		}

-		percpu_down_read(&c->mark_lock);
 		if (new_a->gen != old_a->gen) {
+			rcu_read_lock();
 			u8 *gen = bucket_gen(ca, new.k->p.offset);
 			if (unlikely(!gen)) {
-				percpu_up_read(&c->mark_lock);
+				rcu_read_unlock();
 				goto invalid_bucket;
 			}
 			*gen = new_a->gen;
+			rcu_read_unlock();
 		}

-		bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
-		percpu_up_read(&c->mark_lock);
-
 #define eval_state(_a, expr)		({ const struct bch_alloc_v4 *a = _a; expr; })
 #define statechange(expr)		!eval_state(old_a, expr) && eval_state(new_a, expr)
 #define bucket_flushed(a)		(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
@@ -946,6 +999,8 @@ int bch2_trigger_alloc(struct btree_trans *trans,

 		bucket_unlock(g);
 		percpu_up_read(&c->mark_lock);
+
+		bch2_dev_usage_update(c, ca, old_a, new_a);
 	}
 err:
 	printbuf_exit(&buf);

--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -82,25 +82,39 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
 		bucket_data_type(bucket) != bucket_data_type(ptr);
 }

-static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a)
+static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
 {
 	return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
 }

-static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
+static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
 {
 	return a.stripe_sectors + a.dirty_sectors;
 }

-static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
+static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
+{
+	return a.data_type == BCH_DATA_cached
+		? a.cached_sectors
+		: bch2_bucket_sectors_dirty(a);
+}
+
+static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
 						 struct bch_alloc_v4 a)
 {
-	int d = bch2_bucket_sectors_dirty(a);
+	int d = bch2_bucket_sectors(a);
+
+	return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
+static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
+{
+	int d = a.stripe_sectors + a.dirty_sectors;

 	return d ? max(0, ca->mi.bucket_size - d) : 0;
 }

-static inline unsigned bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
+static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
 {
 	return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
 }
@@ -277,6 +291,9 @@ static inline bool bkey_is_alloc(const struct bkey *k)

 int bch2_alloc_read(struct bch_fs *);

+int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
+				   const struct bch_alloc_v4 *,
+				   const struct bch_alloc_v4 *, unsigned);
 int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
 		       struct bkey_s_c, struct bkey_s,
 		       enum btree_iter_update_trigger_flags);

--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -205,6 +205,7 @@
 #include <linux/zstd.h>

 #include "bcachefs_format.h"
+#include "disk_accounting_types.h"
 #include "errcode.h"
 #include "fifo.h"
 #include "nocow_locking_types.h"
@@ -672,8 +673,6 @@ struct btree_trans_buf {
 	struct btree_trans	*trans;
 };

-#define REPLICAS_DELTA_LIST_MAX	(1U << 16)
-
 #define BCACHEFS_ROOT_SUBVOL_INUM					\
 	((subvol_inum) { BCACHEFS_ROOT_SUBVOL,	BCACHEFS_ROOT_INO })

@@ -743,10 +742,11 @@ struct bch_fs {

 	struct bch_dev __rcu	*devs[BCH_SB_MEMBERS_MAX];

+	struct bch_accounting_mem accounting;
+
 	struct bch_replicas_cpu replicas;
 	struct bch_replicas_cpu replicas_gc;
 	struct mutex		replicas_gc_lock;
-	mempool_t		replicas_delta_pool;

 	struct journal_entry_res btree_root_journal_res;
 	struct journal_entry_res replicas_journal_res;

--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1138,7 +1138,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e)
 	switch (e->type) {
 	case BCH_JSET_ENTRY_btree_keys:
 	case BCH_JSET_ENTRY_btree_root:
-	case BCH_JSET_ENTRY_overwrite:
 	case BCH_JSET_ENTRY_write_buffer_keys:
 		return true;
 	}

--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -251,10 +251,15 @@ struct bch_replicas_usage {
 	struct bch_replicas_entry_v1 r;
 } __packed;

+static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
+{
+	return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
+}
+
 static inline struct bch_replicas_usage *
 replicas_usage_next(struct bch_replicas_usage *u)
 {
-	return (void *) u + replicas_entry_bytes(&u->r) + 8;
+	return (void *) u + replicas_usage_bytes(u);
 }

 /*

--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -583,7 +583,8 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 		BUG_ON(bch2_journal_seq_verify &&
 		       k.k->version.lo > atomic64_read(&c->journal.seq));

-		if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
+		if (fsck_err_on(btree_id != BTREE_ID_accounting &&
+				k.k->version.lo > atomic64_read(&c->key_version), c,
 				bkey_version_in_future,
 				"key version number higher than recorded %llu\n  %s",
 				atomic64_read(&c->key_version),
@@ -915,7 +916,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,

 	if (gc.data_type != old_gc.data_type ||
 	    gc.dirty_sectors != old_gc.dirty_sectors)
-		bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
+		bch2_dev_usage_update(c, ca, &old_gc, &gc);
 	percpu_up_read(&c->mark_lock);

 	gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca);

--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -3240,15 +3240,6 @@ void bch2_trans_put(struct btree_trans *trans)
 		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
 	}

-	if (trans->fs_usage_deltas) {
-		if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
-		    REPLICAS_DELTA_LIST_MAX)
-			mempool_free(trans->fs_usage_deltas,
-				     &c->replicas_delta_pool);
-		else
-			kfree(trans->fs_usage_deltas);
-	}
-
 	if (unlikely(trans->journal_replay_not_finished))
 		bch2_journal_keys_put(c);


--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -10,6 +10,7 @@
 #include "btree_update_interior.h"
 #include "btree_write_buffer.h"
 #include "buckets.h"
+#include "disk_accounting.h"
 #include "errcode.h"
 #include "error.h"
 #include "journal.h"
@@ -620,6 +621,14 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
 	return 0;
 }

+static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
+{
+	return (struct bversion) {
+		.hi = res->seq >> 32,
+		.lo = (res->seq << 32) | (res->offset + offset),
+	};
+}
+
 static inline int
 bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 			       struct btree_insert_entry **stopped_at,
@@ -628,7 +637,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 	struct bch_fs *c = trans->c;
 	struct btree_trans_commit_hook *h;
 	unsigned u64s = 0;
-	int ret;
+	int ret = 0;

 	bch2_trans_verify_not_unlocked(trans);
 	bch2_trans_verify_not_in_restart(trans);
@@ -693,21 +702,38 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 				i->k->k.version = MAX_VERSION;
 	}

-	if (trans->fs_usage_deltas &&
-	    bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
-		return -BCH_ERR_btree_insert_need_mark_replicas;
-
-	/* XXX: we only want to run this if deltas are nonzero */
-	bch2_trans_account_disk_usage_change(trans);
-
 	h = trans->hooks;
 	while (h) {
 		ret = h->fn(trans, h);
 		if (ret)
-			goto revert_fs_usage;
+			return ret;
 		h = h->next;
 	}

+	struct jset_entry *entry = trans->journal_entries;
+
+	if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+		percpu_down_read(&c->mark_lock);
+
+		for (entry = trans->journal_entries;
+		     entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+		     entry = vstruct_next(entry))
+			if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) {
+				struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
+
+				a->k.version = journal_pos_to_bversion(&trans->journal_res,
+								(u64 *) entry - (u64 *) trans->journal_entries);
+				BUG_ON(bversion_zero(a->k.version));
+				ret = bch2_accounting_mem_mod(trans, accounting_i_to_s_c(a));
+				if (ret)
+					goto revert_fs_usage;
+			}
+		percpu_up_read(&c->mark_lock);
+
+		/* XXX: we only want to run this if deltas are nonzero */
+		bch2_trans_account_disk_usage_change(trans);
+	}
+
 	trans_for_each_update(trans, i)
 		if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
 			ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
@@ -776,10 +802,20 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,

 	return 0;
 fatal_err:
-	bch2_fatal_error(c);
+	bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
+	percpu_down_read(&c->mark_lock);
 revert_fs_usage:
-	if (trans->fs_usage_deltas)
-		bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
+	for (struct jset_entry *entry2 = trans->journal_entries;
+	     entry2 != entry;
+	     entry2 = vstruct_next(entry2))
+		if (jset_entry_is_key(entry2) && entry2->start->k.type == KEY_TYPE_accounting) {
+			struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
+
+			bch2_accounting_neg(a);
+			bch2_accounting_mem_mod(trans, a.c);
+			bch2_accounting_neg(a);
+		}
+	percpu_up_read(&c->mark_lock);
 	return ret;
 }

@@ -929,7 +965,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 		break;
 	case -BCH_ERR_btree_insert_need_mark_replicas:
 		ret = drop_locks_do(trans,
-			bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
+			bch2_accounting_update_sb(trans));
 		break;
 	case -BCH_ERR_journal_res_get_blocked:
 		/*
@@ -1033,8 +1069,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	    !trans->journal_entries_u64s)
 		goto out_reset;

-	memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-
 	ret = bch2_trans_commit_run_triggers(trans);
 	if (ret)
 		goto out_reset;
@@ -1131,6 +1165,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	bch2_trans_verify_not_in_restart(trans);
 	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
 		memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+	memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));

 	ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);


--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -523,7 +523,6 @@ struct btree_trans {

 	unsigned		journal_u64s;
 	unsigned		extra_disk_res; /* XXX kill */
-	struct replicas_delta_list *fs_usage_deltas;

 	/* Entries before this are zeroed out on every bch2_trans_get() call */


--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -29,6 +29,7 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
 			"pin journal entry referred to by trans->journal_res.seq")	\
 	x(journal_reclaim, "operation required for journal reclaim; may return error"	\
 			"instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+	x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")

 enum __bch_trans_commit_flags {
 	/* First bits for bch_watermark: */
@@ -207,14 +208,6 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
 	trans->journal_entries_u64s	= 0;
 	trans->hooks			= NULL;
 	trans->extra_disk_res		= 0;
-
-	if (trans->fs_usage_deltas) {
-		trans->fs_usage_deltas->used = 0;
-		memset((void *) trans->fs_usage_deltas +
-		       offsetof(struct replicas_delta_list, memset_start), 0,
-		       (void *) &trans->fs_usage_deltas->memset_end -
-		       (void *) &trans->fs_usage_deltas->memset_start);
-	}
 }

 static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,

--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -212,7 +212,6 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
 	return ret;
 }

-void bch2_dev_usage_init(struct bch_dev *);
 void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);

 static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
@@ -315,29 +314,9 @@ bch2_fs_usage_read_short(struct bch_fs *);

 void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
 			   const struct bch_alloc_v4 *,
-			   const struct bch_alloc_v4 *, u64, bool);
-
-/* key/bucket marking: */
-
-static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
-						unsigned journal_seq,
-						bool gc)
-{
-	percpu_rwsem_assert_held(&c->mark_lock);
-	BUG_ON(!gc && !journal_seq);
-
-	return this_cpu_ptr(gc
-			    ? c->usage_gc
-			    : c->usage[journal_seq & JOURNAL_BUF_MASK]);
-}
-
+			   const struct bch_alloc_v4 *);
 int bch2_update_replicas(struct bch_fs *, struct bkey_s_c,
-			 struct bch_replicas_entry_v1 *, s64,
-			 unsigned, bool);
-int bch2_update_replicas_list(struct btree_trans *,
 			 struct bch_replicas_entry_v1 *, s64);
-int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
-int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);

 void bch2_fs_usage_initialize(struct bch_fs *);

@@ -369,9 +348,6 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,

 void bch2_trans_account_disk_usage_change(struct btree_trans *);

-void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
-int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
-
 int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
 				    enum bch_data_type, unsigned,
 				    enum btree_iter_update_trigger_flags);

--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -2,11 +2,32 @@
 #ifndef _BCACHEFS_DISK_ACCOUNTING_H
 #define _BCACHEFS_DISK_ACCOUNTING_H

+#include "eytzinger.h"
+
+static inline void bch2_u64s_neg(u64 *v, unsigned nr)
+{
+	for (unsigned i = 0; i < nr; i++)
+		v[i] = -v[i];
+}
+
 static inline unsigned bch2_accounting_counters(const struct bkey *k)
 {
 	return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
 }

+static inline void bch2_accounting_neg(struct bkey_s_accounting a)
+{
+	bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
+}
+
+static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
+{
+	for (unsigned i = 0;  i < bch2_accounting_counters(a.k); i++)
+		if (a.v->d[i])
+			return false;
+	return true;
+}
+
 static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
 					      struct bkey_s_c_accounting src)
 {
@@ -18,6 +39,26 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
 		dst->k.version = src.k->version;
 }

+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
+					      enum bch_data_type data_type,
+					      s64 sectors)
+{
+	switch (data_type) {
+	case BCH_DATA_btree:
+		fs_usage->btree		+= sectors;
+		break;
+	case BCH_DATA_user:
+	case BCH_DATA_parity:
+		fs_usage->data		+= sectors;
+		break;
+	case BCH_DATA_cached:
+		fs_usage->cached	+= sectors;
+		break;
+	default:
+		break;
+	}
+}
+
 static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
 {
 	acc->_pad = p;
@@ -36,6 +77,12 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos
 	return ret;
 }

+int bch2_disk_accounting_mod(struct btree_trans *,
+			     struct disk_accounting_pos *,
+			     s64 *, unsigned);
+int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
+				unsigned dev, s64 sectors);
+
 int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c,
 			    enum bch_validate_flags, struct printbuf *);
 void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
@@ -49,4 +96,88 @@ void bch2_accounting_swab(struct bkey_s);
 	.min_val_size	= 8,				\
 })

+int bch2_accounting_update_sb(struct btree_trans *);
+
+static inline int accounting_pos_cmp(const void *_l, const void *_r)
+{
+	const struct bpos *l = _l, *r = _r;
+
+	return bpos_cmp(*l, *r);
+}
+
+int bch2_accounting_mem_mod_slowpath(struct bch_fs *, struct bkey_s_c_accounting);
+
+static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a)
+{
+	struct bch_accounting_mem *acc = &c->accounting;
+	unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+				       accounting_pos_cmp, &a.k->p);
+	if (unlikely(idx >= acc->k.nr))
+		return bch2_accounting_mem_mod_slowpath(c, a);
+
+	unsigned offset = acc->k.data[idx].offset;
+
+	EBUG_ON(bch2_accounting_counters(a.k) != acc->k.data[idx].nr_counters);
+
+	for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+		this_cpu_add(acc->v[offset + i], a.v->d[i]);
+	return 0;
+}
+
+/*
+ * Update in memory counters so they match the btree update we're doing; called
+ * from transaction commit path
+ */
+static inline int bch2_accounting_mem_mod(struct btree_trans *trans, struct
+					  bkey_s_c_accounting a)
+{
+	struct disk_accounting_pos acc_k;
+	bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+
+	switch (acc_k.type) {
+	case BCH_DISK_ACCOUNTING_persistent_reserved:
+		trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
+		break;
+	case BCH_DISK_ACCOUNTING_replicas:
+		fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
+		break;
+	}
+	return __bch2_accounting_mem_mod(trans->c, a);
+}
+
+static inline void bch2_accounting_mem_read_counters(struct bch_fs *c,
+						     unsigned idx,
+						     u64 *v, unsigned nr)
+{
+	memset(v, 0, sizeof(*v) * nr);
+
+	struct bch_accounting_mem *acc = &c->accounting;
+	if (unlikely(idx >= acc->k.nr))
+		return;
+
+	unsigned offset = acc->k.data[idx].offset;
+	nr = min_t(unsigned, nr, acc->k.data[idx].nr_counters);
+
+	for (unsigned i = 0; i < nr; i++)
+		v[i] = percpu_u64_get(acc->v + offset + i);
+}
+
+static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
+					    u64 *v, unsigned nr)
+{
+	struct bch_accounting_mem *acc = &c->accounting;
+	unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+				       accounting_pos_cmp, &p);
+
+	bch2_accounting_mem_read_counters(c, idx, v, nr);
+}
+
+int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
+
+int bch2_accounting_read(struct bch_fs *);
+
+int bch2_dev_usage_remove(struct bch_fs *, unsigned);
+int bch2_dev_usage_init(struct bch_dev *);
+void bch2_fs_accounting_exit(struct bch_fs *);
+
 #endif /* _BCACHEFS_DISK_ACCOUNTING_H */
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -99,8 +99,7 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
 	x(nr_inodes,		0)		\
 	x(persistent_reserved,	1)		\
 	x(replicas,		2)		\
-	x(dev_data_type,	3)		\
-	x(dev_stripe_buckets,	4)
+	x(dev_data_type,	3)

 enum disk_accounting_type {
 #define x(f, nr)	BCH_DISK_ACCOUNTING_##f	= nr,

--- a/fs/bcachefs/disk_accounting_types.h
+++ b/fs/bcachefs/disk_accounting_types.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
+#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
+
+#include "darray.h"
+
+struct accounting_pos_offset {
+	struct bpos				pos;
+	struct bversion				version;
+	u32					offset:24,
+						nr_counters:8;
+};
+
+struct bch_accounting_mem {
+	DARRAY(struct accounting_pos_offset)	k;
+	u64 __percpu				*v;
+	unsigned				nr_counters;
+};
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -13,6 +13,7 @@
 #include "btree_write_buffer.h"
 #include "buckets.h"
 #include "checksum.h"
+#include "disk_accounting.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
@@ -302,7 +303,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
 		ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
 		if (!ret) {
 			alloc_to_bucket(g, new);
-			bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+			bch2_dev_usage_update(c, ca, &old, &new);
 		}
 		bucket_unlock(g);
 err_unlock:
@@ -384,21 +385,25 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 			new_s->nr_redundant	!= old_s->nr_redundant));

 		if (new_s) {
-			s64 sectors = le16_to_cpu(new_s->sectors);
+			s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;

-			struct bch_replicas_padded r;
-			bch2_bkey_to_replicas(&r.e, new);
-			int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+			struct disk_accounting_pos acc = {
+				.type = BCH_DISK_ACCOUNTING_replicas,
+			};
+			bch2_bkey_to_replicas(&acc.replicas, new);
+			int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1);
 			if (ret)
 				return ret;
 		}

 		if (old_s) {
-			s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+			s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;

-			struct bch_replicas_padded r;
-			bch2_bkey_to_replicas(&r.e, old);
-			int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+			struct disk_accounting_pos acc = {
+				.type = BCH_DISK_ACCOUNTING_replicas,
+			};
+			bch2_bkey_to_replicas(&acc.replicas, old);
+			int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1);
 			if (ret)
 				return ret;
 		}
@@ -481,8 +486,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 			return ret;

 		ret = bch2_update_replicas(c, new, &m->r.e,
-				      ((s64) m->sectors * m->nr_redundant),
-				      0, true);
+				      ((s64) m->sectors * m->nr_redundant));
 		if (ret) {
 			struct printbuf buf = PRINTBUF;


--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -8,6 +8,7 @@
 #include "buckets.h"
 #include "compress.h"
 #include "dirent.h"
+#include "disk_accounting.h"
 #include "error.h"
 #include "extents.h"
 #include "extent_update.h"
@@ -603,11 +604,13 @@ int bch2_trigger_inode(struct btree_trans *trans,

 	if (flags & BTREE_TRIGGER_transactional) {
 		if (nr) {
-			int ret = bch2_replicas_deltas_realloc(trans, 0);
+			struct disk_accounting_pos acc = {
+				.type = BCH_DISK_ACCOUNTING_nr_inodes
+			};
+
+			int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1);
 			if (ret)
 				return ret;
-
-			trans->fs_usage_deltas->nr_inodes += nr;
 		}

 		bool old_deleted = bkey_is_deleted_inode(old);

--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -139,8 +139,6 @@ static void replay_now_at(struct journal *j, u64 seq)
 static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
 					      struct journal_key *k)
 {
-	struct journal_keys *keys = &trans->c->journal_keys;
-
 	struct btree_iter iter;
 	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
 				  BTREE_MAX_DEPTH, k->level,
@@ -282,6 +280,7 @@ int bch2_journal_replay(struct bch_fs *c)
 		ret = commit_do(trans, NULL, NULL,
 				BCH_TRANS_COMMIT_no_enospc|
 				BCH_TRANS_COMMIT_journal_reclaim|
+				BCH_TRANS_COMMIT_skip_accounting_apply|
 				BCH_TRANS_COMMIT_no_journal_res,
 			     bch2_journal_replay_accounting_key(trans, k));
 		if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
@@ -312,6 +311,7 @@ int bch2_journal_replay(struct bch_fs *c)
 			commit_do(trans, NULL, NULL,
 				  BCH_TRANS_COMMIT_no_enospc|
 				  BCH_TRANS_COMMIT_journal_reclaim|
+				  BCH_TRANS_COMMIT_skip_accounting_apply|
 				  (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
 			     bch2_journal_replay_key(trans, k));
 		BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
@@ -342,6 +342,7 @@ int bch2_journal_replay(struct bch_fs *c)

 		ret = commit_do(trans, NULL, NULL,
 				BCH_TRANS_COMMIT_no_enospc|
+				BCH_TRANS_COMMIT_skip_accounting_apply|
 				(!k->allocated
 				 ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
 				 : 0),
@@ -1050,9 +1051,6 @@ int bch2_fs_initialize(struct bch_fs *c)
 	for (unsigned i = 0; i < BTREE_ID_NR; i++)
 		bch2_btree_root_alloc_fake(c, i, 0);

-	for_each_member_device(c, ca)
-		bch2_dev_usage_init(ca);
-
 	ret = bch2_fs_journal_alloc(c);
 	if (ret)
 		goto err;
@@ -1069,6 +1067,15 @@ int bch2_fs_initialize(struct bch_fs *c)
 	if (ret)
 		goto err;

+	for_each_member_device(c, ca) {
+		ret = bch2_dev_usage_init(ca);
+		bch_err_msg(c, ret, "initializing device usage");
+		if (ret) {
+			bch2_dev_put(ca);
+			goto err;
+		}
+	}
+
 	/*
 	 * Write out the superblock and journal buckets, now that we can do
 	 * btree updates

--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -5,6 +5,7 @@
 #include "backpointers.h"
 #include "btree_gc.h"
 #include "btree_node_scan.h"
+#include "disk_accounting.h"
 #include "ec.h"
 #include "fsck.h"
 #include "inode.h"

--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -15,6 +15,7 @@
 #define BCH_RECOVERY_PASSES()							\
 	x(scan_for_btree_nodes,			37, 0)				\
 	x(check_topology,			 4, 0)				\
+	x(accounting_read,			39, PASS_ALWAYS)		\
 	x(alloc_read,				 0, PASS_ALWAYS)		\
 	x(stripes_read,				 1, PASS_ALWAYS)		\
 	x(initialize_subvolumes,		 2, 0)				\

--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -243,23 +243,25 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
 	return __replicas_entry_idx(r, search) >= 0;
 }

-bool bch2_replicas_marked(struct bch_fs *c,
+bool bch2_replicas_marked_locked(struct bch_fs *c,
 			  struct bch_replicas_entry_v1 *search)
 {
-	bool marked;
-
-	if (!search->nr_devs)
-		return true;
-
 	verify_replicas_entry(search);

-	percpu_down_read(&c->mark_lock);
-	marked = __replicas_has_entry(&c->replicas, search) &&
+	return !search->nr_devs ||
+		(__replicas_has_entry(&c->replicas, search) &&
 		 (likely((!c->replicas_gc.entries)) ||
-		 __replicas_has_entry(&c->replicas_gc, search));
+		  __replicas_has_entry(&c->replicas_gc, search)));
+}
+
+bool bch2_replicas_marked(struct bch_fs *c,
+			  struct bch_replicas_entry_v1 *search)
+{
+	percpu_down_read(&c->mark_lock);
+	bool ret = bch2_replicas_marked_locked(c, search);
 	percpu_up_read(&c->mark_lock);

-	return marked;
+	return ret;
 }

 static void __replicas_table_update(struct bch_fs_usage *dst,
@@ -457,20 +459,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
 		? 0 : bch2_mark_replicas_slowpath(c, r);
 }

-/* replicas delta list: */
-
-int bch2_replicas_delta_list_mark(struct bch_fs *c,
-				  struct replicas_delta_list *r)
-{
-	struct replicas_delta *d = r->d;
-	struct replicas_delta *top = (void *) r->d + r->used;
-	int ret = 0;
-
-	for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
-		ret = bch2_mark_replicas(c, &d->r);
-	return ret;
-}
-
 /*
 * Old replicas_gc mechanism: only used for journal replicas entries now, should
 * die at some point:
@@ -1046,8 +1034,6 @@ void bch2_fs_replicas_exit(struct bch_fs *c)
 	kfree(c->usage_base);
 	kfree(c->replicas.entries);
 	kfree(c->replicas_gc.entries);
-
-	mempool_exit(&c->replicas_delta_pool);
 }

 int bch2_fs_replicas_init(struct bch_fs *c)
@@ -1056,7 +1042,5 @@ int bch2_fs_replicas_init(struct bch_fs *c)
 			&c->replicas_journal_res,
 			reserve_journal_replicas(c, &c->replicas));

-	return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
-					 REPLICAS_DELTA_LIST_MAX) ?:
-		replicas_table_update(c, &c->replicas);
+	return replicas_table_update(c, &c->replicas);
 }
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -25,18 +25,13 @@ int bch2_replicas_entry_idx(struct bch_fs *,
 void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
 			      enum bch_data_type,
 			      struct bch_devs_list);
+
+bool bch2_replicas_marked_locked(struct bch_fs *,
+			  struct bch_replicas_entry_v1 *);
 bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
 int bch2_mark_replicas(struct bch_fs *,
 		       struct bch_replicas_entry_v1 *);

-static inline struct replicas_delta *
-replicas_delta_next(struct replicas_delta *d)
-{
-	return (void *) d + replicas_entry_bytes(&d->r) + 8;
-}
-
-int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
-
 void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);

 static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,

--- a/fs/bcachefs/replicas_types.h
+++ b/fs/bcachefs/replicas_types.h
@@ -8,20 +8,4 @@ struct bch_replicas_cpu {
 	struct bch_replicas_entry_v1 *entries;
 };

-struct replicas_delta {
-	s64			delta;
-	struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct replicas_delta_list {
-	unsigned		size;
-	unsigned		used;
-
-	struct			{} memset_start;
-	u64			nr_inodes;
-	u64			persistent_reserved[BCH_REPLICAS_MAX];
-	struct			{} memset_end;
-	struct replicas_delta	d[];
-};
-
 #endif /* _BCACHEFS_REPLICAS_TYPES_H */
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -25,6 +25,7 @@
 #include "clock.h"
 #include "compress.h"
 #include "debug.h"
+#include "disk_accounting.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "errcode.h"
@@ -536,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c)

 	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
 	bch2_free_pending_node_rewrites(c);
+	bch2_fs_accounting_exit(c);
 	bch2_fs_sb_errors_exit(c);
 	bch2_fs_counters_exit(c);
 	bch2_fs_snapshots_exit(c);
@@ -1615,7 +1617,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
 		bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
 					BTREE_TRIGGER_norun, NULL) ?:
 		bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
-					BTREE_TRIGGER_norun, NULL);
+					BTREE_TRIGGER_norun, NULL) ?:
+		bch2_dev_usage_remove(c, ca->dev_idx);
 	bch_err_msg(c, ret, "removing dev alloc info");
 	return ret;
 }
@@ -1652,6 +1655,16 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 	if (ret)
 		goto err;

+	/*
+	 * We need to flush the entire journal to get rid of keys that reference
+	 * the device being removed before removing the superblock entry
+	 */
+	bch2_journal_flush_all_pins(&c->journal);
+
+	/*
+	 * this is really just needed for the bch2_replicas_gc_(start|end)
+	 * calls, and could be cleaned up:
+	 */
 	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
 	bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
 	if (ret)
@@ -1694,17 +1707,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)

 	bch2_dev_free(ca);

-	/*
-	 * At this point the device object has been removed in-core, but the
-	 * on-disk journal might still refer to the device index via sb device
-	 * usage entries. Recovery fails if it sees usage information for an
-	 * invalid device. Flush journal pins to push the back of the journal
-	 * past now invalid device index references before we update the
-	 * superblock, but after the device object has been removed so any
-	 * further journal writes elide usage info for the device.
-	 */
-	bch2_journal_flush_all_pins(&c->journal);
-
 	/*
 	 * Free this device's slot in the bch_member array - all pointers to
 	 * this device must be gone:
@@ -1766,8 +1768,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 		goto err;
 	}

-	bch2_dev_usage_init(ca);
-
 	ret = __bch2_dev_attach_bdev(ca, &sb);
 	if (ret)
 		goto err;
@@ -1851,6 +1851,10 @@ int bch2_dev_add(struct bch_fs *c, const char *path)

 	bch2_dev_usage_journal_reserve(c);

+	ret = bch2_dev_usage_init(ca);
+	if (ret)
+		goto err_late;
+
 	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
 	bch_err_msg(ca, ret, "marking new superblock");
 	if (ret)
@@ -2021,15 +2025,18 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	mutex_unlock(&c->sb_lock);

 	if (ca->mi.freespace_initialized) {
-		ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+		struct disk_accounting_pos acc = {
+			.type = BCH_DISK_ACCOUNTING_dev_data_type,
+			.dev_data_type.dev = ca->dev_idx,
+			.dev_data_type.data_type = BCH_DATA_free,
+		};
+		u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
+
+		ret   = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets) ?:
+			bch2_trans_do(ca->fs, NULL, NULL, 0,
+				bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v)));
 		if (ret)
 			goto err;
-
-		/*
-		 * XXX: this is all wrong transactionally - we'll be able to do
-		 * this correctly after the disk space accounting rewrite
-		 */
-		ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
 	}

 	bch2_recalc_capacity(c);