Merge tag 'bcachefs-2024-08-24' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet: - assorted syzbot fixes - some upgrade fixes for old (pre 1.0) filesystems - fix for moving data off a device that was switched to durability=0 after data had been written to it. - nocow deadlock fix - fix for new rebalance_work accounting * tag 'bcachefs-2024-08-24' of git://evilpiepirate.org/bcachefs: (28 commits) bcachefs: Fix rebalance_work accounting bcachefs: Fix failure to flush moves before sleeping in copygc bcachefs: don't use rht_bucket() in btree_key_cache_scan() bcachefs: add missing inode_walker_exit() bcachefs: clear path->should_be_locked in bch2_btree_key_cache_drop() bcachefs: Fix double assignment in check_dirent_to_subvol() bcachefs: Fix refcounting in discard path bcachefs: Fix compat issue with old alloc_v4 keys bcachefs: Fix warning in bch2_fs_journal_stop() fs/super.c: improve get_tree() error message bcachefs: Fix missing validation in bch2_sb_journal_v2_validate() bcachefs: Fix replay_now_at() assert bcachefs: Fix locking in bch2_ioc_setlabel() bcachefs: fix failure to relock in btree_node_fill() bcachefs: fix failure to relock in bch2_btree_node_mem_alloc() bcachefs: unlock_long() before resort in journal replay bcachefs: fix missing bch2_err_str() bcachefs: fix time_stats_to_text() bcachefs: Fix bch2_bucket_gens_init() bcachefs: Fix bch2_trigger_alloc assert ...

Merge tag 'bcachefs-2024-08-24' of git://evilpiepirate.org/bcachefs
Pull bcachefs fixes from Kent Overstreet: - assorted syzbot fixes - some upgrade fixes for old (pre 1.0) filesystems - fix for moving data off a device that was switched to durability=0 after data had been written to it. - nocow deadlock fix - fix for new rebalance_work accounting * tag 'bcachefs-2024-08-24' of git://evilpiepirate.org/bcachefs: (28 commits) bcachefs: Fix rebalance_work accounting bcachefs: Fix failure to flush moves before sleeping in copygc bcachefs: don't use rht_bucket() in btree_key_cache_scan() bcachefs: add missing inode_walker_exit() bcachefs: clear path->should_be_locked in bch2_btree_key_cache_drop() bcachefs: Fix double assignment in check_dirent_to_subvol() bcachefs: Fix refcounting in discard path bcachefs: Fix compat issue with old alloc_v4 keys bcachefs: Fix warning in bch2_fs_journal_stop() fs/super.c: improve get_tree() error message bcachefs: Fix missing validation in bch2_sb_journal_v2_validate() bcachefs: Fix replay_now_at() assert bcachefs: Fix locking in bch2_ioc_setlabel() bcachefs: fix failure to relock in btree_node_fill() bcachefs: fix failure to relock in bch2_btree_node_mem_alloc() bcachefs: unlock_long() before resort in journal replay bcachefs: fix missing bch2_err_str() bcachefs: fix time_stats_to_text() bcachefs: Fix bch2_bucket_gens_init() bcachefs: Fix bch2_trigger_alloc assert ...
72bea05c · Linus Torvalds · 780bdc1b · 49aa7830 · 72bea05c · 72bea05c
Commit 72bea05c authored Aug 25, 2024 by Linus Torvalds
25 changed files
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -240,71 +240,73 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
 int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
 			   enum bch_validate_flags flags)
 {
-	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+	struct bch_alloc_v4 a;
 	int ret = 0;

-	bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k),
+	bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
+
+	bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
 			 c, alloc_v4_val_size_bad,
 			 "bad val size (%u > %zu)",
-			 alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
+			 alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));

-	bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
-			 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v),
+	bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
+			 BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
 			 c, alloc_v4_backpointers_start_bad,
 			 "invalid backpointers_start");

-	bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type,
+	bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
 			 c, alloc_key_data_type_bad,
 			 "invalid data type (got %u should be %u)",
-			 a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+			 a.data_type, alloc_data_type(a, a.data_type));

 	for (unsigned i = 0; i < 2; i++)
-		bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
+		bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
 				 c, alloc_key_io_time_bad,
 				 "invalid io_time[%s]: %llu, max %llu",
 				 i == READ ? "read" : "write",
-				 a.v->io_time[i], LRU_TIME_MAX);
+				 a.io_time[i], LRU_TIME_MAX);

-	unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
+	unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
 		offsetof(struct bch_alloc_v4, stripe_sectors)
-		? a.v->stripe_sectors
+		? a.stripe_sectors
 		: 0;

-	switch (a.v->data_type) {
+	switch (a.data_type) {
 	case BCH_DATA_free:
 	case BCH_DATA_need_gc_gens:
 	case BCH_DATA_need_discard:
 		bkey_fsck_err_on(stripe_sectors ||
-				 a.v->dirty_sectors ||
-				 a.v->cached_sectors ||
-				 a.v->stripe,
+				 a.dirty_sectors ||
+				 a.cached_sectors ||
+				 a.stripe,
 				 c, alloc_key_empty_but_have_data,
 				 "empty data type free but have data %u.%u.%u %u",
 				 stripe_sectors,
-				 a.v->dirty_sectors,
-				 a.v->cached_sectors,
-				 a.v->stripe);
+				 a.dirty_sectors,
+				 a.cached_sectors,
+				 a.stripe);
 		break;
 	case BCH_DATA_sb:
 	case BCH_DATA_journal:
 	case BCH_DATA_btree:
 	case BCH_DATA_user:
 	case BCH_DATA_parity:
-		bkey_fsck_err_on(!a.v->dirty_sectors &&
+		bkey_fsck_err_on(!a.dirty_sectors &&
 				 !stripe_sectors,
 				 c, alloc_key_dirty_sectors_0,
 				 "data_type %s but dirty_sectors==0",
-				 bch2_data_type_str(a.v->data_type));
+				 bch2_data_type_str(a.data_type));
 		break;
 	case BCH_DATA_cached:
-		bkey_fsck_err_on(!a.v->cached_sectors ||
-				 a.v->dirty_sectors ||
+		bkey_fsck_err_on(!a.cached_sectors ||
+				 a.dirty_sectors ||
 				 stripe_sectors ||
-				 a.v->stripe,
+				 a.stripe,
 				 c, alloc_key_cached_inconsistency,
 				 "data type inconsistency");

-		bkey_fsck_err_on(!a.v->io_time[READ] &&
+		bkey_fsck_err_on(!a.io_time[READ] &&
 				 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
 				 c, alloc_key_cached_but_read_time_zero,
 				 "cached bucket with read_time == 0");
@@ -556,7 +558,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
 		int ret2 = 0;

-		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
+		if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
 			ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
 				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 			if (ret2)
@@ -829,7 +831,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 	if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
 		new_a = bkey_s_to_alloc_v4(new).v;
 	} else {
-		BUG_ON(!(flags & BTREE_TRIGGER_gc));
+		BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));

 		struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
 		ret = PTR_ERR_OR_ZERO(new_ka);
@@ -1872,26 +1874,26 @@ static void bch2_do_discards_work(struct work_struct *work)
 	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
 			      bch2_err_str(ret));

-	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 	percpu_ref_put(&ca->io_ref);
+	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 }

 void bch2_dev_do_discards(struct bch_dev *ca)
 {
 	struct bch_fs *c = ca->fs;

-	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
 		return;

-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
-		goto put_ioref;
+	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+		goto put_write_ref;

 	if (queue_work(c->write_ref_wq, &ca->discard_work))
 		return;

-	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
-put_ioref:
 	percpu_ref_put(&ca->io_ref);
+put_write_ref:
+	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 }

 void bch2_do_discards(struct bch_fs *c)

--- a/fs/bcachefs/alloc_background_format.h
+++ b/fs/bcachefs/alloc_background_format.h
@@ -69,6 +69,7 @@ struct bch_alloc_v4 {
 	__u64			io_time[2];
 	__u32			stripe;
 	__u32			nr_external_backpointers;
+	/* end of fields in original version of alloc_v4 */
 	__u64			fragmentation_lru;
 	__u32			stripe_sectors;
 	__u32			pad;

--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -677,7 +677,8 @@ struct bch_sb_field_ext {
 	x(bucket_stripe_sectors,	BCH_VERSION(1,  8))		\
 	x(disk_accounting_v2,		BCH_VERSION(1,  9))		\
 	x(disk_accounting_v3,		BCH_VERSION(1, 10))		\
-	x(disk_accounting_inum,		BCH_VERSION(1, 11))
+	x(disk_accounting_inum,		BCH_VERSION(1, 11))		\
+	x(rebalance_work_acct_fix,	BCH_VERSION(1, 12))

 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,

--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 	return b;
 }

+void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
+{
+	mutex_lock(&c->btree_cache.lock);
+	list_move(&b->list, &c->btree_cache.freeable);
+	mutex_unlock(&c->btree_cache.lock);
+
+	six_unlock_write(&b->c.lock);
+	six_unlock_intent(&b->c.lock);
+}
+
 /* Btree in memory cache - hash table */

 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@@ -736,6 +746,13 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
 			       start_time);

 	memalloc_nofs_restore(flags);
+
+	int ret = bch2_trans_relock(trans);
+	if (unlikely(ret)) {
+		bch2_btree_node_to_freelist(c, b);
+		return ERR_PTR(ret);
+	}
+
 	return b;
 err:
 	mutex_lock(&bc->lock);
@@ -856,6 +873,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,

 		bch2_btree_node_read(trans, b, sync);

+		int ret = bch2_trans_relock(trans);
+		if (ret)
+			return ERR_PTR(ret);
+
 		if (!sync)
 			return NULL;

@@ -974,6 +995,10 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr

 		bch2_btree_node_wait_on_read(b);

+		ret = bch2_trans_relock(trans);
+		if (ret)
+			return ERR_PTR(ret);
+
 		/*
 		 * should_be_locked is not set on this path yet, so we need to
 		 * relock it specifically:

--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -12,6 +12,8 @@ struct btree_iter;

 void bch2_recalc_btree_reserve(struct bch_fs *);

+void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
+
 void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
 int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
 int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,

--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -569,6 +569,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
 	bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter,			\
 				       _btree_id, _pos, _flags, KEY_TYPE_##_type))

+#define bkey_val_copy(_dst_v, _src_k)					\
+do {									\
+	unsigned b = min_t(unsigned, sizeof(*_dst_v),			\
+			   bkey_val_bytes(_src_k.k));			\
+	memcpy(_dst_v, _src_k.v, b);					\
+	if (b < sizeof(*_dst_v))					\
+		memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b);	\
+} while (0)
+
 static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
 				unsigned btree_id, struct bpos pos,
 				unsigned flags, unsigned type,

--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -726,6 +726,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,

 	mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
 	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+	path->should_be_locked = false;
 }

 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
@@ -777,6 +778,20 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,

 	rcu_read_lock();
 	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+
+	/*
+	 * Scanning is expensive while a rehash is in progress - most elements
+	 * will be on the new hashtable, if it's in progress
+	 *
+	 * A rehash could still start while we're scanning - that's ok, we'll
+	 * still see most elements.
+	 */
+	if (unlikely(tbl->nest)) {
+		rcu_read_unlock();
+		srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+		return SHRINK_STOP;
+	}
+
 	if (bc->shrink_iter >= tbl->size)
 		bc->shrink_iter = 0;
 	start = bc->shrink_iter;
@@ -784,7 +799,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 	do {
 		struct rhash_head *pos, *next;

-		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+		pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);

 		while (!rht_is_a_nulls(pos)) {
 			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
@@ -865,12 +880,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 	while (atomic_long_read(&bc->nr_keys)) {
 		rcu_read_lock();
 		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-		if (tbl)
+		if (tbl) {
+			if (tbl->nest) {
+				/* wait for in progress rehash */
+				rcu_read_unlock();
+				mutex_lock(&bc->table.mutex);
+				mutex_unlock(&bc->table.mutex);
+				rcu_read_lock();
+				continue;
+			}
 			for (i = 0; i < tbl->size; i++)
-				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+				while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
+					ck = container_of(pos, struct bkey_cached, hash);
 					bkey_cached_evict(bc, ck);
 					list_add(&ck->list, &items);
 				}
+		}
 		rcu_read_unlock();
 	}


--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
 		: 0;
 	int ret;

+	b = bch2_btree_node_mem_alloc(trans, interior_node);
+	if (IS_ERR(b))
+		return b;
+
+	BUG_ON(b->ob.nr);
+
 	mutex_lock(&c->btree_reserve_cache_lock);
 	if (c->btree_reserve_cache_nr > nr_reserve) {
 		struct btree_alloc *a =
@@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
 		obs = a->ob;
 		bkey_copy(&tmp.k, &a->k);
 		mutex_unlock(&c->btree_reserve_cache_lock);
-		goto mem_alloc;
+		goto out;
 	}
 	mutex_unlock(&c->btree_reserve_cache_lock);
-
 retry:
 	ret = bch2_alloc_sectors_start_trans(trans,
 				      c->opts.metadata_target ?:
@@ -341,7 +346,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
 					  c->opts.metadata_replicas_required),
 				      watermark, 0, cl, &wp);
 	if (unlikely(ret))
-		return ERR_PTR(ret);
+		goto err;

 	if (wp->sectors_free < btree_sectors(c)) {
 		struct open_bucket *ob;
@@ -360,19 +365,16 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,

 	bch2_open_bucket_get(c, wp, &obs);
 	bch2_alloc_sectors_done(c, wp);
-mem_alloc:
-	b = bch2_btree_node_mem_alloc(trans, interior_node);
-	six_unlock_write(&b->c.lock);
-	six_unlock_intent(&b->c.lock);
-
-	/* we hold cannibalize_lock: */
-	BUG_ON(IS_ERR(b));
-	BUG_ON(b->ob.nr);
-
+out:
 	bkey_copy(&b->key, &tmp.k);
 	b->ob = obs;
+	six_unlock_write(&b->c.lock);
+	six_unlock_intent(&b->c.lock);

 	return b;
+err:
+	bch2_btree_node_to_freelist(c, b);
+	return ERR_PTR(ret);
 }

 static struct btree *bch2_btree_node_alloc(struct btree_update *as,
@@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
 		}

 		new_hash = bch2_btree_node_mem_alloc(trans, false);
+		ret = PTR_ERR_OR_ZERO(new_hash);
+		if (ret)
+			goto err;
 	}

 	path->intent_ref++;
@@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
 					   commit_flags, skip_triggers);
 	--path->intent_ref;

-	if (new_hash) {
-		mutex_lock(&c->btree_cache.lock);
-		list_move(&new_hash->list, &c->btree_cache.freeable);
-		mutex_unlock(&c->btree_cache.lock);
-
-		six_unlock_write(&new_hash->c.lock);
-		six_unlock_intent(&new_hash->c.lock);
-	}
+	if (new_hash)
+		bch2_btree_node_to_freelist(c, new_hash);
+err:
 	closure_sync(&cl);
 	bch2_btree_cache_cannibalize_unlock(trans);
 	return ret;
@@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
 	b = bch2_btree_node_mem_alloc(trans, false);
 	bch2_btree_cache_cannibalize_unlock(trans);

+	ret = PTR_ERR_OR_ZERO(b);
+	if (ret)
+		return ret;
+
 	set_btree_node_fake(b);
 	set_btree_node_need_rewrite(b);
 	b->c.level	= level;
@@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id

 void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
 {
-	bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level));
+	bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
 }

 static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)

--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -699,7 +699,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
 static int __trigger_extent(struct btree_trans *trans,
 			    enum btree_id btree_id, unsigned level,
 			    struct bkey_s_c k,
-			    enum btree_iter_update_trigger_flags flags)
+			    enum btree_iter_update_trigger_flags flags,
+			    s64 *replicas_sectors)
 {
 	bool gc = flags & BTREE_TRIGGER_gc;
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -708,7 +709,6 @@ static int __trigger_extent(struct btree_trans *trans,
 	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
 		? BCH_DATA_btree
 		: BCH_DATA_user;
-	s64 replicas_sectors = 0;
 	int ret = 0;

 	struct disk_accounting_pos acc_replicas_key = {
@@ -739,7 +739,7 @@ static int __trigger_extent(struct btree_trans *trans,
 			if (ret)
 				return ret;
 		} else if (!p.has_ec) {
-			replicas_sectors       += disk_sectors;
+			*replicas_sectors       += disk_sectors;
 			acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
 		} else {
 			ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -777,7 +777,7 @@ static int __trigger_extent(struct btree_trans *trans,
 	}

 	if (acc_replicas_key.replicas.nr_devs) {
-		ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
+		ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
 		if (ret)
 			return ret;
 	}
@@ -787,7 +787,7 @@ static int __trigger_extent(struct btree_trans *trans,
 			.type			= BCH_DISK_ACCOUNTING_snapshot,
 			.snapshot.id		= k.k->p.snapshot,
 		};
-		ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc);
+		ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
 		if (ret)
 			return ret;
 	}
@@ -807,7 +807,7 @@ static int __trigger_extent(struct btree_trans *trans,
 			.type		= BCH_DISK_ACCOUNTING_btree,
 			.btree.id	= btree_id,
 		};
-		ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc);
+		ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
 		if (ret)
 			return ret;
 	} else {
@@ -819,22 +819,13 @@ static int __trigger_extent(struct btree_trans *trans,
 		s64 v[3] = {
 			insert ? 1 : -1,
 			insert ? k.k->size : -((s64) k.k->size),
-			replicas_sectors,
+			*replicas_sectors,
 		};
 		ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
 		if (ret)
 			return ret;
 	}

-	if (bch2_bkey_rebalance_opts(k)) {
-		struct disk_accounting_pos acc = {
-			.type		= BCH_DISK_ACCOUNTING_rebalance_work,
-		};
-		ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc);
-		if (ret)
-			return ret;
-	}
-
 	return 0;
 }

@@ -843,6 +834,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
 			struct bkey_s_c old, struct bkey_s new,
 			enum btree_iter_update_trigger_flags flags)
 {
+	struct bch_fs *c = trans->c;
 	struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
 	struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
 	unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
@@ -858,21 +850,53 @@ int bch2_trigger_extent(struct btree_trans *trans,
 		    new_ptrs_bytes))
 		return 0;

-	if (flags & BTREE_TRIGGER_transactional) {
-		struct bch_fs *c = trans->c;
-		int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
-			  (int) bch2_bkey_needs_rebalance(c, old);
+	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+		s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
+
+		if (old.k->type) {
+			int ret = __trigger_extent(trans, btree, level, old,
+						   flags & ~BTREE_TRIGGER_insert,
+						   &old_replicas_sectors);
+			if (ret)
+				return ret;
+		}
+
+		if (new.k->type) {
+			int ret = __trigger_extent(trans, btree, level, new.s_c,
+						   flags & ~BTREE_TRIGGER_overwrite,
+						   &new_replicas_sectors);
+			if (ret)
+				return ret;
+		}
+
+		int need_rebalance_delta = 0;
+		s64 need_rebalance_sectors_delta = 0;
+
+		s64 s = bch2_bkey_sectors_need_rebalance(c, old);
+		need_rebalance_delta -= s != 0;
+		need_rebalance_sectors_delta -= s;

-		if (mod) {
+		s = bch2_bkey_sectors_need_rebalance(c, old);
+		need_rebalance_delta += s != 0;
+		need_rebalance_sectors_delta += s;
+
+		if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
 			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
-							      new.k->p, mod > 0);
+							  new.k->p, need_rebalance_delta > 0);
 			if (ret)
 				return ret;
 		}
-	}

-	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
-		return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
+		if (need_rebalance_sectors_delta) {
+			struct disk_accounting_pos acc = {
+				.type		= BCH_DISK_ACCOUNTING_rebalance_work,
+			};
+			int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
+							   flags & BTREE_TRIGGER_gc);
+			if (ret)
+				return ret;
+		}
+	}

 	return 0;
 }

--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -107,7 +107,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
 		nr_elements += t->d[i].journal_seq > flushed_seq;

 	new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
-
+realloc:
 	n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
 	if (!n) {
 		ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
@@ -118,6 +118,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
 	if (nr_rehashes_this_size == 3) {
 		new_bits++;
 		nr_rehashes_this_size = 0;
+		kvfree(n);
+		goto realloc;
 	}

 	nr_rehashes++;

--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -20,6 +20,76 @@
 #include "subvolume.h"
 #include "trace.h"

+static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+	bkey_for_each_ptr(ptrs, ptr)
+		bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
+}
+
+static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		if (!bch2_dev_tryget(c, ptr->dev)) {
+			bkey_for_each_ptr(ptrs, ptr2) {
+				if (ptr2 == ptr)
+					break;
+				bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
+			}
+			return false;
+		}
+	}
+	return true;
+}
+
+static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+		bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+	}
+}
+
+static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+		if (ctxt) {
+			bool locked;
+
+			move_ctxt_wait_event(ctxt,
+				(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
+				list_empty(&ctxt->ios));
+
+			if (!locked)
+				bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+		} else {
+			if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
+				bkey_for_each_ptr(ptrs, ptr2) {
+					if (ptr2 == ptr)
+						break;
+
+					bucket = PTR_BUCKET_POS(ca, ptr2);
+					bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+				}
+				return false;
+			}
+		}
+	}
+	return true;
+}
+
 static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
 {
 	if (trace_move_extent_finish_enabled()) {
@@ -355,17 +425,11 @@ void bch2_data_update_read_done(struct data_update *m,
 void bch2_data_update_exit(struct data_update *update)
 {
 	struct bch_fs *c = update->op.c;
-	struct bkey_ptrs_c ptrs =
-		bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
-
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-		if (c->opts.nocow_enabled)
-			bch2_bucket_nocow_unlock(&c->nocow_locks,
-						 PTR_BUCKET_POS(ca, ptr), 0);
-		bch2_dev_put(ca);
-	}
+	struct bkey_s_c k = bkey_i_to_s_c(update->k.k);

+	if (c->opts.nocow_enabled)
+		bkey_nocow_unlock(c, k);
+	bkey_put_dev_refs(c, k);
 	bch2_bkey_buf_exit(&update->k, c);
 	bch2_disk_reservation_put(c, &update->op.res);
 	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
@@ -475,6 +539,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
 	bch2_compression_opt_to_text(out, background_compression(*io_opts));
 	prt_newline(out);

+	prt_str(out, "opts.replicas:\t");
+	prt_u64(out, io_opts->data_replicas);
+
 	prt_str(out, "extra replicas:\t");
 	prt_u64(out, data_opts->extra_replicas);
 }
@@ -543,7 +610,6 @@ int bch2_data_update_init(struct btree_trans *trans,
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
 	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
-	unsigned ptrs_locked = 0;
 	int ret = 0;

 	/*
@@ -554,6 +620,15 @@ int bch2_data_update_init(struct btree_trans *trans,
 	if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
 		return -BCH_ERR_data_update_done;

+	if (!bkey_get_dev_refs(c, k))
+		return -BCH_ERR_data_update_done;
+
+	if (c->opts.nocow_enabled &&
+	    !bkey_nocow_lock(c, ctxt, k)) {
+		bkey_put_dev_refs(c, k);
+		return -BCH_ERR_nocow_lock_blocked;
+	}
+
 	bch2_bkey_buf_init(&m->k);
 	bch2_bkey_buf_reassemble(&m->k, c, k);
 	m->btree_id	= btree_id;
@@ -575,40 +650,24 @@ int bch2_data_update_init(struct btree_trans *trans,
 	m->op.compression_opt	= background_compression(io_opts);
 	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;

-	bkey_for_each_ptr(ptrs, ptr) {
-		if (!bch2_dev_tryget(c, ptr->dev)) {
-			bkey_for_each_ptr(ptrs, ptr2) {
-				if (ptr2 == ptr)
-					break;
-				bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
-			}
-			return -BCH_ERR_data_update_done;
-		}
-	}
-
 	unsigned durability_have = 0, durability_removing = 0;

 	i = 0;
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
-		struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
-		bool locked;
-
-		rcu_read_lock();
-		if (((1U << i) & m->data_opts.rewrite_ptrs)) {
-			BUG_ON(p.ptr.cached);
-
-			if (crc_is_compressed(p.crc))
-				reserve_sectors += k.k->size;
-
-			m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
-			durability_removing += bch2_extent_ptr_desired_durability(c, &p);
-		} else if (!p.ptr.cached &&
-			   !((1U << i) & m->data_opts.kill_ptrs)) {
-			bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
-			durability_have += bch2_extent_ptr_durability(c, &p);
+		if (!p.ptr.cached) {
+			rcu_read_lock();
+			if (BIT(i) & m->data_opts.rewrite_ptrs) {
+				if (crc_is_compressed(p.crc))
+					reserve_sectors += k.k->size;
+
+				m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
+				durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+			} else if (!(BIT(i) & m->data_opts.kill_ptrs)) {
+				bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+				durability_have += bch2_extent_ptr_durability(c, &p);
+			}
+			rcu_read_unlock();
 		}
-		rcu_read_unlock();

 		/*
 		 * op->csum_type is normally initialized from the fs/file's
@@ -623,24 +682,6 @@ int bch2_data_update_init(struct btree_trans *trans,
 		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
 			m->op.incompressible = true;

-		if (c->opts.nocow_enabled) {
-			if (ctxt) {
-				move_ctxt_wait_event(ctxt,
-						(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
-									  bucket, 0)) ||
-						list_empty(&ctxt->ios));
-
-				if (!locked)
-					bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
-			} else {
-				if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
-					ret = -BCH_ERR_nocow_lock_blocked;
-					goto err;
-				}
-			}
-			ptrs_locked |= (1U << i);
-		}
-
 		i++;
 	}

@@ -654,16 +695,6 @@ int bch2_data_update_init(struct btree_trans *trans,
 	 * Increasing replication is an explicit operation triggered by
 	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
 	 */
-	if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
-	    !durability_required) {
-		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
-		m->data_opts.rewrite_ptrs = 0;
-		/* if iter == NULL, it's just a promote */
-		if (iter)
-			ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
-		goto done;
-	}
-
 	m->op.nr_replicas = min(durability_removing, durability_required) +
 		m->data_opts.extra_replicas;

@@ -675,48 +706,38 @@ int bch2_data_update_init(struct btree_trans *trans,
 	if (!(durability_have + durability_removing))
 		m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);

-	if (!m->op.nr_replicas) {
-		struct printbuf buf = PRINTBUF;
+	m->op.nr_replicas_required = m->op.nr_replicas;

-		bch2_data_update_to_text(&buf, m);
-		WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf);
-		printbuf_exit(&buf);
-		ret = -BCH_ERR_data_update_done;
-		goto done;
+	/*
+	 * It might turn out that we don't need any new replicas, if the
+	 * replicas or durability settings have been changed since the extent
+	 * was written:
+	 */
+	if (!m->op.nr_replicas) {
+		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+		m->data_opts.rewrite_ptrs = 0;
+		/* if iter == NULL, it's just a promote */
+		if (iter)
+			ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
+		goto out;
 	}

-	m->op.nr_replicas_required = m->op.nr_replicas;
-
 	if (reserve_sectors) {
 		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
 				m->data_opts.extra_replicas
 				? 0
 				: BCH_DISK_RESERVATION_NOFAIL);
 		if (ret)
-			goto err;
+			goto out;
 	}

 	if (bkey_extent_is_unwritten(k)) {
 		bch2_update_unwritten_extent(trans, m);
-		goto done;
+		goto out;
 	}

 	return 0;
-err:
-	i = 0;
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
-		struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
-		if ((1U << i) & ptrs_locked)
-			bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
-		bch2_dev_put(ca);
-		i++;
-	}
-
-	bch2_bkey_buf_exit(&m->k, c);
-	bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
-	return ret;
-done:
+out:
 	bch2_data_update_exit(m);
 	return ret ?: -BCH_ERR_data_update_done;
 }

--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1017,6 +1017,8 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc

 		prt_printf(out, "ptr: %u:%llu:%u gen %u",
 			   ptr->dev, b, offset, ptr->gen);
+		if (ca->mi.durability != 1)
+			prt_printf(out, " d=%u", ca->mi.durability);
 		if (ptr->cached)
 			prt_str(out, " cached");
 		if (ptr->unwritten)
@@ -1377,6 +1379,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
 	return r != NULL;
 }

+static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+				       unsigned target, unsigned compression)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	u64 sectors = 0;
+
+	if (compression) {
+		unsigned compression_type = bch2_compression_opt_to_type(compression);
+
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+			    p.ptr.unwritten) {
+				sectors = 0;
+				goto incompressible;
+			}
+
+			if (!p.ptr.cached && p.crc.compression_type != compression_type)
+				sectors += p.crc.compressed_size;
+		}
+	}
+incompressible:
+	if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+			if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target))
+				sectors += p.crc.compressed_size;
+	}
+
+	return sectors;
+}
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+	const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+	return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0;
+}
+
 int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
 				  struct bch_io_opts *opts)
 {

--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -692,6 +692,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
 unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
 				       unsigned, unsigned);
 bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);

 int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
 				  struct bch_io_opts *);

--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -534,7 +534,7 @@ static int __bch2_writepage(struct folio *folio,

 	if (f_sectors > w->tmp_sectors) {
 		kfree(w->tmp);
-		w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
+		w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
 		w->tmp_sectors = f_sectors;
 	}


--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c,

 	mutex_lock(&c->sb_lock);
 	strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
-	mutex_unlock(&c->sb_lock);
-
 	ret = bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);

 	mnt_drop_write_file(file);
 	return ret;

--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2006,7 +2006,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
 	if (ret) {
 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
 		ret = -BCH_ERR_fsck_repair_unimplemented;
-		ret = 0;
 		goto err;
 	}

@@ -2216,6 +2215,8 @@ int bch2_check_xattrs(struct bch_fs *c)
 			NULL, NULL,
 			BCH_TRANS_COMMIT_no_enospc,
 		check_xattr(trans, &iter, k, &hash_info, &inode)));
+
+	inode_walker_exit(&inode);
 	bch_err_fn(c, ret);
 	return ret;
 }
@@ -2469,8 +2470,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
 			: bch2_inode_unpack(inode_k, &inode);
 		if (ret) {
 			/* Should have been caught in dirents pass */
-			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				bch_err(c, "error looking up parent directory: %i", ret);
+			bch_err_msg(c, ret, "error looking up parent directory");
 			break;
 		}


--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 	}

 	if (!had_entries)
-		j->last_empty_seq = cur_seq;
+		j->last_empty_seq = cur_seq - 1; /* to match j->seq */

 	spin_lock(&j->lock);


--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
 	struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
 	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
 	int ret = -BCH_ERR_invalid_sb_journal;
+	u64 sum = 0;
 	unsigned nr;
 	unsigned i;
 	struct u64_range *b;
@@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
 	for (i = 0; i < nr; i++) {
 		b[i].start = le64_to_cpu(journal->d[i].start);
 		b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+
+		if (b[i].end <= b[i].start) {
+			prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
+				   le64_to_cpu(journal->d[i].start),
+				   le64_to_cpu(journal->d[i].nr));
+			goto err;
+		}
+
+		sum += le64_to_cpu(journal->d[i].nr);
 	}

 	sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
@@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
 		}
 	}

+	if (sum > UINT_MAX) {
+		prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
+		goto err;
+	}
+
 	ret = 0;
 err:
 	kfree(b);

--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg)
 			if (min_member_capacity == U64_MAX)
 				min_member_capacity = 128 * 2048;

-			bch2_trans_unlock_long(ctxt.trans);
+			move_buckets_wait(&ctxt, buckets, true);
 			bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
 					MAX_SCHEDULE_TIMEOUT);
 		}

--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
 	const struct journal_key *l = *((const struct journal_key **)_l);
 	const struct journal_key *r = *((const struct journal_key **)_r);

-	return cmp_int(l->journal_seq, r->journal_seq);
+	/*
+	 * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last
+	 *
+	 * journal_seq == 0 means that the key comes from early repair, and
+	 * should be inserted last so as to avoid overflowing the journal
+	 */
+	return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
 }

 int bch2_journal_replay(struct bch_fs *c)
@@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c)
 		}
 	}

+	bch2_trans_unlock_long(trans);
 	/*
 	 * Now, replay any remaining keys in the order in which they appear in
 	 * the journal, unpinning those journal entries as we go:

--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -451,7 +451,8 @@ int bch2_replicas_gc2(struct bch_fs *c)
 			.type = BCH_DISK_ACCOUNTING_replicas,
 		};

-		memcpy(&k.replicas, e, replicas_entry_bytes(e));
+		unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
+			      "embedded variable length struct");

 		struct bpos p = disk_accounting_pos_to_bpos(&k);


--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -74,6 +74,9 @@
 	  BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted,	\
 	  BCH_FSCK_ERR_accounting_key_junk_at_end)		\
 	x(disk_accounting_inum,					\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
+	  BCH_FSCK_ERR_accounting_mismatch)			\
+	x(rebalance_work_acct_fix,				\
 	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
 	  BCH_FSCK_ERR_accounting_mismatch)

@@ -108,7 +111,10 @@
 	  BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong,	\
 	  BCH_FSCK_ERR_fs_usage_replicas_wrong,			\
 	  BCH_FSCK_ERR_accounting_replicas_not_marked,		\
-	  BCH_FSCK_ERR_bkey_version_in_future)
+	  BCH_FSCK_ERR_bkey_version_in_future)			\
+	x(rebalance_work_acct_fix,				\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
+	  BCH_FSCK_ERR_accounting_mismatch)

 struct upgrade_downgrade_entry {
 	u64		recovery_passes;

--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	printbuf_tabstop_push(out, TABSTOP_SIZE + 2);

 	prt_printf(out, "\tsince mount\r\trecent\r\n");
-	prt_printf(out, "recent");

 	printbuf_tabstops_reset(out);
 	printbuf_tabstop_push(out, out->indent + 20);

--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -612,10 +612,20 @@ static int bch2_xattr_bcachefs_get_effective(
 					 name, buffer, size, true);
 }

+/* Noop - xattrs in the bcachefs_effective namespace are inherited */
+static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
+				   struct mnt_idmap *idmap,
+				   struct dentry *dentry, struct inode *vinode,
+				   const char *name, const void *value,
+				   size_t size, int flags)
+{
+	return 0;
+}
+
 static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
 	.prefix	= "bcachefs_effective.",
 	.get	= bch2_xattr_bcachefs_get_effective,
-	.set	= bch2_xattr_bcachefs_set,
+	.set	= bch2_xattr_bcachefs_set_effective,
 };

 #endif /* NO_BCACHEFS_FS */

--- a/fs/super.c
+++ b/fs/super.c
@@ -1802,8 +1802,8 @@ int vfs_get_tree(struct fs_context *fc)
 		return error;

 	if (!fc->root) {
-		pr_err("Filesystem %s get_tree() didn't set fc->root\n",
-		       fc->fs_type->name);
+		pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n",
+		       fc->fs_type->name, error);
 		/* We don't know what the locking state of the superblock is -
 		 * if there is a superblock.
 		 */