Merge tag 'bcachefs-2023-11-29' of https://evilpiepirate.org/git/bcachefs

Pull more bcachefs bugfixes from Kent Overstreet: - bcache & bcachefs were broken with CFI enabled; patch for closures to fix type punning - mark erasure coding as extra-experimental; there are incompatible disk space accounting changes coming for erasure coding, and I'm still seeing checksum errors in some tests - several fixes for durability-related issues (durability is a device specific setting where we can tell bcachefs that data on a given device should be counted as replicated x times) - a fix for a rare livelock when a btree node merge then updates a parent node that is almost full - fix a race in the device removal path, where dropping a pointer in a btree node to a device would be clobbered by an in flight btree write updating the btree node key on completion - fix one SRCU lock hold time warning in the btree gc code - ther's still a bunch more of these to fix - fix a rare race where we'd start copygc before initializing the "are we rw" percpu refcount; copygc would think we were already ro and die immediately * tag 'bcachefs-2023-11-29' of https://evilpiepirate.org/git/bcachefs: (23 commits) bcachefs: Extra kthread_should_stop() calls for copygc bcachefs: Convert gc_alloc_start() to for_each_btree_key2() bcachefs: Fix race between btree writes and metadata drop bcachefs: move journal seq assertion bcachefs: -EROFS doesn't count as move_extent_start_fail bcachefs: trace_move_extent_start_fail() now includes errcode bcachefs: Fix split_race livelock bcachefs: Fix bucket data type for stripe buckets bcachefs: Add missing validation for jset_entry_data_usage bcachefs: Fix zstd compress workspace size bcachefs: bpos is misaligned on big endian bcachefs: Fix ec + durability calculation bcachefs: Data update path won't accidentaly grow replicas bcachefs: deallocate_extra_replicas() bcachefs: Proper refcounting for journal_keys bcachefs: preserve device path as device name bcachefs: Fix an endianness conversion bcachefs: Start gc, copygc, rebalance threads after initing writes ref bcachefs: Don't stop copygc thread on device resize bcachefs: Make sure bch2_move_ratelimit() also waits for move_ops ...

Merge tag 'bcachefs-2023-11-29' of https://evilpiepirate.org/git/bcachefs
Pull more bcachefs bugfixes from Kent Overstreet: - bcache & bcachefs were broken with CFI enabled; patch for closures to fix type punning - mark erasure coding as extra-experimental; there are incompatible disk space accounting changes coming for erasure coding, and I'm still seeing checksum errors in some tests - several fixes for durability-related issues (durability is a device specific setting where we can tell bcachefs that data on a given device should be counted as replicated x times) - a fix for a rare livelock when a btree node merge then updates a parent node that is almost full - fix a race in the device removal path, where dropping a pointer in a btree node to a device would be clobbered by an in flight btree write updating the btree node key on completion - fix one SRCU lock hold time warning in the btree gc code - ther's still a bunch more of these to fix - fix a rare race where we'd start copygc before initializing the "are we rw" percpu refcount; copygc would think we were already ro and die immediately * tag 'bcachefs-2023-11-29' of https://evilpiepirate.org/git/bcachefs: (23 commits) bcachefs: Extra kthread_should_stop() calls for copygc bcachefs: Convert gc_alloc_start() to for_each_btree_key2() bcachefs: Fix race between btree writes and metadata drop bcachefs: move journal seq assertion bcachefs: -EROFS doesn't count as move_extent_start_fail bcachefs: trace_move_extent_start_fail() now includes errcode bcachefs: Fix split_race livelock bcachefs: Fix bucket data type for stripe buckets bcachefs: Add missing validation for jset_entry_data_usage bcachefs: Fix zstd compress workspace size bcachefs: bpos is misaligned on big endian bcachefs: Fix ec + durability calculation bcachefs: Data update path won't accidentaly grow replicas bcachefs: deallocate_extra_replicas() bcachefs: Proper refcounting for journal_keys bcachefs: preserve device path as device name bcachefs: Fix an endianness conversion bcachefs: Start gc, copygc, rebalance threads after initing writes ref bcachefs: Don't stop copygc thread on device resize bcachefs: Make sure bch2_move_ratelimit() also waits for move_ops ...
e6861be4 · Linus Torvalds · 994d5c58 · 415e5107 · e6861be4 · e6861be4
Commit e6861be4 authored 1 year ago by Linus Torvalds
20 changed files
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -293,16 +293,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
 	w->journal	= NULL;
 }

-static void btree_node_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_unlock)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);

 	up(&b->io_mutex);
 }

-static void __btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(__btree_node_write_done)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);
 	struct btree_write *w = btree_prev_write(b);

 	bch_bbio_free(b->bio, b->c);
@@ -315,12 +315,12 @@ static void __btree_node_write_done(struct closure *cl)
 	closure_return_with_destructor(cl, btree_node_write_unlock);
 }

-static void btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_done)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);

 	bio_free_pages(b->bio);
-	__btree_node_write_done(cl);
+	__btree_node_write_done(&cl->work);
 }

 static void btree_node_write_endio(struct bio *bio)

--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -723,11 +723,11 @@ static void journal_write_endio(struct bio *bio)
 	closure_put(&w->c->journal.io);
 }

-static void journal_write(struct closure *cl);
+static CLOSURE_CALLBACK(journal_write);

-static void journal_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_done)
 {
-	struct journal *j = container_of(cl, struct journal, io);
+	closure_type(j, struct journal, io);
 	struct journal_write *w = (j->cur == j->w)
 		? &j->w[1]
 		: &j->w[0];
@@ -736,19 +736,19 @@ static void journal_write_done(struct closure *cl)
 	continue_at_nobarrier(cl, journal_write, bch_journal_wq);
 }

-static void journal_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_unlock)
 	__releases(&c->journal.lock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+	closure_type(c, struct cache_set, journal.io);

 	c->journal.io_in_flight = 0;
 	spin_unlock(&c->journal.lock);
 }

-static void journal_write_unlocked(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_unlocked)
 	__releases(c->journal.lock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+	closure_type(c, struct cache_set, journal.io);
 	struct cache *ca = c->cache;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
@@ -823,12 +823,12 @@ static void journal_write_unlocked(struct closure *cl)
 	continue_at(cl, journal_write_done, NULL);
 }

-static void journal_write(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+	closure_type(c, struct cache_set, journal.io);

 	spin_lock(&c->journal.lock);
-	journal_write_unlocked(cl);
+	journal_write_unlocked(&cl->work);
 }

 static void journal_try_write(struct cache_set *c)

--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -35,16 +35,16 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)

 /* Moving GC - IO loop */

-static void moving_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(moving_io_destructor)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	closure_type(io, struct moving_io, cl);

 	kfree(io);
 }

-static void write_moving_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_moving_finish)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	closure_type(io, struct moving_io, cl);
 	struct bio *bio = &io->bio.bio;

 	bio_free_pages(bio);
@@ -89,9 +89,9 @@ static void moving_init(struct moving_io *io)
 	bch_bio_map(bio, NULL);
 }

-static void write_moving(struct closure *cl)
+static CLOSURE_CALLBACK(write_moving)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	closure_type(io, struct moving_io, cl);
 	struct data_insert_op *op = &io->op;

 	if (!op->status) {
@@ -113,9 +113,9 @@ static void write_moving(struct closure *cl)
 	continue_at(cl, write_moving_finish, op->wq);
 }

-static void read_moving_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_moving_submit)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	closure_type(io, struct moving_io, cl);
 	struct bio *bio = &io->bio.bio;

 	bch_submit_bbio(bio, io->op.c, &io->w->key, 0);

--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,7 +25,7 @@

 struct kmem_cache *bch_search_cache;

-static void bch_data_insert_start(struct closure *cl);
+static CLOSURE_CALLBACK(bch_data_insert_start);

 static unsigned int cache_mode(struct cached_dev *dc)
 {
@@ -55,9 +55,9 @@ static void bio_csum(struct bio *bio, struct bkey *k)

 /* Insert data into cache */

-static void bch_data_insert_keys(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_keys)
 {
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	closure_type(op, struct data_insert_op, cl);
 	atomic_t *journal_ref = NULL;
 	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
 	int ret;
@@ -136,9 +136,9 @@ static void bch_data_invalidate(struct closure *cl)
 	continue_at(cl, bch_data_insert_keys, op->wq);
 }

-static void bch_data_insert_error(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_error)
 {
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	closure_type(op, struct data_insert_op, cl);

 	/*
 	 * Our data write just errored, which means we've got a bunch of keys to
@@ -163,7 +163,7 @@ static void bch_data_insert_error(struct closure *cl)

 	op->insert_keys.top = dst;

-	bch_data_insert_keys(cl);
+	bch_data_insert_keys(&cl->work);
 }

 static void bch_data_insert_endio(struct bio *bio)
@@ -184,9 +184,9 @@ static void bch_data_insert_endio(struct bio *bio)
 	bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
 }

-static void bch_data_insert_start(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_start)
 {
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	closure_type(op, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;

 	if (op->bypass)
@@ -305,16 +305,16 @@ static void bch_data_insert_start(struct closure *cl)
 * If op->bypass is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
-void bch_data_insert(struct closure *cl)
+CLOSURE_CALLBACK(bch_data_insert)
 {
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	closure_type(op, struct data_insert_op, cl);

 	trace_bcache_write(op->c, op->inode, op->bio,
 			   op->writeback, op->bypass);

 	bch_keylist_init(&op->insert_keys);
 	bio_get(op->bio);
-	bch_data_insert_start(cl);
+	bch_data_insert_start(&cl->work);
 }

 /*
@@ -575,9 +575,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 	return n == bio ? MAP_DONE : MAP_CONTINUE;
 }

-static void cache_lookup(struct closure *cl)
+static CLOSURE_CALLBACK(cache_lookup)
 {
-	struct search *s = container_of(cl, struct search, iop.cl);
+	closure_type(s, struct search, iop.cl);
 	struct bio *bio = &s->bio.bio;
 	struct cached_dev *dc;
 	int ret;
@@ -698,9 +698,9 @@ static void do_bio_hook(struct search *s,
 	bio_cnt_set(bio, 3);
 }

-static void search_free(struct closure *cl)
+static CLOSURE_CALLBACK(search_free)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);

 	atomic_dec(&s->iop.c->search_inflight);

@@ -749,20 +749,20 @@ static inline struct search *search_alloc(struct bio *bio,

 /* Cached devices */

-static void cached_dev_bio_complete(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_bio_complete)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

 	cached_dev_put(dc);
-	search_free(cl);
+	search_free(&cl->work);
 }

 /* Process reads */

-static void cached_dev_read_error_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_error_done)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);

 	if (s->iop.replace_collision)
 		bch_mark_cache_miss_collision(s->iop.c, s->d);
@@ -770,12 +770,12 @@ static void cached_dev_read_error_done(struct closure *cl)
 	if (s->iop.bio)
 		bio_free_pages(s->iop.bio);

-	cached_dev_bio_complete(cl);
+	cached_dev_bio_complete(&cl->work);
 }

-static void cached_dev_read_error(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_error)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct bio *bio = &s->bio.bio;

 	/*
@@ -801,9 +801,9 @@ static void cached_dev_read_error(struct closure *cl)
 	continue_at(cl, cached_dev_read_error_done, NULL);
 }

-static void cached_dev_cache_miss_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_cache_miss_done)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct bcache_device *d = s->d;

 	if (s->iop.replace_collision)
@@ -812,13 +812,13 @@ static void cached_dev_cache_miss_done(struct closure *cl)
 	if (s->iop.bio)
 		bio_free_pages(s->iop.bio);

-	cached_dev_bio_complete(cl);
+	cached_dev_bio_complete(&cl->work);
 	closure_put(&d->cl);
 }

-static void cached_dev_read_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_done)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

 	/*
@@ -858,9 +858,9 @@ static void cached_dev_read_done(struct closure *cl)
 	continue_at(cl, cached_dev_cache_miss_done, NULL);
 }

-static void cached_dev_read_done_bh(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_done_bh)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

 	bch_mark_cache_accounting(s->iop.c, s->d,
@@ -955,13 +955,13 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)

 /* Process writes */

-static void cached_dev_write_complete(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_write_complete)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

 	up_read_non_owner(&dc->writeback_lock);
-	cached_dev_bio_complete(cl);
+	cached_dev_bio_complete(&cl->work);
 }

 static void cached_dev_write(struct cached_dev *dc, struct search *s)
@@ -1048,9 +1048,9 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 	continue_at(cl, cached_dev_write_complete, NULL);
 }

-static void cached_dev_nodata(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_nodata)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct bio *bio = &s->bio.bio;

 	if (s->iop.flush_journal)
@@ -1265,9 +1265,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
 	return MAP_CONTINUE;
 }

-static void flash_dev_nodata(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_nodata)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);

 	if (s->iop.flush_journal)
 		bch_journal_meta(s->iop.c, cl);

--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -34,7 +34,7 @@ struct data_insert_op {
 };

 unsigned int bch_get_congested(const struct cache_set *c);
-void bch_data_insert(struct closure *cl);
+CLOSURE_CALLBACK(bch_data_insert);

 void bch_cached_dev_request_init(struct cached_dev *dc);
 void cached_dev_submit_bio(struct bio *bio);

--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -327,9 +327,9 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
 	submit_bio(bio);
 }

-static void bch_write_bdev_super_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(bch_write_bdev_super_unlock)
 {
-	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+	closure_type(dc, struct cached_dev, sb_write);

 	up(&dc->sb_write_mutex);
 }
@@ -363,9 +363,9 @@ static void write_super_endio(struct bio *bio)
 	closure_put(&ca->set->sb_write);
 }

-static void bcache_write_super_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(bcache_write_super_unlock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+	closure_type(c, struct cache_set, sb_write);

 	up(&c->sb_write_mutex);
 }
@@ -407,9 +407,9 @@ static void uuid_endio(struct bio *bio)
 	closure_put(cl);
 }

-static void uuid_io_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(uuid_io_unlock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+	closure_type(c, struct cache_set, uuid_write);

 	up(&c->uuid_write_mutex);
 }
@@ -1344,9 +1344,9 @@ void bch_cached_dev_release(struct kobject *kobj)
 	module_put(THIS_MODULE);
 }

-static void cached_dev_free(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_free)
 {
-	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+	closure_type(dc, struct cached_dev, disk.cl);

 	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
 		cancel_writeback_rate_update_dwork(dc);
@@ -1378,9 +1378,9 @@ static void cached_dev_free(struct closure *cl)
 	kobject_put(&dc->disk.kobj);
 }

-static void cached_dev_flush(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_flush)
 {
-	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+	closure_type(dc, struct cached_dev, disk.cl);
 	struct bcache_device *d = &dc->disk;

 	mutex_lock(&bch_register_lock);
@@ -1499,9 +1499,9 @@ void bch_flash_dev_release(struct kobject *kobj)
 	kfree(d);
 }

-static void flash_dev_free(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_free)
 {
-	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+	closure_type(d, struct bcache_device, cl);

 	mutex_lock(&bch_register_lock);
 	atomic_long_sub(bcache_dev_sectors_dirty(d),
@@ -1512,9 +1512,9 @@ static void flash_dev_free(struct closure *cl)
 	kobject_put(&d->kobj);
 }

-static void flash_dev_flush(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_flush)
 {
-	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+	closure_type(d, struct bcache_device, cl);

 	mutex_lock(&bch_register_lock);
 	bcache_device_unlink(d);
@@ -1670,9 +1670,9 @@ void bch_cache_set_release(struct kobject *kobj)
 	module_put(THIS_MODULE);
 }

-static void cache_set_free(struct closure *cl)
+static CLOSURE_CALLBACK(cache_set_free)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, cl);
+	closure_type(c, struct cache_set, cl);
 	struct cache *ca;

 	debugfs_remove(c->debug);
@@ -1711,9 +1711,9 @@ static void cache_set_free(struct closure *cl)
 	kobject_put(&c->kobj);
 }

-static void cache_set_flush(struct closure *cl)
+static CLOSURE_CALLBACK(cache_set_flush)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, caching);
+	closure_type(c, struct cache_set, caching);
 	struct cache *ca = c->cache;
 	struct btree *b;

@@ -1808,9 +1808,9 @@ static void conditional_stop_bcache_device(struct cache_set *c,
 	}
 }

-static void __cache_set_unregister(struct closure *cl)
+static CLOSURE_CALLBACK(__cache_set_unregister)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, caching);
+	closure_type(c, struct cache_set, caching);
 	struct cached_dev *dc;
 	struct bcache_device *d;
 	size_t i;

--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -341,16 +341,16 @@ static void dirty_init(struct keybuf_key *w)
 	bch_bio_map(bio, NULL);
 }

-static void dirty_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(dirty_io_destructor)
 {
-	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	closure_type(io, struct dirty_io, cl);

 	kfree(io);
 }

-static void write_dirty_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty_finish)
 {
-	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	closure_type(io, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 	struct cached_dev *dc = io->dc;

@@ -400,9 +400,9 @@ static void dirty_endio(struct bio *bio)
 	closure_put(&io->cl);
 }

-static void write_dirty(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty)
 {
-	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	closure_type(io, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 	struct cached_dev *dc = io->dc;

@@ -462,9 +462,9 @@ static void read_dirty_endio(struct bio *bio)
 	dirty_endio(bio);
 }

-static void read_dirty_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_dirty_submit)
 {
-	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	closure_type(io, struct dirty_io, cl);

 	closure_bio_submit(io->dc->disk.c, &io->bio, cl);


--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -33,6 +33,18 @@ config BCACHEFS_QUOTA
 	depends on BCACHEFS_FS
 	select QUOTACTL

+config BCACHEFS_ERASURE_CODING
+	bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
+	depends on BCACHEFS_FS
+	select QUOTACTL
+	help
+	This enables the "erasure_code" filesysystem and inode option, which
+	organizes data into reed-solomon stripes instead of ordinary
+	replication.
+
+	WARNING: this feature is still undergoing on disk format changes, and
+	should only be enabled for testing purposes.
+
 config BCACHEFS_POSIX_ACL
 	bool "bcachefs POSIX ACL support"
 	depends on BCACHEFS_FS

--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1297,6 +1297,30 @@ static struct write_point *writepoint_find(struct btree_trans *trans,
 	return wp;
 }

+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+			  struct open_buckets *ptrs,
+			  struct open_buckets *ptrs_no_use,
+			  unsigned extra_replicas)
+{
+	struct open_buckets ptrs2 = { 0 };
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, ptrs, ob, i) {
+		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+		if (d && d <= extra_replicas) {
+			extra_replicas -= d;
+			ob_push(c, ptrs_no_use, ob);
+		} else {
+			ob_push(c, &ptrs2, ob);
+		}
+	}
+
+	*ptrs = ptrs2;
+}
+
 /*
 * Get us an open_bucket we can allocate from, return with it locked:
 */
@@ -1321,6 +1345,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 	int ret;
 	int i;

+	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
+		erasure_code = false;
+
 	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);

 	BUG_ON(!nr_replicas || !nr_replicas_required);
@@ -1382,6 +1409,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 	if (ret)
 		goto err;

+	if (nr_effective > nr_replicas)
+		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
 	/* Free buckets we didn't use: */
 	open_bucket_for_each(c, &wp->ptrs, ob, i)
 		open_bucket_free_unused(c, ob);

--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -638,6 +638,8 @@ struct journal_keys {
 	size_t			gap;
 	size_t			nr;
 	size_t			size;
+	atomic_t		ref;
+	bool			initial_ref_held;
 };

 struct btree_trans_buf {
@@ -929,7 +931,7 @@ struct bch_fs {
 	mempool_t		compression_bounce[2];
 	mempool_t		compress_workspace[BCH_COMPRESSION_TYPE_NR];
 	mempool_t		decompress_workspace;
-	ZSTD_parameters		zstd_params;
+	size_t			zstd_workspace_size;

 	struct crypto_shash	*sha256;
 	struct crypto_sync_skcipher *chacha20;

--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -151,7 +151,11 @@ struct bpos {
 #else
 #error edit for your odd byteorder.
 #endif
-} __packed __aligned(4);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;

 #define KEY_INODE_MAX			((__u64)~0ULL)
 #define KEY_OFFSET_MAX			((__u64)~0ULL)
@@ -1528,7 +1532,7 @@ struct bch_sb_field_disk_groups {
 	x(move_extent_write,				36)	\
 	x(move_extent_finish,				37)	\
 	x(move_extent_fail,				38)	\
-	x(move_extent_alloc_mem_fail,			39)	\
+	x(move_extent_start_fail,			39)	\
 	x(copygc,					40)	\
 	x(copygc_wait,					41)	\
 	x(gc_gens_end,					42)	\

--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1541,8 +1541,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 		rcu_assign_pointer(ca->buckets_gc, buckets);
 	}

-	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
-			   BTREE_ITER_PREFETCH, k, ret) {
+	ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
+				  BTREE_ITER_PREFETCH, k, ({
 		ca = bch_dev_bkey_exists(c, k.k->p.inode);
 		g = gc_bucket(ca, k.k->p.offset);

@@ -1561,8 +1561,9 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 			g->stripe		= a->stripe;
 			g->stripe_redundancy	= a->stripe_redundancy;
 		}
-	}
-	bch2_trans_iter_exit(trans, &iter);
+
+		0;
+	}));
 err:
 	bch2_trans_put(trans);
 	if (ret)

--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
 	return offset;
 }

-static void btree_node_read_all_replicas_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
 {
-	struct btree_node_read_all *ra =
-		container_of(cl, struct btree_node_read_all, cl);
+	closure_type(ra, struct btree_node_read_all, cl);
 	struct bch_fs *c = ra->c;
 	struct btree *b = ra->b;
 	struct printbuf buf = PRINTBUF;
@@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool

 	if (sync) {
 		closure_sync(&ra->cl);
-		btree_node_read_all_replicas_done(&ra->cl);
+		btree_node_read_all_replicas_done(&ra->cl.work);
 	} else {
 		continue_at(&ra->cl, btree_node_read_all_replicas_done,
 			    c->io_complete_wq);

--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2981,7 +2981,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
 	trans->fn_idx		= fn_idx;
 	trans->locking_wait.task = current;
 	trans->journal_replay_not_finished =
-		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+		unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+		atomic_inc_not_zero(&c->journal_keys.ref);
 	closure_init_stack(&trans->ref);

 	s = btree_trans_stats(trans);
@@ -3098,6 +3099,9 @@ void bch2_trans_put(struct btree_trans *trans)
 			kfree(trans->fs_usage_deltas);
 	}

+	if (unlikely(trans->journal_replay_not_finished))
+		bch2_journal_keys_put(c);
+
 	if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
 		mempool_free(trans->mem, &c->btree_trans_mem_pool);
 	else

--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -80,6 +80,8 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree
 	struct journal_keys *keys = &c->journal_keys;
 	unsigned iters = 0;
 	struct journal_key *k;
+
+	BUG_ON(*idx > keys->nr);
 search:
 	if (!*idx)
 		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
@@ -189,10 +191,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
 		/* Since @keys was full, there was no gap: */
 		memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
 		kvfree(keys->d);
-		*keys = new_keys;
+		keys->d		= new_keys.d;
+		keys->nr	= new_keys.nr;
+		keys->size	= new_keys.size;

 		/* And now the gap is at the end: */
-		keys->gap = keys->nr;
+		keys->gap	= keys->nr;
 	}

 	journal_iters_move_gap(c, keys->gap, idx);
@@ -415,10 +419,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 		cmp_int(l->journal_offset, r->journal_offset);
 }

-void bch2_journal_keys_free(struct journal_keys *keys)
+void bch2_journal_keys_put(struct bch_fs *c)
 {
+	struct journal_keys *keys = &c->journal_keys;
 	struct journal_key *i;

+	BUG_ON(atomic_read(&keys->ref) <= 0);
+
+	if (!atomic_dec_and_test(&keys->ref))
+		return;
+
 	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
 	keys->gap = keys->nr;

@@ -429,6 +439,8 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 	kvfree(keys->d);
 	keys->d = NULL;
 	keys->nr = keys->gap = keys->size = 0;
+
+	bch2_journal_entries_free(c);
 }

 static void __journal_keys_sort(struct journal_keys *keys)

--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
 						struct bch_fs *,
 						struct btree *);

-void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_keys_put(struct bch_fs *);
+
+static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
+{
+	if (c->journal_keys.initial_ref_held)
+		bch2_journal_keys_put(c);
+	c->journal_keys.initial_ref_held = false;
+}
+
 void bch2_journal_entries_free(struct bch_fs *);

 int bch2_journal_keys_sort(struct bch_fs *);

--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -778,9 +778,9 @@ static void btree_interior_update_work(struct work_struct *work)
 	}
 }

-static void btree_update_set_nodes_written(struct closure *cl)
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
 {
-	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	closure_type(as, struct btree_update, cl);
 	struct bch_fs *c = as->c;

 	mutex_lock(&c->btree_interior_update_lock);
@@ -1071,8 +1071,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 			break;
 		}

+		/*
+		 * Always check for space for two keys, even if we won't have to
+		 * split at prior level - it might have been a merge instead:
+		 */
 		if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
-					BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
+						BKEY_BTREE_PTR_U64s_MAX * 2))
 			break;

 		split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
@@ -2266,6 +2270,10 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,

 	BUG_ON(!btree_node_hashed(b));

+	struct bch_extent_ptr *ptr;
+	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
+			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
+
 	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
 					 commit_flags, skip_triggers);
 out:

--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -854,8 +854,12 @@ static int __mark_pointer(struct btree_trans *trans,
 		return ret;

 	*dst_sectors += sectors;
-	*bucket_data_type = *dirty_sectors || *cached_sectors
-		? ptr_data_type : 0;
+
+	if (!*dirty_sectors && !*cached_sectors)
+		*bucket_data_type = 0;
+	else if (*bucket_data_type != BCH_DATA_stripe)
+		*bucket_data_type = ptr_data_type;
+
 	return 0;
 }

@@ -2091,8 +2095,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	bucket_gens->first_bucket = ca->mi.first_bucket;
 	bucket_gens->nbuckets	= nbuckets;

-	bch2_copygc_stop(c);
-
 	if (resize) {
 		down_write(&c->gc_lock);
 		down_write(&ca->bucket_lock);

--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -354,8 +354,7 @@ static int attempt_compress(struct bch_fs *c,
 		 */
 		unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
 		ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
-		ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
-			zstd_cctx_workspace_bound(&params.cParams));
+		ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);

 		/*
 		 * ZSTD requires that when we decompress we pass in the exact
@@ -371,7 +370,7 @@ static int attempt_compress(struct bch_fs *c,
 		size_t len = zstd_compress_cctx(ctx,
 				dst + 4,	dst_len - 4 - 7,
 				src,		src_len,
-				&c->zstd_params);
+				&params);
 		if (zstd_is_error(len))
 			return 0;

@@ -572,6 +571,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 	size_t decompress_workspace_size = 0;
 	ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
 						 c->opts.encoded_extent_max);
+
+	/*
+	 * ZSTD is lying: if we allocate the size of the workspace it says it
+	 * requires, it returns memory allocation errors
+	 */
+	c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
+
 	struct {
 		unsigned			feature;
 		enum bch_compression_type	type;
@@ -585,13 +591,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
 			zlib_inflate_workspacesize(), },
 		{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
-			zstd_cctx_workspace_bound(&params.cParams),
+			c->zstd_workspace_size,
 			zstd_dctx_workspace_bound() },
 	}, *i;
 	bool have_compressed = false;

-	c->zstd_params = params;
-
 	for (i = compression_types;
 	     i < compression_types + ARRAY_SIZE(compression_types);
 	     i++)

--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -356,7 +356,7 @@ void bch2_data_update_exit(struct data_update *update)
 	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
 }

-void bch2_update_unwritten_extent(struct btree_trans *trans,
+static void bch2_update_unwritten_extent(struct btree_trans *trans,
 				  struct data_update *update)
 {
 	struct bch_fs *c = update->op.c;
@@ -436,7 +436,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
 	}
 }

+int bch2_extent_drop_ptrs(struct btree_trans *trans,
+			  struct btree_iter *iter,
+			  struct bkey_s_c k,
+			  struct data_update_opts data_opts)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_i *n;
+	int ret;
+
+	n = bch2_bkey_make_mut_noupdate(trans, k);
+	ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		return ret;
+
+	while (data_opts.kill_ptrs) {
+		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+		struct bch_extent_ptr *ptr;
+
+		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+		data_opts.kill_ptrs ^= 1U << drop;
+	}
+
+	/*
+	 * If the new extent no longer has any pointers, bch2_extent_normalize()
+	 * will do the appropriate thing with it (turning it into a
+	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
+	 */
+	bch2_extent_normalize(c, bkey_i_to_s(n));
+
+	/*
+	 * Since we're not inserting through an extent iterator
+	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+	 * we aren't using the extent overwrite path to delete, we're
+	 * just using the normal key deletion path:
+	 */
+	if (bkey_deleted(&n->k))
+		n->k.size = 0;
+
+	return bch2_trans_relock(trans) ?:
+		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
+			  struct btree_iter *iter,
 			  struct moving_context *ctxt,
 			  struct data_update *m,
 			  struct write_point_specifier wp,
@@ -452,7 +496,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 	const struct bch_extent_ptr *ptr;
 	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
 	unsigned ptrs_locked = 0;
-	int ret;
+	int ret = 0;

 	bch2_bkey_buf_init(&m->k);
 	bch2_bkey_buf_reassemble(&m->k, c, k);
@@ -478,6 +522,8 @@ int bch2_data_update_init(struct btree_trans *trans,
 	bkey_for_each_ptr(ptrs, ptr)
 		percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);

+	unsigned durability_have = 0, durability_removing = 0;
+
 	i = 0;
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		bool locked;
@@ -489,8 +535,11 @@ int bch2_data_update_init(struct btree_trans *trans,
 				reserve_sectors += k.k->size;

 			m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
-		} else if (!p.ptr.cached) {
+			durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+		} else if (!p.ptr.cached &&
+			   !((1U << i) & m->data_opts.kill_ptrs)) {
 			bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+			durability_have += bch2_extent_ptr_durability(c, &p);
 		}

 		/*
@@ -529,6 +578,29 @@ int bch2_data_update_init(struct btree_trans *trans,
 		i++;
 	}

+	/*
+	 * If current extent durability is less than io_opts.data_replicas,
+	 * we're not trying to rereplicate the extent up to data_replicas here -
+	 * unless extra_replicas was specified
+	 *
+	 * Increasing replication is an explicit operation triggered by
+	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
+	 */
+	if (durability_have >= io_opts.data_replicas) {
+		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+		m->data_opts.rewrite_ptrs = 0;
+		/* if iter == NULL, it's just a promote */
+		if (iter)
+			ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+		goto done;
+	}
+
+	m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+		m->data_opts.extra_replicas;
+	m->op.nr_replicas_required = m->op.nr_replicas;
+
+	BUG_ON(!m->op.nr_replicas);
+
 	if (reserve_sectors) {
 		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
 				m->data_opts.extra_replicas
@@ -538,14 +610,11 @@ int bch2_data_update_init(struct btree_trans *trans,
 			goto err;
 	}

-	m->op.nr_replicas += m->data_opts.extra_replicas;
-	m->op.nr_replicas_required = m->op.nr_replicas;
-
-	BUG_ON(!m->op.nr_replicas);
+	if (bkey_extent_is_unwritten(k)) {
+		bch2_update_unwritten_extent(trans, m);
+		goto done;
+	}

-	/* Special handling required: */
-	if (bkey_extent_is_unwritten(k))
-		return -BCH_ERR_unwritten_extent_update;
 	return 0;
 err:
 	i = 0;
@@ -560,6 +629,9 @@ int bch2_data_update_init(struct btree_trans *trans,
 	bch2_bkey_buf_exit(&m->k, c);
 	bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
 	return ret;
+done:
+	bch2_data_update_exit(m);
+	return ret ?: -BCH_ERR_data_update_done;
 }

 void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)