Commit e6861be4 authored by Linus Torvalds
Browse files

Merge tag 'bcachefs-2023-11-29' of https://evilpiepirate.org/git/bcachefs

Pull more bcachefs bugfixes from Kent Overstreet:

 - bcache & bcachefs were broken with CFI enabled; patch for closures to
   fix type punning

 - mark erasure coding as extra-experimental; there are incompatible
   disk space accounting changes coming for erasure coding, and I'm
   still seeing checksum errors in some tests

 - several fixes for durability-related issues (durability is a device
   specific setting where we can tell bcachefs that data on a given
   device should be counted as replicated x times)

 - a fix for a rare livelock when a btree node merge then updates a
   parent node that is almost full

 - fix a race in the device removal path, where dropping a pointer in a
   btree node to a device would be clobbered by an in flight btree write
   updating the btree node key on completion

 - fix one SRCU lock hold time warning in the btree gc code - there's
   still a bunch more of these to fix

 - fix a rare race where we'd start copygc before initializing the "are
   we rw" percpu refcount; copygc would think we were already ro and die
   immediately

* tag 'bcachefs-2023-11-29' of https://evilpiepirate.org/git/bcachefs: (23 commits)
  bcachefs: Extra kthread_should_stop() calls for copygc
  bcachefs: Convert gc_alloc_start() to for_each_btree_key2()
  bcachefs: Fix race between btree writes and metadata drop
  bcachefs: move journal seq assertion
  bcachefs: -EROFS doesn't count as move_extent_start_fail
  bcachefs: trace_move_extent_start_fail() now includes errcode
  bcachefs: Fix split_race livelock
  bcachefs: Fix bucket data type for stripe buckets
  bcachefs: Add missing validation for jset_entry_data_usage
  bcachefs: Fix zstd compress workspace size
  bcachefs: bpos is misaligned on big endian
  bcachefs: Fix ec + durability calculation
  bcachefs: Data update path won't accidentaly grow replicas
  bcachefs: deallocate_extra_replicas()
  bcachefs: Proper refcounting for journal_keys
  bcachefs: preserve device path as device name
  bcachefs: Fix an endianness conversion
  bcachefs: Start gc, copygc, rebalance threads after initing writes ref
  bcachefs: Don't stop copygc thread on device resize
  bcachefs: Make sure bch2_move_ratelimit() also waits for move_ops
  ...
parents 994d5c58 415e5107
......@@ -293,16 +293,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
w->journal = NULL;
}
static void btree_node_write_unlock(struct closure *cl)
static CLOSURE_CALLBACK(btree_node_write_unlock)
{
struct btree *b = container_of(cl, struct btree, io);
closure_type(b, struct btree, io);
up(&b->io_mutex);
}
static void __btree_node_write_done(struct closure *cl)
static CLOSURE_CALLBACK(__btree_node_write_done)
{
struct btree *b = container_of(cl, struct btree, io);
closure_type(b, struct btree, io);
struct btree_write *w = btree_prev_write(b);
bch_bbio_free(b->bio, b->c);
......@@ -315,12 +315,12 @@ static void __btree_node_write_done(struct closure *cl)
closure_return_with_destructor(cl, btree_node_write_unlock);
}
static void btree_node_write_done(struct closure *cl)
static CLOSURE_CALLBACK(btree_node_write_done)
{
struct btree *b = container_of(cl, struct btree, io);
closure_type(b, struct btree, io);
bio_free_pages(b->bio);
__btree_node_write_done(cl);
__btree_node_write_done(&cl->work);
}
static void btree_node_write_endio(struct bio *bio)
......
......@@ -723,11 +723,11 @@ static void journal_write_endio(struct bio *bio)
closure_put(&w->c->journal.io);
}
static void journal_write(struct closure *cl);
static CLOSURE_CALLBACK(journal_write);
static void journal_write_done(struct closure *cl)
static CLOSURE_CALLBACK(journal_write_done)
{
struct journal *j = container_of(cl, struct journal, io);
closure_type(j, struct journal, io);
struct journal_write *w = (j->cur == j->w)
? &j->w[1]
: &j->w[0];
......@@ -736,19 +736,19 @@ static void journal_write_done(struct closure *cl)
continue_at_nobarrier(cl, journal_write, bch_journal_wq);
}
static void journal_write_unlock(struct closure *cl)
static CLOSURE_CALLBACK(journal_write_unlock)
__releases(&c->journal.lock)
{
struct cache_set *c = container_of(cl, struct cache_set, journal.io);
closure_type(c, struct cache_set, journal.io);
c->journal.io_in_flight = 0;
spin_unlock(&c->journal.lock);
}
static void journal_write_unlocked(struct closure *cl)
static CLOSURE_CALLBACK(journal_write_unlocked)
__releases(c->journal.lock)
{
struct cache_set *c = container_of(cl, struct cache_set, journal.io);
closure_type(c, struct cache_set, journal.io);
struct cache *ca = c->cache;
struct journal_write *w = c->journal.cur;
struct bkey *k = &c->journal.key;
......@@ -823,12 +823,12 @@ static void journal_write_unlocked(struct closure *cl)
continue_at(cl, journal_write_done, NULL);
}
static void journal_write(struct closure *cl)
static CLOSURE_CALLBACK(journal_write)
{
struct cache_set *c = container_of(cl, struct cache_set, journal.io);
closure_type(c, struct cache_set, journal.io);
spin_lock(&c->journal.lock);
journal_write_unlocked(cl);
journal_write_unlocked(&cl->work);
}
static void journal_try_write(struct cache_set *c)
......
......@@ -35,16 +35,16 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
/* Moving GC - IO loop */
static void moving_io_destructor(struct closure *cl)
static CLOSURE_CALLBACK(moving_io_destructor)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
closure_type(io, struct moving_io, cl);
kfree(io);
}
static void write_moving_finish(struct closure *cl)
static CLOSURE_CALLBACK(write_moving_finish)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
closure_type(io, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
bio_free_pages(bio);
......@@ -89,9 +89,9 @@ static void moving_init(struct moving_io *io)
bch_bio_map(bio, NULL);
}
static void write_moving(struct closure *cl)
static CLOSURE_CALLBACK(write_moving)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
closure_type(io, struct moving_io, cl);
struct data_insert_op *op = &io->op;
if (!op->status) {
......@@ -113,9 +113,9 @@ static void write_moving(struct closure *cl)
continue_at(cl, write_moving_finish, op->wq);
}
static void read_moving_submit(struct closure *cl)
static CLOSURE_CALLBACK(read_moving_submit)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
closure_type(io, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
......
......@@ -25,7 +25,7 @@
struct kmem_cache *bch_search_cache;
static void bch_data_insert_start(struct closure *cl);
static CLOSURE_CALLBACK(bch_data_insert_start);
static unsigned int cache_mode(struct cached_dev *dc)
{
......@@ -55,9 +55,9 @@ static void bio_csum(struct bio *bio, struct bkey *k)
/* Insert data into cache */
static void bch_data_insert_keys(struct closure *cl)
static CLOSURE_CALLBACK(bch_data_insert_keys)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
closure_type(op, struct data_insert_op, cl);
atomic_t *journal_ref = NULL;
struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
int ret;
......@@ -136,9 +136,9 @@ static void bch_data_invalidate(struct closure *cl)
continue_at(cl, bch_data_insert_keys, op->wq);
}
static void bch_data_insert_error(struct closure *cl)
static CLOSURE_CALLBACK(bch_data_insert_error)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
closure_type(op, struct data_insert_op, cl);
/*
* Our data write just errored, which means we've got a bunch of keys to
......@@ -163,7 +163,7 @@ static void bch_data_insert_error(struct closure *cl)
op->insert_keys.top = dst;
bch_data_insert_keys(cl);
bch_data_insert_keys(&cl->work);
}
static void bch_data_insert_endio(struct bio *bio)
......@@ -184,9 +184,9 @@ static void bch_data_insert_endio(struct bio *bio)
bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
}
static void bch_data_insert_start(struct closure *cl)
static CLOSURE_CALLBACK(bch_data_insert_start)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
closure_type(op, struct data_insert_op, cl);
struct bio *bio = op->bio, *n;
if (op->bypass)
......@@ -305,16 +305,16 @@ static void bch_data_insert_start(struct closure *cl)
* If op->bypass is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
void bch_data_insert(struct closure *cl)
CLOSURE_CALLBACK(bch_data_insert)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
closure_type(op, struct data_insert_op, cl);
trace_bcache_write(op->c, op->inode, op->bio,
op->writeback, op->bypass);
bch_keylist_init(&op->insert_keys);
bio_get(op->bio);
bch_data_insert_start(cl);
bch_data_insert_start(&cl->work);
}
/*
......@@ -575,9 +575,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
return n == bio ? MAP_DONE : MAP_CONTINUE;
}
static void cache_lookup(struct closure *cl)
static CLOSURE_CALLBACK(cache_lookup)
{
struct search *s = container_of(cl, struct search, iop.cl);
closure_type(s, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
struct cached_dev *dc;
int ret;
......@@ -698,9 +698,9 @@ static void do_bio_hook(struct search *s,
bio_cnt_set(bio, 3);
}
static void search_free(struct closure *cl)
static CLOSURE_CALLBACK(search_free)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
atomic_dec(&s->iop.c->search_inflight);
......@@ -749,20 +749,20 @@ static inline struct search *search_alloc(struct bio *bio,
/* Cached devices */
static void cached_dev_bio_complete(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_bio_complete)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
cached_dev_put(dc);
search_free(cl);
search_free(&cl->work);
}
/* Process reads */
static void cached_dev_read_error_done(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_read_error_done)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
if (s->iop.replace_collision)
bch_mark_cache_miss_collision(s->iop.c, s->d);
......@@ -770,12 +770,12 @@ static void cached_dev_read_error_done(struct closure *cl)
if (s->iop.bio)
bio_free_pages(s->iop.bio);
cached_dev_bio_complete(cl);
cached_dev_bio_complete(&cl->work);
}
static void cached_dev_read_error(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_read_error)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
struct bio *bio = &s->bio.bio;
/*
......@@ -801,9 +801,9 @@ static void cached_dev_read_error(struct closure *cl)
continue_at(cl, cached_dev_read_error_done, NULL);
}
static void cached_dev_cache_miss_done(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_cache_miss_done)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
struct bcache_device *d = s->d;
if (s->iop.replace_collision)
......@@ -812,13 +812,13 @@ static void cached_dev_cache_miss_done(struct closure *cl)
if (s->iop.bio)
bio_free_pages(s->iop.bio);
cached_dev_bio_complete(cl);
cached_dev_bio_complete(&cl->work);
closure_put(&d->cl);
}
static void cached_dev_read_done(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_read_done)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
/*
......@@ -858,9 +858,9 @@ static void cached_dev_read_done(struct closure *cl)
continue_at(cl, cached_dev_cache_miss_done, NULL);
}
static void cached_dev_read_done_bh(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_read_done_bh)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
bch_mark_cache_accounting(s->iop.c, s->d,
......@@ -955,13 +955,13 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
/* Process writes */
static void cached_dev_write_complete(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_write_complete)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
up_read_non_owner(&dc->writeback_lock);
cached_dev_bio_complete(cl);
cached_dev_bio_complete(&cl->work);
}
static void cached_dev_write(struct cached_dev *dc, struct search *s)
......@@ -1048,9 +1048,9 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
continue_at(cl, cached_dev_write_complete, NULL);
}
static void cached_dev_nodata(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_nodata)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
struct bio *bio = &s->bio.bio;
if (s->iop.flush_journal)
......@@ -1265,9 +1265,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
return MAP_CONTINUE;
}
static void flash_dev_nodata(struct closure *cl)
static CLOSURE_CALLBACK(flash_dev_nodata)
{
struct search *s = container_of(cl, struct search, cl);
closure_type(s, struct search, cl);
if (s->iop.flush_journal)
bch_journal_meta(s->iop.c, cl);
......
......@@ -34,7 +34,7 @@ struct data_insert_op {
};
unsigned int bch_get_congested(const struct cache_set *c);
void bch_data_insert(struct closure *cl);
CLOSURE_CALLBACK(bch_data_insert);
void bch_cached_dev_request_init(struct cached_dev *dc);
void cached_dev_submit_bio(struct bio *bio);
......
......@@ -327,9 +327,9 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
submit_bio(bio);
}
static void bch_write_bdev_super_unlock(struct closure *cl)
static CLOSURE_CALLBACK(bch_write_bdev_super_unlock)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
closure_type(dc, struct cached_dev, sb_write);
up(&dc->sb_write_mutex);
}
......@@ -363,9 +363,9 @@ static void write_super_endio(struct bio *bio)
closure_put(&ca->set->sb_write);
}
static void bcache_write_super_unlock(struct closure *cl)
static CLOSURE_CALLBACK(bcache_write_super_unlock)
{
struct cache_set *c = container_of(cl, struct cache_set, sb_write);
closure_type(c, struct cache_set, sb_write);
up(&c->sb_write_mutex);
}
......@@ -407,9 +407,9 @@ static void uuid_endio(struct bio *bio)
closure_put(cl);
}
static void uuid_io_unlock(struct closure *cl)
static CLOSURE_CALLBACK(uuid_io_unlock)
{
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
closure_type(c, struct cache_set, uuid_write);
up(&c->uuid_write_mutex);
}
......@@ -1344,9 +1344,9 @@ void bch_cached_dev_release(struct kobject *kobj)
module_put(THIS_MODULE);
}
static void cached_dev_free(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_free)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
closure_type(dc, struct cached_dev, disk.cl);
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
......@@ -1378,9 +1378,9 @@ static void cached_dev_free(struct closure *cl)
kobject_put(&dc->disk.kobj);
}
static void cached_dev_flush(struct closure *cl)
static CLOSURE_CALLBACK(cached_dev_flush)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
closure_type(dc, struct cached_dev, disk.cl);
struct bcache_device *d = &dc->disk;
mutex_lock(&bch_register_lock);
......@@ -1499,9 +1499,9 @@ void bch_flash_dev_release(struct kobject *kobj)
kfree(d);
}
static void flash_dev_free(struct closure *cl)
static CLOSURE_CALLBACK(flash_dev_free)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
closure_type(d, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
atomic_long_sub(bcache_dev_sectors_dirty(d),
......@@ -1512,9 +1512,9 @@ static void flash_dev_free(struct closure *cl)
kobject_put(&d->kobj);
}
static void flash_dev_flush(struct closure *cl)
static CLOSURE_CALLBACK(flash_dev_flush)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
closure_type(d, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
bcache_device_unlink(d);
......@@ -1670,9 +1670,9 @@ void bch_cache_set_release(struct kobject *kobj)
module_put(THIS_MODULE);
}
static void cache_set_free(struct closure *cl)
static CLOSURE_CALLBACK(cache_set_free)
{
struct cache_set *c = container_of(cl, struct cache_set, cl);
closure_type(c, struct cache_set, cl);
struct cache *ca;
debugfs_remove(c->debug);
......@@ -1711,9 +1711,9 @@ static void cache_set_free(struct closure *cl)
kobject_put(&c->kobj);
}
static void cache_set_flush(struct closure *cl)
static CLOSURE_CALLBACK(cache_set_flush)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
closure_type(c, struct cache_set, caching);
struct cache *ca = c->cache;
struct btree *b;
......@@ -1808,9 +1808,9 @@ static void conditional_stop_bcache_device(struct cache_set *c,
}
}
static void __cache_set_unregister(struct closure *cl)
static CLOSURE_CALLBACK(__cache_set_unregister)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
closure_type(c, struct cache_set, caching);
struct cached_dev *dc;
struct bcache_device *d;
size_t i;
......
......@@ -341,16 +341,16 @@ static void dirty_init(struct keybuf_key *w)
bch_bio_map(bio, NULL);
}
static void dirty_io_destructor(struct closure *cl)
static CLOSURE_CALLBACK(dirty_io_destructor)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
closure_type(io, struct dirty_io, cl);
kfree(io);
}
static void write_dirty_finish(struct closure *cl)
static CLOSURE_CALLBACK(write_dirty_finish)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
closure_type(io, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
......@@ -400,9 +400,9 @@ static void dirty_endio(struct bio *bio)
closure_put(&io->cl);
}
static void write_dirty(struct closure *cl)
static CLOSURE_CALLBACK(write_dirty)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
closure_type(io, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
......@@ -462,9 +462,9 @@ static void read_dirty_endio(struct bio *bio)
dirty_endio(bio);
}
static void read_dirty_submit(struct closure *cl)
static CLOSURE_CALLBACK(read_dirty_submit)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
closure_type(io, struct dirty_io, cl);
closure_bio_submit(io->dc->disk.c, &io->bio, cl);
......
......@@ -33,6 +33,18 @@ config BCACHEFS_QUOTA
depends on BCACHEFS_FS
select QUOTACTL
config BCACHEFS_ERASURE_CODING
bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
depends on BCACHEFS_FS
select QUOTACTL
help
This enables the "erasure_code" filesystem and inode option, which
organizes data into reed-solomon stripes instead of ordinary
replication.
WARNING: this feature is still undergoing on disk format changes, and
should only be enabled for testing purposes.
config BCACHEFS_POSIX_ACL
bool "bcachefs POSIX ACL support"
depends on BCACHEFS_FS
......
......@@ -1297,6 +1297,30 @@ static struct write_point *writepoint_find(struct btree_trans *trans,
return wp;
}
static noinline void
deallocate_extra_replicas(struct bch_fs *c,
struct open_buckets *ptrs,
struct open_buckets *ptrs_no_use,
unsigned extra_replicas)
{
struct open_buckets ptrs2 = { 0 };
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, ptrs, ob, i) {
unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
if (d && d <= extra_replicas) {
extra_replicas -= d;
ob_push(c, ptrs_no_use, ob);
} else {
ob_push(c, &ptrs2, ob);
}
}
*ptrs = ptrs2;
}
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
......@@ -1321,6 +1345,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
int ret;
int i;
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
BUG_ON(!nr_replicas || !nr_replicas_required);
......@@ -1382,6 +1409,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (ret)
goto err;
if (nr_effective > nr_replicas)
deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
/* Free buckets we didn't use: */
open_bucket_for_each(c, &wp->ptrs, ob, i)
open_bucket_free_unused(c, ob);
......
......@@ -638,6 +638,8 @@ struct journal_keys {
size_t gap;
size_t nr;
size_t size;
atomic_t ref;
bool initial_ref_held;
};
struct btree_trans_buf {
......@@ -929,7 +931,7 @@ struct bch_fs {
mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
mempool_t decompress_workspace;
ZSTD_parameters zstd_params;
size_t zstd_workspace_size;
struct crypto_shash *sha256;
struct crypto_sync_skcipher *chacha20;
......
......@@ -151,7 +151,11 @@ struct bpos {
#else
#error edit for your odd byteorder.
#endif
} __packed __aligned(4);
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;
#define KEY_INODE_MAX ((__u64)~0ULL)
#define KEY_OFFSET_MAX ((__u64)~0ULL)
......@@ -1528,7 +1532,7 @@ struct bch_sb_field_disk_groups {
x(move_extent_write, 36) \
x(move_extent_finish, 37) \
x(move_extent_fail, 38) \
x(move_extent_alloc_mem_fail, 39) \
x(move_extent_start_fail, 39) \
x(copygc, 40) \
x(copygc_wait, 41) \
x(gc_gens_end, 42) \
......
......@@ -1541,8 +1541,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
rcu_assign_pointer(ca->buckets_gc, buckets);
}
for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ({
ca = bch_dev_bkey_exists(c, k.k->p.inode);
g = gc_bucket(ca, k.k->p.offset);
......@@ -1561,8 +1561,9 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
g->stripe = a->stripe;
g->stripe_redundancy = a->stripe_redundancy;
}
}
bch2_trans_iter_exit(trans, &iter);
0;
}));
err:
bch2_trans_put(trans);
if (ret)
......
......@@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
return offset;
}
static void btree_node_read_all_replicas_done(struct closure *cl)
static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
{
struct btree_node_read_all *ra =
container_of(cl, struct btree_node_read_all, cl);
closure_type(ra, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
struct printbuf buf = PRINTBUF;
......@@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
if (sync) {
closure_sync(&ra->cl);
btree_node_read_all_replicas_done(&ra->cl);
btree_node_read_all_replicas_done(&ra->cl.work);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done,
c->io_complete_wq);
......
......@@ -2981,7 +2981,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
trans->journal_replay_not_finished =
!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
atomic_inc_not_zero(&c->journal_keys.ref);
closure_init_stack(&trans->ref);
s = btree_trans_stats(trans);
......@@ -3098,6 +3099,9 @@ void bch2_trans_put(struct btree_trans *trans)
kfree(trans->fs_usage_deltas);
}
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
......
......@@ -80,6 +80,8 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
struct journal_key *k;
BUG_ON(*idx > keys->nr);
search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
......@@ -189,10 +191,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
/* Since @keys was full, there was no gap: */
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
kvfree(keys->d);
*keys = new_keys;
keys->d = new_keys.d;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
/* And now the gap is at the end: */
keys->gap = keys->nr;
keys->gap = keys->nr;
}
journal_iters_move_gap(c, keys->gap, idx);
......@@ -415,10 +419,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
cmp_int(l->journal_offset, r->journal_offset);
}
void bch2_journal_keys_free(struct journal_keys *keys)
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
struct journal_key *i;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
......@@ -429,6 +439,8 @@ void bch2_journal_keys_free(struct journal_keys *keys)
kvfree(keys->d);
keys->d = NULL;
keys->nr = keys->gap = keys->size = 0;
bch2_journal_entries_free(c);
}
static void __journal_keys_sort(struct journal_keys *keys)
......
......@@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_keys_put(struct bch_fs *);
static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
{
if (c->journal_keys.initial_ref_held)
bch2_journal_keys_put(c);
c->journal_keys.initial_ref_held = false;
}
void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *);
......
......@@ -778,9 +778,9 @@ static void btree_interior_update_work(struct work_struct *work)
}
}
static void btree_update_set_nodes_written(struct closure *cl)
static CLOSURE_CALLBACK(btree_update_set_nodes_written)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
closure_type(as, struct btree_update, cl);
struct bch_fs *c = as->c;
mutex_lock(&c->btree_interior_update_lock);
......@@ -1071,8 +1071,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
break;
}
/*
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
......@@ -2266,6 +2270,10 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
BUG_ON(!btree_node_hashed(b));
struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
commit_flags, skip_triggers);
out:
......
......@@ -854,8 +854,12 @@ static int __mark_pointer(struct btree_trans *trans,
return ret;
*dst_sectors += sectors;
*bucket_data_type = *dirty_sectors || *cached_sectors
? ptr_data_type : 0;
if (!*dirty_sectors && !*cached_sectors)
*bucket_data_type = 0;
else if (*bucket_data_type != BCH_DATA_stripe)
*bucket_data_type = ptr_data_type;
return 0;
}
......@@ -2091,8 +2095,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
bch2_copygc_stop(c);
if (resize) {
down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
......
......@@ -354,8 +354,7 @@ static int attempt_compress(struct bch_fs *c,
*/
unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
zstd_cctx_workspace_bound(&params.cParams));
ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
/*
* ZSTD requires that when we decompress we pass in the exact
......@@ -371,7 +370,7 @@ static int attempt_compress(struct bch_fs *c,
size_t len = zstd_compress_cctx(ctx,
dst + 4, dst_len - 4 - 7,
src, src_len,
&c->zstd_params);
&params);
if (zstd_is_error(len))
return 0;
......@@ -572,6 +571,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
size_t decompress_workspace_size = 0;
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max);
/*
* ZSTD is lying: if we allocate the size of the workspace it says it
* requires, it returns memory allocation errors
*/
c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
struct {
unsigned feature;
enum bch_compression_type type;
......@@ -585,13 +591,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), },
{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
zstd_cctx_workspace_bound(&params.cParams),
c->zstd_workspace_size,
zstd_dctx_workspace_bound() },
}, *i;
bool have_compressed = false;
c->zstd_params = params;
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
i++)
......
......@@ -356,7 +356,7 @@ void bch2_data_update_exit(struct data_update *update)
bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
}
void bch2_update_unwritten_extent(struct btree_trans *trans,
static void bch2_update_unwritten_extent(struct btree_trans *trans,
struct data_update *update)
{
struct bch_fs *c = update->op.c;
......@@ -436,7 +436,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
}
}
/*
 * Rewrite extent @k with the pointers selected by data_opts.kill_ptrs
 * removed, then update it through @iter and commit the transaction.
 *
 * @data_opts is passed by value, so clearing bits in kill_ptrs below does
 * not affect the caller's copy.
 *
 * Returns 0 on success, or the first nonzero value returned by
 * bch2_bkey_make_mut_noupdate() (via PTR_ERR_OR_ZERO), bch2_trans_relock(),
 * bch2_trans_update() or bch2_trans_commit().
 */
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct data_update_opts data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
int ret;
/* Work on a mutable copy of the key; nothing is queued as an update yet. */
n = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
/*
 * One pass per set bit in kill_ptrs, highest bit first.  `i` is reset to 0
 * on every pass and incremented by the drop condition, so the pointer
 * dropped is the one at position `drop`.
 * NOTE(review): presumably the condition is evaluated once per pointer, and
 * going from the highest index down keeps lower indices stable after a
 * drop - confirm against the bch2_bkey_drop_ptrs() macro.
 */
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
/* Clear the bit we just handled so the loop terminates. */
data_opts.kill_ptrs ^= 1U << drop;
}
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, bkey_i_to_s(n));
/*
* Since we're not inserting through an extent iterator
* (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
if (bkey_deleted(&n->k))
n->k.size = 0;
/* GNU `?:` chain: stop at, and return, the first nonzero result. */
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
struct data_update *m,
struct write_point_specifier wp,
......@@ -452,7 +496,7 @@ int bch2_data_update_init(struct btree_trans *trans,
const struct bch_extent_ptr *ptr;
unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
unsigned ptrs_locked = 0;
int ret;
int ret = 0;
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
......@@ -478,6 +522,8 @@ int bch2_data_update_init(struct btree_trans *trans,
bkey_for_each_ptr(ptrs, ptr)
percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
unsigned durability_have = 0, durability_removing = 0;
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
bool locked;
......@@ -489,8 +535,11 @@ int bch2_data_update_init(struct btree_trans *trans,
reserve_sectors += k.k->size;
m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
} else if (!p.ptr.cached) {
durability_removing += bch2_extent_ptr_desired_durability(c, &p);
} else if (!p.ptr.cached &&
!((1U << i) & m->data_opts.kill_ptrs)) {
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
}
/*
......@@ -529,6 +578,29 @@ int bch2_data_update_init(struct btree_trans *trans,
i++;
}
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
* unless extra_replicas was specified
*
* Increasing replication is an explicit operation triggered by
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
if (durability_have >= io_opts.data_replicas) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts);
goto done;
}
m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
m->data_opts.extra_replicas;
m->op.nr_replicas_required = m->op.nr_replicas;
BUG_ON(!m->op.nr_replicas);
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas
......@@ -538,14 +610,11 @@ int bch2_data_update_init(struct btree_trans *trans,
goto err;
}
m->op.nr_replicas += m->data_opts.extra_replicas;
m->op.nr_replicas_required = m->op.nr_replicas;
BUG_ON(!m->op.nr_replicas);
if (bkey_extent_is_unwritten(k)) {
bch2_update_unwritten_extent(trans, m);
goto done;
}
/* Special handling required: */
if (bkey_extent_is_unwritten(k))
return -BCH_ERR_unwritten_extent_update;
return 0;
err:
i = 0;
......@@ -560,6 +629,9 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_bkey_buf_exit(&m->k, c);
bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
return ret;
done:
bch2_data_update_exit(m);
return ret ?: -BCH_ERR_data_update_done;
}
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment