Commit 72bea05c authored by Linus Torvalds

Merge tag 'bcachefs-2024-08-24' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:

 - assorted syzbot fixes

 - some upgrade fixes for old (pre 1.0) filesystems

 - fix for moving data off a device that was switched to durability=0
   after data had been written to it.

 - nocow deadlock fix

 - fix for new rebalance_work accounting

* tag 'bcachefs-2024-08-24' of git://evilpiepirate.org/bcachefs: (28 commits)
  bcachefs: Fix rebalance_work accounting
  bcachefs: Fix failure to flush moves before sleeping in copygc
  bcachefs: don't use rht_bucket() in btree_key_cache_scan()
  bcachefs: add missing inode_walker_exit()
  bcachefs: clear path->should_be_locked in bch2_btree_key_cache_drop()
  bcachefs: Fix double assignment in check_dirent_to_subvol()
  bcachefs: Fix refcounting in discard path
  bcachefs: Fix compat issue with old alloc_v4 keys
  bcachefs: Fix warning in bch2_fs_journal_stop()
  fs/super.c: improve get_tree() error message
  bcachefs: Fix missing validation in bch2_sb_journal_v2_validate()
  bcachefs: Fix replay_now_at() assert
  bcachefs: Fix locking in bch2_ioc_setlabel()
  bcachefs: fix failure to relock in btree_node_fill()
  bcachefs: fix failure to relock in bch2_btree_node_mem_alloc()
  bcachefs: unlock_long() before resort in journal replay
  bcachefs: fix missing bch2_err_str()
  bcachefs: fix time_stats_to_text()
  bcachefs: Fix bch2_bucket_gens_init()
  bcachefs: Fix bch2_trigger_alloc assert
  ...
parents 780bdc1b 49aa7830
...@@ -240,71 +240,73 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, ...@@ -240,71 +240,73 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags) enum bch_validate_flags flags)
{ {
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); struct bch_alloc_v4 a;
int ret = 0; int ret = 0;
bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
c, alloc_v4_val_size_bad, c, alloc_v4_val_size_bad,
"bad val size (%u > %zu)", "bad val size (%u > %zu)",
alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
c, alloc_v4_backpointers_start_bad, c, alloc_v4_backpointers_start_bad,
"invalid backpointers_start"); "invalid backpointers_start");
bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
c, alloc_key_data_type_bad, c, alloc_key_data_type_bad,
"invalid data type (got %u should be %u)", "invalid data type (got %u should be %u)",
a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); a.data_type, alloc_data_type(a, a.data_type));
for (unsigned i = 0; i < 2; i++) for (unsigned i = 0; i < 2; i++)
bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
c, alloc_key_io_time_bad, c, alloc_key_io_time_bad,
"invalid io_time[%s]: %llu, max %llu", "invalid io_time[%s]: %llu, max %llu",
i == READ ? "read" : "write", i == READ ? "read" : "write",
a.v->io_time[i], LRU_TIME_MAX); a.io_time[i], LRU_TIME_MAX);
unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
offsetof(struct bch_alloc_v4, stripe_sectors) offsetof(struct bch_alloc_v4, stripe_sectors)
? a.v->stripe_sectors ? a.stripe_sectors
: 0; : 0;
switch (a.v->data_type) { switch (a.data_type) {
case BCH_DATA_free: case BCH_DATA_free:
case BCH_DATA_need_gc_gens: case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard: case BCH_DATA_need_discard:
bkey_fsck_err_on(stripe_sectors || bkey_fsck_err_on(stripe_sectors ||
a.v->dirty_sectors || a.dirty_sectors ||
a.v->cached_sectors || a.cached_sectors ||
a.v->stripe, a.stripe,
c, alloc_key_empty_but_have_data, c, alloc_key_empty_but_have_data,
"empty data type free but have data %u.%u.%u %u", "empty data type free but have data %u.%u.%u %u",
stripe_sectors, stripe_sectors,
a.v->dirty_sectors, a.dirty_sectors,
a.v->cached_sectors, a.cached_sectors,
a.v->stripe); a.stripe);
break; break;
case BCH_DATA_sb: case BCH_DATA_sb:
case BCH_DATA_journal: case BCH_DATA_journal:
case BCH_DATA_btree: case BCH_DATA_btree:
case BCH_DATA_user: case BCH_DATA_user:
case BCH_DATA_parity: case BCH_DATA_parity:
bkey_fsck_err_on(!a.v->dirty_sectors && bkey_fsck_err_on(!a.dirty_sectors &&
!stripe_sectors, !stripe_sectors,
c, alloc_key_dirty_sectors_0, c, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0", "data_type %s but dirty_sectors==0",
bch2_data_type_str(a.v->data_type)); bch2_data_type_str(a.data_type));
break; break;
case BCH_DATA_cached: case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors || bkey_fsck_err_on(!a.cached_sectors ||
a.v->dirty_sectors || a.dirty_sectors ||
stripe_sectors || stripe_sectors ||
a.v->stripe, a.stripe,
c, alloc_key_cached_inconsistency, c, alloc_key_cached_inconsistency,
"data type inconsistency"); "data type inconsistency");
bkey_fsck_err_on(!a.v->io_time[READ] && bkey_fsck_err_on(!a.io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
c, alloc_key_cached_but_read_time_zero, c, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0"); "cached bucket with read_time == 0");
...@@ -556,7 +558,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) ...@@ -556,7 +558,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
struct bpos pos = alloc_gens_pos(iter.pos, &offset); struct bpos pos = alloc_gens_pos(iter.pos, &offset);
int ret2 = 0; int ret2 = 0;
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret2) if (ret2)
...@@ -829,7 +831,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, ...@@ -829,7 +831,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (likely(new.k->type == KEY_TYPE_alloc_v4)) { if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
new_a = bkey_s_to_alloc_v4(new).v; new_a = bkey_s_to_alloc_v4(new).v;
} else { } else {
BUG_ON(!(flags & BTREE_TRIGGER_gc)); BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
ret = PTR_ERR_OR_ZERO(new_ka); ret = PTR_ERR_OR_ZERO(new_ka);
...@@ -1872,26 +1874,26 @@ static void bch2_do_discards_work(struct work_struct *work) ...@@ -1872,26 +1874,26 @@ static void bch2_do_discards_work(struct work_struct *work)
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret)); bch2_err_str(ret));
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
percpu_ref_put(&ca->io_ref); percpu_ref_put(&ca->io_ref);
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
} }
void bch2_dev_do_discards(struct bch_dev *ca) void bch2_dev_do_discards(struct bch_dev *ca)
{ {
struct bch_fs *c = ca->fs; struct bch_fs *c = ca->fs;
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
return; return;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
goto put_ioref; goto put_write_ref;
if (queue_work(c->write_ref_wq, &ca->discard_work)) if (queue_work(c->write_ref_wq, &ca->discard_work))
return; return;
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
put_ioref:
percpu_ref_put(&ca->io_ref); percpu_ref_put(&ca->io_ref);
put_write_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
} }
void bch2_do_discards(struct bch_fs *c) void bch2_do_discards(struct bch_fs *c)
......
...@@ -69,6 +69,7 @@ struct bch_alloc_v4 { ...@@ -69,6 +69,7 @@ struct bch_alloc_v4 {
__u64 io_time[2]; __u64 io_time[2];
__u32 stripe; __u32 stripe;
__u32 nr_external_backpointers; __u32 nr_external_backpointers;
/* end of fields in original version of alloc_v4 */
__u64 fragmentation_lru; __u64 fragmentation_lru;
__u32 stripe_sectors; __u32 stripe_sectors;
__u32 pad; __u32 pad;
......
...@@ -677,7 +677,8 @@ struct bch_sb_field_ext { ...@@ -677,7 +677,8 @@ struct bch_sb_field_ext {
x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v2, BCH_VERSION(1, 9)) \
x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \
x(disk_accounting_inum, BCH_VERSION(1, 11)) x(disk_accounting_inum, BCH_VERSION(1, 11)) \
x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
enum bcachefs_metadata_version { enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9, bcachefs_metadata_version_min = 9,
......
...@@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) ...@@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return b; return b;
} }
void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
/* Btree in memory cache - hash table */ /* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
...@@ -736,6 +746,13 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea ...@@ -736,6 +746,13 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
start_time); start_time);
memalloc_nofs_restore(flags); memalloc_nofs_restore(flags);
int ret = bch2_trans_relock(trans);
if (unlikely(ret)) {
bch2_btree_node_to_freelist(c, b);
return ERR_PTR(ret);
}
return b; return b;
err: err:
mutex_lock(&bc->lock); mutex_lock(&bc->lock);
...@@ -856,6 +873,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, ...@@ -856,6 +873,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
bch2_btree_node_read(trans, b, sync); bch2_btree_node_read(trans, b, sync);
int ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
if (!sync) if (!sync)
return NULL; return NULL;
...@@ -974,6 +995,10 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr ...@@ -974,6 +995,10 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr
bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_read(b);
ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
/* /*
* should_be_locked is not set on this path yet, so we need to * should_be_locked is not set on this path yet, so we need to
* relock it specifically: * relock it specifically:
......
...@@ -12,6 +12,8 @@ struct btree_iter; ...@@ -12,6 +12,8 @@ struct btree_iter;
void bch2_recalc_btree_reserve(struct bch_fs *); void bch2_recalc_btree_reserve(struct bch_fs *);
void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
......
...@@ -569,6 +569,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, ...@@ -569,6 +569,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
_btree_id, _pos, _flags, KEY_TYPE_##_type)) _btree_id, _pos, _flags, KEY_TYPE_##_type))
/*
 * bkey_val_copy - copy a key's value into a fixed-size destination struct
 * @_dst_v:	pointer to the destination value; sizeof(*@_dst_v) bounds the copy
 * @_src_k:	typed key (anything with .k and .v members) to copy from
 *
 * Copies min(sizeof(*@_dst_v), bkey_val_bytes(@_src_k.k)) bytes from
 * @_src_k.v.  If the source value is shorter than the destination, the
 * remainder of the destination is zero filled.
 *
 * Both arguments are evaluated more than once, so they must be free of side
 * effects.  The internal temporary has a macro-specific name so that it
 * cannot capture an identifier used in the caller's arguments.
 */
#define bkey_val_copy(_dst_v, _src_k)					\
do {									\
	unsigned _val_copy_bytes = min_t(unsigned, sizeof(*(_dst_v)),	\
				bkey_val_bytes((_src_k).k));		\
	memcpy((_dst_v), (_src_k).v, _val_copy_bytes);			\
	if (_val_copy_bytes < sizeof(*(_dst_v)))			\
		memset((void *) (_dst_v) + _val_copy_bytes, 0,		\
		       sizeof(*(_dst_v)) - _val_copy_bytes);		\
} while (0)
static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
unsigned btree_id, struct bpos pos, unsigned btree_id, struct bpos pos,
unsigned flags, unsigned type, unsigned flags, unsigned type,
......
...@@ -726,6 +726,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, ...@@ -726,6 +726,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
path->should_be_locked = false;
} }
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
...@@ -777,6 +778,20 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ...@@ -777,6 +778,20 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
rcu_read_lock(); rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
/*
* Scanning is expensive while a rehash is in progress - most elements
* will be on the new hashtable, if it's in progress
*
* A rehash could still start while we're scanning - that's ok, we'll
* still see most elements.
*/
if (unlikely(tbl->nest)) {
rcu_read_unlock();
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
return SHRINK_STOP;
}
if (bc->shrink_iter >= tbl->size) if (bc->shrink_iter >= tbl->size)
bc->shrink_iter = 0; bc->shrink_iter = 0;
start = bc->shrink_iter; start = bc->shrink_iter;
...@@ -784,7 +799,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ...@@ -784,7 +799,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
do { do {
struct rhash_head *pos, *next; struct rhash_head *pos, *next;
pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);
while (!rht_is_a_nulls(pos)) { while (!rht_is_a_nulls(pos)) {
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
...@@ -865,12 +880,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) ...@@ -865,12 +880,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
while (atomic_long_read(&bc->nr_keys)) { while (atomic_long_read(&bc->nr_keys)) {
rcu_read_lock(); rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
if (tbl) if (tbl) {
if (tbl->nest) {
/* wait for in progress rehash */
rcu_read_unlock();
mutex_lock(&bc->table.mutex);
mutex_unlock(&bc->table.mutex);
rcu_read_lock();
continue;
}
for (i = 0; i < tbl->size; i++) for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
ck = container_of(pos, struct bkey_cached, hash);
bkey_cached_evict(bc, ck); bkey_cached_evict(bc, ck);
list_add(&ck->list, &items); list_add(&ck->list, &items);
} }
}
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, ...@@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
: 0; : 0;
int ret; int ret;
b = bch2_btree_node_mem_alloc(trans, interior_node);
if (IS_ERR(b))
return b;
BUG_ON(b->ob.nr);
mutex_lock(&c->btree_reserve_cache_lock); mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) { if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a = struct btree_alloc *a =
...@@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, ...@@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
obs = a->ob; obs = a->ob;
bkey_copy(&tmp.k, &a->k); bkey_copy(&tmp.k, &a->k);
mutex_unlock(&c->btree_reserve_cache_lock); mutex_unlock(&c->btree_reserve_cache_lock);
goto mem_alloc; goto out;
} }
mutex_unlock(&c->btree_reserve_cache_lock); mutex_unlock(&c->btree_reserve_cache_lock);
retry: retry:
ret = bch2_alloc_sectors_start_trans(trans, ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?: c->opts.metadata_target ?:
...@@ -341,7 +346,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, ...@@ -341,7 +346,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
c->opts.metadata_replicas_required), c->opts.metadata_replicas_required),
watermark, 0, cl, &wp); watermark, 0, cl, &wp);
if (unlikely(ret)) if (unlikely(ret))
return ERR_PTR(ret); goto err;
if (wp->sectors_free < btree_sectors(c)) { if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob; struct open_bucket *ob;
...@@ -360,19 +365,16 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, ...@@ -360,19 +365,16 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
bch2_open_bucket_get(c, wp, &obs); bch2_open_bucket_get(c, wp, &obs);
bch2_alloc_sectors_done(c, wp); bch2_alloc_sectors_done(c, wp);
mem_alloc: out:
b = bch2_btree_node_mem_alloc(trans, interior_node);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
BUG_ON(b->ob.nr);
bkey_copy(&b->key, &tmp.k); bkey_copy(&b->key, &tmp.k);
b->ob = obs; b->ob = obs;
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
return b; return b;
err:
bch2_btree_node_to_freelist(c, b);
return ERR_PTR(ret);
} }
static struct btree *bch2_btree_node_alloc(struct btree_update *as, static struct btree *bch2_btree_node_alloc(struct btree_update *as,
...@@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite ...@@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
} }
new_hash = bch2_btree_node_mem_alloc(trans, false); new_hash = bch2_btree_node_mem_alloc(trans, false);
ret = PTR_ERR_OR_ZERO(new_hash);
if (ret)
goto err;
} }
path->intent_ref++; path->intent_ref++;
...@@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite ...@@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
commit_flags, skip_triggers); commit_flags, skip_triggers);
--path->intent_ref; --path->intent_ref;
if (new_hash) { if (new_hash)
mutex_lock(&c->btree_cache.lock); bch2_btree_node_to_freelist(c, new_hash);
list_move(&new_hash->list, &c->btree_cache.freeable); err:
mutex_unlock(&c->btree_cache.lock);
six_unlock_write(&new_hash->c.lock);
six_unlock_intent(&new_hash->c.lock);
}
closure_sync(&cl); closure_sync(&cl);
bch2_btree_cache_cannibalize_unlock(trans); bch2_btree_cache_cannibalize_unlock(trans);
return ret; return ret;
...@@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id ...@@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
b = bch2_btree_node_mem_alloc(trans, false); b = bch2_btree_node_mem_alloc(trans, false);
bch2_btree_cache_cannibalize_unlock(trans); bch2_btree_cache_cannibalize_unlock(trans);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
return ret;
set_btree_node_fake(b); set_btree_node_fake(b);
set_btree_node_need_rewrite(b); set_btree_node_need_rewrite(b);
b->c.level = level; b->c.level = level;
...@@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id ...@@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{ {
bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level)); bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
} }
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
......
...@@ -699,7 +699,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, ...@@ -699,7 +699,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
static int __trigger_extent(struct btree_trans *trans, static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level, enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct bkey_s_c k,
enum btree_iter_update_trigger_flags flags) enum btree_iter_update_trigger_flags flags,
s64 *replicas_sectors)
{ {
bool gc = flags & BTREE_TRIGGER_gc; bool gc = flags & BTREE_TRIGGER_gc;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
...@@ -708,7 +709,6 @@ static int __trigger_extent(struct btree_trans *trans, ...@@ -708,7 +709,6 @@ static int __trigger_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k) enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree ? BCH_DATA_btree
: BCH_DATA_user; : BCH_DATA_user;
s64 replicas_sectors = 0;
int ret = 0; int ret = 0;
struct disk_accounting_pos acc_replicas_key = { struct disk_accounting_pos acc_replicas_key = {
...@@ -739,7 +739,7 @@ static int __trigger_extent(struct btree_trans *trans, ...@@ -739,7 +739,7 @@ static int __trigger_extent(struct btree_trans *trans,
if (ret) if (ret)
return ret; return ret;
} else if (!p.has_ec) { } else if (!p.has_ec) {
replicas_sectors += disk_sectors; *replicas_sectors += disk_sectors;
acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
} else { } else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
...@@ -777,7 +777,7 @@ static int __trigger_extent(struct btree_trans *trans, ...@@ -777,7 +777,7 @@ static int __trigger_extent(struct btree_trans *trans,
} }
if (acc_replicas_key.replicas.nr_devs) { if (acc_replicas_key.replicas.nr_devs) {
ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
if (ret) if (ret)
return ret; return ret;
} }
...@@ -787,7 +787,7 @@ static int __trigger_extent(struct btree_trans *trans, ...@@ -787,7 +787,7 @@ static int __trigger_extent(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_snapshot, .type = BCH_DISK_ACCOUNTING_snapshot,
.snapshot.id = k.k->p.snapshot, .snapshot.id = k.k->p.snapshot,
}; };
ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc); ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
if (ret) if (ret)
return ret; return ret;
} }
...@@ -807,7 +807,7 @@ static int __trigger_extent(struct btree_trans *trans, ...@@ -807,7 +807,7 @@ static int __trigger_extent(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_btree, .type = BCH_DISK_ACCOUNTING_btree,
.btree.id = btree_id, .btree.id = btree_id,
}; };
ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
if (ret) if (ret)
return ret; return ret;
} else { } else {
...@@ -819,22 +819,13 @@ static int __trigger_extent(struct btree_trans *trans, ...@@ -819,22 +819,13 @@ static int __trigger_extent(struct btree_trans *trans,
s64 v[3] = { s64 v[3] = {
insert ? 1 : -1, insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size), insert ? k.k->size : -((s64) k.k->size),
replicas_sectors, *replicas_sectors,
}; };
ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
if (ret) if (ret)
return ret; return ret;
} }
if (bch2_bkey_rebalance_opts(k)) {
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_rebalance_work,
};
ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc);
if (ret)
return ret;
}
return 0; return 0;
} }
...@@ -843,6 +834,7 @@ int bch2_trigger_extent(struct btree_trans *trans, ...@@ -843,6 +834,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s new, struct bkey_s_c old, struct bkey_s new,
enum btree_iter_update_trigger_flags flags) enum btree_iter_update_trigger_flags flags)
{ {
struct bch_fs *c = trans->c;
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
...@@ -858,21 +850,53 @@ int bch2_trigger_extent(struct btree_trans *trans, ...@@ -858,21 +850,53 @@ int bch2_trigger_extent(struct btree_trans *trans,
new_ptrs_bytes)) new_ptrs_bytes))
return 0; return 0;
if (flags & BTREE_TRIGGER_transactional) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
struct bch_fs *c = trans->c; s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
(int) bch2_bkey_needs_rebalance(c, old); if (old.k->type) {
int ret = __trigger_extent(trans, btree, level, old,
flags & ~BTREE_TRIGGER_insert,
&old_replicas_sectors);
if (ret)
return ret;
}
if (new.k->type) {
int ret = __trigger_extent(trans, btree, level, new.s_c,
flags & ~BTREE_TRIGGER_overwrite,
&new_replicas_sectors);
if (ret)
return ret;
}
int need_rebalance_delta = 0;
s64 need_rebalance_sectors_delta = 0;
s64 s = bch2_bkey_sectors_need_rebalance(c, old);
need_rebalance_delta -= s != 0;
need_rebalance_sectors_delta -= s;
if (mod) { s = bch2_bkey_sectors_need_rebalance(c, old);
need_rebalance_delta += s != 0;
need_rebalance_sectors_delta += s;
if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
new.k->p, mod > 0); new.k->p, need_rebalance_delta > 0);
if (ret) if (ret)
return ret; return ret;
} }
}
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) if (need_rebalance_sectors_delta) {
return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags); struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_rebalance_work,
};
int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
flags & BTREE_TRIGGER_gc);
if (ret)
return ret;
}
}
return 0; return 0;
} }
......
...@@ -107,7 +107,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ...@@ -107,7 +107,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
nr_elements += t->d[i].journal_seq > flushed_seq; nr_elements += t->d[i].journal_seq > flushed_seq;
new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
realloc:
n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
if (!n) { if (!n) {
ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
...@@ -118,6 +118,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ...@@ -118,6 +118,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
if (nr_rehashes_this_size == 3) { if (nr_rehashes_this_size == 3) {
new_bits++; new_bits++;
nr_rehashes_this_size = 0; nr_rehashes_this_size = 0;
kvfree(n);
goto realloc;
} }
nr_rehashes++; nr_rehashes++;
......
...@@ -20,6 +20,76 @@ ...@@ -20,6 +20,76 @@
#include "subvolume.h" #include "subvolume.h"
#include "trace.h" #include "trace.h"
static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr)
bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
}
static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
if (!bch2_dev_tryget(c, ptr->dev)) {
bkey_for_each_ptr(ptrs, ptr2) {
if (ptr2 == ptr)
break;
bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
}
return false;
}
}
return true;
}
static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
}
}
/*
 * Take the nocow lock on the bucket of every pointer in @k.
 *
 * @c:		filesystem
 * @ctxt:	moving context, or NULL
 * @k:		key whose pointers' buckets are to be locked
 *
 * With a @ctxt: for each bucket, wait (via move_ctxt_wait_event()) until
 * either the trylock succeeds or @ctxt->ios is empty; if the lock still was
 * not obtained, fall back to the blocking bch2_bucket_nocow_lock().  This
 * path always ends up holding every lock.
 *
 * Without a @ctxt: only trylock is used.  If a bucket cannot be locked, the
 * locks taken for the earlier pointers are released and false is returned.
 *
 * Devices are looked up with bch2_dev_have_ref(); in bch2_data_update_init()
 * this is called after bkey_get_dev_refs() has succeeded.
 *
 * Returns true when all buckets are locked, false if nothing is left locked.
 */
static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		if (ctxt) {
			bool locked;

			move_ctxt_wait_event(ctxt,
				(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
				list_empty(&ctxt->ios));

			if (!locked)
				bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
		} else {
			if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
				/* Unwind the locks taken for the pointers before @ptr. */
				bkey_for_each_ptr(ptrs, ptr2) {
					if (ptr2 == ptr)
						break;

					/*
					 * The bucket must be computed against
					 * ptr2's own device, not the device of
					 * the pointer that failed, or a
					 * different bucket than the one locked
					 * would be unlocked.
					 */
					struct bch_dev *ca2 = bch2_dev_have_ref(c, ptr2->dev);

					bch2_bucket_nocow_unlock(&c->nocow_locks,
								 PTR_BUCKET_POS(ca2, ptr2), 0);
				}
				return false;
			}
		}
	}

	return true;
}
static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
{ {
if (trace_move_extent_finish_enabled()) { if (trace_move_extent_finish_enabled()) {
...@@ -355,17 +425,11 @@ void bch2_data_update_read_done(struct data_update *m, ...@@ -355,17 +425,11 @@ void bch2_data_update_read_done(struct data_update *m,
void bch2_data_update_exit(struct data_update *update) void bch2_data_update_exit(struct data_update *update)
{ {
struct bch_fs *c = update->op.c; struct bch_fs *c = update->op.c;
struct bkey_ptrs_c ptrs = struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
if (c->opts.nocow_enabled)
bch2_bucket_nocow_unlock(&c->nocow_locks,
PTR_BUCKET_POS(ca, ptr), 0);
bch2_dev_put(ca);
}
if (c->opts.nocow_enabled)
bkey_nocow_unlock(c, k);
bkey_put_dev_refs(c, k);
bch2_bkey_buf_exit(&update->k, c); bch2_bkey_buf_exit(&update->k, c);
bch2_disk_reservation_put(c, &update->op.res); bch2_disk_reservation_put(c, &update->op.res);
bch2_bio_free_pages_pool(c, &update->op.wbio.bio); bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
...@@ -475,6 +539,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, ...@@ -475,6 +539,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
bch2_compression_opt_to_text(out, background_compression(*io_opts)); bch2_compression_opt_to_text(out, background_compression(*io_opts));
prt_newline(out); prt_newline(out);
prt_str(out, "opts.replicas:\t");
prt_u64(out, io_opts->data_replicas);
prt_str(out, "extra replicas:\t"); prt_str(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas); prt_u64(out, data_opts->extra_replicas);
} }
...@@ -543,7 +610,6 @@ int bch2_data_update_init(struct btree_trans *trans, ...@@ -543,7 +610,6 @@ int bch2_data_update_init(struct btree_trans *trans,
const union bch_extent_entry *entry; const union bch_extent_entry *entry;
struct extent_ptr_decoded p; struct extent_ptr_decoded p;
unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
unsigned ptrs_locked = 0;
int ret = 0; int ret = 0;
/* /*
...@@ -554,6 +620,15 @@ int bch2_data_update_init(struct btree_trans *trans, ...@@ -554,6 +620,15 @@ int bch2_data_update_init(struct btree_trans *trans,
if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
return -BCH_ERR_data_update_done; return -BCH_ERR_data_update_done;
if (!bkey_get_dev_refs(c, k))
return -BCH_ERR_data_update_done;
if (c->opts.nocow_enabled &&
!bkey_nocow_lock(c, ctxt, k)) {
bkey_put_dev_refs(c, k);
return -BCH_ERR_nocow_lock_blocked;
}
bch2_bkey_buf_init(&m->k); bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k); bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id; m->btree_id = btree_id;
...@@ -575,40 +650,24 @@ int bch2_data_update_init(struct btree_trans *trans, ...@@ -575,40 +650,24 @@ int bch2_data_update_init(struct btree_trans *trans,
m->op.compression_opt = background_compression(io_opts); m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
bkey_for_each_ptr(ptrs, ptr) {
if (!bch2_dev_tryget(c, ptr->dev)) {
bkey_for_each_ptr(ptrs, ptr2) {
if (ptr2 == ptr)
break;
bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
}
return -BCH_ERR_data_update_done;
}
}
unsigned durability_have = 0, durability_removing = 0; unsigned durability_have = 0, durability_removing = 0;
i = 0; i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); if (!p.ptr.cached) {
struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); rcu_read_lock();
bool locked; if (BIT(i) & m->data_opts.rewrite_ptrs) {
if (crc_is_compressed(p.crc))
rcu_read_lock(); reserve_sectors += k.k->size;
if (((1U << i) & m->data_opts.rewrite_ptrs)) {
BUG_ON(p.ptr.cached); m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
durability_removing += bch2_extent_ptr_desired_durability(c, &p);
if (crc_is_compressed(p.crc)) } else if (!(BIT(i) & m->data_opts.kill_ptrs)) {
reserve_sectors += k.k->size; bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); }
durability_removing += bch2_extent_ptr_desired_durability(c, &p); rcu_read_unlock();
} else if (!p.ptr.cached &&
!((1U << i) & m->data_opts.kill_ptrs)) {
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
} }
rcu_read_unlock();
/* /*
* op->csum_type is normally initialized from the fs/file's * op->csum_type is normally initialized from the fs/file's
...@@ -623,24 +682,6 @@ int bch2_data_update_init(struct btree_trans *trans, ...@@ -623,24 +682,6 @@ int bch2_data_update_init(struct btree_trans *trans,
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true; m->op.incompressible = true;
if (c->opts.nocow_enabled) {
if (ctxt) {
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
bucket, 0)) ||
list_empty(&ctxt->ios));
if (!locked)
bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
} else {
if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
ret = -BCH_ERR_nocow_lock_blocked;
goto err;
}
}
ptrs_locked |= (1U << i);
}
i++; i++;
} }
...@@ -654,16 +695,6 @@ int bch2_data_update_init(struct btree_trans *trans, ...@@ -654,16 +695,6 @@ int bch2_data_update_init(struct btree_trans *trans,
* Increasing replication is an explicit operation triggered by * Increasing replication is an explicit operation triggered by
* rereplicate, currently, so that users don't get an unexpected -ENOSPC * rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/ */
if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
!durability_required) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
goto done;
}
m->op.nr_replicas = min(durability_removing, durability_required) + m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas; m->data_opts.extra_replicas;
...@@ -675,48 +706,38 @@ int bch2_data_update_init(struct btree_trans *trans, ...@@ -675,48 +706,38 @@ int bch2_data_update_init(struct btree_trans *trans,
if (!(durability_have + durability_removing)) if (!(durability_have + durability_removing))
m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
if (!m->op.nr_replicas) { m->op.nr_replicas_required = m->op.nr_replicas;
struct printbuf buf = PRINTBUF;
bch2_data_update_to_text(&buf, m); /*
WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); * It might turn out that we don't need any new replicas, if the
printbuf_exit(&buf); * replicas or durability settings have been changed since the extent
ret = -BCH_ERR_data_update_done; * was written:
goto done; */
if (!m->op.nr_replicas) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
goto out;
} }
m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) { if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas m->data_opts.extra_replicas
? 0 ? 0
: BCH_DISK_RESERVATION_NOFAIL); : BCH_DISK_RESERVATION_NOFAIL);
if (ret) if (ret)
goto err; goto out;
} }
if (bkey_extent_is_unwritten(k)) { if (bkey_extent_is_unwritten(k)) {
bch2_update_unwritten_extent(trans, m); bch2_update_unwritten_extent(trans, m);
goto done; goto out;
} }
return 0; return 0;
err: out:
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
if ((1U << i) & ptrs_locked)
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
bch2_dev_put(ca);
i++;
}
bch2_bkey_buf_exit(&m->k, c);
bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
return ret;
done:
bch2_data_update_exit(m); bch2_data_update_exit(m);
return ret ?: -BCH_ERR_data_update_done; return ret ?: -BCH_ERR_data_update_done;
} }
......
...@@ -1017,6 +1017,8 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc ...@@ -1017,6 +1017,8 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
prt_printf(out, "ptr: %u:%llu:%u gen %u", prt_printf(out, "ptr: %u:%llu:%u gen %u",
ptr->dev, b, offset, ptr->gen); ptr->dev, b, offset, ptr->gen);
if (ca->mi.durability != 1)
prt_printf(out, " d=%u", ca->mi.durability);
if (ptr->cached) if (ptr->cached)
prt_str(out, " cached"); prt_str(out, " cached");
if (ptr->unwritten) if (ptr->unwritten)
...@@ -1377,6 +1379,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) ...@@ -1377,6 +1379,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
return r != NULL; return r != NULL;
} }
/*
 * Sum up how much of extent @k still has to be processed by rebalance for the
 * given @target and @compression options; a value of 0 for either option
 * disables the corresponding pass.
 *
 * Compression pass: every non-cached pointer whose checksum entry records a
 * compression type other than the requested one contributes its
 * crc.compressed_size.  If any pointer is unwritten or marked incompressible,
 * the whole compression contribution is thrown away.
 *
 * Target pass: only run when @target accepts user data; every non-cached
 * pointer on a device outside @target contributes its crc.compressed_size.
 *
 * The two passes are simply added together, so a pointer that matches both
 * criteria is counted in each of them.
 *
 * Returns the accumulated size.
 */
static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
					      unsigned target, unsigned compression)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 nr_sectors = 0;

	if (compression) {
		unsigned wanted_type = bch2_compression_opt_to_type(compression);

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			bool skip_compression =
				p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
				p.ptr.unwritten;

			if (skip_compression) {
				/* drop whatever earlier pointers already added */
				nr_sectors = 0;
				goto check_target;
			}

			if (p.crc.compression_type != wanted_type && !p.ptr.cached)
				nr_sectors += p.crc.compressed_size;
		}
	}
check_target:
	/* nothing more to count without a target that takes user data */
	if (!target || !bch2_target_accepts_data(c, BCH_DATA_user, target))
		return nr_sectors;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (!(p.ptr.cached || bch2_dev_in_target(c, p.ptr.dev, target)))
			nr_sectors += p.crc.compressed_size;
	}

	return nr_sectors;
}
/*
 * Public entry point: look up the rebalance options attached to @k and return
 * the amount of the extent that still needs rebalancing under them.
 *
 * Returns 0 when bch2_bkey_rebalance_opts() finds no options on the key.
 */
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);

	if (!opts)
		return 0;

	return __bch2_bkey_sectors_need_rebalance(c, k, opts->target, opts->compression);
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
struct bch_io_opts *opts) struct bch_io_opts *opts)
{ {
......
...@@ -692,6 +692,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); ...@@ -692,6 +692,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
unsigned, unsigned); unsigned, unsigned);
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
struct bch_io_opts *); struct bch_io_opts *);
......
...@@ -534,7 +534,7 @@ static int __bch2_writepage(struct folio *folio, ...@@ -534,7 +534,7 @@ static int __bch2_writepage(struct folio *folio,
if (f_sectors > w->tmp_sectors) { if (f_sectors > w->tmp_sectors) {
kfree(w->tmp); kfree(w->tmp);
w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
w->tmp_sectors = f_sectors; w->tmp_sectors = f_sectors;
} }
......
...@@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c, ...@@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c,
mutex_lock(&c->sb_lock); mutex_lock(&c->sb_lock);
strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
mutex_unlock(&c->sb_lock);
ret = bch2_write_super(c); ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
mnt_drop_write_file(file); mnt_drop_write_file(file);
return ret; return ret;
......
...@@ -2006,7 +2006,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ...@@ -2006,7 +2006,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret) { if (ret) {
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
ret = -BCH_ERR_fsck_repair_unimplemented; ret = -BCH_ERR_fsck_repair_unimplemented;
ret = 0;
goto err; goto err;
} }
...@@ -2216,6 +2215,8 @@ int bch2_check_xattrs(struct bch_fs *c) ...@@ -2216,6 +2215,8 @@ int bch2_check_xattrs(struct bch_fs *c)
NULL, NULL, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc, BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode))); check_xattr(trans, &iter, k, &hash_info, &inode)));
inode_walker_exit(&inode);
bch_err_fn(c, ret); bch_err_fn(c, ret);
return ret; return ret;
} }
...@@ -2469,8 +2470,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino ...@@ -2469,8 +2470,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
: bch2_inode_unpack(inode_k, &inode); : bch2_inode_unpack(inode_k, &inode);
if (ret) { if (ret) {
/* Should have been caught in dirents pass */ /* Should have been caught in dirents pass */
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_msg(c, ret, "error looking up parent directory");
bch_err(c, "error looking up parent directory: %i", ret);
break; break;
} }
......
...@@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) ...@@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
} }
if (!had_entries) if (!had_entries)
j->last_empty_seq = cur_seq; j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock); spin_lock(&j->lock);
......
...@@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f ...@@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
int ret = -BCH_ERR_invalid_sb_journal; int ret = -BCH_ERR_invalid_sb_journal;
u64 sum = 0;
unsigned nr; unsigned nr;
unsigned i; unsigned i;
struct u64_range *b; struct u64_range *b;
...@@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f ...@@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
for (i = 0; i < nr; i++) { for (i = 0; i < nr; i++) {
b[i].start = le64_to_cpu(journal->d[i].start); b[i].start = le64_to_cpu(journal->d[i].start);
b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
if (b[i].end <= b[i].start) {
prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
le64_to_cpu(journal->d[i].start),
le64_to_cpu(journal->d[i].nr));
goto err;
}
sum += le64_to_cpu(journal->d[i].nr);
} }
sort(b, nr, sizeof(*b), u64_range_cmp, NULL); sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
...@@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f ...@@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
} }
} }
if (sum > UINT_MAX) {
prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
goto err;
}
ret = 0; ret = 0;
err: err:
kfree(b); kfree(b);
......
...@@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg) ...@@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX) if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048; min_member_capacity = 128 * 2048;
bch2_trans_unlock_long(ctxt.trans); move_buckets_wait(&ctxt, buckets, true);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT); MAX_SCHEDULE_TIMEOUT);
} }
......
...@@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) ...@@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *l = *((const struct journal_key **)_l);
const struct journal_key *r = *((const struct journal_key **)_r); const struct journal_key *r = *((const struct journal_key **)_r);
return cmp_int(l->journal_seq, r->journal_seq); /*
* Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
*
* journal_seq == 0 means that the key comes from early repair, and
* should be inserted last so as to avoid overflowing the journal
*/
return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
} }
int bch2_journal_replay(struct bch_fs *c) int bch2_journal_replay(struct bch_fs *c)
...@@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c) ...@@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c)
} }
} }
bch2_trans_unlock_long(trans);
/* /*
* Now, replay any remaining keys in the order in which they appear in * Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go: * the journal, unpinning those journal entries as we go:
......
...@@ -451,7 +451,8 @@ int bch2_replicas_gc2(struct bch_fs *c) ...@@ -451,7 +451,8 @@ int bch2_replicas_gc2(struct bch_fs *c)
.type = BCH_DISK_ACCOUNTING_replicas, .type = BCH_DISK_ACCOUNTING_replicas,
}; };
memcpy(&k.replicas, e, replicas_entry_bytes(e)); unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
"embedded variable length struct");
struct bpos p = disk_accounting_pos_to_bpos(&k); struct bpos p = disk_accounting_pos_to_bpos(&k);
......
...@@ -74,6 +74,9 @@ ...@@ -74,6 +74,9 @@
BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
BCH_FSCK_ERR_accounting_key_junk_at_end) \ BCH_FSCK_ERR_accounting_key_junk_at_end) \
x(disk_accounting_inum, \ x(disk_accounting_inum, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch) \
x(rebalance_work_acct_fix, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch) BCH_FSCK_ERR_accounting_mismatch)
...@@ -108,7 +111,10 @@ ...@@ -108,7 +111,10 @@
BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_replicas_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \
BCH_FSCK_ERR_accounting_replicas_not_marked, \ BCH_FSCK_ERR_accounting_replicas_not_marked, \
BCH_FSCK_ERR_bkey_version_in_future) BCH_FSCK_ERR_bkey_version_in_future) \
x(rebalance_work_acct_fix, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch)
struct upgrade_downgrade_entry { struct upgrade_downgrade_entry {
u64 recovery_passes; u64 recovery_passes;
......
...@@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats ...@@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, TABSTOP_SIZE + 2); printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
prt_printf(out, "\tsince mount\r\trecent\r\n"); prt_printf(out, "\tsince mount\r\trecent\r\n");
prt_printf(out, "recent");
printbuf_tabstops_reset(out); printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, out->indent + 20); printbuf_tabstop_push(out, out->indent + 20);
......
...@@ -612,10 +612,20 @@ static int bch2_xattr_bcachefs_get_effective( ...@@ -612,10 +612,20 @@ static int bch2_xattr_bcachefs_get_effective(
name, buffer, size, true); name, buffer, size, true);
} }
/*
 * Setter for the "bcachefs_effective." xattr handler: a deliberate no-op.
 *
 * Xattrs in the bcachefs_effective namespace are inherited, so there is
 * nothing to store here: every argument is ignored and 0 is always returned.
 */
static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
struct mnt_idmap *idmap,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
{
return 0;
}
static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
.prefix = "bcachefs_effective.", .prefix = "bcachefs_effective.",
.get = bch2_xattr_bcachefs_get_effective, .get = bch2_xattr_bcachefs_get_effective,
.set = bch2_xattr_bcachefs_set, .set = bch2_xattr_bcachefs_set_effective,
}; };
#endif /* NO_BCACHEFS_FS */ #endif /* NO_BCACHEFS_FS */
......
...@@ -1802,8 +1802,8 @@ int vfs_get_tree(struct fs_context *fc) ...@@ -1802,8 +1802,8 @@ int vfs_get_tree(struct fs_context *fc)
return error; return error;
if (!fc->root) { if (!fc->root) {
pr_err("Filesystem %s get_tree() didn't set fc->root\n", pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n",
fc->fs_type->name); fc->fs_type->name, error);
/* We don't know what the locking state of the superblock is - /* We don't know what the locking state of the superblock is -
* if there is a superblock. * if there is a superblock.
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment