Commit 460651ee authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Various improvements to bch2_alloc_write()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 932aa837
@@ -129,15 +129,21 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
*p += bytes;
}
struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
struct bkey_alloc_unpacked ret = { .gen = a->gen };
struct bkey_alloc_unpacked ret = { .gen = 0 };
if (k.k->type == KEY_TYPE_alloc) {
const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
const void *d = a->data;
unsigned idx = 0;
ret.gen = a->gen;
#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
BCH_ALLOC_FIELDS()
#undef x
}
return ret;
}
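With the new signature, bch2_alloc_unpack() takes any bkey_s_c and performs the KEY_TYPE_alloc check itself, so callers (see the bch2_mark_alloc and bch2_trans_mark_pointer hunks further down) no longer need bkey_s_c_to_alloc() or a zeroing fallback. A minimal caller sketch, assuming an iterator already positioned on the alloc btree; variable names are illustrative only:

	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
	struct bkey_alloc_unpacked u;
	int ret = bkey_err(k);

	if (ret)
		return ret;

	/* non-alloc keys (e.g. deleted slots) unpack to gen 0 and zeroed fields */
	u = bch2_alloc_unpack(k);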
@@ -199,66 +205,18 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
get_alloc_field(a.v, &d, i));
}
static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
{
const void *d = a->data;
unsigned idx = 0, data_type, dirty_sectors, cached_sectors;
struct bucket_mark m;
g->io_time[READ] = get_alloc_field(a, &d, idx++);
g->io_time[WRITE] = get_alloc_field(a, &d, idx++);
data_type = get_alloc_field(a, &d, idx++);
dirty_sectors = get_alloc_field(a, &d, idx++);
cached_sectors = get_alloc_field(a, &d, idx++);
g->oldest_gen = get_alloc_field(a, &d, idx++);
bucket_cmpxchg(g, m, ({
m.gen = a->gen;
m.data_type = data_type;
m.dirty_sectors = dirty_sectors;
m.cached_sectors = cached_sectors;
}));
g->gen_valid = 1;
}
static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
struct bucket_mark m)
static inline struct bkey_alloc_unpacked
alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
{
unsigned idx = 0;
void *d = a->v.data;
a->v.fields = 0;
a->v.gen = m.gen;
d = a->v.data;
put_alloc_field(a, &d, idx++, g->io_time[READ]);
put_alloc_field(a, &d, idx++, g->io_time[WRITE]);
put_alloc_field(a, &d, idx++, m.data_type);
put_alloc_field(a, &d, idx++, m.dirty_sectors);
put_alloc_field(a, &d, idx++, m.cached_sectors);
put_alloc_field(a, &d, idx++, g->oldest_gen);
set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
}
static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
{
struct bch_dev *ca;
struct bkey_s_c_alloc a;
if (k.k->type != KEY_TYPE_alloc)
return;
a = bkey_s_c_to_alloc(k);
ca = bch_dev_bkey_exists(c, a.k->p.inode);
if (a.k->p.offset >= ca->mi.nbuckets)
return;
percpu_down_read(&c->mark_lock);
__alloc_read_key(bucket(ca, a.k->p.offset), a.v);
percpu_up_read(&c->mark_lock);
return (struct bkey_alloc_unpacked) {
.gen = m.gen,
.oldest_gen = g->oldest_gen,
.data_type = m.data_type,
.dirty_sectors = m.dirty_sectors,
.cached_sectors = m.cached_sectors,
.read_time = g->io_time[READ],
.write_time = g->io_time[WRITE],
};
}
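alloc_mem_to_key() is the write-side counterpart: it converts the in-memory bucket state (struct bucket plus its bucket_mark) into the same unpacked form, which bch2_alloc_write() and bch2_invalidate_one_bucket2() below then hand to bch2_alloc_pack(). A condensed sketch of that pattern, mirroring the hunks further down (surrounding declarations assumed):

	percpu_down_read(&c->mark_lock);
	g = bucket(ca, b);
	m = READ_ONCE(g->mark);
	u = alloc_mem_to_key(g, m);
	percpu_up_read(&c->mark_lock);

	a = bkey_alloc_init(&alloc_key.k);
	a->k.p = iter->pos;
	bch2_alloc_pack(a, u);
	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i));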
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
@@ -274,7 +232,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
bch2_alloc_read_key(c, k);
bch2_mark_key(c, k, true, 0, NULL, 0, 0);
ret = bch2_trans_exit(&trans) ?: ret;
if (ret) {
@@ -284,7 +242,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
for_each_journal_key(*journal_keys, j)
if (j->btree_id == BTREE_ID_ALLOC)
bch2_alloc_read_key(c, bkey_i_to_s_c(j->k));
bch2_mark_key(c, bkey_i_to_s_c(j->k),
true, 0, NULL, 0, 0);
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
@@ -352,81 +311,32 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
return ret;
}
static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca,
size_t b, struct btree_iter *iter,
unsigned flags)
{
struct bch_fs *c = trans->c;
#if 0
__BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
#else
/* hack: */
__BKEY_PADDED(k, 8) alloc_key;
#endif
struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
struct bucket *g;
struct bucket_mark m, new;
int ret;
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
a->k.p = POS(ca->dev_idx, b);
bch2_btree_iter_set_pos(iter, a->k.p);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
percpu_down_read(&c->mark_lock);
g = bucket(ca, b);
m = READ_ONCE(g->mark);
if (!m.dirty) {
percpu_up_read(&c->mark_lock);
return 0;
}
__alloc_write_key(a, g, m);
percpu_up_read(&c->mark_lock);
bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOMARK|
flags);
if (ret)
return ret;
new = m;
new.dirty = false;
atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
if (ca->buckets_written)
set_bit(b, ca->buckets_written);
return 0;
}
int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bucket_array *buckets;
struct bch_dev *ca;
struct bucket *g;
struct bucket_mark m, new;
struct bkey_alloc_unpacked old_u, new_u;
__BKEY_PADDED(k, 8) alloc_key; /* hack: */
struct bkey_i_alloc *a;
struct bkey_s_c k;
unsigned i;
size_t b;
int ret = 0;
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
bch2_trans_init(&trans, c);
iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
for_each_rw_member(ca, c, i) {
relock:
down_read(&ca->bucket_lock);
restart:
buckets = bucket_array(ca);
for (b = buckets->first_bucket;
@@ -435,27 +345,70 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
if (!buckets->b[b].mark.dirty)
continue;
bch2_btree_iter_set_pos(iter, POS(i, b));
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
old_u = bch2_alloc_unpack(k);
percpu_down_read(&c->mark_lock);
g = bucket(ca, b);
m = READ_ONCE(g->mark);
new_u = alloc_mem_to_key(g, m);
percpu_up_read(&c->mark_lock);
if (!m.dirty)
continue;
if ((flags & BTREE_INSERT_LAZY_RW) &&
percpu_ref_is_zero(&c->writes)) {
up_read(&ca->bucket_lock);
bch2_trans_unlock(&trans);
ret = bch2_fs_read_write_early(c);
down_read(&ca->bucket_lock);
if (ret)
goto out;
goto relock;
goto err;
goto restart;
}
ret = __bch2_alloc_write_key(&trans, ca, b,
iter, flags);
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, new_u);
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOMARK|
flags);
err:
if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
bch_err(c, "error %i writing alloc info", ret);
printk(KERN_CONT "dev %llu bucket %llu\n",
iter->pos.inode, iter->pos.offset);
printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen);
#define x(_name, _bits) printk(KERN_CONT #_name " %u -> %u\n", old_u._name, new_u._name);
BCH_ALLOC_FIELDS()
#undef x
}
if (ret)
break;
new = m;
new.dirty = false;
atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
if (ca->buckets_written)
set_bit(b, ca->buckets_written);
bch2_trans_cond_resched(&trans);
*wrote = true;
}
up_read(&ca->bucket_lock);
out:
if (ret) {
percpu_ref_put(&ca->io_ref);
break;
@@ -922,6 +875,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bkey_i_alloc *a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
struct bkey_s_c k;
bool invalidating_cached_data;
@@ -941,7 +895,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
BUG_ON(!fifo_push(&ca->free_inc, b));
bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
m = bucket(ca, b)->mark;
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->mark_lock);
@@ -955,27 +908,26 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
if (ret)
return ret;
if (k.k && k.k->type == KEY_TYPE_alloc)
u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
else
memset(&u, 0, sizeof(u));
/*
* The allocator has to start before journal replay is finished - thus,
* we have to trust the in memory bucket @m, not the version in the
* btree:
*/
percpu_down_read(&c->mark_lock);
g = bucket(ca, b);
m = READ_ONCE(g->mark);
u = alloc_mem_to_key(g, m);
percpu_up_read(&c->mark_lock);
invalidating_cached_data = m.cached_sectors != 0;
u.gen++;
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
u.read_time = c->bucket_clock[READ].hand;
u.write_time = c->bucket_clock[WRITE].hand;
/*
* The allocator has to start before journal replay is finished - thus,
* we have to trust the in memory bucket @m, not the version in the
* btree:
*/
//BUG_ON(u.dirty_sectors);
u.gen = m.gen + 1;
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
......
@@ -13,7 +13,7 @@ struct bkey_alloc_unpacked {
#undef x
};
struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *);
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
void bch2_alloc_pack(struct bkey_i_alloc *,
const struct bkey_alloc_unpacked);
......
@@ -649,9 +649,13 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
if (flags & BCH_BUCKET_MARK_GC)
return 0;
u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
ca = bch_dev_bkey_exists(c, k.k->p.inode);
if (k.k->p.offset >= ca->mi.nbuckets)
return 0;
g = __bucket(ca, k.k->p.offset, gc);
u = bch2_alloc_unpack(k);
old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
m.gen = u.gen;
@@ -1381,7 +1385,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
goto out;
}
u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
u = bch2_alloc_unpack(k);
if (gen_after(u.gen, p.ptr.gen)) {
ret = 1;
......
@@ -1234,11 +1234,6 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
return ret;
}
static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
{
bch2_mark_key(c, k, true, 0, NULL, 0, 0);
}
int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
struct journal_key *i;
@@ -1254,7 +1249,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret)
bch2_stripe_read_key(c, k);
bch2_mark_key(c, k, true, 0, NULL, 0, 0);
ret = bch2_trans_exit(&trans) ?: ret;
if (ret) {
@@ -1264,7 +1259,8 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
for_each_journal_key(*journal_keys, i)
if (i->btree_id == BTREE_ID_EC)
bch2_stripe_read_key(c, bkey_i_to_s_c(i->k));
bch2_mark_key(c, bkey_i_to_s_c(i->k),
true, 0, NULL, 0, 0);
return 0;
}
......
@@ -947,7 +947,6 @@ static void journal_write_done(struct closure *cl)
return;
err:
bch2_fatal_error(c);
bch2_journal_halt(j);
spin_lock(&j->lock);
goto out;
}
@@ -1059,7 +1058,6 @@ void bch2_journal_write(struct closure *cl)
spin_unlock(&j->lock);
if (ret) {
bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
......
@@ -198,17 +198,14 @@ static void __bch2_fs_read_only(struct bch_fs *c)
do {
wrote = false;
ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
if (ret) {
bch2_fs_inconsistent(c, "error writing out stripes");
break;
}
ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
ret = bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
if (ret) {
if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
if (ret)
break;
}
for_each_member_device(ca, c, i)
bch2_dev_allocator_quiesce(c, ca);
......