Commit 7f4e1d5d authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: KEY_TYPE_alloc_v2

This introduces a new version of KEY_TYPE_alloc, which uses the new
varint encoding introduced for inodes. This means we'll eventually be
able to support much larger bucket sizes (for SMR devices), and the
read/write time fields are expanded to 64 bits, which will be used in
the next patch to get rid of the periodic rescaling of those fields.

Also, for buckets that are members of erasure-coded stripes, this adds
persistent fields for the index of the stripe they belong to and for the
stripe redundancy. This is part of the work to avoid having to scan and
read the alloc and stripes btrees into memory at mount time.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 26452d1d
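
The key change is the new varint-encoded alloc value driven by an x-macro field list (BCH_ALLOC_FIELDS_V2() in the diff below). As a rough, self-contained sketch of that pattern (all names here are hypothetical, not the kernel's), one list can generate both the unpacked in-memory struct and a per-field enum, so adding a field such as stripe_redundancy only touches the list itself:

#include <stdint.h>

/*
 * Illustrative sketch of the x-macro pattern used by BCH_ALLOC_FIELDS_V2()
 * in the diff below; the names are hypothetical, not the kernel's.
 */
#define EXAMPLE_ALLOC_FIELDS()			\
	x(read_time,		64)		\
	x(write_time,		64)		\
	x(dirty_sectors,	16)		\
	x(cached_sectors,	16)		\
	x(stripe,		32)		\
	x(stripe_redundancy,	8)

/* One expansion produces the unpacked in-memory representation... */
struct example_alloc_unpacked {
#define x(_name, _bits)	uint##_bits##_t _name;
	EXAMPLE_ALLOC_FIELDS()
#undef x
};

/* ...and another produces a per-field enum from the same list. */
enum example_alloc_field {
#define x(_name, _bits)	EXAMPLE_ALLOC_FIELD_##_name,
	EXAMPLE_ALLOC_FIELDS()
#undef x
	EXAMPLE_ALLOC_FIELD_NR,
};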
@@ -7,12 +7,33 @@
 #include "debug.h"

 struct bkey_alloc_unpacked {
+	u64		bucket;
+	u8		dev;
 	u8		gen;
+	u8		oldest_gen;
+	u8		data_type;
 #define x(_name, _bits)	u##_bits _name;
-	BCH_ALLOC_FIELDS()
+	BCH_ALLOC_FIELDS_V2()
 #undef x
 };
+
+struct bkey_alloc_buf {
+	struct bkey_i	k;
+
+	union {
+	struct {
+#define x(_name, _bits)		+ _bits / 8
+	u8		_pad[8 + BCH_ALLOC_FIELDS_V1()];
+#undef x
+	} _v1;
+	struct {
+#define x(_name, _bits)		+ 8 + _bits / 8
+	u8		_pad[8 + BCH_ALLOC_FIELDS_V2()];
+#undef x
+	} _v2;
+	};
+} __attribute__((packed, aligned(8)));

 /* How out of date a pointer gen is allowed to be: */
 #define BUCKET_GC_GEN_MAX	96U

@@ -20,23 +41,28 @@ struct bkey_alloc_unpacked {
 static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
					   struct bkey_alloc_unpacked r)
 {
-	return l.gen != r.gen
-#define x(_name, _bits)	|| l._name != r._name
-	BCH_ALLOC_FIELDS()
+	return l.gen != r.gen ||
+	       l.oldest_gen != r.oldest_gen ||
+	       l.data_type != r.data_type
+#define x(_name, ...)	|| l._name != r._name
+	BCH_ALLOC_FIELDS_V2()
 #undef x
 	;
 }

 struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-void bch2_alloc_pack(struct bkey_i_alloc *,
+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
		     const struct bkey_alloc_unpacked);

 int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);

 static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
+alloc_mem_to_key(struct btree_iter *iter,
+		 struct bucket *g, struct bucket_mark m)
 {
 	return (struct bkey_alloc_unpacked) {
+		.dev		= iter->pos.inode,
+		.bucket		= iter->pos.offset,
 		.gen		= m.gen,
 		.oldest_gen	= g->oldest_gen,
 		.data_type	= m.data_type,

@@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m)

 #define ALLOC_SCAN_BATCH(ca)		max_t(size_t, 1, (ca)->mi.nbuckets >> 9)

-const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

 #define bch2_bkey_ops_alloc (struct bkey_ops) {		\
-	.key_invalid	= bch2_alloc_invalid,		\
+	.key_invalid	= bch2_alloc_v1_invalid,	\
+	.val_to_text	= bch2_alloc_to_text,		\
+}
+
+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) {	\
+	.key_invalid	= bch2_alloc_v2_invalid,	\
 	.val_to_text	= bch2_alloc_to_text,		\
 }
...
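
The _pad arrays in struct bkey_alloc_buf above are sized by letting the x-macro expand each field to "+ <bytes>", which the preprocessor turns into a compile-time sum. A minimal sketch of that trick, with hypothetical names:

/*
 * Sketch of the _pad sizing trick in struct bkey_alloc_buf above; names
 * here are hypothetical, not the kernel's.
 */
#define EXAMPLE_FIELDS()	\
	x(read_time,  64)	\
	x(write_time, 64)	\
	x(stripe,     32)

#define x(_name, _bits)	+ _bits / 8
static unsigned char example_pad[8 + EXAMPLE_FIELDS()];	/* 8 + 8 + 8 + 4 == 28 bytes */
#undef x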
@@ -345,7 +345,8 @@ static inline void bkey_init(struct bkey *k)
 	x(reflink_v,		16)			\
 	x(inline_data,		17)			\
 	x(btree_ptr_v2,		18)			\
-	x(indirect_inline_data, 19)
+	x(indirect_inline_data, 19)			\
+	x(alloc_v2,		20)

 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name	= nr,

@@ -555,9 +556,11 @@ struct bch_extent_stripe_ptr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	__u64			type:5,
				block:8,
-				idx:51;
+				redundancy:4,
+				idx:47;
 #elif defined (__BIG_ENDIAN_BITFIELD)
-	__u64			idx:51,
+	__u64			idx:47,
+				redundancy:4,
				block:8,
				type:5;
 #endif

@@ -803,35 +806,40 @@ struct bch_alloc {
 	__u8			data[];
 } __attribute__((packed, aligned(8)));

-#define BCH_ALLOC_FIELDS()			\
+#define BCH_ALLOC_FIELDS_V1()			\
 	x(read_time,		16)		\
 	x(write_time,		16)		\
 	x(data_type,		8)		\
 	x(dirty_sectors,	16)		\
 	x(cached_sectors,	16)		\
-	x(oldest_gen,		8)
+	x(oldest_gen,		8)		\
+	x(stripe,		32)		\
+	x(stripe_redundancy,	8)
+
+struct bch_alloc_v2 {
+	struct bch_val		v;
+	__u8			nr_fields;
+	__u8			gen;
+	__u8			oldest_gen;
+	__u8			data_type;
+	__u8			data[];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_FIELDS_V2()			\
+	x(read_time,		64)		\
+	x(write_time,		64)		\
+	x(dirty_sectors,	16)		\
+	x(cached_sectors,	16)		\
+	x(stripe,		32)		\
+	x(stripe_redundancy,	8)

 enum {
-#define x(name, bytes) BCH_ALLOC_FIELD_##name,
-	BCH_ALLOC_FIELDS()
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+	BCH_ALLOC_FIELDS_V1()
 #undef x
 	BCH_ALLOC_FIELD_NR
 };

-static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
-	BCH_ALLOC_FIELDS()
-#undef x
-};
-
-#define x(name, bits) + (bits / 8)
-static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
-	DIV_ROUND_UP(offsetof(struct bch_alloc, data)
-		     BCH_ALLOC_FIELDS(), sizeof(u64));
-#undef x
-
-#define BKEY_ALLOC_U64s_MAX	(BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
-
 /* Quotas: */

 enum quota_types {

@@ -1337,7 +1345,8 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
 	x(btree_updates_journalled,	13)	\
 	x(reflink_inline_data,		14)	\
 	x(new_varint,			15)	\
-	x(journal_no_flush,		16)
+	x(journal_no_flush,		16)	\
+	x(alloc_v2,			17)

 #define BCH_SB_FEATURES_ALL				\
	((1ULL << BCH_FEATURE_new_siphash)|		\

@@ -1345,7 +1354,8 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
	 (1ULL << BCH_FEATURE_btree_ptr_v2)|		\
	 (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
	 (1ULL << BCH_FEATURE_new_varint)|		\
-	 (1ULL << BCH_FEATURE_journal_no_flush))
+	 (1ULL << BCH_FEATURE_journal_no_flush)|	\
+	 (1ULL << BCH_FEATURE_alloc_v2))

 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
...
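
Note the bch_extent_stripe_ptr layout above: with 4 bits taken for redundancy, idx shrinks from 51 to 47 bits, but the entry still fits in one 64-bit word. A small standalone sketch, assuming C11 and GCC/Clang bitfield packing:

#include <stdint.h>

/*
 * Standalone sketch: 5 + 8 + 4 + 47 = 64 bits, so the narrowed stripe
 * pointer still packs into a single u64 and idx can address up to 2^47
 * stripes. Names are illustrative, not the kernel's.
 */
struct example_stripe_ptr {
	uint64_t	type:5,
			block:8,
			redundancy:4,
			idx:47;
};

_Static_assert(sizeof(struct example_stripe_ptr) == sizeof(uint64_t),
	       "stripe pointer must remain a single u64");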
@@ -538,6 +538,7 @@ BKEY_VAL_ACCESSORS(reflink_v);
 BKEY_VAL_ACCESSORS(inline_data);
 BKEY_VAL_ACCESSORS(btree_ptr_v2);
 BKEY_VAL_ACCESSORS(indirect_inline_data);
+BKEY_VAL_ACCESSORS(alloc_v2);

 /* byte order helpers */
...
@@ -41,7 +41,8 @@ struct bucket {
 	u8			oldest_gen;
 	u8			gc_gen;
 	unsigned		gen_valid:1;
-	u8			ec_redundancy;
+	u8			stripe_redundancy;
+	u32			stripe;
 };

 struct bucket_array {
...
@@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

+	if (!bkey_cmp(k.k->p, POS_MIN))
+		return "stripe at pos 0";
+
 	if (k.k->p.inode)
		return "invalid stripe key";

@@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
			struct bch_csum got = ec_block_checksum(buf, i, offset);

			if (bch2_crc_cmp(want, got)) {
+				char buf2[200];
+
+				bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
+
				bch_err_ratelimited(c,
-					"stripe checksum error at %u:%u: csum type %u, expected %llx got %llx",
-					i, j, v->csum_type,
-					want.lo, got.lo);
+					"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
+					(void *) _RET_IP_, i, j, v->csum_type,
+					want.lo, got.lo, buf2);
				clear_bit(i, buf->valid);
				break;
			}

@@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
 static void ec_block_endio(struct bio *bio)
 {
 	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+	struct bch_stripe *v = &ec_bio->buf->key.v;
+	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
 	struct bch_dev *ca = ec_bio->ca;
 	struct closure *cl = bio->bi_private;

@@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio)
			       bch2_blk_status_to_str(bio->bi_status)))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

+	if (ptr_stale(ca, ptr)) {
+		bch_err_ratelimited(ca->fs,
+				    "error %s stripe: stale pointer after io",
+				    bio_data_dir(bio) == READ ? "reading from" : "writing to");
+		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+	}
+
 	bio_put(&ec_bio->bio);
 	percpu_ref_put(&ca->io_ref);
 	closure_put(cl);

@@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c,
 static int ec_stripe_delete(struct bch_fs *c, size_t idx)
 {
-	//pr_info("deleting stripe %zu", idx);

 	return bch2_btree_delete_range(c, BTREE_ID_EC,
				       POS(0, idx),
				       POS(0, idx + 1),

@@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
	*dst = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block		= block,
+		.redundancy	= s->key.v.nr_redundant,
		.idx		= s->key.k.p.offset,
	};
 }

@@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
 	if (!ob)
		return;

-	//pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
-
 	ec = ob->ec;
 	mutex_lock(&ec->lock);

@@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c,
 	struct stripe *m;
 	size_t heap_idx;
 	u64 stripe_idx;
+	s64 ret = -1;

 	if (may_create_new_stripe(c))
		return -1;

 	spin_lock(&c->ec_stripes_heap_lock);
 	for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+		/* No blocks worth reusing, stripe will just be deleted: */
		if (!h->data[heap_idx].blocks_nonempty)
			continue;

@@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c,
		    m->sectors == head->blocksize &&
		    m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
			bch2_stripes_heap_del(c, m, stripe_idx);
-			spin_unlock(&c->ec_stripes_heap_lock);
-			return stripe_idx;
+			ret = stripe_idx;
+			break;
		}
	}
-
 	spin_unlock(&c->ec_stripes_heap_lock);
-	return -1;
+	return ret;
 }

 struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
...
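
The get_existing_stripe() hunk above replaces the early return from inside the loop with a single-exit pattern, so c->ec_stripes_heap_lock is released in exactly one place. A minimal sketch of that pattern with hypothetical names, using a pthread mutex in place of the kernel spinlock:

#include <pthread.h>
#include <stddef.h>

struct example_heap {
	pthread_mutex_t	lock;
	size_t		used;
	unsigned	blocks_nonempty[16];
};

/* Record the result, break out of the loop, drop the lock on one path. */
static long example_get_reusable(struct example_heap *h)
{
	long ret = -1;
	size_t i;

	pthread_mutex_lock(&h->lock);
	for (i = 0; i < h->used; i++)
		if (h->blocks_nonempty[i]) {	/* worth reusing */
			ret = (long) i;
			break;
		}
	pthread_mutex_unlock(&h->lock);

	return ret;
}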
@@ -703,14 +703,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
		if (p.ptr.cached)
			continue;

-		if (p.has_ec) {
-			struct stripe *s =
-				genradix_ptr(&c->stripes[0], p.ec.idx);
-
-			WARN_ON(!s);
-			if (s)
-				replicas += s->nr_redundant;
-		}
+		if (p.has_ec)
+			replicas += p.ec.redundancy;

		replicas++;

@@ -733,16 +727,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
 	if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
		durability = max_t(unsigned, durability, ca->mi.durability);

-	if (p.has_ec) {
-		struct stripe *s =
-			genradix_ptr(&c->stripes[0], p.ec.idx);
-
-		if (WARN_ON(!s))
-			goto out;
-
-		durability += s->nr_redundant;
-	}
-out:
+	if (p.has_ec)
+		durability += p.ec.redundancy;
+
 	return durability;
 }
...
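
With the redundancy stored in the extent's stripe pointer, replica and durability counts no longer need a lookup into the in-memory stripes radix tree. A condensed sketch of the resulting accounting, with illustrative types rather than the kernel's:

#include <stdbool.h>

struct example_ptr_decoded {
	bool		cached;
	bool		has_ec;
	unsigned	ec_redundancy;	/* mirrors bch_extent_stripe_ptr.redundancy */
	unsigned	dev_durability;
};

/* Durability is the device's own durability plus the stripe redundancy. */
static unsigned example_ptr_durability(const struct example_ptr_decoded *p)
{
	unsigned durability = p->dev_durability;

	if (p->has_ec)
		durability += p->ec_redundancy;

	return durability;
}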
@@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
	data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
	data_opts->rewrite_dev		= p.ptr.dev;

-	if (p.has_ec) {
-		struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
-
-		data_opts->nr_replicas += m->nr_redundant;
-	}
+	if (p.has_ec)
+		data_opts->nr_replicas += p.ec.redundancy;

	return DATA_REWRITE;
 }

@@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c)
		    bucket_sectors_used(m) >= ca->mi.bucket_size)
			continue;

-		WARN_ON(m.stripe && !g->ec_redundancy);
+		WARN_ON(m.stripe && !g->stripe_redundancy);

		e = (struct copygc_heap_entry) {
			.dev			= dev_idx,
			.gen			= m.gen,
-			.replicas		= 1 + g->ec_redundancy,
+			.replicas		= 1 + g->stripe_redundancy,
			.fragmentation		= bucket_sectors_used(m) * (1U << 15)
						  / ca->mi.bucket_size,
			.sectors		= bucket_sectors_used(m),
...