Commit 4be1a412 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Inline data extents

This implements extents that store their data inline, in the bkey value,
instead of the value holding pointers to the data. The read and write
paths are updated to read from these new extent types, and to write them
out when the write size is small enough.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 08c07fea
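The idea in brief: for a small enough write, the bkey value stores the file data itself, padded out to the 8-byte (u64) units that bkey values are sized in, instead of storing pointers to where the data lives on disk. A loose standalone sketch of that size accounting (the demo_ names are hypothetical, not bcachefs API):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* "Write" side: copy the payload into the value, pad to a u64 boundary. */
    static unsigned demo_make_inline_val(uint8_t *val, const void *data,
                                         unsigned len)
    {
            memcpy(val, data, len);
            while (len & 7)                 /* same padding trick as the patch */
                    val[len++] = '\0';
            return len / 8;                 /* value size, in u64s */
    }

    int main(void)
    {
            uint8_t val[16];

            /* 5 bytes of data pad out to 8, i.e. one u64 of value */
            printf("val_u64s = %u\n", demo_make_inline_val(val, "hello", 5));
            return 0;
    }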
@@ -342,7 +342,8 @@ static inline void bkey_init(struct bkey *k)
 	x(quota,		13)			\
 	x(stripe,		14)			\
 	x(reflink_p,		15)			\
-	x(reflink_v,		16)
+	x(reflink_v,		16)			\
+	x(inline_data,		17)

 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name	= nr,
@@ -915,6 +916,13 @@ struct bch_reflink_v {
 	__u64			_data[0];
 };

+/* Inline data */
+
+struct bch_inline_data {
+	struct bch_val		v;
+	u8			data[0];
+};
+
 /* Optional/variable size superblock sections: */

 struct bch_sb_field {
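Note that bch_inline_data carries no explicit length field: the payload size is implied by the key's value size, which bcachefs tracks in u64s (hence the padding on the write side). A hedged sketch of the byte/u64 conversions this relies on (illustrative helpers, not the real bkey_val_bytes()/set_bkey_val_bytes()):

    /* value sizes live in the key as u64 counts; bytes are derived */
    static inline unsigned demo_val_bytes(unsigned val_u64s)
    {
            return val_u64s * 8;
    }

    /* rounding up mirrors BKEY_U64s + DIV_ROUND_UP(data_len, 8) below */
    static inline unsigned demo_val_u64s(unsigned bytes)
    {
            return (bytes + 7) / 8;
    }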
@@ -1319,6 +1327,7 @@ enum bch_sb_features {
 	BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
 	BCH_FEATURE_REFLINK		= 6,
 	BCH_FEATURE_NEW_SIPHASH		= 7,
+	BCH_FEATURE_INLINE_DATA		= 8,
 	BCH_FEATURE_NR,
 };
......
@@ -572,6 +572,7 @@ BKEY_VAL_ACCESSORS(quota);
 BKEY_VAL_ACCESSORS(stripe);
 BKEY_VAL_ACCESSORS(reflink_p);
 BKEY_VAL_ACCESSORS(reflink_v);
+BKEY_VAL_ACCESSORS(inline_data);

 /* byte order helpers */
......
@@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c,
 	.key_invalid = empty_val_key_invalid,		\
 }

+static const char *key_type_inline_data_invalid(const struct bch_fs *c,
+						struct bkey_s_c k)
+{
+	return NULL;
+}
+
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+					 struct bkey_s_c k)
+{
+	pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k));
+}
+
+static const struct bkey_ops bch2_bkey_ops_inline_data = {
+	.key_invalid	= key_type_inline_data_invalid,
+	.val_to_text	= key_type_inline_data_to_text,
+};
+
 static const struct bkey_ops bch2_bkey_ops[] = {
 #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
 	BCH_BKEY_TYPES()
@@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 	if (k.k->u64s < BKEY_U64s)
 		return "u64s too small";

-	if ((btree_node_type_is_extents(type) ||
-	     type == BKEY_TYPE_BTREE) &&
-	    bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+	if (type == BKEY_TYPE_BTREE &&
+	    bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
 		return "value too big";

 	if (btree_node_type_is_extents(type)) {
......
@@ -737,11 +737,6 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
 	}

 	switch (k.k->type) {
-	case KEY_TYPE_deleted:
-	case KEY_TYPE_discard:
-	case KEY_TYPE_error:
-	case KEY_TYPE_cookie:
-		break;
 	case KEY_TYPE_extent:
 	case KEY_TYPE_reflink_v: {
 		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
@@ -779,10 +774,18 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
 		le64_add_cpu(&p.v->idx, sub);
 		break;
 	}
-	case KEY_TYPE_reservation:
+	case KEY_TYPE_inline_data: {
+		struct bkey_s_inline_data d = bkey_s_to_inline_data(k);
+
+		sub = min_t(u64, sub << 9, bkey_val_bytes(d.k));
+
+		memmove(d.v->data,
+			d.v->data + sub,
+			bkey_val_bytes(d.k) - sub);
+
+		new_val_u64s -= sub >> 3;
 		break;
-	default:
-		BUG();
+	}
 	}

 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
@@ -814,6 +817,12 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
 		new_val_u64s = 0;
 	}

+	switch (k.k->type) {
+	case KEY_TYPE_inline_data:
+		new_val_u64s = min(new_val_u64s, k.k->size << 6);
+		break;
+	}
+
 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
 	BUG_ON(val_u64s_delta < 0);
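Unit check on the trim arithmetic above, since three units meet here: `sub` starts in 512-byte sectors, `sub << 9` converts it to bytes, and once clamped to the value size, `sub >> 3` converts those bytes to u64s. On the cut_back side, `k.k->size << 6` is sectors times 64, the number of u64s per 512-byte sector. A tiny standalone check of the equivalence:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t sectors = 3;
            uint64_t bytes   = sectors << 9;   /* 3 sectors = 1536 bytes */
            uint64_t u64s    = bytes >> 3;     /* 1536 bytes = 192 u64s  */

            assert(u64s == sectors << 6);      /* 512 / 8 = 64 u64s per sector */
            return 0;
    }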
......
@@ -456,6 +456,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k)
 static inline bool bkey_extent_is_data(const struct bkey *k)
 {
 	return bkey_extent_is_direct_data(k) ||
+		k->type == KEY_TYPE_inline_data ||
 		k->type == KEY_TYPE_reflink_p;
 }

@@ -469,6 +470,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
 	case KEY_TYPE_reservation:
 	case KEY_TYPE_reflink_p:
 	case KEY_TYPE_reflink_v:
+	case KEY_TYPE_inline_data:
 		return true;
 	default:
 		return false;
......
@@ -990,6 +990,18 @@ static void bch2_writepage_io_done(struct closure *cl)
 		}
 	}

+	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+		bio_for_each_segment_all(bvec, bio, iter) {
+			struct bch_page_state *s;
+
+			s = __bch2_page_state(bvec->bv_page);
+			spin_lock(&s->lock);
+			for (i = 0; i < PAGE_SECTORS; i++)
+				s->s[i].nr_replicas = 0;
+			spin_unlock(&s->lock);
+		}
+	}
+
 	/*
 	 * racing with fallocate can cause us to add fewer sectors than
 	 * expected - but we shouldn't add more sectors than expected:
......
@@ -539,16 +539,19 @@ static void __bch2_write_index(struct bch_write_op *op)
 		for (src = keys->keys; src != keys->top; src = n) {
 			n = bkey_next(src);

-			bkey_copy(dst, src);
-
-			bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
-					    test_bit(ptr->dev, op->failed.d));
-
-			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
-				ret = -EIO;
-				goto err;
+			if (bkey_extent_is_direct_data(&src->k)) {
+				bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+						    test_bit(ptr->dev, op->failed.d));
+
+				if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
+					ret = -EIO;
+					goto err;
+				}
 			}

+			if (dst != src)
+				memmove_u64s_down(dst, src, src->u64s);
+
 			dst = bkey_next(dst);
 		}
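Since inline-data keys have no pointers to drop, the loop above now edits keys in place and compacts the keylist with memmove_u64s_down() only when an earlier key actually shrank, rather than unconditionally copying every key. A sketch of that dst/src compaction pattern over a plain byte array (illustrative, assumed record format):

    #include <string.h>

    struct demo_rec {
            unsigned len;    /* record size in bytes, including this header */
    };

    /* shrink() may shorten a record in place (updating its len); slide
     * the survivors down so the list stays contiguous. */
    static unsigned demo_compact(char *buf, unsigned used,
                                 unsigned (*shrink)(char *rec))
    {
            char *dst = buf, *src = buf, *end = buf + used;

            while (src < end) {
                    unsigned old_len = ((struct demo_rec *)src)->len;
                    unsigned new_len = shrink(src);

                    if (dst != src)
                            memmove(dst, src, new_len);
                    dst += new_len;
                    src += old_len;
            }
            return dst - buf;
    }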
@@ -1092,7 +1095,7 @@ static void __bch2_write(struct closure *cl)
 		bio->bi_end_io	= bch2_write_endio;
 		bio->bi_private	= &op->cl;
-		bio->bi_opf	= REQ_OP_WRITE;
+		bio->bi_opf    |= REQ_OP_WRITE;

 		if (!skip_put)
 			closure_get(bio->bi_private);
@@ -1129,6 +1132,47 @@ static void __bch2_write(struct closure *cl)
 	goto again;
 }

+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+	struct closure *cl = &op->cl;
+	struct bio *bio = &op->wbio.bio;
+	struct bvec_iter iter;
+	struct bkey_i_inline_data *id;
+	unsigned sectors;
+	int ret;
+
+	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+				   ARRAY_SIZE(op->inline_keys),
+				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+	if (ret) {
+		op->error = ret;
+		goto err;
+	}
+
+	sectors = bio_sectors(bio);
+	op->pos.offset += sectors;
+
+	id = bkey_inline_data_init(op->insert_keys.top);
+	id->k.p		= op->pos;
+	id->k.version	= op->version;
+	id->k.size	= sectors;
+
+	iter = bio->bi_iter;
+	iter.bi_size = data_len;
+	memcpy_from_bio(id->v.data, bio, iter);
+
+	while (data_len & 7)
+		id->v.data[data_len++] = '\0';
+	set_bkey_val_bytes(&id->k, data_len);
+	bch2_keylist_push(&op->insert_keys);
+
+	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+	continue_at_nobarrier(cl, bch2_write_index, NULL);
+	return;
+err:
+	bch2_write_done(&op->cl);
+}
+
 /**
  * bch_write - handle a write to a cache device or flash only volume
  *
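Two details in bch2_write_data_inline() above are worth spelling out: the keylist is reserved at BKEY_U64s plus DIV_ROUND_UP(data_len, 8) value u64s before the copy, and the `while (data_len & 7)` loop then zero-pads the tail so the final byte count lands exactly on that u64 boundary. A quick standalone check that the two computations always agree:

    #include <assert.h>

    #define DEMO_DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
            for (unsigned len = 0; len <= 4096; len++) {
                    unsigned padded = len;

                    while (padded & 7)       /* the padding loop from the patch */
                            padded++;
                    /* padded bytes exactly fill the reserved value u64s */
                    assert(padded == 8 * DEMO_DIV_ROUND_UP(len, 8));
            }
            return 0;
    }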
@@ -1150,22 +1194,22 @@ void bch2_write(struct closure *cl)
 	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 	struct bio *bio = &op->wbio.bio;
 	struct bch_fs *c = op->c;
+	unsigned data_len;

 	BUG_ON(!op->nr_replicas);
 	BUG_ON(!op->write_point.v);
 	BUG_ON(!bkey_cmp(op->pos, POS_MAX));

+	op->start_time	= local_clock();
+	bch2_keylist_init(&op->insert_keys, op->inline_keys);
+	wbio_init(bio)->put_bio = false;
+
 	if (bio_sectors(bio) & (c->opts.block_size - 1)) {
 		__bcache_io_error(c, "misaligned write");
 		op->error = -EIO;
 		goto err;
 	}

-	op->start_time	= local_clock();
-	bch2_keylist_init(&op->insert_keys, op->inline_keys);
-	wbio_init(bio)->put_bio = false;
-
 	if (c->opts.nochanges ||
 	    !percpu_ref_tryget(&c->writes)) {
 		__bcache_io_error(c, "read only");
@@ -1175,6 +1219,14 @@ void bch2_write(struct closure *cl)

 	bch2_increment_clock(c, bio_sectors(bio), WRITE);

+	data_len = min_t(u64, bio->bi_iter.bi_size,
+			 op->new_i_size - (op->pos.offset << 9));
+
+	if (data_len <= min(block_bytes(c) / 2, 1024U)) {
+		bch2_write_data_inline(op, data_len);
+		return;
+	}
+
 	continue_at_nobarrier(cl, __bch2_write, NULL);
 	return;
 err:
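The inline cutoff above is min(block_bytes(c) / 2, 1024U): with 4096-byte blocks that is min(2048, 1024) = 1024 bytes, with 512-byte blocks min(256, 1024) = 256 bytes, so data is only inlined when it is well under a block. A one-line sketch of the cutoff (assumed block sizes, illustrative only):

    static unsigned demo_inline_cutoff(unsigned block_bytes)
    {
            return block_bytes / 2 < 1024u ? block_bytes / 2 : 1024u;
    }

    /* demo_inline_cutoff(512) == 256, demo_inline_cutoff(4096) == 1024 */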
@@ -1892,6 +1944,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 	struct bpos pos = bkey_start_pos(k.k);
 	int pick_ret;

+	if (k.k->type == KEY_TYPE_inline_data) {
+		struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+		unsigned bytes = min_t(unsigned, iter.bi_size,
+				       bkey_val_bytes(d.k));
+
+		swap(iter.bi_size, bytes);
+		memcpy_to_bio(&orig->bio, iter, d.v->data);
+		swap(iter.bi_size, bytes);
+		bio_advance_iter(&orig->bio, &iter, bytes);
+		zero_fill_bio_iter(&orig->bio, iter);
+		goto out_read_done;
+	}
+
 	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

 	/* hole or reservation - just zero fill: */
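On the read side the stored value can be shorter than the extent (the key covers `size` sectors, but only data_len bytes were written), so the copy above is clamped to bkey_val_bytes() and the remainder of the request is zero-filled. A standalone sketch of that clamp-and-zero pattern with plain buffers in place of bios (the demo_ name is hypothetical):

    #include <string.h>

    /* copy up to val_len bytes of inline data, zero the rest of the
     * request - mirroring memcpy_to_bio() + zero_fill_bio_iter() */
    static void demo_inline_read(char *dst, unsigned req_len,
                                 const char *val, unsigned val_len)
    {
            unsigned bytes = req_len < val_len ? req_len : val_len;

            memcpy(dst, val, bytes);
            memset(dst + bytes, 0, req_len - bytes);
    }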
......
@@ -34,10 +34,11 @@ enum bch_write_flags {
 	BCH_WRITE_PAGES_OWNED		= (1 << 5),
 	BCH_WRITE_ONLY_SPECIFIED_DEVS	= (1 << 6),
 	BCH_WRITE_NOPUT_RESERVATION	= (1 << 7),
+	BCH_WRITE_WROTE_DATA_INLINE	= (1 << 8),

 	/* Internal: */
-	BCH_WRITE_JOURNAL_SEQ_PTR	= (1 << 8),
-	BCH_WRITE_SKIP_CLOSURE_PUT	= (1 << 9),
+	BCH_WRITE_JOURNAL_SEQ_PTR	= (1 << 9),
+	BCH_WRITE_SKIP_CLOSURE_PUT	= (1 << 10),
 };

 static inline u64 *op_journal_seq(struct bch_write_op *op)
......
@@ -913,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c)
 		write_sb = true;
 	}

+	if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) {
+		c->disk_sb.sb->features[0] |=
+			cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA);
+		write_sb = true;
+	}
+
 	if (!test_bit(BCH_FS_ERROR, &c->flags)) {
 		c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
 		write_sb = true;
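Older superblocks predate the feature bit, so recovery sets BCH_FEATURE_INLINE_DATA (stored little-endian on disk, hence the cpu_to_le64()) and schedules a superblock write the first time such a filesystem mounts. The bit logic itself is the usual check-then-set pattern (sketch, in-memory only):

    #include <stdint.h>

    /* returns nonzero if the superblock needs writing back */
    static int demo_set_feature(uint64_t *features, unsigned bit)
    {
            if (*features & (1ULL << bit))
                    return 0;               /* already set */
            *features |= 1ULL << bit;
            return 1;
    }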
......