bcachefs: Inline data extents

This implements extents that have their data inline, in the value, instead of the bkey value being pointers to the data. The read and write paths are updated to read from these new extent types, and to write them out when the write size is small enough.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

bcachefs: Inline data extents
This implements extents that have their data inline, in the value, instead of the bkey value being pointers to the data. The read and write paths are updated to read from these new extent types, and to write them out when the write size is small enough.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
4be1a412 · Kent Overstreet · Kent Overstreet · 08c07fea · 4be1a412 · 4be1a412
Commit 4be1a412, authored Nov 09, 2019 by Kent Overstreet; committed by Kent Overstreet, Oct 22, 2023
9 changed files
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -342,7 +342,8 @@ static inline void bkey_init(struct bkey *k)
 	x(quota,		13)			\
 	x(stripe,		14)			\
 	x(reflink_p,		15)			\
-	x(reflink_v,		16)
+	x(reflink_v,		16)			\
+	x(inline_data,		17)

 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name	= nr,
@@ -915,6 +916,13 @@ struct bch_reflink_v {
 	__u64			_data[0];
 };

+/* Inline data */
+
+struct bch_inline_data {
+	struct bch_val		v;
+	u8			data[0];
+};
+
 /* Optional/variable size superblock sections: */

 struct bch_sb_field {
@@ -1319,6 +1327,7 @@ enum bch_sb_features {
 	BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
 	BCH_FEATURE_REFLINK		= 6,
 	BCH_FEATURE_NEW_SIPHASH		= 7,
+	BCH_FEATURE_INLINE_DATA		= 8,
 	BCH_FEATURE_NR,
 };


--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -572,6 +572,7 @@ BKEY_VAL_ACCESSORS(quota);
 BKEY_VAL_ACCESSORS(stripe);
 BKEY_VAL_ACCESSORS(reflink_p);
 BKEY_VAL_ACCESSORS(reflink_v);
+BKEY_VAL_ACCESSORS(inline_data);

 /* byte order helpers */


--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c,
 	.key_invalid = empty_val_key_invalid,		\
 }

+static const char *key_type_inline_data_invalid(const struct bch_fs *c,
+					   struct bkey_s_c k)
+{
+	return NULL;
+}
+
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+					 struct bkey_s_c k)
+{
+	pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k));
+}
+
+static const struct bkey_ops bch2_bkey_ops_inline_data = {
+	.key_invalid	= key_type_inline_data_invalid,
+	.val_to_text	= key_type_inline_data_to_text,
+};
+
 static const struct bkey_ops bch2_bkey_ops[] = {
 #define x(name, nr) [KEY_TYPE_##name]	= bch2_bkey_ops_##name,
 	BCH_BKEY_TYPES()
@@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 	if (k.k->u64s < BKEY_U64s)
 		return "u64s too small";

-	if ((btree_node_type_is_extents(type) ||
-	     type == BKEY_TYPE_BTREE) &&
-	    bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+	if (type == BKEY_TYPE_BTREE &&
+	    bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
 		return "value too big";

 	if (btree_node_type_is_extents(type)) {

--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -737,11 +737,6 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
 	}

 	switch (k.k->type) {
-	case KEY_TYPE_deleted:
-	case KEY_TYPE_discard:
-	case KEY_TYPE_error:
-	case KEY_TYPE_cookie:
-		break;
 	case KEY_TYPE_extent:
 	case KEY_TYPE_reflink_v: {
 		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
@@ -779,10 +774,18 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
 		le64_add_cpu(&p.v->idx, sub);
 		break;
 	}
-	case KEY_TYPE_reservation:
+	case KEY_TYPE_inline_data: {
+		struct bkey_s_inline_data d = bkey_s_to_inline_data(k);
+
+		sub = min_t(u64, sub << 9, bkey_val_bytes(d.k));
+
+		memmove(d.v->data,
+			d.v->data + sub,
+			bkey_val_bytes(d.k) - sub);
+
+		new_val_u64s -= sub >> 3;
 		break;
-	default:
-		BUG();
+	}
 	}

 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
@@ -814,6 +817,12 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
 		new_val_u64s = 0;
 	}

+	switch (k.k->type) {
+	case KEY_TYPE_inline_data:
+		new_val_u64s = min(new_val_u64s, k.k->size << 6);
+		break;
+	}
+
 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
 	BUG_ON(val_u64s_delta < 0);


--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -456,6 +456,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k)
 static inline bool bkey_extent_is_data(const struct bkey *k)
 {
 	return bkey_extent_is_direct_data(k) ||
+		k->type == KEY_TYPE_inline_data ||
 		k->type == KEY_TYPE_reflink_p;
 }

@@ -469,6 +470,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
 	case KEY_TYPE_reservation:
 	case KEY_TYPE_reflink_p:
 	case KEY_TYPE_reflink_v:
+	case KEY_TYPE_inline_data:
 		return true;
 	default:
 		return false;

--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -990,6 +990,18 @@ static void bch2_writepage_io_done(struct closure *cl)
 		}
 	}

+	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+		bio_for_each_segment_all(bvec, bio, iter) {
+			struct bch_page_state *s;
+
+			s = __bch2_page_state(bvec->bv_page);
+			spin_lock(&s->lock);
+			for (i = 0; i < PAGE_SECTORS; i++)
+				s->s[i].nr_replicas = 0;
+			spin_unlock(&s->lock);
+		}
+	}
+
 	/*
 	 * racing with fallocate can cause us to add fewer sectors than
 	 * expected - but we shouldn't add more sectors than expected:

--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -539,16 +539,19 @@ static void __bch2_write_index(struct bch_write_op *op)

 	for (src = keys->keys; src != keys->top; src = n) {
 		n = bkey_next(src);
-		bkey_copy(dst, src);

-		bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
+		if (bkey_extent_is_direct_data(&src->k)) {
+			bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
 					    test_bit(ptr->dev, op->failed.d));

-		if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
+			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
 				ret = -EIO;
 				goto err;
 			}
+		}

+		if (dst != src)
+			memmove_u64s_down(dst, src, src->u64s);
 		dst = bkey_next(dst);
 	}

@@ -1092,7 +1095,7 @@ static void __bch2_write(struct closure *cl)

 		bio->bi_end_io	= bch2_write_endio;
 		bio->bi_private	= &op->cl;
-		bio->bi_opf	= REQ_OP_WRITE;
+		bio->bi_opf |= REQ_OP_WRITE;

 		if (!skip_put)
 			closure_get(bio->bi_private);
@@ -1129,6 +1132,47 @@ static void __bch2_write(struct closure *cl)
 	goto again;
 }

+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+	struct closure *cl = &op->cl;
+	struct bio *bio = &op->wbio.bio;
+	struct bvec_iter iter;
+	struct bkey_i_inline_data *id;
+	unsigned sectors;
+	int ret;
+
+	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+				   ARRAY_SIZE(op->inline_keys),
+				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+	if (ret) {
+		op->error = ret;
+		goto err;
+	}
+
+	sectors = bio_sectors(bio);
+	op->pos.offset += sectors;
+
+	id = bkey_inline_data_init(op->insert_keys.top);
+	id->k.p		= op->pos;
+	id->k.version	= op->version;
+	id->k.size	= sectors;
+
+	iter = bio->bi_iter;
+	iter.bi_size = data_len;
+	memcpy_from_bio(id->v.data, bio, iter);
+
+	while (data_len & 7)
+		id->v.data[data_len++] = '\0';
+	set_bkey_val_bytes(&id->k, data_len);
+	bch2_keylist_push(&op->insert_keys);
+
+	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+	continue_at_nobarrier(cl, bch2_write_index, NULL);
+	return;
+err:
+	bch2_write_done(&op->cl);
+}
+
 /**
 * bch_write - handle a write to a cache device or flash only volume
 *
@@ -1150,22 +1194,22 @@ void bch2_write(struct closure *cl)
 	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 	struct bio *bio = &op->wbio.bio;
 	struct bch_fs *c = op->c;
+	unsigned data_len;

 	BUG_ON(!op->nr_replicas);
 	BUG_ON(!op->write_point.v);
 	BUG_ON(!bkey_cmp(op->pos, POS_MAX));

+	op->start_time = local_clock();
+	bch2_keylist_init(&op->insert_keys, op->inline_keys);
+	wbio_init(bio)->put_bio = false;
+
 	if (bio_sectors(bio) & (c->opts.block_size - 1)) {
 		__bcache_io_error(c, "misaligned write");
 		op->error = -EIO;
 		goto err;
 	}

-	op->start_time = local_clock();
-
-	bch2_keylist_init(&op->insert_keys, op->inline_keys);
-	wbio_init(bio)->put_bio = false;
-
 	if (c->opts.nochanges ||
 	    !percpu_ref_tryget(&c->writes)) {
 		__bcache_io_error(c, "read only");
@@ -1175,6 +1219,14 @@ void bch2_write(struct closure *cl)

 	bch2_increment_clock(c, bio_sectors(bio), WRITE);

+	data_len = min_t(u64, bio->bi_iter.bi_size,
+			 op->new_i_size - (op->pos.offset << 9));
+
+	if (data_len <= min(block_bytes(c) / 2, 1024U)) {
+		bch2_write_data_inline(op, data_len);
+		return;
+	}
+
 	continue_at_nobarrier(cl, __bch2_write, NULL);
 	return;
 err:
@@ -1892,6 +1944,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 	struct bpos pos = bkey_start_pos(k.k);
 	int pick_ret;

+	if (k.k->type == KEY_TYPE_inline_data) {
+		struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+		unsigned bytes = min_t(unsigned, iter.bi_size,
+				       bkey_val_bytes(d.k));
+
+		swap(iter.bi_size, bytes);
+		memcpy_to_bio(&orig->bio, iter, d.v->data);
+		swap(iter.bi_size, bytes);
+		bio_advance_iter(&orig->bio, &iter, bytes);
+		zero_fill_bio_iter(&orig->bio, iter);
+		goto out_read_done;
+	}
+
 	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

 	/* hole or reservation - just zero fill: */

--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -34,10 +34,11 @@ enum bch_write_flags {
 	BCH_WRITE_PAGES_OWNED		= (1 << 5),
 	BCH_WRITE_ONLY_SPECIFIED_DEVS	= (1 << 6),
 	BCH_WRITE_NOPUT_RESERVATION	= (1 << 7),
+	BCH_WRITE_WROTE_DATA_INLINE	= (1 << 8),

 	/* Internal: */
-	BCH_WRITE_JOURNAL_SEQ_PTR	= (1 << 8),
-	BCH_WRITE_SKIP_CLOSURE_PUT	= (1 << 9),
+	BCH_WRITE_JOURNAL_SEQ_PTR	= (1 << 9),
+	BCH_WRITE_SKIP_CLOSURE_PUT	= (1 << 10),
 };

 static inline u64 *op_journal_seq(struct bch_write_op *op)

--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -913,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c)
 		write_sb = true;
 	}

+	if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) {
+		c->disk_sb.sb->features[0] |=
+			cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA);
+		write_sb = true;
+	}
+
 	if (!test_bit(BCH_FS_ERROR, &c->flags)) {
 		c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
 		write_sb = true;