Commit 57943511 authored by Kent Overstreet

bcache: Refactor btree io

The most significant change is that btree reads are now done
synchronously, instead of asynchronously with the post-read work done
from a workqueue.

The asynchronous approach was originally used because we can't block
on IO under generic_make_request(). But we already have a mechanism to
punt cache lookups to a workqueue if needed, so if we just use that we
don't have to deal with the complexity of doing things asynchronously.

The main benefit is that this makes the locking situation saner; we can
hold our write lock on the btree node until we're finished reading it,
and we don't need that btree_node_read_done() flag anymore.
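
To make the ordering concrete, here is an illustrative sketch of the
pattern this enables; the helper names and locking calls are stand-ins,
not the actual bcache call chain:

/* Sketch only, assuming b->lock is the node's rw_semaphore: the write
 * lock is taken before the read and held until it completes, so by the
 * time anyone else can lock the node its contents are valid and no
 * read_done flag is needed. */
static struct btree *example_node_get(struct cache_set *c, struct bkey *k,
                                      int level)
{
        struct btree *b = example_alloc_node(c, k, level); /* hypothetical helper */

        down_write(&b->lock);           /* write lock taken before the read...   */
        bch_btree_node_read(b);         /* ...and held across the synchronous IO */
        downgrade_write(&b->lock);      /* hand the node out read-locked         */

        return b;
}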

Also, for writes, btree_write() was broken out into btree_node_write()
and btree_leaf_dirty() - the old code with the boolean argument was dumb
and confusing.
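
The call-site changes in the hunks below follow this pattern; it is a
sketch of the interface change only, and the btree_leaf_dirty()
signature shown is an assumption since it doesn't appear in the hunks
here:

/* Old interface: one function, behaviour selected by a boolean. */
bch_btree_write(b, true, op);           /* write the node out now          */
bch_btree_write(b, false, op);          /* just mark it dirty, write later */

/* New interface: two functions that say what they do. */
bch_btree_node_write(b, &op->cl);       /* write the node out now                */
bch_btree_leaf_dirty(b, op);            /* mark dirty, write later (assumed sig) */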

The prio_blocked mechanism was improved a bit too: now the only counter
is in struct btree_write, and we don't mess with transferring a count
from struct btree anymore.
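
Roughly, the bookkeeping ends up looking like the sketch below; the
exact field layout is an assumption based on this description, since
only the removal of btree->prio_blocked is visible in the hunks here:

struct btree_write {
        atomic_t        *journal;

        /* The single remaining prio_blocked count: taken when a write
         * makes the bucket gens unsafe to write out, and released from
         * the write's endio path (layout assumed, not from the patch). */
        int             prio_blocked;
};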

This required changing garbage collection to block prios at the start
and unblock when it finishes, which is cleaner than what it was doing
anyway (the old code had mostly the same effect, but did it in a
convoluted way).
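
The shape of the garbage-collection change is roughly the following;
the function and field names are illustrative, not copied from the
patch:

static void example_btree_gc(struct cache_set *c)
{
        atomic_inc(&c->prio_blocked);     /* block prio/gen writes for the whole pass */

        /* ... walk the btree, rewriting and coalescing nodes ... */

        atomic_dec(&c->prio_blocked);     /* single matching release at the end */
        closure_wake_up(&c->bucket_wait); /* hypothetical: wake waiters on prios */
}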

And the btree iterator that btree_node_read_done() uses was converted
to a real mempool.
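
A minimal sketch of what the mempool conversion looks like in use; the
create call matches the hunk in super.c below, while the alloc/free
call sites and GFP flag are assumptions:

/* Creation, in bch_cache_set_alloc() - iter_size is sized for the worst
 * case number of bsets in an on-disk node: */
c->fill_iter = mempool_create_kmalloc_pool(1, iter_size);

/* Use, in bch_btree_node_read_done() (sketch): */
struct btree_iter *iter = mempool_alloc(c->fill_iter, GFP_NOIO);
/* ... iterate/sort the node's bsets through iter ... */
mempool_free(iter, c->fill_iter);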
Signed-off-by: Kent Overstreet <koverstreet@google.com>
parent 119ba0f8
......@@ -819,10 +819,9 @@ struct cache_set {
/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - this is a single element mempool for btree_read_work()
* on the stack - have to dynamically allocate them
*/
struct mutex fill_lock;
struct btree_iter *fill_iter;
mempool_t *fill_iter;
/*
* btree_sort() is a merge sort and requires temporary space - single
......
This diff is collapsed.
......@@ -102,7 +102,6 @@
#include "debug.h"
struct btree_write {
struct closure *owner;
atomic_t *journal;
/* If btree_split() frees a btree node, it writes a new pointer to that
......@@ -142,16 +141,12 @@ struct btree {
*/
struct bset_tree sets[MAX_BSETS];
/* Used to refcount bio splits, also protects b->bio */
/* For outstanding btree writes, used as a lock - protects write_idx */
struct closure_with_waitlist io;
/* Gets transferred to w->prio_blocked - see the comment there */
int prio_blocked;
struct list_head list;
struct delayed_work work;
uint64_t io_start_time;
struct btree_write writes[2];
struct bio *bio;
};
......@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
enum btree_flags {
BTREE_NODE_read_done,
BTREE_NODE_io_error,
BTREE_NODE_dirty,
BTREE_NODE_write_idx,
};
BTREE_FLAG(read_done);
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
......@@ -293,9 +286,7 @@ static inline void rw_unlock(bool w, struct btree *b)
#ifdef CONFIG_BCACHE_EDEBUG
unsigned i;
if (w &&
b->key.ptr[0] &&
btree_node_read_done(b))
if (w && b->key.ptr[0])
for (i = 0; i <= b->nsets; i++)
bch_check_key_order(b, b->sets[i].data);
#endif
......@@ -370,9 +361,9 @@ static inline bool should_split(struct btree *b)
> btree_blocks(b));
}
void bch_btree_read_done(struct closure *);
void bch_btree_read(struct btree *);
void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
void bch_btree_node_read(struct btree *);
void bch_btree_node_read_done(struct btree *);
void bch_btree_node_write(struct btree *, struct closure *);
void bch_cannibalize_unlock(struct cache_set *, struct closure *);
void bch_btree_set_root(struct btree *);
......
......@@ -144,7 +144,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
v->written = 0;
v->level = b->level;
bch_btree_read(v);
bch_btree_node_read(v);
closure_wait_event(&v->io.wait, &cl,
atomic_read(&b->io.cl.remaining) == -1);
......@@ -512,7 +512,7 @@ static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
bch_btree_sort(b);
fill->written = 0;
bch_btree_read_done(&fill->io.cl);
bch_btree_node_read_done(fill);
if (b->sets[0].data->keys != fill->sets[0].data->keys ||
memcmp(b->sets[0].data->start,
......
......@@ -384,7 +384,7 @@ static void btree_flush_write(struct cache_set *c)
return;
found:
if (btree_node_dirty(best))
bch_btree_write(best, true, NULL);
bch_btree_node_write(best, NULL);
rw_unlock(true, best);
}
......
......@@ -1255,9 +1255,10 @@ static void cache_set_free(struct closure *cl)
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
kfree(c->fill_iter);
if (c->bio_split)
bioset_free(c->bio_split);
if (c->fill_iter)
mempool_destroy(c->fill_iter);
if (c->bio_meta)
mempool_destroy(c->bio_meta);
if (c->search)
......@@ -1295,7 +1296,7 @@ static void cache_set_flush(struct closure *cl)
/* Should skip this if we're unregistering because of an error */
list_for_each_entry(b, &c->btree_cache, list)
if (btree_node_dirty(b))
bch_btree_write(b, true, NULL);
bch_btree_node_write(b, NULL);
closure_return(cl);
}
......@@ -1374,7 +1375,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
BTREE_MAX_PAGES);
mutex_init(&c->bucket_lock);
mutex_init(&c->fill_lock);
mutex_init(&c->sort_lock);
spin_lock_init(&c->sort_time_lock);
closure_init_unlocked(&c->sb_write);
......@@ -1400,8 +1400,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
!(c->bio_meta = mempool_create_kmalloc_pool(2,
sizeof(struct bbio) + sizeof(struct bio_vec) *
bucket_pages(c))) ||
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
!(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
!(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
bch_journal_alloc(c) ||
......@@ -1409,8 +1409,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
bch_open_buckets_alloc(c))
goto err;
c->fill_iter->size = sb->bucket_size / sb->block_size;
c->congested_read_threshold_us = 2000;
c->congested_write_threshold_us = 20000;
c->error_limit = 8 << IO_ERROR_SHIFT;
......@@ -1551,7 +1549,7 @@ static void run_cache_set(struct cache_set *c)
goto err_unlock_gc;
bkey_copy_key(&c->root->key, &MAX_KEY);
bch_btree_write(c->root, true, &op);
bch_btree_node_write(c->root, &op.cl);
bch_btree_set_root(c->root);
rw_unlock(true, c->root);
......