Commit a34a8bfd authored by Kent Overstreet's avatar Kent Overstreet

bcache: Refactor journalling flow control

Making things less asynchronous that don't need to be - bch_journal()
only has to block when the journal or journal entry is full, which is
emphatically not a fast path. So make it a normal function that just
returns when it finishes, to make the code and control flow easier to
follow.
Signed-off-by: default avatarKent Overstreet <kmo@daterainc.com>
parent cdd972b1
......@@ -2164,9 +2164,6 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c,
}
}
if (op->journal)
atomic_dec_bug(op->journal);
op->journal = NULL;
return ret;
}
......
......@@ -642,7 +642,7 @@ do { \
#define continue_at_nobarrier(_cl, _fn, _wq) \
do { \
set_closure_fn(_cl, _fn, _wq); \
closure_queue(cl); \
closure_queue(_cl); \
return; \
} while (0)
......
......@@ -318,7 +318,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
bch_keylist_push(&op->keys);
op->journal = i->pin;
atomic_inc(op->journal);
ret = bch_btree_insert(op, s, &op->keys);
if (ret)
......@@ -357,48 +356,35 @@ static void btree_flush_write(struct cache_set *c)
* Try to find the btree node with that references the oldest journal
* entry, best is our current candidate and is locked if non NULL:
*/
struct btree *b, *best = NULL;
unsigned iter;
struct btree *b, *best;
unsigned i;
retry:
best = NULL;
for_each_cached_btree(b, c, i)
if (btree_current_write(b)->journal) {
if (!best)
best = b;
else if (journal_pin_cmp(c,
btree_current_write(best),
btree_current_write(b))) {
best = b;
}
}
for_each_cached_btree(b, c, iter) {
if (!down_write_trylock(&b->lock))
continue;
b = best;
if (b) {
rw_lock(true, b, b->level);
if (!btree_node_dirty(b) ||
!btree_current_write(b)->journal) {
if (!btree_current_write(b)->journal) {
rw_unlock(true, b);
continue;
/* We raced */
goto retry;
}
if (!best)
best = b;
else if (journal_pin_cmp(c,
btree_current_write(best),
btree_current_write(b))) {
rw_unlock(true, best);
best = b;
} else
rw_unlock(true, b);
bch_btree_node_write(b, NULL);
rw_unlock(true, b);
}
if (best)
goto out;
/* We can't find the best btree node, just pick the first */
list_for_each_entry(b, &c->btree_cache, list)
if (!b->level && btree_node_dirty(b)) {
best = b;
rw_lock(true, best, best->level);
goto found;
}
out:
if (!best)
return;
found:
if (btree_node_dirty(best))
bch_btree_node_write(best, NULL);
rw_unlock(true, best);
}
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
......@@ -494,7 +480,7 @@ static void journal_reclaim(struct cache_set *c)
do_journal_discard(ca);
if (c->journal.blocks_free)
return;
goto out;
/*
* Allocate:
......@@ -520,7 +506,7 @@ static void journal_reclaim(struct cache_set *c)
if (n)
c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
out:
if (!journal_full(&c->journal))
__closure_wake_up(&c->journal.wait);
}
......@@ -659,7 +645,7 @@ static void journal_write(struct closure *cl)
journal_write_unlocked(cl);
}
static void __journal_try_write(struct cache_set *c, bool noflush)
static void journal_try_write(struct cache_set *c)
__releases(c->journal.lock)
{
struct closure *cl = &c->journal.io;
......@@ -667,29 +653,59 @@ static void __journal_try_write(struct cache_set *c, bool noflush)
w->need_write = true;
if (!closure_trylock(cl, &c->cl))
spin_unlock(&c->journal.lock);
else if (noflush && journal_full(&c->journal)) {
spin_unlock(&c->journal.lock);
continue_at(cl, journal_write, system_wq);
} else
if (closure_trylock(cl, &c->cl))
journal_write_unlocked(cl);
else
spin_unlock(&c->journal.lock);
}
#define journal_try_write(c) __journal_try_write(c, false)
void bch_journal_meta(struct cache_set *c, struct closure *cl)
static struct journal_write *journal_wait_for_write(struct cache_set *c,
unsigned nkeys)
{
struct journal_write *w;
size_t sectors;
struct closure cl;
if (CACHE_SYNC(&c->sb)) {
spin_lock(&c->journal.lock);
w = c->journal.cur;
closure_init_stack(&cl);
spin_lock(&c->journal.lock);
while (1) {
struct journal_write *w = c->journal.cur;
sectors = __set_blocks(w->data, w->data->keys + nkeys,
c) * c->sb.block_size;
if (sectors <= min_t(size_t,
c->journal.blocks_free * c->sb.block_size,
PAGE_SECTORS << JSET_BITS))
return w;
/* XXX: tracepoint */
if (!journal_full(&c->journal)) {
trace_bcache_journal_entry_full(c);
/*
* XXX: If we were inserting so many keys that they
* won't fit in an _empty_ journal write, we'll
* deadlock. For now, handle this in
* bch_keylist_realloc() - but something to think about.
*/
BUG_ON(!w->data->keys);
closure_wait(&w->wait, &cl);
journal_try_write(c); /* unlocks */
} else {
trace_bcache_journal_full(c);
closure_wait(&c->journal.wait, &cl);
journal_reclaim(c);
spin_unlock(&c->journal.lock);
if (cl)
BUG_ON(!closure_wait(&w->wait, cl));
btree_flush_write(c);
}
__journal_try_write(c, true);
closure_sync(&cl);
spin_lock(&c->journal.lock);
}
}
......@@ -708,68 +724,26 @@ static void journal_write_work(struct work_struct *work)
* bch_journal() hands those same keys off to btree_insert_async()
*/
void bch_journal(struct closure *cl)
atomic_t *bch_journal(struct cache_set *c,
struct keylist *keys,
struct closure *parent)
{
struct btree_op *op = container_of(cl, struct btree_op, cl);
struct cache_set *c = op->c;
struct journal_write *w;
size_t sectors, nkeys;
if (op->type != BTREE_INSERT ||
!CACHE_SYNC(&c->sb))
goto out;
/*
* If we're looping because we errored, might already be waiting on
* another journal write:
*/
while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
closure_sync(cl->parent);
spin_lock(&c->journal.lock);
if (journal_full(&c->journal)) {
trace_bcache_journal_full(c);
closure_wait(&c->journal.wait, cl);
journal_reclaim(c);
spin_unlock(&c->journal.lock);
btree_flush_write(c);
continue_at(cl, bch_journal, bcache_wq);
}
atomic_t *ret;
w = c->journal.cur;
nkeys = w->data->keys + bch_keylist_nkeys(&op->keys);
sectors = __set_blocks(w->data, nkeys, c) * c->sb.block_size;
if (!CACHE_SYNC(&c->sb))
return NULL;
if (sectors > min_t(size_t,
c->journal.blocks_free * c->sb.block_size,
PAGE_SECTORS << JSET_BITS)) {
trace_bcache_journal_entry_full(c);
w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
/*
* XXX: If we were inserting so many keys that they won't fit in
* an _empty_ journal write, we'll deadlock. For now, handle
* this in bch_keylist_realloc() - but something to think about.
*/
BUG_ON(!w->data->keys);
memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys));
w->data->keys += bch_keylist_nkeys(keys);
BUG_ON(!closure_wait(&w->wait, cl));
ret = &fifo_back(&c->journal.pin);
atomic_inc(ret);
journal_try_write(c);
continue_at(cl, bch_journal, bcache_wq);
}
memcpy(end(w->data), op->keys.keys, bch_keylist_bytes(&op->keys));
w->data->keys += bch_keylist_nkeys(&op->keys);
op->journal = &fifo_back(&c->journal.pin);
atomic_inc(op->journal);
if (op->flush_journal) {
closure_wait(&w->wait, cl->parent);
if (parent) {
closure_wait(&w->wait, parent);
journal_try_write(c);
} else if (!w->need_write) {
schedule_delayed_work(&c->journal.work,
......@@ -778,8 +752,21 @@ void bch_journal(struct closure *cl)
} else {
spin_unlock(&c->journal.lock);
}
out:
bch_btree_insert_async(cl);
return ret;
}
void bch_journal_meta(struct cache_set *c, struct closure *cl)
{
struct keylist keys;
atomic_t *ref;
bch_keylist_init(&keys);
ref = bch_journal(c, &keys, cl);
if (ref)
atomic_dec_bug(ref);
}
void bch_journal_free(struct cache_set *c)
......
......@@ -200,8 +200,9 @@ struct journal_device {
struct closure;
struct cache_set;
struct btree_op;
struct keylist;
void bch_journal(struct closure *);
atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *);
void bch_journal_next(struct journal *);
void bch_journal_mark(struct cache_set *, struct list_head *);
void bch_journal_meta(struct cache_set *, struct closure *);
......
......@@ -110,7 +110,7 @@ static void write_moving(struct closure *cl)
bkey_copy(&s->op.replace, &io->w->key);
closure_init(&s->op.cl, cl);
bch_insert_data(&s->op.cl);
bch_data_insert(&s->op.cl);
}
continue_at(cl, write_moving_finish, bch_gc_wq);
......
......@@ -25,6 +25,8 @@
struct kmem_cache *bch_search_cache;
static void bch_data_insert_start(struct closure *);
/* Cgroup interface */
#ifdef CONFIG_CGROUP_BCACHE
......@@ -211,31 +213,42 @@ static void bio_csum(struct bio *bio, struct bkey *k)
/* Insert data into cache */
static void bio_invalidate(struct closure *cl)
static void bch_data_insert_keys(struct closure *cl)
{
struct btree_op *op = container_of(cl, struct btree_op, cl);
struct bio *bio = op->cache_bio;
struct search *s = container_of(op, struct search, op);
pr_debug("invalidating %i sectors from %llu",
bio_sectors(bio), (uint64_t) bio->bi_sector);
/*
* If we're looping, might already be waiting on
* another journal write - can't wait on more than one journal write at
* a time
*
* XXX: this looks wrong
*/
#if 0
while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
closure_sync(&s->cl);
#endif
while (bio_sectors(bio)) {
unsigned len = min(bio_sectors(bio), 1U << 14);
if (s->write)
op->journal = bch_journal(op->c, &op->keys,
op->flush_journal
? &s->cl : NULL);
if (bch_keylist_realloc(&op->keys, 0, op->c))
goto out;
if (bch_btree_insert(op, op->c, &op->keys)) {
s->error = -ENOMEM;
op->insert_data_done = true;
}
bio->bi_sector += len;
bio->bi_size -= len << 9;
if (op->journal)
atomic_dec_bug(op->journal);
op->journal = NULL;
bch_keylist_add(&op->keys,
&KEY(op->inode, bio->bi_sector, len));
}
if (!op->insert_data_done)
continue_at(cl, bch_data_insert_start, bcache_wq);
op->insert_data_done = true;
bio_put(bio);
out:
continue_at(cl, bch_journal, bcache_wq);
bch_keylist_free(&op->keys);
closure_return(cl);
}
struct open_bucket {
......@@ -423,7 +436,34 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
return true;
}
static void bch_insert_data_error(struct closure *cl)
static void bch_data_invalidate(struct closure *cl)
{
struct btree_op *op = container_of(cl, struct btree_op, cl);
struct bio *bio = op->cache_bio;
pr_debug("invalidating %i sectors from %llu",
bio_sectors(bio), (uint64_t) bio->bi_sector);
while (bio_sectors(bio)) {
unsigned len = min(bio_sectors(bio), 1U << 14);
if (bch_keylist_realloc(&op->keys, 0, op->c))
goto out;
bio->bi_sector += len;
bio->bi_size -= len << 9;
bch_keylist_add(&op->keys, &KEY(op->inode,
bio->bi_sector, len));
}
op->insert_data_done = true;
bio_put(bio);
out:
continue_at(cl, bch_data_insert_keys, bcache_wq);
}
static void bch_data_insert_error(struct closure *cl)
{
struct btree_op *op = container_of(cl, struct btree_op, cl);
......@@ -450,10 +490,10 @@ static void bch_insert_data_error(struct closure *cl)
op->keys.top = dst;
bch_journal(cl);
bch_data_insert_keys(cl);
}
static void bch_insert_data_endio(struct bio *bio, int error)
static void bch_data_insert_endio(struct bio *bio, int error)
{
struct closure *cl = bio->bi_private;
struct btree_op *op = container_of(cl, struct btree_op, cl);
......@@ -464,7 +504,7 @@ static void bch_insert_data_endio(struct bio *bio, int error)
if (s->writeback)
s->error = error;
else if (s->write)
set_closure_fn(cl, bch_insert_data_error, bcache_wq);
set_closure_fn(cl, bch_data_insert_error, bcache_wq);
else
set_closure_fn(cl, NULL, NULL);
}
......@@ -472,14 +512,14 @@ static void bch_insert_data_endio(struct bio *bio, int error)
bch_bbio_endio(op->c, bio, error, "writing data to cache");
}
static void bch_insert_data_loop(struct closure *cl)
static void bch_data_insert_start(struct closure *cl)
{
struct btree_op *op = container_of(cl, struct btree_op, cl);
struct search *s = container_of(op, struct search, op);
struct bio *bio = op->cache_bio, *n;
if (op->bypass)
return bio_invalidate(cl);
return bch_data_invalidate(cl);
if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
set_gc_sectors(op->c);
......@@ -502,7 +542,7 @@ static void bch_insert_data_loop(struct closure *cl)
if (bch_keylist_realloc(&op->keys,
1 + (op->csum ? 1 : 0),
op->c))
continue_at(cl, bch_journal, bcache_wq);
continue_at(cl, bch_data_insert_keys, bcache_wq);
k = op->keys.top;
bkey_init(k);
......@@ -514,7 +554,7 @@ static void bch_insert_data_loop(struct closure *cl)
n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
n->bi_end_io = bch_insert_data_endio;
n->bi_end_io = bch_data_insert_endio;
n->bi_private = cl;
if (s->writeback) {
......@@ -537,7 +577,7 @@ static void bch_insert_data_loop(struct closure *cl)
} while (n != bio);
op->insert_data_done = true;
continue_at(cl, bch_journal, bcache_wq);
continue_at(cl, bch_data_insert_keys, bcache_wq);
err:
/* bch_alloc_sectors() blocks if s->writeback = true */
BUG_ON(s->writeback);
......@@ -556,7 +596,7 @@ static void bch_insert_data_loop(struct closure *cl)
* rest of the write.
*/
op->bypass = true;
return bio_invalidate(cl);
return bch_data_invalidate(cl);
} else {
/*
* From a cache miss, we can just insert the keys for the data
......@@ -566,14 +606,14 @@ static void bch_insert_data_loop(struct closure *cl)
bio_put(bio);
if (!bch_keylist_empty(&op->keys))
continue_at(cl, bch_journal, bcache_wq);
continue_at(cl, bch_data_insert_keys, bcache_wq);
else
closure_return(cl);
}
}
/**
* bch_insert_data - stick some data in the cache
* bch_data_insert - stick some data in the cache
*
* This is the starting point for any data to end up in a cache device; it could
* be from a normal write, or a writeback write, or a write to a flash only
......@@ -591,30 +631,13 @@ static void bch_insert_data_loop(struct closure *cl)
* If op->bypass is true, instead of inserting the data it invalidates the
* region of the cache represented by op->cache_bio and op->inode.
*/
void bch_insert_data(struct closure *cl)
void bch_data_insert(struct closure *cl)
{
struct btree_op *op = container_of(cl, struct btree_op, cl);
bch_keylist_init(&op->keys);
bio_get(op->cache_bio);
bch_insert_data_loop(cl);
}
void bch_btree_insert_async(struct closure *cl)
{
struct btree_op *op = container_of(cl, struct btree_op, cl);
struct search *s = container_of(op, struct search, op);
if (bch_btree_insert(op, op->c, &op->keys)) {
s->error = -ENOMEM;
op->insert_data_done = true;
}
if (op->insert_data_done) {
bch_keylist_free(&op->keys);
closure_return(cl);
} else
continue_at(cl, bch_insert_data_loop, bcache_wq);
bch_data_insert_start(cl);
}
/* Common code for the make_request functions */
......@@ -969,7 +992,7 @@ static void cached_dev_read_done(struct closure *cl)
if (s->op.cache_bio &&
!test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
s->op.type = BTREE_REPLACE;
closure_call(&s->op.cl, bch_insert_data, NULL, cl);
closure_call(&s->op.cl, bch_data_insert, NULL, cl);
}
continue_at(cl, cached_dev_cache_miss_done, NULL);
......@@ -1147,13 +1170,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
closure_bio_submit(bio, cl, s->d);
}
closure_call(&s->op.cl, bch_insert_data, NULL, cl);
closure_call(&s->op.cl, bch_data_insert, NULL, cl);
continue_at(cl, cached_dev_write_complete, NULL);
}
static void cached_dev_nodata(struct cached_dev *dc, struct search *s)
static void cached_dev_nodata(struct closure *cl)
{
struct closure *cl = &s->cl;
struct search *s = container_of(cl, struct search, cl);
struct bio *bio = &s->bio.bio;
if (s->op.flush_journal)
......@@ -1186,9 +1209,15 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
s = search_alloc(bio, d);
trace_bcache_request_start(s, bio);
if (!bio->bi_size)
cached_dev_nodata(dc, s);
else {
if (!bio->bi_size) {
/*
* can't call bch_journal_meta from under
* generic_make_request
*/
continue_at_nobarrier(&s->cl,
cached_dev_nodata,
bcache_wq);
} else {
s->op.bypass = check_should_bypass(dc, s);
if (rw)
......@@ -1275,6 +1304,16 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
return 0;
}
static void flash_dev_nodata(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
if (s->op.flush_journal)
bch_journal_meta(s->op.c, cl);
continue_at(cl, search_free, NULL);
}
static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
{
struct search *s;
......@@ -1294,8 +1333,13 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
trace_bcache_request_start(s, bio);
if (!bio->bi_size) {
if (s->op.flush_journal)
bch_journal_meta(s->op.c, cl);
/*
* can't call bch_journal_meta from under
* generic_make_request
*/
continue_at_nobarrier(&s->cl,
flash_dev_nodata,
bcache_wq);
} else if (rw) {
bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
&KEY(d->id, bio->bi_sector, 0),
......@@ -1305,7 +1349,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
s->writeback = true;
s->op.cache_bio = bio;
closure_call(&s->op.cl, bch_insert_data, NULL, cl);
closure_call(&s->op.cl, bch_data_insert, NULL, cl);
} else {
closure_call(&s->op.cl, btree_read_async, NULL, cl);
}
......
......@@ -31,8 +31,7 @@ struct search {
void bch_cache_read_endio(struct bio *, int);
unsigned bch_get_congested(struct cache_set *);
void bch_insert_data(struct closure *cl);
void bch_btree_insert_async(struct closure *);
void bch_data_insert(struct closure *cl);
void bch_cache_read_endio(struct bio *, int);
void bch_open_buckets_free(struct cache_set *);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment