Commit 2a285686 authored by Kent Overstreet's avatar Kent Overstreet

bcache: btree locking rework

Add a new lock, b->write_lock, which is required to actually modify - or write -
a btree node; this lock is only held for short durations.

This means we can write out a btree node without taking b->lock, which _is_ held
for long durations - solving a deadlock when btree_flush_write() (from the
journalling code) is called with a btree node locked.

Right now just occurs in bch_btree_set_root(), but with an upcoming journalling
rework is going to happen a lot more.

This also turns b->lock is now more of a read/intent lock instead of a
read/write lock - but not completely, since it still blocks readers. May turn it
into a real intent lock at some point in the future.
Signed-off-by: default avatarKent Overstreet <kmo@daterainc.com>
parent 05335cff
...@@ -167,6 +167,20 @@ static inline struct bset *write_block(struct btree *b) ...@@ -167,6 +167,20 @@ static inline struct bset *write_block(struct btree *b)
return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
} }
static void bch_btree_init_next(struct btree *b)
{
/* If not a leaf node, always sort */
if (b->level && b->keys.nsets)
bch_btree_sort(&b->keys, &b->c->sort);
else
bch_btree_sort_lazy(&b->keys, &b->c->sort);
if (b->written < btree_blocks(b))
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->sb));
}
/* Btree key manipulation */ /* Btree key manipulation */
void bkey_put(struct cache_set *c, struct bkey *k) void bkey_put(struct cache_set *c, struct bkey *k)
...@@ -438,10 +452,12 @@ static void do_btree_node_write(struct btree *b) ...@@ -438,10 +452,12 @@ static void do_btree_node_write(struct btree *b)
} }
} }
void bch_btree_node_write(struct btree *b, struct closure *parent) void __bch_btree_node_write(struct btree *b, struct closure *parent)
{ {
struct bset *i = btree_bset_last(b); struct bset *i = btree_bset_last(b);
lockdep_assert_held(&b->write_lock);
trace_bcache_btree_write(b); trace_bcache_btree_write(b);
BUG_ON(current->bio_list); BUG_ON(current->bio_list);
...@@ -465,23 +481,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) ...@@ -465,23 +481,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
b->written += set_blocks(i, block_bytes(b->c)); b->written += set_blocks(i, block_bytes(b->c));
}
/* If not a leaf node, always sort */ void bch_btree_node_write(struct btree *b, struct closure *parent)
if (b->level && b->keys.nsets) {
bch_btree_sort(&b->keys, &b->c->sort); unsigned nsets = b->keys.nsets;
else
bch_btree_sort_lazy(&b->keys, &b->c->sort); lockdep_assert_held(&b->lock);
__bch_btree_node_write(b, parent);
/* /*
* do verify if there was more than one set initially (i.e. we did a * do verify if there was more than one set initially (i.e. we did a
* sort) and we sorted down to a single set: * sort) and we sorted down to a single set:
*/ */
if (i != b->keys.set->data && !b->keys.nsets) if (nsets && !b->keys.nsets)
bch_btree_verify(b); bch_btree_verify(b);
if (b->written < btree_blocks(b)) bch_btree_init_next(b);
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->sb));
} }
static void bch_btree_node_write_sync(struct btree *b) static void bch_btree_node_write_sync(struct btree *b)
...@@ -489,7 +506,11 @@ static void bch_btree_node_write_sync(struct btree *b) ...@@ -489,7 +506,11 @@ static void bch_btree_node_write_sync(struct btree *b)
struct closure cl; struct closure cl;
closure_init_stack(&cl); closure_init_stack(&cl);
mutex_lock(&b->write_lock);
bch_btree_node_write(b, &cl); bch_btree_node_write(b, &cl);
mutex_unlock(&b->write_lock);
closure_sync(&cl); closure_sync(&cl);
} }
...@@ -497,11 +518,10 @@ static void btree_node_write_work(struct work_struct *w) ...@@ -497,11 +518,10 @@ static void btree_node_write_work(struct work_struct *w)
{ {
struct btree *b = container_of(to_delayed_work(w), struct btree, work); struct btree *b = container_of(to_delayed_work(w), struct btree, work);
rw_lock(true, b, b->level); mutex_lock(&b->write_lock);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
bch_btree_node_write(b, NULL); __bch_btree_node_write(b, NULL);
rw_unlock(true, b); mutex_unlock(&b->write_lock);
} }
static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
...@@ -509,6 +529,8 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) ...@@ -509,6 +529,8 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
struct bset *i = btree_bset_last(b); struct bset *i = btree_bset_last(b);
struct btree_write *w = btree_current_write(b); struct btree_write *w = btree_current_write(b);
lockdep_assert_held(&b->write_lock);
BUG_ON(!b->written); BUG_ON(!b->written);
BUG_ON(!i->keys); BUG_ON(!i->keys);
...@@ -593,6 +615,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, ...@@ -593,6 +615,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
init_rwsem(&b->lock); init_rwsem(&b->lock);
lockdep_set_novalidate_class(&b->lock); lockdep_set_novalidate_class(&b->lock);
mutex_init(&b->write_lock);
lockdep_set_novalidate_class(&b->write_lock);
INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->list);
INIT_DELAYED_WORK(&b->work, btree_node_write_work); INIT_DELAYED_WORK(&b->work, btree_node_write_work);
b->c = c; b->c = c;
...@@ -626,8 +650,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) ...@@ -626,8 +650,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
up(&b->io_mutex); up(&b->io_mutex);
} }
mutex_lock(&b->write_lock);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
bch_btree_node_write_sync(b); __bch_btree_node_write(b, &cl);
mutex_unlock(&b->write_lock);
closure_sync(&cl);
/* wait for any in flight btree write */ /* wait for any in flight btree write */
down(&b->io_mutex); down(&b->io_mutex);
...@@ -1010,10 +1038,14 @@ static void btree_node_free(struct btree *b) ...@@ -1010,10 +1038,14 @@ static void btree_node_free(struct btree *b)
BUG_ON(b == b->c->root); BUG_ON(b == b->c->root);
mutex_lock(&b->write_lock);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
btree_complete_write(b, btree_current_write(b)); btree_complete_write(b, btree_current_write(b));
clear_bit(BTREE_NODE_dirty, &b->flags); clear_bit(BTREE_NODE_dirty, &b->flags);
mutex_unlock(&b->write_lock);
cancel_delayed_work(&b->work); cancel_delayed_work(&b->work);
mutex_lock(&b->c->bucket_lock); mutex_lock(&b->c->bucket_lock);
...@@ -1065,8 +1097,10 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) ...@@ -1065,8 +1097,10 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
{ {
struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
if (!IS_ERR_OR_NULL(n)) { if (!IS_ERR_OR_NULL(n)) {
mutex_lock(&n->write_lock);
bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
bkey_copy_key(&n->key, &b->key); bkey_copy_key(&n->key, &b->key);
mutex_unlock(&n->write_lock);
} }
return n; return n;
...@@ -1269,6 +1303,9 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, ...@@ -1269,6 +1303,9 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
goto out_nocoalesce; goto out_nocoalesce;
} }
for (i = 0; i < nodes; i++)
mutex_lock(&new_nodes[i]->write_lock);
for (i = nodes - 1; i > 0; --i) { for (i = nodes - 1; i > 0; --i) {
struct bset *n1 = btree_bset_first(new_nodes[i]); struct bset *n1 = btree_bset_first(new_nodes[i]);
struct bset *n2 = btree_bset_first(new_nodes[i - 1]); struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
...@@ -1335,6 +1372,9 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, ...@@ -1335,6 +1372,9 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
bch_keylist_add(keylist, &new_nodes[i]->key); bch_keylist_add(keylist, &new_nodes[i]->key);
} }
for (i = 0; i < nodes; i++)
mutex_unlock(&new_nodes[i]->write_lock);
closure_sync(&cl); closure_sync(&cl);
/* We emptied out this node */ /* We emptied out this node */
...@@ -1399,7 +1439,6 @@ static unsigned btree_gc_count_keys(struct btree *b) ...@@ -1399,7 +1439,6 @@ static unsigned btree_gc_count_keys(struct btree *b)
static int btree_gc_recurse(struct btree *b, struct btree_op *op, static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc) struct closure *writes, struct gc_stat *gc)
{ {
unsigned i;
int ret = 0; int ret = 0;
bool should_rewrite; bool should_rewrite;
struct btree *n; struct btree *n;
...@@ -1407,13 +1446,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, ...@@ -1407,13 +1446,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct keylist keys; struct keylist keys;
struct btree_iter iter; struct btree_iter iter;
struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info r[GC_MERGE_NODES];
struct gc_merge_info *last = r + GC_MERGE_NODES - 1; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
bch_keylist_init(&keys); bch_keylist_init(&keys);
bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
for (i = 0; i < GC_MERGE_NODES; i++) for (i = r; i < r + ARRAY_SIZE(r); i++)
r[i].b = ERR_PTR(-EINTR); i->b = ERR_PTR(-EINTR);
while (1) { while (1) {
k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
...@@ -1443,6 +1482,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, ...@@ -1443,6 +1482,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
if (!IS_ERR_OR_NULL(n)) { if (!IS_ERR_OR_NULL(n)) {
bch_btree_node_write_sync(n); bch_btree_node_write_sync(n);
bch_keylist_add(&keys, &n->key); bch_keylist_add(&keys, &n->key);
make_btree_freeing_key(last->b, make_btree_freeing_key(last->b,
...@@ -1475,8 +1515,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, ...@@ -1475,8 +1515,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
* Must flush leaf nodes before gc ends, since replace * Must flush leaf nodes before gc ends, since replace
* operations aren't journalled * operations aren't journalled
*/ */
mutex_lock(&last->b->write_lock);
if (btree_node_dirty(last->b)) if (btree_node_dirty(last->b))
bch_btree_node_write(last->b, writes); bch_btree_node_write(last->b, writes);
mutex_unlock(&last->b->write_lock);
rw_unlock(true, last->b); rw_unlock(true, last->b);
} }
...@@ -1489,11 +1531,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, ...@@ -1489,11 +1531,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
} }
} }
for (i = 0; i < GC_MERGE_NODES; i++) for (i = r; i < r + ARRAY_SIZE(r); i++)
if (!IS_ERR_OR_NULL(r[i].b)) { if (!IS_ERR_OR_NULL(i->b)) {
if (btree_node_dirty(r[i].b)) mutex_lock(&i->b->write_lock);
bch_btree_node_write(r[i].b, writes); if (btree_node_dirty(i->b))
rw_unlock(true, r[i].b); bch_btree_node_write(i->b, writes);
mutex_unlock(&i->b->write_lock);
rw_unlock(true, i->b);
} }
bch_keylist_free(&keys); bch_keylist_free(&keys);
...@@ -1514,6 +1558,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ...@@ -1514,6 +1558,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
if (!IS_ERR_OR_NULL(n)) { if (!IS_ERR_OR_NULL(n)) {
bch_btree_node_write_sync(n); bch_btree_node_write_sync(n);
bch_btree_set_root(n); bch_btree_set_root(n);
btree_node_free(b); btree_node_free(b);
rw_unlock(true, n); rw_unlock(true, n);
...@@ -1871,6 +1916,9 @@ static int btree_split(struct btree *b, struct btree_op *op, ...@@ -1871,6 +1916,9 @@ static int btree_split(struct btree *b, struct btree_op *op,
goto err_free2; goto err_free2;
} }
mutex_lock(&n1->write_lock);
mutex_lock(&n2->write_lock);
bch_btree_insert_keys(n1, op, insert_keys, replace_key); bch_btree_insert_keys(n1, op, insert_keys, replace_key);
/* /*
...@@ -1897,21 +1945,26 @@ static int btree_split(struct btree *b, struct btree_op *op, ...@@ -1897,21 +1945,26 @@ static int btree_split(struct btree *b, struct btree_op *op,
bch_keylist_add(&parent_keys, &n2->key); bch_keylist_add(&parent_keys, &n2->key);
bch_btree_node_write(n2, &cl); bch_btree_node_write(n2, &cl);
mutex_unlock(&n2->write_lock);
rw_unlock(true, n2); rw_unlock(true, n2);
} else { } else {
trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
mutex_lock(&n1->write_lock);
bch_btree_insert_keys(n1, op, insert_keys, replace_key); bch_btree_insert_keys(n1, op, insert_keys, replace_key);
} }
bch_keylist_add(&parent_keys, &n1->key); bch_keylist_add(&parent_keys, &n1->key);
bch_btree_node_write(n1, &cl); bch_btree_node_write(n1, &cl);
mutex_unlock(&n1->write_lock);
if (n3) { if (n3) {
/* Depth increases, make a new root */ /* Depth increases, make a new root */
mutex_lock(&n3->write_lock);
bkey_copy_key(&n3->key, &MAX_KEY); bkey_copy_key(&n3->key, &MAX_KEY);
bch_btree_insert_keys(n3, op, &parent_keys, NULL); bch_btree_insert_keys(n3, op, &parent_keys, NULL);
bch_btree_node_write(n3, &cl); bch_btree_node_write(n3, &cl);
mutex_unlock(&n3->write_lock);
closure_sync(&cl); closure_sync(&cl);
bch_btree_set_root(n3); bch_btree_set_root(n3);
...@@ -1960,33 +2013,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, ...@@ -1960,33 +2013,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
atomic_t *journal_ref, atomic_t *journal_ref,
struct bkey *replace_key) struct bkey *replace_key)
{ {
struct closure cl;
BUG_ON(b->level && replace_key); BUG_ON(b->level && replace_key);
closure_init_stack(&cl);
mutex_lock(&b->write_lock);
if (write_block(b) != btree_bset_last(b) &&
b->keys.last_set_unwritten)
bch_btree_init_next(b); /* just wrote a set */
if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
if (current->bio_list) { mutex_unlock(&b->write_lock);
op->lock = b->c->root->level + 1; goto split;
return -EAGAIN; }
} else if (op->lock <= b->c->root->level) {
op->lock = b->c->root->level + 1;
return -EINTR;
} else {
/* Invalidated all iterators */
int ret = btree_split(b, op, insert_keys, replace_key);
return bch_keylist_empty(insert_keys) ? BUG_ON(write_block(b) != btree_bset_last(b));
0 : ret ?: -EINTR;
}
} else {
BUG_ON(write_block(b) != btree_bset_last(b));
if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
if (!b->level) if (!b->level)
bch_btree_leaf_dirty(b, journal_ref); bch_btree_leaf_dirty(b, journal_ref);
else else
bch_btree_node_write_sync(b); bch_btree_node_write(b, &cl);
} }
return 0; mutex_unlock(&b->write_lock);
/* wait for btree node write if necessary, after unlock */
closure_sync(&cl);
return 0;
split:
if (current->bio_list) {
op->lock = b->c->root->level + 1;
return -EAGAIN;
} else if (op->lock <= b->c->root->level) {
op->lock = b->c->root->level + 1;
return -EINTR;
} else {
/* Invalidated all iterators */
int ret = btree_split(b, op, insert_keys, replace_key);
if (bch_keylist_empty(insert_keys))
return 0;
else if (!ret)
return -EINTR;
return ret;
} }
} }
......
...@@ -127,6 +127,8 @@ struct btree { ...@@ -127,6 +127,8 @@ struct btree {
struct cache_set *c; struct cache_set *c;
struct btree *parent; struct btree *parent;
struct mutex write_lock;
unsigned long flags; unsigned long flags;
uint16_t written; /* would be nice to kill */ uint16_t written; /* would be nice to kill */
uint8_t level; uint8_t level;
...@@ -236,6 +238,7 @@ static inline void rw_unlock(bool w, struct btree *b) ...@@ -236,6 +238,7 @@ static inline void rw_unlock(bool w, struct btree *b)
} }
void bch_btree_node_read_done(struct btree *); void bch_btree_node_read_done(struct btree *);
void __bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_set_root(struct btree *); void bch_btree_set_root(struct btree *);
......
...@@ -381,16 +381,15 @@ static void btree_flush_write(struct cache_set *c) ...@@ -381,16 +381,15 @@ static void btree_flush_write(struct cache_set *c)
b = best; b = best;
if (b) { if (b) {
rw_lock(true, b, b->level); mutex_lock(&b->write_lock);
if (!btree_current_write(b)->journal) { if (!btree_current_write(b)->journal) {
rw_unlock(true, b); mutex_unlock(&b->write_lock);
/* We raced */ /* We raced */
goto retry; goto retry;
} }
bch_btree_node_write(b, NULL); __bch_btree_node_write(b, NULL);
rw_unlock(true, b); mutex_unlock(&b->write_lock);
} }
} }
......
...@@ -1398,9 +1398,12 @@ static void cache_set_flush(struct closure *cl) ...@@ -1398,9 +1398,12 @@ static void cache_set_flush(struct closure *cl)
list_add(&c->root->list, &c->btree_cache); list_add(&c->root->list, &c->btree_cache);
/* Should skip this if we're unregistering because of an error */ /* Should skip this if we're unregistering because of an error */
list_for_each_entry(b, &c->btree_cache, list) list_for_each_entry(b, &c->btree_cache, list) {
mutex_lock(&b->write_lock);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
bch_btree_node_write(b, NULL); __bch_btree_node_write(b, NULL);
mutex_unlock(&b->write_lock);
}
for_each_cache(ca, c, i) for_each_cache(ca, c, i)
if (ca->alloc_thread) if (ca->alloc_thread)
...@@ -1667,8 +1670,10 @@ static void run_cache_set(struct cache_set *c) ...@@ -1667,8 +1670,10 @@ static void run_cache_set(struct cache_set *c)
if (IS_ERR_OR_NULL(c->root)) if (IS_ERR_OR_NULL(c->root))
goto err; goto err;
mutex_lock(&c->root->write_lock);
bkey_copy_key(&c->root->key, &MAX_KEY); bkey_copy_key(&c->root->key, &MAX_KEY);
bch_btree_node_write(c->root, &cl); bch_btree_node_write(c->root, &cl);
mutex_unlock(&c->root->write_lock);
bch_btree_set_root(c->root); bch_btree_set_root(c->root);
rw_unlock(true, c->root); rw_unlock(true, c->root);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment