Commit 7ffb6a7e authored by Kent Overstreet's avatar Kent Overstreet

bcachefs: Fix deadlock on nocow locks in data move path

The recent nocow locking rework introduced a deadlock in the data move
path: the new nocow locking scheme uses a hash table with a fixed size
array for chaining, meaning on hash collision we may have to wait for
other locks to be released before we can lock a bucket.

And since the data move path needs to submit writes from the same thread
that's taking nocow locks and submitting reads, this introduces a
deadlock.

This shouldn't happen often in practice, but since the data move path
can keep large numbers of IOs in flight simultaneously, it's something
we have to handle.

This patch makes move_ctxt_wait_event() available to
bch2_data_update_init() and uses it when appropriate, which is our
normal solution to this kind of thing.
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent dbe17f18
......@@ -397,13 +397,16 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
}
}
int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
int bch2_data_update_init(struct btree_trans *trans,
struct moving_context *ctxt,
struct data_update *m,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
struct data_update_opts data_opts,
enum btree_id btree_id,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
......@@ -460,8 +463,21 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
i++;
if (ctxt) {
bool locked;
move_ctxt_wait_event(ctxt, trans,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
!atomic_read(&ctxt->read_sectors));
if (!locked)
bch2_bucket_nocow_lock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0);
} else {
bch2_bucket_nocow_lock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0);
}
}
if (reserve_sectors) {
......
......@@ -33,7 +33,8 @@ void bch2_data_update_read_done(struct data_update *,
void bch2_data_update_exit(struct data_update *);
void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *);
int bch2_data_update_init(struct bch_fs *, struct data_update *,
int bch2_data_update_init(struct btree_trans *, struct moving_context *,
struct data_update *,
struct write_point_specifier,
struct bch_io_opts, struct data_update_opts,
enum btree_id, struct bkey_s_c);
......
......@@ -1961,7 +1961,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
bch2_data_update_read_done(&op->write, rbio->pick.crc);
}
static struct promote_op *__promote_alloc(struct bch_fs *c,
static struct promote_op *__promote_alloc(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_s_c k,
struct bpos pos,
......@@ -1970,6 +1970,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
unsigned sectors,
struct bch_read_bio **rbio)
{
struct bch_fs *c = trans->c;
struct promote_op *op = NULL;
struct bio *bio;
unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
......@@ -2013,7 +2014,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
ret = bch2_data_update_init(c, &op->write,
ret = bch2_data_update_init(trans, NULL, &op->write,
writepoint_hashed((unsigned long) current),
opts,
(struct data_update_opts) {
......@@ -2037,7 +2038,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
}
noinline
static struct promote_op *promote_alloc(struct bch_fs *c,
static struct promote_op *promote_alloc(struct btree_trans *trans,
struct bvec_iter iter,
struct bkey_s_c k,
struct extent_ptr_decoded *pick,
......@@ -2047,6 +2048,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
bool *bounce,
bool *read_full)
{
struct bch_fs *c = trans->c;
bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
......@@ -2060,7 +2062,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
if (!should_promote(c, k, pos, opts, flags))
return NULL;
promote = __promote_alloc(c,
promote = __promote_alloc(trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
......@@ -2667,7 +2669,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
}
if (orig->opts.promote_target)
promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
&rbio, &bounce, &read_full);
if (!read_full) {
......
......@@ -91,7 +91,7 @@ static void move_write(struct moving_io *io)
bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
struct moving_io *io =
list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
......@@ -111,29 +111,20 @@ static void move_read_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans)
void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
struct btree_trans *trans)
{
struct moving_io *io;
if (trans)
bch2_trans_unlock(trans);
while ((io = next_pending_write(ctxt))) {
while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
list_del(&io->list);
move_write(io);
}
}
#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
do { \
do_pending_writes(_ctxt, _trans); \
\
if (_cond) \
break; \
__wait_event((_ctxt)->wait, \
next_pending_write(_ctxt) || (_cond)); \
} while (1)
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
struct btree_trans *trans)
{
......@@ -299,8 +290,8 @@ static int bch2_move_extent(struct btree_trans *trans,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts,
data_opts, btree_id, k);
ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
io_opts, data_opts, btree_id, k);
if (ret && ret != -BCH_ERR_unwritten_extent_update)
goto err_free_pages;
......
......@@ -28,6 +28,16 @@ struct moving_context {
wait_queue_head_t wait;
};
#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
do { \
bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \
\
if (_cond) \
break; \
__wait_event((_ctxt)->wait, \
bch2_moving_ctxt_next_pending_write(_ctxt) || (_cond));\
} while (1)
typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
......@@ -35,6 +45,9 @@ void bch2_moving_ctxt_exit(struct moving_context *);
void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
struct btree_trans *);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment