Commit 64bc0011 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Rework btree iterator lifetimes

The btree_trans struct needs to memoize/cache btree iterators, so that
on transaction restart we don't have to completely redo btree lookups,
and so that we can redo them all at once, in the correct order, when a
transaction restarts to avoid a deadlock.

This switches btree iterator lookups to match on iterator position,
instead of trying to match them up by call site (stack trace).
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent a7199432
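
For orientation, a minimal usage sketch of the reworked interface follows. It only uses names that appear in this patch (bch2_trans_get_iter(), bch2_btree_iter_peek_slot(), bkey_err(), bch2_trans_iter_put()); the wrapper function itself is hypothetical and its error handling is simplified, so treat it as an illustration rather than code from this commit.

/*
 * Hypothetical example (not part of this patch): peek one extent slot.
 * bch2_trans_get_iter() may now hand back a cached iterator that an
 * earlier lookup in this transaction left at (or near) the requested
 * position, matched purely by btree id + position rather than call site.
 */
static int example_peek_extent(struct btree_trans *trans, u64 inum, u64 offset)
{
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS,
				   POS(inum, offset), BTREE_ITER_SLOTS);
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);

	/*
	 * Put, not free: the iterator stays linked in the transaction so a
	 * later lookup near this position can reuse it.
	 */
	bch2_trans_iter_put(trans, iter);
	return ret;
}
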
@@ -1730,15 +1730,6 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans,
/* new transactional stuff: */
int bch2_trans_iter_put(struct btree_trans *trans,
struct btree_iter *iter)
{
int ret = btree_iter_err(iter);
trans->iters_live &= ~(1ULL << iter->idx);
return ret;
}
static inline void __bch2_trans_iter_free(struct btree_trans *trans,
unsigned idx)
{
@@ -1746,26 +1737,27 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans,
trans->iters_linked &= ~(1ULL << idx);
trans->iters_live &= ~(1ULL << idx);
trans->iters_touched &= ~(1ULL << idx);
trans->iters_unlink_on_restart &= ~(1ULL << idx);
trans->iters_unlink_on_commit &= ~(1ULL << idx);
}
int bch2_trans_iter_free(struct btree_trans *trans,
struct btree_iter *iter)
int bch2_trans_iter_put(struct btree_trans *trans,
struct btree_iter *iter)
{
int ret = btree_iter_err(iter);
__bch2_trans_iter_free(trans, iter->idx);
if (!(trans->iters_touched & (1ULL << iter->idx)) &&
!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
__bch2_trans_iter_free(trans, iter->idx);
trans->iters_live &= ~(1ULL << iter->idx);
return ret;
}
int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
struct btree_iter *iter)
int bch2_trans_iter_free(struct btree_trans *trans,
struct btree_iter *iter)
{
int ret = btree_iter_err(iter);
trans->iters_touched &= ~(1ULL << iter->idx);
trans->iters_unlink_on_commit |= 1ULL << iter->idx;
return ret;
return bch2_trans_iter_put(trans, iter);
}
static int bch2_trans_realloc_iters(struct btree_trans *trans,
@@ -1839,7 +1831,25 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
goto got_slot;
if (trans->nr_iters == trans->size) {
int ret = bch2_trans_realloc_iters(trans, trans->size * 2);
int ret;
if (trans->nr_iters >= BTREE_ITER_MAX) {
struct btree_iter *iter;
trans_for_each_iter(trans, iter) {
pr_err("iter: btree %s pos %llu:%llu%s%s%s",
bch2_btree_ids[iter->btree_id],
iter->pos.inode,
iter->pos.offset,
(trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
(trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "");
}
panic("trans iter oveflow\n");
}
ret = bch2_trans_realloc_iters(trans, trans->size * 2);
if (ret)
return ERR_PTR(ret);
}
@@ -1854,60 +1864,94 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
return &trans->iters[idx];
}
static inline void btree_iter_copy(struct btree_iter *dst,
struct btree_iter *src)
{
unsigned i, idx = dst->idx;
*dst = *src;
dst->idx = idx;
for (i = 0; i < BTREE_MAX_DEPTH; i++)
if (btree_node_locked(dst, i))
six_lock_increment(&dst->l[i].b->c.lock,
__btree_lock_want(dst, i));
}
static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
{
if (bkey_cmp(l, r) > 0)
swap(l, r);
return POS(r.inode - l.inode, r.offset - l.offset);
}
static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
unsigned flags, u64 iter_id)
unsigned flags)
{
struct btree_iter *iter;
struct btree_iter *iter, *best = NULL;
BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
trans_for_each_iter(trans, iter)
if (iter_id
? iter->id == iter_id
: (iter->btree_id == btree_id &&
!bkey_cmp(iter->pos, pos)))
goto found;
trans_for_each_iter(trans, iter) {
if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
continue;
iter = NULL;
found:
if (!iter) {
if (iter->btree_id != btree_id)
continue;
if (best &&
bkey_cmp(bpos_diff(best->pos, pos),
bpos_diff(iter->pos, pos)) < 0)
continue;
best = iter;
}
if (!best) {
iter = btree_trans_iter_alloc(trans);
if (IS_ERR(iter))
return iter;
iter->id = iter_id;
bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
} else {
iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
} else if ((trans->iters_live & (1ULL << best->idx)) ||
(best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
iter = btree_trans_iter_alloc(trans);
if (IS_ERR(iter))
return iter;
if ((iter->flags & BTREE_ITER_INTENT) &&
!bch2_btree_iter_upgrade(iter, 1)) {
trace_trans_restart_upgrade(trans->ip);
return ERR_PTR(-EINTR);
}
btree_iter_copy(iter, best);
} else {
iter = best;
}
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
if (iter->flags & BTREE_ITER_INTENT)
bch2_btree_iter_upgrade(iter, 1);
else
bch2_btree_iter_downgrade(iter);
BUG_ON(iter->btree_id != btree_id);
BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
BUG_ON(trans->iters_live & (1ULL << iter->idx));
trans->iters_live |= 1ULL << iter->idx;
trans->iters_touched |= 1ULL << iter->idx;
BUG_ON(iter->btree_id != btree_id);
BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
return iter;
}
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
enum btree_id btree_id,
struct bpos pos, unsigned flags,
u64 iter_id)
struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans,
enum btree_id btree_id,
struct bpos pos, unsigned flags)
{
struct btree_iter *iter =
__btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);
__btree_trans_get_iter(trans, btree_id, pos, flags);
if (!IS_ERR(iter))
bch2_btree_iter_set_pos(iter, pos);
@@ -1923,7 +1967,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
{
struct btree_iter *iter =
__btree_trans_get_iter(trans, btree_id, pos,
flags|BTREE_ITER_NODES, 0);
flags|BTREE_ITER_NODES);
unsigned i;
BUG_ON(IS_ERR(iter));
@@ -1943,24 +1987,20 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
struct btree_iter *src)
{
struct btree_iter *iter;
int idx, i;
iter = btree_trans_iter_alloc(trans);
if (IS_ERR(iter))
return iter;
idx = iter->idx;
*iter = *src;
iter->idx = idx;
btree_iter_copy(iter, src);
trans->iters_live |= 1ULL << idx;
trans->iters_touched |= 1ULL << idx;
trans->iters_unlink_on_restart |= 1ULL << idx;
for (i = 0; i < BTREE_MAX_DEPTH; i++)
if (btree_node_locked(iter, i))
six_lock_increment(&iter->l[i].b->c.lock,
__btree_lock_want(iter, i));
trans->iters_live |= 1ULL << iter->idx;
/*
* Don't mark it as touched, we don't need to preserve this iter since
* it's cheap to copy it again:
*/
trans->iters_touched &= ~(1ULL << iter->idx);
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
return iter;
}
@@ -2001,10 +2041,11 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
return p;
}
inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
inline void bch2_trans_unlink_iters(struct btree_trans *trans)
{
iters &= trans->iters_linked;
iters &= ~trans->iters_live;
u64 iters = trans->iters_linked &
~trans->iters_touched &
~trans->iters_live;
while (iters) {
unsigned idx = __ffs64(iters);
@@ -2014,33 +2055,24 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
}
}
void bch2_trans_begin(struct btree_trans *trans)
void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
{
u64 iters_to_unlink;
struct btree_iter *iter;
/*
* On transaction restart, the transaction isn't required to allocate
* all the same iterators as it did on the last iteration:
*
* Unlink any iterators it didn't use this iteration, assuming it got
* further (allocated an iter with a higher idx) than where the iter
* was originally allocated:
*/
iters_to_unlink = ~trans->iters_live &
((1ULL << fls64(trans->iters_live)) - 1);
trans_for_each_iter(trans, iter)
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
iters_to_unlink |= trans->iters_unlink_on_restart;
iters_to_unlink |= trans->iters_unlink_on_commit;
bch2_trans_unlink_iters(trans);
trans->iters_live = 0;
if (flags & TRANS_RESET_ITERS)
trans->iters_live = 0;
bch2_trans_unlink_iters(trans, iters_to_unlink);
trans->iters_touched &= trans->iters_live;
trans->iters_touched = 0;
trans->iters_unlink_on_restart = 0;
trans->iters_unlink_on_commit = 0;
trans->nr_updates = 0;
trans->mem_top = 0;
if (flags & TRANS_RESET_MEM)
trans->mem_top = 0;
bch2_btree_iter_traverse_all(trans);
}
......
@@ -271,43 +271,30 @@ static inline int bkey_err(struct bkey_s_c k)
int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *);
void bch2_trans_unlink_iters(struct btree_trans *, u64);
void bch2_trans_unlink_iters(struct btree_trans *);
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
struct bpos, unsigned, u64);
struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id,
struct bpos, unsigned);
struct btree_iter *bch2_trans_copy_iter(struct btree_trans *,
struct btree_iter *);
struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
static __always_inline u64 __btree_iter_id(void)
{
u64 ret = 0;
#define TRANS_RESET_ITERS (1 << 0)
#define TRANS_RESET_MEM (1 << 1)
ret <<= 32;
ret |= _RET_IP_ & U32_MAX;
ret <<= 32;
ret |= _THIS_IP_ & U32_MAX;
return ret;
}
void bch2_trans_reset(struct btree_trans *, unsigned);
static __always_inline struct btree_iter *
bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
struct bpos pos, unsigned flags)
static inline void bch2_trans_begin(struct btree_trans *trans)
{
return __bch2_trans_get_iter(trans, btree_id, pos, flags,
__btree_iter_id());
return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM);
}
struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
void bch2_trans_begin(struct btree_trans *);
static inline void bch2_trans_begin_updates(struct btree_trans *trans)
{
trans->nr_updates = 0;
return bch2_trans_reset(trans, TRANS_RESET_MEM);
}
void *bch2_trans_kmalloc(struct btree_trans *, size_t);
......
@@ -191,12 +191,13 @@ enum btree_iter_type {
#define BTREE_ITER_SLOTS (1 << 2)
#define BTREE_ITER_INTENT (1 << 3)
#define BTREE_ITER_PREFETCH (1 << 4)
#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
#define BTREE_ITER_IS_EXTENTS (1 << 5)
#define BTREE_ITER_ERROR (1 << 6)
#define BTREE_ITER_IS_EXTENTS (1 << 6)
#define BTREE_ITER_ERROR (1 << 7)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -237,8 +238,6 @@ struct btree_iter {
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
u64 id;
};
static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
@@ -261,8 +260,6 @@ struct btree_trans {
u64 iters_linked;
u64 iters_live;
u64 iters_touched;
u64 iters_unlink_on_restart;
u64 iters_unlink_on_commit;
u8 nr_iters;
u8 nr_updates;
......
@@ -107,6 +107,8 @@ static inline void bch2_trans_update(struct btree_trans *trans,
{
EBUG_ON(trans->nr_updates >= trans->nr_iters + 4);
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans->updates[trans->nr_updates++] = (struct btree_insert_entry) {
.iter = iter, .k = k
};
......
@@ -752,6 +752,7 @@ int bch2_trans_commit(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_iter *iter;
unsigned orig_nr_updates = trans->nr_updates;
unsigned orig_mem_top = trans->mem_top;
int ret = 0;
@@ -814,9 +815,11 @@ int bch2_trans_commit(struct btree_trans *trans,
BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
trans_for_each_iter(trans, iter)
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
if (!ret) {
bch2_trans_unlink_iters(trans, ~trans->iters_touched|
trans->iters_unlink_on_commit);
bch2_trans_unlink_iters(trans);
trans->iters_touched = 0;
}
trans->nr_updates = 0;
......
@@ -1369,13 +1369,11 @@ static int trans_get_key(struct btree_trans *trans,
return 1;
}
*iter = __bch2_trans_get_iter(trans, btree_id, pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0);
*iter = bch2_trans_get_iter(trans, btree_id, pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
if (IS_ERR(*iter))
return PTR_ERR(*iter);
bch2_trans_iter_free_on_commit(trans, *iter);
*k = bch2_btree_iter_peek_slot(*iter);
ret = bkey_err(*k);
if (ret)
......
@@ -282,7 +282,7 @@ static int sum_sector_overwrites(struct btree_trans *trans,
old = bch2_btree_iter_next_slot(iter);
}
bch2_trans_iter_free(trans, iter);
bch2_trans_iter_put(trans, iter);
return 0;
}
@@ -2786,7 +2786,7 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
bch2_disk_reservation_put(c, &disk_res);
bkey_err:
if (del)
bch2_trans_iter_free(&trans, del);
bch2_trans_iter_put(&trans, del);
del = NULL;
if (!ret)
......
@@ -355,6 +355,7 @@ static void __bch2_write_index(struct bch_write_op *op)
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
BUG_ON(ret == -EINTR);
BUG_ON(keylist_sectors(keys) && !ret);
op->written += sectors_start - keylist_sectors(keys);
@@ -1337,6 +1338,8 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
}
if (ret == -EINTR)
goto retry;
/*
* If we get here, it better have been because there was an error
* reading a btree node
@@ -1610,9 +1613,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
*offset_into_extent;
iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
POS(0, reflink_offset),
BTREE_ITER_SLOTS, 1);
iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
POS(0, reflink_offset),
BTREE_ITER_SLOTS);
ret = PTR_ERR_OR_ZERO(iter);
if (ret)
return ret;
@@ -1888,8 +1891,6 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
BCH_READ_USER_MAPPED;
int ret;
bch2_trans_init(&trans, c, 0, 0);
BUG_ON(rbio->_state);
BUG_ON(flags & BCH_READ_NODECODE);
BUG_ON(flags & BCH_READ_IN_RETRY);
@@ -1897,10 +1898,13 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
rbio->c = c;
rbio->start_time = local_clock();
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS(inode, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_SLOTS);
while (1) {
BKEY_PADDED(k) tmp;
unsigned bytes, sectors, offset_into_extent;
@@ -1955,6 +1959,9 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
bch2_trans_exit(&trans);
return;
err:
if (ret == -EINTR)
goto retry;
bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
bch2_rbio_done(rbio);
goto out;
......
@@ -190,10 +190,10 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
BTREE_ITER_INTENT, 1);
dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
BTREE_ITER_INTENT, 2);
src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
BTREE_ITER_INTENT);
dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
BTREE_ITER_INTENT);
while (1) {
bch2_trans_begin_updates(&trans);
......
@@ -202,12 +202,13 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
if (k.k->type == desc.key_type &&
desc.hash_bkey(info, k) <= start->pos.offset) {
bch2_trans_iter_free_on_commit(trans, iter);
return 1;
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
ret = 1;
break;
}
}
bch2_trans_iter_free(trans, iter);
bch2_trans_iter_put(trans, iter);
return ret;
}
@@ -247,11 +248,14 @@ int bch2_hash_set(struct btree_trans *trans,
goto not_found;
}
if (!ret)
ret = -ENOSPC;
out:
if (slot)
bch2_trans_iter_free(trans, slot);
bch2_trans_iter_free(trans, iter);
bch2_trans_iter_put(trans, slot);
bch2_trans_iter_put(trans, iter);
return ret ?: -ENOSPC;
return ret;
found:
found = true;
not_found:
@@ -261,17 +265,14 @@ int bch2_hash_set(struct btree_trans *trans,
} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
if (!found && slot) {
bch2_trans_iter_free(trans, iter);
iter = slot;
}
if (!found && slot)
swap(iter, slot);
insert->k.p = iter->pos;
bch2_trans_update(trans, iter, insert);
bch2_trans_iter_free_on_commit(trans, iter);
}
return ret;
goto out;
}
static __always_inline
......