Commit d0cc3def authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: More allocator startup improvements

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent b8adb833
@@ -347,12 +347,14 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
     return ret;
 }
 
-int bch2_alloc_write(struct bch_fs *c)
+int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote)
 {
     struct bch_dev *ca;
     unsigned i;
     int ret = 0;
 
+    *wrote = false;
+
     for_each_rw_member(ca, c, i) {
         struct btree_iter iter;
         struct bucket_array *buckets;
@@ -370,9 +372,14 @@ int bch2_alloc_write(struct bch_fs *c)
         if (!buckets->b[b].mark.dirty)
             continue;
 
-        ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL, 0);
+        ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL,
+                                     nowait
+                                     ? BTREE_INSERT_NOWAIT
+                                     : 0);
         if (ret)
             break;
+
+        *wrote = true;
     }
     up_read(&ca->bucket_lock);
     bch2_btree_iter_unlock(&iter);
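
The bool pair is the point of the change: ret reports a hard error, while *wrote reports whether any dirty alloc key actually made it into the btree. A minimal sketch of the blocking calling convention (alloc_write_blocking() is a hypothetical wrapper, not part of this commit):

    static int alloc_write_blocking(struct bch_fs *c)
    {
        bool wrote;

        /* nowait=false: may block on btree reserves; *wrote is still
         * filled in, but a blocking caller usually only needs ret */
        return bch2_alloc_write(c, false, &wrote);
    }

bch2_fs_allocator_start() at the end of this file uses this blocking form; the allocator-startup loop further down uses nowait=true and inspects *wrote to decide whether to retry.
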
@@ -1270,20 +1277,23 @@ static void flush_held_btree_writes(struct bch_fs *c)
     struct bucket_table *tbl;
     struct rhash_head *pos;
     struct btree *b;
-    bool flush_updates;
-    size_t i, nr_pending_updates;
+    bool nodes_blocked;
+    size_t i;
+    struct closure cl;
+
+    closure_init_stack(&cl);
 
     clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
 again:
     pr_debug("flushing dirty btree nodes");
     cond_resched();
+    closure_wait(&c->btree_interior_update_wait, &cl);
 
-    flush_updates = false;
-    nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
+    nodes_blocked = false;
 
     rcu_read_lock();
     for_each_cached_btree(b, c, tbl, i, pos)
-        if (btree_node_dirty(b) && (!b->written || b->level)) {
+        if (btree_node_need_write(b)) {
             if (btree_node_may_write(b)) {
                 rcu_read_unlock();
                 btree_node_lock_type(c, b, SIX_LOCK_read);
@@ -1291,7 +1301,7 @@ static void flush_held_btree_writes(struct bch_fs *c)
                 six_unlock_read(&b->lock);
                 goto again;
             } else {
-                flush_updates = true;
+                nodes_blocked = true;
             }
         }
     rcu_read_unlock();
@@ -1299,17 +1309,16 @@ static void flush_held_btree_writes(struct bch_fs *c)
     if (c->btree_roots_dirty)
         bch2_journal_meta(&c->journal);
 
-    /*
-     * This is ugly, but it's needed to flush btree node writes
-     * without spinning...
-     */
-    if (flush_updates) {
-        closure_wait_event(&c->btree_interior_update_wait,
-                           bch2_btree_interior_updates_nr_pending(c) <
-                           nr_pending_updates);
+    if (nodes_blocked) {
+        closure_sync(&cl);
         goto again;
     }
 
+    closure_wake_up(&c->btree_interior_update_wait);
+    closure_sync(&cl);
+
+    closure_wait_event(&c->btree_interior_update_wait,
+                       !bch2_btree_interior_updates_nr_pending(c));
 }
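
The rewritten loop is the classic enqueue-before-test wait pattern: putting ourselves on the waitlist before scanning means a wakeup that fires between the scan and the sleep cannot be lost. The pattern in isolation (a sketch; wait_list and still_blocked() are placeholders, the closure calls are from the kernel closure library bcachefs uses):

    extern bool still_blocked(void);    /* placeholder condition */

    static void wait_until_unblocked(struct closure_waitlist *wait_list)
    {
        struct closure cl;

        closure_init_stack(&cl);
    again:
        closure_wait(wait_list, &cl);   /* enqueue BEFORE testing */

        if (still_blocked()) {
            closure_sync(&cl);          /* sleep until closure_wake_up() */
            goto again;                 /* then re-test from the top */
        }

        /* not sleeping: wake the list (releasing our own entry), then
         * settle our stack closure's refs before it goes out of scope */
        closure_wake_up(wait_list);
        closure_sync(&cl);
    }
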
static void allocator_start_issue_discards(struct bch_fs *c)
@@ -1331,13 +1340,10 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
     unsigned dev_iter;
     u64 journal_seq = 0;
     long bu;
-    bool invalidating_data = false;
     int ret = 0;
 
-    if (test_alloc_startup(c)) {
-        invalidating_data = true;
+    if (test_alloc_startup(c))
         goto not_enough;
-    }
 
     /* Scan for buckets that are already invalidated: */
     for_each_rw_member(ca, c, dev_iter) {
@@ -1384,13 +1390,27 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 not_enough:
     pr_debug("not enough empty buckets; scanning for reclaimable buckets");
 
+    /*
+     * We're moving buckets to freelists _before_ they've been marked as
+     * invalidated on disk - we have to so that we can allocate new btree
+     * nodes to mark them as invalidated on disk.
+     *
+     * However, we can't _write_ to any of these buckets yet - they might
+     * have cached data in them, which is live until they're marked as
+     * invalidated on disk:
+     */
+    set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+
+    while (1) {
+        bool wrote = false;
+
         for_each_rw_member(ca, c, dev_iter) {
             find_reclaimable_buckets(c, ca);
 
             while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
                    (bu = next_alloc_bucket(ca)) >= 0) {
-                invalidating_data |=
-                    bch2_invalidate_one_bucket(c, ca, bu, &journal_seq);
+                bch2_invalidate_one_bucket(c, ca, bu,
+                                           &journal_seq);
 
                 fifo_push(&ca->free[RESERVE_BTREE], bu);
                 bucket_set_dirty(ca, bu);
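
The "can't write yet" half of that comment is enforced in this commit by (among other things) the __btree_node_reclaim() hunk further below: while the bit is held, cache reclaim refuses to write out dirty nodes:

    if (btree_node_dirty(b) &&
        test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
        goto out_unlock;

flush_held_btree_writes() clears the bit and writes everything out once the invalidation keys are safely in the journal.
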
@@ -1400,45 +1420,37 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
         pr_debug("done scanning for reclaimable buckets");
 
         /*
-         * We're moving buckets to freelists _before_ they've been marked as
-         * invalidated on disk - we have to so that we can allocate new btree
-         * nodes to mark them as invalidated on disk.
-         *
-         * However, we can't _write_ to any of these buckets yet - they might
-         * have cached data in them, which is live until they're marked as
-         * invalidated on disk:
+         * XXX: it's possible for this to deadlock waiting on journal reclaim,
+         * since we're holding btree writes. What then?
          */
-        if (invalidating_data) {
-            pr_debug("invalidating existing data");
-            set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
-        } else {
-            pr_debug("issuing discards");
-            allocator_start_issue_discards(c);
-        }
+        ret = bch2_alloc_write(c, true, &wrote);
 
         /*
-         * XXX: it's possible for this to deadlock waiting on journal reclaim,
-         * since we're holding btree writes. What then?
+         * If bch2_alloc_write() did anything, it may have used some
+         * buckets, and we need the RESERVE_BTREE freelist full - so we
+         * need to loop and scan again.
+         * And if it errored, it may have been because there weren't
+         * enough buckets, so just scan and loop again as long as it
+         * made some progress:
          */
-        ret = bch2_alloc_write(c);
-        if (ret)
+        if (!wrote && ret)
             return ret;
+        if (!wrote && !ret)
+            break;
+    }
 
-    if (invalidating_data) {
     pr_debug("flushing journal");
 
-    ret = bch2_journal_flush_seq(&c->journal, journal_seq);
+    ret = bch2_journal_flush(&c->journal);
     if (ret)
         return ret;
 
     pr_debug("issuing discards");
     allocator_start_issue_discards(c);
-    }
 
     set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
 
     /* now flush dirty btree nodes: */
-    if (invalidating_data)
-        flush_held_btree_writes(c);
+    flush_held_btree_writes(c);
 
     return 0;
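
Condensed, the loop's exit logic is progress-driven. This sketch (with the scanning collapsed into a hypothetical scan_and_invalidate_buckets() helper) shows the three outcomes:

    bool wrote;
    int ret;

    while (1) {
        scan_and_invalidate_buckets(c);     /* refill RESERVE_BTREE */

        ret = bch2_alloc_write(c, true, &wrote);
        if (!wrote && ret)
            return ret;     /* error and no progress: give up */
        if (!wrote && !ret)
            break;          /* nothing left to write: done */
        /* wrote something: buckets may have been consumed, rescan */
    }

Only after the loop settles does the new code flush the journal, issue discards, and release the held btree writes via flush_held_btree_writes().
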
@@ -1448,6 +1460,7 @@ int bch2_fs_allocator_start(struct bch_fs *c)
 {
     struct bch_dev *ca;
     unsigned i;
+    bool wrote;
     int ret;
 
     down_read(&c->gc_lock);
@@ -1465,7 +1478,7 @@ int bch2_fs_allocator_start(struct bch_fs *c)
         }
     }
 
-    return bch2_alloc_write(c);
+    return bch2_alloc_write(c, false, &wrote);
 }
void bch2_fs_allocator_background_init(struct bch_fs *c)
......
@@ -55,7 +55,7 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
 
-int bch2_alloc_write(struct bch_fs *);
+int bch2_alloc_write(struct bch_fs *, bool, bool *);
 int bch2_fs_allocator_start(struct bch_fs *);
 void bch2_fs_allocator_background_init(struct bch_fs *);
......
@@ -171,6 +171,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
     if (!btree_node_may_write(b))
         goto out_unlock;
 
+    if (btree_node_dirty(b) &&
+        test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+        goto out_unlock;
+
     if (btree_node_dirty(b) ||
         btree_node_write_in_flight(b) ||
         btree_node_read_in_flight(b)) {
......
@@ -1330,8 +1330,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
     if (!(old & (1 << BTREE_NODE_dirty)))
         return;
 
-    if (b->written &&
-        !btree_node_may_write(b))
+    if (!btree_node_may_write(b))
         return;
 
     if (old & (1 << BTREE_NODE_write_in_flight)) {
@@ -1347,7 +1346,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
     } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
     BUG_ON(btree_node_fake(b));
-    BUG_ON(!list_empty(&b->write_blocked));
     BUG_ON((b->will_make_reachable != 0) != !b->written);
     BUG_ON(b->written >= c->opts.btree_node_size);
@@ -1685,15 +1683,13 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
         unsigned long flags = READ_ONCE(b->flags);
         unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
 
-        if (//!(flags & (1 << BTREE_NODE_dirty)) &&
-            !b->writes[0].wait.list.first &&
-            !b->writes[1].wait.list.first &&
-            !(b->will_make_reachable & 1))
+        if (!(flags & (1 << BTREE_NODE_dirty)))
             continue;
 
-        pr_buf(&out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
+        pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
                b,
                (flags & (1 << BTREE_NODE_dirty)) != 0,
+               (flags & (1 << BTREE_NODE_need_write)) != 0,
                b->level,
                b->written,
                !list_empty_careful(&b->write_blocked),
......
@@ -3,6 +3,7 @@
 #define _BCACHEFS_BTREE_IO_H
 
 #include "bset.h"
 #include "btree_locking.h"
+#include "extents.h"
 #include "io_types.h"
@@ -48,7 +49,7 @@ static inline void btree_node_wait_on_io(struct btree *b)
 static inline bool btree_node_may_write(struct btree *b)
 {
     return list_empty_careful(&b->write_blocked) &&
-           !b->will_make_reachable;
+           (!b->written || !b->will_make_reachable);
 }
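
The predicate is relaxed rather than tightened: a node that has never been written (b->written == 0) is now allowed to be written even while its will_make_reachable update is still pending. Spelled out as a sketch ("blocked" = write_blocked list non-empty):

    blocked   written   will_make_reachable   may_write
    yes       any       any                   no
    no        0         any                   yes   <- new case
    no        >0        set                   no
    no        >0        clear                 yes
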
enum compact_mode {
@@ -100,42 +101,36 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
                enum six_lock_type);
 
-/*
- * btree_node_dirty() can be cleared with only a read lock,
- * and for bch2_btree_node_write_cond() we want to set need_write iff it's
- * still dirty:
- */
-static inline void set_btree_node_need_write_if_dirty(struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
 {
-    unsigned long old, new, v = READ_ONCE(b->flags);
-
-    do {
-        old = new = v;
-
-        if (!(old & (1 << BTREE_NODE_dirty)))
-            return;
-
-        new |= (1 << BTREE_NODE_need_write);
-    } while ((v = cmpxchg(&b->flags, old, new)) != old);
+    while (b->written &&
+           btree_node_need_write(b) &&
+           btree_node_may_write(b)) {
+        if (!btree_node_write_in_flight(b)) {
+            bch2_btree_node_write(c, b, SIX_LOCK_read);
+            break;
+        }
+
+        six_unlock_read(&b->lock);
+        btree_node_wait_on_io(b);
+        btree_node_lock_type(c, b, SIX_LOCK_read);
+    }
 }
 
 #define bch2_btree_node_write_cond(_c, _b, cond) \
 do { \
-    while ((_b)->written && btree_node_dirty(_b) && (cond)) { \
-        if (!btree_node_may_write(_b)) { \
-            set_btree_node_need_write_if_dirty(_b); \
-            break; \
-        } \
+    unsigned long old, new, v = READ_ONCE((_b)->flags); \
+ \
+    do { \
+        old = new = v; \
 \
-        if (!btree_node_write_in_flight(_b)) { \
-            bch2_btree_node_write(_c, _b, SIX_LOCK_read); \
+        if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \
             break; \
-        } \
 \
-        six_unlock_read(&(_b)->lock); \
-        btree_node_wait_on_io(_b); \
-        btree_node_lock_type(c, b, SIX_LOCK_read); \
-    } \
+        new |= (1 << BTREE_NODE_need_write); \
+    } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
+ \
+    btree_node_write_if_need(_c, _b); \
 } while (0)
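
The macro now does its flag update with a standard compare-and-swap retry loop: set need_write only if the node is still dirty and the caller's condition still holds, then hand off to btree_node_write_if_need(). The same loop in portable C11, as a self-contained sketch (the bit values are assumed for illustration, not bcachefs's actual layout):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define NODE_DIRTY      (1UL << 0)  /* assumed bit layout */
    #define NODE_NEED_WRITE (1UL << 1)

    /* Set NEED_WRITE atomically, but only while DIRTY (and cond) still hold: */
    static bool set_need_write_if_dirty(_Atomic unsigned long *flags, bool cond)
    {
        unsigned long old = atomic_load(flags);

        do {
            if (!(old & NODE_DIRTY) || !cond)
                return false;   /* lost the race: node went clean, do nothing */
        } while (!atomic_compare_exchange_weak(flags, &old,
                                               old | NODE_NEED_WRITE));
        return true;
    }

This preserves the invariant the removed comment described: btree_node_dirty() can be cleared under only a read lock, so the dirty test and the need_write set must happen as one atomic step.
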
void bch2_btree_flush_all_reads(struct bch_fs *);
......
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_BTREE_ITER_H
 #define _BCACHEFS_BTREE_ITER_H
 
+#include "bset.h"
 #include "btree_types.h"
 
 static inline void btree_iter_set_dirty(struct btree_iter *iter,
......
@@ -11,7 +11,6 @@
  */
 
 #include "btree_iter.h"
-#include "btree_io.h"
 #include "six.h"
 
 /* matches six lock types */
......
@@ -367,6 +367,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
     set_btree_node_accessed(b);
     set_btree_node_dirty(b);
+    set_btree_node_need_write(b);
 
     bch2_bset_init_first(b, &b->data->keys);
     memset(&b->nr, 0, sizeof(b->nr));
@@ -655,6 +656,12 @@ static void btree_update_nodes_written(struct closure *cl)
             closure_wait(&btree_current_write(b)->wait, cl);
 
         list_del(&as->write_blocked_list);
+
+        /*
+         * for flush_held_btree_writes() waiting on updates to flush or
+         * nodes to be writeable:
+         */
+        closure_wake_up(&c->btree_interior_update_wait);
 
         mutex_unlock(&c->btree_interior_update_lock);
/*
@@ -958,6 +965,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
     list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
         list_del(&p->write_blocked_list);
         btree_update_reparent(as, p);
+
+        /*
+         * for flush_held_btree_writes() waiting on updates to flush or
+         * nodes to be writeable:
+         */
+        closure_wake_up(&c->btree_interior_update_wait);
     }
 
     clear_btree_node_dirty(b);
......
@@ -1038,7 +1038,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
     size_t reserve_none   = max_t(size_t, 1, nbuckets >> 9);
     size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
     size_t free_inc_nr    = max(max_t(size_t, 1, nbuckets >> 12),
-                                btree_reserve);
+                                btree_reserve * 2);
     bool resize = ca->buckets[0] != NULL,
          start_copygc = ca->copygc_thread != NULL;
     int ret = -ENOMEM;
......
@@ -25,9 +25,6 @@
 #include "eytzinger.h"
 #include "util.h"
 
-#define simple_strtoint(c, end, base)   simple_strtol(c, end, base)
-#define simple_strtouint(c, end, base)  simple_strtoul(c, end, base)
-
 static const char si_units[] = "?kMGTPEZY";
 
 static int __bch2_strtoh(const char *cp, u64 *res,
......