Commit b29e197a authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Invalidate buckets when writing to alloc btree

Prep work for persistent alloc information. Refactoring also lets us
make free_inc much smaller, which means a lot fewer buckets stranded on
freelists.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent b2be7c8b
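
The diff below replaces the old flow (fill free_inc directly, sort it, then invalidate it in bulk) with a staging heap: find_reclaimable_buckets() collects candidate ranges in ca->alloc_heap, and bch2_invalidate_buckets() drains that heap one bucket at a time into free_inc, rewriting each bucket's alloc key as it goes. For orientation, here is a minimal, self-contained userspace sketch of that drain loop; the array-backed heap, the fixed-size free_inc, and all sizes below are simplified stand-ins for illustration only, not the kernel data structures.

/*
 * Simplified sketch (not the kernel code) of the new allocator flow:
 * reclaimable buckets are staged in alloc_heap as {bucket, nr} ranges,
 * then drained one bucket at a time into free_inc as each is invalidated.
 */
#include <stdio.h>
#include <stddef.h>

struct alloc_heap_entry { size_t bucket; size_t nr; };

#define HEAP_MAX	8
#define FREE_INC_MAX	4

static struct alloc_heap_entry heap[HEAP_MAX];
static size_t heap_used;

static size_t free_inc[FREE_INC_MAX];
static size_t free_inc_used;

/* Analogous to next_alloc_bucket(): consume the front heap entry. */
static long next_alloc_bucket(void)
{
	while (heap_used) {
		struct alloc_heap_entry *top = &heap[0];

		if (top->nr) {
			size_t b = top->bucket++;
			top->nr--;
			return (long) b;
		}

		/* entry exhausted: drop it (simple shift stands in for heap_pop) */
		for (size_t i = 1; i < heap_used; i++)
			heap[i - 1] = heap[i];
		heap_used--;
	}
	return -1;
}

/* Analogous to bch2_invalidate_buckets(): fill free_inc from the heap. */
static void invalidate_buckets(void)
{
	long b;

	while (free_inc_used < FREE_INC_MAX &&
	       (b = next_alloc_bucket()) >= 0) {
		/* the real code invalidates the bucket here and rewrites its alloc key */
		free_inc[free_inc_used++] = (size_t) b;
		printf("invalidated bucket %ld\n", b);
	}
}

int main(void)
{
	heap[heap_used++] = (struct alloc_heap_entry) { .bucket = 10, .nr = 3 };
	heap[heap_used++] = (struct alloc_heap_entry) { .bucket = 42, .nr = 5 };

	invalidate_buckets();	/* stops once free_inc is full */
	printf("free_inc has %zu buckets\n", free_inc_used);
	return 0;
}

Because buckets only reach free_inc as they are actually invalidated and written out, free_inc no longer needs to hold a whole scan's worth of candidates, which is why the commit can shrink it and strand far fewer buckets on freelists.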
@@ -288,53 +288,41 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct btree_iter *iter,
u64 *journal_seq, bool nowait)
u64 *journal_seq, unsigned flags)
{
struct bucket_mark m;
__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
struct bucket *g;
struct bkey_i_alloc *a;
u8 *d;
int ret;
unsigned flags = BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE;
if (nowait)
flags |= BTREE_INSERT_NOWAIT;
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
percpu_down_read(&c->usage_lock);
g = bucket(ca, b);
m = READ_ONCE(g->mark);
a = bkey_alloc_init(&alloc_key.k);
a->k.p = POS(ca->dev_idx, b);
a->v.fields = 0;
a->v.gen = m.gen;
set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
d = a->v.data;
if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
put_alloc_field(&d, 2, g->io_time[WRITE]);
percpu_up_read(&c->usage_lock);
do {
ret = btree_iter_err(bch2_btree_iter_peek_slot(iter));
if (ret)
break;
bch2_btree_iter_cond_resched(iter);
percpu_down_read(&c->usage_lock);
g = bucket(ca, b);
/* read mark under btree node lock: */
m = READ_ONCE(g->mark);
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
a->v.fields = 0;
a->v.gen = m.gen;
set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
d = a->v.data;
if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
put_alloc_field(&d, 2, g->io_time[WRITE]);
percpu_up_read(&c->usage_lock);
ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
bch2_btree_iter_cond_resched(iter);
} while (ret == -EINTR);
bch2_btree_iter_set_pos(iter, a->k.p);
return ret;
return bch2_btree_insert_at(c, NULL, NULL, journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
}
int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
@@ -354,8 +342,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter,
NULL, false);
ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0);
bch2_btree_iter_unlock(&iter);
return ret;
}
@@ -375,8 +362,8 @@ int bch2_alloc_write(struct bch_fs *c)
down_read(&ca->bucket_lock);
for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
ret = __bch2_alloc_write_key(c, ca, bucket, &iter,
NULL, false);
ret = __bch2_alloc_write_key(c, ca, bucket,
&iter, NULL, 0);
if (ret)
break;
@@ -582,47 +569,6 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
return gc_gen < BUCKET_GC_GEN_MAX;
}
static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t bucket)
{
struct bucket_mark m;
percpu_down_read(&c->usage_lock);
spin_lock(&c->freelist_lock);
if (!bch2_invalidate_bucket(c, ca, bucket, &m)) {
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->usage_lock);
return;
}
verify_not_on_freelist(c, ca, bucket);
BUG_ON(!fifo_push(&ca->free_inc, bucket));
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->usage_lock);
/* gc lock held: */
bucket_io_clock_reset(c, ca, bucket, READ);
bucket_io_clock_reset(c, ca, bucket, WRITE);
if (m.cached_sectors) {
ca->allocator_invalidating_data = true;
} else if (m.journal_seq_valid) {
u64 journal_seq = atomic64_read(&c->journal.seq);
u64 bucket_seq = journal_seq;
bucket_seq &= ~((u64) U16_MAX);
bucket_seq |= m.journal_seq;
if (bucket_seq > journal_seq)
bucket_seq -= 1 << 16;
ca->allocator_journal_seq_flush =
max(ca->allocator_journal_seq_flush, bucket_seq);
}
}
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
@@ -674,11 +620,18 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
(l.bucket > r.bucket) - (l.bucket < r.bucket);
}
static inline int bucket_idx_cmp(const void *_l, const void *_r)
{
const struct alloc_heap_entry *l = _l, *r = _r;
return (l->bucket > r->bucket) - (l->bucket < r->bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
size_t b;
size_t b, i, nr = 0;
ca->alloc_heap.used = 0;
@@ -720,55 +673,58 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
if (e.nr)
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
up_read(&ca->bucket_lock);
mutex_unlock(&c->bucket_clock[READ].lock);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
for (b = e.bucket;
b < e.bucket + e.nr;
b++) {
if (fifo_full(&ca->free_inc))
return;
for (i = 0; i < ca->alloc_heap.used; i++)
nr += ca->alloc_heap.data[i].nr;
bch2_invalidate_one_bucket(c, ca, b);
}
while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
nr -= ca->alloc_heap.data[0].nr;
heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp);
}
up_read(&ca->bucket_lock);
mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
struct bucket_mark m;
size_t b, checked;
size_t b, start;
for (checked = 0;
checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc);
checked++) {
if (ca->fifo_last_bucket < ca->mi.first_bucket ||
ca->fifo_last_bucket >= ca->mi.nbuckets)
ca->fifo_last_bucket = ca->mi.first_bucket;
if (ca->fifo_last_bucket < ca->mi.first_bucket ||
ca->fifo_last_bucket >= ca->mi.nbuckets)
ca->fifo_last_bucket = ca->mi.first_bucket;
start = ca->fifo_last_bucket;
b = ca->fifo_last_bucket++;
do {
ca->fifo_last_bucket++;
if (ca->fifo_last_bucket == ca->mi.nbuckets)
ca->fifo_last_bucket = ca->mi.first_bucket;
b = ca->fifo_last_bucket;
m = READ_ONCE(buckets->b[b].mark);
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
if (bch2_can_invalidate_bucket(ca, b, m)) {
struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
heap_add(&ca->alloc_heap, e, bucket_alloc_cmp);
if (heap_full(&ca->alloc_heap))
break;
}
cond_resched();
}
} while (ca->fifo_last_bucket != start);
}
static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
struct bucket_mark m;
size_t checked;
size_t checked, i;
for (checked = 0;
checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc);
checked < ca->mi.nbuckets / 2;
checked++) {
size_t b = bch2_rand_range(ca->mi.nbuckets -
ca->mi.first_bucket) +
@@ -776,17 +732,34 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca
m = READ_ONCE(buckets->b[b].mark);
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
if (bch2_can_invalidate_bucket(ca, b, m)) {
struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
heap_add(&ca->alloc_heap, e, bucket_alloc_cmp);
if (heap_full(&ca->alloc_heap))
break;
}
cond_resched();
}
sort(ca->alloc_heap.data,
ca->alloc_heap.used,
sizeof(ca->alloc_heap.data[0]),
bucket_idx_cmp, NULL);
/* remove duplicates: */
for (i = 0; i + 1 < ca->alloc_heap.used; i++)
if (ca->alloc_heap.data[i].bucket ==
ca->alloc_heap.data[i + 1].bucket)
ca->alloc_heap.data[i].nr = 0;
}
static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
size_t i, nr = 0;
ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
case CACHE_REPLACEMENT_LRU:
@@ -799,86 +772,132 @@ static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
find_reclaimable_buckets_random(c, ca);
break;
}
heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
for (i = 0; i < ca->alloc_heap.used; i++)
nr += ca->alloc_heap.data[i].nr;
return nr;
}
static int size_t_cmp(const void *_l, const void *_r)
static inline long next_alloc_bucket(struct bch_dev *ca)
{
const size_t *l = _l, *r = _r;
struct alloc_heap_entry e, *top = ca->alloc_heap.data;
while (ca->alloc_heap.used) {
if (top->nr) {
size_t b = top->bucket;
top->bucket++;
top->nr--;
return b;
}
return (*l > *r) - (*l < *r);
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp);
}
return -1;
}
static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca)
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t bucket, u64 *flush_seq)
{
BUG_ON(ca->free_inc.front);
struct bucket_mark m;
percpu_down_read(&c->usage_lock);
spin_lock(&c->freelist_lock);
sort(ca->free_inc.data,
ca->free_inc.back,
sizeof(ca->free_inc.data[0]),
size_t_cmp, NULL);
bch2_invalidate_bucket(c, ca, bucket, &m);
verify_not_on_freelist(c, ca, bucket);
BUG_ON(!fifo_push(&ca->free_inc, bucket));
spin_unlock(&c->freelist_lock);
bucket_io_clock_reset(c, ca, bucket, READ);
bucket_io_clock_reset(c, ca, bucket, WRITE);
percpu_up_read(&c->usage_lock);
if (m.journal_seq_valid) {
u64 journal_seq = atomic64_read(&c->journal.seq);
u64 bucket_seq = journal_seq;
bucket_seq &= ~((u64) U16_MAX);
bucket_seq |= m.journal_seq;
if (bucket_seq > journal_seq)
bucket_seq -= 1 << 16;
*flush_seq = max(*flush_seq, bucket_seq);
}
return m.cached_sectors != 0;
}
static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
u64 *journal_seq, size_t nr,
bool nowait)
/*
* Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
*/
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
struct btree_iter iter;
u64 journal_seq = 0;
int ret = 0;
long b;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
/* Only use nowait if we've already invalidated at least one bucket: */
while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) {
size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated);
ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq,
nowait && ca->nr_invalidated);
if (ret)
break;
ca->nr_invalidated++;
while (!ret &&
!fifo_full(&ca->free_inc) &&
(b = next_alloc_bucket(ca)) >= 0) {
bool must_flush =
bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
ret = __bch2_alloc_write_key(c, ca, b, &iter,
must_flush ? &journal_seq : NULL,
!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
}
bch2_btree_iter_unlock(&iter);
/* If we used NOWAIT, don't return the error: */
return ca->nr_invalidated ? 0 : ret;
}
static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
unsigned i;
if (!fifo_empty(&ca->free_inc))
ret = 0;
if (ret) {
bch_err(ca, "error invalidating buckets: %i", ret);
return ret;
}
/*
* Don't remove from free_inc until after it's added to
* freelist, so gc can find it:
*/
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
--ca->nr_invalidated;
closure_wake_up(&c->freelist_wait);
spin_unlock(&c->freelist_lock);
return true;
}
spin_unlock(&c->freelist_lock);
if (journal_seq)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
if (ret) {
bch_err(ca, "journal error: %i", ret);
return ret;
}
return false;
return 0;
}
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
unsigned i;
int ret = 0;
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (__push_invalidated_bucket(c, ca, bucket))
break;
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
closure_wake_up(&c->freelist_wait);
spin_unlock(&c->freelist_lock);
goto out;
}
spin_unlock(&c->freelist_lock);
if ((current->flags & PF_KTHREAD) &&
kthread_should_stop()) {
@@ -889,22 +908,20 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
schedule();
try_to_freeze();
}
out:
__set_current_state(TASK_RUNNING);
return ret;
}
/*
* Given an invalidated, ready to use bucket: issue a discard to it if enabled,
* then add it to the freelist, waiting until there's room if necessary:
* Pulls buckets off free_inc, discards them (if enabled), then adds them to
* freelists, waiting until there's room if necessary:
*/
static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
{
while (ca->nr_invalidated) {
while (!fifo_empty(&ca->free_inc)) {
size_t bucket = fifo_peek(&ca->free_inc);
BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated);
if (ca->mi.discard &&
bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev,
@@ -930,68 +947,32 @@ static int bch2_allocator_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
u64 journal_seq;
size_t nr;
int ret;
set_freezable();
while (1) {
while (1) {
cond_resched();
pr_debug("discarding %zu invalidated buckets",
ca->nr_invalidated);
ret = discard_invalidated_buckets(c, ca);
if (ret)
goto stop;
if (fifo_empty(&ca->free_inc))
break;
cond_resched();
pr_debug("invalidating %zu buckets",
fifo_used(&ca->free_inc));
pr_debug("discarding %zu invalidated buckets",
fifo_used(&ca->free_inc));
journal_seq = 0;
ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
SIZE_MAX, true);
if (ret) {
bch_err(ca, "error invalidating buckets: %i", ret);
goto stop;
}
if (!ca->nr_invalidated) {
bch_err(ca, "allocator thread unable to make forward progress!");
goto stop;
}
ret = discard_invalidated_buckets(c, ca);
if (ret)
goto stop;
if (ca->allocator_invalidating_data)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
else if (ca->allocator_journal_seq_flush)
ret = bch2_journal_flush_seq(&c->journal,
ca->allocator_journal_seq_flush);
ret = bch2_invalidate_buckets(c, ca);
if (ret)
goto stop;
/*
* journal error - buckets haven't actually been
* invalidated, can't discard them:
*/
if (ret) {
bch_err(ca, "journal error: %i", ret);
goto stop;
}
}
if (!fifo_empty(&ca->free_inc))
continue;
pr_debug("free_inc now empty");
/* Reset front/back so we can easily sort fifo entries later: */
ca->free_inc.front = ca->free_inc.back = 0;
ca->allocator_journal_seq_flush = 0;
ca->allocator_invalidating_data = false;
down_read(&c->gc_lock);
while (1) {
size_t prev = fifo_used(&ca->free_inc);
do {
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
bch_err(ca, "gc failure");
@@ -1007,56 +988,46 @@ static int bch2_allocator_thread(void *arg)
pr_debug("scanning for reclaimable buckets");
find_reclaimable_buckets(c, ca);
nr = find_reclaimable_buckets(c, ca);
pr_debug("found %zu buckets (free_inc %zu/%zu)",
fifo_used(&ca->free_inc) - prev,
fifo_used(&ca->free_inc), ca->free_inc.size);
pr_debug("found %zu buckets", nr);
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
trace_alloc_batch(ca, nr, ca->alloc_heap.size);
if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
(!fifo_full(&ca->free_inc) &&
ca->inc_gen_really_needs_gc >=
fifo_free(&ca->free_inc))) &&
if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
ca->inc_gen_really_needs_gc) &&
c->gc_thread) {
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
}
if (fifo_full(&ca->free_inc))
break;
if (!fifo_empty(&ca->free_inc) &&
!fifo_full(&ca->free[RESERVE_MOVINGGC]))
break;
/*
* copygc may be waiting until either its reserve fills
* up, or we can't make forward progress:
* If we found any buckets, we have to invalidate them
* before we scan for more - but if we didn't find very
* many we may want to wait on more buckets being
* available so we don't spin:
*/
ca->allocator_blocked = true;
closure_wake_up(&c->freelist_wait);
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
goto stop;
if (!nr ||
(nr < ALLOC_SCAN_BATCH(ca) &&
!fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
ca->allocator_blocked = true;
closure_wake_up(&c->freelist_wait);
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
goto stop;
}
}
}
} while (!nr);
ca->allocator_blocked = false;
up_read(&c->gc_lock);
pr_debug("free_inc now %zu/%zu",
fifo_used(&ca->free_inc),
ca->free_inc.size);
sort_free_inc(c, ca);
pr_debug("%zu buckets to invalidate", nr);
/*
* free_inc is now full of newly-invalidated buckets: next,
* alloc_heap is now full of newly-invalidated buckets: next,
* write out the new bucket gens:
*/
}
@@ -1946,39 +1917,83 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return 0;
}
static void flush_held_btree_writes(struct bch_fs *c)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
bool flush_updates;
size_t i, nr_pending_updates;
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
pr_debug("flushing dirty btree nodes");
cond_resched();
flush_updates = false;
nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_dirty(b) && (!b->written || b->level)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
} else {
flush_updates = true;
}
}
rcu_read_unlock();
if (c->btree_roots_dirty)
bch2_journal_meta(&c->journal);
/*
* This is ugly, but it's needed to flush btree node writes
* without spinning...
*/
if (flush_updates) {
closure_wait_event(&c->btree_interior_update_wait,
bch2_btree_interior_updates_nr_pending(c) <
nr_pending_updates);
goto again;
}
}
static void allocator_start_issue_discards(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned dev_iter;
size_t i, bu;
for_each_rw_member(ca, c, dev_iter) {
unsigned done = 0;
fifo_for_each_entry(bu, &ca->free_inc, i) {
if (done == ca->nr_invalidated)
break;
size_t bu;
for_each_rw_member(ca, c, dev_iter)
while (fifo_pop(&ca->free_inc, bu))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca, bu),
ca->mi.bucket_size, GFP_NOIO);
done++;
}
}
}
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
size_t bu, i;
unsigned dev_iter;
u64 journal_seq = 0;
long bu;
bool invalidating_data = false;
int ret = 0;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return -1;
if (test_alloc_startup(c)) {
invalidating_data = true;
goto not_enough;
}
/* Scan for buckets that are already invalidated: */
for_each_rw_member(ca, c, dev_iter) {
struct btree_iter iter;
@@ -2003,7 +2018,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
percpu_up_read(&c->usage_lock);
fifo_push(&ca->free_inc, bu);
ca->nr_invalidated++;
if (fifo_full(&ca->free_inc))
break;
@@ -2022,24 +2036,23 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
not_enough:
pr_debug("did not find enough empty buckets; issuing discards");
/* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
/* clear out free_inc, we'll be using it again below: */
for_each_rw_member(ca, c, dev_iter)
discard_invalidated_buckets(c, ca);
pr_debug("scanning for reclaimable buckets");
for_each_rw_member(ca, c, dev_iter) {
BUG_ON(!fifo_empty(&ca->free_inc));
ca->free_inc.front = ca->free_inc.back = 0;
find_reclaimable_buckets(c, ca);
sort_free_inc(c, ca);
invalidating_data |= ca->allocator_invalidating_data;
while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
(bu = next_alloc_bucket(ca)) >= 0) {
invalidating_data |=
bch2_invalidate_one_bucket(c, ca, bu, &journal_seq);
fifo_for_each_entry(bu, &ca->free_inc, i)
if (!fifo_push(&ca->free[RESERVE_BTREE], bu))
break;
fifo_push(&ca->free[RESERVE_BTREE], bu);
set_bit(bu, ca->buckets_dirty);
}
}
pr_debug("done scanning for reclaimable buckets");
@@ -2065,16 +2078,9 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
* XXX: it's possible for this to deadlock waiting on journal reclaim,
* since we're holding btree writes. What then?
*/
for_each_rw_member(ca, c, dev_iter) {
ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
ca->free[RESERVE_BTREE].size,
false);
if (ret) {
percpu_ref_put(&ca->io_ref);
return ret;
}
}
ret = bch2_alloc_write(c);
if (ret)
return ret;
if (invalidating_data) {
pr_debug("flushing journal");
@@ -2087,57 +2093,11 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
allocator_start_issue_discards(c);
}
for_each_rw_member(ca, c, dev_iter)
while (ca->nr_invalidated) {
BUG_ON(!fifo_pop(&ca->free_inc, bu));
ca->nr_invalidated--;
}
set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
/* now flush dirty btree nodes: */
if (invalidating_data) {
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
bool flush_updates;
size_t nr_pending_updates;
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
pr_debug("flushing dirty btree nodes");
cond_resched();
flush_updates = false;
nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_dirty(b) && (!b->written || b->level)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
} else {
flush_updates = true;
}
}
rcu_read_unlock();
/*
* This is ugly, but it's needed to flush btree node writes
* without spinning...
*/
if (flush_updates) {
closure_wait_event(&c->btree_interior_update_wait,
bch2_btree_interior_updates_nr_pending(c) <
nr_pending_updates);
goto again;
}
}
if (invalidating_data)
flush_held_btree_writes(c);
return 0;
}
@@ -9,6 +9,8 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_List;
#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
@@ -270,6 +270,10 @@ do { \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
"update ordering is preserved during recovery") \
BCH_DEBUG_PARAM(test_alloc_startup, \
"Force allocator startup to use the slowpath where it" \
"can't find enough free buckets without invalidating" \
"cached data")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@@ -403,7 +407,6 @@ struct bch_dev {
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
spinlock_t freelist_lock;
size_t nr_invalidated;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
@@ -415,8 +418,6 @@ struct bch_dev {
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
u64 allocator_journal_seq_flush;
bool allocator_invalidating_data;
bool allocator_blocked;
alloc_heap alloc_heap;
@@ -1145,7 +1145,8 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
struct btree *old;
trace_btree_set_root(c, b);
BUG_ON(!b->written);
BUG_ON(!b->written &&
!test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
old = btree_node_root(c, b);
@@ -405,7 +405,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
_old; \
})
bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
struct bucket *g;
@@ -416,8 +416,7 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
g = bucket(ca, b);
*old = bucket_data_cmpxchg(c, ca, g, new, ({
if (!is_available_bucket(new))
return false;
BUG_ON(!is_available_bucket(new));
new.owned_by_allocator = 1;
new.data_type = 0;
@@ -429,7 +428,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
return true;
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -822,7 +820,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
/* XXX: these should be tunable */
size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
size_t free_inc_reserve = copygc_reserve / 2;
size_t free_inc_nr = max(max_t(size_t, 16, ca->mi.nbuckets >> 12),
btree_reserve);
bool resize = ca->buckets != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;
@@ -845,8 +844,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
!init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) ||
!init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) ||
!init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
!init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
!init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
goto err;
@@ -205,7 +205,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
void bch2_bucket_seq_cleanup(struct bch_fs *);
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
size_t, struct bucket_mark *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
size_t, bool, struct gc_pos, unsigned);