Commit b29e197a authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Invalidate buckets when writing to alloc btree

Prep work for persistent alloc information. Refactoring also lets us
make free_inc much smaller, which means a lot fewer buckets stranded on
freelists.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent b2be7c8b
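
The diff below replaces the old flow (fill free_inc directly, sort it, then invalidate it in bulk) with a staging heap: find_reclaimable_buckets() collects candidate ranges in ca->alloc_heap, and bch2_invalidate_buckets() drains that heap one bucket at a time into free_inc, rewriting each bucket's alloc key as it goes. For orientation, here is a minimal, self-contained userspace sketch of that drain loop; the array-backed heap, the fixed-size free_inc, and all sizes below are simplified stand-ins for illustration only, not the kernel data structures.

/*
 * Simplified sketch (not the kernel code) of the new allocator flow:
 * reclaimable buckets are staged in alloc_heap as {bucket, nr} ranges,
 * then drained one bucket at a time into free_inc as each is invalidated.
 */
#include <stdio.h>
#include <stddef.h>

struct alloc_heap_entry { size_t bucket; size_t nr; };

#define HEAP_MAX	8
#define FREE_INC_MAX	4

static struct alloc_heap_entry heap[HEAP_MAX];
static size_t heap_used;

static size_t free_inc[FREE_INC_MAX];
static size_t free_inc_used;

/* Analogous to next_alloc_bucket(): consume the front heap entry. */
static long next_alloc_bucket(void)
{
	while (heap_used) {
		struct alloc_heap_entry *top = &heap[0];

		if (top->nr) {
			size_t b = top->bucket++;
			top->nr--;
			return (long) b;
		}

		/* entry exhausted: drop it (simple shift stands in for heap_pop) */
		for (size_t i = 1; i < heap_used; i++)
			heap[i - 1] = heap[i];
		heap_used--;
	}
	return -1;
}

/* Analogous to bch2_invalidate_buckets(): fill free_inc from the heap. */
static void invalidate_buckets(void)
{
	long b;

	while (free_inc_used < FREE_INC_MAX &&
	       (b = next_alloc_bucket()) >= 0) {
		/* the real code invalidates the bucket here and rewrites its alloc key */
		free_inc[free_inc_used++] = (size_t) b;
		printf("invalidated bucket %ld\n", b);
	}
}

int main(void)
{
	heap[heap_used++] = (struct alloc_heap_entry) { .bucket = 10, .nr = 3 };
	heap[heap_used++] = (struct alloc_heap_entry) { .bucket = 42, .nr = 5 };

	invalidate_buckets();	/* stops once free_inc is full */
	printf("free_inc has %zu buckets\n", free_inc_used);
	return 0;
}

Because buckets only reach free_inc as they are actually invalidated and written out, free_inc no longer needs to hold a whole scan's worth of candidates, which is why the commit can shrink it and strand far fewer buckets on freelists.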
@@ -288,53 +288,41 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct btree_iter *iter,
u64 *journal_seq, bool nowait)
u64 *journal_seq, unsigned flags)
{
struct bucket_mark m;
__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
struct bucket *g;
struct bkey_i_alloc *a;
u8 *d;
int ret;
unsigned flags = BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE;
if (nowait)
flags |= BTREE_INSERT_NOWAIT;
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
percpu_down_read(&c->usage_lock);
g = bucket(ca, b);
m = READ_ONCE(g->mark);
a = bkey_alloc_init(&alloc_key.k);
a->k.p = POS(ca->dev_idx, b);
a->v.fields = 0;
a->v.gen = m.gen;
set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
d = a->v.data;
if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
put_alloc_field(&d, 2, g->io_time[WRITE]);
percpu_up_read(&c->usage_lock);
do {
ret = btree_iter_err(bch2_btree_iter_peek_slot(iter));
if (ret)
break;
bch2_btree_iter_cond_resched(iter);
percpu_down_read(&c->usage_lock);
g = bucket(ca, b);
/* read mark under btree node lock: */
m = READ_ONCE(g->mark);
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
a->v.fields = 0;
a->v.gen = m.gen;
set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
d = a->v.data;
if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
put_alloc_field(&d, 2, g->io_time[WRITE]);
percpu_up_read(&c->usage_lock);
ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
bch2_btree_iter_cond_resched(iter);
} while (ret == -EINTR);
bch2_btree_iter_set_pos(iter, a->k.p);
return ret;
return bch2_btree_insert_at(c, NULL, NULL, journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
}
int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
@@ -354,8 +342,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter,
NULL, false);
ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0);
bch2_btree_iter_unlock(&iter);
return ret;
}
@@ -375,8 +362,8 @@ int bch2_alloc_write(struct bch_fs *c)
down_read(&ca->bucket_lock);
for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
ret = __bch2_alloc_write_key(c, ca, bucket, &iter,
NULL, false);
ret = __bch2_alloc_write_key(c, ca, bucket,
&iter, NULL, 0);
if (ret)
break;
@@ -582,47 +569,6 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
return gc_gen < BUCKET_GC_GEN_MAX;
}
static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t bucket)
{
struct bucket_mark m;
percpu_down_read(&c->usage_lock);
spin_lock(&c->freelist_lock);
if (!bch2_invalidate_bucket(c, ca, bucket, &m)) {
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->usage_lock);
return;
}
verify_not_on_freelist(c, ca, bucket);
BUG_ON(!fifo_push(&ca->free_inc, bucket));
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->usage_lock);
/* gc lock held: */
bucket_io_clock_reset(c, ca, bucket, READ);
bucket_io_clock_reset(c, ca, bucket, WRITE);
if (m.cached_sectors) {
ca->allocator_invalidating_data = true;
} else if (m.journal_seq_valid) {
u64 journal_seq = atomic64_read(&c->journal.seq);
u64 bucket_seq = journal_seq;
bucket_seq &= ~((u64) U16_MAX);
bucket_seq |= m.journal_seq;
if (bucket_seq > journal_seq)
bucket_seq -= 1 << 16;
ca->allocator_journal_seq_flush =
max(ca->allocator_journal_seq_flush, bucket_seq);
}
}
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
@@ -674,11 +620,18 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
(l.bucket > r.bucket) - (l.bucket < r.bucket);
}
static inline int bucket_idx_cmp(const void *_l, const void *_r)
{
const struct alloc_heap_entry *l = _l, *r = _r;
return (l->bucket > r->bucket) - (l->bucket < r->bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
size_t b;
size_t b, i, nr = 0;
ca->alloc_heap.used = 0;
@@ -720,55 +673,58 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
if (e.nr)
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
up_read(&ca->bucket_lock);
mutex_unlock(&c->bucket_clock[READ].lock);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
for (b = e.bucket;
b < e.bucket + e.nr;
b++) {
if (fifo_full(&ca->free_inc))
return;
for (i = 0; i < ca->alloc_heap.used; i++)
nr += ca->alloc_heap.data[i].nr;
bch2_invalidate_one_bucket(c, ca, b);
}
while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
nr -= ca->alloc_heap.data[0].nr;
heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp);
}
up_read(&ca->bucket_lock);
mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
struct bucket_mark m;
size_t b, checked;
size_t b, start;
for (checked = 0;
checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc);
checked++) {
if (ca->fifo_last_bucket < ca->mi.first_bucket ||
ca->fifo_last_bucket >= ca->mi.nbuckets)
ca->fifo_last_bucket = ca->mi.first_bucket;
if (ca->fifo_last_bucket < ca->mi.first_bucket ||
ca->fifo_last_bucket >= ca->mi.nbuckets)
ca->fifo_last_bucket = ca->mi.first_bucket;
start = ca->fifo_last_bucket;
b = ca->fifo_last_bucket++;
do {
ca->fifo_last_bucket++;
if (ca->fifo_last_bucket == ca->mi.nbuckets)
ca->fifo_last_bucket = ca->mi.first_bucket;
b = ca->fifo_last_bucket;
m = READ_ONCE(buckets->b[b].mark);
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
if (bch2_can_invalidate_bucket(ca, b, m)) {
struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
heap_add(&ca->alloc_heap, e, bucket_alloc_cmp);
if (heap_full(&ca->alloc_heap))
break;
}
cond_resched();
}
} while (ca->fifo_last_bucket != start);
}
static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
struct bucket_mark m;
size_t checked;
size_t checked, i;
for (checked = 0;
checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc);
checked < ca->mi.nbuckets / 2;
checked++) {
size_t b = bch2_rand_range(ca->mi.nbuckets -
ca->mi.first_bucket) +
@@ -776,17 +732,34 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca
m = READ_ONCE(buckets->b[b].mark);
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
if (bch2_can_invalidate_bucket(ca, b, m)) {
struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
heap_add(&ca->alloc_heap, e, bucket_alloc_cmp);
if (heap_full(&ca->alloc_heap))
break;
}
cond_resched();
}
sort(ca->alloc_heap.data,
ca->alloc_heap.used,
sizeof(ca->alloc_heap.data[0]),
bucket_idx_cmp, NULL);
/* remove duplicates: */
for (i = 0; i + 1 < ca->alloc_heap.used; i++)
if (ca->alloc_heap.data[i].bucket ==
ca->alloc_heap.data[i + 1].bucket)
ca->alloc_heap.data[i].nr = 0;
}
static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
size_t i, nr = 0;
ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
case CACHE_REPLACEMENT_LRU:
@@ -799,86 +772,132 @@ static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
find_reclaimable_buckets_random(c, ca);
break;
}
heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
for (i = 0; i < ca->alloc_heap.used; i++)
nr += ca->alloc_heap.data[i].nr;
return nr;
}
static int size_t_cmp(const void *_l, const void *_r)
static inline long next_alloc_bucket(struct bch_dev *ca)
{
const size_t *l = _l, *r = _r;
struct alloc_heap_entry e, *top = ca->alloc_heap.data;
while (ca->alloc_heap.used) {
if (top->nr) {
size_t b = top->bucket;
top->bucket++;
top->nr--;
return b;
}
return (*l > *r) - (*l < *r);
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp);
}
return -1;
}
static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca)
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t bucket, u64 *flush_seq)
{
BUG_ON(ca->free_inc.front);
struct bucket_mark m;
percpu_down_read(&c->usage_lock);
spin_lock(&c->freelist_lock);
sort(ca->free_inc.data,
ca->free_inc.back,
sizeof(ca->free_inc.data[0]),
size_t_cmp, NULL);
bch2_invalidate_bucket(c, ca, bucket, &m);
verify_not_on_freelist(c, ca, bucket);
BUG_ON(!fifo_push(&ca->free_inc, bucket));
spin_unlock(&c->freelist_lock);
bucket_io_clock_reset(c, ca, bucket, READ);
bucket_io_clock_reset(c, ca, bucket, WRITE);
percpu_up_read(&c->usage_lock);
if (m.journal_seq_valid) {
u64 journal_seq = atomic64_read(&c->journal.seq);
u64 bucket_seq = journal_seq;
bucket_seq &= ~((u64) U16_MAX);
bucket_seq |= m.journal_seq;
if (bucket_seq > journal_seq)
bucket_seq -= 1 << 16;
*flush_seq = max(*flush_seq, bucket_seq);
}
return m.cached_sectors != 0;
}
static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
u64 *journal_seq, size_t nr,
bool nowait)
/*
* Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
*/
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
struct btree_iter iter;
u64 journal_seq = 0;
int ret = 0;
long b;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
/* Only use nowait if we've already invalidated at least one bucket: */
while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) {
size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated);
ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq,
nowait && ca->nr_invalidated);
if (ret)
break;
ca->nr_invalidated++;
while (!ret &&
!fifo_full(&ca->free_inc) &&
(b = next_alloc_bucket(ca)) >= 0) {
bool must_flush =
bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
ret = __bch2_alloc_write_key(c, ca, b, &iter,
must_flush ? &journal_seq : NULL,
!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
}
bch2_btree_iter_unlock(&iter);
/* If we used NOWAIT, don't return the error: */
return ca->nr_invalidated ? 0 : ret;
}
static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
unsigned i;
if (!fifo_empty(&ca->free_inc))
ret = 0;
if (ret) {
bch_err(ca, "error invalidating buckets: %i", ret);
return ret;
}
/*
* Don't remove from free_inc until after it's added to
* freelist, so gc can find it:
*/
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
--ca->nr_invalidated;
closure_wake_up(&c->freelist_wait);
spin_unlock(&c->freelist_lock);
return true;
}
spin_unlock(&c->freelist_lock);
if (journal_seq)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
if (ret) {
bch_err(ca, "journal error: %i", ret);
return ret;
}
return false;
return 0;
}
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
unsigned i;
int ret = 0;
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (__push_invalidated_bucket(c, ca, bucket))
break;
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
closure_wake_up(&c->freelist_wait);
spin_unlock(&c->freelist_lock);
goto out;
}
spin_unlock(&c->freelist_lock);
if ((current->flags & PF_KTHREAD) &&
kthread_should_stop()) {
@@ -889,22 +908,20 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
schedule();
try_to_freeze();
}
out:
__set_current_state(TASK_RUNNING);
return ret;
}
/*
* Given an invalidated, ready to use bucket: issue a discard to it if enabled,
* then add it to the freelist, waiting until there's room if necessary:
* Pulls buckets off free_inc, discards them (if enabled), then adds them to
* freelists, waiting until there's room if necessary:
*/
static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
{
while (ca->nr_invalidated) {
while (!fifo_empty(&ca->free_inc)) {
size_t bucket = fifo_peek(&ca->free_inc);
BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated);
if (ca->mi.discard &&
bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev,
@@ -930,68 +947,32 @@ static int bch2_allocator_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
u64 journal_seq;
size_t nr;
int ret;
set_freezable();
while (1) {
while (1) {
cond_resched();
pr_debug("discarding %zu invalidated buckets",
ca->nr_invalidated);
ret = discard_invalidated_buckets(c, ca);
if (ret)
goto stop;
if (fifo_empty(&ca->free_inc))
break;
cond_resched();
pr_debug("invalidating %zu buckets",
fifo_used(&ca->free_inc));
pr_debug("discarding %zu invalidated buckets",
fifo_used(&ca->free_inc));
journal_seq = 0;
ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
SIZE_MAX, true);
if (ret) {
bch_err(ca, "error invalidating buckets: %i", ret);
goto stop;
}
if (!ca->nr_invalidated) {
bch_err(ca, "allocator thread unable to make forward progress!");
goto stop;
}
ret = discard_invalidated_buckets(c, ca);
if (ret)
goto stop;
if (ca->allocator_invalidating_data)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
else if (ca->allocator_journal_seq_flush)
ret = bch2_journal_flush_seq(&c->journal,
ca->allocator_journal_seq_flush);
ret = bch2_invalidate_buckets(c, ca);
if (ret)
goto stop;
/*
* journal error - buckets haven't actually been
* invalidated, can't discard them:
*/
if (ret) {
bch_err(ca, "journal error: %i", ret);
goto stop;
}
}
if (!fifo_empty(&ca->free_inc))
continue;
pr_debug("free_inc now empty");
/* Reset front/back so we can easily sort fifo entries later: */
ca->free_inc.front = ca->free_inc.back = 0;
ca->allocator_journal_seq_flush = 0;
ca->allocator_invalidating_data = false;
down_read(&c->gc_lock);
while (1) {
size_t prev = fifo_used(&ca->free_inc);
do {
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
bch_err(ca, "gc failure");
@@ -1007,56 +988,46 @@ static int bch2_allocator_thread(void *arg)
pr_debug("scanning for reclaimable buckets");
find_reclaimable_buckets(c, ca);
nr = find_reclaimable_buckets(c, ca);
pr_debug("found %zu buckets (free_inc %zu/%zu)",
fifo_used(&ca->free_inc) - prev,
fifo_used(&ca->free_inc), ca->free_inc.size);
pr_debug("found %zu buckets", nr);
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
trace_alloc_batch(ca, nr, ca->alloc_heap.size);
if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
(!fifo_full(&ca->free_inc) &&
ca->inc_gen_really_needs_gc >=
fifo_free(&ca->free_inc))) &&
if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
ca->inc_gen_really_needs_gc) &&
c->gc_thread) {
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
}
if (fifo_full(&ca->free_inc))
break;
if (!fifo_empty(&ca->free_inc) &&
!fifo_full(&ca->free[RESERVE_MOVINGGC]))
break;
/*
* copygc may be waiting until either its reserve fills
* up, or we can't make forward progress:
* If we found any buckets, we have to invalidate them
* before we scan for more - but if we didn't find very
* many we may want to wait on more buckets being
* available so we don't spin:
*/
ca->allocator_blocked = true;
closure_wake_up(&c->freelist_wait);
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
goto stop;
if (!nr ||
(nr < ALLOC_SCAN_BATCH(ca) &&
!fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
ca->allocator_blocked = true;
closure_wake_up(&c->freelist_wait);
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
goto stop;
}
}
}
} while (!nr);
ca->allocator_blocked = false;
up_read(&c->gc_lock);
pr_debug("free_inc now %zu/%zu",
fifo_used(&ca->free_inc),
ca->free_inc.size);
sort_free_inc(c, ca);
pr_debug("%zu buckets to invalidate", nr);
/*
* free_inc is now full of newly-invalidated buckets: next,
* alloc_heap is now full of newly-invalidated buckets: next,
* write out the new bucket gens:
*/
}
@@ -1946,39 +1917,83 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return 0;
}
static void flush_held_btree_writes(struct bch_fs *c)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
bool flush_updates;
size_t i, nr_pending_updates;
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
pr_debug("flushing dirty btree nodes");
cond_resched();
flush_updates = false;
nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_dirty(b) && (!b->written || b->level)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
} else {
flush_updates = true;
}
}
rcu_read_unlock();
if (c->btree_roots_dirty)
bch2_journal_meta(&c->journal);
/*
* This is ugly, but it's needed to flush btree node writes
* without spinning...
*/
if (flush_updates) {
closure_wait_event(&c->btree_interior_update_wait,
bch2_btree_interior_updates_nr_pending(c) <
nr_pending_updates);
goto again;
}
}
static void allocator_start_issue_discards(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned dev_iter;
size_t i, bu;
for_each_rw_member(ca, c, dev_iter) {
unsigned done = 0;
fifo_for_each_entry(bu, &ca->free_inc, i) {
if (done == ca->nr_invalidated)
break;
size_t bu;
for_each_rw_member(ca, c, dev_iter)
while (fifo_pop(&ca->free_inc, bu))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca, bu),
ca->mi.bucket_size, GFP_NOIO);
done++;
}
}
}
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
size_t bu, i;
unsigned dev_iter;
u64 journal_seq = 0;
long bu;
bool invalidating_data = false;
int ret = 0;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return -1;
if (test_alloc_startup(c)) {
invalidating_data = true;
goto not_enough;
}
/* Scan for buckets that are already invalidated: */
for_each_rw_member(ca, c, dev_iter) {
struct btree_iter iter;
@@ -2003,7 +2018,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
percpu_up_read(&c->usage_lock);
fifo_push(&ca->free_inc, bu);
ca->nr_invalidated++;
if (fifo_full(&ca->free_inc))
break;
@@ -2022,24 +2036,23 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
not_enough:
pr_debug("did not find enough empty buckets; issuing discards");
/* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
/* clear out free_inc, we'll be using it again below: */
for_each_rw_member(ca, c, dev_iter)
discard_invalidated_buckets(c, ca);
pr_debug("scanning for reclaimable buckets");
for_each_rw_member(ca, c, dev_iter) {
BUG_ON(!fifo_empty(&ca->free_inc));
ca->free_inc.front = ca->free_inc.back = 0;
find_reclaimable_buckets(c, ca);
sort_free_inc(c, ca);
invalidating_data |= ca->allocator_invalidating_data;
while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
(bu = next_alloc_bucket(ca)) >= 0) {
invalidating_data |=
bch2_invalidate_one_bucket(c, ca, bu, &journal_seq);
fifo_for_each_entry(bu, &ca->free_inc, i)
if (!fifo_push(&ca->free[RESERVE_BTREE], bu))
break;
fifo_push(&ca->free[RESERVE_BTREE], bu);
set_bit(bu, ca->buckets_dirty);
}
}
pr_debug("done scanning for reclaimable buckets");
@@ -2065,16 +2078,9 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
* XXX: it's possible for this to deadlock waiting on journal reclaim,
* since we're holding btree writes. What then?
*/
for_each_rw_member(ca, c, dev_iter) {
ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
ca->free[RESERVE_BTREE].size,
false);
if (ret) {
percpu_ref_put(&ca->io_ref);
return ret;
}
}
ret = bch2_alloc_write(c);
if (ret)
return ret;
if (invalidating_data) {
pr_debug("flushing journal");
@@ -2087,57 +2093,11 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
allocator_start_issue_discards(c);
}
for_each_rw_member(ca, c, dev_iter)
while (ca->nr_invalidated) {
BUG_ON(!fifo_pop(&ca->free_inc, bu));
ca->nr_invalidated--;
}
set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
/* now flush dirty btree nodes: */
if (invalidating_data) {
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
bool flush_updates;
size_t nr_pending_updates;
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
pr_debug("flushing dirty btree nodes");
cond_resched();
flush_updates = false;
nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_dirty(b) && (!b->written || b->level)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
} else {
flush_updates = true;
}
}
rcu_read_unlock();
/*
* This is ugly, but it's needed to flush btree node writes
* without spinning...
*/
if (flush_updates) {
closure_wait_event(&c->btree_interior_update_wait,
bch2_btree_interior_updates_nr_pending(c) <
nr_pending_updates);
goto again;
}
}
if (invalidating_data)
flush_held_btree_writes(c);
return 0;
}
@@ -9,6 +9,8 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_List;
#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
@@ -270,6 +270,10 @@ do { \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
"update ordering is preserved during recovery") \
BCH_DEBUG_PARAM(test_alloc_startup, \
"Force allocator startup to use the slowpath where it" \
"can't find enough free buckets without invalidating" \
"cached data")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@@ -403,7 +407,6 @@ struct bch_dev {
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
spinlock_t freelist_lock;
size_t nr_invalidated;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
@@ -415,8 +418,6 @@ struct bch_dev {
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
u64 allocator_journal_seq_flush;
bool allocator_invalidating_data;
bool allocator_blocked;
alloc_heap alloc_heap;
@@ -1145,7 +1145,8 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
struct btree *old;
trace_btree_set_root(c, b);
BUG_ON(!b->written);
BUG_ON(!b->written &&
!test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
old = btree_node_root(c, b);
@@ -405,7 +405,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
_old; \
})
bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
struct bucket *g;
@@ -416,8 +416,7 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
g = bucket(ca, b);
*old = bucket_data_cmpxchg(c, ca, g, new, ({
if (!is_available_bucket(new))
return false;
BUG_ON(!is_available_bucket(new));
new.owned_by_allocator = 1;
new.data_type = 0;
@@ -429,7 +428,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
return true;
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -822,7 +820,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
/* XXX: these should be tunable */
size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
size_t free_inc_reserve = copygc_reserve / 2;
size_t free_inc_nr = max(max_t(size_t, 16, ca->mi.nbuckets >> 12),
btree_reserve);
bool resize = ca->buckets != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;
@@ -845,8 +844,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
!init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) ||
!init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) ||
!init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
!init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
!init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
goto err;
@@ -205,7 +205,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
void bch2_bucket_seq_cleanup(struct bch_fs *);
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
size_t, struct bucket_mark *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
size_t, bool, struct gc_pos, unsigned);