Commit 68ef94a6, authored and committed by Kent Overstreet

bcachefs: Add a pre-reserve mechanism for the journal

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 9ace606e
...@@ -343,6 +343,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, ...@@ -343,6 +343,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
return 0; return 0;
} }
if (!(flags & JOURNAL_RES_GET_RESERVED) &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
/*
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
ret = -ENOSPC;
goto unlock;
}
/* /*
* If we couldn't get a reservation because the current buf filled up, * If we couldn't get a reservation because the current buf filled up,
* and we had room for a bigger entry on disk, signal that we want to * and we had room for a bigger entry on disk, signal that we want to
...@@ -366,7 +376,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, ...@@ -366,7 +376,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
} else { } else {
ret = journal_entry_open(j); ret = journal_entry_open(j);
} }
unlock:
if ((ret == -EAGAIN || ret == -ENOSPC) && if ((ret == -EAGAIN || ret == -ENOSPC) &&
!j->res_get_blocked_start) !j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1; j->res_get_blocked_start = local_clock() ?: 1;
...@@ -378,6 +388,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, ...@@ -378,6 +388,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
goto retry; goto retry;
if (ret == -ENOSPC) { if (ret == -ENOSPC) {
BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED));
/* /*
* Journal is full - can't rely on reclaim from work item due to * Journal is full - can't rely on reclaim from work item due to
* freezing: * freezing:
...@@ -423,6 +435,32 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, ...@@ -423,6 +435,32 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret; return ret;
} }
/* journal_preres: */
/*
 * Wait-condition helper for __bch2_journal_preres_get(): try the lock-free
 * fast path once; if it cannot satisfy the request, call the journal reclaim
 * work function directly before reporting failure.
 *
 * Returns true if the pre-reservation was obtained, false otherwise.
 */
static bool journal_preres_available(struct journal *j,
				     struct journal_preres *res,
				     unsigned new_u64s)
{
	if (bch2_journal_preres_get_fast(j, res, new_u64s))
		return true;

	/* Not enough room for the pre-reservation: run reclaim ourselves */
	bch2_journal_reclaim_work(&j->reclaim_work.work);
	return false;
}
/*
 * Slow path for taking a journal pre-reservation of @new_u64s for @res:
 * waits on j->preres_wait until either bch2_journal_error() reports an error
 * or journal_preres_available() succeeds.
 *
 * Returns the bch2_journal_error() value from the evaluation that ended the
 * wait: nonzero if the journal is in an error state, 0 if the pre-reservation
 * was obtained (the error check is the left operand of ||, so success is only
 * reached when it evaluated to 0).
 *
 * NOTE(review): assumes closure_wait_event() re-evaluates its condition after
 * each wakeup of the waitlist - confirm against the closure implementation.
 */
int __bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
{
int ret;
/* ret is assigned as a side effect of evaluating the wait condition */
closure_wait_event(&j->preres_wait,
(ret = bch2_journal_error(j)) ||
journal_preres_available(j, res, new_u64s));
return ret;
}
/* journal_entry_res: */ /* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j, void bch2_journal_entry_res_resize(struct journal *j,
...@@ -1110,11 +1148,16 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) ...@@ -1110,11 +1148,16 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
"seq:\t\t\t%llu\n" "seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n" "last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n" "last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
"current entry sectors:\t%u\n"
"current entry:\t\t", "current entry:\t\t",
fifo_used(&j->pin), fifo_used(&j->pin),
journal_cur_seq(j), journal_cur_seq(j),
journal_last_seq(j), journal_last_seq(j),
j->last_seq_ondisk); j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
j->cur_entry_sectors);
switch (s.cur_entry_offset) { switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL: case JOURNAL_ENTRY_ERROR_VAL:
...@@ -1136,8 +1179,9 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) ...@@ -1136,8 +1179,9 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
journal_state_count(s, s.idx)); journal_state_count(s, s.idx));
if (s.prev_buf_unwritten) if (s.prev_buf_unwritten)
pr_buf(&out, "yes, ref %u\n", pr_buf(&out, "yes, ref %u sectors %u\n",
journal_state_count(s, !s.idx)); journal_state_count(s, !s.idx),
journal_prev_buf(j)->sectors);
else else
pr_buf(&out, "no\n"); pr_buf(&out, "no\n");
......
...@@ -119,6 +119,7 @@ static inline void journal_wake(struct journal *j) ...@@ -119,6 +119,7 @@ static inline void journal_wake(struct journal *j)
{ {
wake_up(&j->wait); wake_up(&j->wait);
closure_wake_up(&j->async_wait); closure_wake_up(&j->async_wait);
closure_wake_up(&j->preres_wait);
} }
static inline struct journal_buf *journal_cur_buf(struct journal *j) static inline struct journal_buf *journal_cur_buf(struct journal *j)
...@@ -274,6 +275,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, ...@@ -274,6 +275,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
#define JOURNAL_RES_GET_NONBLOCK (1 << 0) #define JOURNAL_RES_GET_NONBLOCK (1 << 0)
#define JOURNAL_RES_GET_CHECK (1 << 1) #define JOURNAL_RES_GET_CHECK (1 << 1)
#define JOURNAL_RES_GET_RESERVED (1 << 2)
static inline int journal_res_get_fast(struct journal *j, static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res, struct journal_res *res,
...@@ -294,6 +296,10 @@ static inline int journal_res_get_fast(struct journal *j, ...@@ -294,6 +296,10 @@ static inline int journal_res_get_fast(struct journal *j,
EBUG_ON(!journal_state_count(new, new.idx)); EBUG_ON(!journal_state_count(new, new.idx));
if (!(flags & JOURNAL_RES_GET_RESERVED) &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
return 0;
if (flags & JOURNAL_RES_GET_CHECK) if (flags & JOURNAL_RES_GET_CHECK)
return 1; return 1;
...@@ -333,6 +339,89 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re ...@@ -333,6 +339,89 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
return 0; return 0;
} }
/* journal_preres: */
static inline bool journal_check_may_get_unreserved(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
bool ret = s.reserved <= s.remaining &&
fifo_free(&j->pin) > 8;
lockdep_assert_held(&j->lock);
if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
if (ret) {
set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
journal_wake(j);
} else {
clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
}
}
return ret;
}
static inline void bch2_journal_preres_put(struct journal *j,
struct journal_preres *res)
{
union journal_preres_state s = { .reserved = res->u64s };
if (!res->u64s)
return;
s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
res->u64s = 0;
closure_wake_up(&j->preres_wait);
if (s.reserved <= s.remaining &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
spin_lock(&j->lock);
journal_check_may_get_unreserved(j);
spin_unlock(&j->lock);
}
}
/*
 * Out-of-line slow path; bch2_journal_preres_get() calls this when the fast
 * path fails and JOURNAL_RES_GET_NONBLOCK was not passed.
 */
int __bch2_journal_preres_get(struct journal *,
struct journal_preres *, unsigned);
/*
 * Lock-free attempt to resize the pre-reservation in @res to @new_u64s.
 *
 * The difference between @new_u64s and what @res already holds is added to
 * the journal-wide .reserved count with a cmpxchg retry loop. The attempt is
 * abandoned, leaving both @res and the journal state untouched, if the new
 * total would exceed .remaining.
 *
 * Returns 1 on success, 0 if there was not enough room.
 */
static inline int bch2_journal_preres_get_fast(struct journal *j,
					       struct journal_preres *res,
					       unsigned new_u64s)
{
	int delta = new_u64s - res->u64s;
	u64 seen = atomic64_read(&j->prereserved.counter);

	for (;;) {
		union journal_preres_state old, new;
		u64 prev;

		old.v = new.v = seen;
		new.reserved += delta;

		if (new.reserved > new.remaining)
			return 0;

		prev = atomic64_cmpxchg(&j->prereserved.counter,
					old.v, new.v);
		if (prev == old.v)
			break;

		/* Lost a race with another updater: retry from its value */
		seen = prev;
	}

	res->u64s += delta;
	return 1;
}
/*
 * Ensure @res holds a pre-reservation of at least @new_u64s.
 *
 * Returns 0 immediately if @res is already large enough or the lock-free
 * fast path succeeds. Otherwise returns -EAGAIN when @flags contains
 * JOURNAL_RES_GET_NONBLOCK, or else the result of the blocking slow path
 * __bch2_journal_preres_get().
 */
static inline int bch2_journal_preres_get(struct journal *j,
					  struct journal_preres *res,
					  unsigned new_u64s,
					  unsigned flags)
{
	if (new_u64s <= res->u64s ||
	    bch2_journal_preres_get_fast(j, res, new_u64s))
		return 0;

	return (flags & JOURNAL_RES_GET_NONBLOCK)
		? -EAGAIN
		: __bch2_journal_preres_get(j, res, new_u64s);
}
/* journal_entry_res: */ /* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *, void bch2_journal_entry_res_resize(struct journal *,
......
...@@ -974,6 +974,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, ...@@ -974,6 +974,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
journal_space_discarded)) { journal_space_discarded)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size; ja->sectors_free = ca->mi.bucket_size;
/*
* ja->bucket_seq[ja->cur_idx] must always have
* something sensible:
*/
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
} }
} }
......
...@@ -49,6 +49,18 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, ...@@ -49,6 +49,18 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available; return available;
} }
/*
 * Atomically store @u64s_remaining into the .remaining field of
 * j->prereserved, carrying over whatever .reserved value is current at the
 * moment the cmpxchg succeeds.
 */
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
	u64 seen = atomic64_read(&j->prereserved.counter);

	for (;;) {
		union journal_preres_state old, new;
		u64 prev;

		old.v = new.v = seen;
		new.remaining = u64s_remaining;

		prev = atomic64_cmpxchg(&j->prereserved.counter,
					old.v, new.v);
		if (prev == old.v)
			return;

		/* Counter changed under us: rebuild from the fresh value */
		seen = prev;
	}
}
static struct journal_space { static struct journal_space {
unsigned next_entry; unsigned next_entry;
unsigned remaining; unsigned remaining;
...@@ -124,8 +136,9 @@ void bch2_journal_space_available(struct journal *j) ...@@ -124,8 +136,9 @@ void bch2_journal_space_available(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca; struct bch_dev *ca;
struct journal_space discarded, clean_ondisk, clean; struct journal_space discarded, clean_ondisk, clean;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9, unsigned overhead, u64s_remaining = 0;
j->buf[1].buf_size >> 9); unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want; unsigned i, nr_online = 0, nr_devs_want;
bool can_discard = false; bool can_discard = false;
int ret = 0; int ret = 0;
...@@ -176,9 +189,17 @@ void bch2_journal_space_available(struct journal *j) ...@@ -176,9 +189,17 @@ void bch2_journal_space_available(struct journal *j)
if (!discarded.next_entry) if (!discarded.next_entry)
ret = -ENOSPC; ret = -ENOSPC;
overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
journal_entry_overhead(j);
u64s_remaining = clean.remaining << 6;
u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
u64s_remaining /= 4;
out: out:
j->cur_entry_sectors = !ret ? discarded.next_entry : 0; j->cur_entry_sectors = !ret ? discarded.next_entry : 0;
j->cur_entry_error = ret; j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_check_may_get_unreserved(j);
if (!ret) if (!ret)
journal_wake(j); journal_wake(j);
...@@ -454,7 +475,7 @@ void bch2_journal_reclaim(struct journal *j) ...@@ -454,7 +475,7 @@ void bch2_journal_reclaim(struct journal *j)
{ {
struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca; struct bch_dev *ca;
unsigned iter, bucket_to_flush, min_nr = 0; unsigned iter, min_nr = 0;
u64 seq_to_flush = 0; u64 seq_to_flush = 0;
lockdep_assert_held(&j->reclaim_lock); lockdep_assert_held(&j->reclaim_lock);
...@@ -465,13 +486,22 @@ void bch2_journal_reclaim(struct journal *j) ...@@ -465,13 +486,22 @@ void bch2_journal_reclaim(struct journal *j)
for_each_rw_member(ca, c, iter) { for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal; struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
if (!ja->nr) if (!ja->nr)
continue; continue;
/* Try to keep the journal at most half full: */ /* Try to keep the journal at most half full: */
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; nr_buckets = ja->nr / 2;
/* And include pre-reservations: */
nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
(ca->mi.bucket_size << 6) -
journal_entry_overhead(j));
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush, seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]); ja->bucket_seq[bucket_to_flush]);
} }
...@@ -490,6 +520,9 @@ void bch2_journal_reclaim(struct journal *j) ...@@ -490,6 +520,9 @@ void bch2_journal_reclaim(struct journal *j)
msecs_to_jiffies(j->reclaim_delay_ms))) msecs_to_jiffies(j->reclaim_delay_ms)))
min_nr = 1; min_nr = 1;
if (j->prereserved.reserved * 2 > j->prereserved.remaining)
min_nr = 1;
journal_flush_pins(j, seq_to_flush, min_nr); journal_flush_pins(j, seq_to_flush, min_nr);
if (!test_bit(BCH_FS_RO, &c->flags)) if (!test_bit(BCH_FS_RO, &c->flags))
......
...@@ -80,6 +80,14 @@ struct journal_res { ...@@ -80,6 +80,14 @@ struct journal_res {
u64 seq; u64 seq;
}; };
/*
 * For reserving space in the journal prior to getting a reservation on a
 * particular journal entry:
 */
struct journal_preres {
/* Size of the pre-reservation currently held, in u64s; 0 when none is held */
unsigned u64s;
};
union journal_res_state { union journal_res_state {
struct { struct {
atomic64_t counter; atomic64_t counter;
...@@ -98,6 +106,21 @@ union journal_res_state { ...@@ -98,6 +106,21 @@ union journal_res_state {
}; };
}; };
/*
 * Journal-wide pre-reservation accounting. The three views below overlay the
 * same 64 bits, so both fields can be read and updated together with atomic64
 * operations on @counter.
 */
union journal_preres_state {
/* Atomic view: target of atomic64_read/cmpxchg/sub_return */
struct {
atomic64_t counter;
};

/* Raw 64-bit view: used to build the old/new values for cmpxchg */
struct {
u64 v;
};

/* Field view */
struct {
/* u64s currently pre-reserved; raised by preres get, lowered by preres put */
u32 reserved;
/* Limit that @reserved may not exceed in bch2_journal_preres_get_fast(); set by journal_set_remaining() */
u32 remaining;
};
};
/* bytes: */ /* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
...@@ -122,6 +145,7 @@ enum { ...@@ -122,6 +145,7 @@ enum {
JOURNAL_STARTED, JOURNAL_STARTED,
JOURNAL_NEED_WRITE, JOURNAL_NEED_WRITE,
JOURNAL_NOT_EMPTY, JOURNAL_NOT_EMPTY,
JOURNAL_MAY_GET_UNRESERVED,
}; };
/* Embedded in struct bch_fs */ /* Embedded in struct bch_fs */
...@@ -142,6 +166,8 @@ struct journal { ...@@ -142,6 +166,8 @@ struct journal {
*/ */
int cur_entry_error; int cur_entry_error;
union journal_preres_state prereserved;
/* Reserved space in journal entry to be used just prior to write */ /* Reserved space in journal entry to be used just prior to write */
unsigned entry_u64s_reserved; unsigned entry_u64s_reserved;
...@@ -161,6 +187,7 @@ struct journal { ...@@ -161,6 +187,7 @@ struct journal {
/* Used when waiting because the journal was full */ /* Used when waiting because the journal was full */
wait_queue_head_t wait; wait_queue_head_t wait;
struct closure_waitlist async_wait; struct closure_waitlist async_wait;
struct closure_waitlist preres_wait;
struct closure io; struct closure io;
struct delayed_work write_work; struct delayed_work write_work;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment