Commit 2abe5420 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Persist 64 bit io clocks

Originally, bcachefs - going back to bcache - stored, for each bucket, a
16 bit counter corresponding to how long it had been since the bucket
was last read from. But this required periodically rescaling the
counters on every bucket to avoid wraparound. That wasn't an issue in
bcache, where we'd periodically rewrite the per bucket metadata all at
once anyway, but in bcachefs we're trying to avoid having to walk every
single bucket.

This patch switches to persisting 64 bit io clocks, corresponding to the
64 bit bucket timestamps introduced in the previous patch with
KEY_TYPE_alloc_v2.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 7f4e1d5d
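
To make the motivation concrete, here is a minimal sketch (not part of this commit; the helper names are made up) contrasting the old 16 bit prio arithmetic with the new 64 bit clock arithmetic:

#include <stdint.h>

/* Old scheme: 16 bit clock hand, 16 bit per-bucket prio. */
static inline uint16_t old_bucket_last_io(uint16_t hand, uint16_t prio)
{
	/*
	 * hand - prio wraps after 65536 units of IO, which is why every
	 * bucket's prio had to be periodically rescaled.
	 */
	return (uint16_t) (hand - prio);
}

/* New scheme: 64 bit io clock, 64 bit per-bucket io_time. */
static inline uint64_t new_bucket_last_io(uint64_t now, uint64_t io_time)
{
	/* For all practical purposes this never wraps - no rescaling pass. */
	return now - io_time;
}
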
......@@ -10,30 +10,6 @@
struct ec_bucket_buf;
/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
/*
* "now" in (read/write) IO time - incremented whenever we do X amount
* of reads or writes.
*
* Goes with the bucket read/write prios: when we read or write to a
* bucket we reset the bucket's prio to the current hand; thus hand -
* prio = time since bucket was last read/written.
*
* The units are some amount (bytes/sectors) of data read/written, and
* the units can change on the fly if we need to rescale to fit
* everything in a u16 - your only guarantee is that the units are
* consistent.
*/
u16 hand;
u16 max_last_io;
int rw;
struct io_timer rescale;
struct mutex lock;
};
enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1,
......
......@@ -451,9 +451,6 @@ struct bch_dev {
size_t fifo_last_bucket;
/* last calculated minimum prio */
u16 max_last_bucket_io[2];
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
......@@ -693,14 +690,6 @@ struct bch_fs {
struct mutex usage_scratch_lock;
struct bch_fs_usage_online *usage_scratch;
/*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
* priority of any bucket.
*/
struct bucket_clock bucket_clock[2];
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
......
......@@ -1143,8 +1143,8 @@ struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
__le16 read_clock;
__le16 write_clock;
__le16 _read_clock; /* no longer used */
__le16 _write_clock;
__le64 journal_seq;
union {
......@@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist, 3) \
x(blacklist_v2, 4) \
x(usage, 5) \
x(data_usage, 6)
x(data_usage, 6) \
x(clock, 7)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
......@@ -1559,6 +1560,13 @@ struct jset_entry_data_usage {
struct bch_replicas_entry r;
} __attribute__((packed));
struct jset_entry_clock {
struct jset_entry entry;
__u8 rw;
__u8 pad[7];
__le64 time;
} __attribute__((packed));
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
......@@ -1581,8 +1589,8 @@ struct jset {
__u8 encrypted_start[0];
__le16 read_clock;
__le16 write_clock;
__le16 _read_clock; /* no longer used */
__le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;
......
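
For reference, a rough userspace-style sketch (not from the commit; struct and function names are hypothetical) of decoding the new clock entry's payload, assuming the usual 8 byte struct jset_entry header precedes the fields declared above, with glibc's le64toh() for byte swapping:

#include <endian.h>
#include <stdint.h>
#include <string.h>

struct decoded_clock {
	uint8_t  rw;	/* 0 = read clock, 1 = write clock */
	uint64_t time;	/* 64 bit io clock value, in sectors */
};

/* buf points at the start of a BCH_JSET_ENTRY_clock entry */
static void decode_clock_entry(const uint8_t *buf, struct decoded_clock *out)
{
	uint64_t le_time;

	out->rw = buf[8];				/* __u8 rw, right after the entry header */
	memcpy(&le_time, buf + 16, sizeof(le_time));	/* skip __u8 pad[7] */
	out->time = le64toh(le_time);			/* __le64 time */
}
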
......@@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic_long_read(&clock->now);
unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
......@@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg)
if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16;
if (atomic_long_read(&clock->now) >= next)
if (atomic64_read(&clock->now) >= next)
break;
bch2_io_clock_schedule_timeout(clock, next);
......@@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
last = atomic_long_read(&clock->now);
last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*
......
......@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
{
return c->bucket_clock[rw].hand - g->io_time[rw];
}
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
static inline u8 bucket_gc_gen(struct bucket *g)
{
struct bucket *g = bucket(ca, b);
return g->mark.gen - g->oldest_gen;
}
......
......@@ -37,7 +37,7 @@ struct bucket {
const struct bucket_mark mark;
};
u16 io_time[2];
u64 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
......
......@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
......@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
unsigned long now = atomic_long_add_return(sectors, &clock->now);
unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
......@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned i;
spin_lock(&clock->timer_lock);
now = atomic_long_read(&clock->now);
now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n",
......@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock)
{
atomic_long_set(&clock->now, 0);
atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
......
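
A toy model (not bcachefs code; no percpu buffering, no timer heap) of the pattern the converted functions implement - a 64 bit clock advanced by completed IO, firing timers whose expiry it has passed:

#include <stdint.h>
#include <stdatomic.h>
#include <stdio.h>

struct toy_timer {
	uint64_t expire;
	void (*fn)(struct toy_timer *);
};

static _Atomic uint64_t toy_now;

static void toy_increment_clock(struct toy_timer *t, uint64_t sectors)
{
	uint64_t now = atomic_fetch_add(&toy_now, sectors) + sectors;

	if (t->fn && now >= t->expire) {
		void (*fn)(struct toy_timer *) = t->fn;

		t->fn = NULL;	/* timers fire once */
		fn(t);
	}
}

static void fired(struct toy_timer *t)
{
	(void) t;
	printf("timer fired, clock now at %llu sectors\n",
	       (unsigned long long) atomic_load(&toy_now));
}

int main(void)
{
	struct toy_timer t = { .expire = 1024, .fn = fired };

	toy_increment_clock(&t, 512);	/* clock = 512: not yet expired */
	toy_increment_clock(&t, 512);	/* clock = 1024: timer fires */
	return 0;
}
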
......@@ -26,7 +26,7 @@ struct io_timer {
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
atomic_long_t now;
atomic64_t now;
u16 __percpu *pcpu_buf;
unsigned max_slop;
......
......@@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j)
j->entry_u64s_reserved +=
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
j->entry_u64s_reserved +=
2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
......
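
The extra journal reservation added above works out as follows (a worked example based on the field sizes declared in bcachefs_format.h, not text from the commit):

/*
 * sizeof(struct jset_entry_clock)
 *   = 8 (struct jset_entry header) + 1 (rw) + 7 (pad) + 8 (time)
 *   = 24 bytes = 3 u64s
 *
 * so 2 * 3 = 6 u64s are reserved, enough for one read clock and one
 * write clock entry in every journal write.
 */
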
......@@ -426,6 +426,32 @@ static int journal_entry_validate_data_usage(struct bch_fs *c,
return ret;
}
static int journal_entry_validate_clock(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
int write)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
c, "invalid journal entry clock: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(clock->rw > 1,
c, "invalid journal entry clock: bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
......@@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl)
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
end = bch2_journal_super_entries_add_common(c, end,
le64_to_cpu(jset->seq));
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
......@@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl)
journal_write_compact(jset);
jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
......
......@@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last, wait;
u64 last, wait;
set_freezable();
......@@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
last = atomic_long_read(&clock->now);
last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
......
......@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
unsigned long io_start;
u64 io_start;
long throttle;
set_freezable();
io_start = atomic_long_read(&clock->now);
io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
......@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
(20 - w.dev_most_full_percent),
50);
if (atomic_long_read(&clock->now) + clock->max_slop <
if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
......@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
io_start = atomic_long_read(&clock->now);
io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
......@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
atomic_long_read(&c->io_clock[WRITE].now)) << 9);
atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
......
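
A small worked example (not from the commit) of the display arithmetic above, where sectors are 512 bytes and hence shifted left by 9 to get bytes:

/*
 * If the write clock is at 1,000,000 and throttled_until_iotime is
 * 1,262,144, the remaining throttle is 262,144 sectors, and
 * 262144 << 9 = 134,217,728 bytes, which bch2_hprint() would render
 * as roughly "128M".
 */
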
......@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
atomic64_t work_unknown_dev;
enum rebalance_state state;
unsigned long throttled_until_iotime;
u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
......
......@@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(bl_entry->end) + 1);
break;
}
case BCH_JSET_ENTRY_clock: {
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
atomic64_set(&c->io_clock[clock->rw].now, clock->time);
}
}
return ret;
......@@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c,
int ret;
if (clean) {
c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
......@@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c,
if (i->ignore)
continue;
c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
......@@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c,
return 0;
}
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
"superblock read clock %u doesn't match journal %u after clean shutdown",
clean->read_clock, j->read_clock);
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
"superblock write clock %u doesn't match journal %u after clean shutdown",
clean->write_clock, j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
......
......@@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
return ret;
}
static void
entry_init_u64s(struct jset_entry *entry, unsigned u64s)
static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
memset(entry, 0, u64s * sizeof(u64));
struct jset_entry *entry = *end;
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = u64s - 1;
}
static void
entry_init_size(struct jset_entry *entry, size_t size)
{
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
entry_init_u64s(entry, u64s);
*end = vstruct_next(*end);
return entry;
}
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry *entry,
u64 journal_seq)
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
unsigned i;
......@@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
entry = vstruct_next(entry);
}
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
struct jset_entry_data_usage, entry);
entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
"embedded variable length struct");
entry = vstruct_next(entry);
}
percpu_up_read(&c->mark_lock);
return entry;
for (i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
struct jset_entry_clock, entry);
clock->entry.type = BCH_JSET_ENTRY_clock;
clock->rw = i;
clock->time = atomic64_read(&c->io_clock[i].now);
}
}
void bch2_fs_mark_clean(struct bch_fs *c)
......@@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
entry = bch2_journal_super_entries_add_common(c, entry, 0);
bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
......
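
A standalone model (not bcachefs code; simplified types) of the pointer-advancing append pattern that jset_entry_init() introduces above: each helper claims space at *end, fills it in, and bumps *end past what it wrote, so callers keep passing the same end pointer:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct toy_entry {
	uint16_t u64s;		/* payload size in u64s, excluding this 8 byte header */
	uint8_t  type;
	uint8_t  pad[5];
	uint64_t data[];
};

static struct toy_entry *toy_entry_init(struct toy_entry **end, size_t payload_bytes)
{
	struct toy_entry *entry = *end;
	unsigned u64s = (payload_bytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);

	memset(entry, 0, sizeof(*entry) + u64s * sizeof(uint64_t));
	entry->u64s = u64s;

	/* advance *end past header + payload so the next append lands here */
	*end = (struct toy_entry *) (entry->data + u64s);
	return entry;
}

int main(void)
{
	uint64_t buf[32];
	struct toy_entry *end = (struct toy_entry *) buf;
	struct toy_entry *clock = toy_entry_init(&end, 2 * sizeof(uint64_t));

	clock->type = 7;	/* pretend this is a clock entry */
	clock->data[0] = 1;	/* rw = WRITE */
	clock->data[1] = 12345;	/* io clock value */

	printf("appended entry: type %u, %u u64s of payload\n",
	       (unsigned) clock->type, (unsigned) clock->u64s);
	return 0;
}
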
......@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_clean: */
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry *, u64);
void bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry **, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
......
......@@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
......@@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
......
......@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
{
int rw = (private ? 1 : 0);
return bucket_last_io(c, bucket(ca, b), rw);
return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
......@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
return bucket_gc_gen(ca, b);
return bucket_gc_gen(bucket(ca, b));
}
static int unsigned_cmp(const void *_l, const void *_r)
......