Commit 2abe5420, authored by Kent Overstreet and committed by Kent Overstreet

bcachefs: Persist 64 bit io clocks

Originally, bcachefs - going back to bcache - stored, for each bucket, a
16 bit counter corresponding to how long it had been since the bucket
was read from. But, this required periodically rescaling counters on
every bucket to avoid wraparound. That wasn't an issue in bcache, where
we'd periodically rewrite the per bucket metadata all at once, but in
bcachefs we're trying to avoid having to walk every single bucket.

This patch switches to persisting 64 bit io clocks, corresponding to the
64 bit bucket timestamps introduced in the previous patch with
KEY_TYPE_alloc_v2.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 7f4e1d5d
...@@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { ...@@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#undef x #undef x
}; };
static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
/* Ratelimiting/PD controllers */ /* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work) static void pd_controllers_update(struct work_struct *work)
...@@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, ...@@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{ {
struct bch_dev *ca; int ret;
unsigned i;
int ret = 0;
down_read(&c->gc_lock); down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
...@@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ...@@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_dev_usage_from_buckets(c); bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock); percpu_up_write(&c->mark_lock);
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_oldest_io(c, ca, READ);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->bucket_clock[READ].lock);
mutex_lock(&c->bucket_clock[WRITE].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_oldest_io(c, ca, WRITE);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->bucket_clock[WRITE].lock);
return 0; return 0;
} }
...@@ -460,114 +440,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) ...@@ -460,114 +440,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags)
/* Bucket IO clocks: */ /* Bucket IO clocks: */
/*
 * Recompute the cached "largest time since last IO" values for one direction
 * (@rw indexes c->bucket_clock[] and the per-bucket io_time[] array).
 *
 * First refreshes @ca's max_last_bucket_io[@rw] from all of its buckets, then
 * refreshes the filesystem-wide clock->max_last_io from every member device's
 * cached per-device value.
 *
 * The caller must hold c->bucket_clock[@rw].lock (asserted below).
 */
static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
u16 max_last_io = 0;
unsigned i;
lockdep_assert_held(&c->bucket_clock[rw].lock);
/* Recalculate max_last_io for this device: */
for_each_bucket(g, buckets)
max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
ca->max_last_bucket_io[rw] = max_last_io;
/* Recalculate global max_last_io: */
max_last_io = 0;
/* NOTE: this loop reuses the @ca parameter as its iterator variable */
for_each_member_device(ca, c, i)
max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
clock->max_last_io = max_last_io;
}
/*
 * Halve the age of every bucket's @rw IO timestamp, on every member device.
 *
 * Each bucket's io_time[@rw] is rewritten to (hand - age / 2), where age is
 * what bucket_last_io() currently reports, and the cached maximum ages are
 * then refreshed via bch2_recalc_oldest_io().
 *
 * Called from bch2_inc_clock_hand() with clock->lock held, which
 * bch2_recalc_oldest_io() asserts.
 */
static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets;
struct bch_dev *ca;
struct bucket *g;
unsigned i;
trace_rescale_prios(c);
for_each_member_device(ca, c, i) {
/* bucket_lock is held for read while this device's buckets are walked */
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
g->io_time[rw] = clock->hand -
bucket_last_io(c, g, rw) / 2;
bch2_recalc_oldest_io(c, ca, rw);
up_read(&ca->bucket_lock);
}
}
/*
 * Amount by which the bucket clock's rescale timer is advanced each time it
 * fires: 1/1024th of @capacity, but never less than 2028.
 */
static inline u64 bucket_clock_freq(u64 capacity)
{
	u64 scaled = capacity >> 10;

	return scaled > 2028ULL ? scaled : 2028ULL;
}
/*
 * io_timer callback (installed as clock->rescale.fn by
 * bch2_bucket_clock_init()) that advances a bucket clock's hand by one tick.
 *
 * @timer is the 'rescale' member embedded in a struct bucket_clock; the owning
 * clock and filesystem are recovered from it with container_of().
 */
static void bch2_inc_clock_hand(struct io_timer *timer)
{
struct bucket_clock *clock = container_of(timer,
struct bucket_clock, rescale);
/* clock->rw is this clock's index within c->bucket_clock[] */
struct bch_fs *c = container_of(clock,
struct bch_fs, bucket_clock[clock->rw]);
struct bch_dev *ca;
u64 capacity;
unsigned i;
mutex_lock(&clock->lock);
/* if clock cannot be advanced more, rescale prio */
if (clock->max_last_io >= U16_MAX - 2)
bch2_rescale_bucket_io_times(c, clock->rw);
/* rescaling must have made room for the increment below */
BUG_ON(clock->max_last_io >= U16_MAX - 2);
/* advancing the hand ages every bucket by one, so bump the cached maxima */
for_each_member_device(ca, c, i)
ca->max_last_bucket_io[clock->rw]++;
clock->max_last_io++;
clock->hand++;
mutex_unlock(&clock->lock);
capacity = READ_ONCE(c->capacity);
/* with zero capacity the timer is not re-armed */
if (!capacity)
return;
/*
 * We only increment the hand when roughly 0.1% of the filesystem
 * capacity has been read from or written to; this sets when the
 * timer fires next.
 *
 * XXX: we shouldn't really be going off of the capacity of devices in
 * RW mode (that will be 0 when we're RO, yet we can still service
 * reads)
 */
timer->expire += bucket_clock_freq(capacity);
bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}
/*
 * Set up the bucket clock for direction @rw: the hand starts at 1, the
 * rescale timer is pointed at bch2_inc_clock_hand() with its first expiry
 * derived from the current capacity, and the clock's mutex is initialised.
 */
static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
	struct bucket_clock *bc = &c->bucket_clock[rw];

	mutex_init(&bc->lock);

	bc->rw			= rw;
	bc->hand		= 1;
	bc->rescale.expire	= bucket_clock_freq(c->capacity);
	bc->rescale.fn		= bch2_inc_clock_hand;
}
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
size_t bucket_nr, int rw) size_t bucket_nr, int rw)
{ {
...@@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, ...@@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
struct bucket *g; struct bucket *g;
struct bkey_alloc_buf *a; struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u; struct bkey_alloc_unpacked u;
u64 *time; u64 *time, now;
int ret = 0; int ret = 0;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
...@@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, ...@@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
percpu_up_read(&c->mark_lock); percpu_up_read(&c->mark_lock);
time = rw == READ ? &u.read_time : &u.write_time; time = rw == READ ? &u.read_time : &u.write_time;
if (*time == c->bucket_clock[rw].hand) now = atomic64_read(&c->io_clock[rw].now);
if (*time == now)
goto out; goto out;
*time = c->bucket_clock[rw].hand; *time = now;
bch2_alloc_pack(c, a, u); bch2_alloc_pack(c, a, u);
ret = bch2_trans_update(trans, iter, &a->k, 0) ?: ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
...@@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) ...@@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
return ret; return ret;
} }
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
size_t bucket, struct bucket_mark m)
struct bucket_mark mark)
{ {
u8 gc_gen; u8 gc_gen;
if (!is_available_bucket(mark)) if (!is_available_bucket(m))
return false; return false;
if (mark.owned_by_allocator) if (m.owned_by_allocator)
return false; return false;
if (ca->buckets_nouse && if (ca->buckets_nouse &&
test_bit(bucket, ca->buckets_nouse)) test_bit(b, ca->buckets_nouse))
return false; return false;
gc_gen = bucket_gc_gen(ca, bucket); gc_gen = bucket_gc_gen(bucket(ca, b));
if (gc_gen >= BUCKET_GC_GEN_MAX / 2) if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++; ca->inc_gen_needs_gc++;
...@@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, ...@@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
/* /*
* Determines what order we're going to reuse buckets, smallest bucket_key() * Determines what order we're going to reuse buckets, smallest bucket_key()
* first. * first.
*
*
* - We take into account the read prio of the bucket, which gives us an
* indication of how hot the data is -- we scale the prio so that the prio
* farthest from the clock is worth 1/8th of the closest.
*
* - The number of sectors of cached data in the bucket, which gives us an
* indication of the cost in cache misses this eviction will cause.
*
* - If hotness * sectors used compares equal, we pick the bucket with the
* smallest bucket_gc_gen() - since incrementing the same bucket's generation
* number repeatedly forces us to run mark and sweep gc to avoid generation
* number wraparound.
*/ */
static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
size_t b, struct bucket_mark m) u64 now, u64 last_seq_ondisk)
{ {
unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); unsigned used = bucket_sectors_used(m);
unsigned max_last_io = ca->max_last_bucket_io[READ];
/*
* Time since last read, scaled to [0, 8) where larger value indicates
* more recently read data:
*/
unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
/* How much we want to keep the data in this bucket: */
unsigned long data_wantness =
(hotness + 1) * bucket_sectors_used(m);
unsigned long needs_journal_commit = if (used) {
bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); /*
* Prefer to keep buckets that have been read more recently, and
* buckets that have more data in them:
*/
u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
return (data_wantness << 9) | return -last_read_scaled;
(needs_journal_commit << 8) | } else {
(bucket_gc_gen(ca, b) / 16); /*
* Prefer to use buckets with smaller gc_gen so that we don't
* have to walk the btree and recalculate oldest_gen - but shift
* off the low bits so that buckets will still have equal sort
* keys when there's only a small difference, so that we can
* keep sequential buckets together:
*/
return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
(bucket_gc_gen(g) >> 4);
}
} }
static inline int bucket_alloc_cmp(alloc_heap *h, static inline int bucket_alloc_cmp(alloc_heap *h,
...@@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) ...@@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{ {
struct bucket_array *buckets; struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 }; struct alloc_heap_entry e = { 0 };
u64 now, last_seq_ondisk;
size_t b, i, nr = 0; size_t b, i, nr = 0;
ca->alloc_heap.used = 0;
mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock); down_read(&ca->bucket_lock);
buckets = bucket_array(ca); buckets = bucket_array(ca);
ca->alloc_heap.used = 0;
bch2_recalc_oldest_io(c, ca, READ); now = atomic64_read(&c->io_clock[READ].now);
last_seq_ondisk = c->journal.last_seq_ondisk;
/* /*
* Find buckets with lowest read priority, by building a maxheap sorted * Find buckets with lowest read priority, by building a maxheap sorted
...@@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) ...@@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
* all buckets have been visited. * all buckets have been visited.
*/ */
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
struct bucket_mark m = READ_ONCE(buckets->b[b].mark); struct bucket *g = &buckets->b[b];
unsigned long key = bucket_sort_key(c, ca, b, m); struct bucket_mark m = READ_ONCE(g->mark);
unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
if (!bch2_can_invalidate_bucket(ca, b, m)) if (!bch2_can_invalidate_bucket(ca, b, m))
continue; continue;
...@@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) ...@@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
} }
up_read(&ca->bucket_lock); up_read(&ca->bucket_lock);
mutex_unlock(&c->bucket_clock[READ].lock);
} }
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
...@@ -1031,8 +892,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, ...@@ -1031,8 +892,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
u.data_type = 0; u.data_type = 0;
u.dirty_sectors = 0; u.dirty_sectors = 0;
u.cached_sectors = 0; u.cached_sectors = 0;
u.read_time = c->bucket_clock[READ].hand; u.read_time = atomic64_read(&c->io_clock[READ].now);
u.write_time = c->bucket_clock[WRITE].hand; u.write_time = atomic64_read(&c->io_clock[WRITE].now);
bch2_alloc_pack(c, &a, u); bch2_alloc_pack(c, &a, u);
bch2_trans_update(trans, iter, &a.k, bch2_trans_update(trans, iter, &a.k,
...@@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) ...@@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c) void bch2_fs_allocator_background_init(struct bch_fs *c)
{ {
spin_lock_init(&c->freelist_lock); spin_lock_init(&c->freelist_lock);
bch2_bucket_clock_init(c, READ);
bch2_bucket_clock_init(c, WRITE);
c->pd_controllers_update_seconds = 5; c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
......
...@@ -10,30 +10,6 @@ ...@@ -10,30 +10,6 @@
struct ec_bucket_buf; struct ec_bucket_buf;
/* There are two of these clocks, one for reads and one for writes: */
struct bucket_clock {
/*
 * "now" in (read/write) IO time - incremented whenever we do X amount
 * of reads or writes.
 *
 * Goes with the bucket read/write prios: when we read or write to a
 * bucket we reset the bucket's prio to the current hand; thus hand -
 * prio = time since bucket was last read/written.
 *
 * The units are some amount (bytes/sectors) of data read/written, and
 * the units can change on the fly if we need to rescale to fit
 * everything in a u16 - your only guarantee is that the units are
 * consistent.
 */
u16 hand;
/* largest (hand - io_time) over all member devices, as last recalculated */
u16 max_last_io;
/* index of this clock within bch_fs::bucket_clock[] */
int rw;
/* timer whose callback advances the hand (bch2_inc_clock_hand()) */
struct io_timer rescale;
/* held while the hand and the cached max_last_io values are updated */
struct mutex lock;
};
enum alloc_reserve { enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2, RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1, RESERVE_BTREE = -1,
......
...@@ -451,9 +451,6 @@ struct bch_dev { ...@@ -451,9 +451,6 @@ struct bch_dev {
size_t fifo_last_bucket; size_t fifo_last_bucket;
/* last calculated minimum prio */
u16 max_last_bucket_io[2];
size_t inc_gen_needs_gc; size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc; size_t inc_gen_really_needs_gc;
...@@ -693,14 +690,6 @@ struct bch_fs { ...@@ -693,14 +690,6 @@ struct bch_fs {
struct mutex usage_scratch_lock; struct mutex usage_scratch_lock;
struct bch_fs_usage_online *usage_scratch; struct bch_fs_usage_online *usage_scratch;
/*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
* priority of any bucket.
*/
struct bucket_clock bucket_clock[2];
struct io_clock io_clock[2]; struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */ /* JOURNAL SEQ BLACKLIST */
......
...@@ -1143,8 +1143,8 @@ struct bch_sb_field_clean { ...@@ -1143,8 +1143,8 @@ struct bch_sb_field_clean {
struct bch_sb_field field; struct bch_sb_field field;
__le32 flags; __le32 flags;
__le16 read_clock; __le16 _read_clock; /* no longer used */
__le16 write_clock; __le16 _write_clock;
__le64 journal_seq; __le64 journal_seq;
union { union {
...@@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) ...@@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist, 3) \ x(blacklist, 3) \
x(blacklist_v2, 4) \ x(blacklist_v2, 4) \
x(usage, 5) \ x(usage, 5) \
x(data_usage, 6) x(data_usage, 6) \
x(clock, 7)
enum { enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr, #define x(f, nr) BCH_JSET_ENTRY_##f = nr,
...@@ -1559,6 +1560,13 @@ struct jset_entry_data_usage { ...@@ -1559,6 +1560,13 @@ struct jset_entry_data_usage {
struct bch_replicas_entry r; struct bch_replicas_entry r;
} __attribute__((packed)); } __attribute__((packed));
/*
 * Journal entry (BCH_JSET_ENTRY_clock) that persists the current value of one
 * of the filesystem's IO clocks.
 */
struct jset_entry_clock {
struct jset_entry entry;
/* index into bch_fs::io_clock[]; journal validation rejects values > 1 */
__u8 rw;
/* NOTE(review): presumably padding so that 'time' is 8-byte aligned - confirm */
__u8 pad[7];
/* clock value; on replay it is stored into io_clock[rw].now */
__le64 time;
} __attribute__((packed));
/* /*
* On disk format for a journal entry: * On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique * seq is monotonically increasing; every journal entry has its own unique
...@@ -1581,8 +1589,8 @@ struct jset { ...@@ -1581,8 +1589,8 @@ struct jset {
__u8 encrypted_start[0]; __u8 encrypted_start[0];
__le16 read_clock; __le16 _read_clock; /* no longer used */
__le16 write_clock; __le16 _write_clock;
/* Sequence number of oldest dirty journal entry */ /* Sequence number of oldest dirty journal entry */
__le64 last_seq; __le64 last_seq;
......
...@@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg) ...@@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg)
{ {
struct bch_fs *c = arg; struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE]; struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic_long_read(&clock->now); unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc); unsigned last_kick = atomic_read(&c->kick_gc);
int ret; int ret;
...@@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg) ...@@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg)
if (c->btree_gc_periodic) { if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16; unsigned long next = last + c->capacity / 16;
if (atomic_long_read(&clock->now) >= next) if (atomic64_read(&clock->now) >= next)
break; break;
bch2_io_clock_schedule_timeout(clock, next); bch2_io_clock_schedule_timeout(clock, next);
...@@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg) ...@@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg)
} }
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
last = atomic_long_read(&clock->now); last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc); last_kick = atomic_read(&c->kick_gc);
/* /*
......
...@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) ...@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false); return __bucket(ca, b, false);
} }
/*
 * Time since bucket @g was last read (READ) or written (WRITE), measured in
 * ticks of the corresponding bucket clock; the difference wraps modulo 2^16.
 */
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
{
	u16 hand = c->bucket_clock[rw].hand;
	u16 stamp = g->io_time[rw];

	return hand - stamp;
}
/* /*
* bucket_gc_gen() returns the difference between the bucket's current gen and * bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree. * the oldest gen of any pointer into that bucket in the btree.
*/ */
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) static inline u8 bucket_gc_gen(struct bucket *g)
{ {
struct bucket *g = bucket(ca, b);
return g->mark.gen - g->oldest_gen; return g->mark.gen - g->oldest_gen;
} }
......
...@@ -37,7 +37,7 @@ struct bucket { ...@@ -37,7 +37,7 @@ struct bucket {
const struct bucket_mark mark; const struct bucket_mark mark;
}; };
u16 io_time[2]; u64 io_time[2];
u8 oldest_gen; u8 oldest_gen;
u8 gc_gen; u8 gc_gen;
unsigned gen_valid:1; unsigned gen_valid:1;
......
...@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) ...@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock); spin_lock(&clock->timer_lock);
if (time_after_eq((unsigned long) atomic_long_read(&clock->now), if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) { timer->expire)) {
spin_unlock(&clock->timer_lock); spin_unlock(&clock->timer_lock);
timer->fn(timer); timer->fn(timer);
...@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, ...@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{ {
struct io_timer *timer; struct io_timer *timer;
unsigned long now = atomic_long_add_return(sectors, &clock->now); unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now))) while ((timer = get_expired_timer(clock, now)))
timer->fn(timer); timer->fn(timer);
...@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) ...@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned i; unsigned i;
spin_lock(&clock->timer_lock); spin_lock(&clock->timer_lock);
now = atomic_long_read(&clock->now); now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++) for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n", pr_buf(out, "%ps:\t%li\n",
...@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock) ...@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock) int bch2_io_clock_init(struct io_clock *clock)
{ {
atomic_long_set(&clock->now, 0); atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock); spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
......
...@@ -26,7 +26,7 @@ struct io_timer { ...@@ -26,7 +26,7 @@ struct io_timer {
typedef HEAP(struct io_timer *) io_timer_heap; typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock { struct io_clock {
atomic_long_t now; atomic64_t now;
u16 __percpu *pcpu_buf; u16 __percpu *pcpu_buf;
unsigned max_slop; unsigned max_slop;
......
...@@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j) ...@@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j)
j->entry_u64s_reserved += j->entry_u64s_reserved +=
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
j->entry_u64s_reserved +=
2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
atomic64_set(&j->reservations.counter, atomic64_set(&j->reservations.counter,
((union journal_res_state) ((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
......
...@@ -426,6 +426,32 @@ static int journal_entry_validate_data_usage(struct bch_fs *c, ...@@ -426,6 +426,32 @@ static int journal_entry_validate_data_usage(struct bch_fs *c,
return ret; return ret;
} }
/*
 * Validate a BCH_JSET_ENTRY_clock journal entry.
 *
 * The entry must be exactly sizeof(struct jset_entry_clock) bytes long and its
 * 'rw' field must be 0 or 1. An entry failing either check is reported and
 * then blanked out with journal_entry_null_range().
 *
 * @jset and @write are not referenced directly in this body.
 * NOTE(review): they may be consumed by the journal_entry_err_on() macro -
 * confirm against its definition.
 *
 * Returns 'ret', which is initialised to 0 and never assigned directly here.
 * NOTE(review): presumably journal_entry_err_on() can set it and jump to
 * fsck_err - confirm against the macro.
 */
static int journal_entry_validate_clock(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
int write)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
/* total size of the entry in bytes, derived from its u64s field */
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
c, "invalid journal entry clock: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
/* rw is later used as an index into a two-element array */
if (journal_entry_err_on(clock->rw > 1,
c, "invalid journal entry clock: bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
struct jset_entry_ops { struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *, int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int); struct jset_entry *, int);
...@@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl) ...@@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl)
end = bch2_btree_roots_to_journal_entries(c, jset->start, end); end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
end = bch2_journal_super_entries_add_common(c, end, bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq)); le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start; u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved); BUG_ON(u64s > j->entry_u64s_reserved);
...@@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl) ...@@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl)
journal_write_compact(jset); journal_write_compact(jset);
jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c)); jset->magic = cpu_to_le64(jset_magic(c));
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD) ? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version); : cpu_to_le32(c->sb.version);
......
...@@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg) ...@@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
{ {
struct bch_fs *c = arg; struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE]; struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last, wait; u64 last, wait;
set_freezable(); set_freezable();
...@@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg) ...@@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
if (kthread_wait_freezable(c->copy_gc_enabled)) if (kthread_wait_freezable(c->copy_gc_enabled))
break; break;
last = atomic_long_read(&clock->now); last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c); wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) { if (wait > clock->max_slop) {
......
...@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg) ...@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
unsigned long start, prev_start; unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime; unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime; unsigned long cputime, prev_cputime;
unsigned long io_start; u64 io_start;
long throttle; long throttle;
set_freezable(); set_freezable();
io_start = atomic_long_read(&clock->now); io_start = atomic64_read(&clock->now);
p = rebalance_work(c); p = rebalance_work(c);
prev_start = jiffies; prev_start = jiffies;
prev_cputime = curr_cputime(); prev_cputime = curr_cputime();
...@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg) ...@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
(20 - w.dev_most_full_percent), (20 - w.dev_most_full_percent),
50); 50);
if (atomic_long_read(&clock->now) + clock->max_slop < if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) { r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle; r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED; r->state = REBALANCE_THROTTLED;
...@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg) ...@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
max(p.dev_most_full_percent, 1U) / max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U)); max(w.dev_most_full_percent, 1U));
io_start = atomic_long_read(&clock->now); io_start = atomic64_read(&clock->now);
p = w; p = w;
prev_start = start; prev_start = start;
prev_cputime = cputime; prev_cputime = cputime;
...@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) ...@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
case REBALANCE_THROTTLED: case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1), bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime - (r->throttled_until_iotime -
atomic_long_read(&c->io_clock[WRITE].now)) << 9); atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n", pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ, (r->throttled_until_cputime - jiffies) / HZ,
h1); h1);
......
...@@ -17,7 +17,7 @@ struct bch_fs_rebalance { ...@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
atomic64_t work_unknown_dev; atomic64_t work_unknown_dev;
enum rebalance_state state; enum rebalance_state state;
unsigned long throttled_until_iotime; u64 throttled_until_iotime;
unsigned long throttled_until_cputime; unsigned long throttled_until_cputime;
struct bch_move_stats move_stats; struct bch_move_stats move_stats;
......
...@@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c, ...@@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(bl_entry->end) + 1); le64_to_cpu(bl_entry->end) + 1);
break; break;
} }
case BCH_JSET_ENTRY_clock: {
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
atomic64_set(&c->io_clock[clock->rw].now, clock->time);
}
} }
return ret; return ret;
...@@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c, ...@@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c,
int ret; int ret;
if (clean) { if (clean) {
c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
for (entry = clean->start; for (entry = clean->start;
entry != vstruct_end(&clean->field); entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) { entry = vstruct_next(entry)) {
...@@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c, ...@@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c,
if (i->ignore) if (i->ignore)
continue; continue;
c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
vstruct_for_each(&i->j, entry) { vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry); ret = journal_replay_entry_early(c, entry);
if (ret) if (ret)
...@@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c, ...@@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c,
return 0; return 0;
} }
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
"superblock read clock %u doesn't match journal %u after clean shutdown",
clean->read_clock, j->read_clock);
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
"superblock write clock %u doesn't match journal %u after clean shutdown",
clean->write_clock, j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) { for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200]; char buf1[200], buf2[200];
struct bkey_i *k1, *k2; struct bkey_i *k1, *k2;
......
...@@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c) ...@@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
return ret; return ret;
} }
static void static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
entry_init_u64s(struct jset_entry *entry, unsigned u64s)
{ {
memset(entry, 0, u64s * sizeof(u64)); struct jset_entry *entry = *end;
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
memset(entry, 0, u64s * sizeof(u64));
/* /*
* The u64s field counts from the start of data, ignoring the shared * The u64s field counts from the start of data, ignoring the shared
* fields. * fields.
*/ */
entry->u64s = u64s - 1; entry->u64s = u64s - 1;
}
static void *end = vstruct_next(*end);
entry_init_size(struct jset_entry *entry, size_t size) return entry;
{
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
entry_init_u64s(entry, u64s);
} }
struct jset_entry * void bch2_journal_super_entries_add_common(struct bch_fs *c,
bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end,
struct jset_entry *entry, u64 journal_seq)
u64 journal_seq)
{ {
unsigned i; unsigned i;
...@@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, ...@@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
{ {
struct jset_entry_usage *u = struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry); container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage; u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES; u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes); u->v = cpu_to_le64(c->usage_base->nr_inodes);
entry = vstruct_next(entry);
} }
{ {
struct jset_entry_usage *u = struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry); container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage; u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION; u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version)); u->v = cpu_to_le64(atomic64_read(&c->key_version));
entry = vstruct_next(entry);
} }
for (i = 0; i < BCH_REPLICAS_MAX; i++) { for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u = struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry); container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage; u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED; u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i; u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
entry = vstruct_next(entry);
} }
for (i = 0; i < c->replicas.nr; i++) { for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e = struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i); cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u = struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry); container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
struct jset_entry_data_usage, entry);
entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage; u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]); u->v = cpu_to_le64(c->usage_base->replicas[i]);
unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
"embedded variable length struct"); "embedded variable length struct");
entry = vstruct_next(entry);
} }
percpu_up_read(&c->mark_lock); percpu_up_read(&c->mark_lock);
return entry; for (i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
struct jset_entry_clock, entry);
clock->entry.type = BCH_JSET_ENTRY_clock;
clock->rw = i;
clock->time = atomic64_read(&c->io_clock[i].now);
}
} }
void bch2_fs_mark_clean(struct bch_fs *c) void bch2_fs_mark_clean(struct bch_fs *c)
...@@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c) ...@@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
} }
sb_clean->flags = 0; sb_clean->flags = 0;
sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */ /* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start; entry = sb_clean->start;
entry = bch2_journal_super_entries_add_common(c, entry, 0); bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry); entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
......
...@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ...@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_clean: */ /* BCH_SB_FIELD_clean: */
struct jset_entry * void bch2_journal_super_entries_add_common(struct bch_fs *,
bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
struct jset_entry *, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
......
...@@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) ...@@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_copygc_stop(c); bch2_copygc_stop(c);
bch2_gc_thread_stop(c); bch2_gc_thread_stop(c);
bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
/* /*
* Flush journal before stopping allocators, because flushing journal * Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes: * blacklist entries involves allocating new btree nodes:
...@@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) ...@@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca); bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c); bch2_recalc_capacity(c);
bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
for_each_rw_member(ca, c, i) { for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca); ret = bch2_dev_allocator_start(ca);
if (ret) { if (ret) {
......
...@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, ...@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
{ {
int rw = (private ? 1 : 0); int rw = (private ? 1 : 0);
return bucket_last_io(c, bucket(ca, b), rw); return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
} }
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
...@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, ...@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private) size_t b, void *private)
{ {
return bucket_gc_gen(ca, b); return bucket_gc_gen(bucket(ca, b));
} }
static int unsigned_cmp(const void *_l, const void *_r) static int unsigned_cmp(const void *_l, const void *_r)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment