Commit 6eac2c2e, authored and committed by Kent Overstreet

bcachefs: Change how replicated data is accounted

Due to compression, the different replicas of a replicated extent don't
necessarily have to take up the same amount of space - so replicated
data sector counts shouldn't be stored divided by the number of
replicas.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 5b650fd1
...@@ -185,7 +185,7 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, ...@@ -185,7 +185,7 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
replicas = bch2_extent_nr_dirty_ptrs(k); replicas = bch2_extent_nr_dirty_ptrs(k);
if (replicas) if (replicas)
stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -= stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
c->opts.btree_node_size; c->opts.btree_node_size * replicas;
/* /*
* We're dropping @k from the btree, but it's still live until the * We're dropping @k from the btree, but it's still live until the
......
...@@ -248,29 +248,28 @@ bch2_fs_usage_read(struct bch_fs *c) ...@@ -248,29 +248,28 @@ bch2_fs_usage_read(struct bch_fs *c)
struct fs_usage_sum { struct fs_usage_sum {
u64 hidden; u64 hidden;
u64 data; u64 data;
u64 cached;
u64 reserved; u64 reserved;
}; };
static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
{ {
struct fs_usage_sum sum = { 0 }; struct fs_usage_sum sum = { 0 };
unsigned i, j; unsigned i;
/* /*
* For superblock and journal we count bucket usage, not sector usage, * For superblock and journal we count bucket usage, not sector usage,
* because any internal fragmentation should _not_ be counted as * because any internal fragmentation should _not_ be counted as
* free space: * free space:
*/ */
for (j = 1; j < BCH_DATA_BTREE; j++) sum.hidden += stats.buckets[BCH_DATA_SB];
sum.hidden += stats.buckets[j]; sum.hidden += stats.buckets[BCH_DATA_JOURNAL];
for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
for (j = BCH_DATA_BTREE; sum.data += stats.replicas[i].data[BCH_DATA_BTREE];
j < ARRAY_SIZE(stats.replicas[i].data); sum.data += stats.replicas[i].data[BCH_DATA_USER];
j++) sum.cached += stats.replicas[i].data[BCH_DATA_CACHED];
sum.data += stats.replicas[i].data[j] * (i + 1); sum.reserved += stats.replicas[i].persistent_reserved;
sum.reserved += stats.replicas[i].persistent_reserved * (i + 1);
} }
sum.reserved += stats.online_reserved; sum.reserved += stats.online_reserved;
...@@ -379,17 +378,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ...@@ -379,17 +378,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
{ {
struct bch_dev_usage *dev_usage; struct bch_dev_usage *dev_usage;
if (c) percpu_rwsem_assert_held(&c->usage_lock);
percpu_rwsem_assert_held(&c->usage_lock);
if (old.data_type && new.data_type && bch2_fs_inconsistent_on(old.data_type && new.data_type &&
old.data_type != new.data_type) { old.data_type != new.data_type, c,
BUG_ON(!c); "different types of data in same bucket: %s, %s",
bch2_fs_inconsistent(c, bch2_data_types[old.data_type],
"different types of data in same bucket: %s, %s", bch2_data_types[new.data_type]);
bch2_data_types[old.data_type],
bch2_data_types[new.data_type]);
}
stats->buckets[bucket_type(old)] -= ca->mi.bucket_size; stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
stats->buckets[bucket_type(new)] += ca->mi.bucket_size; stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
...@@ -448,6 +443,12 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ...@@ -448,6 +443,12 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
new.gen++; new.gen++;
})); }));
/*
* This isn't actually correct yet, since fs usage is still
* uncompressed sectors:
*/
stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
if (!old->owned_by_allocator && old->cached_sectors) if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b), trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors); old->cached_sectors);
...@@ -501,26 +502,34 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ...@@ -501,26 +502,34 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) gc_will_visit(c, pos))
return; return;
}
preempt_disable(); preempt_disable();
stats = this_cpu_ptr(c->usage_percpu); stats = this_cpu_ptr(c->usage_percpu);
g = bucket(ca, b); g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
new.data_type = type; new.data_type = type;
checked_add(new.dirty_sectors, sectors); checked_add(new.dirty_sectors, sectors);
})); }));
stats->replicas[0].data[type] += sectors; stats->replicas[0].data[type] += sectors;
preempt_enable(); preempt_enable();
} else {
rcu_read_lock();
g = bucket(ca, b);
old = bucket_cmpxchg(g, new, ({
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
rcu_read_unlock();
}
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new)); bucket_became_unavailable(c, old, new));
} }
/* Reverting this until the copygc + compression issue is fixed: */
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
{ {
if (!sectors) if (!sectors)
...@@ -540,12 +549,14 @@ static void bch2_mark_pointer(struct bch_fs *c, ...@@ -540,12 +549,14 @@ static void bch2_mark_pointer(struct bch_fs *c,
const struct bch_extent_ptr *ptr, const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc, struct bch_extent_crc_unpacked crc,
s64 sectors, enum bch_data_type data_type, s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *stats, unsigned replicas,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags) u64 journal_seq, unsigned flags)
{ {
struct bucket_mark old, new; struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr); struct bucket *g = PTR_BUCKET(ca, ptr);
s64 uncompressed_sectors = sectors;
u64 v; u64 v;
if (crc.compression_type) { if (crc.compression_type) {
...@@ -563,6 +574,20 @@ static void bch2_mark_pointer(struct bch_fs *c, ...@@ -563,6 +574,20 @@ static void bch2_mark_pointer(struct bch_fs *c,
+__disk_sectors(crc, new_sectors); +__disk_sectors(crc, new_sectors);
} }
/*
* fs level usage (which determines free space) is in uncompressed
* sectors, until copygc + compression is sorted out:
*
* note also that we always update @fs_usage, even when we otherwise
* wouldn't do anything because gc is running - this is because the
* caller still needs to account w.r.t. its disk reservation. It is
* caller's responsibility to not apply @fs_usage if gc is in progress.
*/
fs_usage->replicas
[!ptr->cached && replicas ? replicas - 1 : 0].data
[!ptr->cached ? data_type : BCH_DATA_CACHED] +=
uncompressed_sectors;
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
if (journal_seq) if (journal_seq)
bucket_cmpxchg(g, new, ({ bucket_cmpxchg(g, new, ({
...@@ -614,7 +639,7 @@ static void bch2_mark_pointer(struct bch_fs *c, ...@@ -614,7 +639,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
old.v.counter, old.v.counter,
new.v.counter)) != old.v.counter); new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, stats, old, new); bch2_dev_usage_update(c, ca, fs_usage, old, new);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new)); bucket_became_unavailable(c, old, new));
...@@ -677,15 +702,13 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, ...@@ -677,15 +702,13 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
extent_for_each_ptr_crc(e, ptr, crc) extent_for_each_ptr_crc(e, ptr, crc)
bch2_mark_pointer(c, e, ptr, crc, sectors, data_type, bch2_mark_pointer(c, e, ptr, crc, sectors, data_type,
stats, journal_seq, flags); replicas, stats, journal_seq, flags);
if (replicas)
stats->replicas[replicas - 1].data[data_type] += sectors;
break; break;
} }
case BCH_RESERVATION: case BCH_RESERVATION:
if (replicas) if (replicas)
stats->replicas[replicas - 1].persistent_reserved += sectors; stats->replicas[replicas - 1].persistent_reserved +=
sectors * replicas;
break; break;
} }
percpu_up_read(&c->usage_lock); percpu_up_read(&c->usage_lock);
......
...@@ -62,7 +62,6 @@ struct bch_dev_usage { ...@@ -62,7 +62,6 @@ struct bch_dev_usage {
struct bch_fs_usage { struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */ /* all fields are in units of 512 byte sectors: */
/* _uncompressed_ sectors: */
u64 online_reserved; u64 online_reserved;
u64 available_cache; u64 available_cache;
......
...@@ -985,14 +985,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ...@@ -985,14 +985,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->disk_sb = *sb; ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb)); memset(sb, 0, sizeof(*sb));
if (ca->fs)
mutex_lock(&ca->fs->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->fs)
mutex_unlock(&ca->fs->sb_lock);
percpu_ref_reinit(&ca->io_ref); percpu_ref_reinit(&ca->io_ref);
return 0; return 0;
...@@ -1018,6 +1010,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) ...@@ -1018,6 +1010,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret) if (ret)
return ret; return ret;
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
mutex_unlock(&c->sb_lock);
bch2_dev_sysfs_online(c, ca); bch2_dev_sysfs_online(c, ca);
if (c->sb.nr_devices == 1) if (c->sb.nr_devices == 1)
...@@ -1295,6 +1292,24 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ...@@ -1295,6 +1292,24 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
return ret; return ret;
} }
static void dev_usage_clear(struct bch_dev *ca)
{
struct bucket_array *buckets;
int cpu;
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
per_cpu_ptr(ca->usage_percpu, cpu);
memset(p, 0, sizeof(*p));
}
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
up_read(&ca->bucket_lock);
}
/* Add new device to running filesystem: */ /* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path) int bch2_dev_add(struct bch_fs *c, const char *path)
{ {
...@@ -1333,11 +1348,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ...@@ -1333,11 +1348,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
return ret; return ret;
} }
/*
* We want to allocate journal on the new device before adding the new
* device to the filesystem because allocating after we attach requires
* spinning up the allocator thread, and the allocator thread requires
* doing btree writes, which if the existing devices are RO isn't going
* to work
*
* So we have to mark where the superblocks are, but marking allocated
* data normally updates the filesystem usage too, so we have to mark,
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
err = "journal alloc failed"; err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca); ret = bch2_dev_journal_alloc(ca);
if (ret) if (ret)
goto err; goto err;
dev_usage_clear(ca);
mutex_lock(&c->state_lock); mutex_lock(&c->state_lock);
mutex_lock(&c->sb_lock); mutex_lock(&c->sb_lock);
...@@ -1388,6 +1420,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ...@@ -1388,6 +1420,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
ca->disk_sb.sb->dev_idx = dev_idx; ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx); bch2_dev_attach(c, ca, dev_idx);
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_write_super(c); bch2_write_super(c);
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
......
...@@ -781,7 +781,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) ...@@ -781,7 +781,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
" meta: %llu\n" " meta: %llu\n"
" user: %llu\n" " user: %llu\n"
" cached: %llu\n" " cached: %llu\n"
" available: %llu\n" " available: %lli\n"
"sectors:\n" "sectors:\n"
" sb: %llu\n" " sb: %llu\n"
" journal: %llu\n" " journal: %llu\n"
...@@ -802,7 +802,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) ...@@ -802,7 +802,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
stats.buckets[BCH_DATA_BTREE], stats.buckets[BCH_DATA_BTREE],
stats.buckets[BCH_DATA_USER], stats.buckets[BCH_DATA_USER],
stats.buckets[BCH_DATA_CACHED], stats.buckets[BCH_DATA_CACHED],
__dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
stats.sectors[BCH_DATA_SB], stats.sectors[BCH_DATA_SB],
stats.sectors[BCH_DATA_JOURNAL], stats.sectors[BCH_DATA_JOURNAL],
stats.sectors[BCH_DATA_BTREE], stats.sectors[BCH_DATA_BTREE],
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment