Commit 644d180b, authored and committed by Kent Overstreet

bcachefs: Journal replay refactoring

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 3ea2b1e1
...@@ -984,9 +984,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, ...@@ -984,9 +984,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
u64 last_seq = cur_seq, nr, seq; u64 last_seq = cur_seq, nr, seq;
if (!list_empty(journal_entries)) if (!list_empty(journal_entries))
last_seq = le64_to_cpu(list_last_entry(journal_entries, last_seq = le64_to_cpu(list_first_entry(journal_entries,
struct journal_replay, struct journal_replay,
list)->j.last_seq); list)->j.seq);
nr = cur_seq - last_seq; nr = cur_seq - last_seq;
...@@ -999,6 +999,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, ...@@ -999,6 +999,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
} }
} }
j->replay_journal_seq = last_seq;
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq; j->last_seq_ondisk = last_seq;
j->pin.front = last_seq; j->pin.front = last_seq;
j->pin.back = cur_seq; j->pin.back = cur_seq;
...@@ -1007,7 +1009,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, ...@@ -1007,7 +1009,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
fifo_for_each_entry_ptr(p, &j->pin, seq) { fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed); INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0); atomic_set(&p->count, 1);
p->devs.nr = 0; p->devs.nr = 0;
} }
...@@ -1016,10 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, ...@@ -1016,10 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
BUG_ON(seq < last_seq || seq >= cur_seq); BUG_ON(seq < last_seq || seq >= cur_seq);
p = journal_seq_pin(j, seq); journal_seq_pin(j, seq)->devs = i->devs;
atomic_set(&p->count, 1);
p->devs = i->devs;
} }
spin_lock(&j->lock); spin_lock(&j->lock);
......
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h" #include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h" #include "alloc_foreground.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h" #include "buckets.h"
#include "checksum.h" #include "checksum.h"
#include "error.h" #include "error.h"
...@@ -642,18 +639,6 @@ static void bch2_journal_read_device(struct closure *cl) ...@@ -642,18 +639,6 @@ static void bch2_journal_read_device(struct closure *cl)
goto out; goto out;
} }
void bch2_journal_entries_free(struct list_head *list)
{
while (!list_empty(list)) {
struct journal_replay *i =
list_first_entry(list, struct journal_replay, list);
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
}
}
int bch2_journal_read(struct bch_fs *c, struct list_head *list) int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{ {
struct journal_list jlist; struct journal_list jlist;
...@@ -733,121 +718,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) ...@@ -733,121 +718,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
return ret; return ret;
} }
/* journal replay: */
/*
 * Replay a single extent key @k from the journal into BTREE_ID_EXTENTS.
 *
 * The key is inserted piece by piece: each pass copies @k, cuts off the
 * part before the iterator's current position, trims the remainder with
 * bch2_extent_trim_atomic() and commits that piece.  The loop repeats
 * until the iterator position reaches the end of @k (k->k.p) or an error
 * other than -EINTR is returned.
 *
 * Returns the last value produced by bch2_btree_iter_traverse() or
 * bch2_trans_commit(); 0 on success.
 */
static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
{
struct btree_trans trans;
struct btree_iter *iter;
/*
* We might cause compressed extents to be
* split, so we need to pass in a
* disk_reservation:
*/
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
BKEY_PADDED(k) split;
int ret;
bch2_trans_init(&trans, c);
/* Start iterating at the first position covered by @k. */
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
do {
ret = bch2_btree_iter_traverse(iter);
if (ret)
break;
/* Work on a private copy so @k itself is never modified. */
bkey_copy(&split.k, k);
bch2_cut_front(iter->pos, &split.k);
bch2_extent_trim_atomic(&split.k, iter);
/*
 * Reserve size * (number of dirty pointers) for this piece.  The
 * reservation is requested with NOFAIL, so a non-zero return is
 * treated as a bug.
 */
ret = bch2_disk_reservation_add(c, &disk_res,
split.k.k.size *
bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
ret = bch2_trans_commit(&trans, &disk_res, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY);
/* -EINTR is treated like success here: the loop goes round again. */
} while ((!ret || ret == -EINTR) &&
bkey_cmp(k->k.p, iter->pos));
bch2_disk_reservation_put(c, &disk_res);
/*
* This isn't strictly correct - we should only be relying on the btree
* node lock for synchronization with gc when we've got a write lock
* held.
*
* but - there are other correctness issues if btree gc were to run
* before journal replay finishes
*/
BUG_ON(c->gc_pos.phase);
/*
 * Called with the negated size of the whole original key, and reached
 * even when ret is non-zero.
 * NOTE(review): presumably this reverses accounting done for @k
 * earlier in recovery - confirm against bch2_mark_key() callers.
 */
bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
NULL, 0, 0);
bch2_trans_exit(&trans);
return ret;
}
/*
 * Replay every key of every journal entry on @list, then free the list.
 *
 * For each journal entry, j->replay_journal_seq is set to that entry's
 * sequence number, each key is replayed according to its btree id, and
 * bch2_journal_pin_put() is called for that sequence number once all of
 * the entry's keys are done.
 *
 * After the last entry, replay_journal_seq is reset to 0, replay is
 * flagged as done, all pins are flushed and the result of
 * bch2_journal_error() is returned.  If replaying a key fails, the error
 * is logged and returned immediately.  @list is freed on both paths.
 */
int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
{
	struct journal *j = &c->journal;
	struct journal_replay *jr, *jr_next;
	struct jset_entry *jent;
	struct bkey_i *key, *key_next;
	int ret = 0;

	list_for_each_entry_safe(jr, jr_next, list, list) {
		j->replay_journal_seq = le64_to_cpu(jr->j.seq);

		for_each_jset_key(key, key_next, jent, &jr->j) {
			if (jent->btree_id == BTREE_ID_ALLOC)
				ret = bch2_alloc_replay_key(c, key);
			else if (jent->btree_id == BTREE_ID_EXTENTS)
				ret = bch2_extent_replay_key(c, key);
			else
				ret = bch2_btree_insert(c, jent->btree_id, key,
							NULL, NULL,
							BTREE_INSERT_NOFAIL|
							BTREE_INSERT_LAZY_RW|
							BTREE_INSERT_JOURNAL_REPLAY|
							BTREE_INSERT_NOMARK);

			if (ret) {
				bch_err(c, "journal replay: error %d while replaying key",
					ret);
				goto err;
			}

			cond_resched();
		}

		/* All keys of this entry are replayed. */
		bch2_journal_pin_put(j, j->replay_journal_seq);
	}

	j->replay_journal_seq = 0;

	bch2_journal_set_replay_done(j);
	bch2_journal_flush_all_pins(j);
	ret = bch2_journal_error(j);
err:
	bch2_journal_entries_free(list);
	return ret;
}
/* journal write: */ /* journal write: */
static void __journal_write_alloc(struct journal *j, static void __journal_write_alloc(struct journal *j,
......
...@@ -36,8 +36,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, ...@@ -36,8 +36,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
vstruct_for_each_safe(entry, k, _n) vstruct_for_each_safe(entry, k, _n)
int bch2_journal_read(struct bch_fs *, struct list_head *); int bch2_journal_read(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);
void bch2_journal_write(struct closure *); void bch2_journal_write(struct closure *);
......
...@@ -203,6 +203,7 @@ struct journal { ...@@ -203,6 +203,7 @@ struct journal {
} pin; } pin;
u64 replay_journal_seq; u64 replay_journal_seq;
u64 replay_journal_seq_end;
struct write_point wp; struct write_point wp;
spinlock_t err_lock; spinlock_t err_lock;
......
...@@ -12,94 +12,162 @@ ...@@ -12,94 +12,162 @@
#include "error.h" #include "error.h"
#include "fsck.h" #include "fsck.h"
#include "journal_io.h" #include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h" #include "journal_seq_blacklist.h"
#include "quota.h" #include "quota.h"
#include "recovery.h" #include "recovery.h"
#include "replicas.h" #include "replicas.h"
#include "super-io.h" #include "super-io.h"
#include <linux/sort.h>
#include <linux/stat.h> #include <linux/stat.h>
#define QSTR(n) { { { .len = strlen(n) } }, .name = n } #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
static struct bkey_i *btree_root_find(struct bch_fs *c, /* journal replay: */
struct bch_sb_field_clean *clean,
struct jset *j, static void bch2_journal_entries_free(struct list_head *list)
enum btree_id id, unsigned *level)
{ {
struct bkey_i *k;
struct jset_entry *entry, *start, *end;
if (clean) { while (!list_empty(list)) {
start = clean->start; struct journal_replay *i =
end = vstruct_end(&clean->field); list_first_entry(list, struct journal_replay, list);
} else { list_del(&i->list);
start = j->start; kvpfree(i, offsetof(struct journal_replay, j) +
end = vstruct_last(j); vstruct_bytes(&i->j));
} }
}
for (entry = start; entry < end; entry = vstruct_next(entry)) static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
if (entry->type == BCH_JSET_ENTRY_btree_root && {
entry->btree_id == id) struct btree_trans trans;
goto found; struct btree_iter *iter;
/*
* We might cause compressed extents to be
* split, so we need to pass in a
* disk_reservation:
*/
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
BKEY_PADDED(k) split;
int ret;
return NULL; bch2_trans_init(&trans, c);
found:
if (!entry->u64s)
return ERR_PTR(-EINVAL);
k = entry->start; iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
*level = entry->level; bkey_start_pos(&k->k),
return k; BTREE_ITER_INTENT);
} do {
ret = bch2_btree_iter_traverse(iter);
if (ret)
break;
static int verify_superblock_clean(struct bch_fs *c, bkey_copy(&split.k, k);
struct bch_sb_field_clean **cleanp, bch2_cut_front(iter->pos, &split.k);
struct jset *j) bch2_extent_trim_atomic(&split.k, iter);
{
unsigned i;
struct bch_sb_field_clean *clean = *cleanp;
int ret = 0;
if (!clean || !j) ret = bch2_disk_reservation_add(c, &disk_res,
return 0; split.k.k.size *
bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ret = bch2_trans_commit(&trans, &disk_res, NULL,
le64_to_cpu(clean->journal_seq), BTREE_INSERT_ATOMIC|
le64_to_cpu(j->seq))) { BTREE_INSERT_NOFAIL|
kfree(clean); BTREE_INSERT_LAZY_RW|
*cleanp = NULL; BTREE_INSERT_JOURNAL_REPLAY);
return 0; } while ((!ret || ret == -EINTR) &&
bkey_cmp(k->k.p, iter->pos));
bch2_disk_reservation_put(c, &disk_res);
/*
* This isn't strictly correct - we should only be relying on the btree
* node lock for synchronization with gc when we've got a write lock
* held.
*
* but - there are other correctness issues if btree gc were to run
* before journal replay finishes
*/
BUG_ON(c->gc_pos.phase);
bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
NULL, 0, 0);
bch2_trans_exit(&trans);
return ret;
}
static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id btree_id,
struct bkey_i *k)
{
switch (btree_id) {
case BTREE_ID_ALLOC:
return bch2_alloc_replay_key(c, k);
case BTREE_ID_EXTENTS:
return bch2_extent_replay_key(c, k);
default:
return bch2_btree_insert(c, btree_id, k,
NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY|
BTREE_INSERT_NOMARK);
} }
}
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, static void replay_now_at(struct journal *j, u64 seq)
"superblock read clock doesn't match journal after clean shutdown"); {
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, BUG_ON(seq < j->replay_journal_seq);
"superblock read clock doesn't match journal after clean shutdown"); BUG_ON(seq > j->replay_journal_seq_end);
for (i = 0; i < BTREE_ID_NR; i++) { while (j->replay_journal_seq < seq)
struct bkey_i *k1, *k2; bch2_journal_pin_put(j, j->replay_journal_seq++);
unsigned l1 = 0, l2 = 0; }
k1 = btree_root_find(c, clean, NULL, i, &l1); static int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
k2 = btree_root_find(c, NULL, j, i, &l2); {
struct journal *j = &c->journal;
struct bkey_i *k, *_n;
struct jset_entry *entry;
struct journal_replay *i, *n;
int ret = 0;
if (!k1 && !k2) list_for_each_entry_safe(i, n, list, list) {
continue; replay_now_at(j, le64_to_cpu(i->j.seq));
mustfix_fsck_err_on(!k1 || !k2 || for_each_jset_key(k, _n, entry, &i->j) {
IS_ERR(k1) || ret = bch2_journal_replay_key(c, entry->btree_id, k);
IS_ERR(k2) || if (ret) {
k1->k.u64s != k2->k.u64s || bch_err(c, "journal replay: error %d while replaying key",
memcmp(k1, k2, bkey_bytes(k1)) || ret);
l1 != l2, c, goto err;
"superblock btree root doesn't match journal after clean shutdown"); }
cond_resched();
}
} }
fsck_err:
replay_now_at(j, j->replay_journal_seq_end);
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
ret = bch2_journal_error(j);
err:
bch2_journal_entries_free(list);
return ret; return ret;
} }
static bool journal_empty(struct list_head *journal)
{
return list_empty(journal) ||
journal_entry_empty(&list_last_entry(journal,
struct journal_replay, list)->j);
}
static int static int
verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
struct list_head *journal) struct list_head *journal)
...@@ -130,40 +198,7 @@ verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, ...@@ -130,40 +198,7 @@ verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
return ret; return ret;
} }
static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) /* journal replay early: */
{
struct bch_sb_field_clean *clean, *sb_clean;
int ret;
mutex_lock(&c->sb_lock);
sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
if (fsck_err_on(!sb_clean, c,
"superblock marked clean but clean section not present")) {
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
mutex_unlock(&c->sb_lock);
return NULL;
}
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
GFP_KERNEL);
if (!clean) {
mutex_unlock(&c->sb_lock);
return ERR_PTR(-ENOMEM);
}
if (le16_to_cpu(c->disk_sb.sb->version) <
bcachefs_metadata_version_bkey_renumber)
bch2_sb_clean_renumber(clean, READ);
mutex_unlock(&c->sb_lock);
return clean;
fsck_err:
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
}
static int journal_replay_entry_early(struct bch_fs *c, static int journal_replay_entry_early(struct bch_fs *c,
struct jset_entry *entry) struct jset_entry *entry)
...@@ -275,6 +310,121 @@ static int journal_replay_early(struct bch_fs *c, ...@@ -275,6 +310,121 @@ static int journal_replay_early(struct bch_fs *c,
return 0; return 0;
} }
/* sb clean section: */
/*
 * Look up the btree_root entry for btree @id.
 *
 * The entries scanned come from @clean when it is non-NULL, otherwise
 * from journal entry @j.
 *
 * Returns NULL when no matching entry exists, ERR_PTR(-EINVAL) when the
 * matching entry has u64s == 0, and otherwise the entry's key, with
 * *@level set to the entry's level.  *@level is left untouched unless a
 * key is returned.
 */
static struct bkey_i *btree_root_find(struct bch_fs *c,
				      struct bch_sb_field_clean *clean,
				      struct jset *j,
				      enum btree_id id, unsigned *level)
{
	struct jset_entry *cur, *first, *limit;

	if (!clean) {
		first = j->start;
		limit = vstruct_last(j);
	} else {
		first = clean->start;
		limit = vstruct_end(&clean->field);
	}

	for (cur = first; cur < limit; cur = vstruct_next(cur)) {
		if (cur->type != BCH_JSET_ENTRY_btree_root ||
		    cur->btree_id != id)
			continue;

		/* First match decides the result. */
		if (!cur->u64s)
			return ERR_PTR(-EINVAL);

		*level = cur->level;
		return cur->start;
	}

	return NULL;
}
/*
 * Cross-check the superblock clean section (*@cleanp) against journal
 * entry @j.
 *
 * Does nothing and returns 0 when either *@cleanp or @j is NULL.
 *
 * If the journal sequence numbers disagree and the fsck check fires, the
 * clean section is freed, *@cleanp is set to NULL and 0 is returned, so
 * the caller carries on without a clean section.
 *
 * Otherwise the read clock, the write clock and the root of every btree
 * id are compared, each through mustfix_fsck_err_on().
 *
 * Returns @ret, which starts at 0.
 * NOTE(review): presumably the fsck macros assign ret before jumping to
 * the fsck_err label - confirm against their definition.
 */
static int verify_superblock_clean(struct bch_fs *c,
				   struct bch_sb_field_clean **cleanp,
				   struct jset *j)
{
	unsigned i;
	struct bch_sb_field_clean *clean = *cleanp;
	int ret = 0;

	if (!clean || !j)
		return 0;

	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
			le64_to_cpu(clean->journal_seq),
			le64_to_cpu(j->seq))) {
		kfree(clean);
		*cleanp = NULL;
		return 0;
	}

	mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
			"superblock read clock doesn't match journal after clean shutdown");
	/* This check is for the write clock; the message must say so. */
	mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
			"superblock write clock doesn't match journal after clean shutdown");

	for (i = 0; i < BTREE_ID_NR; i++) {
		struct bkey_i *k1, *k2;
		unsigned l1 = 0, l2 = 0;

		k1 = btree_root_find(c, clean, NULL, i, &l1);
		k2 = btree_root_find(c, NULL, j, i, &l2);

		/* A root absent from both sides is not a mismatch. */
		if (!k1 && !k2)
			continue;

		/*
		 * Short-circuit evaluation keeps NULL and ERR_PTR values
		 * from being dereferenced by the size and memcmp() checks.
		 */
		mustfix_fsck_err_on(!k1 || !k2 ||
				    IS_ERR(k1) ||
				    IS_ERR(k2) ||
				    k1->k.u64s != k2->k.u64s ||
				    memcmp(k1, k2, bkey_bytes(k1)) ||
				    l1 != l2, c,
			"superblock btree root doesn't match journal after clean shutdown");
	}
fsck_err:
	return ret;
}
/*
 * Return a kmemdup()'d copy of the superblock's clean section, taken
 * under c->sb_lock.
 *
 * Returns:
 *  - NULL when the clean section is missing and the fsck check fires; the
 *    superblock's clean flag and c->sb.clean are cleared first,
 *  - ERR_PTR(-ENOMEM) when the copy cannot be allocated,
 *  - ERR_PTR(ret) when control reaches the fsck_err label,
 *  - otherwise the copy, which has been passed through
 *    bch2_sb_clean_renumber(clean, READ) if the on-disk superblock version
 *    is older than bcachefs_metadata_version_bkey_renumber.
 *
 * NOTE(review): ret is never assigned in this function's own text;
 * presumably fsck_err_on() sets it before jumping to fsck_err - confirm.
 */
static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
{
	struct bch_sb_field_clean *clean, *sb_clean;
	int ret;

	mutex_lock(&c->sb_lock);
	sb_clean = bch2_sb_get_clean(c->disk_sb.sb);

	if (fsck_err_on(!sb_clean, c,
			"superblock marked clean but clean section not present")) {
		SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
		c->sb.clean = false;
		clean = NULL;
		goto out;
	}

	clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
			GFP_KERNEL);
	if (!clean) {
		clean = ERR_PTR(-ENOMEM);
		goto out;
	}

	if (le16_to_cpu(c->disk_sb.sb->version) <
	    bcachefs_metadata_version_bkey_renumber)
		bch2_sb_clean_renumber(clean, READ);
out:
	mutex_unlock(&c->sb_lock);
	return clean;
fsck_err:
	mutex_unlock(&c->sb_lock);
	return ERR_PTR(ret);
}
static int read_btree_roots(struct bch_fs *c) static int read_btree_roots(struct bch_fs *c)
{ {
unsigned i; unsigned i;
...@@ -320,13 +470,6 @@ static int read_btree_roots(struct bch_fs *c) ...@@ -320,13 +470,6 @@ static int read_btree_roots(struct bch_fs *c)
return ret; return ret;
} }
static bool journal_empty(struct list_head *journal)
{
return list_empty(journal) ||
journal_entry_empty(&list_last_entry(journal,
struct journal_replay, list)->j);
}
int bch2_fs_recovery(struct bch_fs *c) int bch2_fs_recovery(struct bch_fs *c)
{ {
const char *err = "cannot allocate memory"; const char *err = "cannot allocate memory";
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment