Commit 9b23fdbd authored by Kent Overstreet

bcachefs: bcachefs_metadata_version_inode_has_child_snapshots

There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.

In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.

This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".

To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
that accounted for snapshots), was:
  Unlinked, non-open files are deleted, either at recovery time or
  during online fsck.

The new rule is:
  Unlinked, non-open files, that do not exist in child snapshots, are
  deleted.

To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.

For transactional consistency, clearing it is handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent cba31b7e
...@@ -678,7 +678,8 @@ struct bch_sb_field_ext { ...@@ -678,7 +678,8 @@ struct bch_sb_field_ext {
x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v2, BCH_VERSION(1, 9)) \
x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \
x(disk_accounting_inum, BCH_VERSION(1, 11)) \ x(disk_accounting_inum, BCH_VERSION(1, 11)) \
x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
x(inode_has_child_snapshots, BCH_VERSION(1, 13))
enum bcachefs_metadata_version { enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9, bcachefs_metadata_version_min = 9,
......
...@@ -174,11 +174,30 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { ...@@ -174,11 +174,30 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
.automatic_shrinking = true, .automatic_shrinking = true,
}; };
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{ {
return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
} }
/*
 * Report whether the inode at btree position @p must be treated as open.
 *
 * Returns false when BCH_FS_started is not set in c->flags. Otherwise @p is
 * mapped to a subvol_inum (subvolume from the snapshot table entry for
 * p.snapshot, inode number from p.offset) and the result is whether
 * __bch2_inode_hash_find() finds an entry for it.
 *
 * A snapshot that has no subvolume is reported as open (true), after a
 * ratelimited warning.
 */
bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
{
/* Filesystem not marked started: report "not open" without any lookup */
if (!test_bit(BCH_FS_started, &c->flags))
return false;
subvol_inum inum = {
.subvol = snapshot_t(c, p.snapshot)->subvol,
.inum = p.offset,
};
/* snapshot tree interior node, can't safely delete while online (yet) */
if (!inum.subvol) {
bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
return true;
}
/* Open iff the (subvol, inum) pair is found by the hash lookup */
return __bch2_inode_hash_find(c, inum) != NULL;
}
static void __wait_on_freeing_inode(struct bch_fs *c, static void __wait_on_freeing_inode(struct bch_fs *c,
struct bch_inode_info *inode, struct bch_inode_info *inode,
subvol_inum inum) subvol_inum inum)
......
...@@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) ...@@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
return inode->ei_inum; return inode->ei_inum;
} }
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
/* /*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and * Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent: * btree inode may be inconsistent:
...@@ -181,6 +179,8 @@ void bch2_inode_update_after_write(struct btree_trans *, ...@@ -181,6 +179,8 @@ void bch2_inode_update_after_write(struct btree_trans *,
int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
inode_set_fn, void *, unsigned); inode_set_fn, void *, unsigned);
bool bch2_inode_is_open(struct bch_fs *c, struct bpos p);
int bch2_setattr_nonsize(struct mnt_idmap *, int bch2_setattr_nonsize(struct mnt_idmap *,
struct bch_inode_info *, struct bch_inode_info *,
struct iattr *); struct iattr *);
...@@ -198,10 +198,7 @@ int bch2_vfs_init(void); ...@@ -198,10 +198,7 @@ int bch2_vfs_init(void);
#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) static inline bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { return false; }
{
return NULL;
}
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {} snapshot_id_list *s) {}
......
...@@ -1096,22 +1096,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans, ...@@ -1096,22 +1096,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
return ret; return ret;
} }
/*
 * Report whether the inode at btree position @p must be treated as open.
 *
 * @p is mapped to a subvol_inum (subvolume from the snapshot table entry for
 * p.snapshot, inode number from p.offset) and the result is whether
 * __bch2_inode_hash_find() finds an entry for it.
 *
 * A snapshot that has no subvolume is reported as open (true), after a
 * ratelimited warning.
 */
static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
{
subvol_inum inum = {
.subvol = snapshot_t(c, p.snapshot)->subvol,
.inum = p.offset,
};
/* snapshot tree corruption, can't safely delete */
if (!inum.subvol) {
bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
return true;
}
/* Open iff the (subvol, inum) pair is found by the hash lookup */
return __bch2_inode_hash_find(c, inum) != NULL;
}
static int check_inode(struct btree_trans *trans, static int check_inode(struct btree_trans *trans,
struct btree_iter *iter, struct btree_iter *iter,
struct bkey_s_c k, struct bkey_s_c k,
...@@ -1184,28 +1168,27 @@ static int check_inode(struct btree_trans *trans, ...@@ -1184,28 +1168,27 @@ static int check_inode(struct btree_trans *trans,
ret = 0; ret = 0;
} }
if ((u.bi_flags & BCH_INODE_unlinked) && ret = bch2_inode_has_child_snapshots(trans, k.k->p);
bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { if (ret < 0)
struct bpos new_min_pos; goto err;
ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
if (ret)
goto err;
u.bi_flags &= ~BCH_INODE_unlinked;
ret = __bch2_fsck_write_inode(trans, &u);
bch_err_msg(c, ret, "in fsck updating inode"); if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
trans, inode_has_child_snapshots_wrong,
"inode has_child_snapshots flag wrong (should be %u)\n%s",
ret,
(printbuf_reset(&buf),
bch2_inode_unpacked_to_text(&buf, &u),
buf.buf))) {
if (ret) if (ret)
goto err_noprint; u.bi_flags |= BCH_INODE_has_child_snapshot;
else
if (!bpos_eq(new_min_pos, POS_MIN)) u.bi_flags &= ~BCH_INODE_has_child_snapshot;
bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); do_update = true;
goto err_noprint;
} }
ret = 0;
if (u.bi_flags & BCH_INODE_unlinked) { if ((u.bi_flags & BCH_INODE_unlinked) &&
!(u.bi_flags & BCH_INODE_has_child_snapshot)) {
if (!test_bit(BCH_FS_started, &c->flags)) { if (!test_bit(BCH_FS_started, &c->flags)) {
/* /*
* If we're not in online fsck, don't delete unlinked * If we're not in online fsck, don't delete unlinked
......
This diff is collapsed.
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "bkey.h" #include "bkey.h"
#include "bkey_methods.h" #include "bkey_methods.h"
#include "opts.h" #include "opts.h"
#include "snapshot.h"
enum bch_validate_flags; enum bch_validate_flags;
extern const char * const bch2_inode_opts[]; extern const char * const bch2_inode_opts[];
...@@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, ...@@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
enum bch_validate_flags); enum bch_validate_flags);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
/*
 * Inline wrapper around __bch2_inode_has_child_snapshots().
 *
 * When bch2_snapshot_is_leaf() returns a positive value for pos.snapshot,
 * this returns 0 without calling the helper. When it returns zero or a
 * negative value, the helper's result for @pos is returned unchanged.
 */
static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
{
return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
? __bch2_inode_has_child_snapshots(trans, pos)
: 0;
}
int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s, struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags); enum btree_iter_update_trigger_flags);
......
...@@ -133,7 +133,8 @@ enum inode_opt_id { ...@@ -133,7 +133,8 @@ enum inode_opt_id {
x(i_size_dirty, 5) \ x(i_size_dirty, 5) \
x(i_sectors_dirty, 6) \ x(i_sectors_dirty, 6) \
x(unlinked, 7) \ x(unlinked, 7) \
x(backptr_untrusted, 8) x(backptr_untrusted, 8) \
x(has_child_snapshot, 9)
/* bits 20+ reserved for packed fields below: */ /* bits 20+ reserved for packed fields below: */
......
...@@ -78,7 +78,10 @@ ...@@ -78,7 +78,10 @@
BCH_FSCK_ERR_accounting_mismatch) \ BCH_FSCK_ERR_accounting_mismatch) \
x(rebalance_work_acct_fix, \ x(rebalance_work_acct_fix, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch) BCH_FSCK_ERR_accounting_mismatch) \
x(inode_has_child_snapshots, \
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BCH_FSCK_ERR_inode_has_child_snapshots_wrong)
#define DOWNGRADE_TABLE() \ #define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \ x(bucket_stripe_sectors, \
......
...@@ -225,11 +225,13 @@ enum bch_fsck_flags { ...@@ -225,11 +225,13 @@ enum bch_fsck_flags {
x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
x(inode_has_child_snapshots_wrong, 287, 0) \
x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \ x(extent_overlapping, 215, 0) \
x(key_in_missing_inode, 216, 0) \ x(key_in_missing_inode, 216, 0) \
x(key_in_wrong_inode_type, 217, 0) \ x(key_in_wrong_inode_type, 217, 0) \
...@@ -298,7 +300,7 @@ enum bch_fsck_flags { ...@@ -298,7 +300,7 @@ enum bch_fsck_flags {
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(MAX, 287, 0) x(MAX, 289, 0)
enum bch_sb_error_id { enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n, #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment