Commit 12fcfd22 authored by Chris Mason's avatar Chris Mason

Btrfs: tree logging unlink/rename fixes

The tree logging code allows individual files or directories to be logged
without including operations on other files and directories in the FS.
It tries to commit the minimal set of changes to disk in order to
fsync the single file or directory that was sent to fsync or O_SYNC.

The tree logging code was allowing files and directories to be unlinked
if they were part of a rename operation where only one directory
in the rename was in the fsync log.  This patch adds a few new rules
to the tree logging.

1) on rename or unlink, if the inode being unlinked isn't in the fsync
log, we must force a full commit before doing an fsync of the directory
where the unlink was done.  The commit isn't done during the unlink,
but it is forced the next time we try to log the parent directory.

Solution: record transid of last unlink/rename per directory when the
directory wasn't already logged.  For renames this is only done when
renaming to a different directory.

mkdir foo/some_dir
normal commit
rename foo/some_dir foo2/some_dir
mkdir foo/some_dir
fsync foo/some_dir/some_file

The fsync above will unlink the original some_dir without recording
it in its new location (foo2).  After a crash, some_dir will be gone
unless the fsync of some_file forces a full commit

2) we must log any new names for any file or dir that is in the fsync
log.  This way we make sure not to lose files that are unlinked during
the same transaction.

2a) we must log any new names for any file or dir during rename
when the directory they are being removed from was logged.

2a is actually the more important variant.  Without the extra logging
a crash might unlink the old name without recreating the new one

3) after a crash, we must go through any directories with a link count
of zero and redo the rm -rf

mkdir f1/foo
normal commit
rm -rf f1/foo
fsync(f1)

The directory f1 was fully removed from the FS, but fsync was never
called on f1, only its parent dir.  After a crash the rm -rf must
be replayed.  This must be able to recurse down the entire
directory tree.  The inode link count fixup code takes care of the
ugly details.
Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent a74ac322
...@@ -86,12 +86,6 @@ struct btrfs_inode { ...@@ -86,12 +86,6 @@ struct btrfs_inode {
*/ */
u64 logged_trans; u64 logged_trans;
/*
* trans that last made a change that should be fully fsync'd. This
* gets reset to zero each time the inode is logged
*/
u64 log_dirty_trans;
/* total number of bytes pending delalloc, used by stat to calc the /* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file * real block usage of the file
*/ */
...@@ -121,6 +115,13 @@ struct btrfs_inode { ...@@ -121,6 +115,13 @@ struct btrfs_inode {
/* the start of block group preferred for allocations. */ /* the start of block group preferred for allocations. */
u64 block_group; u64 block_group;
/* the fsync log has some corner cases that mean we have to check
* directories to see if any unlinks have been done before
* the directory was logged. See tree-log.c for all the
* details
*/
u64 last_unlink_trans;
struct inode vfs_inode; struct inode vfs_inode;
}; };
......
...@@ -695,7 +695,12 @@ struct btrfs_fs_info { ...@@ -695,7 +695,12 @@ struct btrfs_fs_info {
u64 generation; u64 generation;
u64 last_trans_committed; u64 last_trans_committed;
u64 last_trans_new_blockgroup;
/*
* this is updated to the current trans every time a full commit
* is required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
u64 open_ioctl_trans; u64 open_ioctl_trans;
unsigned long mount_opt; unsigned long mount_opt;
u64 max_extent; u64 max_extent;
......
...@@ -5897,7 +5897,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ...@@ -5897,7 +5897,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
extent_root = root->fs_info->extent_root; extent_root = root->fs_info->extent_root;
root->fs_info->last_trans_new_blockgroup = trans->transid; root->fs_info->last_trans_log_full_commit = trans->transid;
cache = kzalloc(sizeof(*cache), GFP_NOFS); cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache) if (!cache)
......
...@@ -1173,8 +1173,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, ...@@ -1173,8 +1173,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
ret = btrfs_log_dentry_safe(trans, root, ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry); file->f_dentry);
if (ret == 0) { if (ret == 0) {
btrfs_sync_log(trans, root); ret = btrfs_sync_log(trans, root);
if (ret == 0)
btrfs_end_transaction(trans, root); btrfs_end_transaction(trans, root);
else
btrfs_commit_transaction(trans, root);
} else { } else {
btrfs_commit_transaction(trans, root); btrfs_commit_transaction(trans, root);
} }
...@@ -1266,8 +1269,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) ...@@ -1266,8 +1269,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (ret > 0) { if (ret > 0) {
ret = btrfs_commit_transaction(trans, root); ret = btrfs_commit_transaction(trans, root);
} else { } else {
btrfs_sync_log(trans, root); ret = btrfs_sync_log(trans, root);
if (ret == 0)
ret = btrfs_end_transaction(trans, root); ret = btrfs_end_transaction(trans, root);
else
ret = btrfs_commit_transaction(trans, root);
} }
mutex_lock(&dentry->d_inode->i_mutex); mutex_lock(&dentry->d_inode->i_mutex);
out: out:
......
...@@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ...@@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
inode, dir->i_ino); inode, dir->i_ino);
BUG_ON(ret != 0 && ret != -ENOENT); BUG_ON(ret != 0 && ret != -ENOENT);
if (ret != -ENOENT)
BTRFS_I(dir)->log_dirty_trans = trans->transid;
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
dir, index); dir, index);
...@@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) ...@@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
trans = btrfs_start_transaction(root, 1); trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir); btrfs_set_trans_block_group(trans, dir);
btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len); dentry->d_name.name, dentry->d_name.len);
...@@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode) ...@@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode)
bi->disk_i_size = 0; bi->disk_i_size = 0;
bi->flags = 0; bi->flags = 0;
bi->index_cnt = (u64)-1; bi->index_cnt = (u64)-1;
bi->log_dirty_trans = 0; bi->last_unlink_trans = 0;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
extent_io_tree_init(&BTRFS_I(inode)->io_tree, extent_io_tree_init(&BTRFS_I(inode)->io_tree,
inode->i_mapping, GFP_NOFS); inode->i_mapping, GFP_NOFS);
...@@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ...@@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1; drop_inode = 1;
nr = trans->blocks_used; nr = trans->blocks_used;
btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
btrfs_end_transaction_throttle(trans, root); btrfs_end_transaction_throttle(trans, root);
fail: fail:
if (drop_inode) { if (drop_inode) {
...@@ -4666,6 +4669,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ...@@ -4666,6 +4669,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
trans = btrfs_start_transaction(root, 1); trans = btrfs_start_transaction(root, 1);
/*
* this is an ugly little race, but the rename is required to make
* sure that if we crash, the inode is either at the old name
* or the new one. pinning the log transaction lets us make sure
* we don't allow a log commit to come in after we unlink the
* name but before we add the new name back in.
*/
btrfs_pin_log_trans(root);
btrfs_set_trans_block_group(trans, new_dir); btrfs_set_trans_block_group(trans, new_dir);
btrfs_inc_nlink(old_dentry->d_inode); btrfs_inc_nlink(old_dentry->d_inode);
...@@ -4673,6 +4685,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ...@@ -4673,6 +4685,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dir->i_ctime = new_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime; old_inode->i_ctime = ctime;
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
old_dentry->d_name.name, old_dentry->d_name.name,
old_dentry->d_name.len); old_dentry->d_name.len);
...@@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ...@@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret) if (ret)
goto out_fail; goto out_fail;
btrfs_log_new_name(trans, old_inode, old_dir,
new_dentry->d_parent);
out_fail: out_fail:
/* this btrfs_end_log_trans just allows the current
* log-sub transaction to complete
*/
btrfs_end_log_trans(root);
btrfs_end_transaction_throttle(trans, root); btrfs_end_transaction_throttle(trans, root);
out_unlock: out_unlock:
return ret; return ret;
......
This diff is collapsed.
...@@ -22,14 +22,9 @@ ...@@ -22,14 +22,9 @@
int btrfs_sync_log(struct btrfs_trans_handle *trans, int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root); struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_log_dentry(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry);
int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry); struct btrfs_root *root, struct dentry *dentry);
int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_root *root,
const char *name, int name_len, const char *name, int name_len,
...@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ...@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_root *root,
const char *name, int name_len, const char *name, int name_len,
struct inode *inode, u64 dirid); struct inode *inode, u64 dirid);
int btrfs_join_running_log_trans(struct btrfs_root *root);
int btrfs_end_log_trans(struct btrfs_root *root);
int btrfs_pin_log_trans(struct btrfs_root *root);
int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct dentry *parent, int exists_only);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
int for_rename);
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *old_dir,
struct dentry *parent);
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment