Commit 5792fa6b, authored by Chris Mason, committed by Greg Kroah-Hartman

btrfs: disable strict file flushes for renames and truncates

commit 8d875f95 upstream.

Truncates and renames are often used to replace old versions of a file
with new versions.  Applications often expect this to be an atomic
replacement, even if they haven't done anything to make sure the new
version is fully on disk.

Btrfs has strict flushing in place to make sure that renaming over an
old file with a new file will fully flush out the new file before
allowing the transaction commit with the rename to complete.

This ordering means the commit code needs to be able to lock file pages,
and there are a few paths in the filesystem where we will try to end a
transaction with the page lock held.  It's rare, but these things can
deadlock.

This patch removes the ordered flushes and switches to a best effort
filemap_flush like ext4 uses. It's not perfect, but it should fix the
deadlocks.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 8e46c5dc
...@@ -84,12 +84,6 @@ struct btrfs_inode { ...@@ -84,12 +84,6 @@ struct btrfs_inode {
*/ */
struct list_head delalloc_inodes; struct list_head delalloc_inodes;
/*
* list for tracking inodes that must be sent to disk before a
* rename or truncate commit
*/
struct list_head ordered_operations;
/* node for the red-black tree that links inodes in subvolume root */ /* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node; struct rb_node rb_node;
......
...@@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work); ...@@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root); static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only); int read_only);
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root); struct btrfs_root *root);
...@@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root) ...@@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
btrfs_cleanup_transaction(root); btrfs_cleanup_transaction(root);
} }
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
INIT_LIST_HEAD(&splice);
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
list_del_init(&btrfs_inode->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
spin_lock(&root->fs_info->ordered_root_lock);
}
spin_unlock(&root->fs_info->ordered_root_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}
static void btrfs_destroy_ordered_extents(struct btrfs_root *root) static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{ {
struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *ordered;
...@@ -4093,8 +4063,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, ...@@ -4093,8 +4063,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root) struct btrfs_root *root)
{ {
btrfs_destroy_ordered_operations(cur_trans, root);
btrfs_destroy_delayed_refs(cur_trans, root); btrfs_destroy_delayed_refs(cur_trans, root);
cur_trans->state = TRANS_STATE_COMMIT_START; cur_trans->state = TRANS_STATE_COMMIT_START;
......
...@@ -1838,33 +1838,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, ...@@ -1838,33 +1838,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
int btrfs_release_file(struct inode *inode, struct file *filp) int btrfs_release_file(struct inode *inode, struct file *filp)
{ {
/*
* ordered_data_close is set by settattr when we are about to truncate
* a file from a non-zero size to a zero size. This tries to
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
*/
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags)) {
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
/*
* We need to block on a committing transaction to keep us from
* throwing a ordered operation on to the list and causing
* something like sync to deadlock trying to flush out this
* inode.
*/
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
btrfs_end_transaction(trans, root);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
}
if (filp->private_data) if (filp->private_data)
btrfs_ioctl_trans_end(filp); btrfs_ioctl_trans_end(filp);
filemap_flush(inode->i_mapping);
return 0; return 0;
} }
......
...@@ -7950,27 +7950,6 @@ static int btrfs_truncate(struct inode *inode) ...@@ -7950,27 +7950,6 @@ static int btrfs_truncate(struct inode *inode)
min_size); min_size);
BUG_ON(ret); BUG_ON(ret);
/*
* setattr is responsible for setting the ordered_data_close flag,
* but that is only tested during the last file release. That
* could happen well after the next commit, leaving a great big
* window where new writes may get lost if someone chooses to write
* to this file after truncating to zero
*
* The inode doesn't have any dirty data here, and so if we commit
* this is a noop. If someone immediately starts writing to the inode
* it is very likely we'll catch some of their writes in this
* transaction, and the commit will find this file on the ordered
* data list with good things to send down.
*
* This is a best effort solution, there is still a window where
* using truncate to replace the contents of the file will
* end up with a zero length file after a crash.
*/
if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags))
btrfs_add_ordered_operation(trans, root, inode);
/* /*
* So if we truncate and then write and fsync we normally would just * So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to * write the extents that changed, which is a problem if we need to
...@@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ...@@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->delalloc_mutex); mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree); btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->ordered_operations);
RB_CLEAR_NODE(&ei->rb_node); RB_CLEAR_NODE(&ei->rb_node);
return inode; return inode;
...@@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode) ...@@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
if (!root) if (!root)
goto free; goto free;
/*
* Make sure we're properly removed from the ordered operation
* lists.
*/
smp_mb();
if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
}
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags)) { &BTRFS_I(inode)->runtime_flags)) {
btrfs_info(root->fs_info, "inode %llu still on the orphan list", btrfs_info(root->fs_info, "inode %llu still on the orphan list",
...@@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ...@@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = 0; ret = 0;
/* /*
* we're using rename to replace one file with another. * we're using rename to replace one file with another. Start IO on it
* and the replacement file is large. Start IO on it now so * now so we don't add too much work to the end of the transaction
* we don't add too much work to the end of the transaction
*/ */
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(old_inode->i_mapping); filemap_flush(old_inode->i_mapping);
/* close the racy window with snapshot create/destroy ioctl */ /* close the racy window with snapshot create/destroy ioctl */
...@@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ...@@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
*/ */
btrfs_pin_log_trans(root); btrfs_pin_log_trans(root);
} }
/*
* make sure the inode gets flushed if it is replacing
* something.
*/
if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
btrfs_add_ordered_operation(trans, root, old_inode);
inode_inc_iversion(old_dir); inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir); inode_inc_iversion(new_dir);
......
...@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, ...@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
trace_btrfs_ordered_extent_remove(inode, entry); trace_btrfs_ordered_extent_remove(inode, entry);
/*
* we have no more ordered extents for this inode and
* no dirty pages. We can safely remove it from the
* list of ordered extents
*/
if (RB_EMPTY_ROOT(&tree->tree) &&
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
}
if (!root->nr_ordered_extents) { if (!root->nr_ordered_extents) {
spin_lock(&root->fs_info->ordered_root_lock); spin_lock(&root->fs_info->ordered_root_lock);
BUG_ON(list_empty(&root->ordered_root)); BUG_ON(list_empty(&root->ordered_root));
...@@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) ...@@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
mutex_unlock(&fs_info->ordered_operations_mutex); mutex_unlock(&fs_info->ordered_operations_mutex);
} }
/*
* this is used during transaction commit to write all the inodes
* added to the ordered operation list. These files must be fully on
* disk before the transaction commits.
*
* we have two modes here, one is to just start the IO via filemap_flush
* and the other is to wait for all the io. When we wait, we have an
* extra check to make sure the ordered operation list really is empty
* before we return
*/
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
struct btrfs_transaction *cur_trans = trans->transaction;
struct list_head splice;
struct list_head works;
struct btrfs_delalloc_work *work, *next;
int ret = 0;
INIT_LIST_HEAD(&splice);
INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&cur_trans->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
inode = &btrfs_inode->vfs_inode;
list_del_init(&btrfs_inode->ordered_operations);
/*
* the inode may be getting freed (in sys_unlink path).
*/
inode = igrab(inode);
if (!inode)
continue;
if (!wait)
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
work = btrfs_alloc_delalloc_work(inode, wait, 1);
if (!work) {
spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations))
list_add_tail(&btrfs_inode->ordered_operations,
&splice);
list_splice_tail(&splice,
&cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
ret = -ENOMEM;
goto out;
}
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
cond_resched();
spin_lock(&root->fs_info->ordered_root_lock);
}
spin_unlock(&root->fs_info->ordered_root_lock);
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
return ret;
}
/* /*
* Used to start IO or wait for a given ordered extent to finish. * Used to start IO or wait for a given ordered extent to finish.
* *
...@@ -1120,42 +1033,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, ...@@ -1120,42 +1033,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
return index; return index;
} }
/*
* add a given inode to the list of inodes that must be fully on
* disk before a transaction commit finishes.
*
* This basically gives us the ext3 style data=ordered mode, and it is mostly
* used to make sure renamed files are fully on disk.
*
* It is a noop if the inode is already fully on disk.
*
* If trans is not null, we'll do a friendly check for a transaction that
* is already flushing things and force the IO down ourselves.
*/
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
struct btrfs_transaction *cur_trans = trans->transaction;
u64 last_mod;
last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
/*
* if this file hasn't been changed since the last transaction
* commit, we can safely return without doing anything
*/
if (last_mod <= root->fs_info->last_trans_committed)
return;
spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_root_lock);
}
int __init ordered_data_init(void) int __init ordered_data_init(void)
{ {
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
......
...@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, ...@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered); struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len); u32 *sum, int len);
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int wait);
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
void btrfs_get_logged_extents(struct inode *inode, void btrfs_get_logged_extents(struct inode *inode,
......
...@@ -218,7 +218,6 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type) ...@@ -218,7 +218,6 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
spin_lock_init(&cur_trans->delayed_refs.lock); spin_lock_init(&cur_trans->delayed_refs.lock);
INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->ordered_operations);
INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->switch_commits);
list_add_tail(&cur_trans->list, &fs_info->trans_list); list_add_tail(&cur_trans->list, &fs_info->trans_list);
...@@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, ...@@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans); kmem_cache_free(btrfs_trans_handle_cachep, trans);
} }
static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
int ret;
ret = btrfs_run_delayed_items(trans, root);
if (ret)
return ret;
/*
* rename don't use btrfs_join_transaction, so, once we
* set the transaction to blocked above, we aren't going
* to get any new ordered operations. We can safely run
* it here and no for sure that nothing new will be added
* to the list
*/
ret = btrfs_run_ordered_operations(trans, root, 1);
return ret;
}
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{ {
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
...@@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ...@@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *prev_trans = NULL; struct btrfs_transaction *prev_trans = NULL;
int ret; int ret;
ret = btrfs_run_ordered_operations(trans, root, 0);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
btrfs_end_transaction(trans, root);
return ret;
}
/* Stop the commit early if ->aborted is set */ /* Stop the commit early if ->aborted is set */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted; ret = cur_trans->aborted;
...@@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ...@@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) if (ret)
goto cleanup_transaction; goto cleanup_transaction;
ret = btrfs_flush_all_pending_stuffs(trans, root); ret = btrfs_run_delayed_items(trans, root);
if (ret) if (ret)
goto cleanup_transaction; goto cleanup_transaction;
...@@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ...@@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_read(cur_trans) == 0); extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */ /* some pending stuffs might be added after the previous flush. */
ret = btrfs_flush_all_pending_stuffs(trans, root); ret = btrfs_run_delayed_items(trans, root);
if (ret) if (ret)
goto cleanup_transaction; goto cleanup_transaction;
......
...@@ -55,7 +55,6 @@ struct btrfs_transaction { ...@@ -55,7 +55,6 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait; wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait; wait_queue_head_t commit_wait;
struct list_head pending_snapshots; struct list_head pending_snapshots;
struct list_head ordered_operations;
struct list_head pending_chunks; struct list_head pending_chunks;
struct list_head switch_commits; struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs; struct btrfs_delayed_ref_root delayed_refs;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment