Commit 28a23593 authored by Filipe Manana's avatar Filipe Manana Committed by Chris Mason

Btrfs: fix lockdep warning on deadlock against an inode's log mutex

Commit 44f714da ("Btrfs: improve performance on fsync against new
inode after rename/unlink"), which landed in 4.8-rc2, introduced a
possibility for a deadlock due to double locking of an inode's log mutex
by the same task, which lockdep reports with:

[23045.433975] =============================================
[23045.434748] [ INFO: possible recursive locking detected ]
[23045.435426] 4.7.0-rc6-btrfs-next-34+ #1 Not tainted
[23045.436044] ---------------------------------------------
[23045.436044] xfs_io/3688 is trying to acquire lock:
[23045.436044]  (&ei->log_mutex){+.+...}, at: [<ffffffffa038552d>] btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]
               but task is already holding lock:
[23045.436044]  (&ei->log_mutex){+.+...}, at: [<ffffffffa038552d>] btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]
               other info that might help us debug this:
[23045.436044]  Possible unsafe locking scenario:

[23045.436044]        CPU0
[23045.436044]        ----
[23045.436044]   lock(&ei->log_mutex);
[23045.436044]   lock(&ei->log_mutex);
[23045.436044]
                *** DEADLOCK ***

[23045.436044]  May be due to missing lock nesting notation

[23045.436044] 3 locks held by xfs_io/3688:
[23045.436044]  #0:  (&sb->s_type->i_mutex_key#15){+.+...}, at: [<ffffffffa035f2ae>] btrfs_sync_file+0x14e/0x425 [btrfs]
[23045.436044]  #1:  (sb_internal#2){.+.+.+}, at: [<ffffffff8118446b>] __sb_start_write+0x5f/0xb0
[23045.436044]  #2:  (&ei->log_mutex){+.+...}, at: [<ffffffffa038552d>] btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]
               stack backtrace:
[23045.436044] CPU: 4 PID: 3688 Comm: xfs_io Not tainted 4.7.0-rc6-btrfs-next-34+ #1
[23045.436044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
[23045.436044]  0000000000000000 ffff88022f5f7860 ffffffff8127074d ffffffff82a54b70
[23045.436044]  ffffffff82a54b70 ffff88022f5f7920 ffffffff81092897 ffff880228015d68
[23045.436044]  0000000000000000 ffffffff82a54b70 ffffffff829c3f00 ffff880228015d68
[23045.436044] Call Trace:
[23045.436044]  [<ffffffff8127074d>] dump_stack+0x67/0x90
[23045.436044]  [<ffffffff81092897>] __lock_acquire+0xcbb/0xe4e
[23045.436044]  [<ffffffff8109155f>] ? mark_lock+0x24/0x201
[23045.436044]  [<ffffffff8109179a>] ? mark_held_locks+0x5e/0x74
[23045.436044]  [<ffffffff81092de0>] lock_acquire+0x12f/0x1c3
[23045.436044]  [<ffffffff81092de0>] ? lock_acquire+0x12f/0x1c3
[23045.436044]  [<ffffffffa038552d>] ? btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffffa038552d>] ? btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffff814a51a4>] mutex_lock_nested+0x77/0x3a7
[23045.436044]  [<ffffffffa038552d>] ? btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffffa039705e>] ? btrfs_release_delayed_node+0xb/0xd [btrfs]
[23045.436044]  [<ffffffffa038552d>] btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffffa038552d>] ? btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffff810a0ed1>] ? vprintk_emit+0x453/0x465
[23045.436044]  [<ffffffffa0385a61>] btrfs_log_inode+0x66e/0xc95 [btrfs]
[23045.436044]  [<ffffffffa03c084d>] log_new_dir_dentries+0x26c/0x359 [btrfs]
[23045.436044]  [<ffffffffa03865aa>] btrfs_log_inode_parent+0x4a6/0x628 [btrfs]
[23045.436044]  [<ffffffffa0387552>] btrfs_log_dentry_safe+0x5a/0x75 [btrfs]
[23045.436044]  [<ffffffffa035f464>] btrfs_sync_file+0x304/0x425 [btrfs]
[23045.436044]  [<ffffffff811acaf4>] vfs_fsync_range+0x8c/0x9e
[23045.436044]  [<ffffffff811acb22>] vfs_fsync+0x1c/0x1e
[23045.436044]  [<ffffffff811acc79>] do_fsync+0x31/0x4a
[23045.436044]  [<ffffffff811ace99>] SyS_fsync+0x10/0x14
[23045.436044]  [<ffffffff814a88e5>] entry_SYSCALL_64_fastpath+0x18/0xa8
[23045.436044]  [<ffffffff8108f039>] ? trace_hardirqs_off_caller+0x3f/0xaa

An example reproducer for this is:

   $ mkfs.btrfs -f /dev/sdb
   $ mount /dev/sdb /mnt
   $ mkdir /mnt/dir
   $ touch /mnt/dir/foo
   $ sync
   $ mv /mnt/dir/foo /mnt/dir/bar
   $ touch /mnt/dir/foo
   $ xfs_io -c "fsync" /mnt/dir/bar

This is because while logging the inode of file bar we end up logging its
parent directory (since its inode has an unlink_trans field matching the
current transaction id due to the rename operation), which in turn logs
the inodes for all its new dentries, so that the new inode for the new
file named foo gets logged which in turn triggered another logging attempt
for the inode we are fsync'ing, since that inode had an old name that
corresponds to the name of the new inode.

So fix this by ensuring that when logging the inode for a new dentry that
has a name matching an old name of some other inode, we don't log again
the original inode that we are fsync'ing.

Fixes: 44f714da ("Btrfs: improve performance on fsync against new inode after rename/unlink")
Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
Signed-off-by: default avatarChris Mason <clm@fb.com>
parent 1ba98d08
...@@ -2070,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ...@@ -2070,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
} }
trans->sync = true; trans->sync = true;
btrfs_init_log_ctx(&ctx); btrfs_init_log_ctx(&ctx, inode);
ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
if (ret < 0) { if (ret < 0) {
......
...@@ -2823,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ...@@ -2823,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/ */
mutex_unlock(&root->log_mutex); mutex_unlock(&root->log_mutex);
btrfs_init_log_ctx(&root_log_ctx); btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex); mutex_lock(&log_root_tree->log_mutex);
atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_batch);
...@@ -4757,7 +4757,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ...@@ -4757,7 +4757,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (ret < 0) { if (ret < 0) {
err = ret; err = ret;
goto out_unlock; goto out_unlock;
} else if (ret > 0) { } else if (ret > 0 && ctx &&
other_ino != btrfs_ino(ctx->inode)) {
struct btrfs_key inode_key; struct btrfs_key inode_key;
struct inode *other_inode; struct inode *other_inode;
......
...@@ -30,15 +30,18 @@ struct btrfs_log_ctx { ...@@ -30,15 +30,18 @@ struct btrfs_log_ctx {
int log_transid; int log_transid;
int io_err; int io_err;
bool log_new_dentries; bool log_new_dentries;
struct inode *inode;
struct list_head list; struct list_head list;
}; };
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
struct inode *inode)
{ {
ctx->log_ret = 0; ctx->log_ret = 0;
ctx->log_transid = 0; ctx->log_transid = 0;
ctx->io_err = 0; ctx->io_err = 0;
ctx->log_new_dentries = false; ctx->log_new_dentries = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->list);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment