Commit e09d94c9 authored by Filipe Manana's avatar Filipe Manana Committed by David Sterba

btrfs: log conflicting inodes without holding log mutex of the initial inode

When logging an inode, if we detect the inode has a reference that
conflicts with some other inode that got renamed, we log that other inode
while holding the log mutex of the current inode. We then find out if
there are other inodes that conflict with the first conflicting inode,
and log them while under the log mutex of the original inode. This is
fine because the recursion can only happen once.

For the upcoming work where we directly log delayed items without flushing
them first to the subvolume tree, this recursion adds a lot of complexity
and it's hard to keep lockdep happy about it.

So collect a list of conflicting inodes and then log the inodes after
unlocking the log mutex of the inode we started with.

Also limit the maximum number of conflict inodes we log to 10, to avoid
spending too much time logging (and maybe allocating too many list
elements too), as typically we don't have more than 1 or 2 conflicting
inodes - if we go over the limit, simply fallback to a transaction commit.

It is possible to have a very long list of conflicting inodes to be
intentionally created by a user if he/she creates a very long succession
of renames like this:

  (...)
  rename E to F
  rename D to E
  rename C to D
  rename B to C
  rename A to B
  touch A (create a new file named A)
  fsync A

If that happened for a sequence of hundreds or thousands of renames, it
could massively slow down the logging and cause other secondary effects
like for example blocking other fsync operations and transaction commits
for a very long time (assuming it wouldn't run into -ENOSPC or -ENOMEM
first). However such cases are very uncommon to happen in practice,
nevertheless it's better to be prepared for them and avoid chaos.
Such long sequence of conflicting inodes could be created before this
change.
Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent f6d86dbe
...@@ -2380,6 +2380,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ...@@ -2380,6 +2380,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_commit_transaction(trans); ret = btrfs_commit_transaction(trans);
out: out:
ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.list));
ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file); err = file_check_and_advance_wb_err(file);
if (!ret) if (!ret)
ret = err; ret = err;
......
...@@ -22,6 +22,8 @@ ...@@ -22,6 +22,8 @@
#include "zoned.h" #include "zoned.h"
#include "inode-item.h" #include "inode-item.h"
#define MAX_CONFLICT_INODES 10
/* magic values for the inode_only field in btrfs_log_inode: /* magic values for the inode_only field in btrfs_log_inode:
* *
* LOG_INODE_ALL means to log everything * LOG_INODE_ALL means to log everything
...@@ -31,8 +33,6 @@ ...@@ -31,8 +33,6 @@
enum { enum {
LOG_INODE_ALL, LOG_INODE_ALL,
LOG_INODE_EXISTS, LOG_INODE_EXISTS,
LOG_OTHER_INODE,
LOG_OTHER_INODE_ALL,
}; };
/* /*
...@@ -5633,67 +5633,71 @@ struct btrfs_ino_list { ...@@ -5633,67 +5633,71 @@ struct btrfs_ino_list {
struct list_head list; struct list_head list;
}; };
static int log_conflicting_inodes(struct btrfs_trans_handle *trans, static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *curr;
struct btrfs_ino_list *next;
list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
list_del(&curr->list);
kfree(curr);
}
}
static int add_conflicting_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_root *root,
struct btrfs_path *path, u64 ino, u64 parent,
struct btrfs_log_ctx *ctx, struct btrfs_log_ctx *ctx)
u64 ino, u64 parent)
{ {
struct btrfs_ino_list *ino_elem; struct btrfs_ino_list *ino_elem;
LIST_HEAD(inode_list); struct inode *inode;
int ret = 0;
/*
* It's rare to have a lot of conflicting inodes, in practice it is not
* common to have more than 1 or 2. We don't want to collect too many,
* as we could end up logging too many inodes (even if only in
* LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
* commits.
*/
if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
return BTRFS_LOG_FORCE_COMMIT;
inode = btrfs_iget(root->fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was deleted in
* the current transaction, we need to log its parent directory.
* We can't simply ignore it and let the current inode's reference cause
* an unlink of the conflicting inode when replaying the log - because
* the conflicting inode may be a deleted subvolume/snapshot or it may
* be a directory that had subvolumes/snapshots inside it (or had one
* or more subdirectories with subvolumes/snapshots, etc). If that's the
* case, then when logging the parent directory we will fallback to a
* transaction commit because the parent directory will have a
* last_unlink_trans that matches the current transaction.
*/
if (IS_ERR(inode)) {
int ret = PTR_ERR(inode);
if (ret != -ENOENT)
return ret;
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem) if (!ino_elem)
return -ENOMEM; return -ENOMEM;
ino_elem->ino = ino; ino_elem->ino = ino;
ino_elem->parent = parent; ino_elem->parent = parent;
list_add_tail(&ino_elem->list, &inode_list); list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
ctx->num_conflict_inodes++;
while (!list_empty(&inode_list)) { return 0;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct inode *inode;
ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
list);
ino = ino_elem->ino;
parent = ino_elem->parent;
list_del(&ino_elem->list);
kfree(ino_elem);
if (ret)
continue;
btrfs_release_path(path);
inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
* directory.
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
if (ret == -ENOENT) {
inode = btrfs_iget(fs_info->sb, parent, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
ret = btrfs_log_inode(trans,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
ctx);
btrfs_add_delayed_iput(inode);
}
}
continue;
} }
/* /*
* If the inode was already logged skip it - otherwise we can * If the inode was already logged skip it - otherwise we can hit an
* hit an infinite loop. Example: * infinite loop. Example:
* *
* From the commit root (previous transaction) we have the * From the commit root (previous transaction) we have the following
* following inodes: * inodes:
* *
* inode 257 a directory * inode 257 a directory
* inode 258 with references "zz" and "zz_link" on inode 257 * inode 258 with references "zz" and "zz_link" on inode 257
...@@ -5718,88 +5722,129 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ...@@ -5718,88 +5722,129 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* - we detect inode 258 as a conflicting inode, with inode 259 * - we detect inode 258 as a conflicting inode, with inode 259
* on reference "zz_link", and log it - again! After this we * on reference "zz_link", and log it - again! After this we
* repeat the above steps forever. * repeat the above steps forever.
*
* Here we can use need_log_inode() because we only need to log the
* inode in LOG_INODE_EXISTS mode and rename operations update the log,
* so that the log ends up with the new name and without the old name.
*/ */
spin_lock(&BTRFS_I(inode)->lock); if (!need_log_inode(trans, BTRFS_I(inode))) {
/*
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists (see
* btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_add_delayed_iput(inode); btrfs_add_delayed_iput(inode);
continue; return 0;
} }
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM;
ino_elem->ino = ino;
ino_elem->parent = parent;
list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
ctx->num_conflict_inodes++;
return 0;
}
static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
/* /*
* We are safe logging the other inode without acquiring its * Conflicting inodes are logged by the first call to btrfs_log_inode(),
* lock as long as we log with the LOG_INODE_EXISTS mode. We * otherwise we could have unbounded recursion of btrfs_log_inode()
* are safe against concurrent renames of the other inode as * calls. This check guarantees we can have only 1 level of recursion.
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/ */
ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); if (ctx->logging_conflict_inodes)
if (ret) { return 0;
btrfs_add_delayed_iput(inode);
continue;
}
key.objectid = ino; ctx->logging_conflict_inodes = true;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
btrfs_add_delayed_iput(inode);
continue;
}
while (true) { /*
struct extent_buffer *leaf = path->nodes[0]; * New conflicting inodes may be found and added to the list while we
int slot = path->slots[0]; * are logging a conflicting inode, so keep iterating while the list is
u64 other_ino = 0; * not empty.
u64 other_parent = 0; */
while (!list_empty(&ctx->conflict_inodes)) {
struct btrfs_ino_list *curr;
struct inode *inode;
u64 ino;
u64 parent;
if (slot >= btrfs_header_nritems(leaf)) { curr = list_first_entry(&ctx->conflict_inodes,
ret = btrfs_next_leaf(root, path); struct btrfs_ino_list, list);
if (ret < 0) { ino = curr->ino;
break; parent = curr->parent;
} else if (ret > 0) { list_del(&curr->list);
ret = 0; kfree(curr);
inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
* directory. See the comment at add_conflicting_inode().
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
if (ret != -ENOENT)
break; break;
}
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot); inode = btrfs_iget(fs_info->sb, parent, root);
if (key.objectid != ino || if (IS_ERR(inode)) {
(key.type != BTRFS_INODE_REF_KEY && ret = PTR_ERR(inode);
key.type != BTRFS_INODE_EXTREF_KEY)) {
ret = 0;
break; break;
} }
ret = btrfs_check_ref_name_override(leaf, slot, &key, /*
BTRFS_I(inode), &other_ino, * Always log the directory, we cannot make this
&other_parent); * conditional on need_log_inode() because the directory
if (ret < 0) * might have been logged in LOG_INODE_EXISTS mode or
break; * the dir index of the conflicting inode is not in a
if (ret > 0) { * dir index key range logged for the directory. So we
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); * must make sure the deletion is recorded.
if (!ino_elem) { */
ret = -ENOMEM; ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_ALL, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
break; break;
continue;
} }
ino_elem->ino = other_ino;
ino_elem->parent = other_parent; /*
list_add_tail(&ino_elem->list, &inode_list); * Here we can use need_log_inode() because we only need to log
ret = 0; * the inode in LOG_INODE_EXISTS mode and rename operations
} * update the log, so that the log ends up with the new name and
path->slots[0]++; * without the old name.
*
* We did this check at add_conflicting_inode(), but here we do
* it again because if some other task logged the inode after
* that, we can avoid doing it again.
*/
if (!need_log_inode(trans, BTRFS_I(inode))) {
btrfs_add_delayed_iput(inode);
continue;
} }
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
* are safe against concurrent renames of the other inode as
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode); btrfs_add_delayed_iput(inode);
if (ret)
break;
} }
ctx->logging_conflict_inodes = false;
if (ret)
free_conflicting_inodes(ctx);
return ret; return ret;
} }
...@@ -5810,7 +5855,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, ...@@ -5810,7 +5855,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_path *path, struct btrfs_path *path,
struct btrfs_path *dst_path, struct btrfs_path *dst_path,
const u64 logged_isize, const u64 logged_isize,
const bool recursive_logging,
const int inode_only, const int inode_only,
struct btrfs_log_ctx *ctx, struct btrfs_log_ctx *ctx,
bool *need_log_inode_item) bool *need_log_inode_item)
...@@ -5849,8 +5893,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, ...@@ -5849,8 +5893,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
break; break;
} else if ((min_key->type == BTRFS_INODE_REF_KEY || } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) && min_key->type == BTRFS_INODE_EXTREF_KEY) &&
inode->generation == trans->transid && (inode->generation == trans->transid ||
!recursive_logging) { ctx->logging_conflict_inodes)) {
u64 other_ino = 0; u64 other_ino = 0;
u64 other_parent = 0; u64 other_parent = 0;
...@@ -5874,11 +5918,12 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, ...@@ -5874,11 +5918,12 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
return ret; return ret;
ins_nr = 0; ins_nr = 0;
ret = log_conflicting_inodes(trans, root, path, btrfs_release_path(path);
ctx, other_ino, other_parent); ret = add_conflicting_inode(trans, root,
other_ino,
other_parent, ctx);
if (ret) if (ret)
return ret; return ret;
btrfs_release_path(path);
goto next_key; goto next_key;
} }
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
...@@ -5992,9 +6037,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ...@@ -5992,9 +6037,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0; u64 logged_isize = 0;
bool need_log_inode_item = true; bool need_log_inode_item = true;
bool xattrs_logged = false; bool xattrs_logged = false;
bool recursive_logging = false;
bool inode_item_dropped = true; bool inode_item_dropped = true;
const bool orig_logged_before = ctx->logged_before;
path = btrfs_alloc_path(); path = btrfs_alloc_path();
if (!path) if (!path)
...@@ -6033,16 +6076,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ...@@ -6033,16 +6076,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
goto out; goto out;
} }
if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
recursive_logging = true;
if (inode_only == LOG_OTHER_INODE)
inode_only = LOG_INODE_EXISTS;
else
inode_only = LOG_INODE_ALL;
mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&inode->log_mutex); mutex_lock(&inode->log_mutex);
}
/* /*
* For symlinks, we must always log their content, which is stored in an * For symlinks, we must always log their content, which is stored in an
...@@ -6148,7 +6182,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ...@@ -6148,7 +6182,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize, path, dst_path, logged_isize,
recursive_logging, inode_only, ctx, inode_only, ctx,
&need_log_inode_item); &need_log_inode_item);
if (ret) if (ret)
goto out_unlock; goto out_unlock;
...@@ -6257,8 +6291,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ...@@ -6257,8 +6291,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_free_path(path); btrfs_free_path(path);
btrfs_free_path(dst_path); btrfs_free_path(dst_path);
if (recursive_logging) if (ret)
ctx->logged_before = orig_logged_before; free_conflicting_inodes(ctx);
else
ret = log_conflicting_inodes(trans, inode->root, ctx);
return ret; return ret;
} }
...@@ -7113,6 +7149,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, ...@@ -7113,6 +7149,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation. * inconsistent state after a rename operation.
*/ */
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
ASSERT(list_empty(&ctx.conflict_inodes));
out: out:
/* /*
* If an error happened mark the log for a full commit because it's not * If an error happened mark the log for a full commit because it's not
......
...@@ -28,6 +28,9 @@ struct btrfs_log_ctx { ...@@ -28,6 +28,9 @@ struct btrfs_log_ctx {
struct list_head list; struct list_head list;
/* Only used for fast fsyncs. */ /* Only used for fast fsyncs. */
struct list_head ordered_extents; struct list_head ordered_extents;
struct list_head conflict_inodes;
int num_conflict_inodes;
bool logging_conflict_inodes;
}; };
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
...@@ -41,6 +44,9 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ...@@ -41,6 +44,9 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->inode = inode; ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->list);
INIT_LIST_HEAD(&ctx->ordered_extents); INIT_LIST_HEAD(&ctx->ordered_extents);
INIT_LIST_HEAD(&ctx->conflict_inodes);
ctx->num_conflict_inodes = 0;
ctx->logging_conflict_inodes = false;
} }
static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment