Commit f73853c7 authored by Filipe Manana's avatar Filipe Manana Committed by David Sterba

btrfs: send: avoid double extent tree search when finding clone source

At find_extent_clone() we search twice for the extent item corresponding
to the data extent that the current file extent items points to:

1) Once with a call to extent_from_logical();

2) Once again during backref walking, through iterate_extent_inodes()
   which eventually leads to find_parent_nodes() where we will search
   again the extent tree for the same extent item.

The extent tree can be huge, so doing this one extra search for every
extent we want to send adds up and it's expensive.

The first call is there since the send code was introduced and it
accomplishes two things:

1) Check that the extent is flagged as a data extent in the extent tree.
   But it can not be anything else, otherwise we wouldn't have a file
   extent item in the send root pointing to it.
   This was probably added to catch bugs in the early days where send was
   yet too young and the interaction with everything else was far from
   perfect;

2) Check how many direct references there are on the extent, and if
   there's too many (more than SEND_MAX_EXTENT_REFS), avoid doing the
   backred walking as it may take too long and slowdown send.

So improve on this by having a callback in the backref walking code that
is called when it finds the extent item in the extent tree, and have those
checks done in the callback. When the callback returns anything different
from 0, it stops the backref walking code. This way we do a single search
on the extent tree for the extent item of our data extent.

Also, before this change we were only checking the number of references on
the data extent against SEND_MAX_EXTENT_REFS, but after starting backref
walking we will end up resolving backrefs for extent buffers in the path
from a leaf having a file extent item pointing to our data extent, up to
roots of trees from which the extent buffer is accessible from, due to
shared subtrees resulting from snapshoting. We were therefore allowing for
the possibility for send taking too long due to some node in the path from
the leaf to a root node being shared too many times. After this change we
check for reference counts being greater than SEND_MAX_EXTENT_REFS for
both data extents and metadata extents.

This change is part of a patchset comprised of the following patches:

  01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs()
  02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes()
  03/17 btrfs: fix ulist leaks in error paths of qgroup self tests
  04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests
  05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone
  06/17 btrfs: send: update comment at find_extent_clone()
  07/17 btrfs: send: drop unnecessary backref context field initializations
  08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source
  09/17 btrfs: send: optimize clone detection to increase extent sharing
  10/17 btrfs: use a single argument for extent offset in backref walking functions
  11/17 btrfs: use a structure to pass arguments to backref walking functions
  12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes()
  13/17 btrfs: constify ulist parameter of ulist_next()
  14/17 btrfs: send: cache leaf to roots mapping during backref walking
  15/17 btrfs: send: skip unnecessary backref iterations
  16/17 btrfs: send: avoid double extent tree search when finding clone source
  17/17 btrfs: send: skip resolution of our own backref when finding clone source

Performance test results are in the changelog of patch 17/17.
Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 88ffb665
...@@ -1003,8 +1003,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, ...@@ -1003,8 +1003,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
* *
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/ */
static int add_inline_refs(const struct btrfs_fs_info *fs_info, static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
struct btrfs_path *path, u64 bytenr, struct btrfs_path *path,
int *info_level, struct preftrees *preftrees, int *info_level, struct preftrees *preftrees,
struct share_check *sc) struct share_check *sc)
{ {
...@@ -1029,6 +1029,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ...@@ -1029,6 +1029,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
BUG_ON(item_size < sizeof(*ei)); BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
if (ctx->check_extent_item) {
ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx);
if (ret)
return ret;
}
flags = btrfs_extent_flags(leaf, ei); flags = btrfs_extent_flags(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot); btrfs_item_key_to_cpu(leaf, &found_key, slot);
...@@ -1064,9 +1071,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ...@@ -1064,9 +1071,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
switch (type) { switch (type) {
case BTRFS_SHARED_BLOCK_REF_KEY: case BTRFS_SHARED_BLOCK_REF_KEY:
ret = add_direct_ref(fs_info, preftrees, ret = add_direct_ref(ctx->fs_info, preftrees,
*info_level + 1, offset, *info_level + 1, offset,
bytenr, 1, NULL, GFP_NOFS); ctx->bytenr, 1, NULL, GFP_NOFS);
break; break;
case BTRFS_SHARED_DATA_REF_KEY: { case BTRFS_SHARED_DATA_REF_KEY: {
struct btrfs_shared_data_ref *sdref; struct btrfs_shared_data_ref *sdref;
...@@ -1075,14 +1082,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ...@@ -1075,14 +1082,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
sdref = (struct btrfs_shared_data_ref *)(iref + 1); sdref = (struct btrfs_shared_data_ref *)(iref + 1);
count = btrfs_shared_data_ref_count(leaf, sdref); count = btrfs_shared_data_ref_count(leaf, sdref);
ret = add_direct_ref(fs_info, preftrees, 0, offset, ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset,
bytenr, count, sc, GFP_NOFS); ctx->bytenr, count, sc, GFP_NOFS);
break; break;
} }
case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_TREE_BLOCK_REF_KEY:
ret = add_indirect_ref(fs_info, preftrees, offset, ret = add_indirect_ref(ctx->fs_info, preftrees, offset,
NULL, *info_level + 1, NULL, *info_level + 1,
bytenr, 1, NULL, GFP_NOFS); ctx->bytenr, 1, NULL, GFP_NOFS);
break; break;
case BTRFS_EXTENT_DATA_REF_KEY: { case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_extent_data_ref *dref; struct btrfs_extent_data_ref *dref;
...@@ -1104,8 +1111,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ...@@ -1104,8 +1111,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
root = btrfs_extent_data_ref_root(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref);
ret = add_indirect_ref(fs_info, preftrees, root, ret = add_indirect_ref(ctx->fs_info, preftrees, root,
&key, 0, bytenr, count, &key, 0, ctx->bytenr, count,
sc, GFP_NOFS); sc, GFP_NOFS);
break; break;
...@@ -1455,8 +1462,8 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ...@@ -1455,8 +1462,8 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
if (key.objectid == ctx->bytenr && if (key.objectid == ctx->bytenr &&
(key.type == BTRFS_EXTENT_ITEM_KEY || (key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) { key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = add_inline_refs(ctx->fs_info, path, ctx->bytenr, ret = add_inline_refs(ctx, path, &info_level,
&info_level, &preftrees, sc); &preftrees, sc);
if (ret) if (ret)
goto out; goto out;
ret = add_keyed_refs(root, path, ctx->bytenr, info_level, ret = add_keyed_refs(root, path, ctx->bytenr, info_level,
......
...@@ -109,9 +109,14 @@ struct btrfs_backref_walk_ctx { ...@@ -109,9 +109,14 @@ struct btrfs_backref_walk_ctx {
*/ */
iterate_extent_inodes_t *indirect_ref_iterator; iterate_extent_inodes_t *indirect_ref_iterator;
/* /*
* Context object to pass to the @cache_lookup, @cache_store and * If this is not NULL, then the backref walking code will call this for
* @indirect_ref_iterator callbacks. * each extent item it's meant to process before it actually starts
* processing it. If this returns anything other than 0, then it stops
* the backref walking code immediately.
*/ */
int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei,
const struct extent_buffer *leaf, void *user_ctx);
/* Context object to pass to the callbacks defined above. */
void *user_ctx; void *user_ctx;
}; };
......
...@@ -1281,6 +1281,9 @@ struct backref_ctx { ...@@ -1281,6 +1281,9 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */ /* may be truncated in case it's the last extent in a file */
u64 extent_len; u64 extent_len;
/* The bytenr the file extent item we are processing refers to. */
u64 bytenr;
}; };
static int __clone_root_cmp_bsearch(const void *key, const void *elt) static int __clone_root_cmp_bsearch(const void *key, const void *elt)
...@@ -1518,6 +1521,43 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, ...@@ -1518,6 +1521,43 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
sctx->backref_cache.size++; sctx->backref_cache.size++;
} }
static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
const struct extent_buffer *leaf, void *ctx)
{
const u64 refs = btrfs_extent_refs(leaf, ei);
const struct backref_ctx *bctx = ctx;
const struct send_ctx *sctx = bctx->sctx;
if (bytenr == bctx->bytenr) {
const u64 flags = btrfs_extent_flags(leaf, ei);
if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
return -EUCLEAN;
/*
* If we have only one reference and only the send root as a
* clone source - meaning no clone roots were given in the
* struct btrfs_ioctl_send_args passed to the send ioctl - then
* it's our reference and there's no point in doing backref
* walking which is expensive, so exit early.
*/
if (refs == 1 && sctx->clone_roots_cnt == 1)
return -ENOENT;
}
/*
* Backreference walking (iterate_extent_inodes() below) is currently
* too expensive when an extent has a large number of references, both
* in time spent and used memory. So for now just fallback to write
* operations instead of clone operations when an extent has more than
* a certain amount of references.
*/
if (refs > SEND_MAX_EXTENT_REFS)
return -ENOENT;
return 0;
}
/* /*
* Given an inode, offset and extent item, it finds a good clone for a clone * Given an inode, offset and extent item, it finds a good clone for a clone
* instruction. Returns -ENOENT when none could be found. The function makes * instruction. Returns -ENOENT when none could be found. The function makes
...@@ -1539,16 +1579,11 @@ static int find_extent_clone(struct send_ctx *sctx, ...@@ -1539,16 +1579,11 @@ static int find_extent_clone(struct send_ctx *sctx,
u64 logical; u64 logical;
u64 disk_byte; u64 disk_byte;
u64 num_bytes; u64 num_bytes;
u64 extent_refs;
u64 flags = 0;
struct btrfs_file_extent_item *fi; struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0]; struct extent_buffer *eb = path->nodes[0];
struct backref_ctx backref_ctx = { 0 }; struct backref_ctx backref_ctx = { 0 };
struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 };
struct clone_root *cur_clone_root; struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
struct btrfs_extent_item *ei;
int compressed; int compressed;
u32 i; u32 i;
...@@ -1574,48 +1609,6 @@ static int find_extent_clone(struct send_ctx *sctx, ...@@ -1574,48 +1609,6 @@ static int find_extent_clone(struct send_ctx *sctx,
num_bytes = btrfs_file_extent_num_bytes(eb, fi); num_bytes = btrfs_file_extent_num_bytes(eb, fi);
logical = disk_byte + btrfs_file_extent_offset(eb, fi); logical = disk_byte + btrfs_file_extent_offset(eb, fi);
tmp_path = alloc_path_for_send();
if (!tmp_path)
return -ENOMEM;
/* We only use this path under the commit sem */
tmp_path->need_commit_sem = 0;
down_read(&fs_info->commit_root_sem);
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
&found_key, &flags);
up_read(&fs_info->commit_root_sem);
if (ret < 0)
goto out;
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = -EIO;
goto out;
}
ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
struct btrfs_extent_item);
extent_refs = btrfs_extent_refs(tmp_path->nodes[0], ei);
/*
* Backreference walking (iterate_extent_inodes() below) is currently
* too expensive when an extent has a large number of references, both
* in time spent and used memory. So for now just fallback to write
* operations instead of clone operations when an extent has more than
* a certain amount of references.
*
* Also, if we have only one reference and only the send root as a clone
* source - meaning no clone roots were given in the struct
* btrfs_ioctl_send_args passed to the send ioctl - then it's our
* reference and there's no point in doing backref walking which is
* expensive, so exit early.
*/
if ((extent_refs == 1 && sctx->clone_roots_cnt == 1) ||
extent_refs > SEND_MAX_EXTENT_REFS) {
ret = -ENOENT;
goto out;
}
btrfs_release_path(tmp_path);
/* /*
* Setup the clone roots. * Setup the clone roots.
*/ */
...@@ -1630,6 +1623,7 @@ static int find_extent_clone(struct send_ctx *sctx, ...@@ -1630,6 +1623,7 @@ static int find_extent_clone(struct send_ctx *sctx,
backref_ctx.sctx = sctx; backref_ctx.sctx = sctx;
backref_ctx.cur_objectid = ino; backref_ctx.cur_objectid = ino;
backref_ctx.cur_offset = data_offset; backref_ctx.cur_offset = data_offset;
backref_ctx.bytenr = disk_byte;
/* /*
* The last extent of a file may be too large due to page alignment. * The last extent of a file may be too large due to page alignment.
...@@ -1644,19 +1638,20 @@ static int find_extent_clone(struct send_ctx *sctx, ...@@ -1644,19 +1638,20 @@ static int find_extent_clone(struct send_ctx *sctx,
/* /*
* Now collect all backrefs. * Now collect all backrefs.
*/ */
backref_walk_ctx.bytenr = found_key.objectid; backref_walk_ctx.bytenr = disk_byte;
if (compressed == BTRFS_COMPRESS_NONE) if (compressed == BTRFS_COMPRESS_NONE)
backref_walk_ctx.extent_item_pos = logical - found_key.objectid; backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi);
backref_walk_ctx.fs_info = fs_info; backref_walk_ctx.fs_info = fs_info;
backref_walk_ctx.cache_lookup = lookup_backref_cache; backref_walk_ctx.cache_lookup = lookup_backref_cache;
backref_walk_ctx.cache_store = store_backref_cache; backref_walk_ctx.cache_store = store_backref_cache;
backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
backref_walk_ctx.check_extent_item = check_extent_item;
backref_walk_ctx.user_ctx = &backref_ctx; backref_walk_ctx.user_ctx = &backref_ctx;
ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
&backref_ctx); &backref_ctx);
if (ret < 0) if (ret < 0)
goto out; return ret;
down_read(&fs_info->commit_root_sem); down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
...@@ -1673,8 +1668,7 @@ static int find_extent_clone(struct send_ctx *sctx, ...@@ -1673,8 +1668,7 @@ static int find_extent_clone(struct send_ctx *sctx,
* was already reallocated after the relocation. * was already reallocated after the relocation.
*/ */
up_read(&fs_info->commit_root_sem); up_read(&fs_info->commit_root_sem);
ret = -ENOENT; return -ENOENT;
goto out;
} }
up_read(&fs_info->commit_root_sem); up_read(&fs_info->commit_root_sem);
...@@ -1684,8 +1678,7 @@ static int find_extent_clone(struct send_ctx *sctx, ...@@ -1684,8 +1678,7 @@ static int find_extent_clone(struct send_ctx *sctx,
if (!backref_ctx.found) { if (!backref_ctx.found) {
btrfs_debug(fs_info, "no clones found"); btrfs_debug(fs_info, "no clones found");
ret = -ENOENT; return -ENOENT;
goto out;
} }
cur_clone_root = NULL; cur_clone_root = NULL;
...@@ -1720,8 +1713,6 @@ static int find_extent_clone(struct send_ctx *sctx, ...@@ -1720,8 +1713,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = -ENOENT; ret = -ENOENT;
} }
out:
btrfs_free_path(tmp_path);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment