Commit 652f25a2, authored by Josef Bacik, committed by Chris Mason

Btrfs: improve replacing nocow extents

Various people have hit a deadlock when running btrfs/011.  This is because,
when replacing nocow extents, we take the i_mutex to make sure nobody messes
with the file while we are replacing the extent.  The problem is that we are
already holding a transaction open, which is a locking inversion.  Instead, we
need to save the inodes we find and then process them outside of the transaction.

Further, we can't just lock the inode and assume we are good to go.  We need to
lock the extent range and then read back the extent cache for the inode to make
sure the extent really still points at the physical block we want.  If it
doesn't, we don't have to copy it.  Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
parent d555438b
...@@ -158,12 +158,20 @@ struct scrub_fixup_nodatasum { ...@@ -158,12 +158,20 @@ struct scrub_fixup_nodatasum {
int mirror_num; int mirror_num;
}; };
/*
 * One inode reference recorded for a nocow extent.  record_inode_for_nocow()
 * fills in the inode number, file offset and root id it was called with and
 * appends the entry to scrub_copy_nocow_ctx::inodes through @list.
 */
struct scrub_nocow_inode {
u64 inum;
u64 offset;
u64 root;
struct list_head list;
};
/* Context carried by one nocow copy work item (queued via @work). */
struct scrub_copy_nocow_ctx { struct scrub_copy_nocow_ctx {
struct scrub_ctx *sctx; struct scrub_ctx *sctx;
u64 logical; u64 logical;
u64 len; u64 len;
int mirror_num; int mirror_num;
u64 physical_for_dev_replace; u64 physical_for_dev_replace;
/*
 * List of struct scrub_nocow_inode entries: appended by
 * record_inode_for_nocow(), drained and freed by copy_nocow_pages_worker().
 */
struct list_head inodes;
struct btrfs_work work; struct btrfs_work work;
}; };
...@@ -245,7 +253,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); ...@@ -245,7 +253,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx, static int write_page_nocow(struct scrub_ctx *sctx,
u64 physical_for_dev_replace, struct page *page); u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
void *ctx); struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
int mirror_num, u64 physical_for_dev_replace); int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work); static void copy_nocow_pages_worker(struct btrfs_work *work);
...@@ -3126,12 +3134,30 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, ...@@ -3126,12 +3134,30 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
nocow_ctx->mirror_num = mirror_num; nocow_ctx->mirror_num = mirror_num;
nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
nocow_ctx->work.func = copy_nocow_pages_worker; nocow_ctx->work.func = copy_nocow_pages_worker;
INIT_LIST_HEAD(&nocow_ctx->inodes);
btrfs_queue_worker(&fs_info->scrub_nocow_workers, btrfs_queue_worker(&fs_info->scrub_nocow_workers,
&nocow_ctx->work); &nocow_ctx->work);
return 0; return 0;
} }
/*
 * Callback passed to iterate_inodes_from_logical(): remember one
 * (inum, offset, root) triple for later processing instead of acting on it
 * immediately.
 *
 * @ctx is the struct scrub_copy_nocow_ctx whose ->inodes list receives the
 * new entry at its tail.
 *
 * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
 */
static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
{
	struct scrub_copy_nocow_ctx *copy_ctx = ctx;
	struct scrub_nocow_inode *rec = kzalloc(sizeof(*rec), GFP_NOFS);

	if (rec == NULL)
		return -ENOMEM;

	rec->root = root;
	rec->offset = offset;
	rec->inum = inum;

	/* Queue at the tail so the list keeps the order of discovery. */
	list_add_tail(&rec->list, &copy_ctx->inodes);

	return 0;
}
/*
 * Positive status that copy_nocow_pages_for_inode() assigns to its result
 * after its page loop.  copy_nocow_pages_worker() converts it to 0 and stops
 * walking the remaining recorded inodes.
 */
#define COPY_COMPLETE 1
static void copy_nocow_pages_worker(struct btrfs_work *work) static void copy_nocow_pages_worker(struct btrfs_work *work)
{ {
struct scrub_copy_nocow_ctx *nocow_ctx = struct scrub_copy_nocow_ctx *nocow_ctx =
...@@ -3167,8 +3193,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) ...@@ -3167,8 +3193,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
} }
ret = iterate_inodes_from_logical(logical, fs_info, path, ret = iterate_inodes_from_logical(logical, fs_info, path,
copy_nocow_pages_for_inode, record_inode_for_nocow, nocow_ctx);
nocow_ctx);
if (ret != 0 && ret != -ENOENT) { if (ret != 0 && ret != -ENOENT) {
pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n", pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
logical, physical_for_dev_replace, len, mirror_num, logical, physical_for_dev_replace, len, mirror_num,
...@@ -3177,7 +3202,33 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) ...@@ -3177,7 +3202,33 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
goto out; goto out;
} }
btrfs_end_transaction(trans, root);
trans = NULL;
while (!list_empty(&nocow_ctx->inodes)) {
struct scrub_nocow_inode *entry;
entry = list_first_entry(&nocow_ctx->inodes,
struct scrub_nocow_inode,
list);
list_del_init(&entry->list);
ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
entry->root, nocow_ctx);
kfree(entry);
if (ret == COPY_COMPLETE) {
ret = 0;
break;
} else if (ret) {
break;
}
}
out: out:
while (!list_empty(&nocow_ctx->inodes)) {
struct scrub_nocow_inode *entry;
entry = list_first_entry(&nocow_ctx->inodes,
struct scrub_nocow_inode,
list);
list_del_init(&entry->list);
kfree(entry);
}
if (trans && !IS_ERR(trans)) if (trans && !IS_ERR(trans))
btrfs_end_transaction(trans, root); btrfs_end_transaction(trans, root);
if (not_written) if (not_written)
...@@ -3190,20 +3241,25 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) ...@@ -3190,20 +3241,25 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
scrub_pending_trans_workers_dec(sctx); scrub_pending_trans_workers_dec(sctx);
} }
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *nocow_ctx)
{ {
struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
struct btrfs_key key; struct btrfs_key key;
struct inode *inode; struct inode *inode;
struct page *page; struct page *page;
struct btrfs_root *local_root; struct btrfs_root *local_root;
struct btrfs_ordered_extent *ordered;
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct extent_io_tree *io_tree;
u64 physical_for_dev_replace; u64 physical_for_dev_replace;
u64 len; u64 len = nocow_ctx->len;
u64 lockstart = offset, lockend = offset + len - 1;
unsigned long index; unsigned long index;
int srcu_index; int srcu_index;
int ret; int ret = 0;
int err; int err = 0;
key.objectid = root; key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY; key.type = BTRFS_ROOT_ITEM_KEY;
...@@ -3229,9 +3285,33 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) ...@@ -3229,9 +3285,33 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
inode_dio_wait(inode); inode_dio_wait(inode);
ret = 0;
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
len = nocow_ctx->len; io_tree = &BTRFS_I(inode)->io_tree;
lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
if (ordered) {
btrfs_put_ordered_extent(ordered);
goto out_unlock;
}
em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_unlock;
}
/*
* This extent does not actually cover the logical extent anymore,
* move on to the next inode.
*/
if (em->block_start > nocow_ctx->logical ||
em->block_start + em->block_len < nocow_ctx->logical + len) {
free_extent_map(em);
goto out_unlock;
}
free_extent_map(em);
while (len >= PAGE_CACHE_SIZE) { while (len >= PAGE_CACHE_SIZE) {
index = offset >> PAGE_CACHE_SHIFT; index = offset >> PAGE_CACHE_SHIFT;
again: again:
...@@ -3247,10 +3327,9 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) ...@@ -3247,10 +3327,9 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
goto next_page; goto next_page;
} else { } else {
ClearPageError(page); ClearPageError(page);
err = extent_read_full_page(&BTRFS_I(inode)-> err = extent_read_full_page_nolock(io_tree, page,
io_tree, btrfs_get_extent,
page, btrfs_get_extent, nocow_ctx->mirror_num);
nocow_ctx->mirror_num);
if (err) { if (err) {
ret = err; ret = err;
goto next_page; goto next_page;
...@@ -3264,6 +3343,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) ...@@ -3264,6 +3343,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
* page in the page cache. * page in the page cache.
*/ */
if (page->mapping != inode->i_mapping) { if (page->mapping != inode->i_mapping) {
unlock_page(page);
page_cache_release(page); page_cache_release(page);
goto again; goto again;
} }
...@@ -3287,6 +3367,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) ...@@ -3287,6 +3367,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
physical_for_dev_replace += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE;
} }
ret = COPY_COMPLETE;
out_unlock:
unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
GFP_NOFS);
out: out:
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
iput(inode); iput(inode);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment