Commit 78ce9fc2 authored by Naohiro Aota, committed by David Sterba

btrfs: zoned: mark block groups to copy for device-replace

This is the first of four patches to support device-replace on zoned
filesystems.

We have two types of IOs during the device-replace process. One is an IO
to "copy" (via the scrub functions) all the device extents from the source
device to the destination device. The other is an IO to "clone" (via
handle_ops_on_dev_replace()) new incoming write IOs from users, aimed at
the source device, onto the target device.

Cloning incoming IOs can break the sequential write rule on the target
device. When a write is mapped in the middle of a block group, the IO is
directed to the middle of a target device zone, which breaks the
sequential write requirement.

However, the cloning function cannot be disabled since incoming IOs
targeting already copied device extents must be cloned so that the IO is
executed on the target device.

We cannot use dev_replace->cursor_{left,right} to determine whether a bio
is going to a not yet copied region. Since we have a time gap between
finishing btrfs_scrub_dev() and rewriting the mapping tree in
btrfs_dev_replace_finishing(), we can have a newly allocated device extent
which is never cloned nor copied.

So the point is to copy only the already existing device extents. This
patch introduces mark_block_group_to_copy() to mark existing block groups
as targets of copying. handle_ops_on_dev_replace() and the dev-replace
process can then check the flag to decide whether a block group still
needs copying or whether its writes can simply be cloned.
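For reference, the handle_ops_on_dev_replace() side is wired up later in
this series; as a rough sketch (the helper name below is hypothetical and
not code from this patch), the clone decision boils down to testing the
new flag under the block group lock:

static bool should_clone_to_target(struct btrfs_block_group *cache)
{
        bool clone;

        spin_lock(&cache->lock);
        /*
         * Hypothetical sketch: block groups still marked to_copy are left
         * to the scrub copy pass; all other block groups get their writes
         * cloned to the replace target.
         */
        clone = !cache->to_copy;
        spin_unlock(&cache->lock);

        return clone;
}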

Also, btrfs_finish_block_group_to_copy() checks whether a copied stripe
is the last stripe in the block group. Once the last stripe is copied,
the to_copy flag is finally cleared, and from then on incoming IOs to
this block group can safely be cloned.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 4eef29ef
fs/btrfs/block-group.h
@@ -95,6 +95,7 @@ struct btrfs_block_group {
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
unsigned int to_copy:1;
int disk_cache_state;
...
fs/btrfs/dev-replace.c
@@ -22,6 +22,7 @@
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
#include "block-group.h"
/*
* Device replace overview
@@ -459,6 +460,185 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
}
static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_device *src_dev)
{
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_block_group *cache;
struct btrfs_trans_handle *trans;
int ret = 0;
u64 chunk_offset;
/* Do not use "to_copy" on non-zoned filesystems for now */
if (!btrfs_is_zoned(fs_info))
return 0;
mutex_lock(&fs_info->chunk_mutex);
/* Ensure we don't have pending new block group */
spin_lock(&fs_info->trans_lock);
while (fs_info->running_transaction &&
!list_empty(&fs_info->running_transaction->dev_update_list)) {
spin_unlock(&fs_info->trans_lock);
mutex_unlock(&fs_info->chunk_mutex);
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
mutex_lock(&fs_info->chunk_mutex);
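/* -ENOENT means there is no running transaction to attach to; re-check and retry */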
if (ret == -ENOENT) {
spin_lock(&fs_info->trans_lock);
continue;
} else {
goto unlock;
}
}
ret = btrfs_commit_transaction(trans);
mutex_lock(&fs_info->chunk_mutex);
if (ret)
goto unlock;
spin_lock(&fs_info->trans_lock);
}
spin_unlock(&fs_info->trans_lock);
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto unlock;
}
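/* Search the commit root so the walk does not need tree locks */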
path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
key.objectid = src_dev->devid;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto free_path;
if (ret > 0) {
if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto free_path;
if (ret > 0) {
ret = 0;
goto free_path;
}
} else {
ret = 0;
}
}
while (1) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != src_dev->devid)
break;
if (found_key.type != BTRFS_DEV_EXTENT_KEY)
break;
if (found_key.offset < key.offset)
break;
dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!cache)
goto skip;
spin_lock(&cache->lock);
cache->to_copy = 1;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
skip:
ret = btrfs_next_item(root, path);
if (ret != 0) {
if (ret > 0)
ret = 0;
break;
}
}
free_path:
btrfs_free_path(path);
unlock:
mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct extent_map *em;
struct map_lookup *map;
u64 chunk_offset = cache->start;
int num_extents, cur_extent;
int i;
/* Do not use "to_copy" on non-zoned filesystems for now */
if (!btrfs_is_zoned(fs_info))
return true;
spin_lock(&cache->lock);
if (cache->removed) {
spin_unlock(&cache->lock);
return true;
}
spin_unlock(&cache->lock);
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
ASSERT(!IS_ERR(em));
map = em->map_lookup;
num_extents = cur_extent = 0;
for (i = 0; i < map->num_stripes; i++) {
/* We have more device extent to copy */
if (srcdev != map->stripes[i].dev)
continue;
num_extents++;
if (physical == map->stripes[i].physical)
cur_extent = i;
}
free_extent_map(em);
if (num_extents > 1 && cur_extent < num_extents - 1) {
/*
* Has more stripes on this device. Keep this block group
* readonly until we finish all the stripes.
*/
return false;
}
/* Last stripe on this device */
spin_lock(&cache->lock);
cache->to_copy = 0;
spin_unlock(&cache->lock);
return true;
}
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
@@ -500,6 +680,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
return ret;
ret = mark_block_group_to_copy(fs_info, src_device);
if (ret)
return ret;
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
...
fs/btrfs/dev-replace.h
@@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical);
#endif
...
fs/btrfs/scrub.c
@@ -3561,6 +3561,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
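/*
 * On zoned filesystems, only block groups still marked to_copy have
 * device extents that need copying here; writes to all other block
 * groups are cloned to the target device as they arrive.
 */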
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
spin_lock(&cache->lock);
if (!cache->to_copy) {
spin_unlock(&cache->lock);
ro_set = 0;
goto done;
}
spin_unlock(&cache->lock);
}
/*
* Make sure that while we are scrubbing the corresponding block
* group doesn't get its logical address and its device extents
* group doesn't get its logical address and its device extents * group doesn't get its logical address and its device extents
@@ -3692,6 +3702,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
if (sctx->is_dev_replace &&
!btrfs_finish_block_group_to_copy(dev_replace->srcdev,
cache, found_key.offset))
ro_set = 0;
done:
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
...