Commit c404e0dc authored by Miao Xie's avatar Miao Xie Committed by Josef Bacik

Btrfs: fix use-after-free in the finishing procedure of the device replace

During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
  the mapping tree. And we forgot to replace the source device in those
  mapping of the new chunks.
- We might get the mapping information which including the source device
  before the mapping information update. And then submit the bio which was
  based on that mapping information after we freed the source device.

For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.

For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.

Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: default avatarQu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: default avatarWang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: default avatarMiao Xie <miaox@cn.fujitsu.com>
Signed-off-by: default avatarJosef Bacik <jbacik@fb.com>
parent 391cd9df
......@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FS_STATE_ERROR 0
#define BTRFS_FS_STATE_REMOUNTING 1
#define BTRFS_FS_STATE_TRANS_ABORTED 2
#define BTRFS_FS_STATE_DEV_REPLACING 3
/* Super block flags */
/* Errors detected */
......@@ -1674,6 +1675,9 @@ struct btrfs_fs_info {
atomic_t mutually_exclusive_operation_running;
struct percpu_counter bio_counter;
wait_queue_head_t replace_wait;
struct semaphore uuid_tree_rescan_sem;
unsigned int update_uuid_tree_gen:1;
};
......@@ -4008,6 +4012,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
/* reada.c */
struct reada_control {
struct btrfs_root *root; /* tree to prefetch */
......
......@@ -431,6 +431,35 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
return ret;
}
/*
* blocked until all flighting bios are finished.
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
s64 writers;
DEFINE_WAIT(wait);
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
do {
prepare_to_wait(&fs_info->replace_wait, &wait,
TASK_UNINTERRUPTIBLE);
writers = percpu_counter_sum(&fs_info->bio_counter);
if (writers)
schedule();
finish_wait(&fs_info->replace_wait, &wait);
} while (writers);
}
/*
* we have removed target device, it is safe to allow new bios request.
*/
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
if (waitqueue_active(&fs_info->replace_wait))
wake_up(&fs_info->replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
......@@ -458,12 +487,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device = dev_replace->srcdev;
btrfs_dev_replace_unlock(dev_replace);
/* replace old device with new one in mapping tree */
if (!scrub_ret)
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
/*
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
......@@ -495,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
if (scrub_ret) {
/* replace old device with new one in mapping tree */
if (!scrub_ret) {
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
} else {
printk_in_rcu(KERN_ERR
"BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
src_device->missing ? "<missing disk>" :
......@@ -534,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
btrfs_rm_dev_replace_unblocked(fs_info);
/*
* this is again a consistent state where no dev_replace procedure
* is running, the target device is part of the filesystem, the
......@@ -865,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
mutex_unlock(&dev_replace->lock_management_lock);
}
}
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
percpu_counter_inc(&fs_info->bio_counter);
}
void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
{
percpu_counter_dec(&fs_info->bio_counter);
if (waitqueue_active(&fs_info->replace_wait))
wake_up(&fs_info->replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
DEFINE_WAIT(wait);
again:
percpu_counter_inc(&fs_info->bio_counter);
if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
btrfs_bio_counter_dec(fs_info);
wait_event(fs_info->replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
goto again;
}
}
......@@ -2136,10 +2136,16 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
ret = percpu_counter_init(&fs_info->bio_counter, 0);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
}
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
goto fail_delalloc_bytes;
goto fail_bio_counter;
}
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
......@@ -2214,6 +2220,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->replace_wait);
init_waitqueue_head(&fs_info->scrub_pause_wait);
fs_info->scrub_workers_refcnt = 0;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
......@@ -2966,6 +2973,8 @@ int open_ctree(struct super_block *sb,
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
fail_bio_counter:
percpu_counter_destroy(&fs_info->bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
......@@ -3613,6 +3622,7 @@ int close_ctree(struct btrfs_root *root)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->bio_counter);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
......
......@@ -5263,6 +5263,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
if (err) {
......@@ -5270,7 +5271,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
......@@ -5292,6 +5292,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (bio == bbio->orig_bio)
is_orig_bio = 1;
btrfs_bio_counter_dec(bbio->fs_info);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
......@@ -5440,6 +5442,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
}
#endif
bio->bi_bdev = dev->bdev;
btrfs_bio_counter_inc_noblocked(root->fs_info);
if (async)
btrfs_schedule_bio(root, dev, rw, bio);
else
......@@ -5508,28 +5513,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
length = bio->bi_size;
map_length = length;
btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
mirror_num, &raid_map);
if (ret) /* -ENOMEM */
if (ret) {
btrfs_bio_counter_dec(root->fs_info);
return ret;
}
total_devs = bbio->num_stripes;
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if (raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
return raid56_parity_write(root, bio, bbio,
raid_map, map_length);
ret = raid56_parity_write(root, bio, bbio,
raid_map, map_length);
} else {
return raid56_parity_recover(root, bio, bbio,
raid_map, map_length,
mirror_num);
ret = raid56_parity_recover(root, bio, bbio,
raid_map, map_length,
mirror_num);
}
/*
* FIXME, replace dosen't support raid56 yet, please fix
* it in the future.
*/
btrfs_bio_counter_dec(root->fs_info);
return ret;
}
if (map_length < length) {
......@@ -5571,6 +5586,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
async_submit);
dev_nr++;
}
btrfs_bio_counter_dec(root->fs_info);
return 0;
}
......
......@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
struct btrfs_bio {
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
void *private;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment