Commit fd574a2f authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.18-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - direct IO fixes:

      - restore passing file offset to correctly calculate checksums
        when repairing on read and bio split happens

      - use correct bio when sumitting IO on zoned filesystem

 - zoned mode fixes:

      - fix selection of device to correctly calculate device
        capabilities when allocating a new bio

      - use a dedicated lock for exclusion during relocation

      - fix leaked plug after failure syncing log

 - fix assertion during scrub and relocation

* tag 'for-5.18-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: zoned: use dedicated lock for data relocation
  btrfs: fix assertion failure during scrub due to block group reallocation
  btrfs: fix direct I/O writes for split bios on zoned devices
  btrfs: fix direct I/O read repair for split bios
  btrfs: fix and document the zoned device choice in alloc_new_bio
  btrfs: fix leaked plug after failure syncing log on zoned filesystems
parents d615b541 5f0addf7
......@@ -1060,6 +1060,7 @@ struct btrfs_fs_info {
*/
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
struct mutex zoned_data_reloc_io_lock;
u64 nr_global_roots;
......
......@@ -734,7 +734,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
/* Commit dev_replace state and reserve 1 item for it. */
/*
* Commit dev_replace state and reserve 1 item for it.
* This is crucial to ensure we won't miss copying extents for new block
* groups that are allocated after we started the device replace, and
* must be done after setting up the device replace state.
*/
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
......
......@@ -3157,6 +3157,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
......
......@@ -2658,6 +2658,7 @@ int btrfs_repair_one_sector(struct inode *inode,
repair_bio = btrfs_bio_alloc(1);
repair_bbio = btrfs_bio(repair_bio);
repair_bbio->file_offset = start;
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
......@@ -3333,24 +3334,37 @@ static int alloc_new_bio(struct btrfs_inode *inode,
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
if (wbc) {
struct block_device *bdev;
bdev = fs_info->fs_devices->latest_dev->bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
}
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct btrfs_device *device;
if (wbc) {
/*
* For Zone append we need the correct block_device that we are
* going to write to set in the bio to be able to respect the
* hardware limitation. Look it up here:
*/
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct btrfs_device *dev;
dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
fs_info->sectorsize);
if (IS_ERR(dev)) {
ret = PTR_ERR(dev);
goto error;
}
device = btrfs_zoned_get_device(fs_info, disk_bytenr,
fs_info->sectorsize);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
bio_set_dev(bio, dev->bdev);
} else {
/*
* Otherwise pick the last added device to support
* cgroup writeback. For multi-device file systems this
* means blk-cgroup policies have to always be set on the
* last added/replaced device. This is a bit odd but has
* been like that for a long time.
*/
bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
}
btrfs_bio(bio)->device = device;
wbc_init_bio(wbc, bio);
} else {
ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
}
return 0;
error:
......
......@@ -7810,8 +7810,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
const u64 orig_file_offset = dip->file_offset;
u64 start = orig_file_offset;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
......@@ -7821,6 +7819,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
pgoff = bvec.bv_offset;
for (i = 0; i < nr_sectors; i++) {
u64 start = bbio->file_offset + bio_offset;
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
(!csum || !check_data_csum(inode, bbio,
......@@ -7833,17 +7833,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
} else {
int ret;
ASSERT((start - orig_file_offset) < UINT_MAX);
ret = btrfs_repair_one_sector(inode,
&bbio->bio,
start - orig_file_offset,
bvec.bv_page, pgoff,
ret = btrfs_repair_one_sector(inode, &bbio->bio,
bio_offset, bvec.bv_page, pgoff,
start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
}
start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
bio_offset += sectorsize;
pgoff += sectorsize;
......@@ -7870,6 +7866,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t err = bio->bi_status;
if (err)
......@@ -7880,12 +7877,12 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_iter.bi_size, err);
if (bio_op(bio) == REQ_OP_READ)
err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
err = btrfs_check_read_dio_bio(dip, bbio, !err);
if (err)
dip->dio_bio->bi_status = err;
btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
......@@ -8046,6 +8043,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_bio(bio)->file_offset = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
......
......@@ -3699,6 +3699,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
ASSERT(cache->start <= chunk_offset);
/*
* We are using the commit root to search for device extents, so
* that means we could have found a device extent item from a
* block group that was deleted in the current transaction. The
* logical start offset of the deleted block group, stored at
* @chunk_offset, might be part of the logical address range of
* a new block group (which uses different physical extents).
* In this case btrfs_lookup_block_group() has returned the new
* block group, and its start address is less than @chunk_offset.
*
* We skip such new block groups, because it's pointless to
* process them, as we won't find their extents because we search
* for them using the commit root of the extent tree. For a device
* replace it's also fine to skip it, we won't miss copying them
* to the target device because we have the write duplication
* setup through the regular write path (by btrfs_map_block()),
* and we have committed a transaction when we started the device
* replace, right after setting up the device replace state.
*/
if (cache->start < chunk_offset) {
btrfs_put_block_group(cache);
goto skip;
}
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
spin_lock(&cache->lock);
if (!cache->to_copy) {
......@@ -3822,7 +3847,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
ASSERT(cache->start == chunk_offset);
ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
dev_extent_len);
......
......@@ -3188,6 +3188,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_root->log_mutex);
blk_finish_plug(&plug);
goto out;
}
}
......
......@@ -328,6 +328,9 @@ struct btrfs_fs_devices {
struct btrfs_bio {
unsigned int mirror_num;
/* for direct I/O */
u64 file_offset;
/* @device is for stripe IO submission. */
struct btrfs_device *device;
u8 *csum;
......
......@@ -359,7 +359,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
btrfs_inode_lock(&inode->vfs_inode, 0);
mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
}
static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
......@@ -367,7 +367,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
btrfs_inode_unlock(&inode->vfs_inode, 0);
mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
}
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment