Commit e52ea4cc authored by Filipe Manana's avatar Filipe Manana Committed by Ben Hutchings

Btrfs: fix read corruption of compressed and shared extents

commit 005efedf upstream.

If a file has a range pointing to a compressed extent, followed by
another range that points to the same compressed extent and a read
operation attempts to read both ranges (either completely or part of
them), the pages that correspond to the second range are incorrectly
filled with zeroes.

Consider the following example:

  File layout
  [0 - 8K]                      [8K - 24K]
      |                             |
      |                             |
   points to extent X,         points to extent X,
   offset 4K, length of 8K     offset 0, length 16K

  [extent X, compressed length = 4K uncompressed length = 16K]

If a readpages() call spans the 2 ranges, a single bio to read the extent
is submitted - extent_io.c:submit_extent_page() would only create a new
bio to cover the second range pointing to the extent if the extent it
points to had a different logical address than the extent associated with
the first range. This has a consequence of the compressed read end io
handler (compression.c:end_compressed_bio_read()) finish once the extent
is decompressed into the pages covering the first range, leaving the
remaining pages (belonging to the second range) filled with zeroes (done
by compression.c:btrfs_clear_biovec_end()).

So fix this by submitting the current bio whenever we find a range
pointing to a compressed extent that was preceded by a range with a
different extent map. This is the simplest solution for this corner
case. Making the end io callback populate both ranges (or more, if we
have multiple pointing to the same extent) is a much more complex
solution since each bio is tightly coupled with a single extent map and
the extent maps associated to the ranges pointing to the shared extent
can have different offsets and lengths.

The following test case for fstests triggers the issue:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_cloner

  rm -f $seqres.full

  test_clone_and_read_compressed_extent()
  {
      local mount_opts=$1

      _scratch_mkfs >>$seqres.full 2>&1
      _scratch_mount $mount_opts

      # Create a test file with a single extent that is compressed (the
      # data we write into it is highly compressible no matter which
      # compression algorithm is used, zlib or lzo).
      $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 4K"        \
                      -c "pwrite -S 0xbb 4K 8K"        \
                      -c "pwrite -S 0xcc 12K 4K"       \
                      $SCRATCH_MNT/foo | _filter_xfs_io

      # Now clone our extent into an adjacent offset.
      $CLONER_PROG -s $((4 * 1024)) -d $((16 * 1024)) -l $((8 * 1024)) \
          $SCRATCH_MNT/foo $SCRATCH_MNT/foo

      # Same as before but for this file we clone the extent into a lower
      # file offset.
      $XFS_IO_PROG -f -c "pwrite -S 0xaa 8K 4K"         \
                      -c "pwrite -S 0xbb 12K 8K"        \
                      -c "pwrite -S 0xcc 20K 4K"        \
                      $SCRATCH_MNT/bar | _filter_xfs_io

      $CLONER_PROG -s $((12 * 1024)) -d 0 -l $((8 * 1024)) \
          $SCRATCH_MNT/bar $SCRATCH_MNT/bar

      echo "File digests before unmounting filesystem:"
      md5sum $SCRATCH_MNT/foo | _filter_scratch
      md5sum $SCRATCH_MNT/bar | _filter_scratch

      # Evicting the inode or clearing the page cache before reading
      # again the file would also trigger the bug - reads were returning
      # all bytes in the range corresponding to the second reference to
      # the extent with a value of 0, but the correct data was persisted
      # (it was a bug exclusively in the read path). The issue happened
      # only if the same readpages() call targeted pages belonging to the
      # first and second ranges that point to the same compressed extent.
      _scratch_remount

      echo "File digests after mounting filesystem again:"
      # Must match the same digests we got before.
      md5sum $SCRATCH_MNT/foo | _filter_scratch
      md5sum $SCRATCH_MNT/bar | _filter_scratch
  }

  echo -e "\nTesting with zlib compression..."
  test_clone_and_read_compressed_extent "-o compress=zlib"

  _scratch_unmount

  echo -e "\nTesting with lzo compression..."
  test_clone_and_read_compressed_extent "-o compress=lzo"

  status=0
  exit
Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo<quwenruo@cn.fujitsu.com>
Reviewed-by: default avatarLiu Bo <bo.li.liu@oracle.com>
[bwh: Backported to 3.2:
 - Maintain prev_em_start in both functions calling __extent_read_full_page()
   in a loop
 - Adjust context and order]
Signed-off-by: default avatarBen Hutchings <ben@decadent.org.uk>
parent 254a47ce
...@@ -2444,7 +2444,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, ...@@ -2444,7 +2444,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_end_io_t end_io_func, bio_end_io_t end_io_func,
int mirror_num, int mirror_num,
unsigned long prev_bio_flags, unsigned long prev_bio_flags,
unsigned long bio_flags) unsigned long bio_flags,
bool force_bio_submit)
{ {
int ret = 0; int ret = 0;
struct bio *bio; struct bio *bio;
...@@ -2463,6 +2464,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, ...@@ -2463,6 +2464,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
sector; sector;
if (prev_bio_flags != bio_flags || !contig || if (prev_bio_flags != bio_flags || !contig ||
force_bio_submit ||
(tree->ops && tree->ops->merge_bio_hook && (tree->ops && tree->ops->merge_bio_hook &&
tree->ops->merge_bio_hook(page, offset, page_size, bio, tree->ops->merge_bio_hook(page, offset, page_size, bio,
bio_flags)) || bio_flags)) ||
...@@ -2519,7 +2521,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, ...@@ -2519,7 +2521,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page, struct page *page,
get_extent_t *get_extent, get_extent_t *get_extent,
struct bio **bio, int mirror_num, struct bio **bio, int mirror_num,
unsigned long *bio_flags) unsigned long *bio_flags,
u64 *prev_em_start)
{ {
struct inode *inode = page->mapping->host; struct inode *inode = page->mapping->host;
u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
...@@ -2575,6 +2578,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, ...@@ -2575,6 +2578,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
} }
} }
while (cur <= end) { while (cur <= end) {
bool force_bio_submit = false;
if (cur >= last_byte) { if (cur >= last_byte) {
char *userpage; char *userpage;
struct extent_state *cached = NULL; struct extent_state *cached = NULL;
...@@ -2621,6 +2626,49 @@ static int __extent_read_full_page(struct extent_io_tree *tree, ...@@ -2621,6 +2626,49 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
block_start = em->block_start; block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE; block_start = EXTENT_MAP_HOLE;
/*
* If we have a file range that points to a compressed extent
* and it's followed by a consecutive file range that points to
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
* single bio to populate the pages for the 2 ranges because
* this makes the compressed extent read zero out the pages
* belonging to the 2nd range. Imagine the following scenario:
*
* File layout
* [0 - 8K] [8K - 24K]
* | |
* | |
* points to extent X, points to extent X,
* offset 4K, length of 8K offset 0, length 16K
*
* [extent X, compressed length = 4K uncompressed length = 16K]
*
* If the bio to read the compressed extent covers both ranges,
* it will decompress extent X into the pages belonging to the
* first range and then it will stop, zeroing out the remaining
* pages that belong to the other range that points to extent X.
* So here we make sure we submit 2 bios, one for the first
* range and another one for the third range. Both will target
* the same physical extent from disk, but we can't currently
* make the compressed bio endio callback populate the pages
* for both ranges because each compressed bio is tightly
* coupled with a single extent map, and each range can have
* an extent map with a different offset value relative to the
* uncompressed data of our extent and different lengths. This
* is a corner case so we prioritize correctness over
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
prev_em_start && *prev_em_start != (u64)-1 &&
*prev_em_start != em->orig_start)
force_bio_submit = true;
if (prev_em_start)
*prev_em_start = em->orig_start;
free_extent_map(em); free_extent_map(em);
em = NULL; em = NULL;
...@@ -2675,7 +2723,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, ...@@ -2675,7 +2723,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
bdev, bio, pnr, bdev, bio, pnr,
end_bio_extent_readpage, mirror_num, end_bio_extent_readpage, mirror_num,
*bio_flags, *bio_flags,
this_bio_flag); this_bio_flag,
force_bio_submit);
nr++; nr++;
*bio_flags = this_bio_flag; *bio_flags = this_bio_flag;
} }
...@@ -2701,7 +2750,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, ...@@ -2701,7 +2750,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
int ret; int ret;
ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
&bio_flags); &bio_flags, NULL);
if (bio) if (bio)
ret = submit_one_bio(READ, bio, mirror_num, bio_flags); ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
return ret; return ret;
...@@ -2960,7 +3009,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ...@@ -2960,7 +3009,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
sector, iosize, pg_offset, sector, iosize, pg_offset,
bdev, &epd->bio, max_nr, bdev, &epd->bio, max_nr,
end_bio_extent_writepage, end_bio_extent_writepage,
0, 0, 0); 0, 0, 0, false);
if (ret) if (ret)
SetPageError(page); SetPageError(page);
} }
...@@ -3219,6 +3268,7 @@ int extent_readpages(struct extent_io_tree *tree, ...@@ -3219,6 +3268,7 @@ int extent_readpages(struct extent_io_tree *tree,
struct bio *bio = NULL; struct bio *bio = NULL;
unsigned page_idx; unsigned page_idx;
unsigned long bio_flags = 0; unsigned long bio_flags = 0;
u64 prev_em_start = (u64)-1;
for (page_idx = 0; page_idx < nr_pages; page_idx++) { for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_entry(pages->prev, struct page, lru); struct page *page = list_entry(pages->prev, struct page, lru);
...@@ -3228,7 +3278,8 @@ int extent_readpages(struct extent_io_tree *tree, ...@@ -3228,7 +3278,8 @@ int extent_readpages(struct extent_io_tree *tree,
if (!add_to_page_cache_lru(page, mapping, if (!add_to_page_cache_lru(page, mapping,
page->index, GFP_NOFS)) { page->index, GFP_NOFS)) {
__extent_read_full_page(tree, page, get_extent, __extent_read_full_page(tree, page, get_extent,
&bio, 0, &bio_flags); &bio, 0, &bio_flags,
&prev_em_start);
} }
page_cache_release(page); page_cache_release(page);
} }
...@@ -3998,6 +4049,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ...@@ -3998,6 +4049,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
unsigned long num_pages; unsigned long num_pages;
struct bio *bio = NULL; struct bio *bio = NULL;
unsigned long bio_flags = 0; unsigned long bio_flags = 0;
u64 prev_em_start = (u64)-1;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0; return 0;
...@@ -4053,7 +4105,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ...@@ -4053,7 +4105,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ClearPageError(page); ClearPageError(page);
err = __extent_read_full_page(tree, page, err = __extent_read_full_page(tree, page,
get_extent, &bio, get_extent, &bio,
mirror_num, &bio_flags); mirror_num, &bio_flags,
&prev_em_start);
if (err) if (err)
ret = err; ret = err;
} else { } else {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment