Commit e570fd27 authored by Miao Xie's avatar Miao Xie Committed by Chris Mason

Btrfs: fix broken free space cache after the system crashed

When we mounted the filesystem after the crash, we got the following
message:
  BTRFS error (device xxx): block group xxxx has wrong amount of free space
  BTRFS error (device xxx): failed to load free space cache for block group xxx

It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.

There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
  space cache
- account the size of the allocated space that is used to store the file
  data, if the size is not zero, don't write out the free space cache.

The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: default avatarMiao Xie <miaox@cn.fujitsu.com>
Signed-off-by: default avatarChris Mason <clm@fb.com>
parent 5349d6c3
......@@ -1259,11 +1259,19 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 sectorsize;
u64 cache_generation;
/*
* It is just used for the delayed data space allocation because
* only the data space allocation and the relative metadata update
* can be done cross the transaction.
*/
struct rw_semaphore data_rwsem;
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
......@@ -3316,7 +3324,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data);
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
......@@ -3330,7 +3338,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset, int no_quota);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
int delalloc);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
......
This diff is collapsed.
......@@ -680,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
generation = btrfs_free_space_generation(leaf, header);
btrfs_release_path(path);
if (!BTRFS_I(inode)->generation) {
btrfs_info(root->fs_info,
"The free space cache file (%llu) is invalid. skip it\n",
offset);
return 0;
}
if (BTRFS_I(inode)->generation != generation) {
btrfs_err(root->fs_info,
"free space inode generation (%llu) "
......@@ -1107,6 +1114,20 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (ret)
return -1;
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
down_write(&block_group->data_rwsem);
spin_lock(&block_group->lock);
if (block_group->delalloc_bytes) {
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
up_write(&block_group->data_rwsem);
BTRFS_I(inode)->generation = 0;
ret = 0;
goto out;
}
spin_unlock(&block_group->lock);
}
/* Lock all pages first so we can lock the extent safely. */
io_ctl_prepare_pages(&io_ctl, inode, 0);
......@@ -1145,6 +1166,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (ret)
goto out_nospc;
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
/*
* Release the pages and unlock the extent, we will flush
* them out later
......@@ -1173,6 +1196,10 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
out_nospc:
cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
goto out;
}
......@@ -1192,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
return 0;
}
if (block_group->delalloc_bytes) {
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
return 0;
}
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(root, block_group, path);
......
......@@ -693,7 +693,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
ret = btrfs_reserve_extent(root,
async_extent->compressed_size,
async_extent->compressed_size,
0, alloc_hint, &ins, 1);
0, alloc_hint, &ins, 1, 1);
if (ret) {
int i;
......@@ -794,7 +794,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
out:
return ret;
out_free_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
......@@ -917,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
&ins, 1);
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
......@@ -995,7 +995,7 @@ static noinline int cow_file_range(struct inode *inode,
return ret;
out_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
......@@ -2599,6 +2599,21 @@ record_old_file_extents(struct inode *inode,
return NULL;
}
static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
u64 start, u64 len)
{
struct btrfs_block_group_cache *cache;
cache = btrfs_lookup_block_group(root->fs_info, start);
ASSERT(cache);
spin_lock(&cache->lock);
cache->delalloc_bytes -= len;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
}
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
......@@ -2698,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
if (!ret)
btrfs_release_delalloc_bytes(root,
ordered_extent->start,
ordered_extent->disk_len);
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
......@@ -2750,7 +2769,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(root, ordered_extent->start,
ordered_extent->disk_len);
ordered_extent->disk_len, 1);
}
......@@ -6535,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
alloc_hint, &ins, 1);
alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
ins.offset, ins.offset, ins.offset, 0);
if (IS_ERR(em)) {
btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
return em;
}
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
free_extent_map(em);
return ERR_PTR(ret);
}
......@@ -7437,7 +7456,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
btrfs_free_reserved_extent(root, ordered->start,
ordered->disk_len);
ordered->disk_len, 1);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
......@@ -8819,7 +8838,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
*alloc_hint, &ins, 1);
*alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
......@@ -8833,7 +8852,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
BTRFS_FILE_EXTENT_PREALLOC);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid,
ins.offset);
ins.offset, 0);
btrfs_abort_transaction(trans, root, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment