Commit d7eae340 authored by Omar Sandoval, committed by David Sterba

Btrfs: rework delayed ref total_bytes_pinned accounting

The total_bytes_pinned counter is completely broken when accounting
delayed refs:

- If two drops for the same extent are merged, we will decrement
  total_bytes_pinned twice but only increment it once.
- If an add is merged into a drop or vice versa, we will decrement the
  total_bytes_pinned counter but never increment it.
- If multiple references to an extent are dropped, we will account it
  multiple times, potentially vastly over-estimating the number of bytes
  that will be freed by a commit and doing unnecessary work when we're
  close to ENOSPC.

The last issue is relatively minor, but the first two make the
total_bytes_pinned counter leak or underflow very often; the sketch
below shows the net arithmetic. These accounting issues were introduced
in b150a4f1 ("Btrfs: use a percpu to keep track of possibly pinned
bytes"), but they were papered over by zeroing out the counter on every
commit until d288db5d ("Btrfs: fix race of using total_bytes_pinned").
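
To make the first two failure modes concrete, here is the net
arithmetic as a toy userspace sketch (not kernel code; the 4096-byte
extent size is an arbitrary example, and the program only restates the
counts described above):

  /* Net effect on total_bytes_pinned implied by the counts above;
   * each mismatch leaves the counter short by one extent's worth. */
  #include <stdio.h>

  int main(void)
  {
          long num_bytes = 4096;  /* example extent size */

          /* Two merged drops: incremented once, decremented twice. */
          printf("merged drops: net %ld\n", num_bytes - 2 * num_bytes);

          /* An add merged into a drop: decremented, never incremented. */
          printf("merged add/drop: net %ld\n", -num_bytes);
          return 0;
  }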

We need to make sure that an extent is accounted as pinned exactly once
if and only if we will drop references to it when the transaction
is committed. Ideally we would only add to total_bytes_pinned when the
*last* reference is dropped, but this information isn't readily
available for data extents. Again, this over-estimation can lead to
extra commits when we're close to ENOSPC, but it's not as bad as before.

The fix implemented here is to increment total_bytes_pinned when the
total refmod count for an extent goes negative, and to decrement it
when the refmod count goes back to non-negative or once we've run all
of the delayed refs for that extent.
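
As an illustration of this rule, here is a minimal userspace model (not
the kernel implementation; struct head, queue_ref(), and run_refs() are
made-up names for the sketch, while the sign checks are the ones from
the patch):

  /* Toy model of the new accounting rule. ref_mod is the running
   * total refmod count for one extent; pinned stands in for
   * space_info->total_bytes_pinned. */
  #include <assert.h>

  static long pinned;

  struct head {
          long num_bytes;
          int ref_mod;            /* total refmod count for the extent */
  };

  /* Queue one delayed ref update: +1 for an add, -1 for a drop. */
  static void queue_ref(struct head *h, int delta)
  {
          int old = h->ref_mod;
          int new = old + delta;

          h->ref_mod = new;
          if (old >= 0 && new < 0)        /* went negative: count once */
                  pinned += h->num_bytes;
          else if (old < 0 && new >= 0)   /* back to non-negative: uncount */
                  pinned -= h->num_bytes;
  }

  /* Run all delayed refs for the extent. In the kernel, actually
   * pinning the extent then re-adds the bytes, and the commit-time
   * unpin removes them again. */
  static void run_refs(struct head *h)
  {
          if (h->ref_mod < 0)
                  pinned -= h->num_bytes;
          h->ref_mod = 0;
  }

  int main(void)
  {
          struct head h = { .num_bytes = 4096, .ref_mod = 0 };

          queue_ref(&h, -1);      /* first drop: pinned += 4096 */
          queue_ref(&h, -1);      /* merged second drop: no double count */
          queue_ref(&h, +1);      /* add: still negative, no change */
          assert(pinned == 4096); /* counted exactly once */
          run_refs(&h);           /* refs run: pinned -= 4096 */
          assert(pinned == 0);
          return 0;
  }

In the patch below, the queue-time checks are the old_ref_mod /
new_ref_mod tests in btrfs_inc_extent_ref() and btrfs_free_extent(),
the run-time decrement is the head->total_ref_mod < 0 case in
run_one_delayed_ref(), and update_block_group() re-adds the bytes when
the extent is actually pinned.
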
Signed-off-by: Omar Sandoval <osandov@fb.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 7be07912
fs/btrfs/extent-tree.c
@@ -2113,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 owner, u64 offset)
 {
+	int old_ref_mod, new_ref_mod;
 	int ret;
 
 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
@@ -2123,14 +2124,18 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 					num_bytes, parent,
 					root_objectid, (int)owner,
 					BTRFS_ADD_DELAYED_REF, NULL,
-					NULL, NULL);
+					&old_ref_mod, &new_ref_mod);
 	} else {
 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
 					num_bytes, parent,
 					root_objectid, owner, offset,
-					0, BTRFS_ADD_DELAYED_REF, NULL,
-					NULL);
+					0, BTRFS_ADD_DELAYED_REF,
+					&old_ref_mod, &new_ref_mod);
 	}
+
+	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
+		add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
+
 	return ret;
 }
@@ -2434,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		head = btrfs_delayed_node_to_head(node);
 		trace_run_delayed_ref_head(fs_info, node, head, node->action);
 
+		if (head->total_ref_mod < 0) {
+			struct btrfs_block_group_cache *cache;
+
+			cache = btrfs_lookup_block_group(fs_info, node->bytenr);
+			ASSERT(cache);
+			percpu_counter_add(&cache->space_info->total_bytes_pinned,
+					   -node->num_bytes);
+			btrfs_put_block_group(cache);
+		}
+
 		if (insert_reserved) {
 			btrfs_pin_extent(fs_info, node->bytenr,
 					 node->num_bytes, 1);
@@ -6284,6 +6299,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			trace_btrfs_space_reservation(info, "pinned",
 						      cache->space_info->flags,
 						      num_bytes, 1);
+			percpu_counter_add(&cache->space_info->total_bytes_pinned,
+					   num_bytes);
 			set_extent_dirty(info->pinned_extents,
 					 bytenr, bytenr + num_bytes - 1,
 					 GFP_NOFS | __GFP_NOFAIL);
@@ -7053,8 +7070,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				goto out;
 			}
 		}
-		add_pinned_bytes(info, -num_bytes, owner_objectid,
-				 root_objectid);
 	} else {
 		if (found_extent) {
 			BUG_ON(is_data && refs_to_drop !=
@@ -7186,13 +7201,16 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 	int ret;
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		int old_ref_mod, new_ref_mod;
+
 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
 						 buf->len, parent,
 						 root->root_key.objectid,
 						 btrfs_header_level(buf),
 						 BTRFS_DROP_DELAYED_REF, NULL,
-						 NULL, NULL);
+						 &old_ref_mod, &new_ref_mod);
 		BUG_ON(ret); /* -ENOMEM */
+		pin = old_ref_mod >= 0 && new_ref_mod < 0;
 	}
 
 	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
@@ -7241,12 +7259,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 		      u64 owner, u64 offset)
 {
+	int old_ref_mod, new_ref_mod;
 	int ret;
 
 	if (btrfs_is_testing(fs_info))
 		return 0;
 
-	add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
 	/*
 	 * tree log blocks never actually go into the extent allocation
@@ -7256,20 +7274,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 		/* unlocks the pinned mutex */
 		btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
+		old_ref_mod = new_ref_mod = 0;
 		ret = 0;
 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
 					num_bytes, parent,
 					root_objectid, (int)owner,
 					BTRFS_DROP_DELAYED_REF, NULL,
-					NULL, NULL);
+					&old_ref_mod, &new_ref_mod);
 	} else {
 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
 					num_bytes, parent,
 					root_objectid, owner, offset,
 					0, BTRFS_DROP_DELAYED_REF,
-					NULL, NULL);
+					&old_ref_mod, &new_ref_mod);
 	}
+
+	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
+		add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
+
 	return ret;
 }