Commit 1fb91896 authored by Linus Torvalds

Merge tag 'for-6.11-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - extend tree-checker verification of directory item type

 - fix regression in page/folio and extent state tracking in xarray, the
   dirty status can get out of sync and can cause problems e.g. a hang

 - in send, detect the last extent and allow cloning it instead of sending
   it as a write, which reduces the amount of data transferred in the stream

 - fix checking extent references when cleaning deleted subvolumes

 - fix one more case in the extent map shrinker, let it run only in the
   kswapd context so it does not cause latency spikes during other
   operations

* tag 'for-6.11-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix invalid mapping of extent xarray state
  btrfs: send: allow cloning non-aligned extent if it ends at i_size
  btrfs: only run the extent map shrinker from kswapd tasks
  btrfs: tree-checker: reject BTRFS_FT_UNKNOWN dir type
  btrfs: check delayed refs when we're checking if a ref exists
parents d07b4328 6252690f
...@@ -1134,6 +1134,73 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt ...@@ -1134,6 +1134,73 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
return find_ref_head(delayed_refs, bytenr, false); return find_ref_head(delayed_refs, bytenr, false);
} }
static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
{
int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY;
if (type < entry->type)
return -1;
if (type > entry->type)
return 1;
if (type == BTRFS_TREE_BLOCK_REF_KEY) {
if (root < entry->ref_root)
return -1;
if (root > entry->ref_root)
return 1;
} else {
if (parent < entry->parent)
return -1;
if (parent > entry->parent)
return 1;
}
return 0;
}
/*
* Check to see if a given root/parent reference is attached to the head. This
* only checks for BTRFS_ADD_DELAYED_REF references that match, as that
* indicates the reference exists for the given root or parent. This is for
* tree blocks only.
*
* @head: the head of the bytenr we're searching.
* @root: the root objectid of the reference if it is a normal reference.
* @parent: the parent if this is a shared backref.
*/
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent)
{
struct rb_node *node;
bool found = false;
lockdep_assert_held(&head->mutex);
spin_lock(&head->lock);
node = head->ref_tree.rb_root.rb_node;
while (node) {
struct btrfs_delayed_ref_node *entry;
int ret;
entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
ret = find_comp(entry, root, parent);
if (ret < 0) {
node = node->rb_left;
} else if (ret > 0) {
node = node->rb_right;
} else {
/*
* We only want to count ADD actions, as drops mean the
* ref doesn't exist.
*/
if (entry->action == BTRFS_ADD_DELAYED_REF)
found = true;
break;
}
}
spin_unlock(&head->lock);
return found;
}
void __cold btrfs_delayed_ref_exit(void) void __cold btrfs_delayed_ref_exit(void)
{ {
kmem_cache_destroy(btrfs_delayed_ref_head_cachep); kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
......
...@@ -389,6 +389,8 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); ...@@ -389,6 +389,8 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush); enum btrfs_reserve_flush_enum flush);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent);
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
{ {
......
...@@ -5472,23 +5472,62 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, ...@@ -5472,23 +5472,62 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 parent, struct btrfs_root *root, u64 bytenr, u64 parent,
int level) int level)
{ {
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
struct btrfs_path *path; struct btrfs_path *path;
struct btrfs_extent_inline_ref *iref; struct btrfs_extent_inline_ref *iref;
int ret; int ret;
bool exists = false;
path = btrfs_alloc_path(); path = btrfs_alloc_path();
if (!path) if (!path)
return -ENOMEM; return -ENOMEM;
again:
ret = lookup_extent_backref(trans, path, &iref, bytenr, ret = lookup_extent_backref(trans, path, &iref, bytenr,
root->fs_info->nodesize, parent, root->fs_info->nodesize, parent,
btrfs_root_id(root), level, 0); btrfs_root_id(root), level, 0);
if (ret != -ENOENT) {
/*
* If we get 0 then we found our reference, return 1, else
* return the error if it's not -ENOENT;
*/
btrfs_free_path(path); btrfs_free_path(path);
if (ret == -ENOENT) return (ret < 0 ) ? ret : 1;
return 0; }
if (ret < 0)
return ret; /*
return 1; * We could have a delayed ref with this reference, so look it up while
* we're holding the path open to make sure we don't race with the
* delayed ref running.
*/
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (!head)
goto out;
if (!mutex_trylock(&head->mutex)) {
/*
* We're contended, means that the delayed ref is running, get a
* reference and wait for the ref head to be complete and then
* try again.
*/
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
goto again;
}
exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
mutex_unlock(&head->mutex);
out:
spin_unlock(&delayed_refs->lock);
btrfs_free_path(path);
return exists ? 1 : 0;
} }
/* /*
......
...@@ -1496,6 +1496,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, ...@@ -1496,6 +1496,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
free_extent_map(em); free_extent_map(em);
em = NULL; em = NULL;
/*
* Although the PageDirty bit might be cleared before entering
* this function, subpage dirty bit is not cleared.
* So clear subpage dirty bit here so next time we won't submit
* page for range already written to disk.
*/
btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
btrfs_set_range_writeback(inode, cur, cur + iosize - 1); btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
if (!PageWriteback(page)) { if (!PageWriteback(page)) {
btrfs_err(inode->root->fs_info, btrfs_err(inode->root->fs_info,
...@@ -1503,13 +1510,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, ...@@ -1503,13 +1510,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
page->index, cur, end); page->index, cur, end);
} }
/*
* Although the PageDirty bit is cleared before entering this
* function, subpage dirty bit is not cleared.
* So clear subpage dirty bit here so next time we won't submit
* page for range already written to disk.
*/
btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
cur - page_offset(page)); cur - page_offset(page));
......
...@@ -1147,8 +1147,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c ...@@ -1147,8 +1147,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
return 0; return 0;
/* /*
* We want to be fast because we can be called from any path trying to * We want to be fast so if the lock is busy we don't want to spend time
* allocate memory, so if the lock is busy we don't want to spend time
* waiting for it - either some task is about to do IO for the inode or * waiting for it - either some task is about to do IO for the inode or
* we may have another task shrinking extent maps, here in this code, so * we may have another task shrinking extent maps, here in this code, so
* skip this inode. * skip this inode.
...@@ -1191,9 +1190,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c ...@@ -1191,9 +1190,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
/* /*
* Stop if we need to reschedule or there's contention on the * Stop if we need to reschedule or there's contention on the
* lock. This is to avoid slowing other tasks trying to take the * lock. This is to avoid slowing other tasks trying to take the
* lock and because the shrinker might be called during a memory * lock.
* allocation path and we want to avoid taking a very long time
* and slowing down all sorts of tasks.
*/ */
if (need_resched() || rwlock_needbreak(&tree->lock)) if (need_resched() || rwlock_needbreak(&tree->lock))
break; break;
...@@ -1222,12 +1219,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx ...@@ -1222,12 +1219,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
if (ctx->scanned >= ctx->nr_to_scan) if (ctx->scanned >= ctx->nr_to_scan)
break; break;
/* cond_resched();
* We may be called from memory allocation paths, so we don't
* want to take too much time and slowdown tasks.
*/
if (need_resched())
break;
inode = btrfs_find_first_inode(root, min_ino); inode = btrfs_find_first_inode(root, min_ino);
} }
...@@ -1285,14 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) ...@@ -1285,14 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
ctx.last_ino); ctx.last_ino);
} }
/* while (ctx.scanned < ctx.nr_to_scan) {
* We may be called from memory allocation paths, so we don't want to
* take too much time and slowdown tasks, so stop if we need reschedule.
*/
while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
struct btrfs_root *root; struct btrfs_root *root;
unsigned long count; unsigned long count;
cond_resched();
spin_lock(&fs_info->fs_roots_radix_lock); spin_lock(&fs_info->fs_roots_radix_lock);
count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)&root, (void **)&root,
......
...@@ -6157,25 +6157,51 @@ static int send_write_or_clone(struct send_ctx *sctx, ...@@ -6157,25 +6157,51 @@ static int send_write_or_clone(struct send_ctx *sctx,
u64 offset = key->offset; u64 offset = key->offset;
u64 end; u64 end;
u64 bs = sctx->send_root->fs_info->sectorsize; u64 bs = sctx->send_root->fs_info->sectorsize;
struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
u64 num_bytes;
struct btrfs_inode_info info = { 0 };
end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
if (offset >= end) if (offset >= end)
return 0; return 0;
if (clone_root && IS_ALIGNED(end, bs)) { num_bytes = end - offset;
struct btrfs_file_extent_item *ei;
u64 disk_byte; if (!clone_root)
u64 data_offset; goto write_data;
if (IS_ALIGNED(end, bs))
goto clone_data;
/*
* If the extent end is not aligned, we can clone if the extent ends at
* the i_size of the inode and the clone range ends at the i_size of the
* source inode, otherwise the clone operation fails with -EINVAL.
*/
if (end != sctx->cur_inode_size)
goto write_data;
ret = get_inode_info(clone_root->root, clone_root->ino, &info);
if (ret < 0)
return ret;
if (clone_root->offset + num_bytes == info.size)
goto clone_data;
write_data:
ret = send_extent_data(sctx, path, offset, num_bytes);
sctx->cur_inode_next_write_offset = end;
return ret;
clone_data:
ei = btrfs_item_ptr(path->nodes[0], path->slots[0], ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item); struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
ret = clone_range(sctx, path, clone_root, disk_byte, ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset,
data_offset, offset, end - offset); num_bytes);
} else {
ret = send_extent_data(sctx, path, offset, end - offset);
}
sctx->cur_inode_next_write_offset = end; sctx->cur_inode_next_write_offset = end;
return ret; return ret;
} }
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <linux/btrfs.h> #include <linux/btrfs.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/fs_parser.h> #include <linux/fs_parser.h>
#include <linux/swap.h>
#include "messages.h" #include "messages.h"
#include "delayed-inode.h" #include "delayed-inode.h"
#include "ctree.h" #include "ctree.h"
...@@ -2409,6 +2410,15 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont ...@@ -2409,6 +2410,15 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_fs_info *fs_info = btrfs_sb(sb);
/*
* We may be called from any task trying to allocate memory and we don't
* want to slow it down with scanning and dropping extent maps. It would
* also cause heavy lock contention if many tasks concurrently enter
* here. Therefore only allow kswapd tasks to scan and drop extent maps.
*/
if (!current_is_kswapd())
return 0;
return btrfs_free_extent_maps(fs_info, nr_to_scan); return btrfs_free_extent_maps(fs_info, nr_to_scan);
} }
......
...@@ -569,9 +569,10 @@ static int check_dir_item(struct extent_buffer *leaf, ...@@ -569,9 +569,10 @@ static int check_dir_item(struct extent_buffer *leaf,
/* dir type check */ /* dir type check */
dir_type = btrfs_dir_ftype(leaf, di); dir_type = btrfs_dir_ftype(leaf, di);
if (unlikely(dir_type >= BTRFS_FT_MAX)) { if (unlikely(dir_type <= BTRFS_FT_UNKNOWN ||
dir_type >= BTRFS_FT_MAX)) {
dir_item_err(leaf, slot, dir_item_err(leaf, slot,
"invalid dir item type, have %u expect [0, %u)", "invalid dir item type, have %u expect (0, %u)",
dir_type, BTRFS_FT_MAX); dir_type, BTRFS_FT_MAX);
return -EUCLEAN; return -EUCLEAN;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment