1. 09 Feb, 2023 1 commit
    • Filipe Manana's avatar
      btrfs: lock the inode in shared mode before starting fiemap · 519b7e13
      Filipe Manana authored
      Currently fiemap does not take the inode's lock (VFS lock), it only locks
      a file range in the inode's io tree. This however can lead to a deadlock
      if we have a concurrent fsync on the file and fiemap code triggers a fault
      when accessing the user space buffer with fiemap_fill_next_extent(). The
      deadlock happens on the inode's i_mmap_lock semaphore, which is taken both
      by fsync and btrfs_page_mkwrite(). This deadlock was recently reported by
      syzbot and triggers a trace like the following:
      
         task:syz-executor361 state:D stack:20264 pid:5668  ppid:5119   flags:0x00004004
         Call Trace:
          <TASK>
          context_switch kernel/sched/core.c:5293 [inline]
          __schedule+0x995/0xe20 kernel/sched/core.c:6606
          schedule+0xcb/0x190 kernel/sched/core.c:6682
          wait_on_state fs/btrfs/extent-io-tree.c:707 [inline]
          wait_extent_bit+0x577/0x6f0 fs/btrfs/extent-io-tree.c:751
          lock_extent+0x1c2/0x280 fs/btrfs/extent-io-tree.c:1742
          find_lock_delalloc_range+0x4e6/0x9c0 fs/btrfs/extent_io.c:488
          writepage_delalloc+0x1ef/0x540 fs/btrfs/extent_io.c:1863
          __extent_writepage+0x736/0x14e0 fs/btrfs/extent_io.c:2174
          extent_write_cache_pages+0x983/0x1220 fs/btrfs/extent_io.c:3091
          extent_writepages+0x219/0x540 fs/btrfs/extent_io.c:3211
          do_writepages+0x3c3/0x680 mm/page-writeback.c:2581
          filemap_fdatawrite_wbc+0x11e/0x170 mm/filemap.c:388
          __filemap_fdatawrite_range mm/filemap.c:421 [inline]
          filemap_fdatawrite_range+0x175/0x200 mm/filemap.c:439
          btrfs_fdatawrite_range fs/btrfs/file.c:3850 [inline]
          start_ordered_ops fs/btrfs/file.c:1737 [inline]
          btrfs_sync_file+0x4ff/0x1190 fs/btrfs/file.c:1839
          generic_write_sync include/linux/fs.h:2885 [inline]
          btrfs_do_write_iter+0xcd3/0x1280 fs/btrfs/file.c:1684
          call_write_iter include/linux/fs.h:2189 [inline]
          new_sync_write fs/read_write.c:491 [inline]
          vfs_write+0x7dc/0xc50 fs/read_write.c:584
          ksys_write+0x177/0x2a0 fs/read_write.c:637
          do_syscall_x64 arch/x86/entry/common.c:50 [inline]
          do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80
          entry_SYSCALL_64_after_hwframe+0x63/0xcd
         RIP: 0033:0x7f7d4054e9b9
         RSP: 002b:00007f7d404fa2f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
         RAX: ffffffffffffffda RBX: 00007f7d405d87a0 RCX: 00007f7d4054e9b9
         RDX: 0000000000000090 RSI: 0000000020000000 RDI: 0000000000000006
         RBP: 00007f7d405a51d0 R08: 0000000000000000 R09: 0000000000000000
         R10: 0000000000000000 R11: 0000000000000246 R12: 61635f65646f6e69
         R13: 65646f7475616f6e R14: 7261637369646f6e R15: 00007f7d405d87a8
          </TASK>
         INFO: task syz-executor361:5697 blocked for more than 145 seconds.
               Not tainted 6.2.0-rc3-syzkaller-00376-g7c698440 #0
         "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
         task:syz-executor361 state:D stack:21216 pid:5697  ppid:5119   flags:0x00004004
         Call Trace:
          <TASK>
          context_switch kernel/sched/core.c:5293 [inline]
          __schedule+0x995/0xe20 kernel/sched/core.c:6606
          schedule+0xcb/0x190 kernel/sched/core.c:6682
          rwsem_down_read_slowpath+0x5f9/0x930 kernel/locking/rwsem.c:1095
          __down_read_common+0x54/0x2a0 kernel/locking/rwsem.c:1260
          btrfs_page_mkwrite+0x417/0xc80 fs/btrfs/inode.c:8526
          do_page_mkwrite+0x19e/0x5e0 mm/memory.c:2947
          wp_page_shared+0x15e/0x380 mm/memory.c:3295
          handle_pte_fault mm/memory.c:4949 [inline]
          __handle_mm_fault mm/memory.c:5073 [inline]
          handle_mm_fault+0x1b79/0x26b0 mm/memory.c:5219
          do_user_addr_fault+0x69b/0xcb0 arch/x86/mm/fault.c:1428
          handle_page_fault arch/x86/mm/fault.c:1519 [inline]
          exc_page_fault+0x7a/0x110 arch/x86/mm/fault.c:1575
          asm_exc_page_fault+0x22/0x30 arch/x86/include/asm/idtentry.h:570
         RIP: 0010:copy_user_short_string+0xd/0x40 arch/x86/lib/copy_user_64.S:233
         Code: 74 0a 89 (...)
         RSP: 0018:ffffc9000570f330 EFLAGS: 00050202
         RAX: ffffffff843e6601 RBX: 00007fffffffefc8 RCX: 0000000000000007
         RDX: 0000000000000000 RSI: ffffc9000570f3e0 RDI: 0000000020000120
         RBP: ffffc9000570f490 R08: 0000000000000000 R09: fffff52000ae1e83
         R10: fffff52000ae1e83 R11: 1ffff92000ae1e7c R12: 0000000000000038
         R13: ffffc9000570f3e0 R14: 0000000020000120 R15: ffffc9000570f3e0
          copy_user_generic arch/x86/include/asm/uaccess_64.h:37 [inline]
          raw_copy_to_user arch/x86/include/asm/uaccess_64.h:58 [inline]
          _copy_to_user+0xe9/0x130 lib/usercopy.c:34
          copy_to_user include/linux/uaccess.h:169 [inline]
          fiemap_fill_next_extent+0x22e/0x410 fs/ioctl.c:144
          emit_fiemap_extent+0x22d/0x3c0 fs/btrfs/extent_io.c:3458
          fiemap_process_hole+0xa00/0xad0 fs/btrfs/extent_io.c:3716
          extent_fiemap+0xe27/0x2100 fs/btrfs/extent_io.c:3922
          btrfs_fiemap+0x172/0x1e0 fs/btrfs/inode.c:8209
          ioctl_fiemap fs/ioctl.c:219 [inline]
          do_vfs_ioctl+0x185b/0x2980 fs/ioctl.c:810
          __do_sys_ioctl fs/ioctl.c:868 [inline]
          __se_sys_ioctl+0x83/0x170 fs/ioctl.c:856
          do_syscall_x64 arch/x86/entry/common.c:50 [inline]
          do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80
          entry_SYSCALL_64_after_hwframe+0x63/0xcd
         RIP: 0033:0x7f7d4054e9b9
         RSP: 002b:00007f7d390d92f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
         RAX: ffffffffffffffda RBX: 00007f7d405d87b0 RCX: 00007f7d4054e9b9
         RDX: 0000000020000100 RSI: 00000000c020660b RDI: 0000000000000005
         RBP: 00007f7d405a51d0 R08: 00007f7d390d9700 R09: 0000000000000000
         R10: 00007f7d390d9700 R11: 0000000000000246 R12: 61635f65646f6e69
         R13: 65646f7475616f6e R14: 7261637369646f6e R15: 00007f7d405d87b8
          </TASK>
      
      What happens is the following:
      
      1) Task A is doing an fsync, enters btrfs_sync_file() and flushes delalloc
         before locking the inode and the i_mmap_lock semaphore, that is, before
         calling btrfs_inode_lock();
      
      2) After task A flushes delalloc and before it calls btrfs_inode_lock(),
         another task dirties a page;
      
      3) Task B starts a fiemap without FIEMAP_FLAG_SYNC, so the page dirtied
         at step 2 remains dirty and unflushed. Then when it enters
         extent_fiemap() and it locks a file range that includes the range of
         the page dirtied in step 2;
      
      4) Task A calls btrfs_inode_lock() and locks the inode (VFS lock) and the
         inode's i_mmap_lock semaphore in write mode. Then it tries to flush
         delalloc by calling start_ordered_ops(), which will block, at
         find_lock_delalloc_range(), when trying to lock the range of the page
         dirtied at step 2, since this range was locked by the fiemap task (at
         step 3);
      
      5) Task B generates a page fault when accessing the user space fiemap
         buffer with a call to fiemap_fill_next_extent().
      
         The fault handler needs to call btrfs_page_mkwrite() for some other
         page of our inode, and there we deadlock when trying to lock the
         inode's i_mmap_lock semaphore in read mode, since the fsync task locked
         it in write mode (step 4) and the fsync task can not progress because
         it's waiting to lock a file range that is currently locked by us (the
         fiemap task, step 3).
      
      Fix this by taking the inode's lock (VFS lock) in shared mode when
      entering fiemap. This effectively serializes fiemap with fsync (except the
      most expensive part of fsync, the log sync), preventing this deadlock.
      
      Reported-by: syzbot+cc35f55c41e34c30dcb5@syzkaller.appspotmail.com
      Link: https://lore.kernel.org/linux-btrfs/00000000000032dc7305f2a66f46@google.com/
      CC: stable@vger.kernel.org # 6.1+
      Reviewed-by: default avatarJosef Bacik <josef@toxicpanda.com>
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      519b7e13
  2. 06 Feb, 2023 1 commit
  3. 27 Jan, 2023 2 commits
    • Qu Wenruo's avatar
      btrfs: raid56: make error_bitmap update atomic · a9ad4d87
      Qu Wenruo authored
      In the rework of raid56 code, there is very limited concurrency in the
      endio context.
      
      Most of the work is done inside the sectors arrays, which different bios
      will never touch the same sector.
      
      But there is a concurrency here for error_bitmap. Both read and write
      endio functions need to touch them, and we can have multiple write bios
      touching the same error bitmap if they all hit some errors.
      
      Here we fix the unprotected bitmap operation by going set_bit() in a
      loop.
      
      Since we have a very small ceiling of the sectors (at most 16 sectors),
      such set_bit() in a loop should be very acceptable.
      
      Fixes: 2942a50d ("btrfs: raid56: introduce btrfs_raid_bio::error_bitmap")
      Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
      Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      a9ad4d87
    • David Sterba's avatar
      btrfs: send: limit number of clones and allocated memory size · 33e17b3f
      David Sterba authored
      The arg->clone_sources_count is u64 and can trigger a warning when a
      huge value is passed from user space and a huge array is allocated.
      Limit the allocated memory to 8MiB (can be increased if needed), which
      in turn limits the number of clone sources to 8M / sizeof(struct
      clone_root) = 8M / 40 = 209715.  Real world number of clones is from
      tens to hundreds, so this is future proof.
      
      Reported-by: syzbot+4376a9a073770c173269@syzkaller.appspotmail.com
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      33e17b3f
  4. 25 Jan, 2023 3 commits
    • Alexander Potapenko's avatar
      btrfs: zlib: zero-initialize zlib workspace · eadd7dec
      Alexander Potapenko authored
      KMSAN reports uses of uninitialized memory in zlib's longest_match()
      called on memory originating from zlib_alloc_workspace().
      This issue is known by zlib maintainers and is claimed to be harmless,
      but to be on the safe side we'd better initialize the memory.
      
      Link: https://zlib.net/zlib_faq.html#faq36
      Reported-by: syzbot+14d9e7602ebdf7ec0a60@syzkaller.appspotmail.com
      CC: stable@vger.kernel.org # 5.4+
      Signed-off-by: default avatarAlexander Potapenko <glider@google.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      eadd7dec
    • Josef Bacik's avatar
      btrfs: limit device extents to the device size · 3c538de0
      Josef Bacik authored
      There was a recent regression in btrfs/177 that started happening with
      the size class patches ("btrfs: introduce size class to block group
      allocator").  This however isn't a regression introduced by those
      patches, but rather the bug was uncovered by a change in behavior in
      these patches.  The patches triggered more chunk allocations in the
      ^free-space-tree case, which uncovered a race with device shrink.
      
      The problem is we will set the device total size to the new size, and
      use this to find a hole for a device extent.  However during shrink we
      may have device extents allocated past this range, so we could
      potentially find a hole in a range past our new shrink size.  We don't
      actually limit our found extent to the device size anywhere, we assume
      that we will not find a hole past our device size.  This isn't true with
      shrink as we're relocating block groups and thus creating holes past the
      device size.
      
      Fix this by making sure we do not search past the new device size, and
      if we wander into any device extents that start after our device size
      simply break from the loop and use whatever hole we've already found.
      
      CC: stable@vger.kernel.org # 4.14+
      Signed-off-by: default avatarJosef Bacik <josef@toxicpanda.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      3c538de0
    • Tanmay Bhushan's avatar
      btrfs: raid56: fix stripes if vertical errors are found · f7c11aff
      Tanmay Bhushan authored
      We take two stripe numbers if vertical errors are found.  In case it is
      just a pstripe it does not matter but in case of raid 6 it matters as
      both stripes need to be fixed.
      
      Fixes: 7a315072 ("btrfs: raid56: do data csum verification during RMW cycle")
      Reviewed-by: default avatarQu Wenruo <wqu@suse.com>
      Signed-off-by: default avatarTanmay Bhushan <007047221b@gmail.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      f7c11aff
  5. 16 Jan, 2023 4 commits
    • Filipe Manana's avatar
      btrfs: fix race between quota rescan and disable leading to NULL pointer deref · b7adbf9a
      Filipe Manana authored
      If we have one task trying to start the quota rescan worker while another
      one is trying to disable quotas, we can end up hitting a race that results
      in the quota rescan worker doing a NULL pointer dereference. The steps for
      this are the following:
      
      1) Quotas are enabled;
      
      2) Task A calls the quota rescan ioctl and enters btrfs_qgroup_rescan().
         It calls qgroup_rescan_init() which returns 0 (success) and then joins a
         transaction and commits it;
      
      3) Task B calls the quota disable ioctl and enters btrfs_quota_disable().
         It clears the bit BTRFS_FS_QUOTA_ENABLED from fs_info->flags and calls
         btrfs_qgroup_wait_for_completion(), which returns immediately since the
         rescan worker is not yet running.
         Then it starts a transaction and locks fs_info->qgroup_ioctl_lock;
      
      4) Task A queues the rescan worker, by calling btrfs_queue_work();
      
      5) The rescan worker starts, and calls rescan_should_stop() at the start
         of its while loop, which results in 0 iterations of the loop, since
         the flag BTRFS_FS_QUOTA_ENABLED was cleared from fs_info->flags by
         task B at step 3);
      
      6) Task B sets fs_info->quota_root to NULL;
      
      7) The rescan worker tries to start a transaction and uses
         fs_info->quota_root as the root argument for btrfs_start_transaction().
         This results in a NULL pointer dereference down the call chain of
         btrfs_start_transaction(). The stack trace is something like the one
         reported in Link tag below:
      
         general protection fault, probably for non-canonical address 0xdffffc0000000041: 0000 [#1] PREEMPT SMP KASAN
         KASAN: null-ptr-deref in range [0x0000000000000208-0x000000000000020f]
         CPU: 1 PID: 34 Comm: kworker/u4:2 Not tainted 6.1.0-syzkaller-13872-gb6bb9676 #0
         Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022
         Workqueue: btrfs-qgroup-rescan btrfs_work_helper
         RIP: 0010:start_transaction+0x48/0x10f0 fs/btrfs/transaction.c:564
         Code: 48 89 fb 48 (...)
         RSP: 0018:ffffc90000ab7ab0 EFLAGS: 00010206
         RAX: 0000000000000041 RBX: 0000000000000208 RCX: ffff88801779ba80
         RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
         RBP: dffffc0000000000 R08: 0000000000000001 R09: fffff52000156f5d
         R10: fffff52000156f5d R11: 1ffff92000156f5c R12: 0000000000000000
         R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000003
         FS:  0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000
         CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
         CR2: 00007f2bea75b718 CR3: 000000001d0cc000 CR4: 00000000003506e0
         DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
         DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
         Call Trace:
          <TASK>
          btrfs_qgroup_rescan_worker+0x3bb/0x6a0 fs/btrfs/qgroup.c:3402
          btrfs_work_helper+0x312/0x850 fs/btrfs/async-thread.c:280
          process_one_work+0x877/0xdb0 kernel/workqueue.c:2289
          worker_thread+0xb14/0x1330 kernel/workqueue.c:2436
          kthread+0x266/0x300 kernel/kthread.c:376
          ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
          </TASK>
         Modules linked in:
      
      So fix this by having the rescan worker function not attempt to start a
      transaction if it didn't do any rescan work.
      
      Reported-by: syzbot+96977faa68092ad382c4@syzkaller.appspotmail.com
      Link: https://lore.kernel.org/linux-btrfs/000000000000e5454b05f065a803@google.com/
      Fixes: e804861b ("btrfs: fix deadlock between quota disable and qgroup rescan worker")
      CC: stable@vger.kernel.org # 5.4+
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      b7adbf9a
    • Filipe Manana's avatar
      btrfs: fix invalid leaf access due to inline extent during lseek · 1f55ee6d
      Filipe Manana authored
      During lseek, for SEEK_DATA and SEEK_HOLE modes, we access the disk_bytenr
      of an extent without checking its type. However inline extents have their
      data starting the offset of the disk_bytenr field, so accessing that field
      when we have an inline extent can result in either of the following:
      
      1) Interpret the inline extent's data as a disk_bytenr value;
      
      2) In case the inline data is less than 8 bytes, we access part of some
         other item in the leaf, or unused space in the leaf;
      
      3) In case the inline data is less than 8 bytes and the extent item is
         the first item in the leaf, we can access beyond the leaf's limit.
      
      So fix this by not accessing the disk_bytenr field if we have an inline
      extent.
      
      Fixes: b6e83356 ("btrfs: make hole and data seeking a lot more efficient")
      Reported-by: default avatarMatthias Schoepfer <matthias.schoepfer@googlemail.com>
      Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216908
      Link: https://lore.kernel.org/linux-btrfs/7f25442f-b121-2a3a-5a3d-22bcaae83cd4@leemhuis.info/
      CC: stable@vger.kernel.org # 6.1
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      1f55ee6d
    • Christoph Hellwig's avatar
      btrfs: stop using write_one_page in btrfs_scratch_superblock · 26ecf243
      Christoph Hellwig authored
      write_one_page is an awkward interface that expects the page locked and
      ->writepage to be implemented.  Replace that by zeroing the signature
      bytes and synchronize the block device page using the proper bdev
      helpers.
      Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
      [ update changelog ]
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      26ecf243
    • Christoph Hellwig's avatar
      btrfs: factor out scratching of one regular super block · 0e0078f7
      Christoph Hellwig authored
      btrfs_scratch_superblocks open codes scratching super block of a
      non-zoned super block.  Split the code to read, zero and write the
      superblock for regular devices into a separate helper.
      Reviewed-by: default avatarJohannes Thumshirn <johannes.thumshirn@wdc.com>
      Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
      [ update changelog ]
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      0e0078f7
  6. 12 Jan, 2023 5 commits
    • Filipe Manana's avatar
      btrfs: do not abort transaction on failure to update log root · 09e44868
      Filipe Manana authored
      When syncing a log, if we fail to update a log root in the log root tree,
      we are aborting the transaction if the failure was not -ENOSPC. This is
      excessive because there is a chance that a transaction commit can succeed,
      and therefore avoid to turn the filesystem into RO mode. All we need to be
      careful about is to mark the log for a full commit, which we already do,
      to make sure no one commits a super block pointing to an outdated log root
      tree.
      
      So don't abort the transaction if we fail to update a log root in the log
      root tree, and log an error if the failure is not -ENOSPC, so that it does
      not go completely unnoticed.
      
      CC: stable@vger.kernel.org # 6.0+
      Reviewed-by: default avatarJosef Bacik <josef@toxicpanda.com>
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      09e44868
    • Filipe Manana's avatar
      btrfs: do not abort transaction on failure to write log tree when syncing log · 16199ad9
      Filipe Manana authored
      When syncing the log, if we fail to write log tree extent buffers, we mark
      the log for a full commit and abort the transaction. However we don't need
      to abort the transaction, all we really need to do is to make sure no one
      can commit a superblock pointing to new log tree roots. Just because we
      got a failure writing extent buffers for a log tree, it does not mean we
      will also fail to do a transaction commit.
      
      One particular case is if due to a bug somewhere, when writing log tree
      extent buffers, the tree checker detects some corruption and the writeout
      fails because of that. Aborting the transaction can be very disruptive for
      a user, specially if the issue happened on a root filesystem. One example
      is the scenario in the Link tag below, where an isolated corruption on log
      tree leaves was causing transaction aborts when syncing the log.
      
      Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
      CC: stable@vger.kernel.org # 5.15+
      Reviewed-by: default avatarJosef Bacik <josef@toxicpanda.com>
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      16199ad9
    • Filipe Manana's avatar
      btrfs: add missing setup of log for full commit at add_conflicting_inode() · 94cd63ae
      Filipe Manana authored
      When logging conflicting inodes, if we reach the maximum limit of inodes,
      we return BTRFS_LOG_FORCE_COMMIT to force a transaction commit. However
      we don't mark the log for full commit (with btrfs_set_log_full_commit()),
      which means that once we leave the log transaction and before we commit
      the transaction, some other task may sync the log, which is incomplete
      as we have not logged all conflicting inodes, leading to some inconsistent
      in case that log ends up being replayed.
      
      So also call btrfs_set_log_full_commit() at add_conflicting_inode().
      
      Fixes: e09d94c9 ("btrfs: log conflicting inodes without holding log mutex of the initial inode")
      CC: stable@vger.kernel.org # 6.1
      Reviewed-by: default avatarJosef Bacik <josef@toxicpanda.com>
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      94cd63ae
    • Filipe Manana's avatar
      btrfs: fix directory logging due to race with concurrent index key deletion · 8bb6898d
      Filipe Manana authored
      Sometimes we log a directory without holding its VFS lock, so while we
      logging it, dir index entries may be added or removed. This typically
      happens when logging a dentry from a parent directory that points to a
      new directory, through log_new_dir_dentries(), or when while logging
      some other inode we also need to log its parent directories (through
      btrfs_log_all_parents()).
      
      This means that while we are at log_dir_items(), we may not find a dir
      index key we found before, because it was deleted in the meanwhile, so
      a call to btrfs_search_slot() may return 1 (key not found). In that case
      we return from log_dir_items() with a success value (the variable 'err'
      has a value of 0). This can lead to a few problems, specially in the case
      where the variable 'last_offset' has a value of (u64)-1 (and it's
      initialized to that when it was declared):
      
      1) By returning from log_dir_items() with success (0) and a value of
         (u64)-1 for '*last_offset_ret', we end up not logging any other dir
         index keys that follow the missing, just deleted, index key. The
         (u64)-1 value makes log_directory_changes() not call log_dir_items()
         again;
      
      2) Before returning with success (0), log_dir_items(), will log a dir
         index range item covering a range from the last old dentry index
         (stored in the variable 'last_old_dentry_offset') to the value of
         'last_offset'. If 'last_offset' has a value of (u64)-1, then it means
         if the log is persisted and replayed after a power failure, it will
         cause deletion of all the directory entries that have an index number
         between last_old_dentry_offset + 1 and (u64)-1;
      
      3) We can end up returning from log_dir_items() with
         ctx->last_dir_item_offset having a lower value than
         inode->last_dir_index_offset, because the former is set to the current
         key we are processing at process_dir_items_leaf(), and at the end of
         log_directory_changes() we set inode->last_dir_index_offset to the
         current value of ctx->last_dir_item_offset. So if for example a
         deletion of a lower dir index key happened, we set
         ctx->last_dir_item_offset to that index value, then if we return from
         log_dir_items() because btrfs_search_slot() returned 1, we end up
         returning from log_dir_items() with success (0) and then
         log_directory_changes() sets inode->last_dir_index_offset to a lower
         value than it had before.
         This can result in unpredictable and unexpected behaviour when we
         need to log again the directory in the same transaction, and can result
         in ending up with a log tree leaf that has duplicated keys, as we do
         batch insertions of dir index keys into a log tree.
      
      So fix this by making log_dir_items() move on to the next dir index key
      if it does not find the one it was looking for.
      Reported-by: default avatarDavid Arendt <admin@prnet.org>
      Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
      CC: stable@vger.kernel.org # 4.14+
      Reviewed-by: default avatarJosef Bacik <josef@toxicpanda.com>
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      8bb6898d
    • Filipe Manana's avatar
      btrfs: fix missing error handling when logging directory items · 6d3d970b
      Filipe Manana authored
      When logging a directory, at log_dir_items(), if we get an error when
      attempting to search the subvolume tree for a dir index item, we end up
      returning 0 (success) from log_dir_items() because 'err' is left with a
      value of 0.
      
      This can lead to a few problems, specially in the case the variable
      'last_offset' has a value of (u64)-1 (and it's initialized to that when
      it was declared):
      
      1) By returning from log_dir_items() with success (0) and a value of
         (u64)-1 for '*last_offset_ret', we end up not logging any other dir
         index keys that follow the missing, just deleted, index key. The
         (u64)-1 value makes log_directory_changes() not call log_dir_items()
         again;
      
      2) Before returning with success (0), log_dir_items(), will log a dir
         index range item covering a range from the last old dentry index
         (stored in the variable 'last_old_dentry_offset') to the value of
         'last_offset'. If 'last_offset' has a value of (u64)-1, then it means
         if the log is persisted and replayed after a power failure, it will
         cause deletion of all the directory entries that have an index number
         between last_old_dentry_offset + 1 and (u64)-1;
      
      3) We can end up returning from log_dir_items() with
         ctx->last_dir_item_offset having a lower value than
         inode->last_dir_index_offset, because the former is set to the current
         key we are processing at process_dir_items_leaf(), and at the end of
         log_directory_changes() we set inode->last_dir_index_offset to the
         current value of ctx->last_dir_item_offset. So if for example a
         deletion of a lower dir index key happened, we set
         ctx->last_dir_item_offset to that index value, then if we return from
         log_dir_items() because btrfs_search_slot() returned an error, we end up
         returning without any error from log_dir_items() and then
         log_directory_changes() sets inode->last_dir_index_offset to a lower
         value than it had before.
         This can result in unpredictable and unexpected behaviour when we
         need to log again the directory in the same transaction, and can result
         in ending up with a log tree leaf that has duplicated keys, as we do
         batch insertions of dir index keys into a log tree.
      
      Fix this by setting 'err' to the value of 'ret' in case
      btrfs_search_slot() or btrfs_previous_item() returned an error. That will
      result in falling back to a full transaction commit.
      Reported-by: default avatarDavid Arendt <admin@prnet.org>
      Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
      Fixes: e02119d5 ("Btrfs: Add a write ahead tree log to optimize synchronous operations")
      CC: stable@vger.kernel.org # 4.14+
      Reviewed-by: default avatarJosef Bacik <josef@toxicpanda.com>
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      6d3d970b
  7. 11 Jan, 2023 3 commits
    • Naohiro Aota's avatar
      btrfs: zoned: enable metadata over-commit for non-ZNS setup · 85e79ec7
      Naohiro Aota authored
      The commit 79417d04 ("btrfs: zoned: disable metadata overcommit for
      zoned") disabled the metadata over-commit to track active zones properly.
      
      However, it also introduced a heavy overhead by allocating new metadata
      block groups and/or flushing dirty buffers to release the space
      reservations. Specifically, a workload (write only without any sync
      operations) worsen its performance from 343.77 MB/sec (v5.19) to 182.89
      MB/sec (v6.0).
      
      The performance is still bad on current misc-next which is 187.95 MB/sec.
      And, with this patch applied, it improves back to 326.70 MB/sec (+73.82%).
      
      This patch introduces a new fs_info->flag BTRFS_FS_NO_OVERCOMMIT to
      indicate it needs to disable the metadata over-commit. The flag is enabled
      when a device with max active zones limit is loaded into a file-system.
      
      Fixes: 79417d04 ("btrfs: zoned: disable metadata overcommit for zoned")
      CC: stable@vger.kernel.org # 6.0+
      Reviewed-by: default avatarJohannes Thumshirn <johannes.thumshirn@wdc.com>
      Signed-off-by: default avatarNaohiro Aota <naohiro.aota@wdc.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      85e79ec7
    • Qu Wenruo's avatar
      btrfs: qgroup: do not warn on record without old_roots populated · 75181406
      Qu Wenruo authored
      [BUG]
      There are some reports from the mailing list that since v6.1 kernel, the
      WARN_ON() inside btrfs_qgroup_account_extent() gets triggered during
      rescan:
      
        WARNING: CPU: 3 PID: 6424 at fs/btrfs/qgroup.c:2756 btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs]
        CPU: 3 PID: 6424 Comm: snapperd Tainted: P           OE      6.1.2-1-default #1 openSUSE Tumbleweed 05c7a1b1b61d5627475528f71f50444637b5aad7
        RIP: 0010:btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs]
        Call Trace:
         <TASK>
        btrfs_commit_transaction+0x30c/0xb40 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
         ? start_transaction+0xc3/0x5b0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
        btrfs_qgroup_rescan+0x42/0xc0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
         btrfs_ioctl+0x1ab9/0x25c0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
         ? __rseq_handle_notify_resume+0xa9/0x4a0
         ? mntput_no_expire+0x4a/0x240
         ? __seccomp_filter+0x319/0x4d0
         __x64_sys_ioctl+0x90/0xd0
         do_syscall_64+0x5b/0x80
         ? syscall_exit_to_user_mode+0x17/0x40
         ? do_syscall_64+0x67/0x80
        entry_SYSCALL_64_after_hwframe+0x63/0xcd
        RIP: 0033:0x7fd9b790d9bf
         </TASK>
      
      [CAUSE]
      Since commit e15e9f43 ("btrfs: introduce
      BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting"), if
      our qgroup is already in inconsistent state, we will no longer do the
      time-consuming backref walk.
      
      This can leave some qgroup records without a valid old_roots ulist.
      Normally this is fine, as btrfs_qgroup_account_extents() would also skip
      those records if we have NO_ACCOUNTING flag set.
      
      But there is a small window, if we have NO_ACCOUNTING flag set, and
      inserted some qgroup_record without a old_roots ulist, but then the user
      triggered a qgroup rescan.
      
      During btrfs_qgroup_rescan(), we firstly clear NO_ACCOUNTING flag, then
      commit current transaction.
      
      And since we have a qgroup_record with old_roots = NULL, we trigger the
      WARN_ON() during btrfs_qgroup_account_extents().
      
      [FIX]
      Unfortunately due to the introduction of NO_ACCOUNTING flag, the
      assumption that every qgroup_record would have its old_roots populated
      is no longer correct.
      
      Fix the false alerts and drop the WARN_ON().
      Reported-by: default avatarLukas Straub <lukasstraub2@web.de>
      Reported-by: default avatarHanatoK <summersnow9403@gmail.com>
      Fixes: e15e9f43 ("btrfs: introduce BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting")
      CC: stable@vger.kernel.org # 6.1
      Link: https://lore.kernel.org/linux-btrfs/2403c697-ddaf-58ad-3829-0335fc89df09@gmail.com/Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      75181406
    • Qu Wenruo's avatar
      btrfs: add extra error messages to cover non-ENOMEM errors from device_add_list() · ed02363f
      Qu Wenruo authored
      [BUG]
      When test case btrfs/219 (aka, mount a registered device but with a lower
      generation) failed, there is not any useful information for the end user
      to find out what's going wrong.
      
      The mount failure just looks like this:
      
        #  mount -o loop /tmp/219.img2 /mnt/btrfs/
        mount: /mnt/btrfs: mount(2) system call failed: File exists.
               dmesg(1) may have more information after failed mount system call.
      
      While the dmesg contains nothing but the loop device change:
      
        loop1: detected capacity change from 0 to 524288
      
      [CAUSE]
      In device_list_add() we have a lot of extra checks to reject invalid
      cases.
      
      That function also contains the regular device scan result like the
      following prompt:
      
        BTRFS: device fsid 6222333e-f9f1-47e6-b306-55ddd4dcaef4 devid 1 transid 8 /dev/loop0 scanned by systemd-udevd (3027)
      
      But unfortunately not all errors have their own error messages, thus if
      we hit something wrong in device_add_list(), there may be no error
      messages at all.
      
      [FIX]
      Add errors message for all non-ENOMEM errors.
      
      For ENOMEM, I'd say we're in a much worse situation, and there should be
      some OOM messages way before our call sites.
      
      CC: stable@vger.kernel.org # 6.0+
      Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      ed02363f
  8. 03 Jan, 2023 7 commits
    • Qu Wenruo's avatar
      btrfs: fix compat_ro checks against remount · 2ba48b20
      Qu Wenruo authored
      [BUG]
      Even with commit 81d5d614 ("btrfs: enhance unsupported compat RO
      flags handling"), btrfs can still mount a fs with unsupported compat_ro
      flags read-only, then remount it RW:
      
        # btrfs ins dump-super /dev/loop0 | grep compat_ro_flags -A 3
        compat_ro_flags		0x403
      			( FREE_SPACE_TREE |
      			  FREE_SPACE_TREE_VALID |
      			  unknown flag: 0x400 )
      
        # mount /dev/loop0 /mnt/btrfs
        mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/loop0, missing codepage or helper program, or other error.
               dmesg(1) may have more information after failed mount system call.
        ^^^ RW mount failed as expected ^^^
      
        # dmesg -t | tail -n5
        loop0: detected capacity change from 0 to 1048576
        BTRFS: device fsid cb5b82f5-0fdd-4d81-9b4b-78533c324afa devid 1 transid 7 /dev/loop0 scanned by mount (1146)
        BTRFS info (device loop0): using crc32c (crc32c-intel) checksum algorithm
        BTRFS info (device loop0): using free space tree
        BTRFS error (device loop0): cannot mount read-write because of unknown compat_ro features (0x403)
        BTRFS error (device loop0): open_ctree failed
      
        # mount /dev/loop0 -o ro /mnt/btrfs
        # mount -o remount,rw /mnt/btrfs
        ^^^ RW remount succeeded unexpectedly ^^^
      
      [CAUSE]
      Currently we use btrfs_check_features() to check compat_ro flags against
      our current mount flags.
      
      That function get reused between open_ctree() and btrfs_remount().
      
      But for btrfs_remount(), the super block we passed in still has the old
      mount flags, thus btrfs_check_features() still believes we're mounting
      read-only.
      
      [FIX]
      Replace the existing @sb argument with @is_rw_mount.
      
      As originally we only use @sb to determine if the mount is RW.
      
      Now it's callers' responsibility to determine if the mount is RW, and
      since there are only two callers, the check is pretty simple:
      
      - caller in open_ctree()
        Just pass !sb_rdonly().
      
      - caller in btrfs_remount()
        Pass !(*flags & SB_RDONLY), as our check should be against the new
        flags.
      
      Now we can correctly reject the RW remount:
      
        # mount /dev/loop0 -o ro /mnt/btrfs
        # mount -o remount,rw /mnt/btrfs
        mount: /mnt/btrfs: mount point not mounted or bad option.
               dmesg(1) may have more information after failed mount system call.
        # dmesg -t | tail -n 1
        BTRFS error (device loop0: state M): cannot mount read-write because of unknown compat_ro features (0x403)
      Reported-by: default avatarChung-Chiang Cheng <shepjeng@gmail.com>
      Fixes: 81d5d614 ("btrfs: enhance unsupported compat RO flags handling")
      CC: stable@vger.kernel.org # 5.15+
      Reviewed-by: default avatarAnand Jain <anand.jain@oracle.com>
      Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      2ba48b20
    • Qu Wenruo's avatar
      btrfs: always report error in run_one_delayed_ref() · 39f501d6
      Qu Wenruo authored
      Currently we have a btrfs_debug() for run_one_delayed_ref() failure, but
      if end users hit such problem, there will be no chance that
      btrfs_debug() is enabled.  This can lead to very little useful info for
      debugging.
      
      This patch will:
      
      - Add extra info for error reporting
        Including:
        * logical bytenr
        * num_bytes
        * type
        * action
        * ref_mod
      
      - Replace the btrfs_debug() with btrfs_err()
      
      - Move the error reporting into run_one_delayed_ref()
        This is to avoid use-after-free, the @node can be freed in the caller.
      
      This error should only be triggered at most once.
      
      As if run_one_delayed_ref() failed, we trigger the error message, then
      causing the call chain to error out:
      
      btrfs_run_delayed_refs()
      `- btrfs_run_delayed_refs()
         `- btrfs_run_delayed_refs_for_head()
            `- run_one_delayed_ref()
      
      And we will abort the current transaction in btrfs_run_delayed_refs().
      If we have to run delayed refs for the abort transaction,
      run_one_delayed_ref() will just cleanup the refs and do nothing, thus no
      new error messages would be output.
      Reviewed-by: default avatarAnand Jain <anand.jain@oracle.com>
      Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      39f501d6
    • Qu Wenruo's avatar
      btrfs: handle case when repair happens with dev-replace · d73a27b8
      Qu Wenruo authored
      [BUG]
      There is a bug report that a BUG_ON() in btrfs_repair_io_failure()
      (originally repair_io_failure() in v6.0 kernel) got triggered when
      replacing a unreliable disk:
      
        BTRFS warning (device sda1): csum failed root 257 ino 2397453 off 39624704 csum 0xb0d18c75 expected csum 0x4dae9c5e mirror 3
        kernel BUG at fs/btrfs/extent_io.c:2380!
        invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
        CPU: 9 PID: 3614331 Comm: kworker/u257:2 Tainted: G           OE      6.0.0-5-amd64 #1  Debian 6.0.10-2
        Hardware name: Micro-Star International Co., Ltd. MS-7C60/TRX40 PRO WIFI (MS-7C60), BIOS 2.70 07/01/2021
        Workqueue: btrfs-endio btrfs_end_bio_work [btrfs]
        RIP: 0010:repair_io_failure+0x24a/0x260 [btrfs]
        Call Trace:
         <TASK>
         clean_io_failure+0x14d/0x180 [btrfs]
         end_bio_extent_readpage+0x412/0x6e0 [btrfs]
         ? __switch_to+0x106/0x420
         process_one_work+0x1c7/0x380
         worker_thread+0x4d/0x380
         ? rescuer_thread+0x3a0/0x3a0
         kthread+0xe9/0x110
         ? kthread_complete_and_exit+0x20/0x20
         ret_from_fork+0x22/0x30
      
      [CAUSE]
      
      Before the BUG_ON(), we got some read errors from the replace target
      first, note the mirror number (3, which is beyond RAID1 duplication,
      thus it's read from the replace target device).
      
      Then at the BUG_ON() location, we are trying to writeback the repaired
      sectors back the failed device.
      
      The check looks like this:
      
      		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
      				      &map_length, &bioc, mirror_num);
      		if (ret)
      			goto out_counter_dec;
      		BUG_ON(mirror_num != bioc->mirror_num);
      
      But inside btrfs_map_block(), we can modify bioc->mirror_num especially
      for dev-replace:
      
      	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
      	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
      		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
      						    dev_replace->srcdev->devid,
      						    &mirror_num,
      					    &physical_to_patch_in_first_stripe);
      		patch_the_first_stripe_for_dev_replace = 1;
      	}
      
      Thus if we're repairing the replace target device, we're going to
      trigger that BUG_ON().
      
      But in reality, the read failure from the replace target device may be
      that, our replace hasn't reached the range we're reading, thus we're
      reading garbage, but with replace running, the range would be properly
      filled later.
      
      Thus in that case, we don't need to do anything but let the replace
      routine to handle it.
      
      [FIX]
      Instead of a BUG_ON(), just skip the repair if we're repairing the
      device replace target device.
      Reported-by: default avatar小太 <nospam@kota.moe>
      Link: https://lore.kernel.org/linux-btrfs/CACsxjPYyJGQZ+yvjzxA1Nn2LuqkYqTCcUH43S=+wXhyf8S00Ag@mail.gmail.com/
      CC: stable@vger.kernel.org # 6.0+
      Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      d73a27b8
    • Filipe Manana's avatar
      btrfs: fix off-by-one in delalloc search during lseek · 2f2e84ca
      Filipe Manana authored
      During lseek, when searching for delalloc in a range that represents a
      hole and that range has a length of 1 byte, we end up not doing the actual
      delalloc search in the inode's io tree, resulting in not correctly
      reporting the offset with data or a hole. This actually only happens when
      the start offset is 0 because with any other start offset we round it down
      by sector size.
      
      Reproducer:
      
        $ mkfs.btrfs -f /dev/sdc
        $ mount /dev/sdc /mnt/sdc
      
        $ xfs_io -f -c "pwrite -q 0 1" /mnt/sdc/foo
      
        $ xfs_io -c "seek -d 0" /mnt/sdc/foo
        Whence   Result
        DATA	   EOF
      
      It should have reported an offset of 0 instead of EOF.
      
      Fix this by updating btrfs_find_delalloc_in_range() and count_range_bits()
      to deal with inclusive ranges properly. These functions are already
      supposed to work with inclusive end offsets, they just got it wrong in a
      couple places due to off-by-one mistakes.
      
      A test case for fstests will be added later.
      Reported-by: default avatarJoan Bruguera Micó <joanbrugueram@gmail.com>
      Link: https://lore.kernel.org/linux-btrfs/20221223020509.457113-1-joanbrugueram@gmail.com/
      Fixes: b6e83356 ("btrfs: make hole and data seeking a lot more efficient")
      CC: stable@vger.kernel.org # 6.1
      Tested-by: default avatarJoan Bruguera Micó <joanbrugueram@gmail.com>
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      2f2e84ca
    • Qu Wenruo's avatar
      btrfs: fix false alert on bad tree level check · 1d854e4f
      Qu Wenruo authored
      [BUG]
      There is a bug report that on a RAID0 NVMe btrfs system, under heavy
      write load the filesystem can flip RO randomly.
      
      With extra debugging, it shows some tree blocks failed to pass their
      level checks, and if that happens at critical path of a transaction, we
      abort the transaction:
      
        BTRFS error (device nvme0n1p3): level verify failed on logical 5446121209856 mirror 1 wanted 0 found 1
        BTRFS error (device nvme0n1p3: state A): Transaction aborted (error -5)
        BTRFS: error (device nvme0n1p3: state A) in btrfs_finish_ordered_io:3343: errno=-5 IO failure
        BTRFS info (device nvme0n1p3: state EA): forced readonly
      
      [CAUSE]
      The reporter has already bisected to commit 947a6299 ("btrfs: move
      tree block parentness check into validate_extent_buffer()").
      
      And with extra debugging, it shows we can have btrfs_tree_parent_check
      filled with all zeros in the following call trace:
      
        submit_one_bio+0xd4/0xe0
        submit_extent_page+0x142/0x550
        read_extent_buffer_pages+0x584/0x9c0
        ? __pfx_end_bio_extent_readpage+0x10/0x10
        ? folio_unlock+0x1d/0x50
        btrfs_read_extent_buffer+0x98/0x150
        read_tree_block+0x43/0xa0
        read_block_for_search+0x266/0x370
        btrfs_search_slot+0x351/0xd30
        ? lock_is_held_type+0xe8/0x140
        btrfs_lookup_csum+0x63/0x150
        btrfs_csum_file_blocks+0x197/0x6c0
        ? sched_clock_cpu+0x9f/0xc0
        ? lock_release+0x14b/0x440
        ? _raw_read_unlock+0x29/0x50
        btrfs_finish_ordered_io+0x441/0x860
        btrfs_work_helper+0xfe/0x400
        ? lock_is_held_type+0xe8/0x140
        process_one_work+0x294/0x5b0
        worker_thread+0x4f/0x3a0
        ? __pfx_worker_thread+0x10/0x10
        kthread+0xf5/0x120
        ? __pfx_kthread+0x10/0x10
        ret_from_fork+0x2c/0x50
      
      Currently we only copy the btrfs_tree_parent_check structure into bbio
      at read_extent_buffer_pages() after we have assembled the bbio.
      
      But as shown above, submit_extent_page() itself can already submit the
      bbio, leaving the bbio->parent_check uninitialized, and cause the false
      alert.
      
      [FIX]
      Instead of copying @check into bbio after bbio is assembled, we pass
      @check in btrfs_bio_ctrl::parent_check, and copy the content of
      parent_check in submit_one_bio() for metadata read.
      
      By this we should be able to pass the needed info for metadata endio
      verification, and fix the false alert.
      Reported-by: default avatarMikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
      Link: https://lore.kernel.org/linux-btrfs/CABXGCsNzVxo4iq-tJSGm_kO1UggHXgq6CdcHDL=z5FL4njYXSQ@mail.gmail.com/
      Fixes: 947a6299 ("btrfs: move tree block parentness check into validate_extent_buffer()")
      Tested-by: default avatarMikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
      Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      1d854e4f
    • Qu Wenruo's avatar
      btrfs: add error message for metadata level mismatch · 77177ed1
      Qu Wenruo authored
      From a recent regression report, we found that after commit 947a6299
      ("btrfs: move tree block parentness check into
      validate_extent_buffer()") if we have a level mismatch (false alert
      though), there is no error message at all.
      
      This makes later debugging harder.  This patch will add the proper error
      message for such case.
      
      Link: https://lore.kernel.org/linux-btrfs/CABXGCsNzVxo4iq-tJSGm_kO1UggHXgq6CdcHDL=z5FL4njYXSQ@mail.gmail.com/Reviewed-by: default avatarAnand Jain <anand.jain@oracle.com>
      Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      77177ed1
    • Tanmay Bhushan's avatar
      btrfs: fix ASSERT em->len condition in btrfs_get_extent · 946c2923
      Tanmay Bhushan authored
      The em->len value is supposed to be verified in the assertion condition
      though we expect it to be same as the sectorsize.
      
      Fixes: a196a894 ("btrfs: do not reset extent map members for inline extents read")
      Reviewed-by: default avatarAnand Jain <anand.jain@oracle.com>
      Signed-off-by: default avatarTanmay Bhushan <007047221b@gmail.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      946c2923
  9. 20 Dec, 2022 3 commits
    • Filipe Manana's avatar
      btrfs: fix fscrypt name leak after failure to join log transaction · fee4c199
      Filipe Manana authored
      When logging a new name, we don't expect to fail joining a log transaction
      since we know at least one of the inodes was logged before in the current
      transaction. However if we fail for some unexpected reason, we end up not
      freeing the fscrypt name we previously allocated. So fix that by freeing
      the name in case we failed to join a log transaction.
      
      Fixes: ab3c5c18 ("btrfs: setup qstr from dentrys using fscrypt helper")
      Reviewed-by: default avatarSweet Tea Dorminy <sweettea-kernel@dorminy.me>
      Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      fee4c199
    • Josef Bacik's avatar
      btrfs: scrub: fix uninitialized return value in recover_scrub_rbio · e7fc357e
      Josef Bacik authored
      Commit 75b47033 ("btrfs: raid56: migrate recovery and scrub recovery
      path to use error_bitmap") introduced an uninitialized return variable.
      
      This can be caught by gcc 12.1 by -Wmaybe-uninitialized:
      
        CC [M]  fs/btrfs/raid56.o
      fs/btrfs/raid56.c: In function ‘scrub_rbio’:
      fs/btrfs/raid56.c:2801:15: warning: ‘ret’ may be used uninitialized [-Wmaybe-uninitialized]
       2801 |         ret = recover_scrub_rbio(rbio);
            |               ^~~~~~~~~~~~~~~~~~~~~~~~
      fs/btrfs/raid56.c:2649:13: note: ‘ret’ was declared here
       2649 |         int ret;
      
      The warning is disabled by default so we haven't caught that.
      
      Due to the bug the raid56 scrub fstests have been failing since the
      patch was merged, so initialize that.
      
      Fixes: 75b47033 ("btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap")
      Signed-off-by: default avatarJosef Bacik <josef@toxicpanda.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      e7fc357e
    • Boris Burkov's avatar
      btrfs: fix resolving backrefs for inline extent followed by prealloc · 560840af
      Boris Burkov authored
      If a file consists of an inline extent followed by a regular or prealloc
      extent, then a legitimate attempt to resolve a logical address in the
      non-inline region will result in add_all_parents reading the invalid
      offset field of the inline extent. If the inline extent item is placed
      in the leaf eb s.t. it is the first item, attempting to access the
      offset field will not only be meaningless, it will go past the end of
      the eb and cause this panic:
      
        [17.626048] BTRFS warning (device dm-2): bad eb member end: ptr 0x3fd4 start 30834688 member offset 16377 size 8
        [17.631693] general protection fault, probably for non-canonical address 0x5088000000000: 0000 [#1] SMP PTI
        [17.635041] CPU: 2 PID: 1267 Comm: btrfs Not tainted 5.12.0-07246-g75175d5adc74-dirty #199
        [17.637969] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
        [17.641995] RIP: 0010:btrfs_get_64+0xe7/0x110
        [17.649890] RSP: 0018:ffffc90001f73a08 EFLAGS: 00010202
        [17.651652] RAX: 0000000000000001 RBX: ffff88810c42d000 RCX: 0000000000000000
        [17.653921] RDX: 0005088000000000 RSI: ffffc90001f73a0f RDI: 0000000000000001
        [17.656174] RBP: 0000000000000ff9 R08: 0000000000000007 R09: c0000000fffeffff
        [17.658441] R10: ffffc90001f73790 R11: ffffc90001f73788 R12: ffff888106afe918
        [17.661070] R13: 0000000000003fd4 R14: 0000000000003f6f R15: cdcdcdcdcdcdcdcd
        [17.663617] FS:  00007f64e7627d80(0000) GS:ffff888237c80000(0000) knlGS:0000000000000000
        [17.666525] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
        [17.668664] CR2: 000055d4a39152e8 CR3: 000000010c596002 CR4: 0000000000770ee0
        [17.671253] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
        [17.673634] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
        [17.676034] PKRU: 55555554
        [17.677004] Call Trace:
        [17.677877]  add_all_parents+0x276/0x480
        [17.679325]  find_parent_nodes+0xfae/0x1590
        [17.680771]  btrfs_find_all_leafs+0x5e/0xa0
        [17.682217]  iterate_extent_inodes+0xce/0x260
        [17.683809]  ? btrfs_inode_flags_to_xflags+0x50/0x50
        [17.685597]  ? iterate_inodes_from_logical+0xa1/0xd0
        [17.687404]  iterate_inodes_from_logical+0xa1/0xd0
        [17.689121]  ? btrfs_inode_flags_to_xflags+0x50/0x50
        [17.691010]  btrfs_ioctl_logical_to_ino+0x131/0x190
        [17.692946]  btrfs_ioctl+0x104a/0x2f60
        [17.694384]  ? selinux_file_ioctl+0x182/0x220
        [17.695995]  ? __x64_sys_ioctl+0x84/0xc0
        [17.697394]  __x64_sys_ioctl+0x84/0xc0
        [17.698697]  do_syscall_64+0x33/0x40
        [17.700017]  entry_SYSCALL_64_after_hwframe+0x44/0xae
        [17.701753] RIP: 0033:0x7f64e72761b7
        [17.709355] RSP: 002b:00007ffefb067f58 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
        [17.712088] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f64e72761b7
        [17.714667] RDX: 00007ffefb067fb0 RSI: 00000000c0389424 RDI: 0000000000000003
        [17.717386] RBP: 00007ffefb06d188 R08: 000055d4a390d2b0 R09: 00007f64e7340a60
        [17.719938] R10: 0000000000000231 R11: 0000000000000246 R12: 0000000000000001
        [17.722383] R13: 0000000000000000 R14: 00000000c0389424 R15: 000055d4a38fd2a0
        [17.724839] Modules linked in:
      
      Fix the bug by detecting the inline extent item in add_all_parents and
      skipping to the next extent item.
      
      CC: stable@vger.kernel.org # 4.9+
      Reviewed-by: default avatarQu Wenruo <wqu@suse.com>
      Signed-off-by: default avatarBoris Burkov <boris@bur.io>
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
      560840af
  10. 15 Dec, 2022 5 commits
  11. 05 Dec, 2022 6 commits