Commit 1e791234, authored and committed by Dave Chinner

Merge tag 'scrub-iget-fixes-6.4_2023-04-12' of...

Merge tag 'scrub-iget-fixes-6.4_2023-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into guilt/xfs-for-next

xfs: fix iget/irele usage in online fsck [v24.5]

This patchset fixes a handful of problems relating to how we get and
release incore inodes in the online scrub code.  The first patch fixes
how we handle DONTCACHE -- our reasons for setting (or clearing) it
depend entirely on the runtime environment at irele time.  Hence we can
refactor iget and irele to use our own wrappers that set that context
appropriately.

The second patch fixes a race between the iget call in the inode core
scrubber and other writer threads that are allocating or freeing inodes
in the same AG by changing the behavior of xchk_iget (and the inode core
scrub setup function) to return either an incore inode or the AGI buffer
so that we can be sure that the inode cannot disappear on us.

The final patch elides MMAPLOCK from scrub paths when possible.  It did
not fit anywhere else.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
parents a4466722 1fc7a059
...@@ -34,12 +34,12 @@ xchk_setup_inode_bmap( ...@@ -34,12 +34,12 @@ xchk_setup_inode_bmap(
if (xchk_need_intent_drain(sc)) if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
error = xchk_get_inode(sc); error = xchk_iget_for_scrubbing(sc);
if (error) if (error)
goto out; goto out;
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; sc->ilock_flags = XFS_IOLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags); xfs_ilock(sc->ip, XFS_IOLOCK_EXCL);
/* /*
* We don't want any ephemeral data fork updates sitting around * We don't want any ephemeral data fork updates sitting around
...@@ -50,6 +50,9 @@ xchk_setup_inode_bmap( ...@@ -50,6 +50,9 @@ xchk_setup_inode_bmap(
sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping; struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
sc->ilock_flags |= XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL);
inode_dio_wait(VFS_I(sc->ip)); inode_dio_wait(VFS_I(sc->ip));
/* /*
......
This diff is collapsed.
...@@ -32,6 +32,8 @@ xchk_should_terminate( ...@@ -32,6 +32,8 @@ xchk_should_terminate(
} }
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error); xfs_agblock_t bno, int *error);
bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork, bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork,
...@@ -133,10 +135,16 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, ...@@ -133,10 +135,16 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
int xchk_get_inode(struct xfs_scrub *sc); int xchk_iget_for_scrubbing(struct xfs_scrub *sc);
int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum,
struct xfs_buf **agi_bpp, struct xfs_inode **ipp);
void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip);
int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
/* /*
* Don't bother cross-referencing if we already found corruption or cross * Don't bother cross-referencing if we already found corruption or cross
* referencing discrepancies. * referencing discrepancies.
......
...@@ -117,21 +117,15 @@ xchk_dir_actor( ...@@ -117,21 +117,15 @@ xchk_dir_actor(
} }
/* /*
* Grab the inode pointed to by the dirent. We release the * Grab the inode pointed to by the dirent. We release the inode
* inode before we cancel the scrub transaction. Since we're * before we cancel the scrub transaction.
* don't know a priori that releasing the inode won't trigger
* eofblocks cleanup (which allocates what would be a nested
* transaction), we can't use DONTCACHE here because DONTCACHE
* inodes can trigger immediate inactive cleanup of the inode.
* Use UNTRUSTED here to check the allocation status of the inode in
* the inode btrees.
* *
* If _iget returns -EINVAL or -ENOENT then the child inode number is * If _iget returns -EINVAL or -ENOENT then the child inode number is
* garbage and the directory is corrupt. If the _iget returns * garbage and the directory is corrupt. If the _iget returns
* -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
* cross referencing error. Any other error is an operational error. * cross referencing error. Any other error is an operational error.
*/ */
error = xfs_iget(mp, sc->tp, ino, XFS_IGET_UNTRUSTED, 0, &ip); error = xchk_iget(sc, ino, &ip);
if (error == -EINVAL || error == -ENOENT) { if (error == -EINVAL || error == -ENOENT) {
error = -EFSCORRUPTED; error = -EFSCORRUPTED;
xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
...@@ -141,7 +135,7 @@ xchk_dir_actor( ...@@ -141,7 +135,7 @@ xchk_dir_actor(
goto out; goto out;
xchk_dir_check_ftype(sc, offset, ip, name->type); xchk_dir_check_ftype(sc, offset, ip, name->type);
xfs_irele(ip); xchk_irele(sc, ip);
out: out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return -ECANCELED; return -ECANCELED;
......
...@@ -11,8 +11,11 @@ ...@@ -11,8 +11,11 @@
#include "xfs_mount.h" #include "xfs_mount.h"
#include "xfs_btree.h" #include "xfs_btree.h"
#include "xfs_log_format.h" #include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_inode.h" #include "xfs_inode.h"
#include "xfs_ialloc.h" #include "xfs_ialloc.h"
#include "xfs_icache.h"
#include "xfs_da_format.h" #include "xfs_da_format.h"
#include "xfs_reflink.h" #include "xfs_reflink.h"
#include "xfs_rmap.h" #include "xfs_rmap.h"
...@@ -20,48 +23,176 @@ ...@@ -20,48 +23,176 @@
#include "scrub/scrub.h" #include "scrub/scrub.h"
#include "scrub/common.h" #include "scrub/common.h"
#include "scrub/btree.h" #include "scrub/btree.h"
#include "scrub/trace.h"
/*
 * Prepare the attached inode (sc->ip) for scrubbing.
 *
 * Takes IOLOCK_EXCL on the inode, allocates the scrub transaction with a
 * resblks argument of zero, and then takes ILOCK_EXCL.  Every lock that is
 * taken is also recorded in sc->ilock_flags.
 *
 * Returns 0 on success, or the error from xchk_trans_alloc().  On that error
 * path the IOLOCK is still held and still recorded in sc->ilock_flags;
 * NOTE(review): presumably scrub teardown drops it using sc->ilock_flags --
 * confirm against xchk_teardown().
 */
static inline int
xchk_prepare_iscrub(
struct xfs_scrub *sc)
{
int error;
/* Take the IOLOCK first and remember that we hold it. */
sc->ilock_flags = XFS_IOLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
/* The transaction is allocated after the IOLOCK but before the ILOCK. */
error = xchk_trans_alloc(sc, 0);
if (error)
return error;
/* Add the ILOCK to the set of held locks, then actually take it. */
sc->ilock_flags |= XFS_ILOCK_EXCL;
xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
return 0;
}
/*
 * Install this scrub-by-handle inode and prepare it for scrubbing.
 *
 * Passes @ip to xchk_install_handle_inode() and, only if that succeeds,
 * calls xchk_prepare_iscrub() to lock the inode and allocate the scrub
 * transaction.  Returns 0 on success or the first error encountered.
 *
 * NOTE(review): what happens to @ip when the install step fails is decided
 * inside xchk_install_handle_inode(), which is not visible here -- confirm
 * before relying on either ownership outcome.
 */
static inline int
xchk_install_handle_iscrub(
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
int error;
error = xchk_install_handle_inode(sc, ip);
if (error)
return error;
/* Installation succeeded; take the locks and set up the transaction. */
return xchk_prepare_iscrub(sc);
}
/* /*
* Grab total control of the inode metadata. It doesn't matter here if * Grab total control of the inode metadata. In the best case, we grab the
* the file data is still changing; exclusive access to the metadata is * incore inode and take all locks on it. If the incore inode cannot be
* the goal. * constructed due to corruption problems, lock the AGI so that we can single
* step the loading process to fix everything that can go wrong.
*/ */
int int
xchk_setup_inode( xchk_setup_inode(
struct xfs_scrub *sc) struct xfs_scrub *sc)
{ {
struct xfs_imap imap;
struct xfs_inode *ip;
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
struct xfs_buf *agi_bp;
struct xfs_perag *pag;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
int error; int error;
if (xchk_need_intent_drain(sc)) if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
/* We want to scan the opened inode, so lock it and exit. */
if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
sc->ip = ip_in;
return xchk_prepare_iscrub(sc);
}
/* Reject internal metadata files and obviously bad inode numbers. */
if (xfs_internal_inum(mp, sc->sm->sm_ino))
return -ENOENT;
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
/* Try a regular untrusted iget. */
error = xchk_iget(sc, sc->sm->sm_ino, &ip);
if (!error)
return xchk_install_handle_iscrub(sc, ip);
if (error == -ENOENT)
return error;
if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL)
goto out_error;
/* /*
* Try to get the inode. If the verifiers fail, we try again * EINVAL with IGET_UNTRUSTED probably means one of several things:
* in raw mode. * userspace gave us an inode number that doesn't correspond to fs
* space; the inode btree lacks a record for this inode; or there is
* a record, and it says this inode is free.
*
* EFSCORRUPTED/EFSBADCRC could mean that the inode was mappable, but
* some other metadata corruption (e.g. inode forks) prevented
* instantiation of the incore inode. Or it could mean the inobt is
* corrupt.
*
* We want to look up this inode in the inobt directly to distinguish
* three different scenarios: (1) the inobt says the inode is free,
* in which case there's nothing to do; (2) the inobt is corrupt so we
* should flag the corruption and exit to userspace to let it fix the
* inobt; and (3) the inobt says the inode is allocated, but loading it
* failed due to corruption.
*
* Allocate a transaction and grab the AGI to prevent inobt activity in
* this AG. Retry the iget in case someone allocated a new inode after
* the first iget failed.
*/ */
error = xchk_get_inode(sc); error = xchk_trans_alloc(sc, 0);
switch (error) { if (error)
case 0: goto out_error;
break;
case -EFSCORRUPTED: error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
case -EFSBADCRC: if (error == 0) {
return xchk_trans_alloc(sc, 0); /* Actually got the incore inode, so install it and proceed. */
default: xchk_trans_cancel(sc);
return error; return xchk_install_handle_iscrub(sc, ip);
}
if (error == -ENOENT)
goto out_gone;
if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL)
goto out_cancel;
/* Ensure that we have protected against inode allocation/freeing. */
if (agi_bp == NULL) {
ASSERT(agi_bp != NULL);
error = -ECANCELED;
goto out_cancel;
} }
/* Got the inode, lock it and we're ready to go. */ /*
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; * Untrusted iget failed a second time. Let's try an inobt lookup.
xfs_ilock(sc->ip, sc->ilock_flags); * If the inobt doesn't think this is an allocated inode then we'll
error = xchk_trans_alloc(sc, 0); * return ENOENT to signal that the check can be skipped.
*
* If the lookup signals corruption, we'll mark this inode corrupt and
* exit to userspace. There's little chance of fixing anything until
* the inobt is straightened out, but there's nothing we can do here.
*
* If the lookup encounters a runtime error, exit to userspace.
*/
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
if (!pag) {
error = -EFSCORRUPTED;
goto out_cancel;
}
error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
XFS_IGET_UNTRUSTED);
xfs_perag_put(pag);
if (error == -EINVAL || error == -ENOENT)
goto out_gone;
if (error) if (error)
goto out; goto out_cancel;
sc->ilock_flags |= XFS_ILOCK_EXCL;
xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
out: /*
/* scrub teardown will unlock and release the inode for us */ * The lookup succeeded. Chances are the ondisk inode is corrupt and
* preventing iget from reading it. Retain the scrub transaction and
* the AGI buffer to prevent anyone from allocating or freeing inodes.
* This ensures that we preserve the inconsistency between the inobt
* saying the inode is allocated and the icache being unable to load
* the inode until we can flag the corruption in xchk_inode. The
* scrub function has to note the corruption, since we're not really
* supposed to do that from the setup function.
*/
return 0;
out_cancel:
xchk_trans_cancel(sc);
out_error:
trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
error, __return_address);
return error; return error;
out_gone:
/* The file is gone, so there's nothing to check. */
xchk_trans_cancel(sc);
return -ENOENT;
} }
/* Inode core */ /* Inode core */
......
...@@ -127,20 +127,15 @@ xchk_parent_validate( ...@@ -127,20 +127,15 @@ xchk_parent_validate(
expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
/* /*
* Grab this parent inode. We release the inode before we * Grab the parent directory inode. This must be released before we
* cancel the scrub transaction. Since we're don't know a * cancel the scrub transaction.
* priori that releasing the inode won't trigger eofblocks
* cleanup (which allocates what would be a nested transaction)
* if the parent pointer erroneously points to a file, we
* can't use DONTCACHE here because DONTCACHE inodes can trigger
* immediate inactive cleanup of the inode.
* *
* If _iget returns -EINVAL or -ENOENT then the parent inode number is * If _iget returns -EINVAL or -ENOENT then the parent inode number is
* garbage and the directory is corrupt. If the _iget returns * garbage and the directory is corrupt. If the _iget returns
* -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a
* cross referencing error. Any other error is an operational error. * cross referencing error. Any other error is an operational error.
*/ */
error = xfs_iget(mp, sc->tp, parent_ino, XFS_IGET_UNTRUSTED, 0, &dp); error = xchk_iget(sc, parent_ino, &dp);
if (error == -EINVAL || error == -ENOENT) { if (error == -EINVAL || error == -ENOENT) {
error = -EFSCORRUPTED; error = -EFSCORRUPTED;
xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
...@@ -176,7 +171,7 @@ xchk_parent_validate( ...@@ -176,7 +171,7 @@ xchk_parent_validate(
out_unlock: out_unlock:
xfs_iunlock(dp, lock_mode); xfs_iunlock(dp, lock_mode);
out_rele: out_rele:
xfs_irele(dp); xchk_irele(sc, dp);
return error; return error;
} }
......
...@@ -181,7 +181,7 @@ xchk_teardown( ...@@ -181,7 +181,7 @@ xchk_teardown(
xfs_iunlock(sc->ip, sc->ilock_flags); xfs_iunlock(sc->ip, sc->ilock_flags);
if (sc->ip != ip_in && if (sc->ip != ip_in &&
!xfs_internal_inum(sc->mp, sc->ip->i_ino)) !xfs_internal_inum(sc->mp, sc->ip->i_ino))
xfs_irele(sc->ip); xchk_irele(sc, sc->ip);
sc->ip = NULL; sc->ip = NULL;
} }
if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
......
...@@ -767,7 +767,8 @@ xfs_iget( ...@@ -767,7 +767,8 @@ xfs_iget(
return 0; return 0;
out_error_or_again: out_error_or_again:
if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
error == -EAGAIN) {
delay(1); delay(1);
goto again; goto again;
} }
......
...@@ -34,10 +34,13 @@ struct xfs_icwalk { ...@@ -34,10 +34,13 @@ struct xfs_icwalk {
/* /*
* Flags for xfs_iget() * Flags for xfs_iget()
*/ */
#define XFS_IGET_CREATE 0x1 #define XFS_IGET_CREATE (1U << 0)
#define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_UNTRUSTED (1U << 1)
#define XFS_IGET_DONTCACHE 0x4 #define XFS_IGET_DONTCACHE (1U << 2)
#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */ /* don't read from disk or reinit */
#define XFS_IGET_INCORE (1U << 3)
/* Return -EAGAIN immediately if the inode is unavailable. */
#define XFS_IGET_NORETRY (1U << 4)
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp); uint flags, uint lock_flags, xfs_inode_t **ipp);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment