Commit 783c5170 authored by Chandan Babu R

Merge tag 'repair-tempfiles-6.10_2024-04-15' of...

Merge tag 'repair-tempfiles-6.10_2024-04-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.10-mergeA

xfs: create temporary files for online repair

As mentioned earlier, the repair strategy for file-based metadata is to
build a new copy in a temporary file and swap the file fork mappings
with the metadata inode.  We've built the atomic extent swap facility,
so now we need to build a facility for handling private temporary files.

The first step is to teach the filesystem to ignore the temporary files.
We'll mark them as PRIVATE in the VFS so that the kernel security
modules will leave them alone.  The second step is to give the online
repair code the ability to create a temporary file and reap extents from
the temporary file after the extent swap.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'repair-tempfiles-6.10_2024-04-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: add the ability to reap entire inode forks
  xfs: refactor live buffer invalidation for repairs
  xfs: create temporary files and directories for online repair
  xfs: hide private inodes from bulkstat and handle functions
parents 22d5a8e5 5befb047
......@@ -207,6 +207,7 @@ xfs-y += $(addprefix scrub/, \
refcount_repair.o \
repair.o \
rmap_repair.o \
tempfile.o \
)
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
......
......@@ -143,7 +143,7 @@ xchk_parent_validate(
}
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
if (dp == sc->ip || dp == sc->tempip || !S_ISDIR(VFS_I(dp)->i_mode)) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out_rele;
}
......
This diff is collapsed.
......@@ -13,5 +13,26 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
const struct xfs_owner_info *oinfo);
int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
/*
 * Buffer cache scan context.
 *
 * The caller fills in @daddr, @max_sectors and @daddr_step and zeroes
 * @__sector_count before handing the structure to xrep_bufscan_advance().
 * NOTE(review): the helpers are only declared here; the exact iteration
 * behaviour should be confirmed against their definitions.
 */
struct xrep_bufscan {
/* Disk address for the buffers we want to scan. */
xfs_daddr_t daddr;
/* Maximum number of sectors to scan. */
xfs_daddr_t max_sectors;
/* Each round, increment the search length by this number of sectors. */
xfs_daddr_t daddr_step;
/* Internal scan state; initialize to zero. */
xfs_daddr_t __sector_count;
};
/*
 * Returns a sector count (xfs_daddr_t) for a length given in fs blocks;
 * presumably the value to store in xrep_bufscan.max_sectors -- confirm
 * against the definition.
 */
xfs_daddr_t xrep_bufscan_max_sectors(struct xfs_mount *mp,
xfs_extlen_t fsblocks);
/*
 * Takes a scan context and returns a buffer pointer; presumably NULL when
 * the scan is finished -- confirm against the definition.
 */
struct xfs_buf *xrep_bufscan_advance(struct xfs_mount *mp,
struct xrep_bufscan *scan);
#endif /* __XFS_SCRUB_REAP_H__ */
......@@ -17,6 +17,7 @@
#include "xfs_scrub.h"
#include "xfs_buf_mem.h"
#include "xfs_rmap.h"
#include "xfs_exchrange.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
......@@ -24,6 +25,7 @@
#include "scrub/health.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"
#include "scrub/tempfile.h"
/*
* Online Scrub and Repair
......@@ -211,6 +213,7 @@ xchk_teardown(
sc->buf = NULL;
}
xrep_tempfile_rele(sc);
xchk_fsgates_disable(sc);
return error;
}
......
......@@ -105,6 +105,10 @@ struct xfs_scrub {
/* Lock flags for @ip. */
uint ilock_flags;
/* A temporary file on this filesystem, for staging new metadata. */
struct xfs_inode *tempip;
uint temp_ilock_flags;
/* See the XCHK/XREP state flags below. */
unsigned int flags;
......
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_quota.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_dir2.h"
#include "xfs_exchrange.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/tempfile.h"
/*
 * Create a temporary file for reconstructing metadata, with the intention of
 * atomically exchanging the temporary file's contents with the file that's
 * being repaired.
 *
 * @sc:   scrub context; must not have a transaction (sc->tp) or a tempfile
 *        (sc->tempip) attached yet.
 * @mode: file mode of the new inode; S_ISDIR(mode) selects a directory
 *        (mkdir reservation, directory initialised under the root directory),
 *        anything else uses the tmpfile creation reservation.
 *
 * Returns 0 with sc->tempip pointing at the new inode and
 * sc->temp_ilock_flags cleared.  Returns -EIO if the filesystem is shut
 * down, -EROFS if it is read-only, or the negative errno of whichever step
 * failed.  On failure sc->tempip is always left NULL.
 */
int
xrep_tempfile_create(
	struct xfs_scrub	*sc,
	uint16_t		mode)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_trans	*tp = NULL;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	struct xfs_inode	*dp = mp->m_rootip;
	xfs_ino_t		ino;
	unsigned int		resblks;
	bool			is_dir = S_ISDIR(mode);
	int			error;

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_is_readonly(mp))
		return -EROFS;

	ASSERT(sc->tp == NULL);
	ASSERT(sc->tempip == NULL);

	/*
	 * Make sure that we have allocated dquot(s) on disk.  The temporary
	 * inode should be completely root owned so that we don't fail due to
	 * quota limits.
	 */
	error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
			XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
	if (error)
		return error;

	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, 0);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_IALLOC_SPACE_RES(mp);
		tres = &M_RES(mp)->tr_create_tmpfile;
	}

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error)
		goto out_release_dquots;

	/* Allocate inode, set up directory. */
	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (error)
		goto out_trans_cancel;
	error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0,
			0, false, &sc->tempip);
	if (error)
		goto out_trans_cancel;

	/* Change the ownership of the inode to root. */
	VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID;
	VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID;
	sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
	xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);

	/*
	 * Mark our temporary file as private so that LSMs and the ACL code
	 * don't try to add their own metadata or reason about these files.
	 * The file should never be exposed to userspace.
	 */
	VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
	VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;

	if (is_dir) {
		error = xfs_dir_init(tp, sc->tempip, dp);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);

	/*
	 * Put our temp file on the unlinked list so it's purged automatically.
	 * All file-based metadata being reconstructed using this file must be
	 * atomically exchanged with the original file because the contents
	 * here will be purged when the inode is dropped or log recovery cleans
	 * out the unlinked list.
	 */
	error = xfs_iunlink(tp, sc->tempip);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	trace_xrep_tempfile_create(sc);

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	/* Finish setting up the incore / vfs context. */
	xfs_setup_iops(sc->tempip);
	xfs_finish_inode_setup(sc->tempip);

	sc->temp_ilock_flags = 0;
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (sc->tempip) {
		xfs_finish_inode_setup(sc->tempip);
		xchk_irele(sc, sc->tempip);
		/*
		 * Forget the inode now that our reference is gone.  Scrub
		 * teardown calls xrep_tempfile_rele(), which releases any
		 * non-NULL sc->tempip; leaving the stale pointer behind would
		 * release the inode a second time.
		 */
		sc->tempip = NULL;
	}
out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	return error;
}
/*
 * Try once, without blocking, to take IOLOCK_EXCL on the temporary file.
 * Returns true and records the lock in sc->temp_ilock_flags on success;
 * returns false and leaves the flags untouched otherwise.
 */
bool
xrep_tempfile_iolock_nowait(
	struct xfs_scrub	*sc)
{
	if (!xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL))
		return false;

	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
	return true;
}
/*
 * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
 * Nobody else is expected to hold the tempfile's IOLOCK, but a trylock loop
 * is used anyway to stay clear of deadlocks and lockdep complaints.
 *
 * Returns 0 once the lock is held, or the error reported by
 * xchk_should_terminate() if the scrub is asked to stop while waiting.
 */
int
xrep_tempfile_iolock_polled(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	for (;;) {
		if (xrep_tempfile_iolock_nowait(sc))
			return 0;
		if (xchk_should_terminate(sc, &error))
			return error;
		/* Back off briefly before trying the lock again. */
		delay(1);
	}
}
/*
 * Drop IOLOCK_EXCL on the temporary file and clear the matching bit in
 * sc->temp_ilock_flags.
 */
void
xrep_tempfile_iounlock(
	struct xfs_scrub	*sc)
{
	struct xfs_inode	*tempip = sc->tempip;

	xfs_iunlock(tempip, XFS_IOLOCK_EXCL);
	sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
}
/*
 * Get the temporary file ready for metadata updates: note ILOCK_EXCL in
 * sc->temp_ilock_flags, then take the lock with a blocking xfs_ilock().
 */
void
xrep_tempfile_ilock(
	struct xfs_scrub	*sc)
{
	const unsigned int	lock_mode = XFS_ILOCK_EXCL;

	sc->temp_ilock_flags |= lock_mode;
	xfs_ilock(sc->tempip, lock_mode);
}
/*
 * Make a single non-blocking attempt at ILOCK_EXCL on the temporary file.
 * Returns whether the lock was obtained; sc->temp_ilock_flags is updated
 * only on success.
 */
bool
xrep_tempfile_ilock_nowait(
	struct xfs_scrub	*sc)
{
	bool			locked;

	locked = xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL);
	if (locked)
		sc->temp_ilock_flags |= XFS_ILOCK_EXCL;

	return locked;
}
/*
 * Drop ILOCK_EXCL on the temporary file after an update and clear the
 * matching bit in sc->temp_ilock_flags.
 */
void
xrep_tempfile_iunlock(
	struct xfs_scrub	*sc)
{
	struct xfs_inode	*tempip = sc->tempip;

	xfs_iunlock(tempip, XFS_ILOCK_EXCL);
	sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
}
/*
 * Let go of the temporary file, if the scrub context has one: drop whatever
 * locks sc->temp_ilock_flags says are still held, release the inode, and
 * clear sc->tempip.  Does nothing when no tempfile is attached.
 */
void
xrep_tempfile_rele(
	struct xfs_scrub	*sc)
{
	struct xfs_inode	*tempip = sc->tempip;

	if (tempip == NULL)
		return;

	if (sc->temp_ilock_flags != 0) {
		xfs_iunlock(tempip, sc->temp_ilock_flags);
		sc->temp_ilock_flags = 0;
	}

	xchk_irele(sc, tempip);
	sc->tempip = NULL;
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_TEMPFILE_H__
#define __XFS_SCRUB_TEMPFILE_H__
/*
 * Interface to the temporary file that online repair attaches to a scrub
 * context (sc->tempip) for staging new metadata.
 */
#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Create the tempfile (mode picks file vs. directory) / release it again. */
int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode);
void xrep_tempfile_rele(struct xfs_scrub *sc);
/* IOLOCK_EXCL helpers: single trylock, polling trylock loop, and unlock. */
bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc);
int xrep_tempfile_iolock_polled(struct xfs_scrub *sc);
void xrep_tempfile_iounlock(struct xfs_scrub *sc);
/* ILOCK_EXCL helpers: blocking lock, single trylock, and unlock. */
void xrep_tempfile_ilock(struct xfs_scrub *sc);
bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc);
void xrep_tempfile_iunlock(struct xfs_scrub *sc);
#else
/*
 * Without online repair there is no tempfile, so only the scrub target's
 * IOLOCK is taken.
 * NOTE(review): no xrep_tempfile_iolock_both() is declared in the
 * CONFIG_XFS_ONLINE_REPAIR branch visible here -- confirm where the
 * repair-enabled counterpart lives.
 */
static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc)
{
xchk_ilock(sc, XFS_IOLOCK_EXCL);
}
/* Releasing the tempfile expands to nothing when online repair is disabled. */
# define xrep_tempfile_rele(sc)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
#endif /* __XFS_SCRUB_TEMPFILE_H__ */
......@@ -1539,6 +1539,7 @@ DEFINE_EVENT(xrep_extent_class, name, \
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
DECLARE_EVENT_CLASS(xrep_reap_find_class,
......@@ -1572,6 +1573,7 @@ DEFINE_EVENT(xrep_reap_find_class, name, \
bool crosslinked), \
TP_ARGS(pag, agbno, len, crosslinked))
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select);
DECLARE_EVENT_CLASS(xrep_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
......@@ -2279,6 +2281,100 @@ TRACE_EVENT(xrep_rmap_live_update,
__entry->flags)
);
/*
 * Tracepoint taking the scrub context; records the fields of the scrub
 * request (sc->sm) together with the inode number of the repair tempfile
 * (sc->tempip), so sc->tempip must be non-NULL when this fires.
 */
TRACE_EVENT(xrep_tempfile_create,
TP_PROTO(struct xfs_scrub *sc),
TP_ARGS(sc),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(unsigned int, type)
/* NOTE(review): agno is captured but not emitted by TP_printk below. */
__field(xfs_agnumber_t, agno)
__field(xfs_ino_t, inum)
__field(unsigned int, gen)
__field(unsigned int, flags)
__field(xfs_ino_t, temp_inum)
),
TP_fast_assign(
__entry->dev = sc->mp->m_super->s_dev;
/* Inode number behind sc->file, or 0 when the context has no file. */
__entry->ino = sc->file ? XFS_I(file_inode(sc->file))->i_ino : 0;
__entry->type = sc->sm->sm_type;
__entry->agno = sc->sm->sm_agno;
__entry->inum = sc->sm->sm_ino;
__entry->gen = sc->sm->sm_gen;
__entry->flags = sc->sm->sm_flags;
__entry->temp_inum = sc->tempip->i_ino;
),
TP_printk("dev %d:%d ino 0x%llx type %s inum 0x%llx gen 0x%x flags 0x%x temp_inum 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->inum,
__entry->gen,
__entry->flags,
__entry->temp_inum)
);
/*
 * Tracepoint describing one extent mapping (@irec) from fork @whichfork of
 * inode @ip.  The mapping's start block is split into AG number and AG block
 * number; the file offset, length and extent state are recorded verbatim.
 */
TRACE_EVENT(xreap_ifork_extent,
TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork,
const struct xfs_bmbt_irec *irec),
TP_ARGS(sc, ip, whichfork, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(xfs_fileoff_t, fileoff)
__field(xfs_filblks_t, len)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(int, state)
),
TP_fast_assign(
__entry->dev = sc->mp->m_super->s_dev;
__entry->ino = ip->i_ino;
__entry->whichfork = whichfork;
__entry->fileoff = irec->br_startoff;
__entry->len = irec->br_blockcount;
/* Derive the AG coordinates from the mapping's filesystem block. */
__entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
__entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
__entry->state = irec->br_state;
),
TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->agno,
__entry->agbno,
__entry->fileoff,
__entry->len,
__entry->state)
);
/*
 * Tracepoint recording the AG location and length of an extent mapping
 * (@irec) together with a caller-supplied @scan_blocks count.
 * NOTE(review): presumably @scan_blocks is how many blocks of the extent a
 * buffer-invalidation scan will cover -- confirm at the call site.
 */
TRACE_EVENT(xreap_bmapi_binval_scan,
TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec,
xfs_extlen_t scan_blocks),
TP_ARGS(sc, irec, scan_blocks),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_filblks_t, len)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, scan_blocks)
),
TP_fast_assign(
__entry->dev = sc->mp->m_super->s_dev;
__entry->len = irec->br_blockcount;
/* Derive the AG coordinates from the mapping's filesystem block. */
__entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
__entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
__entry->scan_blocks = scan_blocks;
),
TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
__entry->len,
__entry->scan_blocks)
);
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
......
......@@ -160,7 +160,7 @@ xfs_nfs_get_inode(
}
}
if (VFS_I(ip)->i_generation != generation) {
if (VFS_I(ip)->i_generation != generation || IS_PRIVATE(VFS_I(ip))) {
xfs_irele(ip);
return ERR_PTR(-ESTALE);
}
......
......@@ -42,7 +42,6 @@
struct kmem_cache *xfs_inode_cache;
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
struct xfs_inode *);
......@@ -2151,7 +2150,7 @@ xfs_iunlink_insert_inode(
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
STATIC int
int
xfs_iunlink(
struct xfs_trans *tp,
struct xfs_inode *ip)
......
......@@ -616,6 +616,8 @@ extern struct kmem_cache *xfs_inode_cache;
bool xfs_inode_needs_inactive(struct xfs_inode *ip);
int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
......
......@@ -365,6 +365,9 @@ xfs_vn_link(
if (unlikely(error))
return error;
if (IS_PRIVATE(inode))
return -EPERM;
error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
if (unlikely(error))
return error;
......
......@@ -97,6 +97,14 @@ xfs_bulkstat_one_int(
vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid = i_gid_into_vfsgid(idmap, inode);
/* If this is a private inode, don't leak its details to userspace. */
if (IS_PRIVATE(inode)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
xfs_irele(ip);
error = -EINVAL;
goto out_advance;
}
/* xfs_iget returns the following without needing
* further change.
*/
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment