Commit 6e1d7b89 authored by Chandan Babu R

Merge tag 'repair-ag-btrees-6.8_2023-12-15' of...

Merge tag 'repair-ag-btrees-6.8_2023-12-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.8-mergeB

xfs: online repair of AG btrees

Now that we've spent a lot of time reworking common code in online fsck,
we're ready to start rebuilding the AG space btrees.  This series
implements repair functions for the free space, inode, and refcount
btrees.  Rebuilding the reverse mapping btree is much more intense and
is left for a subsequent patchset.  The fstests counterpart of this
patchset implements stress testing of repair.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'repair-ag-btrees-6.8_2023-12-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: repair refcount btrees
  xfs: repair inode btrees
  xfs: repair free space btrees
  xfs: remove trivial bnobt/inobt scrub helpers
  xfs: roll the scrub transaction after completing a repair
  xfs: move the per-AG datatype bitmaps to separate files
  xfs: create separate structures and code for u32 bitmaps
parents 5e60ca3f 9099cd38
......@@ -145,6 +145,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
xfs-y += $(addprefix scrub/, \
trace.o \
agb_bitmap.o \
agheader.o \
alloc.o \
attr.o \
......@@ -181,8 +182,11 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
alloc_repair.o \
ialloc_repair.o \
newbt.o \
reap.o \
refcount_repair.o \
repair.o \
)
endif
......
......@@ -80,6 +80,16 @@ struct xfs_perag {
*/
uint16_t pag_checked;
uint16_t pag_sick;
#ifdef CONFIG_XFS_ONLINE_REPAIR
/*
* Alternate btree heights so that online repair won't trip the write
* verifiers while rebuilding the AG btrees.
*/
uint8_t pagf_repair_levels[XFS_BTNUM_AGF];
uint8_t pagf_repair_refcount_level;
#endif
spinlock_t pag_state_lock;
spinlock_t pagb_lock; /* lock for pagb_tree */
......
......@@ -411,6 +411,8 @@ xfs_ag_resv_free_extent(
fallthrough;
case XFS_AG_RESV_NONE:
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
fallthrough;
case XFS_AG_RESV_IGNORE:
return;
}
......
......@@ -246,11 +246,9 @@ xfs_alloc_btrec_to_irec(
/* Simple checks for free space records. */
xfs_failaddr_t
xfs_alloc_check_irec(
struct xfs_btree_cur *cur,
struct xfs_perag *pag,
const struct xfs_alloc_rec_incore *irec)
{
struct xfs_perag *pag = cur->bc_ag.pag;
if (irec->ar_blockcount == 0)
return __this_address;
......@@ -299,7 +297,7 @@ xfs_alloc_get_rec(
return error;
xfs_alloc_btrec_to_irec(rec, &irec);
fa = xfs_alloc_check_irec(cur, &irec);
fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
......@@ -3944,7 +3942,7 @@ xfs_alloc_query_range_helper(
xfs_failaddr_t fa;
xfs_alloc_btrec_to_irec(rec, &irec);
fa = xfs_alloc_check_irec(cur, &irec);
fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
......
......@@ -185,7 +185,7 @@ xfs_alloc_get_rec(
union xfs_btree_rec;
void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_alloc_rec_incore *irec);
xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur,
xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag,
const struct xfs_alloc_rec_incore *irec);
int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
......
......@@ -323,7 +323,18 @@ xfs_allocbt_verify(
if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
btnum = XFS_BTNUM_CNTi;
if (pag && xfs_perag_initialised_agf(pag)) {
if (level >= pag->pagf_levels[btnum])
unsigned int maxlevel = pag->pagf_levels[btnum];
#ifdef CONFIG_XFS_ONLINE_REPAIR
/*
* Online repair could be rewriting the free space btrees, so
* we'll validate against the larger of either tree while this
* is going on.
*/
maxlevel = max_t(unsigned int, maxlevel,
pag->pagf_repair_levels[btnum]);
#endif
if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_alloc_maxlevels)
return __this_address;
......
......@@ -5212,3 +5212,29 @@ xfs_btree_destroy_cur_caches(void)
xfs_rmapbt_destroy_cur_cache();
xfs_refcountbt_destroy_cur_cache();
}
/* Move the btree cursor before the first record. */
int
xfs_btree_goto_left_edge(
struct xfs_btree_cur *cur)
{
int stat = 0;
int error;
memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
if (error)
return error;
if (!stat)
return 0;
error = xfs_btree_decrement(cur, 0, &stat);
if (error)
return error;
if (stat != 0) {
ASSERT(0);
return -EFSCORRUPTED;
}
return 0;
}
......@@ -738,4 +738,6 @@ xfs_btree_alloc_cursor(
int __init xfs_btree_init_cur_caches(void);
void xfs_btree_destroy_cur_caches(void);
int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
#endif /* __XFS_BTREE_H__ */
......@@ -95,18 +95,28 @@ xfs_inobt_btrec_to_irec(
irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
/*
 * Compute the freecount of an incore inode record.
 *
 * This is the population count of ir_free.  For a sparse record (as reported
 * by xfs_inobt_issparse() on the holemask), ir_free is first masked with the
 * record's allocation mask.
 */
uint8_t
xfs_inobt_rec_freecount(
	const struct xfs_inobt_rec_incore	*irec)
{
	uint64_t				free_mask = irec->ir_free;

	if (!xfs_inobt_issparse(irec->ir_holemask))
		return hweight64(free_mask);

	return hweight64(free_mask & xfs_inobt_irec_to_allocmask(irec));
}
/* Simple checks for inode records. */
xfs_failaddr_t
xfs_inobt_check_irec(
struct xfs_btree_cur *cur,
struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *irec)
{
uint64_t realfree;
/* Record has to be properly aligned within the AG. */
if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
if (!xfs_verify_agino(pag, irec->ir_startino))
return __this_address;
if (!xfs_verify_agino(cur->bc_ag.pag,
if (!xfs_verify_agino(pag,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
return __this_address;
if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
......@@ -115,12 +125,7 @@ xfs_inobt_check_irec(
if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
return __this_address;
/* if there are no holes, return the first available offset */
if (!xfs_inobt_issparse(irec->ir_holemask))
realfree = irec->ir_free;
else
realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
if (hweight64(realfree) != irec->ir_freecount)
if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
return __this_address;
return NULL;
......@@ -164,7 +169,7 @@ xfs_inobt_get_rec(
return error;
xfs_inobt_btrec_to_irec(mp, rec, irec);
fa = xfs_inobt_check_irec(cur, irec);
fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, irec);
......@@ -2740,7 +2745,7 @@ xfs_ialloc_count_inodes_rec(
xfs_failaddr_t fa;
xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
fa = xfs_inobt_check_irec(cur, &irec);
fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, &irec);
......
......@@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
*/
int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
uint8_t xfs_inobt_rec_freecount(const struct xfs_inobt_rec_incore *irec);
/*
* Inode chunk initialisation routine
......@@ -93,7 +94,7 @@ union xfs_btree_rec;
void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur,
xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len,
......
......@@ -123,11 +123,9 @@ xfs_refcount_btrec_to_irec(
/* Simple checks for refcount records. */
xfs_failaddr_t
xfs_refcount_check_irec(
struct xfs_btree_cur *cur,
struct xfs_perag *pag,
const struct xfs_refcount_irec *irec)
{
struct xfs_perag *pag = cur->bc_ag.pag;
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
return __this_address;
......@@ -179,7 +177,7 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
fa = xfs_refcount_check_irec(cur, irec);
fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
......@@ -1899,7 +1897,7 @@ xfs_refcount_recover_extent(
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL ||
if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
kfree(rr);
......
......@@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur,
union xfs_btree_rec;
extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur,
xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
const struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
......
......@@ -226,7 +226,18 @@ xfs_refcountbt_verify(
level = be16_to_cpu(block->bb_level);
if (pag && xfs_perag_initialised_agf(pag)) {
if (level >= pag->pagf_refcount_level)
unsigned int maxlevel = pag->pagf_refcount_level;
#ifdef CONFIG_XFS_ONLINE_REPAIR
/*
* Online repair could be rewriting the refcount btree, so
* we'll validate against the larger of either tree while this
* is going on.
*/
maxlevel = max_t(unsigned int, maxlevel,
pag->pagf_repair_refcount_level);
#endif
if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_refc_maxlevels)
return __this_address;
......
......@@ -208,6 +208,13 @@ enum xfs_ag_resv_type {
XFS_AG_RESV_AGFL,
XFS_AG_RESV_METADATA,
XFS_AG_RESV_RMAPBT,
/*
* Don't increase fdblocks when freeing extent. This is a pony for
* the bnobt repair functions to re-free the free space without
* altering fdblocks. If you think you need this you're wrong.
*/
XFS_AG_RESV_IGNORE,
};
/* Results of scanning a btree keyspace to check occupancy. */
......
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2018-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_bit.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "bitmap.h"
#include "scrub/agb_bitmap.h"
/*
* Record all btree blocks seen while iterating all records of a btree.
*
* We know that the btree query_all function starts at the left edge and walks
* towards the right edge of the tree. Therefore, we know that we can walk up
* the btree cursor towards the root; if the pointer for a given level points
* to the first record/key in that block, we haven't seen this block before;
* and therefore we need to remember that we saw this block in the btree.
*
* So if our btree is:
*
* 4
* / | \
* 1 2 3
*
* Pretend for this example that each leaf block has 100 btree records. For
* the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
* record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
* we record block 4. The list is [1, 4].
*
* For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
* the loop. The list remains [1, 4].
*
* For the 101st btree record, we've moved onto leaf block 2. Now
* bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
* bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
*
* For the 102nd record, bc_levels[0].ptr == 2, so we continue.
*
* For the 201st record, we've moved on to leaf block 3.
* bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
*
* For the 300th record we just exit, with the list being [1, 4, 2, 3].
*/
/*
 * Mark a btree block to the agblock bitmap.
 *
 * @priv is the struct xagb_bitmap to update.  The buffer backing @level of
 * the cursor is translated from a disk address to an AG block number and that
 * single block is set in the bitmap.  If the cursor has no buffer at that
 * level, nothing is recorded and 0 is returned.
 */
STATIC int
xagb_bitmap_visit_btblock(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*priv)
{
	struct xagb_bitmap	*bitmap = priv;
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno;

	xfs_btree_get_block(cur, level, &bp);
	if (!bp)
		return 0;

	fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
	return xagb_bitmap_set(bitmap, XFS_FSB_TO_AGBNO(mp, fsbno), 1);
}
/*
 * Mark all (per-AG) btree blocks in the agblock bitmap.
 *
 * Asks xfs_btree_visit_blocks() to walk the btree behind @cur with
 * XFS_BTREE_VISIT_ALL, calling xagb_bitmap_visit_btblock() with @bitmap as
 * its private argument.  Returns whatever xfs_btree_visit_blocks() returns.
 */
int
xagb_bitmap_set_btblocks(
struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur)
{
return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
XFS_BTREE_VISIT_ALL, bitmap);
}
/*
* Record all the buffers pointed to by the btree cursor. Callers already
* engaged in a btree walk should call this function to capture the list of
* blocks going from the leaf towards the root.
*/
int
xagb_bitmap_set_btcur_path(
struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur)
{
int i;
int error;
for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
error = xagb_bitmap_visit_btblock(cur, i, bitmap);
if (error)
return error;
}
return 0;
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2018-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_AGB_BITMAP_H__
#define __XFS_SCRUB_AGB_BITMAP_H__
/*
 * Bitmaps, but type-checked for xfs_agblock_t.
 *
 * struct xagb_bitmap wraps a struct xbitmap32; every inline helper below
 * forwards its arguments unchanged to the xbitmap32 function of the same
 * name, operating on the wrapped bitmap.
 */
struct xagb_bitmap {
struct xbitmap32 agbitmap;
};
/* Forward to xbitmap32_init() on the wrapped bitmap. */
static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
{
xbitmap32_init(&bitmap->agbitmap);
}
/* Forward to xbitmap32_destroy() on the wrapped bitmap. */
static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
{
xbitmap32_destroy(&bitmap->agbitmap);
}
/* Forward @start and @len to xbitmap32_clear(); returns its result. */
static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
xfs_agblock_t start, xfs_extlen_t len)
{
return xbitmap32_clear(&bitmap->agbitmap, start, len);
}
/* Forward @start and @len to xbitmap32_set(); returns its result. */
static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
xfs_agblock_t start, xfs_extlen_t len)
{
return xbitmap32_set(&bitmap->agbitmap, start, len);
}
/* Forward @start and the @len pointer to xbitmap32_test(); returns its result. */
static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap,
xfs_agblock_t start, xfs_extlen_t *len)
{
return xbitmap32_test(&bitmap->agbitmap, start, len);
}
/*
 * Forward to xbitmap32_disunion() with the wrapped bitmaps of @bitmap and
 * @sub; returns its result.
 */
static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
struct xagb_bitmap *sub)
{
return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap);
}
/* Forward to xbitmap32_hweight() on the wrapped bitmap. */
static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
{
return xbitmap32_hweight(&bitmap->agbitmap);
}
/* Forward to xbitmap32_empty() on the wrapped bitmap. */
static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
{
return xbitmap32_empty(&bitmap->agbitmap);
}
/* Forward @fn and @priv to xbitmap32_walk(); returns its result. */
static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
xbitmap32_walk_fn fn, void *priv)
{
return xbitmap32_walk(&bitmap->agbitmap, fn, priv);
}
/* Helpers that record btree blocks reached through a btree cursor. */
int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */
......@@ -26,6 +26,7 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/reap.h"
/* Superblock */
......@@ -72,7 +73,7 @@ xrep_superblock(
/* Write this to disk. */
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
return error;
return 0;
}
/* AGF */
......@@ -341,7 +342,7 @@ xrep_agf_commit_new(
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
return 0;
return xrep_roll_ag_trans(sc);
}
/* Repair the AGF. v5 filesystems only. */
......@@ -494,12 +495,11 @@ xrep_agfl_walk_rmap(
/* Strike out the blocks that are cross-linked according to the rmapbt. */
STATIC int
xrep_agfl_check_extent(
uint64_t start,
uint64_t len,
uint32_t agbno,
uint32_t len,
void *priv)
{
struct xrep_agfl *ra = priv;
xfs_agblock_t agbno = start;
xfs_agblock_t last_agbno = agbno + len - 1;
int error;
......@@ -647,8 +647,8 @@ struct xrep_agfl_fill {
/* Fill the AGFL with whatever blocks are in this extent. */
static int
xrep_agfl_fill(
uint64_t start,
uint64_t len,
uint32_t start,
uint32_t len,
void *priv)
{
struct xrep_agfl_fill *af = priv;
......@@ -789,6 +789,9 @@ xrep_agfl(
/* Dump any AGFL overflow. */
error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
if (error)
goto err;
err:
xagb_bitmap_destroy(&agfl_extents);
return error;
......@@ -962,7 +965,7 @@ xrep_agi_commit_new(
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
return 0;
return xrep_roll_ag_trans(sc);
}
/* Repair the AGI. */
......
......@@ -9,13 +9,16 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "xfs_ag.h"
#include "scrub/repair.h"
/*
* Set us up to scrub free space btrees.
......@@ -24,10 +27,19 @@ int
xchk_setup_ag_allocbt(
struct xfs_scrub *sc)
{
int error;
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
return xchk_setup_ag_btree(sc, false);
error = xchk_setup_ag_btree(sc, false);
if (error)
return error;
if (xchk_could_repair(sc))
return xrep_setup_ag_allocbt(sc);
return 0;
}
/* Free space btree scrubber. */
......@@ -127,7 +139,7 @@ xchk_allocbt_rec(
struct xchk_alloc *ca = bs->private;
xfs_alloc_btrec_to_irec(rec, &irec);
if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) {
if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
......@@ -138,31 +150,27 @@ xchk_allocbt_rec(
return 0;
}
/* Scrub the freespace btrees for some AG. */
STATIC int
/* Scrub one of the freespace btrees for some AG. */
int
xchk_allocbt(
struct xfs_scrub *sc,
xfs_btnum_t which)
struct xfs_scrub *sc)
{
struct xchk_alloc ca = { };
struct xfs_btree_cur *cur;
cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
}
int
xchk_bnobt(
struct xfs_scrub *sc)
{
return xchk_allocbt(sc, XFS_BTNUM_BNO);
}
switch (sc->sm->sm_type) {
case XFS_SCRUB_TYPE_BNOBT:
cur = sc->sa.bno_cur;
break;
case XFS_SCRUB_TYPE_CNTBT:
cur = sc->sa.cnt_cur;
break;
default:
ASSERT(0);
return -EIO;
}
int
xchk_cntbt(
struct xfs_scrub *sc)
{
return xchk_allocbt(sc, XFS_BTNUM_CNT);
return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
}
/* xref check that the extent is not free */
......
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2018-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_inode.h"
#include "xfs_refcount.h"
#include "xfs_extent_busy.h"
#include "xfs_health.h"
#include "xfs_bmap.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
/*
* Free Space Btree Repair
* =======================
*
* The reverse mappings are supposed to record all space usage for the entire
* AG. Therefore, we can recreate the free extent records in an AG by looking
* for gaps in the physical extents recorded in the rmapbt. These records are
* staged in @free_records. Identifying the gaps is more difficult on a
* reflink filesystem because rmap records are allowed to overlap.
*
* Because the final step of building a new index is to free the space used by
* the old index, repair needs to find that space. Unfortunately, all
* structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
* the same rmapbt owner code (OWN_AG), so this is not straightforward.
*
* The scan of the reverse mapping information records the space used by OWN_AG
* in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While
* walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
* record all visited rmap btree blocks and all blocks owned by the AGFL.
*
* After that is where the definition of old_allocbt_blocks shifts. This
* expression identifies possible former bnobt/cntbt blocks:
*
* (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
*
* Substituting from above definitions, that becomes:
*
* old_allocbt_blocks & ~not_allocbt_blocks
*
* The OWN_AG bitmap itself isn't needed after this point, so what we really do
* instead is:
*
* old_allocbt_blocks &= ~not_allocbt_blocks;
*
* After this point, @old_allocbt_blocks is a bitmap of alleged former
* bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first
* parameter in place to avoid copying records around.
*
* Next, some of the space described by @free_records is diverted to the newbt
* reservation and used to format new btree blocks. The remaining records are
* written to the new btree indices. We reconstruct both bnobt and cntbt at
* the same time since we've already done all the work.
*
* We use the prefix 'xrep_abt' here because we regenerate both free space
* allocation btrees at the same time.
*/
/* In-memory state for rebuilding the free space btrees (bnobt and cntbt). */
struct xrep_abt {
/* Blocks owned by the rmapbt or the agfl. */
struct xagb_bitmap not_allocbt_blocks;
/*
 * All OWN_AG blocks.  xrep_abt_find_freespace() later removes
 * not_allocbt_blocks from this bitmap in place.
 */
struct xagb_bitmap old_allocbt_blocks;
/*
 * New bnobt information. All btree block reservations are added to
 * the reservation list in new_bnobt.
 */
struct xrep_newbt new_bnobt;
/* new cntbt information */
struct xrep_newbt new_cntbt;
/* Free space extents. */
struct xfarray *free_records;
/* Scrub context that this repair is running under. */
struct xfs_scrub *sc;
/* Number of non-null records in @free_records. */
uint64_t nr_real_records;
/* get_records()'s position in the free space record array. */
xfarray_idx_t array_cur;
/*
 * Next block we anticipate seeing in the rmap records. If the next
 * rmap record is greater than next_agbno, we have found unused space.
 */
xfs_agblock_t next_agbno;
/* Number of free blocks in this AG. */
xfs_agblock_t nr_blocks;
/* Longest free extent we found in the AG. */
xfs_agblock_t longest;
};
/* Set up to repair AG free space btrees. */
int
xrep_setup_ag_allocbt(
struct xfs_scrub *sc)
{
unsigned int busy_gen;
/*
* Make sure the busy extent list is clear because we can't put extents
* on there twice.
*/
busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
if (xfs_extent_busy_list_empty(sc->sa.pag))
return 0;
return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
}
/* Check for any obvious conflicts in the free extent. */
STATIC int
xrep_abt_check_free_ext(
struct xfs_scrub *sc,
const struct xfs_alloc_rec_incore *rec)
{
enum xbtree_recpacking outcome;
int error;
if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
return -EFSCORRUPTED;
/* Must not be an inode chunk. */
error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
rec->ar_startblock, rec->ar_blockcount, &outcome);
if (error)
return error;
if (outcome != XBTREE_RECPACKING_EMPTY)
return -EFSCORRUPTED;
/* Must not be shared or CoW staging. */
if (sc->sa.refc_cur) {
error = xfs_refcount_has_records(sc->sa.refc_cur,
XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
rec->ar_blockcount, &outcome);
if (error)
return error;
if (outcome != XBTREE_RECPACKING_EMPTY)
return -EFSCORRUPTED;
error = xfs_refcount_has_records(sc->sa.refc_cur,
XFS_REFC_DOMAIN_COW, rec->ar_startblock,
rec->ar_blockcount, &outcome);
if (error)
return error;
if (outcome != XBTREE_RECPACKING_EMPTY)
return -EFSCORRUPTED;
}
return 0;
}
/*
 * Stash a free space record for all the space since the last bno we found
 * all the way up to @end.
 *
 * The record starts at ra->next_agbno and runs up to @end.  It is checked for
 * conflicts, appended to ra->free_records, and its length is added to
 * ra->nr_blocks.  Returns 0 or a negative errno.
 */
static int
xrep_abt_stash(
	struct xrep_abt		*ra,
	xfs_agblock_t		end)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_alloc_rec_incore gap = {
		.ar_startblock	= ra->next_agbno,
		.ar_blockcount	= end - ra->next_agbno,
	};
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	error = xrep_abt_check_free_ext(sc, &gap);
	if (error)
		return error;

	trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &gap);

	error = xfarray_append(ra->free_records, &gap);
	if (error)
		return error;

	ra->nr_blocks += gap.ar_blockcount;
	return 0;
}
/* Record extents that aren't in use from gaps in the rmap records. */
STATIC int
xrep_abt_walk_rmap(
struct xfs_btree_cur *cur,
const struct xfs_rmap_irec *rec,
void *priv)
{
struct xrep_abt *ra = priv;
int error;
/* Record all the OWN_AG blocks... */
if (rec->rm_owner == XFS_RMAP_OWN_AG) {
error = xagb_bitmap_set(&ra->old_allocbt_blocks,
rec->rm_startblock, rec->rm_blockcount);
if (error)
return error;
}
/* ...and all the rmapbt blocks... */
error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
if (error)
return error;
/* ...and all the free space. */
if (rec->rm_startblock > ra->next_agbno) {
error = xrep_abt_stash(ra, rec->rm_startblock);
if (error)
return error;
}
/*
* rmap records can overlap on reflink filesystems, so project
* next_agbno as far out into the AG space as we currently know about.
*/
ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
rec->rm_startblock + rec->rm_blockcount);
return 0;
}
/*
 * Collect an AGFL block for the not-to-release list.
 *
 * Callback passed to xfs_agfl_walk(); @priv is the struct xrep_abt.  Sets the
 * single block @agbno in not_allocbt_blocks and returns the result of that
 * bitmap update.  @mp is unused here.
 */
static int
xrep_abt_walk_agfl(
struct xfs_mount *mp,
xfs_agblock_t agbno,
void *priv)
{
struct xrep_abt *ra = priv;
return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
}
/*
 * Compare two free space extents by block number.  We want to sort in order
 * of increasing block number.
 *
 * Returns -1, 0, or 1 when @a starts before, at, or after @b respectively.
 */
static int
xrep_bnobt_extent_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_alloc_rec_incore *left = a;
	const struct xfs_alloc_rec_incore *right = b;

	if (left->ar_startblock < right->ar_startblock)
		return -1;
	if (left->ar_startblock > right->ar_startblock)
		return 1;
	return 0;
}
/*
 * Re-sort the free extents by block number so that we can put the records
 * into the bnobt in the correct order.  Make sure the records do not overlap
 * in physical space.
 *
 * Returns -EFSCORRUPTED if a record starts before the end of the record
 * preceding it, the sort error if sorting fails, or otherwise the value of
 * the xfarray_iter() call that ended the walk.
 */
STATIC int
xrep_bnobt_sort_records(
	struct xrep_abt		*ra)
{
	struct xfs_alloc_rec_incore arec;
	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
	xfs_agblock_t		prev_end = 0;
	int			ret;

	ret = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
	if (ret)
		return ret;

	/* Keep walking for as long as the iterator hands back a record. */
	for (;;) {
		ret = xfarray_iter(ra->free_records, &cur, &arec);
		if (ret != 1)
			break;

		if (arec.ar_startblock < prev_end)
			return -EFSCORRUPTED;
		prev_end = arec.ar_startblock + arec.ar_blockcount;
	}

	return ret;
}
/*
 * Compare two free space extents by length and then block number.  We want
 * to sort first in order of increasing length and then in order of increasing
 * block number.
 */
static int
xrep_cntbt_extent_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_alloc_rec_incore *left = a;
	const struct xfs_alloc_rec_incore *right = b;

	if (left->ar_blockcount != right->ar_blockcount)
		return left->ar_blockcount > right->ar_blockcount ? 1 : -1;

	/* Equal lengths; break the tie on starting block. */
	return xrep_bnobt_extent_cmp(a, b);
}
/*
 * Sort the free extents by length so that we can put the records into the
 * cntbt in the correct order.  Don't let userspace kill us if we're resorting
 * after allocating btree blocks.
 *
 * The first sort is requested with XFARRAY_SORT_KILLABLE; a resort passes no
 * flags.
 */
STATIC int
xrep_cntbt_sort_records(
	struct xrep_abt		*ra,
	bool			is_resort)
{
	unsigned int		sort_flags = XFARRAY_SORT_KILLABLE;

	if (is_resort)
		sort_flags = 0;

	return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
			sort_flags);
}
/*
 * Iterate all reverse mappings to find (1) the gaps between rmap records (all
 * unowned space), (2) the OWN_AG extents (which encompass the free space
 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
 * blocks. The free space is (1) + (2) - (3) - (4).
 *
 * On every exit path the btree cursors set up here are freed and
 * not_allocbt_blocks is destroyed.  ra->nr_real_records is only set when
 * every step succeeded.  Returns 0 or the first error encountered.
 */
STATIC int
xrep_abt_find_freespace(
struct xrep_abt *ra)
{
struct xfs_scrub *sc = ra->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
struct xfs_buf *agfl_bp;
xfs_agblock_t agend;
int error;
xagb_bitmap_init(&ra->not_allocbt_blocks);
xrep_ag_btcur_init(sc, &sc->sa);
/*
* Iterate all the reverse mappings to find gaps in the physical
* mappings, all the OWN_AG blocks, and all the rmapbt extents.
*/
error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
if (error)
goto err;
/* Insert a record for space between the last rmap and EOAG. */
agend = be32_to_cpu(agf->agf_length);
if (ra->next_agbno < agend) {
error = xrep_abt_stash(ra, agend);
if (error)
goto err;
}
/* Collect all the AGFL blocks. */
error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
if (error)
goto err;
error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
if (error)
goto err_agfl;
/* Compute the old bnobt/cntbt blocks. */
error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
&ra->not_allocbt_blocks);
if (error)
goto err_agfl;
/* Every record stashed so far is a real (non-null) record. */
ra->nr_real_records = xfarray_length(ra->free_records);
/* The success path falls through both cleanup labels with error == 0. */
err_agfl:
xfs_trans_brelse(sc->tp, agfl_bp);
err:
xchk_ag_btcur_free(&sc->sa);
xagb_bitmap_destroy(&ra->not_allocbt_blocks);
return error;
}
/*
 * We're going to use the observed free space records to reserve blocks for the
 * new free space btrees, so we play an iterative game where we try to converge
 * on the number of blocks we need:
 *
 * 1. Estimate how many blocks we'll need to store the records.
 * 2. If the record under consideration has more blocks than we need, we're
 *    done.  We will have to re-sort the records prior to building the cntbt.
 * 3. If that record has exactly the number of blocks we need, null out the
 *    record. We're done.
 * 4. Otherwise, we still need more blocks. Null out the record, subtract its
 *    length from the number of blocks we need, and go back to step 1.
 *
 * Records are consumed starting from the last index of @free_records and
 * working backwards.  NOTE(review): this presumably relies on the caller
 * having sorted the array by length so that the last record is the longest;
 * confirm against the caller.
 *
 * Fortunately, we don't have to do any transaction work to play this game, so
 * we don't have to tear down the staging cursors.
 *
 * *needs_resort is set to true only when a record was partially consumed; it
 * is never cleared here, so the caller must initialize it.  Returns 0 on
 * success, -ENOSPC if the records ran out before enough blocks were reserved,
 * or another negative errno.
 */
STATIC int
xrep_abt_reserve_space(
struct xrep_abt *ra,
struct xfs_btree_cur *bno_cur,
struct xfs_btree_cur *cnt_cur,
bool *needs_resort)
{
struct xfs_scrub *sc = ra->sc;
xfarray_idx_t record_nr;
unsigned int allocated = 0;
int error = 0;
/* Start with the record at the end of the array. */
record_nr = xfarray_length(ra->free_records) - 1;
do {
struct xfs_alloc_rec_incore arec;
uint64_t required;
unsigned int desired;
unsigned int len;
/* Compute how many blocks we'll need. */
error = xfs_btree_bload_compute_geometry(cnt_cur,
&ra->new_cntbt.bload, ra->nr_real_records);
if (error)
break;
error = xfs_btree_bload_compute_geometry(bno_cur,
&ra->new_bnobt.bload, ra->nr_real_records);
if (error)
break;
/* How many btree blocks do we need to store all records? */
required = ra->new_bnobt.bload.nr_blocks +
ra->new_cntbt.bload.nr_blocks;
ASSERT(required < INT_MAX);
/* If we've reserved enough blocks, we're done. */
if (allocated >= required)
break;
desired = required - allocated;
/* We need space but there's none left; bye! */
if (ra->nr_real_records == 0) {
error = -ENOSPC;
break;
}
/* Load the record we're currently considering. */
error = xfarray_load(ra->free_records, record_nr, &arec);
if (error)
break;
ASSERT(arec.ar_blockcount <= UINT_MAX);
len = min_t(unsigned int, arec.ar_blockcount, desired);
trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
arec.ar_startblock, len, XFS_RMAP_OWN_AG);
/* Reservations for both new btrees are tracked in new_bnobt. */
error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
arec.ar_startblock, len);
if (error)
break;
allocated += len;
ra->nr_blocks -= len;
if (arec.ar_blockcount > desired) {
/*
* Record has more space than we need. The number of
* free records doesn't change, so shrink the free
* record, inform the caller that the records are no
* longer sorted by length, and exit.
*/
arec.ar_startblock += desired;
arec.ar_blockcount -= desired;
error = xfarray_store(ra->free_records, record_nr,
&arec);
if (error)
break;
*needs_resort = true;
return 0;
}
/*
* We're going to use up the entire record, so unset it and
* move on to the next one. This changes the number of free
* records (but doesn't break the sorting order), so we must
* go around the loop once more to recompute the btree geometry.
*/
error = xfarray_unset(ra->free_records, record_nr);
if (error)
break;
ra->nr_real_records--;
record_nr--;
} while (1);
return error;
}
/*
 * Wrap up a single block reservation.  The part of the extent that was
 * consumed gets a deferred OWN_AG rmap; whatever is left over at the tail of
 * the extent is handed back to the free space btrees.
 *
 * Returns 0 on success, or the error reported by __xfs_free_extent() or
 * xrep_defer_finish().
 */
STATIC int
xrep_abt_dispose_one(
	struct xrep_abt		*ra,
	struct xrep_newbt_resv	*resv)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_agblock_t		tail_agbno;
	xfs_extlen_t		tail_len;
	int			ret;

	ASSERT(pag == resv->pag);

	/* Queue the deferred rmap update for the blocks that were used. */
	if (resv->used > 0)
		xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
				resv->used, XFS_RMAP_OWN_AG);

	/* Fully consumed reservation: there is no tail to give back. */
	tail_len = resv->len - resv->used;
	if (tail_len == 0)
		return 0;
	tail_agbno = resv->agbno + resv->used;

	/*
	 * Put the unused tail back into the free space btrees.  fdblocks was
	 * left alone when the space was reserved, so it is left alone here
	 * as well.
	 */
	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, tail_agbno,
			tail_len, ra->new_bnobt.oinfo.oi_owner);

	ret = __xfs_free_extent(sc->tp, resv->pag, tail_agbno, tail_len,
			&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
	if (ret)
		return ret;

	return xrep_defer_finish(sc);
}
/*
 * Clean up every block reservation made for the new free space btrees.
 *
 * When @error is zero, each reservation on the new_bnobt list is passed to
 * xrep_abt_dispose_one(), which adds the (deferred) OWN_AG rmap for the blocks
 * that were used and frees the blocks that were not.  Processing stops at the
 * first reservation that fails.
 *
 * Regardless of @error, every reservation record on the list is then released
 * (perag reference dropped, unlinked, freed) and both staging structures are
 * cancelled.
 */
STATIC void
xrep_abt_dispose_reservations(
	struct xrep_abt		*ra,
	int			error)
{
	struct xrep_newbt_resv	*resv;
	struct xrep_newbt_resv	*next;

	/* Only settle the reservations if nothing has gone wrong so far. */
	if (!error) {
		list_for_each_entry_safe(resv, next, &ra->new_bnobt.resv_list,
				list) {
			error = xrep_abt_dispose_one(ra, resv);
			if (error)
				break;
		}
	}

	/* Tear down the in-memory reservation records themselves. */
	list_for_each_entry_safe(resv, next, &ra->new_bnobt.resv_list, list) {
		xfs_perag_put(resv->pag);
		list_del(&resv->list);
		kfree(resv);
	}

	xrep_newbt_cancel(&ra->new_bnobt);
	xrep_newbt_cancel(&ra->new_cntbt);
}
/*
 * Bulk loader callback: pull up to @nr_wanted free space records out of the
 * record array and format them into @block, starting at record index @idx.
 *
 * Each record is loaded into the cursor's incore record (cur->bc_rec.a) and
 * then written into the block via the cursor's init_rec_from_cur operation.
 * The largest ar_blockcount seen so far is tracked in ra->longest.
 *
 * Returns the number of records loaded, or the error returned by
 * xfarray_load_next().
 */
STATIC int
xrep_abt_get_records(
	struct xfs_btree_cur		*cur,
	unsigned int			idx,
	struct xfs_btree_block		*block,
	unsigned int			nr_wanted,
	void				*priv)
{
	struct xrep_abt			*ra = priv;
	struct xfs_alloc_rec_incore	*irec = &cur->bc_rec.a;
	unsigned int			nr = 0;
	int				ret;

	while (nr < nr_wanted) {
		union xfs_btree_rec	*rec;

		ret = xfarray_load_next(ra->free_records, &ra->array_cur,
				irec);
		if (ret)
			return ret;

		/* Remember the longest free extent we have handed out. */
		ra->longest = max(ra->longest, irec->ar_blockcount);

		rec = xfs_btree_rec_addr(cur, idx + nr, block);
		cur->bc_ops->init_rec_from_cur(cur, rec);
		nr++;
	}

	return nr;
}
/*
 * Bulk loader callback: hand the loader the next block for the new btree.
 * Blocks are always claimed from the new_bnobt staging structure, whichever
 * cursor is passed in.
 */
STATIC int
xrep_abt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_abt		*abt = priv;

	return xrep_newbt_claim_block(cur, &abt->new_bnobt, ptr);
}
/*
 * Update the AGF counters so that they describe the free space btrees that
 * were just rebuilt, log those fields, and then reinitialize the per-AG data
 * from the updated AGF.
 *
 * Returns whatever xrep_reinit_pagf() returns.
 */
STATIC int
xrep_abt_reset_counters(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	unsigned int		abt_blocks = 0;

	/*
	 * Work out what the new free space btrees contribute to
	 * agf_btreeblks: the computed size of each tree, less the blocks that
	 * were reserved for it but never used.
	 *
	 * NOTE(review): the "- 1" presumably leaves each tree's root block
	 * out of the count -- confirm against the agf_btreeblks definition.
	 */
	abt_blocks += ra->new_bnobt.bload.nr_blocks - 1;
	abt_blocks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
	abt_blocks += ra->new_cntbt.bload.nr_blocks - 1;
	abt_blocks -= xrep_newbt_unused_blocks(&ra->new_cntbt);

	/*
	 * The AGF carries extra state about the free space btrees, so those
	 * fields have to be refreshed and logged here.
	 */
	agf->agf_longest = cpu_to_be32(ra->longest);
	agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
	agf->agf_btreeblks = cpu_to_be32(abt_blocks +
				(be32_to_cpu(agf->agf_rmap_blocks) - 1));

	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp,
			XFS_AGF_FREEBLKS | XFS_AGF_LONGEST | XFS_AGF_BTREEBLKS);

	/*
	 * Once the new btree is committed, reaping the old btree blocks can
	 * race with the AIL checkpointing those same old blocks.  If the new
	 * tree is shorter than the old one, the allocbt write verifier would
	 * fail and the AIL would shut down the filesystem.
	 *
	 * Avoid that by stashing the old incore heights as the alternate
	 * heights before the perag is reinitialized from the updated AGF.
	 */
	pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
					pag->pagf_levels[XFS_BTNUM_BNOi];
	pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
					pag->pagf_levels[XFS_BTNUM_CNTi];

	/* Reload the incore state from the values that were just logged. */
	return xrep_reinit_pagf(sc);
}
/*
 * Use the collected free space information to stage new free space btrees.
 * If this is successful we'll return with the new btree root
 * information logged to the repair transaction but not yet committed.
 *
 * Every failure after the staging cursors have been allocated must leave
 * through the error labels at the bottom so that the cursors are deleted,
 * the block reservations are disposed of, and (once they have been set) the
 * alternate btree heights in the perag are cleared again.
 *
 * Returns 0 on success or the error reported by the step that failed.
 */
STATIC int
xrep_abt_build_new_trees(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	bool			needs_resort = false;
	int			error;

	/*
	 * Sort the free extents by length so that we can set up the free space
	 * btrees in as few extents as possible.  This reduces the amount of
	 * deferred rmap / free work we have to do at the end.
	 */
	error = xrep_cntbt_sort_records(ra, false);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_bare(&ra->new_bnobt, sc);
	xrep_newbt_init_bare(&ra->new_cntbt, sc);

	ra->new_bnobt.bload.get_records = xrep_abt_get_records;
	ra->new_cntbt.bload.get_records = xrep_abt_get_records;

	ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;

	/* Allocate cursors for the staged btrees. */
	bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
			pag, XFS_BTNUM_BNO);
	cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
			pag, XFS_BTNUM_CNT);

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btrees. */
	error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
	if (error)
		goto err_cur;

	/*
	 * If we need to re-sort the free extents by length, do so so that we
	 * can put the records into the cntbt in the correct order.
	 */
	if (needs_resort) {
		error = xrep_cntbt_sort_records(ra, needs_resort);
		if (error)
			goto err_cur;
	}

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the alternate incore btree
	 * height so that we don't trip the verifiers when writing the new
	 * btree blocks to disk.
	 */
	pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
					ra->new_bnobt.bload.btree_height;
	pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
					ra->new_cntbt.bload.btree_height;

	/* Load the free space by length tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	ra->longest = 0;
	error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
	if (error)
		goto err_levels;

	/*
	 * Re-sort the records for the by-block-number tree.  A failure here
	 * must unwind like any other: returning directly would leak both
	 * staging cursors and the block reservations, and would leave the
	 * alternate btree heights set in the perag.
	 */
	error = xrep_bnobt_sort_records(ra);
	if (error)
		goto err_levels;

	/* Load the free space by block number tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
	if (error)
		goto err_levels;

	/*
	 * Install the new btrees in the AG header.  After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(bno_cur, 0);
	xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(cnt_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_abt_reset_counters(ra);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	xrep_abt_dispose_reservations(ra, error);

	return xrep_roll_ag_trans(sc);

err_levels:
	/* Turn the alternate height mechanism back off. */
	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
err_cur:
	xfs_btree_del_cursor(cnt_cur, error);
	xfs_btree_del_cursor(bno_cur, error);
err_newbt:
	xrep_abt_dispose_reservations(ra, error);
	return error;
}
/*
 * The roots of the new btrees have been logged, so the blocks of the old
 * btrees can now be invalidated and freed.
 *
 * Returns 0 on success or the error from xrep_reap_agblocks(); on error the
 * alternate btree heights are left as they were.
 */
STATIC int
xrep_abt_remove_old_trees(
	struct xrep_abt		*ra)
{
	struct xfs_perag	*pag = ra->sc->sa.pag;
	int			ret;

	/* Free the old btree blocks if they're not in use. */
	ret = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
			&XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
	if (!ret) {
		/*
		 * All of the old allocbt blocks have been zapped, so the
		 * alternate height mechanism can be switched off.
		 */
		pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
		pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
	}

	return ret;
}
/*
 * Repair the free space btrees of one AG: collect the free space records,
 * build and install new bnobt/cntbt trees, then reap the old tree blocks.
 *
 * Returns 0 on success; -EOPNOTSUPP if the filesystem has no rmapbt,
 * -ENOMEM if the repair context cannot be allocated, -EDEADLOCK if the AG's
 * busy extent list is not empty, or the error from whichever later step
 * failed.
 */
int
xrep_allocbt(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xrep_abt		*ra;
	char			*descr;
	int			error;

	/* Nothing can be rebuilt without the rmapbt. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ra = kzalloc(sizeof(*ra), XCHK_GFP_FLAGS);
	if (!ra)
		return -ENOMEM;
	ra->sc = sc;

	/* Both free space btrees get rebuilt. */
	sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;

	/*
	 * Extents cannot be put on the busy list twice, so the list has to be
	 * empty.  In theory it was cleared before we started, but don't risk
	 * the filesystem on that.
	 */
	if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
		error = -EDEADLOCK;
		goto out_ra;
	}

	/* Size the record storage for maximally fragmented free space. */
	descr = xchk_xfile_ag_descr(sc, "free space records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
			sizeof(struct xfs_alloc_rec_incore),
			&ra->free_records);
	kfree(descr);
	if (error)
		goto out_ra;

	xagb_bitmap_init(&ra->old_allocbt_blocks);

	/*
	 * Run the three phases in order, stopping at the first failure:
	 * gather the free space data and locate the old btree blocks,
	 * rebuild the free space information, then kill the old trees.
	 */
	error = xrep_abt_find_freespace(ra);
	if (!error)
		error = xrep_abt_build_new_trees(ra);
	if (!error)
		error = xrep_abt_remove_old_trees(ra);

	xagb_bitmap_destroy(&ra->old_allocbt_blocks);
	xfarray_destroy(ra->free_records);
out_ra:
	kfree(ra);
	return error;
}
/*
 * Re-check both free space btrees after a rebuild.  The bnobt is scrubbed
 * first; the cntbt is scrubbed only if that returned no error.
 *
 * Returns the result of the last xchk_allocbt() call that was made.
 */
int
xrep_revalidate_allocbt(
	struct xfs_scrub	*sc)
{
	__u32			saved_type = sc->sm->sm_type;
	int			ret;

	/*
	 * sm_type is switched temporarily so that the tree-to-tree cross
	 * reference checks run in the right direction, and so that tracing
	 * reports correctly if more errors turn up.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
	ret = xchk_allocbt(sc);
	if (!ret) {
		sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
		ret = xchk_allocbt(sc);
	}

	/* Put back the scrub type the caller asked for. */
	sc->sm->sm_type = saved_type;
	return ret;
}
......@@ -16,7 +16,9 @@
#include <linux/interval_tree_generic.h>
struct xbitmap_node {
/* u64 bitmap */
struct xbitmap64_node {
struct rb_node bn_rbnode;
/* First set bit of this interval and subtree. */
......@@ -39,72 +41,72 @@ struct xbitmap_node {
* forward-declare them anyway for clarity.
*/
static inline void
xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root);
xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root);
static inline void
xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root);
xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root);
static inline struct xbitmap_node *
xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start,
static inline struct xbitmap64_node *
xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start,
uint64_t last);
static inline struct xbitmap_node *
xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start,
static inline struct xbitmap64_node *
xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start,
uint64_t last);
INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t,
__bn_subtree_last, START, LAST, static inline, xbitmap_tree)
INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t,
__bn_subtree_last, START, LAST, static inline, xbitmap64_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
#define for_each_xbitmap_extent(bn, bitmap) \
#define for_each_xbitmap64_extent(bn, bitmap) \
for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
struct xbitmap_node, bn_rbnode); \
struct xbitmap64_node, bn_rbnode); \
(bn) != NULL; \
(bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
struct xbitmap_node, bn_rbnode))
struct xbitmap64_node, bn_rbnode))
/* Clear a range of this bitmap. */
int
xbitmap_clear(
struct xbitmap *bitmap,
xbitmap64_clear(
struct xbitmap64 *bitmap,
uint64_t start,
uint64_t len)
{
struct xbitmap_node *bn;
struct xbitmap_node *new_bn;
struct xbitmap64_node *bn;
struct xbitmap64_node *new_bn;
uint64_t last = start + len - 1;
while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) {
while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) {
if (bn->bn_start < start && bn->bn_last > last) {
uint64_t old_last = bn->bn_last;
/* overlaps with the entire clearing range */
xbitmap_tree_remove(bn, &bitmap->xb_root);
xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
xbitmap_tree_insert(bn, &bitmap->xb_root);
xbitmap64_tree_insert(bn, &bitmap->xb_root);
/* add an extent */
new_bn = kmalloc(sizeof(struct xbitmap_node),
new_bn = kmalloc(sizeof(struct xbitmap64_node),
XCHK_GFP_FLAGS);
if (!new_bn)
return -ENOMEM;
new_bn->bn_start = last + 1;
new_bn->bn_last = old_last;
xbitmap_tree_insert(new_bn, &bitmap->xb_root);
xbitmap64_tree_insert(new_bn, &bitmap->xb_root);
} else if (bn->bn_start < start) {
/* overlaps with the left side of the clearing range */
xbitmap_tree_remove(bn, &bitmap->xb_root);
xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
xbitmap_tree_insert(bn, &bitmap->xb_root);
xbitmap64_tree_insert(bn, &bitmap->xb_root);
} else if (bn->bn_last > last) {
/* overlaps with the right side of the clearing range */
xbitmap_tree_remove(bn, &bitmap->xb_root);
xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_start = last + 1;
xbitmap_tree_insert(bn, &bitmap->xb_root);
xbitmap64_tree_insert(bn, &bitmap->xb_root);
break;
} else {
/* in the middle of the clearing range */
xbitmap_tree_remove(bn, &bitmap->xb_root);
xbitmap64_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
......@@ -114,59 +116,59 @@ xbitmap_clear(
/* Set a range of this bitmap. */
int
xbitmap_set(
struct xbitmap *bitmap,
xbitmap64_set(
struct xbitmap64 *bitmap,
uint64_t start,
uint64_t len)
{
struct xbitmap_node *left;
struct xbitmap_node *right;
struct xbitmap64_node *left;
struct xbitmap64_node *right;
uint64_t last = start + len - 1;
int error;
/* Is this whole range already set? */
left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last);
if (left && left->bn_start <= start && left->bn_last >= last)
return 0;
/* Clear out everything in the range we want to set. */
error = xbitmap_clear(bitmap, start, len);
error = xbitmap64_clear(bitmap, start, len);
if (error)
return error;
/* Do we have a left-adjacent extent? */
left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
ASSERT(!left || left->bn_last + 1 == start);
/* Do we have a right-adjacent extent? */
right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
ASSERT(!right || right->bn_start == last + 1);
if (left && right) {
/* combine left and right adjacent extent */
xbitmap_tree_remove(left, &bitmap->xb_root);
xbitmap_tree_remove(right, &bitmap->xb_root);
xbitmap64_tree_remove(left, &bitmap->xb_root);
xbitmap64_tree_remove(right, &bitmap->xb_root);
left->bn_last = right->bn_last;
xbitmap_tree_insert(left, &bitmap->xb_root);
xbitmap64_tree_insert(left, &bitmap->xb_root);
kfree(right);
} else if (left) {
/* combine with left extent */
xbitmap_tree_remove(left, &bitmap->xb_root);
xbitmap64_tree_remove(left, &bitmap->xb_root);
left->bn_last = last;
xbitmap_tree_insert(left, &bitmap->xb_root);
xbitmap64_tree_insert(left, &bitmap->xb_root);
} else if (right) {
/* combine with right extent */
xbitmap_tree_remove(right, &bitmap->xb_root);
xbitmap64_tree_remove(right, &bitmap->xb_root);
right->bn_start = start;
xbitmap_tree_insert(right, &bitmap->xb_root);
xbitmap64_tree_insert(right, &bitmap->xb_root);
} else {
/* add an extent */
left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS);
left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS);
if (!left)
return -ENOMEM;
left->bn_start = start;
left->bn_last = last;
xbitmap_tree_insert(left, &bitmap->xb_root);
xbitmap64_tree_insert(left, &bitmap->xb_root);
}
return 0;
......@@ -174,21 +176,21 @@ xbitmap_set(
/* Free everything related to this bitmap. */
void
xbitmap_destroy(
struct xbitmap *bitmap)
xbitmap64_destroy(
struct xbitmap64 *bitmap)
{
struct xbitmap_node *bn;
struct xbitmap64_node *bn;
while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
xbitmap_tree_remove(bn, &bitmap->xb_root);
while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
xbitmap64_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
/* Set up a per-AG block bitmap. */
void
xbitmap_init(
struct xbitmap *bitmap)
xbitmap64_init(
struct xbitmap64 *bitmap)
{
bitmap->xb_root = RB_ROOT_CACHED;
}
......@@ -208,18 +210,18 @@ xbitmap_init(
* This is the logical equivalent of bitmap &= ~sub.
*/
int
xbitmap_disunion(
struct xbitmap *bitmap,
struct xbitmap *sub)
xbitmap64_disunion(
struct xbitmap64 *bitmap,
struct xbitmap64 *sub)
{
struct xbitmap_node *bn;
struct xbitmap64_node *bn;
int error;
if (xbitmap_empty(bitmap) || xbitmap_empty(sub))
if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub))
return 0;
for_each_xbitmap_extent(bn, sub) {
error = xbitmap_clear(bitmap, bn->bn_start,
for_each_xbitmap64_extent(bn, sub) {
error = xbitmap64_clear(bitmap, bn->bn_start,
bn->bn_last - bn->bn_start + 1);
if (error)
return error;
......@@ -228,88 +230,273 @@ xbitmap_disunion(
return 0;
}
/* How many bits are set in this bitmap? */
uint64_t
xbitmap64_hweight(
struct xbitmap64 *bitmap)
{
struct xbitmap64_node *bn;
uint64_t ret = 0;
for_each_xbitmap64_extent(bn, bitmap)
ret += bn->bn_last - bn->bn_start + 1;
return ret;
}
/*
 * Invoke @fn once for every run of set bits in this bitmap, passing the
 * run's start, its length, and @priv.  The walk ends at the first non-zero
 * return from @fn, which is passed back to the caller; otherwise 0 is
 * returned.
 */
int
xbitmap64_walk(
	struct xbitmap64	*bitmap,
	xbitmap64_walk_fn	fn,
	void			*priv)
{
	struct xbitmap64_node	*node;
	int			ret;

	for_each_xbitmap64_extent(node, bitmap) {
		ret = fn(node->bn_start, node->bn_last - node->bn_start + 1,
				priv);
		if (ret)
			return ret;
	}

	return 0;
}
/* Return true if the bitmap's interval tree has no nodes, i.e. no bits set. */
bool
xbitmap64_empty(
	struct xbitmap64	*bitmap)
{
	return !bitmap->xb_root.rb_root.rb_node;
}
/*
 * Report whether the bit at @start is set, looking no further than *@len
 * bits ahead.
 *
 * Returns true if @start lies inside a stored interval; *@len is shortened
 * if that interval ends before the end of the queried range.  Returns false
 * otherwise; if an interval begins later in the queried range, *@len is
 * shortened to the distance from @start to that interval, and if no interval
 * overlaps the range at all *@len is left untouched.
 */
bool
xbitmap64_test(
	struct xbitmap64	*bitmap,
	uint64_t		start,
	uint64_t		*len)
{
	struct xbitmap64_node	*node;
	uint64_t		end = start + *len - 1;

	node = xbitmap64_tree_iter_first(&bitmap->xb_root, start, end);
	if (!node)
		return false;

	/* The interval begins after @start, so @start itself is clear. */
	if (node->bn_start > start) {
		*len = node->bn_start - start;
		return false;
	}

	/* @start is set; trim the length if the interval ends early. */
	if (node->bn_last < end)
		*len = node->bn_last - start + 1;
	return true;
}
/* u32 bitmap */
/*
 * One run of set bits in a u32 bitmap.  Each run is a node in the interval
 * tree that INTERVAL_TREE_DEFINE generates for this structure.
 */
struct xbitmap32_node {
struct rb_node bn_rbnode;
/* First set bit of this interval and subtree. */
uint32_t bn_start;
/* Last set bit of this interval. */
uint32_t bn_last;
/* Last set bit of this subtree. Do not touch this. */
uint32_t __bn_subtree_last;
};
/* Define our own interval tree type with uint32_t parameters. */
/*
* Record all btree blocks seen while iterating all records of a btree.
*
* We know that the btree query_all function starts at the left edge and walks
* towards the right edge of the tree. Therefore, we know that we can walk up
* the btree cursor towards the root; if the pointer for a given level points
* to the first record/key in that block, we haven't seen this block before;
* and therefore we need to remember that we saw this block in the btree.
*
* So if our btree is:
*
* 4
* / | \
* 1 2 3
*
* Pretend for this example that each leaf block has 100 btree records. For
* the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
* record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
* we record block 4. The list is [1, 4].
*
* For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
* the loop. The list remains [1, 4].
*
* For the 101st btree record, we've moved onto leaf block 2. Now
* bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
* bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
*
* For the 102nd record, bc_levels[0].ptr == 2, so we continue.
*
* For the 201st record, we've moved on to leaf block 3.
* bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
*
* For the 300th record we just exit, with the list being [1, 4, 2, 3].
* These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
* forward-declare them anyway for clarity.
*/
static inline void
xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root);
/* Mark a btree block to the agblock bitmap. */
STATIC int
xagb_bitmap_visit_btblock(
struct xfs_btree_cur *cur,
int level,
void *priv)
static inline void
xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root);
static inline struct xbitmap32_node *
xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start,
uint32_t last);
static inline struct xbitmap32_node *
xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start,
uint32_t last);
INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t,
__bn_subtree_last, START, LAST, static inline, xbitmap32_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
#define for_each_xbitmap32_extent(bn, bitmap) \
for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
struct xbitmap32_node, bn_rbnode); \
(bn) != NULL; \
(bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
struct xbitmap32_node, bn_rbnode))
/* Clear a range of this bitmap. */
int
xbitmap32_clear(
struct xbitmap32 *bitmap,
uint32_t start,
uint32_t len)
{
struct xagb_bitmap *bitmap = priv;
struct xfs_buf *bp;
xfs_fsblock_t fsbno;
xfs_agblock_t agbno;
struct xbitmap32_node *bn;
struct xbitmap32_node *new_bn;
uint32_t last = start + len - 1;
xfs_btree_get_block(cur, level, &bp);
if (!bp)
return 0;
while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) {
if (bn->bn_start < start && bn->bn_last > last) {
uint32_t old_last = bn->bn_last;
fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
/* overlaps with the entire clearing range */
xbitmap32_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
xbitmap32_tree_insert(bn, &bitmap->xb_root);
return xagb_bitmap_set(bitmap, agbno, 1);
/* add an extent */
new_bn = kmalloc(sizeof(struct xbitmap32_node),
XCHK_GFP_FLAGS);
if (!new_bn)
return -ENOMEM;
new_bn->bn_start = last + 1;
new_bn->bn_last = old_last;
xbitmap32_tree_insert(new_bn, &bitmap->xb_root);
} else if (bn->bn_start < start) {
/* overlaps with the left side of the clearing range */
xbitmap32_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
xbitmap32_tree_insert(bn, &bitmap->xb_root);
} else if (bn->bn_last > last) {
/* overlaps with the right side of the clearing range */
xbitmap32_tree_remove(bn, &bitmap->xb_root);
bn->bn_start = last + 1;
xbitmap32_tree_insert(bn, &bitmap->xb_root);
break;
} else {
/* in the middle of the clearing range */
xbitmap32_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
return 0;
}
/* Mark all (per-AG) btree blocks in the agblock bitmap. */
/* Set a range of this bitmap. */
int
xagb_bitmap_set_btblocks(
struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur)
xbitmap32_set(
struct xbitmap32 *bitmap,
uint32_t start,
uint32_t len)
{
return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
XFS_BTREE_VISIT_ALL, bitmap);
struct xbitmap32_node *left;
struct xbitmap32_node *right;
uint32_t last = start + len - 1;
int error;
/* Is this whole range already set? */
left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last);
if (left && left->bn_start <= start && left->bn_last >= last)
return 0;
/* Clear out everything in the range we want to set. */
error = xbitmap32_clear(bitmap, start, len);
if (error)
return error;
/* Do we have a left-adjacent extent? */
left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
ASSERT(!left || left->bn_last + 1 == start);
/* Do we have a right-adjacent extent? */
right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
ASSERT(!right || right->bn_start == last + 1);
if (left && right) {
/* combine left and right adjacent extent */
xbitmap32_tree_remove(left, &bitmap->xb_root);
xbitmap32_tree_remove(right, &bitmap->xb_root);
left->bn_last = right->bn_last;
xbitmap32_tree_insert(left, &bitmap->xb_root);
kfree(right);
} else if (left) {
/* combine with left extent */
xbitmap32_tree_remove(left, &bitmap->xb_root);
left->bn_last = last;
xbitmap32_tree_insert(left, &bitmap->xb_root);
} else if (right) {
/* combine with right extent */
xbitmap32_tree_remove(right, &bitmap->xb_root);
right->bn_start = start;
xbitmap32_tree_insert(right, &bitmap->xb_root);
} else {
/* add an extent */
left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS);
if (!left)
return -ENOMEM;
left->bn_start = start;
left->bn_last = last;
xbitmap32_tree_insert(left, &bitmap->xb_root);
}
return 0;
}
/*
 * Release everything held by this bitmap: repeatedly look up a node anywhere
 * in the full u32 range, unlink it from the interval tree and free it, until
 * no nodes remain.
 */
void
xbitmap32_destroy(
	struct xbitmap32	*bitmap)
{
	struct xbitmap32_node	*node;

	for (;;) {
		node = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U);
		if (!node)
			break;

		xbitmap32_tree_remove(node, &bitmap->xb_root);
		kfree(node);
	}
}
/*
 * Initialize a u32 bitmap to the empty state by resetting the root of its
 * interval tree.
 */
void
xbitmap32_init(
struct xbitmap32 *bitmap)
{
bitmap->xb_root = RB_ROOT_CACHED;
}
/*
* Record all the buffers pointed to by the btree cursor. Callers already
* engaged in a btree walk should call this function to capture the list of
* blocks going from the leaf towards the root.
* Remove all the blocks mentioned in @sub from the extents in @bitmap.
*
* The intent is that callers will iterate the rmapbt for all of its records
* for a given owner to generate @bitmap; and iterate all the blocks of the
* metadata structures that are not being rebuilt and have the same rmapbt
* owner to generate @sub. This routine subtracts all the extents
* mentioned in sub from all the extents linked in @bitmap, which leaves
* @bitmap as the list of blocks that are not accounted for, which we assume
* are the dead blocks of the old metadata structure. The blocks mentioned in
* @bitmap can be reaped.
*
* This is the logical equivalent of bitmap &= ~sub.
*/
int
xagb_bitmap_set_btcur_path(
struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur)
xbitmap32_disunion(
struct xbitmap32 *bitmap,
struct xbitmap32 *sub)
{
int i;
struct xbitmap32_node *bn;
int error;
for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
error = xagb_bitmap_visit_btblock(cur, i, bitmap);
if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub))
return 0;
for_each_xbitmap32_extent(bn, sub) {
error = xbitmap32_clear(bitmap, bn->bn_start,
bn->bn_last - bn->bn_start + 1);
if (error)
return error;
}
......@@ -318,14 +505,14 @@ xagb_bitmap_set_btcur_path(
}
/* How many bits are set in this bitmap? */
uint64_t
xbitmap_hweight(
struct xbitmap *bitmap)
uint32_t
xbitmap32_hweight(
struct xbitmap32 *bitmap)
{
struct xbitmap_node *bn;
uint64_t ret = 0;
struct xbitmap32_node *bn;
uint32_t ret = 0;
for_each_xbitmap_extent(bn, bitmap)
for_each_xbitmap32_extent(bn, bitmap)
ret += bn->bn_last - bn->bn_start + 1;
return ret;
......@@ -333,15 +520,15 @@ xbitmap_hweight(
/* Call a function for every run of set bits in this bitmap. */
int
xbitmap_walk(
struct xbitmap *bitmap,
xbitmap_walk_fn fn,
xbitmap32_walk(
struct xbitmap32 *bitmap,
xbitmap32_walk_fn fn,
void *priv)
{
struct xbitmap_node *bn;
struct xbitmap32_node *bn;
int error = 0;
for_each_xbitmap_extent(bn, bitmap) {
for_each_xbitmap32_extent(bn, bitmap) {
error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
if (error)
break;
......@@ -352,23 +539,23 @@ xbitmap_walk(
/* Does this bitmap have no bits set at all? */
bool
xbitmap_empty(
struct xbitmap *bitmap)
xbitmap32_empty(
struct xbitmap32 *bitmap)
{
return bitmap->xb_root.rb_root.rb_node == NULL;
}
/* Is the start of the range set or clear? And for how long? */
bool
xbitmap_test(
struct xbitmap *bitmap,
uint64_t start,
uint64_t *len)
xbitmap32_test(
struct xbitmap32 *bitmap,
uint32_t start,
uint32_t *len)
{
struct xbitmap_node *bn;
uint64_t last = start + *len - 1;
struct xbitmap32_node *bn;
uint32_t last = start + *len - 1;
bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last);
if (!bn)
return false;
if (bn->bn_start <= start) {
......
......@@ -6,17 +6,19 @@
#ifndef __XFS_SCRUB_BITMAP_H__
#define __XFS_SCRUB_BITMAP_H__
struct xbitmap {
/* u64 bitmap */
struct xbitmap64 {
struct rb_root_cached xb_root;
};
void xbitmap_init(struct xbitmap *bitmap);
void xbitmap_destroy(struct xbitmap *bitmap);
void xbitmap64_init(struct xbitmap64 *bitmap);
void xbitmap64_destroy(struct xbitmap64 *bitmap);
int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
uint64_t xbitmap_hweight(struct xbitmap *bitmap);
int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len);
int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len);
int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub);
uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap);
/*
* Return codes for the bitmap iterator functions are 0 to continue iterating,
......@@ -25,84 +27,39 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap);
* iteration, because neither bitmap iterator ever generates that error code on
* its own. Callers must not modify the bitmap while walking it.
*/
typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv);
int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn,
void *priv);
bool xbitmap_empty(struct xbitmap *bitmap);
bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
bool xbitmap64_empty(struct xbitmap64 *bitmap);
bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len);
/* Bitmaps, but for type-checked for xfs_agblock_t */
/* u32 bitmap */
struct xagb_bitmap {
struct xbitmap agbitmap;
struct xbitmap32 {
struct rb_root_cached xb_root;
};
static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
{
xbitmap_init(&bitmap->agbitmap);
}
static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
{
xbitmap_destroy(&bitmap->agbitmap);
}
static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
xfs_agblock_t start, xfs_extlen_t len)
{
return xbitmap_clear(&bitmap->agbitmap, start, len);
}
static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
xfs_agblock_t start, xfs_extlen_t len)
{
return xbitmap_set(&bitmap->agbitmap, start, len);
}
static inline bool
xagb_bitmap_test(
struct xagb_bitmap *bitmap,
xfs_agblock_t start,
xfs_extlen_t *len)
{
uint64_t biglen = *len;
bool ret;
ret = xbitmap_test(&bitmap->agbitmap, start, &biglen);
if (start + biglen >= UINT_MAX) {
ASSERT(0);
biglen = UINT_MAX - start;
}
*len = biglen;
return ret;
}
static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
struct xagb_bitmap *sub)
{
return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap);
}
void xbitmap32_init(struct xbitmap32 *bitmap);
void xbitmap32_destroy(struct xbitmap32 *bitmap);
static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
{
return xbitmap_hweight(&bitmap->agbitmap);
}
static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
{
return xbitmap_empty(&bitmap->agbitmap);
}
int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len);
int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len);
int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub);
uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap);
static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
xbitmap_walk_fn fn, void *priv)
{
return xbitmap_walk(&bitmap->agbitmap, fn, priv);
}
/*
* Return codes for the bitmap iterator functions are 0 to continue iterating,
* and non-zero to stop iterating. Any non-zero value will be passed up to the
* iteration caller. The special value -ECANCELED can be used to stop
* iteration, because neither bitmap iterator ever generates that error code on
* its own. Callers must not modify the bitmap while walking it.
*/
typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv);
int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
void *priv);
int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
bool xbitmap32_empty(struct xbitmap32 *bitmap);
bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
#endif /* __XFS_SCRUB_BITMAP_H__ */
......@@ -604,6 +604,7 @@ xchk_ag_free(
struct xchk_ag *sa)
{
xchk_ag_btcur_free(sa);
xrep_reset_perag_resv(sc);
if (sa->agf_bp) {
xfs_trans_brelse(sc->tp, sa->agf_bp);
sa->agf_bp = NULL;
......
......@@ -200,8 +200,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT |
XFS_SCRUB_OFLAG_PREEN);
}
/*
* "Should we prepare for a repair?"
*
* Return true if the caller permits us to repair metadata and we're not
* setting up for a post-repair evaluation.
*/
static inline bool xchk_could_repair(const struct xfs_scrub *sc)
{
return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED);
}
#else
# define xchk_needs_repair(sc) (false)
# define xchk_could_repair(sc) (false)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
......@@ -213,6 +226,12 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
/*
 * Format a description string for an xfile with kasprintf(), prefixed with
 * "XFS (<superblock s_id>): ".  @fmt and the variadic arguments supply the
 * rest of the text.
 */
#define xchk_xfile_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \
(sc)->mp->m_super->s_id, ##__VA_ARGS__)
/*
 * Format a per-AG description string with kasprintf(), prefixed with
 * "XFS (<superblock s_id>): AG 0x<agno> ".  The AG number comes from the
 * perag attached to the scrub context (sa.pag) when there is one, otherwise
 * from sm_agno in the scrub request.  @fmt and the variadic arguments supply
 * the rest of the text.
 */
#define xchk_xfile_ag_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \
(sc)->mp->m_super->s_id, \
(sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \
##__VA_ARGS__)
/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
......
......@@ -585,7 +585,7 @@ xchk_iallocbt_rec(
uint16_t holemask;
xfs_inobt_btrec_to_irec(mp, rec, &irec);
if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) {
if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
......@@ -708,11 +708,10 @@ xchk_iallocbt_xref_rmap_inodes(
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
/* Scrub the inode btrees for some AG. */
STATIC int
/* Scrub one of the inode btrees for some AG. */
int
xchk_iallocbt(
struct xfs_scrub *sc,
xfs_btnum_t which)
struct xfs_scrub *sc)
{
struct xfs_btree_cur *cur;
struct xchk_iallocbt iabt = {
......@@ -720,9 +719,23 @@ xchk_iallocbt(
.next_startino = NULLAGINO,
.next_cluster_ino = NULLAGINO,
};
xfs_btnum_t which;
int error;
cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
switch (sc->sm->sm_type) {
case XFS_SCRUB_TYPE_INOBT:
cur = sc->sa.ino_cur;
which = XFS_BTNUM_INO;
break;
case XFS_SCRUB_TYPE_FINOBT:
cur = sc->sa.fino_cur;
which = XFS_BTNUM_FINO;
break;
default:
ASSERT(0);
return -EIO;
}
error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
&iabt);
if (error)
......@@ -743,20 +756,6 @@ xchk_iallocbt(
return error;
}
int
xchk_inobt(
struct xfs_scrub *sc)
{
return xchk_iallocbt(sc, XFS_BTNUM_INO);
}
int
xchk_finobt(
struct xfs_scrub *sc)
{
return xchk_iallocbt(sc, XFS_BTNUM_FINO);
}
/* See if an inode btree has (or doesn't have) an inode chunk record. */
static inline void
xchk_xref_inode_check(
......
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2018-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
#include "xfs_health.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
/*
* Inode Btree Repair
* ==================
*
* A quick refresher of inode btrees on a v5 filesystem:
*
* - Inode records are read into memory in units of 'inode clusters'. However
* many inodes fit in a cluster buffer is the smallest number of inodes that
* can be allocated or freed. Clusters are never smaller than one fs block
* though they can span multiple blocks. The size (in fs blocks) is
* computed with xfs_icluster_size_fsb(). The fs block alignment of a
* cluster is computed with xfs_ialloc_cluster_alignment().
*
* - Each inode btree record can describe a single 'inode chunk'. The chunk
* size is defined to be 64 inodes. If sparse inodes are enabled, every
* inobt record must be aligned to the chunk size; if not, every record must
* be aligned to the start of a cluster. It is possible to construct an XFS
* geometry where one inobt record maps to multiple inode clusters; it is
* also possible to construct a geometry where multiple inobt records map to
* different parts of one inode cluster.
*
* - If sparse inodes are not enabled, the smallest unit of allocation for
* inode records is enough to contain one inode chunk's worth of inodes.
*
* - If sparse inodes are enabled, the holemask field will be active. Each
* bit of the holemask represents 4 potential inodes; if set, the
* corresponding space does *not* contain inodes and must be left alone.
* Clusters cannot be smaller than 4 inodes. The smallest unit of allocation
* of inode records is one inode cluster.
*
* So what's the rebuild algorithm?
*
* Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
* records. The OWN_INOBT records are the old inode btree blocks and will be
* cleared out after we've rebuilt the tree. Each possible inode cluster
* within an OWN_INODES record will be read in; for each possible inobt record
* associated with that cluster, compute the freemask calculated from the
* i_mode data in the inode chunk. For sparse inodes the holemask will be
* calculated by creating the properly aligned inobt record and punching out
* any chunk that's missing. Inode allocations and frees grab the AGI first,
* so repair protects itself from concurrent access by locking the AGI.
*
* Once we've reconstructed all the inode records, we can create new inode
* btree roots and reload the btrees. We rebuild both inode trees at the same
* time because they have the same rmap owner and it would be more complex to
* figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
* blocks it owns. We have all the data we need to build both, so dump
* everything and start over.
*
* We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
*/
/*
 * In-memory state for one inode btree repair.  A single instance covers both
 * the inobt and the finobt because the two trees are rebuilt together.
 */
struct xrep_ibt {
/* Record under construction. */
struct xfs_inobt_rec_incore rie;
/* new inobt information */
struct xrep_newbt new_inobt;
/* new finobt information */
struct xrep_newbt new_finobt;
/* Old inode btree blocks we found in the rmap. */
struct xagb_bitmap old_iallocbt_blocks;
/* Reconstructed inode records. */
struct xfarray *inode_records;
/* Scrub context that this repair is running under. */
struct xfs_scrub *sc;
/* Number of inodes assigned disk space. */
unsigned int icount;
/* Number of inodes in use. */
unsigned int iused;
/* Number of finobt records needed (records with a nonzero free count). */
unsigned int finobt_recs;
/* get_records()'s position in the inode record array. */
xfarray_idx_t array_cur;
};
/*
 * Is this inode in use?  If the inode is in memory we can tell from i_mode,
 * otherwise we have to check di_mode in the on-disk buffer.  We only care
 * that the high (i.e. non-permission) bits of _mode are zero.  This should be
 * safe because repair keeps all AG headers locked until the end, and any
 * process trying to perform an inode allocation/free must lock the AGI.
 *
 * @cluster_ag_base is the inode offset of the cluster within the AG.
 * @cluster_bp is the cluster buffer.
 * @cluster_index is the inode offset within the inode cluster.
 * @inuse receives the answer.
 *
 * Returns 0 with *inuse set, or -EFSCORRUPTED if the on-disk inode lies
 * outside the buffer, has a bad magic number, or (v3+) records the wrong
 * inode number.
 */
STATIC int
xrep_ibt_check_ifree(
	struct xrep_ibt		*ri,
	xfs_agino_t		cluster_ag_base,
	struct xfs_buf		*cluster_bp,
	unsigned int		cluster_index,
	bool			*inuse)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_dinode	*dip;
	xfs_agino_t		agino = cluster_ag_base + cluster_index;
	xfs_ino_t		fsino;
	unsigned int		buf_inode_idx;
	unsigned int		byte_off;

	fsino = XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno, agino);

	/* Locate the on-disk inode inside the cluster buffer. */
	buf_inode_idx = XFS_INO_TO_OFFSET(mp, cluster_ag_base) + cluster_index;
	byte_off = buf_inode_idx * mp->m_sb.sb_inodesize;
	if (byte_off >= BBTOB(cluster_bp->b_length))
		return -EFSCORRUPTED;

	/* Sanity-check the on-disk inode before trusting anything in it. */
	dip = xfs_buf_offset(cluster_bp, byte_off);
	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
		return -EFSCORRUPTED;
	if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
		return -EFSCORRUPTED;

	/*
	 * Ask the in-core inode first.  If that lookup cannot answer, fall
	 * back to the mode recorded in the disk buffer.
	 */
	if (xchk_inode_is_allocated(sc, agino, inuse) != 0)
		*inuse = dip->di_mode != 0;
	return 0;
}
/* Stash the accumulated inobt record for rebuilding. */
STATIC int
xrep_ibt_stash(
struct xrep_ibt *ri)
{
int error = 0;
if (xchk_should_terminate(ri->sc, &error))
return error;
ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
return -EFSCORRUPTED;
if (ri->rie.ir_freecount > 0)
ri->finobt_recs++;
trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);
error = xfarray_append(ri->inode_records, &ri->rie);
if (error)
return error;
ri->rie.ir_startino = NULLAGINO;
return 0;
}
/*
 * Given an extent of inodes and an inode cluster buffer, calculate the
 * location of the corresponding inobt record (creating it if necessary),
 * then update the parts of the holemask and freemask of that record that
 * correspond to the inode extent we were given.
 *
 * @cluster_ir_startino is the AG inode number of an inobt record that we're
 * proposing to create for this inode cluster.  If sparse inodes are enabled,
 * we must round down to a chunk boundary to find the actual sparse record.
 * @cluster_bp is the buffer of the inode cluster.
 * @nr_inodes is the number of inodes to check from the cluster.
 */
STATIC int
xrep_ibt_cluster_record(
	struct xrep_ibt			*ri,
	xfs_agino_t			cluster_ir_startino,
	struct xfs_buf			*cluster_bp,
	unsigned int			nr_inodes)
{
	struct xfs_mount		*mp = ri->sc->mp;
	struct xfs_inobt_rec_incore	*irec = &ri->rie;
	xfs_agino_t			rec_startino = cluster_ir_startino;
	unsigned int			rec_offset;
	unsigned int			i;
	int				error;

	if (xfs_has_sparseinodes(mp))
		rec_startino = rounddown(rec_startino, XFS_INODES_PER_CHUNK);

	/* Position of this cluster's first inode within the inobt record. */
	rec_offset = cluster_ir_startino - rec_startino;

	/*
	 * The record being accumulated ends before this cluster begins, so
	 * push it to the array; stashing also empties the slot.
	 */
	if (irec->ir_startino != NULLAGINO &&
	    irec->ir_startino + XFS_INODES_PER_CHUNK <= rec_startino) {
		error = xrep_ibt_stash(ri);
		if (error)
			return error;
	}

	/* Start a fresh record: everything free, everything a hole. */
	if (irec->ir_startino == NULLAGINO) {
		irec->ir_startino = rec_startino;
		irec->ir_free = XFS_INOBT_ALL_FREE;
		irec->ir_holemask = 0xFFFF;
		irec->ir_count = 0;
	}

	/* Account for the whole cluster and punch it out of the holemask. */
	ri->icount += nr_inodes;
	irec->ir_count += nr_inodes;
	irec->ir_holemask &= ~xfs_inobt_maskn(
				rec_offset / XFS_INODES_PER_HOLEMASK_BIT,
				nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);

	/* Clear the free bit of every inode that turns out to be in use. */
	for (i = 0; i < nr_inodes; i++) {
		bool			inuse = false;

		error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
				cluster_bp, i, &inuse);
		if (error)
			return error;
		if (inuse) {
			ri->iused++;
			irec->ir_free &= ~XFS_INOBT_MASK(rec_offset + i);
		}
	}

	return 0;
}
/*
 * For each inode cluster covering the physical extent recorded by the rmapbt,
 * we must calculate the properly aligned startino of that cluster, then
 * iterate each cluster to fill in used and filled masks appropriately.  We
 * then use the (startino, used, filled) information to construct the
 * appropriate inode records.
 *
 * @cluster_bno is the AG block number at which the cluster starts.
 */
STATIC int
xrep_ibt_process_cluster(
	struct xrep_ibt		*ri,
	xfs_agblock_t		cluster_bno)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	struct xfs_imap		imap = {
		.im_blkno	= XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno,
						   cluster_bno),
		.im_len		= XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster),
		.im_boffset	= 0,
	};
	struct xfs_buf		*cluster_bp;
	xfs_agino_t		first_agino;
	xfs_agino_t		rec_off;
	unsigned int		inodes_per_rec;
	int			error;

	/* One inobt record never describes more than a chunk of inodes. */
	inodes_per_rec = min_t(unsigned int, igeo->inodes_per_cluster,
			XFS_INODES_PER_CHUNK);

	/*
	 * Grab the inode cluster buffer.  This is safe to do with a broken
	 * inobt because imap_to_bp directly maps the buffer without touching
	 * either inode btree.
	 */
	error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
	if (error)
		return error;

	/*
	 * Record the contents of each possible inobt record mapping this
	 * cluster, stopping at the first failure.
	 */
	first_agino = XFS_AGB_TO_AGINO(mp, cluster_bno);
	for (rec_off = 0;
	     !error && rec_off < igeo->inodes_per_cluster;
	     rec_off += XFS_INODES_PER_CHUNK)
		error = xrep_ibt_cluster_record(ri, first_agino + rec_off,
				cluster_bp, inodes_per_rec);

	xfs_trans_brelse(sc->tp, cluster_bp);
	return error;
}
/*
 * Check for any obvious conflicts in the inode chunk extent.
 *
 * Returns 0 if the extent [@agbno, @agbno + @len) looks like plausible inode
 * space, -EFSCORRUPTED if any check fails, or the error from the free space
 * lookup.
 */
STATIC int
xrep_ibt_check_inode_ext(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	const xfs_agblock_t	end = agbno + len;
	enum xbtree_recpacking	outcome;
	int			error;

	/* Inode records must be within the AG. */
	if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
		return -EFSCORRUPTED;

	/* The entire record must align to the inode cluster size. */
	if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
	    !IS_ALIGNED(end, igeo->blocks_per_cluster))
		return -EFSCORRUPTED;

	if (xfs_has_sparseinodes(mp)) {
		/*
		 * On a sparse inode fs, this cluster could be part of a sparse
		 * chunk.  Sparse clusters must be aligned to sparse chunk
		 * alignment.
		 */
		if (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
		    !IS_ALIGNED(end, mp->m_sb.sb_spino_align))
			return -EFSCORRUPTED;
	} else {
		/*
		 * Without sparse inodes, the entire record must also adhere
		 * to the inode cluster alignment size.
		 */
		if (!IS_ALIGNED(agbno, igeo->cluster_align) ||
		    !IS_ALIGNED(end, igeo->cluster_align))
			return -EFSCORRUPTED;
	}

	/* Both the first and the last inode must be valid AG inodes. */
	if (!xfs_verify_agino(sc->sa.pag, XFS_AGB_TO_AGINO(mp, agbno)))
		return -EFSCORRUPTED;
	if (!xfs_verify_agino(sc->sa.pag, XFS_AGB_TO_AGINO(mp, end) - 1))
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
	if (error)
		return error;

	return outcome == XBTREE_RECPACKING_EMPTY ? 0 : -EFSCORRUPTED;
}
/* Found a fragment of the old inode btrees; dispose of them later. */
STATIC int
xrep_ibt_record_old_btree_blocks(
struct xrep_ibt *ri,
const struct xfs_rmap_irec *rec)
{
if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
rec->rm_blockcount))
return -EFSCORRUPTED;
return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
rec->rm_blockcount);
}
/* Record extents that belong to inode cluster blocks. */
STATIC int
xrep_ibt_record_inode_blocks(
struct xrep_ibt *ri,
const struct xfs_rmap_irec *rec)
{
struct xfs_mount *mp = ri->sc->mp;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_agblock_t cluster_base;
int error;
error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
rec->rm_blockcount);
if (error)
return error;
trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
rec->rm_offset, rec->rm_flags);
/*
* Record the free/hole masks for each inode cluster that could be
* mapped by this rmap record.
*/
for (cluster_base = 0;
cluster_base < rec->rm_blockcount;
cluster_base += igeo->blocks_per_cluster) {
error = xrep_ibt_process_cluster(ri,
rec->rm_startblock + cluster_base);
if (error)
return error;
}
return 0;
}
/*
 * Reverse-mapping walk callback.  Mappings owned by the inode btrees are
 * remembered as old btree blocks, mappings owned by inode clusters feed the
 * record reconstruction, and every other owner is ignored.
 */
STATIC int
xrep_ibt_walk_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_ibt			*ri = priv;
	int				error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

	if (rec->rm_owner == XFS_RMAP_OWN_INOBT)
		return xrep_ibt_record_old_btree_blocks(ri, rec);
	if (rec->rm_owner == XFS_RMAP_OWN_INODES)
		return xrep_ibt_record_inode_blocks(ri, rec);

	/* Not inode related; nothing to record. */
	return 0;
}
/*
 * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
 * btrees (OWN_INOBT).  Figure out if we have enough free space to reconstruct
 * the inode btrees.  The caller must clean up the lists if anything goes
 * wrong.
 */
STATIC int
xrep_ibt_find_inodes(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	int			error;

	/* No record is under construction yet. */
	ri->rie.ir_startino = NULLAGINO;

	/* Collect all reverse mappings for inode blocks. */
	xrep_ag_btcur_init(sc, &sc->sa);
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/* Nothing left over from the walk. */
	if (ri->rie.ir_startino == NULLAGINO)
		return 0;

	/* Flush the last record that was still being accumulated. */
	return xrep_ibt_stash(ri);
}
/* Update the AGI counters. */
STATIC int
xrep_ibt_reset_counters(
struct xrep_ibt *ri)
{
struct xfs_scrub *sc = ri->sc;
struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
unsigned int freecount = ri->icount - ri->iused;
/* Trigger inode count recalculation */
xfs_force_summary_recalc(sc->mp);
/*
* The AGI header contains extra information related to the inode
* btrees, so we must update those fields here.
*/
agi->agi_count = cpu_to_be32(ri->icount);
agi->agi_freecount = cpu_to_be32(freecount);
xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
/* Reinitialize with the values we just logged. */
return xrep_reinit_pagi(sc);
}
/*
 * Retrieve finobt data for bulk load.
 *
 * Copies up to @nr_wanted records into @block starting at slot @idx, skipping
 * stored records that have no free inodes.  Returns the number of records
 * loaded or a negative errno.
 */
STATIC int
xrep_fibt_get_records(
	struct xfs_btree_cur		*cur,
	unsigned int			idx,
	struct xfs_btree_block		*block,
	unsigned int			nr_wanted,
	void				*priv)
{
	struct xrep_ibt			*ri = priv;
	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded = 0;
	int				error;

	while (loaded < nr_wanted) {
		/* Advance to the next stored record with free inodes. */
		for (;;) {
			error = xfarray_load(ri->inode_records,
					ri->array_cur++, irec);
			if (error)
				return error;
			if (xfs_inobt_rec_freecount(irec) != 0)
				break;
		}

		block_rec = xfs_btree_rec_addr(cur, idx + loaded, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
		loaded++;
	}

	return loaded;
}
/*
 * Retrieve inobt data for bulk load.
 *
 * Copies the next @nr_wanted stored records into @block starting at slot
 * @idx.  Returns the number of records loaded or a negative errno.
 */
STATIC int
xrep_ibt_get_records(
	struct xfs_btree_cur		*cur,
	unsigned int			idx,
	struct xfs_btree_block		*block,
	unsigned int			nr_wanted,
	void				*priv)
{
	struct xrep_ibt			*ri = priv;
	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded = 0;
	int				error;

	while (loaded < nr_wanted) {
		error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx + loaded, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
		loaded++;
	}

	return loaded;
}
/*
 * Feed one of the new inobt blocks to the bulk loader.  @priv is the repair
 * context; the block comes from the new inobt's reservation.
 */
STATIC int
xrep_ibt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_ibt		*repair = priv;
	struct xrep_newbt	*xnr = &repair->new_inobt;

	return xrep_newbt_claim_block(cur, xnr, ptr);
}
/*
 * Feed one of the new finobt blocks to the bulk loader.  @priv is the repair
 * context; the block comes from the new finobt's reservation.
 */
STATIC int
xrep_fibt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_ibt		*repair = priv;
	struct xrep_newbt	*xnr = &repair->new_finobt;

	return xrep_newbt_claim_block(cur, xnr, ptr);
}
/* Make sure the records do not overlap in inumber address space. */
STATIC int
xrep_ibt_check_overlap(
struct xrep_ibt *ri)
{
struct xfs_inobt_rec_incore irec;
xfarray_idx_t cur;
xfs_agino_t next_agino = 0;
int error = 0;
foreach_xfarray_idx(ri->inode_records, cur) {
if (xchk_should_terminate(ri->sc, &error))
return error;
error = xfarray_load(ri->inode_records, cur, &irec);
if (error)
return error;
if (irec.ir_startino < next_agino)
return -EFSCORRUPTED;
next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
}
return error;
}
/* Build new inode btrees and dispose of the old one. */
STATIC int
xrep_ibt_build_new_trees(
struct xrep_ibt *ri)
{
struct xfs_scrub *sc = ri->sc;
struct xfs_btree_cur *ino_cur;
struct xfs_btree_cur *fino_cur = NULL;
xfs_fsblock_t fsbno;
bool need_finobt;
int error;
need_finobt = xfs_has_finobt(sc->mp);
/*
* Create new btrees for staging all the inobt records we collected
* earlier. The records were collected in order of increasing agino,
* so we do not have to sort them. Ensure there are no overlapping
* records.
*/
error = xrep_ibt_check_overlap(ri);
if (error)
return error;
/*
* The new inode btrees will not be rooted in the AGI until we've
* successfully rebuilt the tree.
*
* Start by setting up the inobt staging cursor.
*/
fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
XFS_IBT_BLOCK(sc->mp)),
xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
XFS_AG_RESV_NONE);
ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
ri->new_inobt.bload.get_records = xrep_ibt_get_records;
ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake,
XFS_BTNUM_INO);
error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
xfarray_length(ri->inode_records));
if (error)
goto err_inocur;
/* Set up finobt staging cursor. */
if (need_finobt) {
enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA;
if (sc->mp->m_finobt_nores)
resv = XFS_AG_RESV_NONE;
fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
XFS_FIBT_BLOCK(sc->mp)),
xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
fsbno, resv);
ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
ri->new_finobt.bload.get_records = xrep_fibt_get_records;
fino_cur = xfs_inobt_stage_cursor(sc->sa.pag,
&ri->new_finobt.afake, XFS_BTNUM_FINO);
error = xfs_btree_bload_compute_geometry(fino_cur,
&ri->new_finobt.bload, ri->finobt_recs);
if (error)
goto err_finocur;
}
/* Last chance to abort before we start committing fixes. */
if (xchk_should_terminate(sc, &error))
goto err_finocur;
/* Reserve all the space we need to build the new btrees. */
error = xrep_newbt_alloc_blocks(&ri->new_inobt,
ri->new_inobt.bload.nr_blocks);
if (error)
goto err_finocur;
if (need_finobt) {
error = xrep_newbt_alloc_blocks(&ri->new_finobt,
ri->new_finobt.bload.nr_blocks);
if (error)
goto err_finocur;
}
/* Add all inobt records. */
ri->array_cur = XFARRAY_CURSOR_INIT;
error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
if (error)
goto err_finocur;
/* Add all finobt records. */
if (need_finobt) {
ri->array_cur = XFARRAY_CURSOR_INIT;
error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
if (error)
goto err_finocur;
}
/*
* Install the new btrees in the AG header. After this point the old
* btrees are no longer accessible and the new trees are live.
*/
xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
xfs_btree_del_cursor(ino_cur, 0);
if (fino_cur) {
xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
xfs_btree_del_cursor(fino_cur, 0);
}
/* Reset the AGI counters now that we've changed the inode roots. */
error = xrep_ibt_reset_counters(ri);
if (error)
goto err_finobt;
/* Free unused blocks and bitmap. */
if (need_finobt) {
error = xrep_newbt_commit(&ri->new_finobt);
if (error)
goto err_inobt;
}
error = xrep_newbt_commit(&ri->new_inobt);
if (error)
return error;
return xrep_roll_ag_trans(sc);
err_finocur:
if (need_finobt)
xfs_btree_del_cursor(fino_cur, error);
err_inocur:
xfs_btree_del_cursor(ino_cur, error);
err_finobt:
if (need_finobt)
xrep_newbt_cancel(&ri->new_finobt);
err_inobt:
xrep_newbt_cancel(&ri->new_inobt);
return error;
}
/*
* Now that we've logged the roots of the new btrees, invalidate all of the
* old blocks and free them.
*/
STATIC int
xrep_ibt_remove_old_trees(
struct xrep_ibt *ri)
{
struct xfs_scrub *sc = ri->sc;
int error;
/*
* Free the old inode btree blocks if they're not in use. It's ok to
* reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
* reservation because we reset the reservation before releasing the
* AGI and AGF header buffer locks.
*/
error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
if (error)
return error;
/*
* If the finobt is enabled and has a per-AG reservation, make sure we
* reinitialize the per-AG reservations.
*/
if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
sc->flags |= XREP_RESET_PERAG_RESV;
return 0;
}
/* Repair both inode btrees. */
int
xrep_iallocbt(
struct xfs_scrub *sc)
{
struct xrep_ibt *ri;
struct xfs_mount *mp = sc->mp;
char *descr;
xfs_agino_t first_agino, last_agino;
int error = 0;
/* We require the rmapbt to rebuild anything. */
if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
if (!ri)
return -ENOMEM;
ri->sc = sc;
/* We rebuild both inode btrees. */
sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
/* Set up enough storage to handle an AG with nothing but inodes. */
xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
last_agino /= XFS_INODES_PER_CHUNK;
descr = xchk_xfile_ag_descr(sc, "inode index records");
error = xfarray_create(descr, last_agino,
sizeof(struct xfs_inobt_rec_incore),
&ri->inode_records);
kfree(descr);
if (error)
goto out_ri;
/* Collect the inode data and find the old btree blocks. */
xagb_bitmap_init(&ri->old_iallocbt_blocks);
error = xrep_ibt_find_inodes(ri);
if (error)
goto out_bitmap;
/* Rebuild the inode indexes. */
error = xrep_ibt_build_new_trees(ri);
if (error)
goto out_bitmap;
/* Kill the old tree. */
error = xrep_ibt_remove_old_trees(ri);
if (error)
goto out_bitmap;
out_bitmap:
xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
xfarray_destroy(ri->inode_records);
out_ri:
kfree(ri);
return error;
}
/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_iallocbt(
struct xfs_scrub *sc)
{
__u32 old_type = sc->sm->sm_type;
int error;
/*
* We must update sm_type temporarily so that the tree-to-tree cross
* reference checks will work in the correct direction, and also so
* that tracing will report correctly if there are more errors.
*/
sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
error = xchk_iallocbt(sc);
if (error)
goto out;
if (xfs_has_finobt(sc->mp)) {
sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
error = xchk_iallocbt(sc);
}
out:
sc->sm->sm_type = old_type;
return error;
}
......@@ -157,11 +157,13 @@ xrep_newbt_add_blocks(
resv->used = 0;
resv->pag = xfs_perag_hold(pag);
if (args->tp) {
ASSERT(xnr->oinfo.oi_offset == 0);
error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
if (error)
goto out_pag;
}
list_add_tail(&resv->list, &xnr->resv_list);
return 0;
......@@ -171,6 +173,30 @@ xrep_newbt_add_blocks(
return error;
}
/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 *
 * The extent is described by @agbno and @len within @pag's AG.  No
 * transaction is passed along, so no autoreap is scheduled for it.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_alloc_arg	args = { };

	args.tp = NULL;		/* no autoreap */
	args.oinfo = xnr->oinfo;
	args.fsbno = XFS_AGB_TO_FSB(xnr->sc->mp, pag->pag_agno, agbno);
	args.len = len;
	args.resv = xnr->resv;

	return xrep_newbt_add_blocks(xnr, pag, &args);
}
/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
......@@ -372,6 +398,7 @@ xrep_newbt_free_extent(
free_aglen, xnr->oinfo.oi_owner);
ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
/*
* Use EFIs to free the reservations. This reduces the chance
......@@ -517,3 +544,16 @@ xrep_newbt_claim_block(
/* Relog all the EFIs. */
return xrep_defer_finish(xnr->sc);
}
/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
struct xrep_newbt *xnr)
{
struct xrep_newbt_resv *resv;
unsigned int unused = 0;
list_for_each_entry(resv, &xnr->resv_list, list)
unused += resv->len - resv->used;
return unused;
}
......@@ -57,9 +57,12 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
int whichfork, const struct xfs_owner_info *oinfo);
int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
xfs_agblock_t agbno, xfs_extlen_t len);
void xrep_newbt_cancel(struct xrep_newbt *xnr);
int xrep_newbt_commit(struct xrep_newbt *xnr);
int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
union xfs_btree_ptr *ptr);
unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr);
#endif /* __XFS_SCRUB_NEWBT_H__ */
......@@ -37,6 +37,7 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/reap.h"
/*
......@@ -430,13 +431,12 @@ xreap_agextent_iter(
*/
STATIC int
xreap_agmeta_extent(
uint64_t fsbno,
uint64_t len,
uint32_t agbno,
uint32_t len,
void *priv)
{
struct xreap_state *rs = priv;
struct xfs_scrub *sc = rs->sc;
xfs_agblock_t agbno = fsbno;
xfs_agblock_t agbno_next = agbno + len;
int error = 0;
......
......@@ -441,7 +441,7 @@ xchk_refcountbt_rec(
struct xchk_refcbt_records *rrc = bs->private;
xfs_refcount_btrec_to_irec(rec, &irec);
if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) {
if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
......
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2018-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_inode.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_error.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
/*
* Rebuilding the Reference Count Btree
* ====================================
*
* This algorithm is "borrowed" from xfs_repair. Imagine the rmap
* entries as rectangles representing extents of physical blocks, and
* that the rectangles can be laid down to allow them to overlap each
* other; then we know that we must emit a refcnt btree entry wherever
* the amount of overlap changes, i.e. the emission stimulus is
* level-triggered:
*
* - ---
* -- ----- ---- --- ------
* -- ---- ----------- ---- ---------
* -------------------------------- -----------
* ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
* 2 1 23 21 3 43 234 2123 1 01 2 3 0
*
* For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
*
* Note that in the actual refcnt btree we don't store the refcount < 2
* cases because the bnobt tells us which blocks are free; single-use
* blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
* supports storing multiple entries covering a given block we could
* theoretically dispense with the refcntbt and simply count rmaps, but
* that's inefficient in the (hot) write path, so we'll take the cost of
* the extra tree to save time. Also there's no guarantee that rmap
* will be enabled.
*
* Given an array of rmaps sorted by physical block number, a starting
* physical block (sp), a bag to hold rmaps that cover sp, and the next
* physical block where the level changes (np), we can reconstruct the
* refcount btree as follows:
*
* While there are still unprocessed rmaps in the array,
* - Set sp to the physical block (pblk) of the next unprocessed rmap.
* - Add to the bag all rmaps in the array where startblock == sp.
* - Set np to the physical block where the bag size will change. This
* is the minimum of (the pblk of the next unprocessed rmap) and
* (startblock + len of each rmap in the bag).
* - Record the bag size as old_bag_size.
*
* - While the bag isn't empty,
* - Remove from the bag all rmaps where startblock + len == np.
* - Add to the bag all rmaps in the array where startblock == np.
* - If the bag size isn't old_bag_size, store the refcount entry
* (sp, np - sp, bag_size) in the refcnt btree.
* - If the bag is empty, break out of the inner loop.
* - Set old_bag_size to the bag size
* - Set sp = np.
* - Set np to the physical block where the bag size will change.
* This is the minimum of (the pblk of the next unprocessed rmap)
* and (startblock + len of each rmap in the bag).
*
* Like all the other repairers, we make a list of all the refcount
* records we need, then reinitialize the refcount btree root and
* insert all the records.
*/
/*
 * The only parts of the rmap that we care about for computing refcounts.
 * Instances are filled in from the rm_startblock/rm_blockcount fields of a
 * reverse mapping record and stored by value in the rmap bag xfarray.
 */
struct xrep_refc_rmap {
/* first AG block covered by the reverse mapping */
xfs_agblock_t startblock;
/* number of blocks covered by the reverse mapping */
xfs_extlen_t blockcount;
} __packed;
/* State carried through one rebuild of an AG's refcount btree. */
struct xrep_refc {
/* refcount extents */
struct xfarray *refcount_records;
/* new refcountbt information */
struct xrep_newbt new_btree;
/* old refcountbt blocks */
struct xagb_bitmap old_refcountbt_blocks;
/* scrub context that this repair is running under */
struct xfs_scrub *sc;
/* get_records()'s position in the refcount record array. */
xfarray_idx_t array_cur;
/* # of refcountbt blocks */
xfs_extlen_t btblocks;
};
/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
struct xfs_scrub *sc,
const struct xfs_refcount_irec *rec)
{
enum xbtree_recpacking outcome;
int error;
if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
return -EFSCORRUPTED;
/* Make sure this isn't free space. */
error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
rec->rc_blockcount, &outcome);
if (error)
return error;
if (outcome != XBTREE_RECPACKING_EMPTY)
return -EFSCORRUPTED;
/* Must not be an inode chunk. */
error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
rec->rc_startblock, rec->rc_blockcount, &outcome);
if (error)
return error;
if (outcome != XBTREE_RECPACKING_EMPTY)
return -EFSCORRUPTED;
return 0;
}
/*
 * Remember one reference count extent for the new btree.
 *
 * The refcount is clamped to MAXREFCOUNT, the resulting record is checked
 * with xrep_refc_check_ext(), and if it passes it is appended to the array
 * of refcount records.  Returns 0 or a negative errno.
 */
STATIC int
xrep_refc_stash(
	struct xrep_refc		*rr,
	enum xfs_refc_domain		domain,
	xfs_agblock_t			agbno,
	xfs_extlen_t			len,
	uint64_t			refcount)
{
	struct xfs_refcount_irec	rec = {
		.rc_startblock		= agbno,
		.rc_blockcount		= len,
		.rc_refcount		= min_t(uint64_t, MAXREFCOUNT, refcount),
		.rc_domain		= domain,
	};
	int				ret = 0;

	/* Give up early if the scrub has been asked to stop. */
	if (xchk_should_terminate(rr->sc, &ret))
		return ret;

	ret = xrep_refc_check_ext(rr->sc, &rec);
	if (ret)
		return ret;

	trace_xrep_refc_found(rr->sc->sa.pag, &rec);

	return xfarray_append(rr->refcount_records, &rec);
}
/*
 * Remember a CoW staging extent.  These are stashed in the CoW domain with a
 * reference count of exactly one.
 */
STATIC int
xrep_refc_stash_cow(
	struct xrep_refc	*rr,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	const uint64_t		cow_refcount = 1;

	return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len,
			cow_refcount);
}
/*
 * Could this reverse mapping describe a shared extent?
 *
 * An rmap qualifies only if all of the following hold, tested in this order:
 *  - the owner is an inode rather than AG metadata;
 *  - the owning inode is not one of the filesystem's internal inodes;
 *  - the mapping is not for an attr fork, a bmbt block, or an unwritten
 *    extent.
 */
static inline bool
xrep_refc_rmap_shareable(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rmap)
{
	return !XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner) &&
	       !xfs_internal_inum(mp, rmap->rm_owner) &&
	       !(rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
				   XFS_RMAP_UNWRITTEN));
}
/*
 * Step the rmapbt cursor forward until it reaches a reverse mapping that
 * could describe a shared extent.
 *
 * Along the way, CoW staging extents are stashed as refcount records and the
 * blocks owned by the old refcount btree are added to the bitmap of blocks
 * to dispose of later.
 *
 * On a successful return, *have_rec is true and @rrm holds the start and
 * length of the shareable mapping, or *have_rec is false because the cursor
 * ran out of records.  Returns 0 or a negative errno.
 */
STATIC int
xrep_refc_walk_rmaps(
	struct xrep_refc	*rr,
	struct xrep_refc_rmap	*rrm,
	bool			*have_rec)
{
	struct xfs_rmap_irec	rec;
	struct xfs_btree_cur	*rmap_cur = rr->sc->sa.rmap_cur;
	struct xfs_mount	*mp = rmap_cur->bc_mp;
	int			have_gt;
	int			error = 0;

	*have_rec = false;

	for (;;) {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		/* Move to the next record; stop quietly at the end. */
		error = xfs_btree_increment(rmap_cur, 0, &have_gt);
		if (error)
			return error;
		if (!have_gt)
			return 0;

		error = xfs_rmap_get_rec(rmap_cur, &rec, &have_gt);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(mp, !have_gt))
			return -EFSCORRUPTED;

		if (rec.rm_owner == XFS_RMAP_OWN_COW) {
			/* CoW staging extent, record it straight away. */
			error = xrep_refc_stash_cow(rr, rec.rm_startblock,
					rec.rm_blockcount);
		} else if (rec.rm_owner == XFS_RMAP_OWN_REFC) {
			/* Old refcountbt block, dump it when we're done. */
			rr->btblocks += rec.rm_blockcount;
			error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
					rec.rm_startblock, rec.rm_blockcount);
		}
		if (error)
			return error;

		/* Only written data fork extents of files can be shared. */
		if (xrep_refc_rmap_shareable(mp, &rec))
			break;
	}

	rrm->startblock = rec.rm_startblock;
	rrm->blockcount = rec.rm_blockcount;
	*have_rec = true;
	return 0;
}
/*
 * Compute the sort key for a refcount record: the start block with the CoW
 * flag bit cleared, then set again only if the record is in the CoW domain.
 */
static inline uint32_t
xrep_refc_encode_startblock(
	const struct xfs_refcount_irec	*irec)
{
	uint32_t			agbno;

	agbno = irec->rc_startblock & ~XFS_REFC_COWFLAG;

	return irec->rc_domain == XFS_REFC_DOMAIN_COW ?
			(agbno | XFS_REFC_COWFLAG) : agbno;
}
/*
 * Comparison function for sorting refcount records in the same order as the
 * ondisk records, i.e. by encoded start block.  Returns -1, 0, or 1.
 */
static int
xrep_refc_extent_cmp(
	const void			*a,
	const void			*b)
{
	uint32_t			key_a;
	uint32_t			key_b;

	key_a = xrep_refc_encode_startblock(a);
	key_b = xrep_refc_encode_startblock(b);

	if (key_a == key_b)
		return 0;
	return key_a > key_b ? 1 : -1;
}
/*
* Sort the refcount extents by startblock or else the btree records will be in
* the wrong order. Make sure the records do not overlap in physical space.
*/
STATIC int
xrep_refc_sort_records(
struct xrep_refc *rr)
{
struct xfs_refcount_irec irec;
xfarray_idx_t cur;
enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED;
xfs_agblock_t next_agbno = 0;
int error;
error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
XFARRAY_SORT_KILLABLE);
if (error)
return error;
foreach_xfarray_idx(rr->refcount_records, cur) {
if (xchk_should_terminate(rr->sc, &error))
return error;
error = xfarray_load(rr->refcount_records, cur, &irec);
if (error)
return error;
if (dom == XFS_REFC_DOMAIN_SHARED &&
irec.rc_domain == XFS_REFC_DOMAIN_COW) {
dom = irec.rc_domain;
next_agbno = 0;
}
if (dom != irec.rc_domain)
return -EFSCORRUPTED;
if (irec.rc_startblock < next_agbno)
return -EFSCORRUPTED;
next_agbno = irec.rc_startblock + irec.rc_blockcount;
}
return error;
}
#define RRM_NEXT(r) ((r).startblock + (r).blockcount)
/*
 * Work out the next block number at which the refcount changes.
 *
 * The candidates are the start of the next rmap that we looked at (only if
 * @next_valid is set) and the end of every rmap currently held in @rmap_bag;
 * the smallest of these is stored in *nbnop.  Returns 0, a negative errno
 * from iterating the bag, or -EFSCORRUPTED if there was no candidate at all.
 */
static inline int
xrep_refc_next_edge(
	struct xfarray		*rmap_bag,
	struct xrep_refc_rmap	*next_rrm,
	bool			next_valid,
	xfs_agblock_t		*nbnop)
{
	struct xrep_refc_rmap	bag_rrm;
	xfarray_idx_t		bag_idx = XFARRAY_CURSOR_INIT;
	xfs_agblock_t		edge;
	int			ret;

	edge = next_valid ? next_rrm->startblock : NULLAGBLOCK;

	for (;;) {
		ret = xfarray_iter(rmap_bag, &bag_idx, &bag_rrm);
		if (ret != 1)
			break;
		edge = min_t(xfs_agblock_t, edge, RRM_NEXT(bag_rrm));
	}
	if (ret)
		return ret;

	/*
	 * Either next_rrm or something in the bag should have given us an
	 * edge.  If neither did, something is seriously wrong, so bail out.
	 */
	if (edge == NULLAGBLOCK)
		return -EFSCORRUPTED;

	*nbnop = edge;
	return 0;
}
/*
 * Collect into @rmap_bag every shareable rmap that starts at @bno.
 *
 * On entry, @rrm and *have describe the rmap most recently returned by
 * xrep_refc_walk_rmaps().  While that rmap starts at @bno it is stored in
 * the bag, *stack_sz is bumped, and the walk moves on to the next shareable
 * rmap.  Once the loop stops, the rmapbt cursor is stepped back by one
 * record; it is treated as corruption if there is no record to step back to.
 * Returns 0 or a negative errno.
 */
static int
xrep_refc_push_rmaps_at(
	struct xrep_refc	*rr,
	struct xfarray		*rmap_bag,
	xfs_agblock_t		bno,
	struct xrep_refc_rmap	*rrm,
	bool			*have,
	uint64_t		*stack_sz)
{
	struct xfs_scrub	*sc = rr->sc;
	int			have_gt;
	int			error;

	for (;;) {
		if (!*have || rrm->startblock != bno)
			break;

		error = xfarray_store_anywhere(rmap_bag, rrm);
		if (error)
			return error;
		*stack_sz += 1;

		error = xrep_refc_walk_rmaps(rr, rrm, have);
		if (error)
			return error;
	}

	error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
	if (error)
		return error;

	return XFS_IS_CORRUPT(sc->mp, !have_gt) ? -EFSCORRUPTED : 0;
}
/*
 * Iterate all the rmap records to generate reference count data.
 *
 * This walks the rmapbt from its left edge, keeping the rmaps that cover the
 * current block in a bag.  Each time the number of rmaps in the bag changes,
 * and the previous count was greater than one, a shared refcount record is
 * stashed for the range that had that count.  CoW staging extents and old
 * refcountbt blocks are picked up as a side effect of xrep_refc_walk_rmaps().
 *
 * Returns 0 or a negative errno.  The AG btree cursors set up here are freed
 * before returning.
 */
STATIC int
xrep_refc_find_refcounts(
struct xrep_refc *rr)
{
struct xrep_refc_rmap rrm;
struct xfs_scrub *sc = rr->sc;
struct xfarray *rmap_bag;
char *descr;
/* bag size at the start of the range now being measured */
uint64_t old_stack_sz;
/* number of rmaps currently in the bag */
uint64_t stack_sz = 0;
/* block at which the current step of the sweep started */
xfs_agblock_t sbno;
/* start of the range that the next refcount record would cover */
xfs_agblock_t cbno;
/* next block at which the bag size may change */
xfs_agblock_t nbno;
bool have;
int error;
xrep_ag_btcur_init(sc, &sc->sa);
/*
* Set up a sparse array to store all the rmap records that we're
* tracking to generate a reference count record. If this exceeds
* MAXREFCOUNT, we clamp rc_refcount.
*/
descr = xchk_xfile_ag_descr(sc, "rmap record bag");
error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
&rmap_bag);
kfree(descr);
if (error)
goto out_cur;
/* Start the rmapbt cursor to the left of all records. */
error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
if (error)
goto out_bag;
/* Process reverse mappings into refcount data. */
while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
/* Push all rmaps with pblk == sbno onto the stack */
error = xrep_refc_walk_rmaps(rr, &rrm, &have);
if (error)
goto out_bag;
if (!have)
break;
sbno = cbno = rrm.startblock;
error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
&rrm, &have, &stack_sz);
if (error)
goto out_bag;
/* Set nbno to the bno of the next refcount change */
error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
if (error)
goto out_bag;
ASSERT(nbno > sbno);
old_stack_sz = stack_sz;
/* While stack isn't empty... */
while (stack_sz) {
xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
/* Pop all rmaps that end at nbno */
while ((error = xfarray_iter(rmap_bag, &array_cur,
&rrm)) == 1) {
if (RRM_NEXT(rrm) != nbno)
continue;
/*
* xfarray_iter() leaves the cursor one past the record
* it just returned, hence the - 1.
*/
error = xfarray_unset(rmap_bag, array_cur - 1);
if (error)
goto out_bag;
stack_sz--;
}
if (error)
goto out_bag;
/* Push array items that start at nbno */
error = xrep_refc_walk_rmaps(rr, &rrm, &have);
if (error)
goto out_bag;
if (have) {
error = xrep_refc_push_rmaps_at(rr, rmap_bag,
nbno, &rrm, &have, &stack_sz);
if (error)
goto out_bag;
}
/* Emit refcount if necessary */
ASSERT(nbno > cbno);
if (stack_sz != old_stack_sz) {
/* Only ranges with two or more owners get a shared record. */
if (old_stack_sz > 1) {
error = xrep_refc_stash(rr,
XFS_REFC_DOMAIN_SHARED,
cbno, nbno - cbno,
old_stack_sz);
if (error)
goto out_bag;
}
/* The next candidate range begins where this one ended. */
cbno = nbno;
}
/* Stack empty, go find the next rmap */
if (stack_sz == 0)
break;
old_stack_sz = stack_sz;
sbno = nbno;
/* Set nbno to the bno of the next refcount change */
error = xrep_refc_next_edge(rmap_bag, &rrm, have,
&nbno);
if (error)
goto out_bag;
ASSERT(nbno > sbno);
}
}
ASSERT(stack_sz == 0);
out_bag:
xfarray_destroy(rmap_bag);
out_cur:
xchk_ag_btcur_free(&sc->sa);
return error;
}
#undef RRM_NEXT
/*
 * Bulk loader callback: copy the next @nr_wanted refcount records into
 * @block, starting at record slot @idx.
 *
 * Each record is loaded from the sorted record array into the cursor's
 * incore record and then formatted into the block by the btree's
 * init_rec_from_cur method.  Returns the number of records loaded, or a
 * negative errno if a record could not be read.
 */
STATIC int
xrep_refc_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xrep_refc	*rr = priv;
	unsigned int		nr = 0;
	int			ret;

	while (nr < nr_wanted) {
		ret = xfarray_load(rr->refcount_records, rr->array_cur++,
				&cur->bc_rec.rc);
		if (ret)
			return ret;

		cur->bc_ops->init_rec_from_cur(cur,
				xfs_btree_rec_addr(cur, idx, block));
		nr++;
		idx++;
	}

	return nr;
}
/*
 * Bulk loader callback: hand over one of the blocks reserved for the new
 * btree by claiming it from this repair's new btree structure.
 */
STATIC int
xrep_refc_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_refc	*rr = priv;
	struct xrep_newbt	*xnr = &rr->new_btree;

	return xrep_newbt_claim_block(cur, xnr, ptr);
}
/*
 * Refresh the incore AGF state after the new btree root has been logged.
 *
 * Once the new btree is committed, reaping the old btree blocks can race
 * with the AIL trying to checkpoint those old blocks.  If the new tree is
 * shorter than the old one, the refcountbt write verifier would fail and the
 * AIL would shut down the filesystem.  To avoid that, the old incore height
 * is saved as the alternate (repair) height before the perag is
 * reinitialized from the updated AGF.
 */
STATIC int
xrep_refc_reset_counters(
	struct xrep_refc	*rr)
{
	struct xfs_perag	*pag = rr->sc->sa.pag;

	/* Save the old height so that the old blocks still verify. */
	pag->pagf_repair_refcount_level = pag->pagf_refcount_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(rr->sc);
}
/*
 * Use the collected refcount information to stage a new refcount btree. If
 * this is successful we'll return with the new btree root information logged
 * to the repair transaction but not yet committed.
 *
 * NOTE(review): the success path ends by calling xrep_roll_ag_trans(), so
 * confirm that "not yet committed" still describes the state on return.
 *
 * Returns 0 or a negative errno.  If a failure occurs before the new btree
 * is installed in the AG header, the staging cursor is deleted and the new
 * btree reservation is cancelled.
 */
STATIC int
xrep_refc_build_new_tree(
struct xrep_refc *rr)
{
struct xfs_scrub *sc = rr->sc;
struct xfs_btree_cur *refc_cur;
struct xfs_perag *pag = sc->sa.pag;
xfs_fsblock_t fsbno;
int error;
/* Sort the records and check that they do not overlap. */
error = xrep_refc_sort_records(rr);
if (error)
return error;
/*
* Prepare to construct the new btree by reserving disk space for the
* new btree and setting up all the accounting information we'll need
* to root the new btree while it's under construction and before we
* attach it to the AG header.
*/
fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp));
xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno,
XFS_AG_RESV_METADATA);
rr->new_btree.bload.get_records = xrep_refc_get_records;
rr->new_btree.bload.claim_block = xrep_refc_claim_block;
/* Compute how many blocks we'll need. */
refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake,
pag);
error = xfs_btree_bload_compute_geometry(refc_cur,
&rr->new_btree.bload,
xfarray_length(rr->refcount_records));
if (error)
goto err_cur;
/* Last chance to abort before we start committing fixes. */
if (xchk_should_terminate(sc, &error))
goto err_cur;
/* Reserve the space we'll need for the new btree. */
error = xrep_newbt_alloc_blocks(&rr->new_btree,
rr->new_btree.bload.nr_blocks);
if (error)
goto err_cur;
/*
* Due to btree slack factors, it's possible for a new btree to be one
* level taller than the old btree. Update the incore btree height so
* that we don't trip the verifiers when writing the new btree blocks
* to disk.
*/
pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;
/* Add all observed refcount records. */
rr->array_cur = XFARRAY_CURSOR_INIT;
error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
if (error)
goto err_level;
/*
* Install the new btree in the AG header. After this point the old
* btree is no longer accessible and the new tree is live.
*/
xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
xfs_btree_del_cursor(refc_cur, 0);
/* Reset the AGF counters now that we've changed the btree shape. */
error = xrep_refc_reset_counters(rr);
if (error)
goto err_newbt;
/* Dispose of any unused blocks and the accounting information. */
error = xrep_newbt_commit(&rr->new_btree);
if (error)
return error;
return xrep_roll_ag_trans(sc);
/* The bulk load failed, so turn the alternate height back off. */
err_level:
pag->pagf_repair_refcount_level = 0;
/* The staging cursor is still ours to delete on these paths. */
err_cur:
xfs_btree_del_cursor(refc_cur, error);
/* The cursor is already gone here; only the new btree is cancelled. */
err_newbt:
xrep_newbt_cancel(&rr->new_btree);
return error;
}
/*
 * With the roots of the new btree logged, invalidate all of the old
 * refcountbt blocks and free the ones that are not in use.
 *
 * If the reap succeeds, the alternate btree height is switched off and the
 * scrub context is flagged so that the per-AG space reservation gets reset.
 * Returns 0 or the errno from the reap.
 */
STATIC int
xrep_refc_remove_old_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	int			ret;

	ret = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
	if (!ret) {
		/*
		 * The old refcountbt blocks are gone, so the alternate
		 * height is no longer needed.
		 */
		pag->pagf_repair_refcount_level = 0;
		sc->flags |= XREP_RESET_PERAG_RESV;
	}

	return ret;
}
/*
 * Rebuild the refcount btree of the AG attached to the scrub context.
 *
 * The repair runs in three stages: collect refcount records from the rmapbt,
 * build and install a new btree from them, and then remove the old btree.
 * Returns -EOPNOTSUPP if the filesystem has no rmapbt, -ENOMEM if the repair
 * context cannot be allocated, or the result of the first stage that fails.
 */
int
xrep_refcountbt(
	struct xfs_scrub	*sc)
{
	struct xrep_refc	*rr;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	rr = kzalloc(sizeof(*rr), XCHK_GFP_FLAGS);
	if (!rr)
		return -ENOMEM;
	rr->sc = sc;

	/* Set up enough storage to handle one refcount record per block. */
	descr = xchk_xfile_ag_descr(sc, "reference count records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks,
			sizeof(struct xfs_refcount_irec),
			&rr->refcount_records);
	kfree(descr);
	if (error)
		goto out_rr;

	xagb_bitmap_init(&rr->old_refcountbt_blocks);

	/* Run each stage only if the one before it succeeded. */
	error = xrep_refc_find_refcounts(rr);
	if (!error)
		error = xrep_refc_build_new_tree(rr);
	if (!error)
		error = xrep_refc_remove_old_tree(rr);

	xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
	xfarray_destroy(rr->refcount_records);
out_rr:
	kfree(rr);
	return error;
}
......@@ -734,3 +734,134 @@ xrep_ino_dqattach(
return error;
}
/*
 * Set up the AG btree cursors that a repair uses for cross-referencing,
 * leaving out the cursor for the btree type named by the scrub request.
 * The rmapbt and refcountbt cursors are created only if the filesystem has
 * that feature; the finobt cursor likewise depends on the finobt feature.
 */
void
xrep_ag_btcur_init(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	/* Free space btrees, unless one of them is being rebuilt. */
	switch (sc->sm->sm_type) {
	case XFS_SCRUB_TYPE_BNOBT:
	case XFS_SCRUB_TYPE_CNTBT:
		break;
	default:
		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag, XFS_BTNUM_BNO);
		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag, XFS_BTNUM_CNT);
		break;
	}

	/* Inode btrees, unless one of them is being rebuilt. */
	switch (sc->sm->sm_type) {
	case XFS_SCRUB_TYPE_INOBT:
	case XFS_SCRUB_TYPE_FINOBT:
		break;
	default:
		sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
				sa->agi_bp, XFS_BTNUM_INO);
		if (xfs_has_finobt(mp))
			sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
					sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
		break;
	}

	/* Reverse mapping btree. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT && xfs_has_rmapbt(mp))
		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag);

	/* Reference count btree. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT && xfs_has_reflink(mp))
		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
				sa->agf_bp, sc->sa.pag);
}
/*
* Reinitialize the in-core AG state after a repair by rereading the AGF
* buffer. We had better get the same AGF buffer as the one that's attached
* to the scrub context.
*/
int
xrep_reinit_pagf(
struct xfs_scrub *sc)
{
struct xfs_perag *pag = sc->sa.pag;
struct xfs_buf *bp;
int error;
ASSERT(pag);
ASSERT(xfs_perag_initialised_agf(pag));
clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
if (error)
return error;
if (bp != sc->sa.agf_bp) {
ASSERT(bp == sc->sa.agf_bp);
return -EFSCORRUPTED;
}
return 0;
}
/*
* Reinitialize the in-core AG state after a repair by rereading the AGI
* buffer. We had better get the same AGI buffer as the one that's attached
* to the scrub context.
*/
int
xrep_reinit_pagi(
struct xfs_scrub *sc)
{
struct xfs_perag *pag = sc->sa.pag;
struct xfs_buf *bp;
int error;
ASSERT(pag);
ASSERT(xfs_perag_initialised_agi(pag));
clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
if (error)
return error;
if (bp != sc->sa.agi_bp) {
ASSERT(bp == sc->sa.agi_bp);
return -EFSCORRUPTED;
}
return 0;
}
/* Reinitialize the per-AG block reservation for the AG we just fixed. */
int
xrep_reset_perag_resv(
struct xfs_scrub *sc)
{
int error;
if (!(sc->flags & XREP_RESET_PERAG_RESV))
return 0;
ASSERT(sc->sa.pag != NULL);
ASSERT(sc->ops->type == ST_PERAG);
ASSERT(sc->tp);
sc->flags &= ~XREP_RESET_PERAG_RESV;
error = xfs_ag_resv_free(sc->sa.pag);
if (error)
goto out;
error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
if (error == -ENOSPC) {
xfs_err(sc->mp,
"Insufficient free space to reset per-AG reservation for AG %u after repair.",
sc->sa.pag->pag_agno);
error = 0;
}
out:
return error;
}
......@@ -59,6 +59,17 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
int xrep_reset_perag_resv(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
/* Metadata revalidators */
int xrep_revalidate_allocbt(struct xfs_scrub *sc);
int xrep_revalidate_iallocbt(struct xfs_scrub *sc);
/* Metadata repairers */
......@@ -67,6 +78,12 @@ int xrep_superblock(struct xfs_scrub *sc);
int xrep_agf(struct xfs_scrub *sc);
int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
int xrep_allocbt(struct xfs_scrub *sc);
int xrep_iallocbt(struct xfs_scrub *sc);
int xrep_refcountbt(struct xfs_scrub *sc);
int xrep_reinit_pagf(struct xfs_scrub *sc);
int xrep_reinit_pagi(struct xfs_scrub *sc);
#else
......@@ -87,11 +104,37 @@ xrep_calc_ag_resblks(
return 0;
}
/*
 * Stand-in for kernels built without online repair.  Nothing should have
 * asked for a reservation reset, so trip an assertion and refuse if the flag
 * is set; otherwise there is nothing to do.
 */
static inline int
xrep_reset_perag_resv(
	struct xfs_scrub	*sc)
{
	if (sc->flags & XREP_RESET_PERAG_RESV) {
		ASSERT(0);
		return -EOPNOTSUPP;
	}

	return 0;
}
/*
 * Repair setup function for no-repair builds: there is nothing to prepare,
 * so always report success.
 */
static inline int
xrep_setup_nothing(
	struct xfs_scrub	*sc)
{
	return 0;
}
#define xrep_setup_ag_allocbt xrep_setup_nothing
#define xrep_revalidate_allocbt (NULL)
#define xrep_revalidate_iallocbt (NULL)
#define xrep_probe xrep_notsupported
#define xrep_superblock xrep_notsupported
#define xrep_agf xrep_notsupported
#define xrep_agfl xrep_notsupported
#define xrep_agi xrep_notsupported
#define xrep_allocbt xrep_notsupported
#define xrep_iallocbt xrep_notsupported
#define xrep_refcountbt xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
......
......@@ -24,6 +24,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
/*
* Set us up to scrub reverse mapping btrees.
......
......@@ -238,27 +238,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
[XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
.scrub = xchk_bnobt,
.repair = xrep_notsupported,
.scrub = xchk_allocbt,
.repair = xrep_allocbt,
.repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
.scrub = xchk_cntbt,
.repair = xrep_notsupported,
.scrub = xchk_allocbt,
.repair = xrep_allocbt,
.repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_INOBT] = { /* inobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
.scrub = xchk_inobt,
.repair = xrep_notsupported,
.scrub = xchk_iallocbt,
.repair = xrep_iallocbt,
.repair_eval = xrep_revalidate_iallocbt,
},
[XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
.scrub = xchk_finobt,
.scrub = xchk_iallocbt,
.has = xfs_has_finobt,
.repair = xrep_notsupported,
.repair = xrep_iallocbt,
.repair_eval = xrep_revalidate_iallocbt,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
......@@ -272,7 +276,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.setup = xchk_setup_ag_refcountbt,
.scrub = xchk_refcountbt,
.has = xfs_has_reflink,
.repair = xrep_notsupported,
.repair = xrep_refcountbt,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
.type = ST_INODE,
......@@ -531,6 +535,9 @@ xfs_scrub_metadata(
/* Scrub for errors. */
check_start = xchk_stats_now();
if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
error = sc->ops->repair_eval(sc);
else
error = sc->ops->scrub(sc);
run.scrub_ns += xchk_stats_elapsed_ns(check_start);
if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
......@@ -542,8 +549,7 @@ xfs_scrub_metadata(
xchk_update_health(sc);
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED)) {
if (xchk_could_repair(sc)) {
bool needs_fix = xchk_needs_repair(sc->sm);
/* Userspace asked us to rebuild the structure regardless. */
......
......@@ -35,6 +35,14 @@ struct xchk_meta_ops {
/* Repair or optimize the metadata. */
int (*repair)(struct xfs_scrub *);
/*
* Re-scrub the metadata we repaired, in case there's extra work that
* we need to do to check our repair work. If this is NULL, we'll use
* the ->scrub function pointer, assuming that the regular scrub is
* sufficient.
*/
int (*repair_eval)(struct xfs_scrub *sc);
/* Decide if we even have this piece of metadata. */
bool (*has)(struct xfs_mount *);
......@@ -113,6 +121,7 @@ struct xfs_scrub {
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
/*
......@@ -129,10 +138,8 @@ int xchk_superblock(struct xfs_scrub *sc);
int xchk_agf(struct xfs_scrub *sc);
int xchk_agfl(struct xfs_scrub *sc);
int xchk_agi(struct xfs_scrub *sc);
int xchk_bnobt(struct xfs_scrub *sc);
int xchk_cntbt(struct xfs_scrub *sc);
int xchk_inobt(struct xfs_scrub *sc);
int xchk_finobt(struct xfs_scrub *sc);
int xchk_allocbt(struct xfs_scrub *sc);
int xchk_iallocbt(struct xfs_scrub *sc);
int xchk_rmapbt(struct xfs_scrub *sc);
int xchk_refcountbt(struct xfs_scrub *sc);
int xchk_inode(struct xfs_scrub *sc);
......
......@@ -106,6 +106,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
{ XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
DECLARE_EVENT_CLASS(xchk_class,
......@@ -1172,33 +1173,89 @@ DEFINE_EVENT(xrep_rmap_class, name, \
xfs_agblock_t agbno, xfs_extlen_t len, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn);
DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn);
DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn);
TRACE_EVENT(xrep_refcount_extent_fn,
TRACE_EVENT(xrep_abt_found,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
struct xfs_refcount_irec *irec),
TP_ARGS(mp, agno, irec),
const struct xfs_alloc_rec_incore *rec),
TP_ARGS(mp, agno, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
__entry->startblock = rec->ar_startblock;
__entry->blockcount = rec->ar_blockcount;
),
TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
__entry->blockcount)
)
/*
 * Tracepoint that records an incore inode btree record together with the
 * device and AG number that it belongs to.  The start inode, hole mask,
 * inode count, free count and free mask are copied from @rec.
 */
TRACE_EVENT(xrep_ibt_found,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
const struct xfs_inobt_rec_incore *rec),
TP_ARGS(mp, agno, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agino_t, startino)
__field(uint16_t, holemask)
__field(uint8_t, count)
__field(uint8_t, freecount)
__field(uint64_t, freemask)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
__entry->startino = rec->ir_startino;
__entry->holemask = rec->ir_holemask;
__entry->count = rec->ir_count;
__entry->freecount = rec->ir_freecount;
__entry->freemask = rec->ir_free;
),
TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startino,
__entry->holemask,
__entry->count,
__entry->freecount,
__entry->freemask)
)
/*
 * Tracepoint that records an incore refcount record.  The device and AG
 * number come from @pag; the domain, start block, block count and refcount
 * are copied from @rec.  The domain is printed symbolically using
 * XFS_REFC_DOMAIN_STRINGS.
 */
TRACE_EVENT(xrep_refc_found,
TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec),
TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
),
TP_fast_assign(
__entry->dev = pag->pag_mount->m_super->s_dev;
__entry->agno = pag->pag_agno;
__entry->domain = rec->rc_domain;
__entry->startblock = rec->rc_startblock;
__entry->blockcount = rec->rc_blockcount;
__entry->refcount = rec->rc_refcount;
),
TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
)
......@@ -1299,39 +1356,6 @@ TRACE_EVENT(xrep_reset_counters,
MAJOR(__entry->dev), MINOR(__entry->dev))
)
/*
 * Tracepoint that records the fields of an inode btree record, passed in as
 * individual arguments, together with the device and AG number.
 */
TRACE_EVENT(xrep_ialloc_insert,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t startino, uint16_t holemask, uint8_t count,
uint8_t freecount, uint64_t freemask),
TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agino_t, startino)
__field(uint16_t, holemask)
__field(uint8_t, count)
__field(uint8_t, freecount)
__field(uint64_t, freemask)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
__entry->startino = startino;
__entry->holemask = holemask;
__entry->count = count;
__entry->freecount = freecount;
__entry->freemask = freemask;
),
TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startino,
__entry->holemask,
__entry->count,
__entry->freecount,
__entry->freemask)
)
DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len,
......
......@@ -54,6 +54,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr)
uint64_t xfarray_length(struct xfarray *array);
int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec);
/*
 * Fetch the next non-null element of a sparse xfarray.
 *
 * Set *idx to XFARRAY_CURSOR_INIT before the first call.  After each call it
 * is one more than the index of the record that was copied into @rec.
 *
 * Returns 1 if a record was retrieved, 0 if there are no more records, or a
 * negative errno.
 */
static inline int
xfarray_iter(
	struct xfarray	*array,
	xfarray_idx_t	*idx,
	void		*rec)
{
	int		ret = xfarray_load_next(array, idx, rec);

	switch (ret) {
	case -ENODATA:
		/* Ran off the end of the array. */
		return 0;
	case 0:
		/* Got a record. */
		return 1;
	default:
		return ret;
	}
}
/* Declarations for xfile array sort functionality. */
typedef cmp_func_t xfarray_cmp_fn;
......
......@@ -678,3 +678,16 @@ xfs_extent_busy_ag_cmp(
diff = b1->bno - b2->bno;
return diff;
}
/*
 * Report whether this AG's busy extent tree is empty.  The tree root is
 * sampled while holding pagb_lock.
 */
bool
xfs_extent_busy_list_empty(
	struct xfs_perag	*pag)
{
	bool			empty;

	spin_lock(&pag->pagb_lock);
	empty = RB_EMPTY_ROOT(&pag->pagb_tree);
	spin_unlock(&pag->pagb_lock);

	return empty;
}
......@@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
#endif /* __XFS_EXTENT_BUSY_H__ */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment