Commit 8394a97c authored by Chandan Babu R

Merge tag 'in-memory-btrees-6.9_2024-02-23' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.9-mergeC

xfs: support in-memory btrees

Online repair of the reverse-mapping btrees presents some unique
challenges.  To construct a new reverse mapping btree, we must scan the
entire filesystem, but we cannot afford to quiesce the entire filesystem
for the potentially lengthy scan.

For rmap btrees, therefore, we relax our requirements of totally atomic
repairs.  Instead, repairs will scan all inodes, construct a new reverse
mapping dataset, format a new btree, and commit it before anyone trips
over the corruption.  This is exactly the same strategy as was used in
the quotacheck and nlink scanners.

Unfortunately, the xfarray cannot perform key-based lookups and is
therefore unsuitable for supporting live updates.  Luckily, we already have
a data structure that maintains an indexed rmap recordset -- the existing
rmap btree code!  Hence we port the existing btree and buffer target
code to be able to create a btree using the xfile we developed earlier.
Live hooks keep the in-memory btree up to date for any resources that
have already been scanned.

This approach is not maximally memory efficient, but we can use the same
rmap code that we do everywhere else, which provides improved stability
without growing the code base even more.  Note that in-memory btree
blocks are always page sized.

This patchset modifies the kernel xfs buffer cache to be capable of
using an xfile (aka a shmem file) as a backing device.  It then augments
the btree code to support creating btree cursors with buffers that come
from a buftarg other than the data device (namely an xfile-backed
buftarg).  For the userspace xfs buffer cache, we instead use a memfd or
an O_TMPFILE file as a backing device.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'in-memory-btrees-6.9_2024-02-23' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: launder in-memory btree buffers before transaction commit
  xfs: support in-memory btrees
  xfs: add a xfs_btree_ptrs_equal helper
  xfs: support in-memory buffer cache targets
  xfs: teach buftargs to maintain their own buffer hashtable
parents aa8fb4bb 0dc63c8a
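The pieces merged here combine into a simple lifecycle for a staging btree: allocate a shmem-backed buffer target, initialize an xfbtree on it, stage records through ordinary btree cursors and transactions, then tear everything down.  The sketch below is illustrative only and is not code from this series: the function name and description string are made up, the ops argument stands for any long-format CRC btree ops vector, and the per-btree wrappers that normally pick the ops and fill in xfbt->owner are elided.

STATIC int
xfbtree_lifecycle_sketch(
	struct xfs_mount		*mp,
	const struct xfs_btree_ops	*ops)
{
	struct xfs_buftarg	*btp;
	struct xfbtree		xfbt;
	int			error;

	/* Create an xfile (shmem file) backed buffer cache target. */
	error = xmbuf_alloc(mp, "staging btree", &btp);
	if (error)
		return error;

	/* Write an initial root block for the in-memory btree. */
	error = xfbtree_init(mp, &xfbt, btp, ops);
	if (error)
		goto out_target;

	/* ... create cursors against &xfbt and stage records here ... */

	xfbtree_destroy(&xfbt);		/* drain the staging btree's buffers */
out_target:
	xmbuf_free(btp);
	return error;
}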
......@@ -2270,13 +2270,12 @@ follows:
pointing to the xfile.
3. Pass the buffer cache target, buffer ops, and other information to
``xfbtree_create`` to write an initial tree header and root block to the
xfile.
``xfbtree_init`` to initialize the passed in ``struct xfbtree`` and write an
initial root block to the xfile.
Each btree type should define a wrapper that passes necessary arguments to
the creation function.
For example, rmap btrees define ``xfs_rmapbt_mem_create`` to take care of
all the necessary details for callers.
A ``struct xfbtree`` object will be returned.
4. Pass the xfbtree object to the btree cursor creation function for the
btree type.
......
......@@ -128,6 +128,12 @@ config XFS_LIVE_HOOKS
bool
select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
config XFS_MEMORY_BUFS
bool
config XFS_BTREE_IN_MEM
bool
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
......@@ -135,6 +141,7 @@ config XFS_ONLINE_SCRUB
depends on TMPFS && SHMEM
select XFS_LIVE_HOOKS
select XFS_DRAIN_INTENTS
select XFS_MEMORY_BUFS
help
If you say Y here you will be able to check metadata on a
mounted XFS filesystem. This feature is intended to reduce
......@@ -169,6 +176,7 @@ config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
default n
depends on XFS_FS && XFS_ONLINE_SCRUB
select XFS_BTREE_IN_MEM
help
If you say Y here you will be able to repair metadata on a
mounted XFS filesystem. This feature is intended to reduce
......
......@@ -137,6 +137,8 @@ endif
xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o
xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o
xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o
xfs-$(CONFIG_XFS_BTREE_IN_MEM) += libxfs/xfs_btree_mem.o
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
......
......@@ -264,7 +264,7 @@ xfs_free_perag(
xfs_defer_drain_free(&pag->pag_intents_drain);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_buf_hash_destroy(pag);
xfs_buf_cache_destroy(&pag->pag_bcache);
/* drop the mount's active reference */
xfs_perag_rele(pag);
......@@ -352,7 +352,7 @@ xfs_free_unused_perag_range(
spin_unlock(&mp->m_perag_lock);
if (!pag)
break;
xfs_buf_hash_destroy(pag);
xfs_buf_cache_destroy(&pag->pag_bcache);
xfs_defer_drain_free(&pag->pag_intents_drain);
kfree(pag);
}
......@@ -419,7 +419,7 @@ xfs_initialize_perag(
pag->pagb_tree = RB_ROOT;
#endif /* __KERNEL__ */
error = xfs_buf_hash_init(pag);
error = xfs_buf_cache_init(&pag->pag_bcache);
if (error)
goto out_remove_pag;
......
......@@ -106,9 +106,7 @@ struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
/* buffer cache index */
spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
struct rhashtable pag_buf_hash;
struct xfs_buf_cache pag_bcache;
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
......
......@@ -28,6 +28,8 @@
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_health.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
/*
* Btree magic numbers.
......@@ -75,6 +77,25 @@ xfs_btree_check_fsblock_siblings(
return NULL;
}
static inline xfs_failaddr_t
xfs_btree_check_memblock_siblings(
struct xfs_buftarg *btp,
xfbno_t bno,
__be64 dsibling)
{
xfbno_t sibling;
if (dsibling == cpu_to_be64(NULLFSBLOCK))
return NULL;
sibling = be64_to_cpu(dsibling);
if (sibling == bno)
return __this_address;
if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
return __this_address;
return NULL;
}
static inline xfs_failaddr_t
xfs_btree_check_agblock_siblings(
struct xfs_perag *pag,
......@@ -164,6 +185,34 @@ __xfs_btree_check_fsblock(
return fa;
}
/*
* Check an in-memory btree block header. Return the address of the failing
* check, or NULL if everything is ok.
*/
static xfs_failaddr_t
__xfs_btree_check_memblock(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
xfs_failaddr_t fa;
xfbno_t bno;
fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
if (fa)
return fa;
bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
fa = xfs_btree_check_memblock_siblings(btp, bno,
block->bb_u.l.bb_leftsib);
if (!fa)
fa = xfs_btree_check_memblock_siblings(btp, bno,
block->bb_u.l.bb_rightsib);
return fa;
}
/*
* Check a short btree block header. Return the address of the failing check,
* or NULL if everything is ok.
......@@ -216,9 +265,17 @@ __xfs_btree_check_block(
int level,
struct xfs_buf *bp)
{
if (cur->bc_ops->type == XFS_BTREE_TYPE_AG)
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_MEM:
return __xfs_btree_check_memblock(cur, block, level, bp);
case XFS_BTREE_TYPE_AG:
return __xfs_btree_check_agblock(cur, block, level, bp);
return __xfs_btree_check_fsblock(cur, block, level, bp);
case XFS_BTREE_TYPE_INODE:
return __xfs_btree_check_fsblock(cur, block, level, bp);
default:
ASSERT(0);
return __this_address;
}
}
static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
......@@ -262,14 +319,22 @@ __xfs_btree_check_ptr(
if (level <= 0)
return -EFSCORRUPTED;
if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_MEM:
if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
be64_to_cpu((&ptr->l)[index])))
return -EFSCORRUPTED;
break;
case XFS_BTREE_TYPE_INODE:
if (!xfs_verify_fsbno(cur->bc_mp,
be64_to_cpu((&ptr->l)[index])))
return -EFSCORRUPTED;
} else {
break;
case XFS_BTREE_TYPE_AG:
if (!xfs_verify_agbno(cur->bc_ag.pag,
be32_to_cpu((&ptr->s)[index])))
return -EFSCORRUPTED;
break;
}
return 0;
......@@ -290,17 +355,26 @@ xfs_btree_check_ptr(
error = __xfs_btree_check_ptr(cur, ptr, index, level);
if (error) {
if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_MEM:
xfs_err(cur->bc_mp,
"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
cur->bc_ops->name, cur->bc_flags, level, index,
__this_address);
break;
case XFS_BTREE_TYPE_INODE:
xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
cur->bc_ino.ip->i_ino,
cur->bc_ino.whichfork, cur->bc_ops->name,
level, index);
} else {
break;
case XFS_BTREE_TYPE_AG:
xfs_err(cur->bc_mp,
"AG %u: Corrupt %sbt pointer at level %d index %d.",
cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
level, index);
break;
}
xfs_btree_mark_sick(cur);
}
......@@ -457,11 +531,35 @@ xfs_btree_del_cursor(
case XFS_BTREE_TYPE_INODE:
/* nothing to do */
break;
case XFS_BTREE_TYPE_MEM:
if (cur->bc_mem.pag)
xfs_perag_put(cur->bc_mem.pag);
break;
}
kmem_cache_free(cur->bc_cache, cur);
}
/* Return the buffer target for this btree's buffer. */
static inline struct xfs_buftarg *
xfs_btree_buftarg(
struct xfs_btree_cur *cur)
{
if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
return cur->bc_mem.xfbtree->target;
return cur->bc_mp->m_ddev_targp;
}
/* Return the block size (in units of 512b sectors) for this btree. */
static inline unsigned int
xfs_btree_bbsize(
struct xfs_btree_cur *cur)
{
if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
return XFBNO_BBSIZE;
return cur->bc_mp->m_bsize;
}
/*
* Duplicate the btree cursor.
* Allocate a new one, copy the record, re-get the buffers.
......@@ -505,10 +603,11 @@ xfs_btree_dup_cursor(
new->bc_levels[i].ra = cur->bc_levels[i].ra;
bp = cur->bc_levels[i].bp;
if (bp) {
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
xfs_buf_daddr(bp), mp->m_bsize,
0, &bp,
cur->bc_ops->buf_ops);
error = xfs_trans_read_buf(mp, tp,
xfs_btree_buftarg(cur),
xfs_buf_daddr(bp),
xfs_btree_bbsize(cur), 0, &bp,
cur->bc_ops->buf_ops);
if (xfs_metadata_is_sick(error))
xfs_btree_mark_sick(new);
if (error) {
......@@ -885,6 +984,32 @@ xfs_btree_readahead_fsblock(
return rval;
}
STATIC int
xfs_btree_readahead_memblock(
struct xfs_btree_cur *cur,
int lr,
struct xfs_btree_block *block)
{
struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
int rval = 0;
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
cur->bc_ops->buf_ops);
rval++;
}
return rval;
}
STATIC int
xfs_btree_readahead_agblock(
struct xfs_btree_cur *cur,
......@@ -939,9 +1064,17 @@ xfs_btree_readahead(
cur->bc_levels[lev].ra |= lr;
block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_AG:
return xfs_btree_readahead_agblock(cur, lr, block);
case XFS_BTREE_TYPE_INODE:
return xfs_btree_readahead_fsblock(cur, lr, block);
return xfs_btree_readahead_agblock(cur, lr, block);
case XFS_BTREE_TYPE_MEM:
return xfs_btree_readahead_memblock(cur, lr, block);
default:
ASSERT(0);
return 0;
}
}
STATIC int
......@@ -950,23 +1083,24 @@ xfs_btree_ptr_to_daddr(
const union xfs_btree_ptr *ptr,
xfs_daddr_t *daddr)
{
xfs_fsblock_t fsbno;
xfs_agblock_t agbno;
int error;
error = xfs_btree_check_ptr(cur, ptr, 0, 1);
if (error)
return error;
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
fsbno = be64_to_cpu(ptr->l);
*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
} else {
agbno = be32_to_cpu(ptr->s);
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_AG:
*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno);
be32_to_cpu(ptr->s));
break;
case XFS_BTREE_TYPE_INODE:
*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
break;
case XFS_BTREE_TYPE_MEM:
*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
break;
}
return 0;
}
......@@ -986,8 +1120,9 @@ xfs_btree_readahead_ptr(
if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
return;
xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
xfs_btree_bbsize(cur) * count,
cur->bc_ops->buf_ops);
}
/*
......@@ -1043,6 +1178,17 @@ xfs_btree_set_ptr_null(
ptr->s = cpu_to_be32(NULLAGBLOCK);
}
static inline bool
xfs_btree_ptrs_equal(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr1,
union xfs_btree_ptr *ptr2)
{
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return ptr1->l == ptr2->l;
return ptr1->s == ptr2->s;
}
/*
* Get/set/init sibling pointers
*/
......@@ -1161,9 +1307,17 @@ static inline __u64
xfs_btree_owner(
struct xfs_btree_cur *cur)
{
if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_MEM:
return cur->bc_mem.xfbtree->owner;
case XFS_BTREE_TYPE_INODE:
return cur->bc_ino.ip->i_ino;
return cur->bc_ag.pag->pag_agno;
case XFS_BTREE_TYPE_AG:
return cur->bc_ag.pag->pag_agno;
default:
ASSERT(0);
return 0;
}
}
void
......@@ -1207,12 +1361,18 @@ xfs_btree_buf_to_ptr(
struct xfs_buf *bp,
union xfs_btree_ptr *ptr)
{
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
xfs_buf_daddr(bp)));
else {
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_AG:
ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
xfs_buf_daddr(bp)));
break;
case XFS_BTREE_TYPE_INODE:
ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
xfs_buf_daddr(bp)));
break;
case XFS_BTREE_TYPE_MEM:
ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
break;
}
}
......@@ -1231,15 +1391,14 @@ xfs_btree_get_buf_block(
struct xfs_btree_block **block,
struct xfs_buf **bpp)
{
struct xfs_mount *mp = cur->bc_mp;
xfs_daddr_t d;
int error;
xfs_daddr_t d;
int error;
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
0, bpp);
error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
xfs_btree_bbsize(cur), 0, bpp);
if (error)
return error;
......@@ -1270,9 +1429,9 @@ xfs_btree_read_buf_block(
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
mp->m_bsize, flags, bpp,
cur->bc_ops->buf_ops);
error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
xfs_btree_bbsize(cur), flags, bpp,
cur->bc_ops->buf_ops);
if (xfs_metadata_is_sick(error))
xfs_btree_mark_sick(cur);
if (error)
......@@ -4365,7 +4524,7 @@ xfs_btree_visit_block(
{
struct xfs_btree_block *block;
struct xfs_buf *bp;
union xfs_btree_ptr rptr;
union xfs_btree_ptr rptr, bufptr;
int error;
/* do right sibling readahead */
......@@ -4388,19 +4547,12 @@ xfs_btree_visit_block(
* return the same block without checking if the right sibling points
* back to us and creates a cyclic reference in the btree.
*/
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
xfs_buf_daddr(bp))) {
xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
} else {
if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
xfs_buf_daddr(bp))) {
xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
xfs_btree_buf_to_ptr(cur, bp, &bufptr);
if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
......@@ -4578,6 +4730,8 @@ xfs_btree_fsblock_verify(
xfs_fsblock_t fsb;
xfs_failaddr_t fa;
ASSERT(!xfs_buftarg_is_mem(bp->b_target));
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
......@@ -4592,6 +4746,36 @@ xfs_btree_fsblock_verify(
return fa;
}
/* Verify an in-memory btree block. */
xfs_failaddr_t
xfs_btree_memblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buftarg *btp = bp->b_target;
xfs_failaddr_t fa;
xfbno_t bno;
ASSERT(xfs_buftarg_is_mem(bp->b_target));
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
fa = xfs_btree_check_memblock_siblings(btp, bno,
block->bb_u.l.bb_leftsib);
if (fa)
return fa;
fa = xfs_btree_check_memblock_siblings(btp, bno,
block->bb_u.l.bb_rightsib);
if (fa)
return fa;
return NULL;
}
/**
* xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
* btree block
......@@ -4633,6 +4817,8 @@ xfs_btree_agblock_verify(
xfs_agblock_t agbno;
xfs_failaddr_t fa;
ASSERT(!xfs_buftarg_is_mem(bp->b_target));
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
......
......@@ -112,6 +112,7 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
enum xfs_btree_type {
XFS_BTREE_TYPE_AG,
XFS_BTREE_TYPE_INODE,
XFS_BTREE_TYPE_MEM,
};
struct xfs_btree_ops {
......@@ -281,6 +282,10 @@ struct xfs_btree_cur
struct xfs_buf *agbp;
struct xbtree_afakeroot *afake; /* for staging cursor */
} bc_ag;
struct {
struct xfbtree *xfbtree;
struct xfs_perag *pag;
} bc_mem;
};
/* per-format private data */
......@@ -455,6 +460,8 @@ xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp,
uint64_t owner);
xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
unsigned long long records);
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_error.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
#include "xfs_ag.h"
#include "xfs_buf_item.h"
#include "xfs_trace.h"
/* Set the root of an in-memory btree. */
void
xfbtree_set_root(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr,
int inc)
{
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
cur->bc_mem.xfbtree->root = *ptr;
cur->bc_mem.xfbtree->nlevels += inc;
}
/* Initialize a pointer from the in-memory btree header. */
void
xfbtree_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
*ptr = cur->bc_mem.xfbtree->root;
}
/* Duplicate an in-memory btree cursor. */
struct xfs_btree_cur *
xfbtree_dup_cursor(
struct xfs_btree_cur *cur)
{
struct xfs_btree_cur *ncur;
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops,
cur->bc_maxlevels, cur->bc_cache);
ncur->bc_flags = cur->bc_flags;
ncur->bc_nlevels = cur->bc_nlevels;
ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree;
if (cur->bc_mem.pag)
ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
return ncur;
}
/* Close the btree xfile and release all resources. */
void
xfbtree_destroy(
struct xfbtree *xfbt)
{
xfs_buftarg_drain(xfbt->target);
}
/* Compute the number of bytes available for records. */
static inline unsigned int
xfbtree_rec_bytes(
struct xfs_mount *mp,
const struct xfs_btree_ops *ops)
{
return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
}
/* Initialize an empty leaf block as the btree root. */
STATIC int
xfbtree_init_leaf_block(
struct xfs_mount *mp,
struct xfbtree *xfbt,
const struct xfs_btree_ops *ops)
{
struct xfs_buf *bp;
xfbno_t bno = xfbt->highest_bno++;
int error;
error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE,
&bp);
if (error)
return error;
trace_xfbtree_create_root_buf(xfbt, bp);
bp->b_ops = ops->buf_ops;
xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner);
xfs_buf_relse(bp);
xfbt->root.l = cpu_to_be64(bno);
return 0;
}
/*
* Create an in-memory btree root that can be used with the given xmbuf.
* Callers must set xfbt->owner.
*/
int
xfbtree_init(
struct xfs_mount *mp,
struct xfbtree *xfbt,
struct xfs_buftarg *btp,
const struct xfs_btree_ops *ops)
{
unsigned int blocklen = xfbtree_rec_bytes(mp, ops);
unsigned int keyptr_len;
int error;
/* Requires a long-format CRC-format btree */
if (!xfs_has_crc(mp)) {
ASSERT(xfs_has_crc(mp));
return -EINVAL;
}
if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) {
ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN);
return -EINVAL;
}
memset(xfbt, 0, sizeof(*xfbt));
xfbt->target = btp;
/* Set up min/maxrecs for this btree. */
keyptr_len = ops->key_len + sizeof(__be64);
xfbt->maxrecs[0] = blocklen / ops->rec_len;
xfbt->maxrecs[1] = blocklen / keyptr_len;
xfbt->minrecs[0] = xfbt->maxrecs[0] / 2;
xfbt->minrecs[1] = xfbt->maxrecs[1] / 2;
xfbt->highest_bno = 0;
xfbt->nlevels = 1;
/* Initialize the empty btree. */
error = xfbtree_init_leaf_block(mp, xfbt, ops);
if (error)
goto err_freesp;
trace_xfbtree_init(mp, xfbt, ops);
return 0;
err_freesp:
xfs_buftarg_drain(xfbt->target);
return error;
}
/* Allocate a block to our in-memory btree. */
int
xfbtree_alloc_block(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
int *stat)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
xfbno_t bno = xfbt->highest_bno++;
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
trace_xfbtree_alloc_block(xfbt, cur, bno);
/* Fail if the block address exceeds the maximum for the buftarg. */
if (!xfbtree_verify_bno(xfbt, bno)) {
ASSERT(xfbtree_verify_bno(xfbt, bno));
*stat = 0;
return 0;
}
new->l = cpu_to_be64(bno);
*stat = 1;
return 0;
}
/* Free a block from our in-memory btree. */
int
xfbtree_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
xfs_daddr_t daddr = xfs_buf_daddr(bp);
xfbno_t bno = xfs_daddr_to_xfbno(daddr);
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
trace_xfbtree_free_block(xfbt, cur, bno);
if (bno + 1 == xfbt->highest_bno)
xfbt->highest_bno--;
return 0;
}
/* Return the minimum number of records for a btree block. */
int
xfbtree_get_minrecs(
struct xfs_btree_cur *cur,
int level)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
return xfbt->minrecs[level != 0];
}
/* Return the maximum number of records for a btree block. */
int
xfbtree_get_maxrecs(
struct xfs_btree_cur *cur,
int level)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
return xfbt->maxrecs[level != 0];
}
/* If this log item is a buffer item that came from the xfbtree, return it. */
static inline struct xfs_buf *
xfbtree_buf_match(
struct xfbtree *xfbt,
const struct xfs_log_item *lip)
{
const struct xfs_buf_log_item *bli;
struct xfs_buf *bp;
if (lip->li_type != XFS_LI_BUF)
return NULL;
bli = container_of(lip, struct xfs_buf_log_item, bli_item);
bp = bli->bli_buf;
if (bp->b_target != xfbt->target)
return NULL;
return bp;
}
/*
* Commit changes to the incore btree immediately by writing all dirty xfbtree
* buffers to the backing xfile. This detaches all xfbtree buffers from the
* transaction, even on failure. The buffer locks are dropped between the
* delwri queue and submit, so the caller must synchronize btree access.
*
* Normally we'd let the buffers commit with the transaction and get written to
* the xfile via the log, but online repair stages ephemeral btrees in memory
* and uses the btree_staging functions to write new btrees to disk atomically.
* The in-memory btree (and its backing store) are discarded at the end of the
* repair phase, which means that xfbtree buffers cannot commit with the rest
* of a transaction.
*
* In other words, online repair only needs the transaction to collect buffer
* pointers and to avoid buffer deadlocks, not to guarantee consistency of
* updates.
*/
int
xfbtree_trans_commit(
struct xfbtree *xfbt,
struct xfs_trans *tp)
{
struct xfs_log_item *lip, *n;
bool tp_dirty = false;
int error = 0;
/*
* For each xfbtree buffer attached to the transaction, write the dirty
* buffers to the xfile and release them.
*/
list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
if (!bp) {
if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
tp_dirty |= true;
continue;
}
trace_xfbtree_trans_commit_buf(xfbt, bp);
xmbuf_trans_bdetach(tp, bp);
/*
* If the buffer fails verification, note the failure but
* continue walking the transaction items so that we remove all
* ephemeral btree buffers.
*/
if (!error)
error = xmbuf_finalize(bp);
xfs_buf_relse(bp);
}
/*
* Reset the transaction's dirty flag to reflect the dirty state of the
* log items that are still attached.
*/
tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
(tp_dirty ? XFS_TRANS_DIRTY : 0);
return error;
}
/*
* Cancel changes to the incore btree by detaching all the xfbtree buffers.
* Changes are not undone, so callers must not access the btree ever again.
*/
void
xfbtree_trans_cancel(
struct xfbtree *xfbt,
struct xfs_trans *tp)
{
struct xfs_log_item *lip, *n;
bool tp_dirty = false;
list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
if (!bp) {
if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
tp_dirty |= true;
continue;
}
trace_xfbtree_trans_cancel_buf(xfbt, bp);
xmbuf_trans_bdetach(tp, bp);
xfs_buf_relse(bp);
}
/*
* Reset the transaction's dirty flag to reflect the dirty state of the
* log items that are still attached.
*/
tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
(tp_dirty ? XFS_TRANS_DIRTY : 0);
}
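To make the buffer-laundering model described above concrete, here is a hedged usage sketch, not taken from this series: btree updates are staged under an empty transaction, the dirty xfbtree buffers are written straight to the xfile and detached, and the now-clean transaction is simply cancelled.  xfs_trans_alloc_empty() and xfs_trans_cancel() are the stock transaction helpers; the elided cursor work stands for whatever modification the caller makes.

STATIC int
xfbtree_update_sketch(
	struct xfs_mount	*mp,
	struct xfbtree		*xfbt)
{
	struct xfs_trans	*tp;
	int			error;

	/* No log or block reservation; tp exists only to track buffers. */
	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	/*
	 * ... create a cursor bound to tp, modify the in-memory btree, and
	 * delete the cursor.  If the modification fails, unwind with
	 * xfbtree_trans_cancel() before cancelling tp ...
	 */

	/* Write the dirty xfbtree buffers to the xfile and detach them. */
	error = xfbtree_trans_commit(xfbt, tp);

	/* Nothing dirty remains attached, so the transaction just goes away. */
	xfs_trans_cancel(tp);
	return error;
}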
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_BTREE_MEM_H__
#define __XFS_BTREE_MEM_H__
typedef uint64_t xfbno_t;
#define XFBNO_BLOCKSIZE (XMBUF_BLOCKSIZE)
#define XFBNO_BBSHIFT (XMBUF_BLOCKSHIFT - BBSHIFT)
#define XFBNO_BBSIZE (XFBNO_BLOCKSIZE >> BBSHIFT)
static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno)
{
return blkno << XFBNO_BBSHIFT;
}
static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr)
{
return daddr >> XFBNO_BBSHIFT;
}
struct xfbtree {
/* buffer cache target for this in-memory btree */
struct xfs_buftarg *target;
/* Highest block number that has been written to. */
xfbno_t highest_bno;
/* Owner of this btree. */
unsigned long long owner;
/* Btree header */
union xfs_btree_ptr root;
unsigned int nlevels;
/* Minimum and maximum records per block. */
unsigned int maxrecs[2];
unsigned int minrecs[2];
};
#ifdef CONFIG_XFS_BTREE_IN_MEM
static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno)
{
return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno));
}
void xfbtree_set_root(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr, int inc);
void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr);
struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur);
int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level);
int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level);
int xfbtree_alloc_block(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr,
int *stat);
int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp);
/* Callers must set xfbt->target and xfbt->owner before calling this */
int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt,
struct xfs_buftarg *btp, const struct xfs_btree_ops *ops);
void xfbtree_destroy(struct xfbtree *xfbt);
int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp);
void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp);
#else
# define xfbtree_verify_bno(...) (false)
#endif /* CONFIG_XFS_BTREE_IN_MEM */
#endif /* __XFS_BTREE_MEM_H__ */
......@@ -15,6 +15,7 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_scrub.h"
#include "xfs_buf_mem.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
......@@ -190,6 +191,10 @@ xchk_teardown(
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
}
if (sc->xmbtp) {
xmbuf_free(sc->xmbtp);
sc->xmbtp = NULL;
}
if (sc->xfile) {
xfile_destroy(sc->xfile);
sc->xfile = NULL;
......
......@@ -99,6 +99,9 @@ struct xfs_scrub {
/* xfile used by the scrubbers; freed at teardown. */
struct xfile *xfile;
/* buffer target for in-memory btrees; also freed at teardown. */
struct xfs_buftarg *xmbtp;
/* Lock flags for @ip. */
uint ilock_flags;
......
......@@ -21,6 +21,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_buf_mem.h"
struct kmem_cache *xfs_buf_cache;
......@@ -318,7 +319,9 @@ xfs_buf_free(
ASSERT(list_empty(&bp->b_lru));
if (bp->b_flags & _XBF_PAGES)
if (xfs_buftarg_is_mem(bp->b_target))
xmbuf_unmap_page(bp);
else if (bp->b_flags & _XBF_PAGES)
xfs_buf_free_pages(bp);
else if (bp->b_flags & _XBF_KMEM)
kfree(bp->b_addr);
......@@ -510,18 +513,18 @@ static const struct rhashtable_params xfs_buf_hash_params = {
};
int
xfs_buf_hash_init(
struct xfs_perag *pag)
xfs_buf_cache_init(
struct xfs_buf_cache *bch)
{
spin_lock_init(&pag->pag_buf_lock);
return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
spin_lock_init(&bch->bc_lock);
return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}
void
xfs_buf_hash_destroy(
struct xfs_perag *pag)
xfs_buf_cache_destroy(
struct xfs_buf_cache *bch)
{
rhashtable_destroy(&pag->pag_buf_hash);
rhashtable_destroy(&bch->bc_hash);
}
static int
......@@ -584,7 +587,7 @@ xfs_buf_find_lock(
static inline int
xfs_buf_lookup(
struct xfs_perag *pag,
struct xfs_buf_cache *bch,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
......@@ -593,7 +596,7 @@ xfs_buf_lookup(
int error;
rcu_read_lock();
bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
rcu_read_unlock();
return -ENOENT;
......@@ -618,6 +621,7 @@ xfs_buf_lookup(
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
struct xfs_buf_cache *bch,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
......@@ -633,31 +637,33 @@ xfs_buf_find_insert(
if (error)
goto out_drop_pag;
/*
* For buffers that fit entirely within a single page, first attempt to
* allocate the memory from the heap to minimise memory usage. If we
* can't get heap memory for these small buffers, we fall back to using
* the page allocator.
*/
if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
xfs_buf_alloc_kmem(new_bp, flags) < 0) {
if (xfs_buftarg_is_mem(new_bp->b_target)) {
error = xmbuf_map_page(new_bp);
} else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
xfs_buf_alloc_kmem(new_bp, flags) < 0) {
/*
* For buffers that fit entirely within a single page, first
* attempt to allocate the memory from the heap to minimise
* memory usage. If we can't get heap memory for these small
* buffers, we fall back to using the page allocator.
*/
error = xfs_buf_alloc_pages(new_bp, flags);
if (error)
goto out_free_buf;
}
if (error)
goto out_free_buf;
spin_lock(&pag->pag_buf_lock);
bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
spin_lock(&bch->bc_lock);
bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
error = PTR_ERR(bp);
spin_unlock(&pag->pag_buf_lock);
spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
if (bp) {
/* found an existing buffer */
atomic_inc(&bp->b_hold);
spin_unlock(&pag->pag_buf_lock);
spin_unlock(&bch->bc_lock);
error = xfs_buf_find_lock(bp, flags);
if (error)
xfs_buf_rele(bp);
......@@ -668,17 +674,40 @@ xfs_buf_find_insert(
/* The new buffer keeps the perag reference until it is freed. */
new_bp->b_pag = pag;
spin_unlock(&pag->pag_buf_lock);
spin_unlock(&bch->bc_lock);
*bpp = new_bp;
return 0;
out_free_buf:
xfs_buf_free(new_bp);
out_drop_pag:
xfs_perag_put(pag);
if (pag)
xfs_perag_put(pag);
return error;
}
static inline struct xfs_perag *
xfs_buftarg_get_pag(
struct xfs_buftarg *btp,
const struct xfs_buf_map *map)
{
struct xfs_mount *mp = btp->bt_mount;
if (xfs_buftarg_is_mem(btp))
return NULL;
return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
}
static inline struct xfs_buf_cache *
xfs_buftarg_buf_cache(
struct xfs_buftarg *btp,
struct xfs_perag *pag)
{
if (pag)
return &pag->pag_bcache;
return btp->bt_cache;
}
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
......@@ -692,6 +721,7 @@ xfs_buf_get_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
struct xfs_buf_cache *bch;
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
......@@ -707,10 +737,10 @@ xfs_buf_get_map(
if (error)
return error;
pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
pag = xfs_buftarg_get_pag(btp, &cmap);
bch = xfs_buftarg_buf_cache(btp, pag);
error = xfs_buf_lookup(pag, &cmap, flags, &bp);
error = xfs_buf_lookup(bch, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
......@@ -722,13 +752,14 @@ xfs_buf_get_map(
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
} else {
XFS_STATS_INC(btp->bt_mount, xb_get_locked);
xfs_perag_put(pag);
if (pag)
xfs_perag_put(pag);
}
/* We do not hold a perag reference anymore. */
......@@ -756,7 +787,8 @@ xfs_buf_get_map(
return 0;
out_put_perag:
xfs_perag_put(pag);
if (pag)
xfs_perag_put(pag);
return error;
}
......@@ -903,6 +935,13 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
/*
* Currently we don't have a good means or justification for performing
* xmbuf_map_page asynchronously, so we don't do readahead.
*/
if (xfs_buftarg_is_mem(target))
return;
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
......@@ -968,7 +1007,10 @@ xfs_buf_get_uncached(
if (error)
return error;
error = xfs_buf_alloc_pages(bp, flags);
if (xfs_buftarg_is_mem(bp->b_target))
error = xmbuf_map_page(bp);
else
error = xfs_buf_alloc_pages(bp, flags);
if (error)
goto fail_free_buf;
......@@ -1016,7 +1058,9 @@ static void
xfs_buf_rele_cached(
struct xfs_buf *bp)
{
struct xfs_buftarg *btp = bp->b_target;
struct xfs_perag *pag = bp->b_pag;
struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
bool release;
bool freebuf = false;
......@@ -1035,7 +1079,7 @@ xfs_buf_rele_cached(
* leading to a use-after-free scenario.
*/
spin_lock(&bp->b_lock);
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
......@@ -1056,11 +1100,11 @@ xfs_buf_rele_cached(
* buffer for the LRU and clear the (now stale) dispose list
* state flag
*/
if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) {
if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
spin_unlock(&pag->pag_buf_lock);
spin_unlock(&bch->bc_lock);
} else {
/*
* most of the time buffers will already be removed from the
......@@ -1069,16 +1113,17 @@ xfs_buf_rele_cached(
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
spin_unlock(&bch->bc_lock);
if (pag)
xfs_perag_put(pag);
freebuf = true;
}
......@@ -1607,6 +1652,12 @@ _xfs_buf_ioapply(
/* we only use the buffer cache for meta-data */
op |= REQ_META;
/* in-memory targets are directly mapped, no IO required. */
if (xfs_buftarg_is_mem(bp->b_target)) {
xfs_buf_ioend(bp);
return;
}
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
* into the buffer and the desired IO size before we start -
......@@ -1962,19 +2013,24 @@ xfs_buftarg_shrink_count(
}
void
xfs_free_buftarg(
xfs_destroy_buftarg(
struct xfs_buftarg *btp)
{
shrinker_free(btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
}
void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
xfs_destroy_buftarg(btp);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
bdev_release(btp->bt_bdev_handle);
kfree(btp);
}
......@@ -1997,6 +2053,45 @@ xfs_setsize_buftarg(
return 0;
}
int
xfs_init_buftarg(
struct xfs_buftarg *btp,
size_t logical_sectorsize,
const char *descr)
{
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
btp->bt_logical_sectormask = logical_sectorsize - 1;
/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
* per 30 seconds so as to not spam logs too much on repeated errors.
*/
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
if (list_lru_init(&btp->bt_lru))
return -ENOMEM;
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto out_destroy_lru;
btp->bt_shrinker =
shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr);
if (!btp->bt_shrinker)
goto out_destroy_io_count;
btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker->private_data = btp;
shrinker_register(btp->bt_shrinker);
return 0;
out_destroy_io_count:
percpu_counter_destroy(&btp->bt_io_count);
out_destroy_lru:
list_lru_destroy(&btp->bt_lru);
return -ENOMEM;
}
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
......@@ -2023,41 +2118,12 @@ xfs_alloc_buftarg(
*/
if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
goto error_free;
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
* per 30 seconds so as to not spam logs too much on repeated errors.
*/
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
if (list_lru_init(&btp->bt_lru))
if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
mp->m_super->s_id))
goto error_free;
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto error_lru;
btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
mp->m_super->s_id);
if (!btp->bt_shrinker)
goto error_pcpu;
btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker->private_data = btp;
shrinker_register(btp->bt_shrinker);
return btp;
error_pcpu:
percpu_counter_destroy(&btp->bt_io_count);
error_lru:
list_lru_destroy(&btp->bt_lru);
error_free:
kfree(btp);
return NULL;
......
......@@ -83,6 +83,14 @@ typedef unsigned int xfs_buf_flags_t;
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
struct xfs_buf_cache {
spinlock_t bc_lock;
struct rhashtable bc_hash;
};
int xfs_buf_cache_init(struct xfs_buf_cache *bch);
void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
/*
* The xfs_buftarg contains 2 notions of "sector size" -
*
......@@ -101,6 +109,7 @@ struct xfs_buftarg {
struct bdev_handle *bt_bdev_handle;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
struct file *bt_file;
u64 bt_dax_part_off;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
......@@ -114,6 +123,9 @@ struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
};
#define XB_PAGES 2
......@@ -379,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
/* for xfs_buf_mem.c only: */
int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
const char *descr);
void xfs_destroy_buftarg(struct xfs_buftarg *btp);
#endif /* __XFS_BUF_H__ */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2023-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_buf.h"
#include "xfs_buf_mem.h"
#include "xfs_trace.h"
#include <linux/shmem_fs.h>
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_error.h"
/*
* Buffer Cache for In-Memory Files
* ================================
*
* Online fsck wants to create ephemeral ordered recordsets. The existing
* btree infrastructure can do this, but we need the buffer cache to target
* memory instead of block devices.
*
* When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
* requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to
* store our staging data. This file is not installed in the file descriptor
* table so that user programs cannot access the data, which means that the
* xmbuf must be freed with xmbuf_destroy.
*
* xmbufs assume that the caller will handle all required concurrency
* management; standard vfs locks (freezer and inode) are not taken. Reads
* and writes are satisfied directly from the page cache.
*
* The only supported block size is PAGE_SIZE, and we cannot use highmem.
*/
/*
* shmem files used to back an in-memory buffer cache must not be exposed to
* userspace. Upper layers must coordinate access to the one handle returned
* by the constructor, so establish a separate lock class for xmbufs to avoid
* confusing lockdep.
*/
static struct lock_class_key xmbuf_i_mutex_key;
/*
* Allocate a buffer cache target for a memory-backed file and set up the
* buffer target.
*/
int
xmbuf_alloc(
struct xfs_mount *mp,
const char *descr,
struct xfs_buftarg **btpp)
{
struct file *file;
struct inode *inode;
struct xfs_buftarg *btp;
int error;
btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
if (!btp)
return -ENOMEM;
file = shmem_kernel_file_setup(descr, 0, 0);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_btp;
}
inode = file_inode(file);
/* private file, private locking */
lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key);
/*
* We don't want to bother with kmapping data during repair, so don't
* allow highmem pages to back this mapping.
*/
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
/* ensure all writes are below EOF to avoid pagecache zeroing */
i_size_write(inode, inode->i_sb->s_maxbytes);
trace_xmbuf_create(btp);
error = xfs_buf_cache_init(btp->bt_cache);
if (error)
goto out_file;
/* Initialize buffer target */
btp->bt_mount = mp;
btp->bt_dev = (dev_t)-1U;
btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */
btp->bt_file = file;
btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;
error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
if (error)
goto out_bcache;
*btpp = btp;
return 0;
out_bcache:
xfs_buf_cache_destroy(btp->bt_cache);
out_file:
fput(file);
out_free_btp:
kfree(btp);
return error;
}
/* Free a buffer cache target for a memory-backed buffer cache. */
void
xmbuf_free(
struct xfs_buftarg *btp)
{
ASSERT(xfs_buftarg_is_mem(btp));
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
trace_xmbuf_free(btp);
xfs_destroy_buftarg(btp);
xfs_buf_cache_destroy(btp->bt_cache);
fput(btp->bt_file);
kfree(btp);
}
/* Directly map a shmem page into the buffer cache. */
int
xmbuf_map_page(
struct xfs_buf *bp)
{
struct inode *inode = file_inode(bp->b_target->bt_file);
struct folio *folio = NULL;
struct page *page;
loff_t pos = BBTOB(xfs_buf_daddr(bp));
int error;
ASSERT(xfs_buftarg_is_mem(bp->b_target));
if (bp->b_map_count != 1)
return -ENOMEM;
if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE)
return -ENOMEM;
if (offset_in_page(pos) != 0) {
ASSERT(offset_in_page(pos));
return -ENOMEM;
}
error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
if (error)
return error;
if (filemap_check_wb_err(inode->i_mapping, 0)) {
folio_unlock(folio);
folio_put(folio);
return -EIO;
}
page = folio_file_page(folio, pos >> PAGE_SHIFT);
/*
* Mark the page dirty so that it won't be reclaimed once we drop the
* (potentially last) reference in xmbuf_unmap_page.
*/
set_page_dirty(page);
unlock_page(page);
bp->b_addr = page_address(page);
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = page;
bp->b_page_count = 1;
return 0;
}
/* Unmap a shmem page that was mapped into the buffer cache. */
void
xmbuf_unmap_page(
struct xfs_buf *bp)
{
struct page *page = bp->b_pages[0];
ASSERT(xfs_buftarg_is_mem(bp->b_target));
put_page(page);
bp->b_addr = NULL;
bp->b_pages[0] = NULL;
bp->b_pages = NULL;
bp->b_page_count = 0;
}
/* Is this a valid daddr within the buftarg? */
bool
xmbuf_verify_daddr(
struct xfs_buftarg *btp,
xfs_daddr_t daddr)
{
struct inode *inode = file_inode(btp->bt_file);
ASSERT(xfs_buftarg_is_mem(btp));
return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
}
/* Discard the page backing this buffer. */
static void
xmbuf_stale(
struct xfs_buf *bp)
{
struct inode *inode = file_inode(bp->b_target->bt_file);
loff_t pos;
ASSERT(xfs_buftarg_is_mem(bp->b_target));
pos = BBTOB(xfs_buf_daddr(bp));
shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1);
}
/*
* Finalize a buffer -- discard the backing page if it's stale, or run the
* write verifier to detect problems.
*/
int
xmbuf_finalize(
struct xfs_buf *bp)
{
xfs_failaddr_t fa;
int error = 0;
if (bp->b_flags & XBF_STALE) {
xmbuf_stale(bp);
return 0;
}
/*
* Although this btree is ephemeral, validate the buffer structure so
* that we can detect memory corruption errors and software bugs.
*/
fa = bp->b_ops->verify_struct(bp);
if (fa) {
error = -EFSCORRUPTED;
xfs_verifier_error(bp, error, fa);
}
return error;
}
/*
* Detach this xmbuf buffer from the transaction by any means necessary.
* All buffers are direct-mapped, so they do not need bwrite.
*/
void
xmbuf_trans_bdetach(
struct xfs_trans *tp,
struct xfs_buf *bp)
{
struct xfs_buf_log_item *bli = bp->b_log_item;
ASSERT(bli != NULL);
bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED |
XFS_BLI_LOGGED | XFS_BLI_STALE);
clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags);
while (bp->b_log_item != NULL)
xfs_trans_bdetach(tp, bp);
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2023-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_BUF_MEM_H__
#define __XFS_BUF_MEM_H__
#define XMBUF_BLOCKSIZE (PAGE_SIZE)
#define XMBUF_BLOCKSHIFT (PAGE_SHIFT)
#ifdef CONFIG_XFS_MEMORY_BUFS
static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp)
{
return btp->bt_bdev == NULL;
}
int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
struct xfs_buftarg **btpp);
void xmbuf_free(struct xfs_buftarg *btp);
int xmbuf_map_page(struct xfs_buf *bp);
void xmbuf_unmap_page(struct xfs_buf *bp);
bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
int xmbuf_finalize(struct xfs_buf *bp);
#else
# define xfs_buftarg_is_mem(...) (false)
# define xmbuf_map_page(...) (-ENOMEM)
# define xmbuf_unmap_page(...) ((void)0)
# define xmbuf_verify_daddr(...) (false)
#endif /* CONFIG_XFS_MEMORY_BUFS */
#endif /* __XFS_BUF_MEM_H__ */
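Because xmbuf buffers are mapped directly from the shmem page cache, a caller can treat the target like a tiny block device whose I/O completes in place.  The fragment below is a hedged illustration rather than code from this series: the description string is arbitrary and the zeroing stands for whatever metadata a caller might stage; xfs_buf_get(), xfs_buf_relse(), and xfs_buftarg_drain() are the regular buffer cache interfaces.

STATIC int
xmbuf_scratch_sketch(
	struct xfs_mount	*mp)
{
	struct xfs_buftarg	*btp;
	struct xfs_buf		*bp;
	int			error;

	error = xmbuf_alloc(mp, "scratch space", &btp);
	if (error)
		return error;

	/* xmbuf buffers must be exactly one XMBUF_BLOCKSIZE (PAGE_SIZE) block. */
	error = xfs_buf_get(btp, 0, XMBUF_BLOCKSIZE >> BBSHIFT, &bp);
	if (error)
		goto out_free;

	/* b_addr points straight at the shmem page; no bio is ever issued. */
	memset(bp->b_addr, 0, XMBUF_BLOCKSIZE);
	xfs_buf_relse(bp);

out_free:
	xfs_buftarg_drain(btp);		/* reclaim any buffers still cached */
	xmbuf_free(btp);
	return error;
}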
......@@ -527,6 +527,9 @@ xfs_btree_mark_sick(
struct xfs_btree_cur *cur)
{
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_MEM:
/* no health state tracking for ephemeral btrees */
return;
case XFS_BTREE_TYPE_AG:
ASSERT(cur->bc_ops->sick_mask);
xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask);
......
......@@ -505,9 +505,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
int xfs_buf_hash_init(struct xfs_perag *pag);
void xfs_buf_hash_destroy(struct xfs_perag *pag);
extern void xfs_uuid_table_free(void);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
......
......@@ -36,6 +36,8 @@
#include "xfs_error.h"
#include <linux/iomap.h>
#include "xfs_iomap.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
/*
* We include this last to have the helpers above available for the trace
......
......@@ -79,6 +79,8 @@ union xfs_btree_ptr;
struct xfs_dqtrx;
struct xfs_icwalk;
struct xfs_perag;
struct xfbtree;
struct xfs_btree_ops;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
......@@ -640,6 +642,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
......@@ -2499,12 +2502,19 @@ TRACE_EVENT(xfs_btree_alloc_block,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_INODE:
__entry->agno = 0;
__entry->ino = cur->bc_ino.ip->i_ino;
} else {
break;
case XFS_BTREE_TYPE_AG:
__entry->agno = cur->bc_ag.pag->pag_agno;
__entry->ino = 0;
break;
case XFS_BTREE_TYPE_MEM:
__entry->agno = 0;
__entry->ino = 0;
break;
}
__assign_str(name, cur->bc_ops->name);
__entry->error = error;
......@@ -4514,6 +4524,159 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
#endif /* CONFIG_XFS_DRAIN_INTENTS */
#ifdef CONFIG_XFS_MEMORY_BUFS
TRACE_EVENT(xmbuf_create,
TP_PROTO(struct xfs_buftarg *btp),
TP_ARGS(btp),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned long, ino)
__array(char, pathname, 256)
),
TP_fast_assign(
char pathname[257];
char *path;
struct file *file = btp->bt_file;
__entry->ino = file_inode(file)->i_ino;
memset(pathname, 0, sizeof(pathname));
path = file_path(file, pathname, sizeof(pathname) - 1);
if (IS_ERR(path))
path = "(unknown)";
strncpy(__entry->pathname, path, sizeof(__entry->pathname));
),
TP_printk("xmino 0x%lx path '%s'",
__entry->ino,
__entry->pathname)
);
TRACE_EVENT(xmbuf_free,
TP_PROTO(struct xfs_buftarg *btp),
TP_ARGS(btp),
TP_STRUCT__entry(
__field(unsigned long, ino)
__field(unsigned long long, bytes)
__field(loff_t, size)
),
TP_fast_assign(
struct file *file = btp->bt_file;
struct inode *inode = file_inode(file);
__entry->size = i_size_read(inode);
__entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes;
__entry->ino = inode->i_ino;
),
TP_printk("xmino 0x%lx mem_bytes 0x%llx isize 0x%llx",
__entry->ino,
__entry->bytes,
__entry->size)
);
#endif /* CONFIG_XFS_MEMORY_BUFS */
#ifdef CONFIG_XFS_BTREE_IN_MEM
TRACE_EVENT(xfbtree_init,
TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt,
const struct xfs_btree_ops *ops),
TP_ARGS(mp, xfbt, ops),
TP_STRUCT__entry(
__field(const void *, btree_ops)
__field(unsigned long, xfino)
__field(unsigned int, leaf_mxr)
__field(unsigned int, leaf_mnr)
__field(unsigned int, node_mxr)
__field(unsigned int, node_mnr)
__field(unsigned long long, owner)
),
TP_fast_assign(
__entry->btree_ops = ops;
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
__entry->leaf_mxr = xfbt->maxrecs[0];
__entry->node_mxr = xfbt->maxrecs[1];
__entry->leaf_mnr = xfbt->minrecs[0];
__entry->node_mnr = xfbt->minrecs[1];
__entry->owner = xfbt->owner;
),
TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u",
__entry->xfino,
__entry->btree_ops,
__entry->owner,
__entry->leaf_mxr,
__entry->leaf_mnr,
__entry->node_mxr,
__entry->node_mnr)
);
DECLARE_EVENT_CLASS(xfbtree_buf_class,
TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp),
TP_ARGS(xfbt, bp),
TP_STRUCT__entry(
__field(unsigned long, xfino)
__field(xfs_daddr_t, bno)
__field(int, nblks)
__field(int, hold)
__field(int, pincount)
__field(unsigned int, lockval)
__field(unsigned int, flags)
),
TP_fast_assign(
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
),
TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s",
__entry->xfino,
(unsigned long long)__entry->bno,
__entry->nblks,
__entry->hold,
__entry->pincount,
__entry->lockval,
__print_flags(__entry->flags, "|", XFS_BUF_FLAGS))
)
#define DEFINE_XFBTREE_BUF_EVENT(name) \
DEFINE_EVENT(xfbtree_buf_class, name, \
TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \
TP_ARGS(xfbt, bp))
DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf);
DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf);
DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf);
DECLARE_EVENT_CLASS(xfbtree_freesp_class,
TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur,
xfs_fileoff_t fileoff),
TP_ARGS(xfbt, cur, fileoff),
TP_STRUCT__entry(
__field(unsigned long, xfino)
__string(btname, cur->bc_ops->name)
__field(int, nlevels)
__field(xfs_fileoff_t, fileoff)
),
TP_fast_assign(
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
__assign_str(btname, cur->bc_ops->name);
__entry->nlevels = cur->bc_nlevels;
__entry->fileoff = fileoff;
),
TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx",
__entry->xfino,
__get_str(btname),
__entry->nlevels,
(unsigned long long)__entry->fileoff)
)
#define DEFINE_XFBTREE_FREESP_EVENT(name) \
DEFINE_EVENT(xfbtree_freesp_class, name, \
TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \
xfs_fileoff_t fileoff), \
TP_ARGS(xfbt, cur, fileoff))
DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block);
DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block);
#endif /* CONFIG_XFS_BTREE_IN_MEM */
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
......
......@@ -215,6 +215,7 @@ struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
......
......@@ -392,6 +392,48 @@ xfs_trans_brelse(
xfs_buf_relse(bp);
}
/*
* Forcibly detach a buffer previously joined to the transaction. The caller
* will retain its locked reference to the buffer after this function returns.
* The buffer must be completely clean and must not be held to the transaction.
*/
void
xfs_trans_bdetach(
struct xfs_trans *tp,
struct xfs_buf *bp)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
ASSERT(tp != NULL);
ASSERT(bp->b_transp == tp);
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
trace_xfs_trans_bdetach(bip);
/*
* Erase all recursion count, since we're removing this buffer from the
* transaction.
*/
bip->bli_recur = 0;
/*
* The buffer must be completely clean. Specifically, it had better
* not be dirty, stale, logged, ordered, or held to the transaction.
*/
ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY));
ASSERT(!(bip->bli_flags & XFS_BLI_HOLD));
ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
/* Unlink the log item from the transaction and drop the log item. */
xfs_trans_del_item(&bip->bli_item);
xfs_buf_item_put(bip);
bp->b_transp = NULL;
}
/*
* Mark the buffer as not needing to be unlocked when the buf item's
* iop_committing() routine is called. The buffer must already be locked
......