Commit a44a027a authored by Dave Chinner's avatar Dave Chinner

Merge tag 'large-extent-counters-v9' of https://github.com/chandanr/linux into xfs-5.19-for-next

xfs: Large extent counters

The commit xfs: fix inode fork extent count overflow
(3f8a4f1d) mentions that 10 billion
data fork extents should be possible to create. However the
corresponding on-disk field has a signed 32-bit type. Hence this
patchset extends the per-inode data fork extent counter to 64 bits
(out of which 48 bits are used to store the extent count).

Also, XFS has an attribute fork extent counter which is 16 bits
wide. A workload that,
1. Creates 1 million 255-byte sized xattrs,
2. Deletes 50% of these xattrs in an alternating manner,
3. Tries to insert 400,000 new 255-byte sized xattrs
   causes the xattr extent counter to overflow.

Dave tells me that there are instances where a single file has more
than 100 million hardlinks. With parent pointers being stored in
xattrs, we will overflow the signed 16-bits wide attribute extent
counter when large number of hardlinks are created. Hence this
patchset extends the on-disk field to 32-bits.

The following changes are made to accomplish this,
1. A 64-bit inode field is carved out of existing di_pad and
   di_flushiter fields to hold the 64-bit data fork extent counter.
2. The existing 32-bit inode data fork extent counter will be used to
   hold the attribute fork extent counter.
3. A new incompat superblock flag to prevent older kernels from mounting
   the filesystem.
Signed-off-by: default avatarChandan Babu R <chandan.babu@oracle.com>
Signed-off-by: default avatarDave Chinner <david@fromorbit.com>
parents 463260d7 973ac0eb
......@@ -2511,7 +2511,7 @@ __xfs_free_extent_later(
ASSERT(bno != NULLFSBLOCK);
ASSERT(len > 0);
ASSERT(len <= MAXEXTLEN);
ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
ASSERT(!isnullstartblock(bno));
agno = XFS_FSB_TO_AGNO(mp, bno);
agbno = XFS_FSB_TO_AGBNO(mp, bno);
......
......@@ -776,6 +776,9 @@ xfs_attr_set(
if (args->value || xfs_inode_hasattr(dp)) {
error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error == -EFBIG)
error = xfs_iext_count_upgrade(args->trans, dp,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
}
......
This diff is collapsed.
......@@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
return xfs_bmbt_block_maxrecs(blocklen, leaf);
}
/* Compute the max possible height for block mapping btrees. */
/*
* Calculate the maximum possible height of the btree that the on-disk format
* supports. This is used for sizing structures large enough to support every
* possible configuration of a filesystem that might get mounted.
*/
unsigned int
xfs_bmbt_maxlevels_ondisk(void)
{
......@@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
/* One extra level for the inode root. */
return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
return xfs_btree_compute_maxlevels(minrecs,
XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
}
/*
......
......@@ -30,6 +30,7 @@ struct xfs_da_geometry {
unsigned int free_hdr_size; /* dir2 free header size */
unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
xfs_dir2_data_aoff_t data_first_offset;
size_t data_entry_offset;
......
......@@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
* Directory address space divided into sections,
* spaces separated by 32GB.
*/
#define XFS_DIR2_MAX_SPACES 3
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
......
......@@ -150,6 +150,8 @@ xfs_da_mount(
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
mp->m_sb.sb_blocklog;
dageo->magicpct = (dageo->blksize * 37) / 100;
/* set up attribute geometry - single fsb only */
......@@ -161,6 +163,12 @@ xfs_da_mount(
dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
if (xfs_has_large_extent_counts(mp))
dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
else
dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
dageo->magicpct = (dageo->blksize * 37) / 100;
return 0;
}
......
......@@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
XFS_SB_FEAT_INCOMPAT_META_UUID| \
XFS_SB_FEAT_INCOMPAT_BIGTIME| \
XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
XFS_SB_FEAT_INCOMPAT_NREXT64)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
......@@ -791,16 +793,41 @@ struct xfs_dinode {
__be32 di_nlink; /* number of links to file */
__be16 di_projid_lo; /* lower part of owner's project id */
__be16 di_projid_hi; /* higher part owner's project id */
__u8 di_pad[6]; /* unused, zeroed space */
__be16 di_flushiter; /* incremented on flush */
union {
/* Number of data fork extents if NREXT64 is set */
__be64 di_big_nextents;
/* Padding for V3 inodes without NREXT64 set. */
__be64 di_v3_pad;
/* Padding and inode flush counter for V2 inodes. */
struct {
__u8 di_v2_pad[6];
__be16 di_flushiter;
};
};
xfs_timestamp_t di_atime; /* time last accessed */
xfs_timestamp_t di_mtime; /* time last modified */
xfs_timestamp_t di_ctime; /* time created/inode modified */
__be64 di_size; /* number of bytes in file */
__be64 di_nblocks; /* # of direct & btree blocks used */
__be32 di_extsize; /* basic/minimum extent size for file */
__be32 di_nextents; /* number of extents in data fork */
__be16 di_anextents; /* number of extents in attribute fork*/
union {
/*
* For V2 inodes and V3 inodes without NREXT64 set, this
* is the number of data and attr fork extents.
*/
struct {
__be32 di_nextents;
__be16 di_anextents;
} __packed;
/* Number of attr fork extents if NREXT64 is set. */
struct {
__be32 di_big_anextents;
__be16 di_nrext64_pad;
} __packed;
} __packed;
__u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
__s8 di_aformat; /* format of attr fork's data */
__be32 di_dmevmask; /* DMIG event mask */
......@@ -869,6 +896,56 @@ enum xfs_dinode_fmt {
{ XFS_DINODE_FMT_BTREE, "btree" }, \
{ XFS_DINODE_FMT_UUID, "uuid" }
/*
* Max values for extnum and aextnum.
*
* The original on-disk extent counts were held in signed fields, resulting in
* maximum extent counts of 2^31 and 2^15 for the data and attr forks
* respectively. Similarly the maximum extent length is limited to 2^21 blocks
* by the 21-bit wide blockcount field of a BMBT extent record.
*
* The newly introduced data fork extent counter can hold a 64-bit value,
* however the maximum number of extents in a file is also limited to 2^54
* extents by the 54-bit wide startoff field of a BMBT extent record.
*
* It is further limited by the maximum supported file size of 2^63
* *bytes*. This leads to a maximum extent count for maximally sized filesystem
* blocks (64kB) of:
*
* 2^63 bytes / 2^16 bytes per block = 2^47 blocks
*
* Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
* 2^48 was chosen as the maximum data fork extent count.
*
* The maximum file size that can be represented by the data fork extent counter
* in the worst case occurs when all extents are 1 block in length and each
* block is 1KB in size.
*
* With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and
* with 1KB sized blocks, a file can reach upto,
* 1KB * (2^31) = 2TB
*
* This is much larger than the theoretical maximum size of a directory
* i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
*
* Hence, a directory inode can never overflow its data fork extent counter.
*/
#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
/*
* When we upgrade an inode to the large extent counts, the maximum value by
* which the extent count can increase is bound by the change in size of the
* on-disk field. No upgrade operation should ever be adding more than a few
* tens of extents, so if we get a really large value it is a sign of a code bug
* or corruption.
*/
#define XFS_MAX_EXTCNT_UPGRADE_NR \
min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \
XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
/*
* Inode minimum and maximum sizes.
*/
......@@ -918,10 +995,6 @@ enum xfs_dinode_fmt {
((w) == XFS_DATA_FORK ? \
(dip)->di_format : \
(dip)->di_aformat)
#define XFS_DFORK_NEXTENTS(dip,w) \
((w) == XFS_DATA_FORK ? \
be32_to_cpu((dip)->di_nextents) : \
be16_to_cpu((dip)->di_anextents))
/*
* For block and character special files the 32bit dev_t is stored at the
......@@ -988,15 +1061,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
XFS_DIFLAG2_BIGTIME)
XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
......@@ -1004,6 +1079,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
}
static inline bool xfs_dinode_has_large_extent_counts(
const struct xfs_dinode *dip)
{
return dip->di_version >= 3 &&
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
}
/*
* Inode number format:
* low inopblog bits - offset in block
......@@ -1596,6 +1678,8 @@ typedef struct xfs_bmdr_block {
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
/*
* bmbt records have a file offset (block) field that is 54 bits wide, so this
* is the largest xfs_fileoff_t that we ever expect to see.
......
......@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
/*
* Minimum and maximum sizes need for growth checks.
......@@ -377,7 +378,7 @@ struct xfs_bulkstat {
uint32_t bs_extsize_blks; /* extent size hint, blocks */
uint32_t bs_nlink; /* number of links */
uint32_t bs_extents; /* number of extents */
uint32_t bs_extents; /* 32-bit data fork extent counter */
uint32_t bs_aextents; /* attribute number of extents */
uint16_t bs_version; /* structure version */
uint16_t bs_forkoff; /* inode fork offset in bytes */
......@@ -386,8 +387,9 @@ struct xfs_bulkstat {
uint16_t bs_checked; /* checked inode metadata */
uint16_t bs_mode; /* type and mode */
uint16_t bs_pad2; /* zeroed */
uint64_t bs_extents64; /* 64-bit data fork extent counter */
uint64_t bs_pad[7]; /* zeroed */
uint64_t bs_pad[6]; /* zeroed */
};
#define XFS_BULKSTAT_VERSION_V1 (1)
......@@ -459,17 +461,28 @@ struct xfs_bulk_ireq {
* Only return results from the specified @agno. If @ino is zero, start
* with the first inode of @agno.
*/
#define XFS_BULK_IREQ_AGNO (1 << 0)
#define XFS_BULK_IREQ_AGNO (1U << 0)
/*
* Return bulkstat information for a single inode, where @ino value is a
* special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
* values below. Not compatible with XFS_BULK_IREQ_AGNO.
*/
#define XFS_BULK_IREQ_SPECIAL (1 << 1)
#define XFS_BULK_IREQ_SPECIAL (1U << 1)
#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
XFS_BULK_IREQ_SPECIAL)
/*
* Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
* 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use
* xfs_bulkstat->bs_extents for returning data fork extent count and set
* xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
* assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
* XFS_MAX_EXTCNT_DATA_FORK_OLD.
*/
#define XFS_BULK_IREQ_NREXT64 (1U << 2)
#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
XFS_BULK_IREQ_SPECIAL | \
XFS_BULK_IREQ_NREXT64)
/* Operate on the root directory inode. */
#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
......
......@@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry(
igeo->new_diflags2 = 0;
if (xfs_has_bigtime(mp))
igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
if (xfs_has_large_extent_counts(mp))
igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
......
......@@ -279,6 +279,25 @@ xfs_inode_to_disk_ts(
return ts;
}
static inline void
xfs_inode_to_disk_iext_counters(
struct xfs_inode *ip,
struct xfs_dinode *to)
{
if (xfs_inode_has_large_extent_counts(ip)) {
to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp));
/*
* We might be upgrading the inode to use larger extent counters
* than was previously used. Hence zero the unused field.
*/
to->di_nrext64_pad = cpu_to_be16(0);
} else {
to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
}
}
void
xfs_inode_to_disk(
struct xfs_inode *ip,
......@@ -296,7 +315,6 @@ xfs_inode_to_disk(
to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
memset(to->di_pad, 0, sizeof(to->di_pad));
to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
......@@ -307,8 +325,6 @@ xfs_inode_to_disk(
to->di_size = cpu_to_be64(ip->i_disk_size);
to->di_nblocks = cpu_to_be64(ip->i_nblocks);
to->di_extsize = cpu_to_be32(ip->i_extsize);
to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
to->di_forkoff = ip->i_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = cpu_to_be16(ip->i_diflags);
......@@ -323,11 +339,14 @@ xfs_inode_to_disk(
to->di_lsn = cpu_to_be64(lsn);
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = cpu_to_be16(ip->i_flushiter);
memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
xfs_inode_to_disk_iext_counters(ip, to);
}
static xfs_failaddr_t
......@@ -336,7 +355,10 @@ xfs_dinode_verify_fork(
struct xfs_mount *mp,
int whichfork)
{
uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork);
xfs_extnum_t di_nextents;
xfs_extnum_t max_extents;
di_nextents = xfs_dfork_nextents(dip, whichfork);
switch (XFS_DFORK_FORMAT(dip, whichfork)) {
case XFS_DINODE_FMT_LOCAL:
......@@ -358,12 +380,11 @@ xfs_dinode_verify_fork(
return __this_address;
break;
case XFS_DINODE_FMT_BTREE:
if (whichfork == XFS_ATTR_FORK) {
if (di_nextents > MAXAEXTNUM)
return __this_address;
} else if (di_nextents > MAXEXTNUM) {
max_extents = xfs_iext_max_nextents(
xfs_dinode_has_large_extent_counts(dip),
whichfork);
if (di_nextents > max_extents)
return __this_address;
}
break;
default:
return __this_address;
......@@ -396,6 +417,24 @@ xfs_dinode_verify_forkoff(
return NULL;
}
static xfs_failaddr_t
xfs_dinode_verify_nrext64(
struct xfs_mount *mp,
struct xfs_dinode *dip)
{
if (xfs_dinode_has_large_extent_counts(dip)) {
if (!xfs_has_large_extent_counts(mp))
return __this_address;
if (dip->di_nrext64_pad != 0)
return __this_address;
} else if (dip->di_version >= 3) {
if (dip->di_v3_pad != 0)
return __this_address;
}
return NULL;
}
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
......@@ -407,6 +446,9 @@ xfs_dinode_verify(
uint16_t flags;
uint64_t flags2;
uint64_t di_size;
xfs_extnum_t nextents;
xfs_extnum_t naextents;
xfs_filblks_t nblocks;
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return __this_address;
......@@ -437,10 +479,19 @@ xfs_dinode_verify(
if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
return __this_address;
fa = xfs_dinode_verify_nrext64(mp, dip);
if (fa)
return fa;
nextents = xfs_dfork_data_extents(dip);
naextents = xfs_dfork_attr_extents(dip);
nblocks = be64_to_cpu(dip->di_nblocks);
/* Fork checks carried over from xfs_iformat_fork */
if (mode &&
be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) >
be64_to_cpu(dip->di_nblocks))
if (mode && nextents + naextents > nblocks)
return __this_address;
if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
......@@ -497,7 +548,7 @@ xfs_dinode_verify(
default:
return __this_address;
}
if (dip->di_anextents)
if (naextents)
return __this_address;
}
......@@ -639,7 +690,7 @@ xfs_inode_validate_extsize(
if (extsize_bytes % blocksize_bytes)
return __this_address;
if (extsize > MAXEXTLEN)
if (extsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
......@@ -696,7 +747,7 @@ xfs_inode_validate_cowextsize(
if (cowextsize_bytes % mp->m_sb.sb_blocksize)
return __this_address;
if (cowextsize > MAXEXTLEN)
if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (cowextsize > mp->m_sb.sb_agblocks / 2)
......
......@@ -105,7 +105,7 @@ xfs_iformat_extents(
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
int state = xfs_bmap_fork_to_state(whichfork);
int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
struct xfs_iext_cursor icur;
struct xfs_bmbt_rec *dp;
......@@ -117,8 +117,8 @@ xfs_iformat_extents(
* we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
(unsigned long long) ip->i_ino, nex);
xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
ip->i_ino, nex);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
......@@ -230,7 +230,7 @@ xfs_iformat_data_fork(
* depend on it.
*/
ip->i_df.if_format = dip->di_format;
ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents);
ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
......@@ -295,14 +295,14 @@ xfs_iformat_attr_fork(
struct xfs_inode *ip,
struct xfs_dinode *dip)
{
xfs_extnum_t naextents = xfs_dfork_attr_extents(dip);
int error = 0;
/*
* Initialize the extent count early, as the per-format routines may
* depend on it.
*/
ip->i_afp = xfs_ifork_alloc(dip->di_aformat,
be16_to_cpu(dip->di_anextents));
ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents);
switch (ip->i_afp->if_format) {
case XFS_DINODE_FMT_LOCAL:
......@@ -744,7 +744,8 @@ xfs_iext_count_may_overflow(
if (whichfork == XFS_COW_FORK)
return 0;
max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM;
max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
whichfork);
if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
max_exts = 10;
......@@ -755,3 +756,27 @@ xfs_iext_count_may_overflow(
return 0;
}
/*
* Upgrade this inode's extent counter fields to be able to handle a potential
* increase in the extent count by nr_to_add. Normally this is the same
* quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
*/
int
xfs_iext_count_upgrade(
struct xfs_trans *tp,
struct xfs_inode *ip,
uint nr_to_add)
{
ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
if (!xfs_has_large_extent_counts(ip->i_mount) ||
xfs_inode_has_large_extent_counts(ip) ||
XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
return -EFBIG;
ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
return 0;
}
......@@ -21,9 +21,9 @@ struct xfs_ifork {
void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
xfs_extnum_t if_nextents; /* # of extents in this fork */
};
/*
......@@ -39,19 +39,6 @@ struct xfs_ifork {
*/
#define XFS_IEXT_PUNCH_HOLE_CNT (1)
/*
* Directory entry addition can cause the following,
* 1. Data block can be added/removed.
* A new extent can cause extent count to increase by 1.
* 2. Free disk block can be added/removed.
* Same behaviour as described above for Data block.
* 3. Dabtree blocks.
* XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new
* extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH.
*/
#define XFS_IEXT_DIR_MANIP_CNT(mp) \
((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount)
/*
* Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
* be added. One extra extent for dabtree in case a local attr is
......@@ -133,6 +120,65 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
return ifp->if_format;
}
static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
int whichfork)
{
switch (whichfork) {
case XFS_DATA_FORK:
case XFS_COW_FORK:
if (has_large_extent_counts)
return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
case XFS_ATTR_FORK:
if (has_large_extent_counts)
return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
default:
ASSERT(0);
return 0;
}
}
static inline xfs_extnum_t
xfs_dfork_data_extents(
struct xfs_dinode *dip)
{
if (xfs_dinode_has_large_extent_counts(dip))
return be64_to_cpu(dip->di_big_nextents);
return be32_to_cpu(dip->di_nextents);
}
static inline xfs_extnum_t
xfs_dfork_attr_extents(
struct xfs_dinode *dip)
{
if (xfs_dinode_has_large_extent_counts(dip))
return be32_to_cpu(dip->di_big_anextents);
return be16_to_cpu(dip->di_anextents);
}
static inline xfs_extnum_t
xfs_dfork_nextents(
struct xfs_dinode *dip,
int whichfork)
{
switch (whichfork) {
case XFS_DATA_FORK:
return xfs_dfork_data_extents(dip);
case XFS_ATTR_FORK:
return xfs_dfork_attr_extents(dip);
default:
ASSERT(0);
break;
}
return 0;
}
struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format,
xfs_extnum_t nextents);
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
......@@ -229,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
int nr_to_add);
int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
uint nr_to_add);
/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
......
......@@ -387,16 +387,41 @@ struct xfs_log_dinode {
uint32_t di_nlink; /* number of links to file */
uint16_t di_projid_lo; /* lower part of owner's project id */
uint16_t di_projid_hi; /* higher part of owner's project id */
uint8_t di_pad[6]; /* unused, zeroed space */
uint16_t di_flushiter; /* incremented on flush */
union {
/* Number of data fork extents if NREXT64 is set */
uint64_t di_big_nextents;
/* Padding for V3 inodes without NREXT64 set. */
uint64_t di_v3_pad;
/* Padding and inode flush counter for V2 inodes. */
struct {
uint8_t di_v2_pad[6]; /* V2 inode zeroed space */
uint16_t di_flushiter; /* V2 inode incremented on flush */
};
};
xfs_log_timestamp_t di_atime; /* time last accessed */
xfs_log_timestamp_t di_mtime; /* time last modified */
xfs_log_timestamp_t di_ctime; /* time created/inode modified */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
xfs_extnum_t di_nextents; /* number of extents in data fork */
xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
union {
/*
* For V2 inodes and V3 inodes without NREXT64 set, this
* is the number of data and attr fork extents.
*/
struct {
uint32_t di_nextents;
uint16_t di_anextents;
} __packed;
/* Number of attr fork extents if NREXT64 is set. */
struct {
uint32_t di_big_anextents;
uint16_t di_nrext64_pad;
} __packed;
} __packed;
uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
int8_t di_aformat; /* format of attr fork's data */
uint32_t di_dmevmask; /* DMIG event mask */
......
......@@ -124,6 +124,9 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_BIGTIME;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
features |= XFS_FEAT_NEEDSREPAIR;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
features |= XFS_FEAT_NREXT64;
return features;
}
......@@ -1140,6 +1143,8 @@ xfs_fs_geometry(
} else {
geo->logsectsize = BBSIZE;
}
if (xfs_has_large_extent_counts(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
......
......@@ -199,8 +199,8 @@ xfs_calc_inode_chunk_res(
/*
* Per-extent log reservation for the btree changes involved in freeing or
* allocating a realtime extent. We have to be able to log as many rtbitmap
* blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
* as well as the realtime summary block.
* blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
* extents, as well as the realtime summary block.
*/
static unsigned int
xfs_rtalloc_log_count(
......@@ -210,7 +210,7 @@ xfs_rtalloc_log_count(
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
unsigned int rtbmp_bytes;
rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
}
......@@ -247,7 +247,7 @@ xfs_rtalloc_log_count(
* the inode's bmap btree: max depth * block size
* the agfs of the ags from which the extents are allocated: 2 * sector
* the superblock free block counter: sector size
* the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
* the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 1 block
* the allocation btrees: 2 trees * (2 * max depth - 1) * block size
* And the bmap_finish transaction can free bmap blocks in a join (t3):
......@@ -299,7 +299,8 @@ xfs_calc_write_reservation(
* the agf for each of the ags: 2 * sector size
* the agfl for each of the ags: 2 * sector size
* the super block to reflect the freed blocks: sector size
* the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
* the realtime bitmap:
* 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 2 exts * 1 block
* worst case split in allocation btrees per extent assuming 2 extents:
* 2 exts * 2 trees * (2 * max depth - 1) * block size
......
......@@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
typedef int32_t xfs_extnum_t; /* # of extents in a file */
typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef uint64_t xfs_extnum_t; /* # of extents in a file */
typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
......@@ -56,13 +56,6 @@ typedef void * xfs_failaddr_t;
#define NULLFSINO ((xfs_ino_t)-1)
#define NULLAGINO ((xfs_agino_t)-1)
/*
* Max values for extlen, extnum, aextnum.
*/
#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
/*
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
......
......@@ -350,7 +350,7 @@ xchk_bmap_iextent(
irec->br_startoff);
/* Make sure the extent points to a valid place. */
if (irec->br_blockcount > MAXEXTLEN)
if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (info->is_rt &&
......
......@@ -232,7 +232,8 @@ xchk_dinode(
size_t fork_recs;
unsigned long long isize;
uint64_t flags2;
uint32_t nextents;
xfs_extnum_t nextents;
xfs_extnum_t naextents;
prid_t prid;
uint16_t flags;
uint16_t mode;
......@@ -390,8 +391,10 @@ xchk_dinode(
xchk_inode_extsize(sc, dip, ino, mode, flags);
nextents = xfs_dfork_data_extents(dip);
naextents = xfs_dfork_attr_extents(dip);
/* di_nextents */
nextents = be32_to_cpu(dip->di_nextents);
fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_format) {
case XFS_DINODE_FMT_EXTENTS:
......@@ -411,7 +414,7 @@ xchk_dinode(
/* di_forkoff */
if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_anextents != 0 && dip->di_forkoff == 0)
if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
xchk_ino_set_corrupt(sc, ino);
......@@ -423,19 +426,18 @@ xchk_dinode(
xchk_ino_set_corrupt(sc, ino);
/* di_anextents */
nextents = be16_to_cpu(dip->di_anextents);
fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
if (nextents > fork_recs)
if (naextents > fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
if (nextents <= fork_recs)
if (naextents <= fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
default:
if (nextents != 0)
if (naextents != 0)
xchk_ino_set_corrupt(sc, ino);
}
......@@ -513,14 +515,14 @@ xchk_inode_xref_bmap(
&nextents, &count);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (nextents < be32_to_cpu(dip->di_nextents))
if (nextents < xfs_dfork_data_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
&nextents, &acount);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (nextents != be16_to_cpu(dip->di_anextents))
if (nextents != xfs_dfork_attr_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
/* Check nblocks against the inode. */
......
......@@ -506,6 +506,8 @@ xfs_bui_item_recover(
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
......
......@@ -119,14 +119,14 @@ xfs_bmap_rtalloc(
*/
ralen = ap->length / mp->m_sb.sb_rextsize;
/*
* If the old value was close enough to MAXEXTLEN that
* If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
* MAXEXTLEN), we don't hear about that number, and can't
* XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
/*
* Lock out modifications to both the RT bitmap and summary inodes
......@@ -839,9 +839,11 @@ xfs_alloc_file_space(
* count, hence we need to limit the number of blocks we are
* trying to reserve to avoid an overflow. We can't allocate
* more than @nimaps extents, and an extent is limited on disk
* to MAXEXTLEN (21 bits), so use that to enforce the limit.
* to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
* limit.
*/
resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
resblks = min_t(xfs_fileoff_t, (e - s),
(XFS_MAX_BMBT_EXTLEN * nimaps));
if (unlikely(rt)) {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
rblocks = resblks;
......@@ -857,6 +859,9 @@ xfs_alloc_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto error;
......@@ -912,6 +917,8 @@ xfs_unmap_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
......@@ -1193,6 +1200,8 @@ xfs_insert_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
......@@ -1421,6 +1430,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(ip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip,
XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
......@@ -1429,6 +1441,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(tip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip,
XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
......
......@@ -322,6 +322,9 @@ xfs_dquot_disk_alloc(
error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, quotip,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto err_cancel;
......
......@@ -1027,11 +1027,6 @@ xfs_create(
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
/*
* A newly created regular or special file just has one directory
* entry pointing to them, but a directory also the "." entry
......@@ -1245,11 +1240,6 @@ xfs_link(
if (error)
goto std_return;
error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto error_return;
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
......@@ -3215,35 +3205,6 @@ xfs_rename(
/*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
*
* Extent count overflow check:
*
* From the perspective of src_dp, a rename operation is essentially a
* directory entry remove operation. Hence the only place where we check
* for extent count overflow for src_dp is in
* xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
* -ENOSPC when it detects a possible extent count overflow and in
* response, the higher layers of directory handling code do the
* following:
* 1. Data/Free blocks: XFS lets these blocks linger until a
* future remove operation removes them.
* 2. Dabtree blocks: XFS swaps the blocks with the last block in the
* Leaf space and unmaps the last block.
*
* For target_dp, there are two cases depending on whether the
* destination directory entry exists or not.
*
* When destination directory entry does not exist (i.e. target_ip ==
* NULL), extent count overflow check is performed only when transaction
* has a non-zero sized space reservation associated with it. With a
* zero-sized space reservation, XFS allows a rename operation to
* continue only when the directory has sufficient free space in its
* data/leaf/free space blocks to hold the new entry.
*
* When destination directory entry exists (i.e. target_ip != NULL), all
* we need to do is change the inode number associated with the already
* existing entry. Hence there is no need to perform an extent count
* overflow check.
*/
if (target_ip == NULL) {
/*
......@@ -3254,12 +3215,6 @@ xfs_rename(
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
goto out_trans_cancel;
} else {
error = xfs_iext_count_may_overflow(target_dp,
XFS_DATA_FORK,
XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
}
} else {
/*
......@@ -3427,18 +3382,12 @@ xfs_rename(
* inode number of the whiteout inode rather than removing it
* altogether.
*/
if (wip) {
if (wip)
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
} else {
/*
* NOTE: We don't need to check for extent count overflow here
* because the dir remove name code will leave the dir block in
* place if the extent count would overflow.
*/
else
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
}
if (error)
goto out_trans_cancel;
......@@ -3520,8 +3469,8 @@ xfs_iflush(
if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %Lu, "
"total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
"%s: detected corrupt incore inode %llu, "
"total extents = %llu nblocks = %lld, ptr "PTR_FMT,
__func__, ip->i_ino,
ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
ip->i_nblocks, ip);
......
......@@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
}
static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
{
return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
}
/*
* Return the buftarg used for data allocations on a given inode.
*/
......
......@@ -359,6 +359,21 @@ xfs_copy_dm_fields_to_log_dinode(
}
}
static inline void
xfs_inode_to_log_dinode_iext_counters(
struct xfs_inode *ip,
struct xfs_log_dinode *to)
{
if (xfs_inode_has_large_extent_counts(ip)) {
to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
to->di_big_anextents = xfs_ifork_nextents(ip->i_afp);
to->di_nrext64_pad = 0;
} else {
to->di_nextents = xfs_ifork_nextents(&ip->i_df);
to->di_anextents = xfs_ifork_nextents(ip->i_afp);
}
}
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
......@@ -374,7 +389,6 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
......@@ -386,8 +400,6 @@ xfs_inode_to_log_dinode(
to->di_size = ip->i_disk_size;
to->di_nblocks = ip->i_nblocks;
to->di_extsize = ip->i_extsize;
to->di_nextents = xfs_ifork_nextents(&ip->i_df);
to->di_anextents = xfs_ifork_nextents(ip->i_afp);
to->di_forkoff = ip->i_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = ip->i_diflags;
......@@ -407,11 +419,14 @@ xfs_inode_to_log_dinode(
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
xfs_inode_to_log_dinode_iext_counters(ip, to);
}
/*
......
......@@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts(
return ts;
}
static inline bool xfs_log_dinode_has_large_extent_counts(
const struct xfs_log_dinode *ld)
{
return ld->di_version >= 3 &&
(ld->di_flags2 & XFS_DIFLAG2_NREXT64);
}
static inline void
xfs_log_dinode_to_disk_iext_counters(
struct xfs_log_dinode *from,
struct xfs_dinode *to)
{
if (xfs_log_dinode_has_large_extent_counts(from)) {
to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
} else {
to->di_nextents = cpu_to_be32(from->di_nextents);
to->di_anextents = cpu_to_be16(from->di_anextents);
}
}
STATIC void
xfs_log_dinode_to_disk(
struct xfs_log_dinode *from,
......@@ -158,7 +181,6 @@ xfs_log_dinode_to_disk(
to->di_nlink = cpu_to_be32(from->di_nlink);
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
......@@ -167,8 +189,6 @@ xfs_log_dinode_to_disk(
to->di_size = cpu_to_be64(from->di_size);
to->di_nblocks = cpu_to_be64(from->di_nblocks);
to->di_extsize = cpu_to_be32(from->di_extsize);
to->di_nextents = cpu_to_be32(from->di_nextents);
to->di_anextents = cpu_to_be16(from->di_anextents);
to->di_forkoff = from->di_forkoff;
to->di_aformat = from->di_aformat;
to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
......@@ -184,12 +204,66 @@ xfs_log_dinode_to_disk(
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);
memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
to->di_flushiter = 0;
to->di_v3_pad = 0;
} else {
to->di_flushiter = cpu_to_be16(from->di_flushiter);
memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
xfs_log_dinode_to_disk_iext_counters(from, to);
}
STATIC int
xlog_dinode_verify_extent_counts(
struct xfs_mount *mp,
struct xfs_log_dinode *ldip)
{
xfs_extnum_t nextents;
xfs_aextnum_t anextents;
if (xfs_log_dinode_has_large_extent_counts(ldip)) {
if (!xfs_has_large_extent_counts(mp) ||
(ldip->di_nrext64_pad != 0)) {
XFS_CORRUPTION_ERROR(
"Bad log dinode large extent count format",
XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
"Bad inode 0x%llx, large extent counts %d, padding 0x%x",
ldip->di_ino, xfs_has_large_extent_counts(mp),
ldip->di_nrext64_pad);
return -EFSCORRUPTED;
}
nextents = ldip->di_big_nextents;
anextents = ldip->di_big_anextents;
} else {
if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
XFS_CORRUPTION_ERROR(
"Bad log dinode di_v3_pad",
XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
"Bad inode 0x%llx, di_v3_pad 0x%llx",
ldip->di_ino, ldip->di_v3_pad);
return -EFSCORRUPTED;
}
nextents = ldip->di_nextents;
anextents = ldip->di_anextents;
}
if (unlikely(nextents + anextents > ldip->di_nblocks)) {
XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
"Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
anextents, ldip->di_nblocks);
return -EFSCORRUPTED;
}
return 0;
}
STATIC int
......@@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2(
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
XFS_ERRLEVEL_LOW, mp, ldip,
sizeof(*ldip));
XFS_CORRUPTION_ERROR(
"Bad log dinode data fork format for regular file",
XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
"%s: Bad regular inode log record, rec ptr "PTR_FMT", "
"ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
"Bad inode 0x%llx, data fork format 0x%x",
in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
......@@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2(
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE) &&
(ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
XFS_ERRLEVEL_LOW, mp, ldip,
sizeof(*ldip));
XFS_CORRUPTION_ERROR(
"Bad log dinode data fork format for directory",
XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
"%s: Bad dir inode log record, rec ptr "PTR_FMT", "
"ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
"Bad inode 0x%llx, data fork format 0x%x",
in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
}
if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
XFS_ERRLEVEL_LOW, mp, ldip,
sizeof(*ldip));
xfs_alert(mp,
"%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
"dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
__func__, item, dip, bp, in_f->ilf_ino,
ldip->di_nextents + ldip->di_anextents,
ldip->di_nblocks);
error = -EFSCORRUPTED;
error = xlog_dinode_verify_extent_counts(mp, ldip);
if (error)
goto out_release;
}
if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
XFS_ERRLEVEL_LOW, mp, ldip,
sizeof(*ldip));
XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
"%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
"dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
"Bad inode 0x%llx, di_forkoff 0x%x",
in_f->ilf_ino, ldip->di_forkoff);
error = -EFSCORRUPTED;
goto out_release;
}
isize = xfs_log_dinode_size(mp);
if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, ldip,
sizeof(*ldip));
XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
mp, ldip, sizeof(*ldip));
xfs_alert(mp,
"%s: Bad inode log record length %d, rec ptr "PTR_FMT,
__func__, item->ri_buf[1].i_len, item);
"Bad inode 0x%llx log dinode size 0x%x",
in_f->ilf_ino, item->ri_buf[1].i_len);
error = -EFSCORRUPTED;
goto out_release;
}
......
......@@ -813,6 +813,9 @@ xfs_bulk_ireq_setup(
if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
return -ECANCELED;
if (hdr->flags & XFS_BULK_IREQ_NREXT64)
breq->flags |= XFS_IBULK_NREXT64;
return 0;
}
......
......@@ -251,6 +251,8 @@ xfs_iomap_write_direct(
return error;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, nr_exts);
if (error)
goto out_trans_cancel;
......@@ -402,7 +404,7 @@ xfs_iomap_prealloc_size(
*/
plen = prev.br_blockcount;
while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
if (plen > MAXEXTLEN / 2 ||
if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
isnullstartblock(got.br_startblock) ||
got.br_startoff + got.br_blockcount != prev.br_startoff ||
got.br_startblock + got.br_blockcount != prev.br_startblock)
......@@ -414,23 +416,23 @@ xfs_iomap_prealloc_size(
/*
* If the size of the extents is greater than half the maximum extent
* length, then use the current offset as the basis. This ensures that
* for large files the preallocation size always extends to MAXEXTLEN
* rather than falling short due to things like stripe unit/width
* alignment of real extents.
* for large files the preallocation size always extends to
* XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe
* unit/width alignment of real extents.
*/
alloc_blocks = plen * 2;
if (alloc_blocks > MAXEXTLEN)
if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
alloc_blocks = XFS_B_TO_FSB(mp, offset);
qblocks = alloc_blocks;
/*
* MAXEXTLEN is not a power of two value but we round the prealloc down
* to the nearest power of two value after throttling. To prevent the
* round down from unconditionally reducing the maximum supported
* prealloc size, we round up first, apply appropriate throttling,
* round down and cap the value to MAXEXTLEN.
* XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc
* down to the nearest power of two value after throttling. To prevent
* the round down from unconditionally reducing the maximum supported
* prealloc size, we round up first, apply appropriate throttling, round
* down and cap the value to XFS_BMBT_MAX_EXTLEN.
*/
alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
alloc_blocks);
freesp = percpu_counter_read_positive(&mp->m_fdblocks);
......@@ -478,14 +480,14 @@ xfs_iomap_prealloc_size(
*/
if (alloc_blocks)
alloc_blocks = rounddown_pow_of_two(alloc_blocks);
if (alloc_blocks > MAXEXTLEN)
alloc_blocks = MAXEXTLEN;
if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
alloc_blocks = XFS_MAX_BMBT_EXTLEN;
/*
* If we are still trying to allocate more space than is
* available, squash the prealloc hard. This can happen if we
* have a large file on a small filesystem and the above
* lowspace thresholds are smaller than MAXEXTLEN.
* lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN.
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
......@@ -555,6 +557,9 @@ xfs_iomap_write_unwritten(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
......
......@@ -64,6 +64,7 @@ xfs_bulkstat_one_int(
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
struct xfs_bulkstat *buf = bc->buf;
xfs_extnum_t nextents;
int error = -EINVAL;
if (xfs_internal_inum(mp, ino))
......@@ -102,7 +103,13 @@ xfs_bulkstat_one_int(
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize_blks = ip->i_extsize;
buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
nextents = xfs_ifork_nextents(&ip->i_df);
if (!(bc->breq->flags & XFS_IBULK_NREXT64))
buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL);
else
buf->bs_extents64 = nextents;
xfs_bulkstat_health(ip, buf);
buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
......@@ -256,6 +263,7 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
unsigned int iwalk_flags = 0;
int error;
if (breq->mnt_userns != &init_user_ns) {
......@@ -279,7 +287,10 @@ xfs_bulkstat(
if (error)
goto out;
error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags,
if (breq->flags & XFS_IBULK_SAME_AG)
iwalk_flags |= XFS_IWALK_SAME_AG;
error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
out:
......
......@@ -17,7 +17,10 @@ struct xfs_ibulk {
};
/* Only iterate within the same AG as startino */
#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
#define XFS_IBULK_SAME_AG (1U << 0)
/* Fill out the bs_extents64 field if set. */
#define XFS_IBULK_NREXT64 (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
......
......@@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
unsigned int inode_records, bool poll, void *data);
/* Only iterate inodes within the same AG as @startino. */
#define XFS_IWALK_SAME_AG (0x1)
#define XFS_IWALK_SAME_AG (1U << 0)
#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
......
......@@ -278,6 +278,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
......@@ -340,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME)
__XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
/*
* Mount features
......
......@@ -620,6 +620,9 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip,
XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
goto out_cancel;
......@@ -1121,6 +1124,8 @@ xfs_reflink_remap_extent(
++iext_delta;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto out_cancel;
......
......@@ -806,6 +806,9 @@ xfs_growfs_rt_alloc(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
......
......@@ -1649,6 +1649,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
if (xfs_has_large_extent_counts(mp))
xfs_warn(mp,
"EXPERIMENTAL Large extent counts feature in use. Use at your own risk!");
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
......
......@@ -226,11 +226,6 @@ xfs_symlink(
goto out_trans_cancel;
}
error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
/*
* Allocate an inode for the symlink.
*/
......
......@@ -2153,7 +2153,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(int, which)
__field(xfs_ino_t, ino)
__field(int, format)
__field(int, nex)
__field(xfs_extnum_t, nex)
__field(int, broot_size)
__field(int, fork_off)
),
......@@ -2166,7 +2166,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
"broot size %d, forkoff 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment