Commit 1e868d8e authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: xfs_swap_extents needs to handle dynamic fork offsets
  xfs: fix missing error check in xfs_rtfree_range
  xfs: fix stale inode flush avoidance
  xfs: Remove inode iolock held check during allocation
  xfs: reclaim all inodes by background tree walks
  xfs: Avoid inodes in reclaim when flushing from inode cache
  xfs: reclaim inodes under a write lock
parents 2faae422 e09f9860
......@@ -954,16 +954,14 @@ xfs_fs_destroy_inode(
ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
/*
* If we have nothing to flush with this inode then complete the
* teardown now, otherwise delay the flush operation.
* We always use background reclaim here because even if the
* inode is clean, it still may be under IO and hence we have
* to take the flush lock. The background reclaim path handles
* this more efficiently than we can here, so simply let background
* reclaim tear down all inodes.
*/
if (!xfs_inode_clean(ip)) {
xfs_inode_set_reclaim_tag(ip);
return;
}
out_reclaim:
xfs_ireclaim(ip);
xfs_inode_set_reclaim_tag(ip);
}
/*
......
......@@ -65,7 +65,6 @@ xfs_inode_ag_lookup(
* as the tree is sparse and a gang lookup walks to find
* the number of objects requested.
*/
read_lock(&pag->pag_ici_lock);
if (tag == XFS_ICI_NO_TAG) {
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)&ip, *first_index, 1);
......@@ -74,7 +73,7 @@ xfs_inode_ag_lookup(
(void **)&ip, *first_index, 1, tag);
}
if (!nr_found)
goto unlock;
return NULL;
/*
* Update the index for the next lookup. Catch overflows
......@@ -84,13 +83,8 @@ xfs_inode_ag_lookup(
*/
*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
goto unlock;
return NULL;
return ip;
unlock:
read_unlock(&pag->pag_ici_lock);
return NULL;
}
STATIC int
......@@ -100,7 +94,8 @@ xfs_inode_ag_walk(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
int tag)
int tag,
int exclusive)
{
struct xfs_perag *pag = &mp->m_perag[ag];
uint32_t first_index;
......@@ -114,10 +109,20 @@ xfs_inode_ag_walk(
int error = 0;
xfs_inode_t *ip;
if (exclusive)
write_lock(&pag->pag_ici_lock);
else
read_lock(&pag->pag_ici_lock);
ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
if (!ip)
if (!ip) {
if (exclusive)
write_unlock(&pag->pag_ici_lock);
else
read_unlock(&pag->pag_ici_lock);
break;
}
/* execute releases pag->pag_ici_lock */
error = execute(ip, pag, flags);
if (error == EAGAIN) {
skipped++;
......@@ -125,9 +130,8 @@ xfs_inode_ag_walk(
}
if (error)
last_error = error;
/*
* bail out if the filesystem is corrupted.
*/
/* bail out if the filesystem is corrupted. */
if (error == EFSCORRUPTED)
break;
......@@ -148,7 +152,8 @@ xfs_inode_ag_iterator(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
int tag)
int tag,
int exclusive)
{
int error = 0;
int last_error = 0;
......@@ -157,7 +162,8 @@ xfs_inode_ag_iterator(
for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
if (!mp->m_perag[ag].pag_ici_init)
continue;
error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
exclusive);
if (error) {
last_error = error;
if (error == EFSCORRUPTED)
......@@ -174,30 +180,31 @@ xfs_sync_inode_valid(
struct xfs_perag *pag)
{
struct inode *inode = VFS_I(ip);
int error = EFSCORRUPTED;
/* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
read_unlock(&pag->pag_ici_lock);
return EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
goto out_unlock;
/*
* If we can't get a reference on the inode, it must be in reclaim.
* Leave it for the reclaim code to flush. Also avoid inodes that
* haven't been fully initialised.
*/
if (!igrab(inode)) {
read_unlock(&pag->pag_ici_lock);
return ENOENT;
}
read_unlock(&pag->pag_ici_lock);
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
error = ENOENT;
if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
goto out_unlock;
if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
/* If we can't grab the inode, it must on it's way to reclaim. */
if (!igrab(inode))
goto out_unlock;
if (is_bad_inode(inode)) {
IRELE(ip);
return ENOENT;
goto out_unlock;
}
return 0;
/* inode is valid */
error = 0;
out_unlock:
read_unlock(&pag->pag_ici_lock);
return error;
}
STATIC int
......@@ -282,7 +289,7 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
XFS_ICI_NO_TAG);
XFS_ICI_NO_TAG, 0);
if (error)
return XFS_ERROR(error);
......@@ -304,7 +311,7 @@ xfs_sync_attr(
ASSERT((flags & ~SYNC_WAIT) == 0);
return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
XFS_ICI_NO_TAG);
XFS_ICI_NO_TAG, 0);
}
STATIC int
......@@ -664,60 +671,6 @@ xfs_syncd_stop(
kthread_stop(mp->m_sync_task);
}
STATIC int
xfs_reclaim_inode(
xfs_inode_t *ip,
int sync_mode)
{
xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
/* The hash lock here protects a thread in xfs_iget_core from
* racing with us on linking the inode back with a vnode.
* Once we have the XFS_IRECLAIM flag set it will not touch
* us.
*/
write_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
!__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
return -EAGAIN;
}
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
xfs_put_perag(ip->i_mount, pag);
/*
* If the inode is still dirty, then flush it out. If the inode
* is not in the AIL, then it will be OK to flush it delwri as
* long as xfs_iflush() does not keep any references to the inode.
* We leave that decision up to xfs_iflush() since it has the
* knowledge of whether it's OK to simply do a delwri flush of
* the inode or whether we need to wait until the inode is
* pulled from the AIL.
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_iflock(ip);
/*
* In the case of a forced shutdown we rely on xfs_iflush() to
* wait for the inode to be unpinned before returning an error.
*/
if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
/* synchronize with xfs_iflush_done */
xfs_iflock(ip);
xfs_ifunlock(ip);
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_ireclaim(ip);
return 0;
}
void
__xfs_inode_set_reclaim_tag(
struct xfs_perag *pag,
......@@ -760,19 +713,55 @@ __xfs_inode_clear_reclaim_tag(
}
STATIC int
xfs_reclaim_inode_now(
xfs_reclaim_inode(
struct xfs_inode *ip,
struct xfs_perag *pag,
int flags)
int sync_mode)
{
/* ignore if already under reclaim */
if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
read_unlock(&pag->pag_ici_lock);
/*
* The radix tree lock here protects a thread in xfs_iget from racing
* with us starting reclaim on the inode. Once we have the
* XFS_IRECLAIM flag set it will not touch us.
*/
spin_lock(&ip->i_flags_lock);
ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* ignore as it is already under reclaim */
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
return 0;
}
read_unlock(&pag->pag_ici_lock);
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
return xfs_reclaim_inode(ip, flags);
/*
* If the inode is still dirty, then flush it out. If the inode
* is not in the AIL, then it will be OK to flush it delwri as
* long as xfs_iflush() does not keep any references to the inode.
* We leave that decision up to xfs_iflush() since it has the
* knowledge of whether it's OK to simply do a delwri flush of
* the inode or whether we need to wait until the inode is
* pulled from the AIL.
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_iflock(ip);
/*
* In the case of a forced shutdown we rely on xfs_iflush() to
* wait for the inode to be unpinned before returning an error.
*/
if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
/* synchronize with xfs_iflush_done */
xfs_iflock(ip);
xfs_ifunlock(ip);
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_ireclaim(ip);
return 0;
}
int
......@@ -780,6 +769,6 @@ xfs_reclaim_inodes(
xfs_mount_t *mp,
int mode)
{
return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
XFS_ICI_RECLAIM_TAG);
return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
XFS_ICI_RECLAIM_TAG, 1);
}
......@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
int flags, int tag);
int flags, int tag, int write_lock);
#endif
......@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
}
/*------------------------------------------------------------------------*/
......
......@@ -114,10 +114,82 @@ xfs_swapext(
return error;
}
/*
* We need to check that the format of the data fork in the temporary inode is
* valid for the target inode before doing the swap. This is not a problem with
* attr1 because of the fixed fork offset, but attr2 has a dynamically sized
* data fork depending on the space the attribute fork is taking so we can get
* invalid formats on the target inode.
*
* E.g. target has space for 7 extents in extent format, temp inode only has
* space for 6. If we defragment down to 7 extents, then the tmp format is a
* btree, but when swapped it needs to be in extent format. Hence we can't just
* blindly swap data forks on attr2 filesystems.
*
* Note that we check the swap in both directions so that we don't end up with
* a corrupt temporary inode, either.
*
* Note that fixing the way xfs_fsr sets up the attribute fork in the source
* inode will prevent this situation from occurring, so all we do here is
* reject and log the attempt. basically we are putting the responsibility on
* userspace to get this right.
*/
static int
xfs_swap_extents_check_format(
xfs_inode_t *ip, /* target inode */
xfs_inode_t *tip) /* tmp inode */
{
/* Should never get a local format */
if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
return EINVAL;
/*
* if the target inode has less extents that then temporary inode then
* why did userspace call us?
*/
if (ip->i_d.di_nextents < tip->i_d.di_nextents)
return EINVAL;
/*
* if the target inode is in extent form and the temp inode is in btree
* form then we will end up with the target inode in the wrong format
* as we already know there are less extents in the temp inode.
*/
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
return EINVAL;
/* Check temp in extent form to max in target */
if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
return EINVAL;
/* Check target in extent form to max in temp */
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
return EINVAL;
/* Check root block of temp in btree form to max in target */
if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
XFS_IFORK_BOFF(ip) &&
tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
return EINVAL;
/* Check root block of target in btree form to max in temp */
if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
XFS_IFORK_BOFF(tip) &&
ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
return EINVAL;
return 0;
}
int
xfs_swap_extents(
xfs_inode_t *ip,
xfs_inode_t *tip,
xfs_inode_t *ip, /* target inode */
xfs_inode_t *tip, /* tmp inode */
xfs_swapext_t *sxp)
{
xfs_mount_t *mp;
......@@ -161,13 +233,6 @@ xfs_swap_extents(
goto out_unlock;
}
/* Should never get a local format */
if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
error = XFS_ERROR(EINVAL);
goto out_unlock;
}
if (VN_CACHED(VFS_I(tip)) != 0) {
error = xfs_flushinval_pages(tip, 0, -1,
FI_REMAPF_LOCKED);
......@@ -189,13 +254,12 @@ xfs_swap_extents(
goto out_unlock;
}
/*
* If the target has extended attributes, the tmp file
* must also in order to ensure the correct data fork
* format.
*/
if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
error = XFS_ERROR(EINVAL);
/* check inode formats now that data is flushed */
error = xfs_swap_extents_check_format(ip, tip);
if (error) {
xfs_fs_cmn_err(CE_NOTE, mp,
"%s: inode 0x%llx format is incompatible for exchanging.",
__FILE__, ip->i_ino);
goto out_unlock;
}
......@@ -275,6 +339,16 @@ xfs_swap_extents(
*ifp = *tifp; /* struct copy */
*tifp = *tempifp; /* struct copy */
/*
* Fix the in-memory data fork values that are dependent on the fork
* offset in the inode. We can't assume they remain the same as attr2
* has dynamic fork offsets.
*/
ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
(uint)sizeof(xfs_bmbt_rec_t);
tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
(uint)sizeof(xfs_bmbt_rec_t);
/*
* Fix the on-disk inode values
*/
......
......@@ -73,7 +73,6 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
......
......@@ -2842,13 +2842,9 @@ xfs_iflush(
/*
* If the inode isn't dirty, then just release the inode flush lock and
* do nothing. Treat stale inodes the same; we cannot rely on the
* backing buffer remaining stale in cache for the remaining life of
* the stale inode and so xfs_itobp() below may give us a buffer that
* no longer contains inodes below. Doing this stale check here also
* avoids forcing the log on pinned, stale inodes.
* do nothing.
*/
if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
if (xfs_inode_clean(ip)) {
xfs_ifunlock(ip);
return 0;
}
......@@ -2871,6 +2867,19 @@ xfs_iflush(
}
xfs_iunpin_wait(ip);
/*
* For stale inodes we cannot rely on the backing buffer remaining
* stale in cache for the remaining life of the stale inode and so
* xfs_itobp() below may give us a buffer that no longer contains
* inodes below. We have to check this after ensuring the inode is
* unpinned so that it is safe to reclaim the stale inode after the
* flush call.
*/
if (xfs_iflags_test(ip, XFS_ISTALE)) {
xfs_ifunlock(ip);
return 0;
}
/*
* This may have been unpinned because the filesystem is shutting
* down forcibly. If that's the case we must not write this inode
......
......@@ -1517,6 +1517,8 @@ xfs_rtfree_range(
*/
error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
&postblock);
if (error)
return error;
/*
* If there are blocks not being freed at the front of the
* old extent, add summary data for them to be allocated.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment