Commit 8cb280c9 authored by Linus Torvalds

Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: only run xfs_error_test if error injection is active
  xfs: avoid moving stale inodes in the AIL
  xfs: delayed alloc blocks beyond EOF are valid after writeback
  xfs: push stale, pinned buffers on trylock failures
  xfs: fix failed write truncation handling.
parents 8fed709f c76febef
...@@ -934,7 +934,6 @@ xfs_aops_discard_page( ...@@ -934,7 +934,6 @@ xfs_aops_discard_page(
struct xfs_inode *ip = XFS_I(inode); struct xfs_inode *ip = XFS_I(inode);
struct buffer_head *bh, *head; struct buffer_head *bh, *head;
loff_t offset = page_offset(page); loff_t offset = page_offset(page);
ssize_t len = 1 << inode->i_blkbits;
if (!xfs_is_delayed_page(page, IO_DELAY)) if (!xfs_is_delayed_page(page, IO_DELAY))
goto out_invalidate; goto out_invalidate;
...@@ -949,58 +948,14 @@ xfs_aops_discard_page( ...@@ -949,58 +948,14 @@ xfs_aops_discard_page(
xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_ilock(ip, XFS_ILOCK_EXCL);
bh = head = page_buffers(page); bh = head = page_buffers(page);
do { do {
int done;
xfs_fileoff_t offset_fsb;
xfs_bmbt_irec_t imap;
int nimaps = 1;
int error; int error;
xfs_fsblock_t firstblock; xfs_fileoff_t start_fsb;
xfs_bmap_free_t flist;
if (!buffer_delay(bh)) if (!buffer_delay(bh))
goto next_buffer; goto next_buffer;
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
/*
* Map the range first and check that it is a delalloc extent
* before trying to unmap the range. Otherwise we will be
* trying to remove a real extent (which requires a
* transaction) or a hole, which is probably a bad idea...
*/
error = xfs_bmapi(NULL, ip, offset_fsb, 1,
XFS_BMAPI_ENTIRE, NULL, 0, &imap,
&nimaps, NULL);
if (error) {
/* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
"page discard failed delalloc mapping lookup.");
}
break;
}
if (!nimaps) {
/* nothing there */
goto next_buffer;
}
if (imap.br_startblock != DELAYSTARTBLOCK) {
/* been converted, ignore */
goto next_buffer;
}
WARN_ON(imap.br_blockcount == 0);
/*
* Note: while we initialise the firstblock/flist pair, they
* should never be used because blocks should never be
* allocated or freed for a delalloc extent and hence we need
* don't cancel or finish them after the xfs_bunmapi() call.
*/
xfs_bmap_init(&flist, &firstblock);
error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
&flist, &done);
ASSERT(!flist.xbf_count && !flist.xbf_first);
if (error) { if (error) {
/* something screwed, just bail */ /* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
...@@ -1010,7 +965,7 @@ xfs_aops_discard_page( ...@@ -1010,7 +965,7 @@ xfs_aops_discard_page(
break; break;
} }
next_buffer: next_buffer:
offset += len; offset += 1 << inode->i_blkbits;
} while ((bh = bh->b_this_page) != head); } while ((bh = bh->b_this_page) != head);
...@@ -1505,11 +1460,42 @@ xfs_vm_write_failed( ...@@ -1505,11 +1460,42 @@ xfs_vm_write_failed(
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
if (to > inode->i_size) { if (to > inode->i_size) {
struct iattr ia = { /*
.ia_valid = ATTR_SIZE | ATTR_FORCE, * punch out the delalloc blocks we have already allocated. We
.ia_size = inode->i_size, * don't call xfs_setattr() to do this as we may be in the
}; * middle of a multi-iovec write and so the vfs inode->i_size
xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); * will not match the xfs ip->i_size and so it will zero too
* much. Hence we just truncate the page cache to zero what is
* necessary and punch the delalloc blocks directly.
*/
struct xfs_inode *ip = XFS_I(inode);
xfs_fileoff_t start_fsb;
xfs_fileoff_t end_fsb;
int error;
truncate_pagecache(inode, to, inode->i_size);
/*
* Check if there are any blocks that are outside of i_size
* that need to be trimmed back.
*/
start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
if (end_fsb <= start_fsb)
return;
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
end_fsb - start_fsb);
if (error) {
/* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
"xfs_vm_write_failed: unable to clean up ino %lld",
ip->i_ino);
}
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
} }
} }
......
...@@ -488,29 +488,16 @@ _xfs_buf_find( ...@@ -488,29 +488,16 @@ _xfs_buf_find(
spin_unlock(&pag->pag_buf_lock); spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag); xfs_perag_put(pag);
/* Attempt to get the semaphore without sleeping, if (xfs_buf_cond_lock(bp)) {
* if this does not work then we need to drop the /* failed, so wait for the lock if requested. */
* spinlock and do a hard attempt on the semaphore.
*/
if (down_trylock(&bp->b_sema)) {
if (!(flags & XBF_TRYLOCK)) { if (!(flags & XBF_TRYLOCK)) {
/* wait for buffer ownership */
xfs_buf_lock(bp); xfs_buf_lock(bp);
XFS_STATS_INC(xb_get_locked_waited); XFS_STATS_INC(xb_get_locked_waited);
} else { } else {
/* We asked for a trylock and failed, no need
* to look at file offset and length here, we
* know that this buffer at least overlaps our
* buffer and is locked, therefore our buffer
* either does not exist, or is this buffer.
*/
xfs_buf_rele(bp); xfs_buf_rele(bp);
XFS_STATS_INC(xb_busy_locked); XFS_STATS_INC(xb_busy_locked);
return NULL; return NULL;
} }
} else {
/* trylock worked */
XB_SET_OWNER(bp);
} }
if (bp->b_flags & XBF_STALE) { if (bp->b_flags & XBF_STALE) {
...@@ -876,10 +863,18 @@ xfs_buf_rele( ...@@ -876,10 +863,18 @@ xfs_buf_rele(
*/ */
/* /*
* Locks a buffer object, if it is not already locked. * Locks a buffer object, if it is not already locked. Note that this in
* Note that this in no way locks the underlying pages, so it is only * no way locks the underlying pages, so it is only useful for
* useful for synchronizing concurrent use of buffer objects, not for * synchronizing concurrent use of buffer objects, not for synchronizing
* synchronizing independent access to the underlying pages. * independent access to the underlying pages.
*
* If we come across a stale, pinned, locked buffer, we know that we are
* being asked to lock a buffer that has been reallocated. Because it is
* pinned, we know that the log has not been pushed to disk and hence it
* will still be locked. Rather than continuing to have trylock attempts
* fail until someone else pushes the log, push it ourselves before
* returning. This means that the xfsaild will not get stuck trying
* to push on stale inode buffers.
*/ */
int int
xfs_buf_cond_lock( xfs_buf_cond_lock(
...@@ -890,6 +885,8 @@ xfs_buf_cond_lock( ...@@ -890,6 +885,8 @@ xfs_buf_cond_lock(
locked = down_trylock(&bp->b_sema) == 0; locked = down_trylock(&bp->b_sema) == 0;
if (locked) if (locked)
XB_SET_OWNER(bp); XB_SET_OWNER(bp);
else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
xfs_log_force(bp->b_target->bt_mount, 0);
trace_xfs_buf_cond_lock(bp, _RET_IP_); trace_xfs_buf_cond_lock(bp, _RET_IP_);
return locked ? 0 : -EBUSY; return locked ? 0 : -EBUSY;
......
...@@ -5471,8 +5471,13 @@ xfs_getbmap( ...@@ -5471,8 +5471,13 @@ xfs_getbmap(
if (error) if (error)
goto out_unlock_iolock; goto out_unlock_iolock;
} }
/*
ASSERT(ip->i_delayed_blks == 0); * even after flushing the inode, there can still be delalloc
* blocks on the inode beyond EOF due to speculative
* preallocation. These are not removed until the release
* function is called or the inode is inactivated. Hence we
* cannot assert here that ip->i_delayed_blks == 0.
*/
} }
lock = xfs_ilock_map_shared(ip); lock = xfs_ilock_map_shared(ip);
...@@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves( ...@@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves(
*count += xfs_bmbt_disk_get_blockcount(frp); *count += xfs_bmbt_disk_get_blockcount(frp);
} }
} }
/*
 * Dead simple method of punching delayed allocation blocks from a range in
 * the inode. Walks a block at a time so will be slow, but is only executed in
 * rare error cases so the overhead is not critical. This will always punch out
 * both the start and end blocks, even if the ranges only partially overlap
 * them, so it is up to the caller to ensure that partial blocks are not
 * passed in.
 *
 * The caller must hold the inode's ILOCK exclusively. A zero @length is a
 * no-op: nothing is mapped or punched and 0 is returned.
 *
 * Returns 0 on success, otherwise the error reported by xfs_bmapi() or
 * xfs_bunmapi(); the walk stops at the first failure.
 */
int
xfs_bmap_punch_delalloc_range(
	struct xfs_inode	*ip,
	xfs_fileoff_t		start_fsb,
	xfs_fileoff_t		length)
{
	xfs_fileoff_t		remaining;
	int			error = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	/*
	 * Test the count before touching any block. A post-tested loop would
	 * process one block even when length is zero and then decrement the
	 * counter below zero.
	 */
	for (remaining = length; remaining > 0; start_fsb++, remaining--) {
		xfs_bmbt_irec_t	imap;
		xfs_bmap_free_t	flist;
		xfs_fsblock_t	firstblock;
		int		nimaps = 1;
		int		done;

		/*
		 * Map the range first and check that it is a delalloc extent
		 * before trying to unmap the range. Otherwise we will be
		 * trying to remove a real extent (which requires a
		 * transaction) or a hole, which is probably a bad idea...
		 */
		error = xfs_bmapi(NULL, ip, start_fsb, 1,
				  XFS_BMAPI_ENTIRE, NULL, 0, &imap,
				  &nimaps, NULL);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
			"Failed delalloc mapping lookup ino %lld fsb %lld.",
						ip->i_ino, start_fsb);
			}
			break;
		}

		/*
		 * Nothing mapped here, or the extent has already been
		 * converted to a real one: leave this block alone.
		 */
		if (!nimaps || imap.br_startblock != DELAYSTARTBLOCK)
			continue;

		WARN_ON(imap.br_blockcount == 0);

		/*
		 * Note: while we initialise the firstblock/flist pair, they
		 * should never be used because blocks should never be
		 * allocated or freed for a delalloc extent and hence we don't
		 * need to cancel or finish them after the xfs_bunmapi() call.
		 */
		xfs_bmap_init(&flist, &firstblock);
		error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
				    &flist, &done);
		if (error)
			break;

		ASSERT(!flist.xbf_count && !flist.xbf_first);
	}

	return error;
}
...@@ -394,6 +394,11 @@ xfs_bmap_count_blocks( ...@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
int whichfork, int whichfork,
int *count); int *count);
int
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
xfs_fileoff_t start_fsb,
xfs_fileoff_t length);
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */ #endif /* __XFS_BMAP_H__ */
...@@ -377,6 +377,19 @@ xfs_swap_extents( ...@@ -377,6 +377,19 @@ xfs_swap_extents(
ip->i_d.di_format = tip->i_d.di_format; ip->i_d.di_format = tip->i_d.di_format;
tip->i_d.di_format = tmp; tip->i_d.di_format = tmp;
/*
* The extents in the source inode could still contain speculative
* preallocation beyond EOF (e.g. the file is open but not modified
* while defrag is in progress). In that case, we need to copy over the
* number of delalloc blocks the data fork in the source inode is
* tracking beyond EOF so that when the fork is truncated away when the
* temporary inode is unlinked we don't underrun the i_delayed_blks
* counter on that inode.
*/
ASSERT(tip->i_delayed_blks == 0);
tip->i_delayed_blks = ip->i_delayed_blks;
ip->i_delayed_blks = 0;
ilf_fields = XFS_ILOG_CORE; ilf_fields = XFS_ILOG_CORE;
switch(ip->i_d.di_format) { switch(ip->i_d.di_format) {
......
...@@ -58,6 +58,7 @@ xfs_error_trap(int e) ...@@ -58,6 +58,7 @@ xfs_error_trap(int e)
int xfs_etest[XFS_NUM_INJECT_ERROR]; int xfs_etest[XFS_NUM_INJECT_ERROR];
int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
int xfs_error_test_active;
int int
xfs_error_test(int error_tag, int *fsidp, char *expression, xfs_error_test(int error_tag, int *fsidp, char *expression,
...@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) ...@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
len = strlen(mp->m_fsname); len = strlen(mp->m_fsname);
xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
strcpy(xfs_etest_fsname[i], mp->m_fsname); strcpy(xfs_etest_fsname[i], mp->m_fsname);
xfs_error_test_active++;
return 0; return 0;
} }
} }
...@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) ...@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
xfs_etest_fsid[i] = 0LL; xfs_etest_fsid[i] = 0LL;
kmem_free(xfs_etest_fsname[i]); kmem_free(xfs_etest_fsname[i]);
xfs_etest_fsname[i] = NULL; xfs_etest_fsname[i] = NULL;
xfs_error_test_active--;
} }
} }
......
...@@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level, ...@@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level,
#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
#ifdef DEBUG #ifdef DEBUG
extern int xfs_error_test_active;
extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
#define XFS_NUM_INJECT_ERROR 10 #define XFS_NUM_INJECT_ERROR 10
#define XFS_TEST_ERROR(expr, mp, tag, rf) \ #define XFS_TEST_ERROR(expr, mp, tag, rf) \
((expr) || \ ((expr) || (xfs_error_test_active && \
xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
(rf))) (rf))))
extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
......
...@@ -657,18 +657,37 @@ xfs_inode_item_unlock( ...@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
} }
/* /*
* This is called to find out where the oldest active copy of the * This is called to find out where the oldest active copy of the inode log
* inode log item in the on disk log resides now that the last log * item in the on disk log resides now that the last log write of it completed
* write of it completed at the given lsn. Since we always re-log * at the given lsn. Since we always re-log all dirty data in an inode, the
* all dirty data in an inode, the latest copy in the on disk log * latest copy in the on disk log is the only one that matters. Therefore,
* is the only one that matters. Therefore, simply return the * simply return the given lsn.
* given lsn. *
* If the inode has been marked stale because the cluster is being freed, we
* don't want to (re-)insert this inode into the AIL. There is a race condition
* where the cluster buffer may be unpinned before the inode is inserted into
* the AIL during transaction committed processing. If the buffer is unpinned
* before the inode item has been committed and inserted, then it is possible
* for the buffer to be written and IO to complete before the inode is inserted
* into the AIL. In that case, we'd be inserting a clean, stale inode into the
* AIL which will never get removed. It will, however, get reclaimed which
* triggers an assert in xfs_inode_free() complaining about freeing an inode
* still in the AIL.
*
* To avoid this, return a lower LSN than the one passed in so that the
* transaction committed code will not move the inode forward in the AIL but
* will still unpin it properly.
*/ */
STATIC xfs_lsn_t STATIC xfs_lsn_t
xfs_inode_item_committed( xfs_inode_item_committed(
struct xfs_log_item *lip, struct xfs_log_item *lip,
xfs_lsn_t lsn) xfs_lsn_t lsn)
{ {
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
if (xfs_iflags_test(ip, XFS_ISTALE))
return lsn - 1;
return lsn; return lsn;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment