xfs: ensure truncate forces zeroed blocks to disk

commit 5885ebda upstream. A new fsync vs power fail test in xfstests indicated that XFS can have unreliable data consistency when doing extending truncates that require block zeroing. The blocks beyond EOF get zeroed in memory, but we never force those changes to disk before we run the transaction that extends the file size and exposes those blocks to userspace. This can result in the blocks not being correctly zeroed after a crash. Because in-memory behaviour is correct, tools like fsx don't pick up any coherency problems - it's not until the filesystem is shutdown or the system crashes after writing the truncate transaction to the journal but before the zeroed data in the page cache is flushed that the issue is exposed. Fix this by also flushing the dirty data in memory region between the old size and new size when we've found blocks that need zeroing in the truncate process. Reported-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com> [bwh: Backported to 3.2: adjust filename, context] Signed-off-by: Ben Hutchings <ben@decadent.org.uk>

xfs: ensure truncate forces zeroed blocks to disk
commit 5885ebda upstream. A new fsync vs power fail test in xfstests indicated that XFS can have unreliable data consistency when doing extending truncates that require block zeroing. The blocks beyond EOF get zeroed in memory, but we never force those changes to disk before we run the transaction that extends the file size and exposes those blocks to userspace. This can result in the blocks not being correctly zeroed after a crash. Because in-memory behaviour is correct, tools like fsx don't pick up any coherency problems - it's not until the filesystem is shutdown or the system crashes after writing the truncate transaction to the journal but before the zeroed data in the page cache is flushed that the issue is exposed. Fix this by also flushing the dirty data in memory region between the old size and new size when we've found blocks that need zeroing in the truncate process. Reported-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com> [bwh: Backported to 3.2: adjust filename, context] Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
50848949 · Dave Chinner · Ben Hutchings · 17f606d8 · 50848949 · 50848949
Commit 50848949 authored Feb 23, 2015 by Dave Chinner Committed by Ben Hutchings May 09, 2015
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 27 deletions

fs/xfs/xfs_file.c fs/xfs/xfs_file.c +10 -4

fs/xfs/xfs_iops.c fs/xfs/xfs_iops.c +14 -22

fs/xfs/xfs_vnodeops.h fs/xfs/xfs_vnodeops.h +2 -1

No files found.
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -516,7 +516,8 @@ STATIC int				/* error (positive) */
 xfs_zero_last_block(
 	xfs_inode_t	*ip,
 	xfs_fsize_t	offset,
-	xfs_fsize_t	isize)
+	xfs_fsize_t	isize,
+	bool		*did_zeroing)
 {
 	xfs_fileoff_t	last_fsb;
 	xfs_mount_t	*mp = ip->i_mount;
@@ -560,6 +561,7 @@ xfs_zero_last_block(
 	zero_len = mp->m_sb.sb_blocksize - zero_offset;
 	if (isize + zero_len > offset)
 		zero_len = offset - isize;
+	*did_zeroing = true;
 	error = xfs_iozero(ip, isize, zero_len);
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -582,7 +584,8 @@ int					/* error (positive) */
 xfs_zero_eof(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,		/* starting I/O offset */
-	xfs_fsize_t	isize)		/* current inode size */
+	xfs_fsize_t	isize,		/* current inode size */
+	bool		*did_zeroing)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	start_zero_fsb;
@@ -602,7 +605,7 @@ xfs_zero_eof(
 	 * First handle zeroing the block on which isize resides.
 	 * We only zero a part of that block so it is handled specially.
 	 */
-	error = xfs_zero_last_block(ip, offset, isize);
+	error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
 	if (error) {
 		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
 		return error;
@@ -672,6 +675,7 @@ xfs_zero_eof(
 			goto out_lock;
 		}
+		*did_zeroing = true;
 		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
 		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
@@ -729,13 +733,15 @@ xfs_file_aio_write_checks(
 	 */
 	if ((ip->i_new_size && *pos > ip->i_new_size) ||
 	    (!ip->i_new_size && *pos > ip->i_size)) {
+		bool	zero = false;
 		if (*iolock == XFS_IOLOCK_SHARED) {
 			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
 			xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 			goto restart;
 		}
-		error = -xfs_zero_eof(ip, *pos, ip->i_size);
+		error = -xfs_zero_eof(ip, *pos, ip->i_size, &zero);
 	}
 	/*

--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -766,6 +766,7 @@ xfs_setattr_size(
 	int			error;
 	uint			lock_flags;
 	uint			commit_flags = 0;
+	bool			did_zeroing = false;
 	trace_xfs_setattr(ip);
@@ -812,20 +813,16 @@ xfs_setattr_size(
 		goto out_unlock;
 	/*
-	 * Now we can make the changes.  Before we join the inode to the
+	 * File data changes must be complete before we start the transaction to
-	 * transaction, take care of the part of the truncation that must be
+	 * modify the inode.  This needs to be done before joining the inode to
-	 * done without the inode lock.  This needs to be done before joining
+	 * the transaction because the inode cannot be unlocked once it is a
-	 * the inode to the transaction, because the inode cannot be unlocked
+	 * part of the transaction.
-	 * once it is a part of the transaction.
+	 *
+	 * Start with zeroing any data block beyond EOF that we may expose on
+	 * file extension.
 	 */
 	if (iattr->ia_size > ip->i_size) {
-		/*
+		error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size, &did_zeroing);
-		 * Do the first part of growing a file: zero any data in the
-		 * last block that is beyond the old EOF.  We need to do this
-		 * before the inode is joined to the transaction to modify
-		 * i_size.
-		 */
-		error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
 		if (error)
 			goto out_unlock;
 	}
@@ -837,23 +834,18 @@ xfs_setattr_size(
 	 * any previous writes that are beyond the on disk EOF and the new
 	 * EOF that have not been written out need to be written here.  If we
 	 * do not write the data out, we expose ourselves to the null files
-	 * problem.
+	 * problem. Note that this includes any block zeroing we did above;
-	 *
+	 * otherwise those blocks may not be zeroed after a crash.
-	 * Only flush from the on disk size to the smaller of the in memory
-	 * file size or the new size as that's the range we really care about
-	 * here and prevents waiting for other data not within the range we
-	 * care about here.
 	 */
-	if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
+	if (iattr->ia_size > ip->i_d.di_size &&
+	    (ip->i_size != ip->i_d.di_size || did_zeroing)) {
 		error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
 					FI_NONE);
 		if (error)
 			goto out_unlock;
 	}
-	/*
+	/* Now wait for all direct I/O to complete. */
-	 * Wait for all direct I/O to complete.
-	 */
 	inode_dio_wait(inode);
 	error = -block_truncate_page(inode->i_mapping, iattr->ia_size,

--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -59,6 +59,7 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
 		xfs_off_t last, uint64_t flags, int fiopt);
 int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int	xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
+		     xfs_fsize_t isize, bool *did_zeroing);
 #endif /* _XFS_VNODEOPS_H */