Commit 0ee7a3f6 authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Dave Chinner

xfs: don't take the IOLOCK exclusive for direct I/O page invalidation

XFS historically took the iolock exclusive when invalidating pages
before direct I/O operations to protect against writeback starvations.

But this writeback starvation issues has been fixed a long time ago
in the core writeback code, and all other file systems manage to do
without the exclusive lock.  Convert XFS over to avoid the exclusive
lock in this case, and also move to range invalidations like done
by the other file systems.
Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Reviewed-by: default avatarJan Kara <jack@suse.cz>
Reviewed-by: default avatarCarlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: default avatarDave Chinner <david@fromorbit.com>
parent f1b8243c
...@@ -249,6 +249,7 @@ xfs_file_dio_aio_read( ...@@ -249,6 +249,7 @@ xfs_file_dio_aio_read(
struct xfs_inode *ip = XFS_I(inode); struct xfs_inode *ip = XFS_I(inode);
loff_t isize = i_size_read(inode); loff_t isize = i_size_read(inode);
size_t count = iov_iter_count(to); size_t count = iov_iter_count(to);
loff_t end = iocb->ki_pos + count - 1;
struct iov_iter data; struct iov_iter data;
struct xfs_buftarg *target; struct xfs_buftarg *target;
ssize_t ret = 0; ssize_t ret = 0;
...@@ -272,50 +273,22 @@ xfs_file_dio_aio_read( ...@@ -272,50 +273,22 @@ xfs_file_dio_aio_read(
file_accessed(iocb->ki_filp); file_accessed(iocb->ki_filp);
/*
* Locking is a bit tricky here. If we take an exclusive lock for direct
* IO, we effectively serialise all new concurrent read IO to this file
* and block it behind IO that is currently in progress because IO in
* progress holds the IO lock shared. We only need to hold the lock
* exclusive to blow away the page cache, so only take lock exclusively
* if the page cache needs invalidation. This allows the normal direct
* IO case of no page cache pages to proceeed concurrently without
* serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
if (mapping->nrpages) { if (mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (ret)
goto out_unlock;
/*
* The generic dio code only flushes the range of the particular
* I/O. Because we take an exclusive lock here, this whole
* sequence is considerably more expensive for us. This has a
* noticeable performance impact for any file with cached pages,
* even when outside of the range of the particular I/O.
*
* Hence, amortize the cost of the lock against a full file
* flush and reduce the chances of repeated iolock cycles going
* forward.
*/
if (mapping->nrpages) {
ret = filemap_write_and_wait(mapping);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
/* /*
* Invalidate whole pages. This can return an error if * Invalidate whole pages. This can return an error if we fail
* we fail to invalidate a page, but this should never * to invalidate a page, but this should never happen on XFS.
* happen on XFS. Warn if it does fail. * Warn if it does fail.
*/ */
ret = invalidate_inode_pages2(mapping); ret = invalidate_inode_pages2_range(mapping,
iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
WARN_ON_ONCE(ret); WARN_ON_ONCE(ret);
ret = 0; ret = 0;
} }
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
data = *to; data = *to;
ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
...@@ -324,8 +297,9 @@ xfs_file_dio_aio_read( ...@@ -324,8 +297,9 @@ xfs_file_dio_aio_read(
iocb->ki_pos += ret; iocb->ki_pos += ret;
iov_iter_advance(to, ret); iov_iter_advance(to, ret);
} }
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
out_unlock:
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret; return ret;
} }
...@@ -570,34 +544,22 @@ xfs_file_dio_aio_write( ...@@ -570,34 +544,22 @@ xfs_file_dio_aio_write(
if ((iocb->ki_pos | count) & target->bt_logical_sectormask) if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL; return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
if ((iocb->ki_pos & mp->m_blockmask) ||
((iocb->ki_pos + count) & mp->m_blockmask))
unaligned_io = 1;
/* /*
* We don't need to take an exclusive lock unless there page cache needs * Don't take the exclusive iolock here unless the I/O is unaligned to
* to be invalidated or unaligned IO is being executed. We don't need to * the file system block size. We don't need to consider the EOF
* consider the EOF extension case here because * extension case here because xfs_file_aio_write_checks() will relock
* xfs_file_aio_write_checks() will relock the inode as necessary for * the inode as necessary for EOF zeroing cases and fill out the new
* EOF zeroing cases and fill out the new inode size as appropriate. * inode size as appropriate.
*/ */
if (unaligned_io || mapping->nrpages) if ((iocb->ki_pos & mp->m_blockmask) ||
((iocb->ki_pos + count) & mp->m_blockmask)) {
unaligned_io = 1;
iolock = XFS_IOLOCK_EXCL; iolock = XFS_IOLOCK_EXCL;
else } else {
iolock = XFS_IOLOCK_SHARED; iolock = XFS_IOLOCK_SHARED;
xfs_rw_ilock(ip, iolock); }
/*
* Recheck if there are cached pages that need invalidate after we got
* the iolock to protect against other threads adding new pages while
* we were waiting for the iolock.
*/
if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, iolock);
iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, iolock); xfs_rw_ilock(ip, iolock);
}
ret = xfs_file_aio_write_checks(iocb, from, &iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret) if (ret)
...@@ -605,26 +567,26 @@ xfs_file_dio_aio_write( ...@@ -605,26 +567,26 @@ xfs_file_dio_aio_write(
count = iov_iter_count(from); count = iov_iter_count(from);
end = iocb->ki_pos + count - 1; end = iocb->ki_pos + count - 1;
/*
* See xfs_file_dio_aio_read() for why we do a full-file flush here.
*/
if (mapping->nrpages) { if (mapping->nrpages) {
ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
if (ret) if (ret)
goto out; goto out;
/* /*
* Invalidate whole pages. This can return an error if we fail * Invalidate whole pages. This can return an error if we fail
* to invalidate a page, but this should never happen on XFS. * to invalidate a page, but this should never happen on XFS.
* Warn if it does fail. * Warn if it does fail.
*/ */
ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); ret = invalidate_inode_pages2_range(mapping,
iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
WARN_ON_ONCE(ret); WARN_ON_ONCE(ret);
ret = 0; ret = 0;
} }
/* /*
* If we are doing unaligned IO, wait for all other IO to drain, * If we are doing unaligned IO, wait for all other IO to drain,
* otherwise demote the lock if we had to flush cached pages * otherwise demote the lock if we had to take the exclusive lock
* for other reasons in xfs_file_aio_write_checks.
*/ */
if (unaligned_io) if (unaligned_io)
inode_dio_wait(inode); inode_dio_wait(inode);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment