Commit 6b698ede authored by Dave Chinner, committed by Dave Chinner

xfs: add DAX file operations support

Add the initial support for DAX file operations to XFS. This
includes the necessary block allocation and mmap page fault hooks
for DAX to function.

Note that there are changes to the splice interfaces to ensure that, for
DAX, splice avoids direct page cache manipulation and instead takes the
DAX IO paths for read/write operations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
parent ce5c5d55
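
For context, the block allocation and page fault hooks added below are what a
store through an mmap()ed region ends up exercising on a DAX file. A minimal
userspace sketch of that path, assuming an XFS filesystem mounted with the
"dax" option; the mount point and file name (/mnt/dax/testfile) are
illustrative only:

/* Hypothetical demo: a store through mmap() on a DAX-mounted XFS file.
 * The write fault is served by the DAX fault hooks this patch adds, which
 * allocate blocks and map storage directly, bypassing the page cache. */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/dax/testfile", O_CREAT | O_RDWR, 0644);
	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	memcpy(p, "hello, dax", 10);	/* write fault -> DAX block allocation */
	msync(p, 4096, MS_SYNC);	/* flush the mapping */

	munmap(p, 4096);
	close(fd);
	return 0;
}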
@@ -1349,7 +1349,7 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	int			direct)
+	bool			direct)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1414,6 +1414,7 @@ __xfs_get_blocks(
 		if (error)
 			return error;
 		new = 1;
+
 	} else {
 		/*
 		 * Delalloc reservations do not require a transaction,
@@ -1508,49 +1509,29 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
-STATIC int
+int
 xfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-	struct kiocb		*iocb,
+static void
+__xfs_end_io_direct_write(
+	struct inode		*inode,
+	struct xfs_ioend	*ioend,
 	loff_t			offset,
-	ssize_t			size,
-	void			*private)
+	ssize_t			size)
 {
-	struct inode		*inode = file_inode(iocb->ki_filp);
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ioend	*ioend = private;
-
-	trace_xfs_gbmap_direct_endio(ip, offset, size,
-				     ioend ? ioend->io_type : 0, NULL);
-
-	if (!ioend) {
-		ASSERT(offset + size <= i_size_read(inode));
-		return;
-	}
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
 		goto out_end_io;
 
 	/*
@@ -1587,10 +1568,10 @@ xfs_end_io_direct_write(
 	 * here can result in EOF moving backwards and Bad Things Happen when
 	 * that occurs.
 	 */
-	spin_lock(&ip->i_flags_lock);
+	spin_lock(&XFS_I(inode)->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
-	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
 
 	/*
 	 * If we are doing an append IO that needs to update the EOF on disk,
@@ -1607,6 +1588,75 @@ xfs_end_io_direct_write(
 	return;
 }
 
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_ioend	*ioend = private;
+
+	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return;
+	}
+
+	__xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	struct xfs_ioend	*ioend = bh->b_private;
+	struct inode		*inode = ioend->io_inode;
+	ssize_t			size = ioend->io_size;
+
+	ASSERT(IS_DAX(ioend->io_inode));
+
+	/* if there was an error zeroing, then don't convert it */
+	if (!uptodate)
+		ioend->io_error = -EIO;
+
+	/*
+	 * Trim update to EOF, so we don't extend EOF during unwritten extent
+	 * conversion of partial EOF blocks.
+	 */
+	spin_lock(&XFS_I(inode)->i_flags_lock);
+	if (ioend->io_offset + size > i_size_read(inode))
+		size = i_size_read(inode) - ioend->io_offset;
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
 STATIC ssize_t
 xfs_vm_direct_IO(
 	struct kiocb		*iocb,
...
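
To make the EOF trim in xfs_end_io_dax_write() above concrete, here is a
worked example with assumed numbers (4096-byte filesystem blocks, a 6144-byte
file); the snippet is illustrative arithmetic, not kernel code:

#include <stdio.h>

int main(void)
{
	/* Assumed scenario: a DAX write fault allocates and zeroes the whole
	 * unwritten block straddling EOF, so the ioend covers [4096, 8192). */
	long long isize = 6144;		/* i_size_read(inode): current EOF */
	long long io_offset = 4096;	/* ioend->io_offset */
	long long size = 4096;		/* ioend->io_size */

	if (io_offset + size > isize)	/* 8192 > 6144: extends past EOF */
		size = isize - io_offset;

	/* Prints 2048: only bytes up to EOF are converted, so this
	 * completion path can never move EOF forwards. */
	printf("converted size = %lld\n", size);
	return 0;
}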
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int	xfs_get_blocks(struct inode *inode, sector_t offset,
+		       struct buffer_head *map_bh, int create);
+int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+			      struct buffer_head *map_bh, int create);
+void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
...
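
The header now exports xfs_get_blocks_direct() and xfs_end_io_dax_write()
with full prototypes because the DAX fault path calls them through the
generic get_block_t/buffer_head interface. A schematic of that callback
contract follows; the function name is illustrative and this is not XFS
code:

#include <linux/buffer_head.h>
#include <linux/fs.h>

/* Schematic get_block_t callback: on entry bh_result->b_size holds the
 * length the caller wants mapped; the callback maps file block 'iblock'
 * and records the result in bh_result. */
static int demo_get_blocks(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create)
{
	/* look up (or, when create != 0, allocate) the backing extent here */

	bh_result->b_bdev = inode->i_sb->s_bdev; /* device holding the block */
	bh_result->b_blocknr = iblock;		 /* mapped disk block (demo) */
	set_buffer_mapped(bh_result);
	if (create)
		set_buffer_new(bh_result);	/* freshly allocated blocks */
	return 0;
}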
@@ -284,7 +284,7 @@ xfs_file_read_iter(
 	if (file->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
 
-	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -378,7 +378,11 @@ xfs_file_splice_read(
 
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+	/* for dax, we need to avoid the page cache */
+	if (IS_DAX(VFS_I(ip)))
+		ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+	else
+		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
@@ -672,7 +676,7 @@ xfs_file_dio_aio_write(
 			mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if ((pos | count) & target->bt_logical_sectormask)
+	if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
@@ -758,8 +762,11 @@ xfs_file_dio_aio_write(
 out:
 	xfs_rw_iunlock(ip, iolock);
 
-	/* No fallback to buffered IO on errors for XFS. */
-	ASSERT(ret < 0 || ret == count);
+	/*
+	 * No fallback to buffered IO on errors for XFS. DAX can result in
+	 * partial writes, but direct IO will either complete fully or fail.
+	 */
+	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
 	return ret;
 }
@@ -842,7 +849,7 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1063,17 +1070,6 @@ xfs_file_readdir(
 	return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-	struct file	*filp,
-	struct vm_area_struct *vma)
-{
-	vma->vm_ops = &xfs_file_vm_ops;
-	file_accessed(filp);
-	return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1454,26 +1450,11 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
  */
-STATIC int
-xfs_filemap_fault(
-	struct vm_area_struct	*vma,
-	struct vm_fault		*vmf)
-{
-	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
-	int			error;
-
-	trace_xfs_filemap_fault(ip);
-
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = filemap_fault(vma, vmf);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-
-	return error;
-}
 
 /*
  * mmap()d file has taken write protection fault and is being made writable. We
@@ -1486,21 +1467,66 @@ xfs_filemap_page_mkwrite(
 	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	struct inode		*inode = file_inode(vma->vm_file);
 	int			ret;
 
-	trace_xfs_filemap_page_mkwrite(ip);
+	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
-	sb_start_pagefault(VFS_I(ip)->i_sb);
+	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+	if (IS_DAX(inode)) {
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+				    xfs_end_io_dax_write);
+	} else {
+		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = block_page_mkwrite_return(ret);
+	}
+
+	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	sb_end_pagefault(inode->i_sb);
+
+	return ret;
+}
+
+STATIC int
+xfs_filemap_fault(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(vma->vm_file));
+	int			ret;
 
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	trace_xfs_filemap_fault(ip);
+
+	/* DAX can shortcut the normal fault path on write faults! */
+	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+		return xfs_filemap_page_mkwrite(vma, vmf);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	ret = filemap_fault(vma, vmf);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(VFS_I(ip)->i_sb);
 
-	return block_page_mkwrite_return(ret);
+	return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+	.fault		= xfs_filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+	struct file	*filp,
+	struct vm_area_struct *vma)
+{
+	file_accessed(filp);
+	vma->vm_ops = &xfs_file_vm_ops;
+	if (IS_DAX(file_inode(filp)))
+		vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
+}
 
 const struct file_operations xfs_file_operations = {
@@ -1531,9 +1557,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
 	.fsync		= xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= xfs_filemap_fault,
-	.map_pages	= filemap_map_pages,
-	.page_mkwrite	= xfs_filemap_page_mkwrite,
-};
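
Finally, a userspace sketch that exercises the splice change above, assuming
a file at the hypothetical path /mnt/dax/testfile. With this patch XFS routes
the request through default_file_splice_read(), which copies data via the DAX
read path rather than linking page cache pages into the pipe:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int fd = open("/mnt/dax/testfile", O_RDONLY); /* assumed path */

	if (fd < 0 || pipe(pipefd) < 0)
		return 1;

	/* move up to 4096 bytes from the file into the pipe */
	ssize_t n = splice(fd, NULL, pipefd[1], NULL, 4096, 0);

	close(pipefd[0]);
	close(pipefd[1]);
	close(fd);
	return n < 0;
}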