Commit 78f673ca authored by Trond Myklebust's avatar Trond Myklebust

NFS O_DIRECT: Change the NFS O_DIRECT path so that it

    no longer calls the generic VFS read and write routines.
    This allows all application read requests to pass through
    to the server, instead of just the ones that appear to be
    inside the file. this eliminates the requirement to use a
    GETATTR operation before each read or write to determine
    where the EOF is. This is a significant performance and
    scalability win.

    It also removes all requirements for holding the inode
    semaphore during NFS direct reads and writes, as the read
    and write logic no longer needs atomic access to the size
    of the file. this also helps client CPU scalability by
    reducing the serialization of writes against a single file.
                                                                                
    Patch by Chuck Lever
parent e89b7533
......@@ -32,6 +32,7 @@
* 18 Dec 2001 Initial implementation for 2.4 --cel
* 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
* 08 Jun 2003 Port to 2.5 APIs --cel
* 31 Mar 2004 Handle direct I/O without VFS support --cel
*
*/
......@@ -406,12 +407,6 @@ nfs_direct_write(struct inode *inode, struct file *file,
* file_offset: offset in file to begin the operation
* nr_segs: size of iovec array
*
* Usually a file system implements direct I/O by calling out to
* blockdev_direct_IO. The NFS client doesn't have a backing block
* device, so we do everything by hand instead.
*
* The inode's i_sem is no longer held by the VFS layer before it calls
* this function to do a write.
*/
ssize_t
nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
......@@ -426,11 +421,7 @@ nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* No support for async yet
*/
if (!is_sync_kiocb(iocb))
goto out;
result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (result < 0)
goto out;
return result;
switch (rw) {
case READ:
......@@ -450,8 +441,160 @@ nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
default:
break;
}
return result;
}
/**
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
* @buf: user's buffer into which to read data
* count: number of bytes to read
* pos: byte offset in file where reading starts
*
* We use this function for direct reads instead of calling
* generic_file_aio_read() in order to avoid gfar's check to see if
* the request starts before the end of the file. For that check
* to work, we must generate a GETATTR before each direct read, and
* even then there is a window between the GETATTR and the subsequent
* READ where the file size could change. So our preference is simply
* to do all reads the application wants, and the server will take
* care of managing the end of file boundary.
*
* This function also eliminates unnecessarily updating the file's
* atime locally, as the NFS server sets the file's atime, and this
* client must read the updated atime from the server back into its
* cache.
*/
ssize_t
nfs_file_direct_read(struct kiocb *iocb, char *buf, size_t count, loff_t pos)
{
ssize_t retval = -EINVAL;
loff_t *ppos = &iocb->ki_pos;
struct file *file = iocb->ki_filp;
struct dentry *dentry = file->f_dentry;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct iovec iov = {
.iov_base = buf,
.iov_len = count,
};
dprintk("nfs: direct read(%s/%s, %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (unsigned long) pos);
if (!is_sync_kiocb(iocb))
goto out;
if (count < 0)
goto out;
retval = -EFAULT;
if (!access_ok(VERIFY_WRITE, iov.iov_base, iov.iov_len))
goto out;
retval = 0;
if (!count)
goto out;
if (mapping->nrpages) {
retval = filemap_fdatawrite(mapping);
if (retval == 0)
retval = filemap_fdatawait(mapping);
if (retval)
goto out;
}
retval = nfs_direct_read(inode, file, &iov, pos, 1);
if (retval > 0)
*ppos = pos + retval;
out:
dprintk("NFS: direct_IO result=%zd\n", result);
return result;
return retval;
}
/**
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
* @buf: user's buffer from which to write data
* count: number of bytes to write
* pos: byte offset in file where writing starts
*
* We use this function for direct writes instead of calling
* generic_file_aio_write() in order to avoid taking the inode
* semaphore and updating the i_size. The NFS server will set
* the new i_size and this client must read the updated size
* back into its cache. We let the server do generic write
* parameter checking and report problems.
*
* We also avoid an unnecessary invocation of generic_osync_inode(),
* as it is fairly meaningless to sync the metadata of an NFS file.
*
* We eliminate local atime updates, see direct read above.
*
* We avoid unnecessary page cache invalidations for normal cached
* readers of this file.
*
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
ssize_t
nfs_file_direct_write(struct kiocb *iocb, const char *buf, size_t count, loff_t pos)
{
ssize_t retval = -EINVAL;
loff_t *ppos = &iocb->ki_pos;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
struct file *file = iocb->ki_filp;
struct dentry *dentry = file->f_dentry;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct iovec iov = {
.iov_base = (void __user *)buf,
.iov_len = count,
};
dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
inode->i_ino, (unsigned long) count, (unsigned long) pos);
if (!is_sync_kiocb(iocb))
goto out;
if (count < 0)
goto out;
if (pos < 0)
goto out;
retval = -EFAULT;
if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len))
goto out;
if (file->f_error) {
retval = file->f_error;
file->f_error = 0;
goto out;
}
retval = -EFBIG;
if (limit != RLIM_INFINITY) {
if (pos >= limit) {
send_sig(SIGXFSZ, current, 0);
goto out;
}
if (count > limit - (unsigned long) pos)
count = limit - (unsigned long) pos;
}
retval = 0;
if (!count)
goto out;
if (mapping->nrpages) {
retval = filemap_fdatawrite(mapping);
if (retval == 0)
retval = filemap_fdatawait(mapping);
if (retval)
goto out;
}
retval = nfs_direct_write(inode, file, &iov, pos, 1);
if (mapping->nrpages)
invalidate_inode_pages2(mapping);
if (retval > 0)
*ppos = pos + retval;
out:
return retval;
}
......@@ -154,6 +154,11 @@ nfs_file_read(struct kiocb *iocb, char * buf, size_t count, loff_t pos)
struct inode * inode = dentry->d_inode;
ssize_t result;
#ifdef CONFIG_NFS_DIRECTIO
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_read(iocb, buf, count, pos);
#endif
dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (unsigned long) pos);
......@@ -268,6 +273,11 @@ nfs_file_write(struct kiocb *iocb, const char *buf, size_t count, loff_t pos)
struct inode * inode = dentry->d_inode;
ssize_t result;
#ifdef CONFIG_NFS_DIRECTIO
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_write(iocb, buf, count, pos);
#endif
dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
inode->i_ino, (unsigned long) count, (unsigned long) pos);
......
......@@ -306,6 +306,10 @@ nfs_file_cred(struct file *file)
*/
extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t,
unsigned long);
extern ssize_t nfs_file_direct_read(struct kiocb *iocb, char *buf,
size_t count, loff_t pos);
extern ssize_t nfs_file_direct_write(struct kiocb *iocb, const char *buf,
size_t count, loff_t pos);
/*
* linux/fs/nfs/dir.c
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment