Commit eed4e51f authored by Badari Pulavarty, committed by Linus Torvalds

[PATCH] Add vector AIO support

This work was initially done by Zach Brown to add support for vectored aio.
These are the core changes for AIO to support
IOCB_CMD_PREADV/IOCB_CMD_PWRITEV.

[akpm@osdl.org: huge build fix]
Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 543ade1f
...@@ -415,6 +415,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx) ...@@ -415,6 +415,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
req->ki_retry = NULL; req->ki_retry = NULL;
req->ki_dtor = NULL; req->ki_dtor = NULL;
req->private = NULL; req->private = NULL;
req->ki_iovec = NULL;
INIT_LIST_HEAD(&req->ki_run_list); INIT_LIST_HEAD(&req->ki_run_list);
/* Check if the completion queue has enough free space to /* Check if the completion queue has enough free space to
...@@ -460,6 +461,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) ...@@ -460,6 +461,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
if (req->ki_dtor) if (req->ki_dtor)
req->ki_dtor(req); req->ki_dtor(req);
if (req->ki_iovec != &req->ki_inline_vec)
kfree(req->ki_iovec);
kmem_cache_free(kiocb_cachep, req); kmem_cache_free(kiocb_cachep, req);
ctx->reqs_active--; ctx->reqs_active--;
...@@ -1301,69 +1304,63 @@ asmlinkage long sys_io_destroy(aio_context_t ctx) ...@@ -1301,69 +1304,63 @@ asmlinkage long sys_io_destroy(aio_context_t ctx)
return -EINVAL; return -EINVAL;
} }
/* static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
* aio_p{read,write} are the default ki_retry methods for
* IO_CMD_P{READ,WRITE}. They maintains kiocb retry state around potentially
* multiple calls to f_op->aio_read(). They loop around partial progress
* instead of returning -EIOCBRETRY because they don't have the means to call
* kick_iocb().
*/
static ssize_t aio_pread(struct kiocb *iocb)
{ {
struct file *file = iocb->ki_filp; struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host; BUG_ON(ret <= 0);
ssize_t ret = 0;
while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
do { ssize_t this = min((ssize_t)iov->iov_len, ret);
iocb->ki_inline_vec.iov_base = iocb->ki_buf; iov->iov_base += this;
iocb->ki_inline_vec.iov_len = iocb->ki_left; iov->iov_len -= this;
iocb->ki_left -= this;
ret = file->f_op->aio_read(iocb, &iocb->ki_inline_vec, ret -= this;
1, iocb->ki_pos); if (iov->iov_len == 0) {
/* iocb->ki_cur_seg++;
* Can't just depend on iocb->ki_left to determine iov++;
* whether we are done. This may have been a short read. }
*/
if (ret > 0) {
iocb->ki_buf += ret;
iocb->ki_left -= ret;
} }
/* /* the caller should not have done more io than what fit in
* For pipes and sockets we return once we have some data; for * the remaining iovecs */
* regular files we retry till we complete the entire read or BUG_ON(ret > 0 && iocb->ki_left == 0);
* find that we can't read any more data (e.g short reads).
*/
} while (ret > 0 && iocb->ki_left > 0 &&
!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));
/* This means we must have transferred all that we could */
/* No need to retry anymore */
if ((ret == 0) || (iocb->ki_left == 0))
ret = iocb->ki_nbytes - iocb->ki_left;
return ret;
} }
/* see aio_pread() */ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
static ssize_t aio_pwrite(struct kiocb *iocb)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
ssize_t ret = 0; ssize_t ret = 0;
unsigned short opcode;
if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
(iocb->ki_opcode == IOCB_CMD_PREAD)) {
rw_op = file->f_op->aio_read;
opcode = IOCB_CMD_PREADV;
} else {
rw_op = file->f_op->aio_write;
opcode = IOCB_CMD_PWRITEV;
}
do { do {
iocb->ki_inline_vec.iov_base = iocb->ki_buf; ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
iocb->ki_inline_vec.iov_len = iocb->ki_left; iocb->ki_nr_segs - iocb->ki_cur_seg,
iocb->ki_pos);
ret = file->f_op->aio_write(iocb, &iocb->ki_inline_vec, if (ret > 0)
1, iocb->ki_pos); aio_advance_iovec(iocb, ret);
if (ret > 0) {
iocb->ki_buf += ret; /* retry all partial writes. retry partial reads as long as its a
iocb->ki_left -= ret; * regular file. */
} } while (ret > 0 && iocb->ki_left > 0 &&
} while (ret > 0 && iocb->ki_left > 0); (opcode == IOCB_CMD_PWRITEV ||
(!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
/* This means we must have transferred all that we could */
/* No need to retry anymore */
if ((ret == 0) || (iocb->ki_left == 0)) if ((ret == 0) || (iocb->ki_left == 0))
ret = iocb->ki_nbytes - iocb->ki_left; ret = iocb->ki_nbytes - iocb->ki_left;
...@@ -1390,6 +1387,38 @@ static ssize_t aio_fsync(struct kiocb *iocb) ...@@ -1390,6 +1387,38 @@ static ssize_t aio_fsync(struct kiocb *iocb)
return ret; return ret;
} }
/*
 * aio_setup_vectored_rw:
 *	Prepare a kiocb for a vectored request.  On entry ki_buf holds the
 *	user pointer to the iovec array and ki_nbytes holds the number of
 *	segments.  The array is copied in and checked by
 *	rw_copy_check_uvector(), which is handed ki_inline_vec as its
 *	one-entry fast buffer and stores the resulting array in ki_iovec.
 *
 *	Returns 0 on success, or the negative value returned by
 *	rw_copy_check_uvector().
 */
static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
{
	ssize_t total;

	total = rw_copy_check_uvector(type,
				      (struct iovec __user *)kiocb->ki_buf,
				      kiocb->ki_nbytes, 1,
				      &kiocb->ki_inline_vec, &kiocb->ki_iovec);
	if (total < 0)
		return total;

	/* ki_nbytes still holds the segment count here; save it first */
	kiocb->ki_cur_seg = 0;
	kiocb->ki_nr_segs = kiocb->ki_nbytes;

	/* from now on ki_nbytes/ki_left count bytes rather than segments */
	kiocb->ki_left = total;
	kiocb->ki_nbytes = total;

	return 0;
}
/*
 * aio_setup_single_vector:
 *	Describe a plain (non-vectored) request as a one-segment iovec so it
 *	can share the vectored retry path.  The kiocb's inline vector is
 *	filled from ki_buf/ki_left and ki_iovec is pointed at it.
 *
 *	Always returns 0.
 */
static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
{
	/* the single segment covers the whole buffer */
	kiocb->ki_inline_vec.iov_base = kiocb->ki_buf;
	kiocb->ki_inline_vec.iov_len = kiocb->ki_left;
	kiocb->ki_iovec = &kiocb->ki_inline_vec;

	kiocb->ki_cur_seg = 0;
	kiocb->ki_nr_segs = 1;
	kiocb->ki_nbytes = kiocb->ki_left;

	return 0;
}
/* /*
* aio_setup_iocb: * aio_setup_iocb:
* Performs the initial checks and aio retry method * Performs the initial checks and aio retry method
...@@ -1412,9 +1441,12 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) ...@@ -1412,9 +1441,12 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
ret = security_file_permission(file, MAY_READ); ret = security_file_permission(file, MAY_READ);
if (unlikely(ret)) if (unlikely(ret))
break; break;
ret = aio_setup_single_vector(kiocb);
if (ret)
break;
ret = -EINVAL; ret = -EINVAL;
if (file->f_op->aio_read) if (file->f_op->aio_read)
kiocb->ki_retry = aio_pread; kiocb->ki_retry = aio_rw_vect_retry;
break; break;
case IOCB_CMD_PWRITE: case IOCB_CMD_PWRITE:
ret = -EBADF; ret = -EBADF;
...@@ -1427,9 +1459,40 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) ...@@ -1427,9 +1459,40 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
ret = security_file_permission(file, MAY_WRITE); ret = security_file_permission(file, MAY_WRITE);
if (unlikely(ret)) if (unlikely(ret))
break; break;
ret = aio_setup_single_vector(kiocb);
if (ret)
break;
ret = -EINVAL;
if (file->f_op->aio_write)
kiocb->ki_retry = aio_rw_vect_retry;
break;
case IOCB_CMD_PREADV:
ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
break;
ret = security_file_permission(file, MAY_READ);
if (unlikely(ret))
break;
ret = aio_setup_vectored_rw(READ, kiocb);
if (ret)
break;
ret = -EINVAL;
if (file->f_op->aio_read)
kiocb->ki_retry = aio_rw_vect_retry;
break;
case IOCB_CMD_PWRITEV:
ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
break;
ret = security_file_permission(file, MAY_WRITE);
if (unlikely(ret))
break;
ret = aio_setup_vectored_rw(WRITE, kiocb);
if (ret)
break;
ret = -EINVAL; ret = -EINVAL;
if (file->f_op->aio_write) if (file->f_op->aio_write)
kiocb->ki_retry = aio_pwrite; kiocb->ki_retry = aio_rw_vect_retry;
break; break;
case IOCB_CMD_FDSYNC: case IOCB_CMD_FDSYNC:
ret = -EINVAL; ret = -EINVAL;
......
...@@ -511,72 +511,96 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, ...@@ -511,72 +511,96 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
/* A write operation does a read from user space and vice versa */ /* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
static ssize_t do_readv_writev(int type, struct file *file, ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs,
unsigned long nr_segs, loff_t *pos) struct iovec *fast_pointer,
{ struct iovec **ret_pointer)
size_t tot_len; {
struct iovec iovstack[UIO_FASTIOV]; unsigned long seg;
struct iovec *iov = iovstack;
ssize_t ret; ssize_t ret;
int seg; struct iovec *iov = fast_pointer;
io_fn_t fn;
iov_fn_t fnv;
/* /*
* SuS says "The readv() function *may* fail if the iovcnt argument * SuS says "The readv() function *may* fail if the iovcnt argument
* was less than or equal to 0, or greater than {IOV_MAX}. Linux has * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
* traditionally returned zero for zero segments, so... * traditionally returned zero for zero segments, so...
*/ */
if (nr_segs == 0) {
ret = 0; ret = 0;
if (nr_segs == 0)
goto out; goto out;
}
/* /*
* First get the "struct iovec" from user memory and * First get the "struct iovec" from user memory and
* verify all the pointers * verify all the pointers
*/ */
if (nr_segs > UIO_MAXIOV) {
ret = -EINVAL; ret = -EINVAL;
if (nr_segs > UIO_MAXIOV)
goto out;
if (!file->f_op)
goto out; goto out;
if (nr_segs > UIO_FASTIOV) { }
ret = -ENOMEM; if (nr_segs > fast_segs) {
iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
if (!iov) if (iov == NULL) {
ret = -ENOMEM;
goto out; goto out;
} }
}
if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
ret = -EFAULT; ret = -EFAULT;
if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector)))
goto out; goto out;
}
/* /*
* Single unix specification: * According to the Single Unix Specification we should return EINVAL
* We should -EINVAL if an element length is not >= 0 and fitting an * if an element length is < 0 when cast to ssize_t or if the
* ssize_t. The total length is fitting an ssize_t * total length would overflow the ssize_t return value of the
* * system call.
* Be careful here because iov_len is a size_t not an ssize_t
*/ */
tot_len = 0; ret = 0;
ret = -EINVAL;
for (seg = 0; seg < nr_segs; seg++) { for (seg = 0; seg < nr_segs; seg++) {
void __user *buf = iov[seg].iov_base; void __user *buf = iov[seg].iov_base;
ssize_t len = (ssize_t)iov[seg].iov_len; ssize_t len = (ssize_t)iov[seg].iov_len;
if (len < 0) /* size_t not fitting an ssize_t .. */ /* see if we we're about to use an invalid len or if
* it's about to overflow ssize_t */
if (len < 0 || (ret + len < ret)) {
ret = -EINVAL;
goto out; goto out;
if (unlikely(!access_ok(vrfy_dir(type), buf, len))) }
goto Efault; if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
tot_len += len; ret = -EFAULT;
if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */
goto out; goto out;
} }
if (tot_len == 0) {
ret = 0; ret += len;
}
out:
*ret_pointer = iov;
return ret;
}
static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec __user * uvector,
unsigned long nr_segs, loff_t *pos)
{
size_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
ssize_t ret;
io_fn_t fn;
iov_fn_t fnv;
if (!file->f_op) {
ret = -EINVAL;
goto out; goto out;
} }
ret = rw_copy_check_uvector(type, uvector, nr_segs,
ARRAY_SIZE(iovstack), iovstack, &iov);
if (ret <= 0)
goto out;
tot_len = ret;
ret = rw_verify_area(type, file, pos, tot_len); ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0) if (ret < 0)
goto out; goto out;
...@@ -609,9 +633,6 @@ static ssize_t do_readv_writev(int type, struct file *file, ...@@ -609,9 +633,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
fsnotify_modify(file->f_dentry); fsnotify_modify(file->f_dentry);
} }
return ret; return ret;
Efault:
ret = -EFAULT;
goto out;
} }
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <linux/uio.h> #include <linux/uio.h>
#include <asm/atomic.h> #include <asm/atomic.h>
#include <linux/uio.h>
#define AIO_MAXSEGS 4 #define AIO_MAXSEGS 4
#define AIO_KIOGRP_NR_ATOMIC 8 #define AIO_KIOGRP_NR_ATOMIC 8
...@@ -114,6 +115,9 @@ struct kiocb { ...@@ -114,6 +115,9 @@ struct kiocb {
long ki_kicked; /* just for testing */ long ki_kicked; /* just for testing */
long ki_queued; /* just for testing */ long ki_queued; /* just for testing */
struct iovec ki_inline_vec; /* inline vector */ struct iovec ki_inline_vec; /* inline vector */
struct iovec *ki_iovec;
unsigned long ki_nr_segs;
unsigned long ki_cur_seg;
struct list_head ki_list; /* the aio core uses this struct list_head ki_list; /* the aio core uses this
* for cancellation */ * for cancellation */
......
...@@ -41,6 +41,8 @@ enum { ...@@ -41,6 +41,8 @@ enum {
* IOCB_CMD_POLL = 5, * IOCB_CMD_POLL = 5,
*/ */
IOCB_CMD_NOOP = 6, IOCB_CMD_NOOP = 6,
IOCB_CMD_PREADV = 7,
IOCB_CMD_PWRITEV = 8,
}; };
/* read() from /dev/aio returns these structures. */ /* read() from /dev/aio returns these structures. */
......
...@@ -1150,6 +1150,11 @@ struct inode_operations { ...@@ -1150,6 +1150,11 @@ struct inode_operations {
struct seq_file; struct seq_file;
/*
 * Copy an iovec array of nr_segs entries from user space and validate it.
 *
 * fast_pointer is a caller-supplied array of fast_segs entries that is used
 * when nr_segs fits in it; otherwise the array is kmalloc()ed.  The array
 * actually used is stored in *ret_pointer on every return path (NULL if the
 * allocation failed), so a caller can compare it with fast_pointer to tell
 * whether it has something to kfree().
 *
 * Returns the summed length of all segments, 0 when nr_segs is 0, or a
 * negative errno (-EINVAL, -ENOMEM, -EFAULT).
 */
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
unsigned long nr_segs, unsigned long fast_segs,
struct iovec *fast_pointer,
struct iovec **ret_pointer);
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *, extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment