Commit 3a93daea authored by Christian Brauner's avatar Christian Brauner

Merge branch 'read_iter' of git://git.kernel.dk/linux

Pull read_iter updates from Jens Axboe:

There are still a few users of fops->read() in the core parts of the
fs stack. Which is a shame, since it'd be nice to get rid of the
non-iterator parts of down the line, and reclaim that part of the
file_operations struct.

Outside of moving in that direction as a cleanup, using ->read_iter()
enables us to mark them with FMODE_NOWAIT. This is important for users
like io_uring, where per-IO nonblocking hints make a difference in how
efficiently IO can be done.

Those two things are my main motivation for starting this work, with
hopefully more to come down the line.

All patches have been booted and tested, and the corresponding test
cases from ltp have been run.

* 'read_iter' of git://git.kernel.dk/linux: (4 commits)
  signalfd: convert to ->read_iter()
  userfaultfd: convert to ->read_iter()
  timerfd: convert to ->read_iter()
  new helper: copy_to_iter_full()
Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parents fec50db7 fbe38120
...@@ -68,8 +68,7 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait) ...@@ -68,8 +68,7 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait)
/* /*
* Copied from copy_siginfo_to_user() in kernel/signal.c * Copied from copy_siginfo_to_user() in kernel/signal.c
*/ */
static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, static int signalfd_copyinfo(struct iov_iter *to, kernel_siginfo_t const *kinfo)
kernel_siginfo_t const *kinfo)
{ {
struct signalfd_siginfo new; struct signalfd_siginfo new;
...@@ -146,10 +145,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, ...@@ -146,10 +145,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
break; break;
} }
if (copy_to_user(uinfo, &new, sizeof(struct signalfd_siginfo))) if (!copy_to_iter_full(&new, sizeof(struct signalfd_siginfo), to))
return -EFAULT; return -EFAULT;
return sizeof(*uinfo); return sizeof(struct signalfd_siginfo);
} }
static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info, static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info,
...@@ -199,28 +198,27 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info ...@@ -199,28 +198,27 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
* error code. The "count" parameter must be at least the size of a * error code. The "count" parameter must be at least the size of a
* "struct signalfd_siginfo". * "struct signalfd_siginfo".
*/ */
static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, static ssize_t signalfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
loff_t *ppos)
{ {
struct file *file = iocb->ki_filp;
struct signalfd_ctx *ctx = file->private_data; struct signalfd_ctx *ctx = file->private_data;
struct signalfd_siginfo __user *siginfo; size_t count = iov_iter_count(to);
int nonblock = file->f_flags & O_NONBLOCK;
ssize_t ret, total = 0; ssize_t ret, total = 0;
kernel_siginfo_t info; kernel_siginfo_t info;
bool nonblock;
count /= sizeof(struct signalfd_siginfo); count /= sizeof(struct signalfd_siginfo);
if (!count) if (!count)
return -EINVAL; return -EINVAL;
siginfo = (struct signalfd_siginfo __user *) buf; nonblock = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT;
do { do {
ret = signalfd_dequeue(ctx, &info, nonblock); ret = signalfd_dequeue(ctx, &info, nonblock);
if (unlikely(ret <= 0)) if (unlikely(ret <= 0))
break; break;
ret = signalfd_copyinfo(siginfo, &info); ret = signalfd_copyinfo(to, &info);
if (ret < 0) if (ret < 0)
break; break;
siginfo++;
total += ret; total += ret;
nonblock = 1; nonblock = 1;
} while (--count); } while (--count);
...@@ -246,7 +244,7 @@ static const struct file_operations signalfd_fops = { ...@@ -246,7 +244,7 @@ static const struct file_operations signalfd_fops = {
#endif #endif
.release = signalfd_release, .release = signalfd_release,
.poll = signalfd_poll, .poll = signalfd_poll,
.read = signalfd_read, .read_iter = signalfd_read_iter,
.llseek = noop_llseek, .llseek = noop_llseek,
}; };
...@@ -265,20 +263,34 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) ...@@ -265,20 +263,34 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
signotset(mask); signotset(mask);
if (ufd == -1) { if (ufd == -1) {
struct file *file;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx) if (!ctx)
return -ENOMEM; return -ENOMEM;
ctx->sigmask = *mask; ctx->sigmask = *mask;
ufd = get_unused_fd_flags(flags & O_CLOEXEC);
if (ufd < 0) {
kfree(ctx);
return ufd;
}
file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx,
O_RDWR | (flags & O_NONBLOCK));
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return ufd;
}
file->f_mode |= FMODE_NOWAIT;
/* /*
* When we call this, the initialization must be complete, since * When we call this, the initialization must be complete, since
* anon_inode_getfd() will install the fd. * anon_inode_getfd() will install the fd.
*/ */
ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, fd_install(ufd, file);
O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
if (ufd < 0)
kfree(ctx);
} else { } else {
struct fd f = fdget(ufd); struct fd f = fdget(ufd);
if (!f.file) if (!f.file)
......
...@@ -262,17 +262,18 @@ static __poll_t timerfd_poll(struct file *file, poll_table *wait) ...@@ -262,17 +262,18 @@ static __poll_t timerfd_poll(struct file *file, poll_table *wait)
return events; return events;
} }
static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, static ssize_t timerfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
loff_t *ppos)
{ {
struct file *file = iocb->ki_filp;
struct timerfd_ctx *ctx = file->private_data; struct timerfd_ctx *ctx = file->private_data;
ssize_t res; ssize_t res;
u64 ticks = 0; u64 ticks = 0;
if (count < sizeof(ticks)) if (iov_iter_count(to) < sizeof(ticks))
return -EINVAL; return -EINVAL;
spin_lock_irq(&ctx->wqh.lock); spin_lock_irq(&ctx->wqh.lock);
if (file->f_flags & O_NONBLOCK) if (file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT)
res = -EAGAIN; res = -EAGAIN;
else else
res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
...@@ -312,8 +313,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, ...@@ -312,8 +313,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
ctx->ticks = 0; ctx->ticks = 0;
} }
spin_unlock_irq(&ctx->wqh.lock); spin_unlock_irq(&ctx->wqh.lock);
if (ticks) if (ticks) {
res = put_user(ticks, (u64 __user *) buf) ? -EFAULT: sizeof(ticks); res = copy_to_iter(&ticks, sizeof(ticks), to);
if (!res)
res = -EFAULT;
}
return res; return res;
} }
...@@ -384,7 +388,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg ...@@ -384,7 +388,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
static const struct file_operations timerfd_fops = { static const struct file_operations timerfd_fops = {
.release = timerfd_release, .release = timerfd_release,
.poll = timerfd_poll, .poll = timerfd_poll,
.read = timerfd_read, .read_iter = timerfd_read_iter,
.llseek = noop_llseek, .llseek = noop_llseek,
.show_fdinfo = timerfd_show, .show_fdinfo = timerfd_show,
.unlocked_ioctl = timerfd_ioctl, .unlocked_ioctl = timerfd_ioctl,
...@@ -407,6 +411,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ...@@ -407,6 +411,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
{ {
int ufd; int ufd;
struct timerfd_ctx *ctx; struct timerfd_ctx *ctx;
struct file *file;
/* Check the TFD_* constants for consistency. */ /* Check the TFD_* constants for consistency. */
BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
...@@ -443,11 +448,22 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ...@@ -443,11 +448,22 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
ctx->moffs = ktime_mono_to_real(0); ctx->moffs = ktime_mono_to_real(0);
ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, ufd = get_unused_fd_flags(flags & TFD_SHARED_FCNTL_FLAGS);
if (ufd < 0) {
kfree(ctx);
return ufd;
}
file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx,
O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
if (ufd < 0) if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx); kfree(ctx);
return PTR_ERR(file);
}
file->f_mode |= FMODE_NOWAIT;
fd_install(ufd, file);
return ufd; return ufd;
} }
......
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include <linux/hugetlb.h> #include <linux/hugetlb.h>
#include <linux/swapops.h> #include <linux/swapops.h>
#include <linux/miscdevice.h> #include <linux/miscdevice.h>
#include <linux/uio.h>
static int sysctl_unprivileged_userfaultfd __read_mostly; static int sysctl_unprivileged_userfaultfd __read_mostly;
...@@ -282,7 +283,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, ...@@ -282,7 +283,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
/* /*
* Verify the pagetables are still not ok after having reigstered into * Verify the pagetables are still not ok after having reigstered into
* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
* userfault that has already been resolved, if userfaultfd_read and * userfault that has already been resolved, if userfaultfd_read_iter and
* UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
* threads. * threads.
*/ */
...@@ -1177,34 +1178,34 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, ...@@ -1177,34 +1178,34 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
return ret; return ret;
} }
static ssize_t userfaultfd_read(struct file *file, char __user *buf, static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
size_t count, loff_t *ppos)
{ {
struct file *file = iocb->ki_filp;
struct userfaultfd_ctx *ctx = file->private_data; struct userfaultfd_ctx *ctx = file->private_data;
ssize_t _ret, ret = 0; ssize_t _ret, ret = 0;
struct uffd_msg msg; struct uffd_msg msg;
int no_wait = file->f_flags & O_NONBLOCK;
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
bool no_wait;
if (!userfaultfd_is_initialized(ctx)) if (!userfaultfd_is_initialized(ctx))
return -EINVAL; return -EINVAL;
no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT;
for (;;) { for (;;) {
if (count < sizeof(msg)) if (iov_iter_count(to) < sizeof(msg))
return ret ? ret : -EINVAL; return ret ? ret : -EINVAL;
_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
if (_ret < 0) if (_ret < 0)
return ret ? ret : _ret; return ret ? ret : _ret;
if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) _ret = !copy_to_iter_full(&msg, sizeof(msg), to);
if (_ret)
return ret ? ret : -EFAULT; return ret ? ret : -EFAULT;
ret += sizeof(msg); ret += sizeof(msg);
buf += sizeof(msg);
count -= sizeof(msg);
/* /*
* Allow to read more than one fault at time but only * Allow to read more than one fault at time but only
* block if waiting for the very first one. * block if waiting for the very first one.
*/ */
no_wait = O_NONBLOCK; no_wait = true;
} }
} }
...@@ -2172,7 +2173,7 @@ static const struct file_operations userfaultfd_fops = { ...@@ -2172,7 +2173,7 @@ static const struct file_operations userfaultfd_fops = {
#endif #endif
.release = userfaultfd_release, .release = userfaultfd_release,
.poll = userfaultfd_poll, .poll = userfaultfd_poll,
.read = userfaultfd_read, .read_iter = userfaultfd_read_iter,
.unlocked_ioctl = userfaultfd_ioctl, .unlocked_ioctl = userfaultfd_ioctl,
.compat_ioctl = compat_ptr_ioctl, .compat_ioctl = compat_ptr_ioctl,
.llseek = noop_llseek, .llseek = noop_llseek,
...@@ -2192,6 +2193,7 @@ static void init_once_userfaultfd_ctx(void *mem) ...@@ -2192,6 +2193,7 @@ static void init_once_userfaultfd_ctx(void *mem)
static int new_userfaultfd(int flags) static int new_userfaultfd(int flags)
{ {
struct userfaultfd_ctx *ctx; struct userfaultfd_ctx *ctx;
struct file *file;
int fd; int fd;
BUG_ON(!current->mm); BUG_ON(!current->mm);
...@@ -2215,16 +2217,26 @@ static int new_userfaultfd(int flags) ...@@ -2215,16 +2217,26 @@ static int new_userfaultfd(int flags)
init_rwsem(&ctx->map_changing_lock); init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0); atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm; ctx->mm = current->mm;
/* prevent the mm struct to be freed */
mmgrab(ctx->mm); fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
if (fd < 0)
goto err_out;
/* Create a new inode so that the LSM can block the creation. */ /* Create a new inode so that the LSM can block the creation. */
fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx, file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) { if (IS_ERR(file)) {
mmdrop(ctx->mm); put_unused_fd(fd);
kmem_cache_free(userfaultfd_ctx_cachep, ctx); fd = PTR_ERR(file);
goto err_out;
} }
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
file->f_mode |= FMODE_NOWAIT;
fd_install(fd, file);
return fd;
err_out:
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
return fd; return fd;
} }
......
...@@ -205,6 +205,16 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) ...@@ -205,6 +205,16 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
return 0; return 0;
} }
static __always_inline __must_check
bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i)
{
size_t copied = copy_to_iter(addr, bytes, i);
if (likely(copied == bytes))
return true;
iov_iter_revert(i, copied);
return false;
}
static __always_inline __must_check static __always_inline __must_check
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{ {
......
...@@ -379,14 +379,7 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb) ...@@ -379,14 +379,7 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb)
static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
struct iov_iter *to) struct iov_iter *to)
{ {
int n; return copy_to_iter_full(skb->data + off, len, to) ? 0 : -EFAULT;
n = copy_to_iter(skb->data + off, len, to);
if (n == len)
return 0;
iov_iter_revert(to, n);
return -EFAULT;
} }
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment