Commit a9b85f94 authored by Andrea Arcangeli, committed by Linus Torvalds

userfaultfd: change the read API to return a uffd_msg

I had requests to return the full address (not the page aligned one) to
userland.

It's not entirely clear how the page offset could be relevant because
userfaults aren't like SIGBUS, which can sigjump to a different place and
actually skip resolving the fault depending on a page offset.  There's
currently no real way to skip the fault especially because after a
UFFDIO_COPY|ZEROPAGE, the fault is optimized to be retried within the
kernel without having to return to userland first (not even self modifying
code replacing the .text that touched the faulting address would prevent
the fault from being repeated).  Userland cannot skip repeating the fault,
even more so if the fault was triggered by a KVM secondary page fault or any
get_user_pages or any copy-user inside some syscall which will return to
kernel code.  The second time FAULT_FLAG_RETRY_NOWAIT won't be set leading
to a SIGBUS being raised because the userfault can't wait if it cannot
release the mmap_sem first (and FAULT_FLAG_RETRY_NOWAIT is required for
that).

Still, returning a proper structure to userland during the read() on the
uffd allows the current UFFD_API to be used for the future non-cooperative
extensions too, and it looks cleaner as well.  Once we get additional
fields there's no point in returning the fault address page aligned anymore
to reuse the bits below PAGE_SHIFT.

The only downside is that the read() syscall will read 32 bytes instead of
8 bytes but that's not going to be measurable overhead.

The total number of new events that can be extended or of new future bits
for already shipped events, is limited to 64 by the features field of the
uffdio_api structure.  If more will be needed a bump of UFFD_API will be
required.

[akpm@linux-foundation.org: use __packed]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 3f602d27
...@@ -46,11 +46,13 @@ is a corner case that would currently return -EBUSY). ...@@ -46,11 +46,13 @@ is a corner case that would currently return -EBUSY).
When first opened the userfaultfd must be enabled invoking the When first opened the userfaultfd must be enabled invoking the
UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
a later API version) which will specify the read/POLLIN protocol a later API version) which will specify the read/POLLIN protocol
userland intends to speak on the UFFD. The UFFDIO_API ioctl if userland intends to speak on the UFFD and the uffdio_api.features
successful (i.e. if the requested uffdio_api.api is spoken also by the userland requires. The UFFDIO_API ioctl if successful (i.e. if the
running kernel), will return into uffdio_api.features and requested uffdio_api.api is spoken also by the running kernel and the
uffdio_api.ioctls two 64bit bitmasks of respectively the activated requested features are going to be enabled) will return into
feature of the read(2) protocol and the generic ioctl available. uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of
respectively all the available features of the read(2) protocol and
the generic ioctl available.
Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
be invoked (if present in the returned uffdio_api.ioctls bitmask) to be invoked (if present in the returned uffdio_api.ioctls bitmask) to
......
...@@ -50,7 +50,7 @@ struct userfaultfd_ctx { ...@@ -50,7 +50,7 @@ struct userfaultfd_ctx {
}; };
struct userfaultfd_wait_queue { struct userfaultfd_wait_queue {
unsigned long address; struct uffd_msg msg;
wait_queue_t wq; wait_queue_t wq;
bool pending; bool pending;
struct userfaultfd_ctx *ctx; struct userfaultfd_ctx *ctx;
...@@ -77,7 +77,8 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, ...@@ -77,7 +77,8 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
/* len == 0 means wake all */ /* len == 0 means wake all */
start = range->start; start = range->start;
len = range->len; len = range->len;
if (len && (start > uwq->address || start + len <= uwq->address)) if (len && (start > uwq->msg.arg.pagefault.address ||
start + len <= uwq->msg.arg.pagefault.address))
goto out; goto out;
ret = wake_up_state(wq->private, mode); ret = wake_up_state(wq->private, mode);
if (ret) if (ret)
...@@ -135,28 +136,43 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) ...@@ -135,28 +136,43 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
} }
} }
static inline unsigned long userfault_address(unsigned long address, static inline void msg_init(struct uffd_msg *msg)
unsigned int flags,
unsigned long reason)
{ {
BUILD_BUG_ON(PAGE_SHIFT < UFFD_BITS); BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
address &= PAGE_MASK; /*
* Must use memset to zero out the paddings or kernel data is
* leaked to userland.
*/
memset(msg, 0, sizeof(struct uffd_msg));
}
static inline struct uffd_msg userfault_msg(unsigned long address,
unsigned int flags,
unsigned long reason)
{
struct uffd_msg msg;
msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT;
msg.arg.pagefault.address = address;
if (flags & FAULT_FLAG_WRITE) if (flags & FAULT_FLAG_WRITE)
/* /*
* Encode "write" fault information in the LSB of the * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
* address read by userland, without depending on * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
* FAULT_FLAG_WRITE kernel internal value. * was not set in a UFFD_EVENT_PAGEFAULT, it means it
* was a read fault, otherwise if set it means it's
* a write fault.
*/ */
address |= UFFD_BIT_WRITE; msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
if (reason & VM_UFFD_WP) if (reason & VM_UFFD_WP)
/* /*
* Encode "reason" fault information as bit number 1 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
* in the address read by userland. If bit number 1 is * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
* clear it means the reason is a VM_FAULT_MISSING * not set in a UFFD_EVENT_PAGEFAULT, it means it was
* fault. * a missing fault, otherwise if set it means it's a
* write protect fault.
*/ */
address |= UFFD_BIT_WP; msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
return address; return msg;
} }
/* /*
...@@ -242,7 +258,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, ...@@ -242,7 +258,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current; uwq.wq.private = current;
uwq.address = userfault_address(address, flags, reason); uwq.msg = userfault_msg(address, flags, reason);
uwq.pending = true; uwq.pending = true;
uwq.ctx = ctx; uwq.ctx = ctx;
...@@ -398,7 +414,7 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) ...@@ -398,7 +414,7 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
} }
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
__u64 *addr) struct uffd_msg *msg)
{ {
ssize_t ret; ssize_t ret;
DECLARE_WAITQUEUE(wait, current); DECLARE_WAITQUEUE(wait, current);
...@@ -416,8 +432,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, ...@@ -416,8 +432,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
* disappear from under us. * disappear from under us.
*/ */
uwq->pending = false; uwq->pending = false;
/* careful to always initialize addr if ret == 0 */ /* careful to always initialize msg if ret == 0 */
*addr = uwq->address; *msg = uwq->msg;
spin_unlock(&ctx->fault_wqh.lock); spin_unlock(&ctx->fault_wqh.lock);
ret = 0; ret = 0;
break; break;
...@@ -447,8 +463,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, ...@@ -447,8 +463,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
{ {
struct userfaultfd_ctx *ctx = file->private_data; struct userfaultfd_ctx *ctx = file->private_data;
ssize_t _ret, ret = 0; ssize_t _ret, ret = 0;
/* careful to always initialize addr if ret == 0 */ struct uffd_msg msg;
__u64 uninitialized_var(addr);
int no_wait = file->f_flags & O_NONBLOCK; int no_wait = file->f_flags & O_NONBLOCK;
if (ctx->state == UFFD_STATE_WAIT_API) if (ctx->state == UFFD_STATE_WAIT_API)
...@@ -456,16 +471,16 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, ...@@ -456,16 +471,16 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
BUG_ON(ctx->state != UFFD_STATE_RUNNING); BUG_ON(ctx->state != UFFD_STATE_RUNNING);
for (;;) { for (;;) {
if (count < sizeof(addr)) if (count < sizeof(msg))
return ret ? ret : -EINVAL; return ret ? ret : -EINVAL;
_ret = userfaultfd_ctx_read(ctx, no_wait, &addr); _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
if (_ret < 0) if (_ret < 0)
return ret ? ret : _ret; return ret ? ret : _ret;
if (put_user(addr, (__u64 __user *) buf)) if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
return ret ? ret : -EFAULT; return ret ? ret : -EFAULT;
ret += sizeof(addr); ret += sizeof(msg);
buf += sizeof(addr); buf += sizeof(msg);
count -= sizeof(addr); count -= sizeof(msg);
/* /*
* Allow to read more than one fault at time but only * Allow to read more than one fault at time but only
* block if waiting for the very first one. * block if waiting for the very first one.
...@@ -873,17 +888,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ...@@ -873,17 +888,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
if (ctx->state != UFFD_STATE_WAIT_API) if (ctx->state != UFFD_STATE_WAIT_API)
goto out; goto out;
ret = -EFAULT; ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(__u64))) if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out; goto out;
if (uffdio_api.api != UFFD_API) { if (uffdio_api.api != UFFD_API || uffdio_api.features) {
/* careful not to leak info, we only read the first 8 bytes */
memset(&uffdio_api, 0, sizeof(uffdio_api)); memset(&uffdio_api, 0, sizeof(uffdio_api));
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
goto out; goto out;
ret = -EINVAL; ret = -EINVAL;
goto out; goto out;
} }
/* careful not to leak info, we only read the first 8 bytes */
uffdio_api.features = UFFD_API_FEATURES; uffdio_api.features = UFFD_API_FEATURES;
uffdio_api.ioctls = UFFD_API_IOCTLS; uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT; ret = -EFAULT;
......
...@@ -11,9 +11,15 @@ ...@@ -11,9 +11,15 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/compiler.h>
#define UFFD_API ((__u64)0xAA) #define UFFD_API ((__u64)0xAA)
/* FIXME: add "|UFFD_FEATURE_WP" to UFFD_API_FEATURES after implementing it */ /*
#define UFFD_API_FEATURES (UFFD_FEATURE_WRITE_BIT) * After implementing the respective features it will become:
* #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
* UFFD_FEATURE_EVENT_FORK)
*/
#define UFFD_API_FEATURES (0)
#define UFFD_API_IOCTLS \ #define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \ ((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \
...@@ -45,26 +51,60 @@ ...@@ -45,26 +51,60 @@
#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
struct uffdio_range) struct uffdio_range)
/* /* read() structure */
* Valid bits below PAGE_SHIFT in the userfault address read through struct uffd_msg {
* the read() syscall. __u8 event;
*/
#define UFFD_BIT_WRITE (1<<0) /* this was a write fault, MISSING or WP */ __u8 reserved1;
#define UFFD_BIT_WP (1<<1) /* handle_userfault() reason VM_UFFD_WP */ __u16 reserved2;
#define UFFD_BITS 2 /* two above bits used for UFFD_BIT_* mask */ __u32 reserved3;
union {
struct {
__u64 flags;
__u64 address;
} pagefault;
struct {
/* unused reserved fields */
__u64 reserved1;
__u64 reserved2;
__u64 reserved3;
} reserved;
} arg;
} __packed;
/* /*
* Features reported in uffdio_api.features field * Start at 0x12 and not at 0 to be more strict against bugs.
*/ */
#define UFFD_FEATURE_WRITE_BIT (1<<0) /* Corresponds to UFFD_BIT_WRITE */ #define UFFD_EVENT_PAGEFAULT 0x12
#define UFFD_FEATURE_WP_BIT (1<<1) /* Corresponds to UFFD_BIT_WP */ #if 0 /* not available yet */
#define UFFD_EVENT_FORK 0x13
#endif
/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
struct uffdio_api { struct uffdio_api {
/* userland asks for an API number */ /* userland asks for an API number and the features to enable */
__u64 api; __u64 api;
/*
/* kernel answers below with the available features for the API */ * Kernel answers below with the all available features for
* the API, this notifies userland of which events and/or
* which flags for each event are enabled in the current
* kernel.
*
* Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
* are to be considered implicitly always enabled in all kernels as
* long as the uffdio_api.api requested matches UFFD_API.
*/
#if 0 /* not available yet */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
#endif
__u64 features; __u64 features;
__u64 ioctls; __u64 ioctls;
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment