Commit 824ddc60, authored by Nadav Amit, committed by Linus Torvalds

userfaultfd: provide unmasked address on page-fault

Userfaultfd is supposed to provide the full address (i.e., unmasked) of
the faulting access back to userspace.  However, that has not been the
case for quite some time.

Even running "userfaultfd_demo" from the userfaultfd man page provides the
wrong output (and contradicts the man page).  Notice that
"UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) and
not the first read address (0x7fc5e30b300f).

	Address returned by mmap() = 0x7fc5e30b3000

	fault_handler_thread():
	    poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
	    UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000
		(uffdio_copy.copy returned 4096)
	Read address 0x7fc5e30b300f in main(): A
	Read address 0x7fc5e30b340f in main(): A
	Read address 0x7fc5e30b380f in main(): A
	Read address 0x7fc5e30b3c0f in main(): A

The exact address is useful for various reasons and specifically for
prefetching decisions.  If it is known that the memory is populated by
certain objects whose size is not page-aligned, then based on the faulting
address, the uffd-monitor can decide whether to prefetch and prefault the
adjacent page.

This bug has been in the kernel for quite some time: since commit
1a29d85e ("mm: use vmf->address instead of of vmf->virtual_address"),
which dates back to 2016.  A concern has been raised that existing
userspace applications might rely on the old/wrong behavior in which the
address is masked.  Therefore, it was suggested to provide the masked
address unless the user explicitly asks for the exact address.

Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct
userfaultfd to provide the exact address.  Add a new "real_address" field
to vmf to hold the unmasked address.  Provide the address to userspace
accordingly.

Initialize real_address in various code-paths to be consistent with
address, even when it is not used, to be on the safe side.

[namit@vmware.com: initialize real_address on all code paths, per Jan]
  Link: https://lkml.kernel.org/r/20220226022655.350562-1-namit@vmware.com
[akpm@linux-foundation.org: fix typo in comment, per Jan]

Link: https://lkml.kernel.org/r/20220218041003.3508-1-namit@vmware.com
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 87d2762e
...@@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, ...@@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
struct uffd_msg msg; struct uffd_msg msg;
msg_init(&msg); msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT; msg.event = UFFD_EVENT_PAGEFAULT;
if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
address &= PAGE_MASK;
msg.arg.pagefault.address = address; msg.arg.pagefault.address = address;
/* /*
* These flags indicate why the userfault occurred: * These flags indicate why the userfault occurred:
...@@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) ...@@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current; uwq.wq.private = current;
uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
ctx->features); ctx->features);
uwq.ctx = ctx; uwq.ctx = ctx;
uwq.waken = false; uwq.waken = false;
......
...@@ -478,7 +478,8 @@ struct vm_fault { ...@@ -478,7 +478,8 @@ struct vm_fault {
struct vm_area_struct *vma; /* Target VMA */ struct vm_area_struct *vma; /* Target VMA */
gfp_t gfp_mask; /* gfp mask to be used for allocations */ gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */ pgoff_t pgoff; /* Logical page offset based on vma */
unsigned long address; /* Faulting virtual address */ unsigned long address; /* Faulting virtual address - masked */
unsigned long real_address; /* Faulting virtual address - unmasked */
}; };
enum fault_flag flags; /* FAULT_FLAG_xxx flags enum fault_flag flags; /* FAULT_FLAG_xxx flags
* XXX: should really be 'const' */ * XXX: should really be 'const' */
......
...@@ -32,7 +32,8 @@ ...@@ -32,7 +32,8 @@
UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_SIGBUS | \
UFFD_FEATURE_THREAD_ID | \ UFFD_FEATURE_THREAD_ID | \
UFFD_FEATURE_MINOR_HUGETLBFS | \ UFFD_FEATURE_MINOR_HUGETLBFS | \
UFFD_FEATURE_MINOR_SHMEM) UFFD_FEATURE_MINOR_SHMEM | \
UFFD_FEATURE_EXACT_ADDRESS)
#define UFFD_API_IOCTLS \ #define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \ ((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \
...@@ -189,6 +190,10 @@ struct uffdio_api { ...@@ -189,6 +190,10 @@ struct uffdio_api {
* *
* UFFD_FEATURE_MINOR_SHMEM indicates the same support as * UFFD_FEATURE_MINOR_SHMEM indicates the same support as
* UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead.
*
* UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page
* faults would be provided and the offset within the page would not be
* masked.
*/ */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_FORK (1<<1)
...@@ -201,6 +206,7 @@ struct uffdio_api { ...@@ -201,6 +206,7 @@ struct uffdio_api {
#define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_THREAD_ID (1<<8)
#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
#define UFFD_FEATURE_MINOR_SHMEM (1<<10) #define UFFD_FEATURE_MINOR_SHMEM (1<<10)
#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
__u64 features; __u64 features;
__u64 ioctls; __u64 ioctls;
......
...@@ -5341,6 +5341,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, ...@@ -5341,6 +5341,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
pgoff_t idx, pgoff_t idx,
unsigned int flags, unsigned int flags,
unsigned long haddr, unsigned long haddr,
unsigned long addr,
unsigned long reason) unsigned long reason)
{ {
vm_fault_t ret; vm_fault_t ret;
...@@ -5348,6 +5349,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, ...@@ -5348,6 +5349,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
struct vm_fault vmf = { struct vm_fault vmf = {
.vma = vma, .vma = vma,
.address = haddr, .address = haddr,
.real_address = addr,
.flags = flags, .flags = flags,
/* /*
...@@ -5416,7 +5418,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, ...@@ -5416,7 +5418,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
/* Check for page in userfault range */ /* Check for page in userfault range */
if (userfaultfd_missing(vma)) { if (userfaultfd_missing(vma)) {
ret = hugetlb_handle_userfault(vma, mapping, idx, ret = hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr, flags, haddr, address,
VM_UFFD_MISSING); VM_UFFD_MISSING);
goto out; goto out;
} }
...@@ -5480,7 +5482,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, ...@@ -5480,7 +5482,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
unlock_page(page); unlock_page(page);
put_page(page); put_page(page);
ret = hugetlb_handle_userfault(vma, mapping, idx, ret = hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr, flags, haddr, address,
VM_UFFD_MINOR); VM_UFFD_MINOR);
goto out; goto out;
} }
......
...@@ -4633,6 +4633,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, ...@@ -4633,6 +4633,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
struct vm_fault vmf = { struct vm_fault vmf = {
.vma = vma, .vma = vma,
.address = address & PAGE_MASK, .address = address & PAGE_MASK,
.real_address = address,
.flags = flags, .flags = flags,
.pgoff = linear_page_index(vma, address), .pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma), .gfp_mask = __get_fault_gfp_mask(vma),
......
...@@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
struct vm_fault vmf = { struct vm_fault vmf = {
.vma = vma, .vma = vma,
.address = addr, .address = addr,
.real_address = addr,
.pmd = pmd, .pmd = pmd,
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment