• David Hildenbrand's avatar
    mm/hugetlb: support write-faults in shared mappings · 1d8d1464
    David Hildenbrand authored
    If we ever get a write-fault on a write-protected page in a shared
    mapping, we'd be in trouble (again).  Instead, we can simply map the page
    writable.
    
    And in fact, there is even a way right now to trigger that code via
    uffd-wp ever since we started to support it for shmem in 5.19:
    
    --------------------------------------------------------------------------
     #include <stdio.h>
     #include <stdlib.h>
     #include <string.h>
     #include <fcntl.h>
     #include <unistd.h>
     #include <errno.h>
     #include <sys/mman.h>
     #include <sys/syscall.h>
     #include <sys/ioctl.h>
     #include <linux/userfaultfd.h>
    
     /* Size of the mapping: one 2 MiB hugetlb page. */
     #define HUGETLB_SIZE (2 * 1024 * 1024u)
    
     /* Start of the shared hugetlb mapping; assigned by mmap() in main(). */
     static char *map;
     /* userfaultfd file descriptor; assigned in temp_setup_uffd(). */
     int uffd;
    
     /*
      * Leave the huge page at `map` write-protected via uffd-wp.
      *
      * Opens a userfaultfd (stored in the global `uffd`), registers the
      * HUGETLB_SIZE range starting at `map` in write-protect mode,
      * write-protects that range and then unregisters it again without
      * removing the write protection first.
      *
      * Returns 0 on success, -ENOSYS if the kernel does not report
      * UFFD_FEATURE_PAGEFAULT_FLAG_WP, or the negated errno of the failing
      * call otherwise.
      */
     static int temp_setup_uffd(void)
     {
     	struct uffdio_api uffdio_api;
     	struct uffdio_register uffdio_register;
     	struct uffdio_writeprotect uffd_writeprotect;
     	struct uffdio_range uffd_range;
     	int err;
    
     	uffd = syscall(__NR_userfaultfd,
     		       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
     	if (uffd < 0) {
     		/* Save errno first: fprintf() is allowed to modify it. */
     		err = errno;
     		fprintf(stderr, "syscall() failed: %d\n", err);
     		return -err;
     	}
    
     	uffdio_api.api = UFFD_API;
     	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
     	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
     		err = errno;
     		fprintf(stderr, "UFFDIO_API failed: %d\n", err);
     		return -err;
     	}
    
     	/* The kernel reports back which of the requested features it has. */
     	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
     		fprintf(stderr, "UFFD_FEATURE_PAGEFAULT_FLAG_WP missing\n");
     		return -ENOSYS;
     	}
    
     	/* Register UFFD-WP */
     	uffdio_register.range.start = (unsigned long) map;
     	uffdio_register.range.len = HUGETLB_SIZE;
     	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
     	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
     		err = errno;
     		fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", err);
     		return -err;
     	}
    
     	/* Writeprotect a single (huge) page. */
     	uffd_writeprotect.range.start = (unsigned long) map;
     	uffd_writeprotect.range.len = HUGETLB_SIZE;
     	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
     	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
     		err = errno;
     		fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", err);
     		return -err;
     	}
    
     	/* Unregister UFFD-WP without prior writeunprotection. */
     	uffd_range.start = (unsigned long) map;
     	uffd_range.len = HUGETLB_SIZE;
     	if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
     		err = errno;
     		fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", err);
     		return -err;
     	}
    
     	return 0;
     }
    
     /*
      * Reproducer entry point: map one hugetlb page MAP_SHARED, populate it,
      * leave it write-protected via temp_setup_uffd() and then write to it
      * again to trigger a write fault on the write-protected page.
      *
      * Returns 0 if the final write succeeds, 1 if the uffd setup fails, or
      * the negated errno of a failing open()/ftruncate()/mmap().
      */
     int main(int argc, char **argv)
     {
     	int fd;
     	int err;
    
     	/* O_CREAT requires an explicit mode argument. */
     	fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT, 0600);
     	/* open() reports failure with -1, not 0. */
     	if (fd < 0) {
     		/* Save errno first: fprintf() is allowed to modify it. */
     		err = errno;
     		fprintf(stderr, "open() failed\n");
     		return -err;
     	}
     	if (ftruncate(fd, HUGETLB_SIZE)) {
     		err = errno;
     		fprintf(stderr, "ftruncate() failed\n");
     		return -err;
     	}
    
     	map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
     	if (map == MAP_FAILED) {
     		err = errno;
     		fprintf(stderr, "mmap() failed\n");
     		return -err;
     	}
    
     	/* First write: fault the huge page in and map it writable. */
     	*map = 0;
    
     	if (temp_setup_uffd())
     		return 1;
    
     	/* Second write: write fault on the now write-protected page. */
     	*map = 0;
    
     	return 0;
     }
    --------------------------------------------------------------------------
    
    Above test fails with SIGBUS when there is only a single free hugetlb page.
     # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
     # ./test
     Bus error (core dumped)
    
    And worse, with sufficient free hugetlb pages it will map an anonymous page
    into a shared mapping, for example, messing up accounting during unmap
    and breaking MAP_SHARED semantics:
     # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
     # ./test
     # cat /proc/meminfo | grep HugePages_
     HugePages_Total:       2
     HugePages_Free:        1
     HugePages_Rsvd:    18446744073709551615
     HugePages_Surp:        0
    
    Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
    unregistering and consequently keeps the PTE writeprotected.  Reason for
    this is to avoid the additional overhead when unregistering.  Note that
    this is the case also for !hugetlb and that we will end up with writable
    PTEs that still have the uffd-wp PTE bit set once we return from
    hugetlb_wp().  I'm not touching the uffd-wp PTE bit for now, because it
    seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
    
    VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE indicates
    that MAP_SHARED handling was at least envisioned, but could never have
    worked as expected.
    
    While at it, make sure that we never end up in hugetlb_wp() on write
    faults without VM_WRITE, because we don't support maybe_mkwrite()
    semantics as commonly used in the !hugetlb case -- for example, in
    wp_page_reuse().
    
    Note that there is no need to do any kind of reservation in
    hugetlb_fault() in this case ...  because we already have a hugetlb page
    mapped R/O that we will simply map writable and we are not dealing with
    COW/unsharing.
    
    Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
    Fixes: b1f9e876 ("mm/uffd: enable write protection for shmem & hugetlbfs")
    Signed-off-by: David Hildenbrand <david@redhat.com>
    Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
    Cc: Bjorn Helgaas <bhelgaas@google.com>
    Cc: Cyrill Gorcunov <gorcunov@openvz.org>
    Cc: Hugh Dickins <hughd@google.com>
    Cc: Jamie Liu <jamieliu@google.com>
    Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
    Cc: Muchun Song <songmuchun@bytedance.com>
    Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
    Cc: Pavel Emelyanov <xemul@parallels.com>
    Cc: Peter Feiner <pfeiner@google.com>
    Cc: Peter Xu <peterx@redhat.com>
    Cc: <stable@vger.kernel.org>	[5.19]
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    1d8d1464
hugetlb.c 201 KB