Commit f28d4363, authored by Claudio Imbrenda, committed by Linus Torvalds

mm/gup/writeback: add callbacks for inaccessible pages

With the introduction of protected KVM guests on s390 there is now a
concept of inaccessible pages.  These pages need to be made accessible
before the host can access them.

While CPU accesses will trigger a fault that can be resolved, I/O accesses
will just fail.  We need to add a callback into architecture code for
places that will do I/O, namely when writeback is started or when a page
reference is taken.

This is not only to enable paging, file backing, etc.; it is also necessary
to protect the host against a malicious user space.  For example a bad
QEMU could simply start direct I/O on such protected memory.  We do not
want userspace to be able to trigger I/O errors and thus the logic is
"whenever somebody accesses that page (gup) or does I/O, make sure that
this page can be accessed".  When the guest tries to access that page we
will wait in the page fault handler for writeback to have finished and for
the page_ref to be the expected value.

On s390x the function is not supposed to fail, so it is ok to use a
WARN_ON on failure.  If we ever need some more fine-grained handling we can
tackle this when we know the details.
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Acked-by: Will Deacon <will@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Link: http://lkml.kernel.org/r/20200306132537.783769-3-imbrenda@linux.ibm.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent dc8fb2f2
...@@ -485,6 +485,12 @@ static inline void arch_free_page(struct page *page, int order) { } ...@@ -485,6 +485,12 @@ static inline void arch_free_page(struct page *page, int order) { }
#ifndef HAVE_ARCH_ALLOC_PAGE #ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order) { } static inline void arch_alloc_page(struct page *page, int order) { }
#endif #endif
#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
/*
 * Generic fallback, compiled only when HAVE_ARCH_MAKE_PAGE_ACCESSIBLE is
 * not defined. It performs no work on @page and always reports
 * success (0).
 *
 * Callers check the result and treat any non-zero value as a failure to
 * make the page accessible.
 */
static inline int arch_make_page_accessible(struct page *page)
{
	return 0;
}
#endif
struct page * struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
......
...@@ -390,6 +390,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ...@@ -390,6 +390,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
struct page *page; struct page *page;
spinlock_t *ptl; spinlock_t *ptl;
pte_t *ptep, pte; pte_t *ptep, pte;
int ret;
/* FOLL_GET and FOLL_PIN are mutually exclusive. */ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
...@@ -448,8 +449,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ...@@ -448,8 +449,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (is_zero_pfn(pte_pfn(pte))) { if (is_zero_pfn(pte_pfn(pte))) {
page = pte_page(pte); page = pte_page(pte);
} else { } else {
int ret;
ret = follow_pfn_pte(vma, address, ptep, flags); ret = follow_pfn_pte(vma, address, ptep, flags);
page = ERR_PTR(ret); page = ERR_PTR(ret);
goto out; goto out;
...@@ -457,7 +456,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ...@@ -457,7 +456,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
} }
if (flags & FOLL_SPLIT && PageTransCompound(page)) { if (flags & FOLL_SPLIT && PageTransCompound(page)) {
int ret;
get_page(page); get_page(page);
pte_unmap_unlock(ptep, ptl); pte_unmap_unlock(ptep, ptl);
lock_page(page); lock_page(page);
...@@ -474,6 +472,19 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ...@@ -474,6 +472,19 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
page = ERR_PTR(-ENOMEM); page = ERR_PTR(-ENOMEM);
goto out; goto out;
} }
/*
* We need to make the page accessible if and only if we are going
* to access its content (the FOLL_PIN case). Please see
* Documentation/core-api/pin_user_pages.rst for details.
*/
if (flags & FOLL_PIN) {
ret = arch_make_page_accessible(page);
if (ret) {
unpin_user_page(page);
page = ERR_PTR(ret);
goto out;
}
}
if (flags & FOLL_TOUCH) { if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) && if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page)) !pte_dirty(pte) && !PageDirty(page))
...@@ -2163,6 +2174,19 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, ...@@ -2163,6 +2174,19 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON_PAGE(compound_head(page) != head, page); VM_BUG_ON_PAGE(compound_head(page) != head, page);
/*
* We need to make the page accessible if and only if we are
* going to access its content (the FOLL_PIN case). Please
* see Documentation/core-api/pin_user_pages.rst for
* details.
*/
if (flags & FOLL_PIN) {
ret = arch_make_page_accessible(page);
if (ret) {
unpin_user_page(page);
goto pte_unmap;
}
}
SetPageReferenced(page); SetPageReferenced(page);
pages[*nr] = page; pages[*nr] = page;
(*nr)++; (*nr)++;
......
...@@ -2764,7 +2764,7 @@ int test_clear_page_writeback(struct page *page) ...@@ -2764,7 +2764,7 @@ int test_clear_page_writeback(struct page *page)
int __test_set_page_writeback(struct page *page, bool keep_write) int __test_set_page_writeback(struct page *page, bool keep_write)
{ {
struct address_space *mapping = page_mapping(page); struct address_space *mapping = page_mapping(page);
int ret; int ret, access_ret;
lock_page_memcg(page); lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) { if (mapping && mapping_use_writeback_tags(mapping)) {
...@@ -2807,6 +2807,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ...@@ -2807,6 +2807,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
} }
unlock_page_memcg(page); unlock_page_memcg(page);
access_ret = arch_make_page_accessible(page);
/*
* If writeback has been triggered on a page that cannot be made
* accessible, it is too late to recover here.
*/
VM_BUG_ON_PAGE(access_ret != 0, page);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment