Commit 63d5066f authored by Jérôme Glisse, committed by Linus Torvalds

mm/hmm: mirror hugetlbfs (snapshotting, faulting and DMA mapping)

HMM mirror is a device driver helper to mirror a range of virtual addresses.
It means that the process jobs running on the device can access the same
virtual addresses as the CPU threads of that process.  This patch adds
support for hugetlbfs mappings (i.e. ranges of virtual addresses that are
an mmap of a hugetlbfs file).
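
As a rough illustration of how a driver is expected to use the extended API
(the function below, its locking and its retry handling are a hypothetical
sketch, not part of this patch; only hmm_range_register(),
hmm_range_snapshot(), hmm_range_unregister() and the new page_shift argument
come from the HMM API changed here):

    /*
     * Hypothetical driver-side sketch: snapshot a hugetlbfs backed range at
     * its huge page granularity.  The caller is assumed to have set up
     * range->pfns, range->flags, range->values and range->pfn_shift as
     * before; the page_shift argument to hmm_range_register() is the new bit.
     */
    static long driver_snapshot_range(struct hmm_range *range,
                                      struct mm_struct *mm,
                                      unsigned long start, unsigned long end,
                                      unsigned page_shift)
    {
        long ret;

        /* start and end must be aligned to 1UL << page_shift. */
        ret = hmm_range_register(range, mm, start, end, page_shift);
        if (ret)
            return ret;

        down_read(&mm->mmap_sem);
        /* One range->pfns[] entry now covers 1UL << page_shift bytes. */
        ret = hmm_range_snapshot(range);
        up_read(&mm->mmap_sem);

        /* A real driver would retry on -EAGAIN after waiting for validity. */
        hmm_range_unregister(range);
        return ret;
    }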

[rcampbell@nvidia.com: fix initial PFN for hugetlbfs pages]
  Link: http://lkml.kernel.org/r/20190419233536.8080-1-rcampbell@nvidia.com
Link: http://lkml.kernel.org/r/20190403193318.16478-9-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 023a019a
@@ -181,10 +181,31 @@ struct hmm_range {
 	const uint64_t		*values;
 	uint64_t		default_flags;
 	uint64_t		pfn_flags_mask;
+	uint8_t			page_shift;
 	uint8_t			pfn_shift;
 	bool			valid;
 };
 
+/*
+ * hmm_range_page_shift() - return the page shift for the range
+ * @range: range being queried
+ * Returns: page shift (page size = 1 << page shift) for the range
+ */
+static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
+{
+	return range->page_shift;
+}
+
+/*
+ * hmm_range_page_size() - return the page size for the range
+ * @range: range being queried
+ * Returns: page size for the range in bytes
+ */
+static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
+{
+	return 1UL << hmm_range_page_shift(range);
+}
+
 /*
  * hmm_range_wait_until_valid() - wait for range to be valid
  * @range: range affected by invalidation to wait on
@@ -424,7 +445,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
 int hmm_range_register(struct hmm_range *range,
 		       struct mm_struct *mm,
 		       unsigned long start,
-		       unsigned long end);
+		       unsigned long end,
+		       unsigned page_shift);
 void hmm_range_unregister(struct hmm_range *range);
 long hmm_range_snapshot(struct hmm_range *range);
 long hmm_range_fault(struct hmm_range *range, bool block);
@@ -462,7 +484,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block)
 	range->pfn_flags_mask = -1UL;
 
 	ret = hmm_range_register(range, range->vma->vm_mm,
-				 range->start, range->end);
+				 range->start, range->end,
+				 PAGE_SHIFT);
 	if (ret)
 		return (int)ret;
...
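The new hmm_range_page_shift()/hmm_range_page_size() helpers above let a
driver consume the snapshot without hard-coding PAGE_SIZE.  A minimal,
hypothetical consumer (the function name and printout are illustrative only,
not part of the patch):

    /* Hypothetical: one range->pfns[] slot per (1 << range->page_shift) bytes. */
    static void driver_dump_range(const struct hmm_range *range)
    {
        unsigned long page_size = hmm_range_page_size(range);
        unsigned long npages, i;

        npages = (range->end - range->start) >> hmm_range_page_shift(range);
        for (i = 0; i < npages; i++)
            pr_info("addr %#lx -> entry %#llx\n",
                    range->start + i * page_size,
                    (unsigned long long)range->pfns[i]);
    }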
@@ -391,11 +391,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
 	uint64_t *pfns = range->pfns;
-	unsigned long i;
+	unsigned long i, page_size;
 
 	hmm_vma_walk->last = addr;
-	i = (addr - range->start) >> PAGE_SHIFT;
-	for (; addr < end; addr += PAGE_SIZE, i++) {
+	page_size = hmm_range_page_size(range);
+	i = (addr - range->start) >> range->page_shift;
+
+	for (; addr < end; addr += page_size, i++) {
 		pfns[i] = range->values[HMM_PFN_NONE];
 		if (fault || write_fault) {
 			int ret;
@@ -707,6 +709,69 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 	return 0;
 }
 
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+				      unsigned long start, unsigned long end,
+				      struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
+	struct vm_area_struct *vma = walk->vma;
+	struct hstate *h = hstate_vma(vma);
+	uint64_t orig_pfn, cpu_flags;
+	bool fault, write_fault;
+	spinlock_t *ptl;
+	pte_t entry;
+	int ret = 0;
+
+	size = 1UL << huge_page_shift(h);
+	mask = size - 1;
+	if (range->page_shift != PAGE_SHIFT) {
+		/* Make sure we are looking at full page. */
+		if (start & mask)
+			return -EINVAL;
+		if (end < (start + size))
+			return -EINVAL;
+		pfn_inc = size >> PAGE_SHIFT;
+	} else {
+		pfn_inc = 1;
+		size = PAGE_SIZE;
+	}
+
+	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+	entry = huge_ptep_get(pte);
+
+	i = (start - range->start) >> range->page_shift;
+	orig_pfn = range->pfns[i];
+	range->pfns[i] = range->values[HMM_PFN_NONE];
+	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+	fault = write_fault = false;
+	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+			   &fault, &write_fault);
+	if (fault || write_fault) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
+	for (; addr < end; addr += size, i++, pfn += pfn_inc)
+		range->pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+	hmm_vma_walk->last = end;
+
+unlock:
+	spin_unlock(ptl);
+
+	if (ret == -ENOENT)
+		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+	return ret;
+#else /* CONFIG_HUGETLB_PAGE */
+	return -EINVAL;
+#endif
+}
+
 static void hmm_pfns_clear(struct hmm_range *range,
 			   uint64_t *pfns,
 			   unsigned long addr,
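To make the two page_shift cases in hmm_vma_walk_hugetlb_entry() above
concrete, here is a worked example (illustrative numbers only, not part of the
patch), assuming a 2MB huge page on x86-64 where PAGE_SHIFT is 12 and
huge_page_shift() is 21:

    /*
     * range->page_shift == 21 (mirror at huge page granularity):
     *   size = 1UL << 21 = 2MB, mask = 2MB - 1, pfn_inc = size >> 12 = 512
     *   -> the loop writes a single pfns[] entry for the whole huge page.
     *
     * range->page_shift == PAGE_SHIFT (mirror at 4KB granularity):
     *   size = PAGE_SIZE = 4KB, pfn_inc = 1
     *   -> if the whole 2MB page lies inside the range, the loop writes 512
     *      consecutive pfns[] entries: pfn, pfn + 1, ..., pfn + 511.
     */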
@@ -730,6 +795,7 @@ static void hmm_pfns_special(struct hmm_range *range)
  * @mm: the mm struct for the range of virtual address
  * @start: start virtual address (inclusive)
  * @end: end virtual address (exclusive)
+ * @page_shift: expect page shift for the range
  * Returns 0 on success, -EFAULT if the address space is no longer valid
  *
  * Track updates to the CPU page table see include/linux/hmm.h
@@ -737,16 +803,20 @@ static void hmm_pfns_special(struct hmm_range *range)
 int hmm_range_register(struct hmm_range *range,
 		       struct mm_struct *mm,
 		       unsigned long start,
-		       unsigned long end)
+		       unsigned long end,
+		       unsigned page_shift)
 {
-	range->start = start & PAGE_MASK;
-	range->end = end & PAGE_MASK;
+	unsigned long mask = ((1UL << page_shift) - 1UL);
+
 	range->valid = false;
 	range->hmm = NULL;
 
-	if (range->start >= range->end)
+	if ((start & mask) || (end & mask))
+		return -EINVAL;
+	if (start >= end)
 		return -EINVAL;
 
+	range->page_shift = page_shift;
 	range->start = start;
 	range->end = end;
@@ -816,6 +886,7 @@ EXPORT_SYMBOL(hmm_range_unregister);
  */
 long hmm_range_snapshot(struct hmm_range *range)
 {
+	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
 	unsigned long start = range->start, end;
 	struct hmm_vma_walk hmm_vma_walk;
 	struct hmm *hmm = range->hmm;
@@ -832,15 +903,26 @@ long hmm_range_snapshot(struct hmm_range *range)
 			return -EAGAIN;
 
 		vma = find_vma(hmm->mm, start);
-		if (vma == NULL || (vma->vm_flags & VM_SPECIAL))
+		if (vma == NULL || (vma->vm_flags & device_vma))
 			return -EFAULT;
 
-		/* FIXME support hugetlb fs/dax */
-		if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) {
+		/* FIXME support dax */
+		if (vma_is_dax(vma)) {
 			hmm_pfns_special(range);
 			return -EINVAL;
 		}
 
+		if (is_vm_hugetlb_page(vma)) {
+			struct hstate *h = hstate_vma(vma);
+
+			if (huge_page_shift(h) != range->page_shift &&
+			    range->page_shift != PAGE_SHIFT)
+				return -EINVAL;
+		} else {
+			if (range->page_shift != PAGE_SHIFT)
+				return -EINVAL;
+		}
+
 		if (!(vma->vm_flags & VM_READ)) {
 			/*
 			 * If vma do not allow read access, then assume that it
@@ -866,6 +948,7 @@ long hmm_range_snapshot(struct hmm_range *range)
 		mm_walk.hugetlb_entry = NULL;
 		mm_walk.pmd_entry = hmm_vma_walk_pmd;
 		mm_walk.pte_hole = hmm_vma_walk_hole;
+		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
 
 		walk_page_range(start, end, &mm_walk);
 		start = end;
@@ -884,7 +967,7 @@ EXPORT_SYMBOL(hmm_range_snapshot);
  * then one of the following values may be returned:
  *
  * -EINVAL  invalid arguments or mm or virtual address are in an
- *          invalid vma (ie either hugetlbfs or device file vma).
+ *          invalid vma (for instance device file vma).
  * -ENOMEM: Out of memory.
  * -EPERM:  Invalid permission (for instance asking for write and
  *          range is read only).
@@ -905,6 +988,7 @@ EXPORT_SYMBOL(hmm_range_snapshot);
  */
 long hmm_range_fault(struct hmm_range *range, bool block)
 {
+	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
 	unsigned long start = range->start, end;
 	struct hmm_vma_walk hmm_vma_walk;
 	struct hmm *hmm = range->hmm;
@@ -924,15 +1008,25 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 		}
 
 		vma = find_vma(hmm->mm, start);
-		if (vma == NULL || (vma->vm_flags & VM_SPECIAL))
+		if (vma == NULL || (vma->vm_flags & device_vma))
 			return -EFAULT;
 
-		/* FIXME support hugetlb fs/dax */
-		if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) {
+		/* FIXME support dax */
+		if (vma_is_dax(vma)) {
 			hmm_pfns_special(range);
 			return -EINVAL;
 		}
 
+		if (is_vm_hugetlb_page(vma)) {
+			if (huge_page_shift(hstate_vma(vma)) !=
+			    range->page_shift &&
+			    range->page_shift != PAGE_SHIFT)
+				return -EINVAL;
+		} else {
+			if (range->page_shift != PAGE_SHIFT)
+				return -EINVAL;
+		}
+
 		if (!(vma->vm_flags & VM_READ)) {
 			/*
 			 * If vma do not allow read access, then assume that it
@@ -959,6 +1053,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 		mm_walk.hugetlb_entry = NULL;
 		mm_walk.pmd_entry = hmm_vma_walk_pmd;
 		mm_walk.pte_hole = hmm_vma_walk_hole;
+		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
 
 		do {
 			ret = walk_page_range(start, end, &mm_walk);
...