Commit dd361e50, authored by Peter Xu, committed by Andrew Morton

mm/hugetlb: make walk_hugetlb_range() safe to pmd unshare

Since walk_hugetlb_range() walks the pgtable, it needs the vma lock to
make sure the pgtable page will not be freed concurrently.

Link: https://lkml.kernel.org/r/20221216155226.2043738-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent eefc7fa5
...@@ -21,7 +21,16 @@ struct mm_walk; ...@@ -21,7 +21,16 @@ struct mm_walk;
* depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
* Any folded depths (where PTRS_PER_P?D is equal to 1) * Any folded depths (where PTRS_PER_P?D is equal to 1)
* are skipped. * are skipped.
* @hugetlb_entry: if set, called for each hugetlb entry * @hugetlb_entry: if set, called for each hugetlb entry. This hook
* function is called with the vma lock held, in order to
* protect against a concurrent freeing of the pte_t* or
* the ptl. In some cases, the hook function needs to drop
* and retake the vma lock in order to avoid deadlocks
* while calling other functions. In such cases the hook
* function must either refrain from accessing the pte or
* ptl after dropping the vma lock, or else revalidate
* those items after re-acquiring the vma lock and before
* accessing them.
* @test_walk: caller specific callback function to determine whether * @test_walk: caller specific callback function to determine whether
* we walk over the current vma or not. Returning 0 means * we walk over the current vma or not. Returning 0 means
* "do page table walk over the current vma", returning * "do page table walk over the current vma", returning
......
...@@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, ...@@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
required_fault = required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) { if (required_fault) {
int ret;
spin_unlock(ptl); spin_unlock(ptl);
return hmm_vma_fault(addr, end, required_fault, walk); hugetlb_vma_unlock_read(vma);
/*
* Avoid deadlock: drop the vma lock before calling
* hmm_vma_fault(), which will itself potentially take and
* drop the vma lock. This is also correct from a
* protection point of view, because there is no further
* use here of either pte or ptl after dropping the vma
* lock.
*/
ret = hmm_vma_fault(addr, end, required_fault, walk);
hugetlb_vma_lock_read(vma);
return ret;
} }
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
......
...@@ -302,6 +302,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, ...@@ -302,6 +302,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
const struct mm_walk_ops *ops = walk->ops; const struct mm_walk_ops *ops = walk->ops;
int err = 0; int err = 0;
hugetlb_vma_lock_read(vma);
do { do {
next = hugetlb_entry_end(h, addr, end); next = hugetlb_entry_end(h, addr, end);
pte = huge_pte_offset(walk->mm, addr & hmask, sz); pte = huge_pte_offset(walk->mm, addr & hmask, sz);
...@@ -314,6 +315,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, ...@@ -314,6 +315,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
if (err) if (err)
break; break;
} while (addr = next, addr != end); } while (addr = next, addr != end);
hugetlb_vma_unlock_read(vma);
return err; return err;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment