Commit ce83d217 authored by Andrea Arcangeli, committed by Linus Torvalds

thp: allocate memory in khugepaged outside of mmap_sem write mode

This tries to be more friendly to filesystems in userland, i.e. userland
backends that allocate memory in their I/O paths and that could deadlock
if khugepaged holds the backend's mmap_sem in write mode while allocating
memory.  The allocation may wait for writeback I/O completion from the
daemon, and the daemon may in turn be blocked on the mmap_sem in read mode
if it takes a page fault and was not using mlock for the memory required
for I/O submission and completion.
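
In short, the patch moves the huge page allocation (and the memcg charge)
from under the mmap_sem write lock to under the read lock the caller
already holds, and only then upgrades to write mode.  The following is a
condensed sketch of that ordering, distilled from the diff below rather
than copied literally (error paths and the vma revalidation after the
lock upgrade are abbreviated):

/* Before: the caller drops the read lock, collapse_huge_page() takes the
 * write lock and only then allocates, so the allocation can stall on
 * writeback while the userland daemon is blocked on a read-mode fault.
 */
up_read(&mm->mmap_sem);                 /* in khugepaged_scan_pmd() */
down_write(&mm->mmap_sem);              /* in collapse_huge_page() */
new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
/* ... copy and collapse the pages ... */
up_write(&mm->mmap_sem);

/* After: allocate and charge while still holding the read lock, then
 * drop it and take the write lock; the vma has to be revalidated once
 * the write lock is held, since it may have changed in between.
 */
new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
if (unlikely(!new_page)) {
        up_read(&mm->mmap_sem);
        *hpage = ERR_PTR(-ENOMEM);
        return;
}
up_read(&mm->mmap_sem);
down_write(&mm->mmap_sem);
/* ... revalidate vma, then copy and collapse the pages ... */
up_write(&mm->mmap_sem);

A side effect, noted in the comment added by the patch, is that
collapse_huge_page() now always returns with the mmap_sem released, so the
caller no longer drops the read lock itself.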
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 0bbbc0b3
@@ -1664,9 +1664,9 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 static void collapse_huge_page(struct mm_struct *mm,
                                unsigned long address,
-                               struct page **hpage)
+                               struct page **hpage,
+                               struct vm_area_struct *vma)
 {
-        struct vm_area_struct *vma;
         pgd_t *pgd;
         pud_t *pud;
         pmd_t *pmd, _pmd;
@@ -1680,9 +1680,34 @@ static void collapse_huge_page(struct mm_struct *mm,
         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 #ifndef CONFIG_NUMA
         VM_BUG_ON(!*hpage);
+        new_page = *hpage;
 #else
         VM_BUG_ON(*hpage);
+        /*
+         * Allocate the page while the vma is still valid and under
+         * the mmap_sem read mode so there is no memory allocation
+         * later when we take the mmap_sem in write mode. This is more
+         * friendly behavior (OTOH it may actually hide bugs) to
+         * filesystems in userland with daemons allocating memory in
+         * the userland I/O paths. Allocating memory with the
+         * mmap_sem in read mode is good idea also to allow greater
+         * scalability.
+         */
+        new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+        if (unlikely(!new_page)) {
+                up_read(&mm->mmap_sem);
+                *hpage = ERR_PTR(-ENOMEM);
+                return;
+        }
 #endif
+        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+                up_read(&mm->mmap_sem);
+                put_page(new_page);
+                return;
+        }
+
+        /* after allocating the hugepage upgrade to mmap_sem write mode */
+        up_read(&mm->mmap_sem);
 
         /*
          * Prevent all access to pagetables with the exception of
@@ -1720,18 +1745,6 @@ static void collapse_huge_page(struct mm_struct *mm,
         if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
                 goto out;
 
-#ifndef CONFIG_NUMA
-        new_page = *hpage;
-#else
-        new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
-        if (unlikely(!new_page)) {
-                *hpage = ERR_PTR(-ENOMEM);
-                goto out;
-        }
-#endif
-        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
-                goto out_put_page;
-
         anon_vma_lock(vma->anon_vma);
 
         pte = pte_offset_map(pmd, address);
@@ -1759,7 +1772,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                 spin_unlock(&mm->page_table_lock);
                 anon_vma_unlock(vma->anon_vma);
                 mem_cgroup_uncharge_page(new_page);
-                goto out_put_page;
+                goto out;
         }
 
         /*
@@ -1798,15 +1811,15 @@ static void collapse_huge_page(struct mm_struct *mm,
         *hpage = NULL;
 #endif
 
         khugepaged_pages_collapsed++;
-out:
+out_up_write:
         up_write(&mm->mmap_sem);
         return;
 
-out_put_page:
+out:
 #ifdef CONFIG_NUMA
         put_page(new_page);
 #endif
-        goto out;
+        goto out_up_write;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1865,10 +1878,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 ret = 1;
 out_unmap:
         pte_unmap_unlock(pte, ptl);
-        if (ret) {
-                up_read(&mm->mmap_sem);
-                collapse_huge_page(mm, address, hpage);
-        }
+        if (ret)
+                /* collapse_huge_page will return with the mmap_sem released */
+                collapse_huge_page(mm, address, hpage, vma);
 out:
         return ret;
 }