Commit f36b7534 authored by Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "13 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, thp: do not cause memcg oom for thp
  mm/vmscan: wake up flushers for legacy cgroups too
  Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
  mm/shmem: do not wait for lock_page() in shmem_unused_huge_shrink()
  mm/thp: do not wait for lock_page() in deferred_split_scan()
  mm/khugepaged.c: convert VM_BUG_ON() to collapse fail
  x86/mm: implement free pmd/pte page interfaces
  mm/vmalloc: add interfaces to free unmapped page table
  h8300: remove extraneous __BIG_ENDIAN definition
  hugetlbfs: check for pgoff value overflow
  lockdep: fix fs_reclaim warning
  MAINTAINERS: update Mark Fasheh's e-mail
  mm/mempolicy.c: avoid use uninitialized preferred_node
parents 8401c72c 9d3c3354
@@ -10334,7 +10334,7 @@ F: drivers/oprofile/
 F:	include/linux/oprofile.h
 
 ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
-M:	Mark Fasheh <mfasheh@versity.com>
+M:	Mark Fasheh <mark@fasheh.com>
 M:	Joel Becker <jlbec@evilplan.org>
 L:	ocfs2-devel@oss.oracle.com (moderated for non-subscribers)
 W:	http://ocfs2.wiki.kernel.org
...
@@ -972,3 +972,13 @@ int pmd_clear_huge(pmd_t *pmdp)
 	pmd_clear(pmdp);
 	return 1;
 }
+
+int pud_free_pmd_page(pud_t *pud)
+{
+	return pud_none(*pud);
+}
+
+int pmd_free_pte_page(pmd_t *pmd)
+{
+	return pmd_none(*pmd);
+}
@@ -2,7 +2,6 @@
 #ifndef __H8300_BYTEORDER_H__
 #define __H8300_BYTEORDER_H__
 
-#define __BIG_ENDIAN __ORDER_BIG_ENDIAN__
 #include <linux/byteorder/big_endian.h>
 
 #endif
@@ -702,4 +702,52 @@ int pmd_clear_huge(pmd_t *pmd)
 	return 0;
 }
 
+/**
+ * pud_free_pmd_page - Clear pud entry and free pmd page.
+ * @pud: Pointer to a PUD.
+ *
+ * Context: The pud range has been unmaped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pud_free_pmd_page(pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	if (pud_none(*pud))
+		return 1;
+
+	pmd = (pmd_t *)pud_page_vaddr(*pud);
+
+	for (i = 0; i < PTRS_PER_PMD; i++)
+		if (!pmd_free_pte_page(&pmd[i]))
+			return 0;
+
+	pud_clear(pud);
+	free_page((unsigned long)pmd);
+
+	return 1;
+}
+
+/**
+ * pmd_free_pte_page - Clear pmd entry and free pte page.
+ * @pmd: Pointer to a PMD.
+ *
+ * Context: The pmd range has been unmaped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pmd_free_pte_page(pmd_t *pmd)
+{
+	pte_t *pte;
+
+	if (pmd_none(*pmd))
+		return 1;
+
+	pte = (pte_t *)pmd_page_vaddr(*pmd);
+	pmd_clear(pmd);
+	free_page((unsigned long)pte);
+
+	return 1;
+}
+
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
@@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
+/*
+ * Mask used when checking the page offset value passed in via system
+ * calls. This value will be converted to a loff_t which is signed.
+ * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
+ * value. The extra bit (- 1 in the shift value) is to take the sign
+ * bit into account.
+ */
+#define PGOFF_LOFFT_MAX \
+	(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
+
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(file);
@@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	vma->vm_ops = &hugetlb_vm_ops;
 
 	/*
-	 * Offset passed to mmap (before page shift) could have been
-	 * negative when represented as a (l)off_t.
+	 * page based offset in vm_pgoff could be sufficiently large to
+	 * overflow a (l)off_t when converted to byte offset.
 	 */
-	if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0)
+	if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
 		return -EINVAL;
 
+	/* must be huge page aligned */
 	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
 		return -EINVAL;
...
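The PGOFF_LOFFT_MAX arithmetic above is easier to see with concrete numbers. The following stand-alone user-space sketch is not part of the commit; it assumes 4 KiB pages on a 64-bit build and invents the sample offsets, but it uses the same mask construction to show which vm_pgoff values would turn into a negative (or truncated) byte offset:

#include <stdio.h>

/* Assumed values for illustration: 4 KiB pages, 64-bit longs. */
#define PAGE_SHIFT	12
#define BITS_PER_LONG	64

/* Same construction as the macro added in the hunk above. */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))

int main(void)
{
	/* Smallest page offset with one of the upper PAGE_SHIFT + 1 bits set. */
	unsigned long bad_pgoff = 1UL << (BITS_PER_LONG - PAGE_SHIFT - 1);
	/* Largest page offset whose byte offset still fits a positive loff_t. */
	unsigned long ok_pgoff = bad_pgoff - 1;

	printf("PGOFF_LOFFT_MAX    = %#lx\n", PGOFF_LOFFT_MAX);
	printf("bad_pgoff rejected = %d\n", (bad_pgoff & PGOFF_LOFFT_MAX) != 0);
	printf("ok_pgoff rejected  = %d\n", (ok_pgoff & PGOFF_LOFFT_MAX) != 0);

	/*
	 * The byte offset of bad_pgoff has the sign bit set, i.e. it would
	 * read as negative if interpreted as a loff_t.
	 */
	printf("sign bit of bad byte offset = %lu\n",
	       (bad_pgoff << PAGE_SHIFT) >> (BITS_PER_LONG - 1));
	return 0;
}

With these assumptions the mask covers the top 13 bits of vm_pgoff, so it rejects both the offsets the old "(loff_t)vma->vm_pgoff << PAGE_SHIFT < 0" test was aimed at and the larger ones whose high bits would simply be shifted out, which that signedness test could miss.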
@@ -983,6 +983,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud);
+int pmd_free_pte_page(pmd_t *pmd);
 #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
@@ -1008,6 +1010,14 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
 	return 0;
 }
+static inline int pud_free_pmd_page(pud_t *pud)
+{
+	return 0;
+}
+static inline int pmd_free_pte_page(pmd_t *pmd)
+{
+	return 0;
+}
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
 #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
...
@@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			  unsigned long *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
-unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
...
@@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
+		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
+		    pmd_free_pte_page(pmd)) {
 			if (pmd_set_huge(pmd, phys_addr + addr, prot))
 				continue;
 		}
@@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
+		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
+		    pud_free_pmd_page(pud)) {
 			if (pud_set_huge(pud, phys_addr + addr, prot))
 				continue;
 		}
...
@@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
+				  true)) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1316,7 +1317,7 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 	}
 
 	if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
-					huge_gfp, &memcg, true))) {
+					huge_gfp | __GFP_NORETRY, &memcg, true))) {
 		put_page(new_page);
 		split_huge_pmd(vma, vmf->pmd, vmf->address);
 		if (page)
@@ -2783,11 +2784,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 
 	list_for_each_safe(pos, next, &list) {
 		page = list_entry((void *)pos, struct page, mapping);
-		lock_page(page);
+		if (!trylock_page(page))
+			goto next;
 		/* split_huge_page() removes page from list on success */
 		if (!split_huge_page(page))
 			split++;
 		unlock_page(page);
+next:
 		put_page(page);
 	}
...
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/mmdebug.h>
 #include <linux/sched/signal.h>
 #include <linux/rmap.h>
 #include <linux/string_helpers.h>
@@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 	struct resv_map *resv_map;
 	long gbl_reserve;
 
+	/* This should never happen */
+	if (from > to) {
+		VM_WARN(1, "%s called with a negative range\n", __func__);
+		return -EINVAL;
+	}
+
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
...
@@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 
-		VM_BUG_ON_PAGE(PageCompound(page), page);
+		/* TODO: teach khugepaged to collapse THP mapped with pte */
+		if (PageCompound(page)) {
+			result = SCAN_PAGE_COMPOUND;
+			goto out;
+		}
+
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 
 		/*
@@ -960,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 		goto out_nolock;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+	/* Do not oom kill for khugepaged charges */
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+					   &memcg, true))) {
 		result = SCAN_CGROUP_CHARGE_FAIL;
 		goto out_nolock;
 	}
@@ -1319,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm,
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+	/* Do not oom kill for khugepaged charges */
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+					   &memcg, true))) {
 		result = SCAN_CGROUP_CHARGE_FAIL;
 		goto out;
 	}
...
@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 	*out_nid = r->nid;
 }
 
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
-						      unsigned long max_pfn)
-{
-	struct memblock_type *type = &memblock.memory;
-	unsigned int right = type->cnt;
-	unsigned int mid, left = 0;
-	phys_addr_t addr = PFN_PHYS(++pfn);
-
-	do {
-		mid = (right + left) / 2;
-
-		if (addr < type->regions[mid].base)
-			right = mid;
-		else if (addr >= (type->regions[mid].base +
-				  type->regions[mid].size))
-			left = mid + 1;
-		else {
-			/* addr is within the region, so pfn is valid */
-			return pfn;
-		}
-	} while (left < right);
-
-	if (right == type->cnt)
-		return -1UL;
-	else
-		return PHYS_PFN(type->regions[right].base);
-}
-
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
...
@@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
+		/* a's ->flags is the same as b's */
+		if (a->flags & MPOL_F_LOCAL)
+			return true;
 		return a->v.preferred_node == b->v.preferred_node;
 	default:
 		BUG();
...
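The one-line comment above is doing some work: for an MPOL_PREFERRED policy created with MPOL_F_LOCAL, preferred_node is never written, so comparing it reads an indeterminate value and two otherwise identical "local" policies can spuriously compare unequal. A minimal user-space sketch of the fixed comparison, with invented names standing in for the kernel structures:

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for the kernel's mempolicy fields. */
#define MPOL_F_LOCAL 0x1

struct policy {
	unsigned short flags;
	int preferred_node;	/* meaningful only when !MPOL_F_LOCAL */
};

/* Caller has already checked a->flags == b->flags, as __mpol_equal() does. */
static bool preferred_equal(const struct policy *a, const struct policy *b)
{
	if (a->flags & MPOL_F_LOCAL)
		return true;	/* never look at preferred_node here */
	return a->preferred_node == b->preferred_node;
}

int main(void)
{
	/* "Local" policies: preferred_node deliberately left unset. */
	struct policy a = { .flags = MPOL_F_LOCAL };
	struct policy b = { .flags = MPOL_F_LOCAL };

	printf("local policies equal: %d\n", preferred_equal(&a, &b));
	return 0;
}

In this user-space toy the missing initializer happens to read as zero; in the kernel the field holds whatever was left in the allocation, which is exactly why the comparison has to short-circuit on the flag.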
@@ -3596,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
 		return false;
 
 	/* this guy won't enter reclaim */
-	if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+	if (current->flags & PF_MEMALLOC)
 		return false;
 
 	/* We're only interested __GFP_FS allocations for now */
@@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (context != MEMMAP_EARLY)
 			goto not_early;
 
-		if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-			/*
-			 * Skip to the pfn preceding the next valid one (or
-			 * end_pfn), such that we hit a valid pfn (or end_pfn)
-			 * on our next iteration of the loop.
-			 */
-			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+		if (!early_pfn_valid(pfn))
 			continue;
-		}
 		if (!early_pfn_in_nid(pfn, nid))
 			continue;
 		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
...
@@ -493,36 +493,45 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 		inode = &info->vfs_inode;
 
-		if (nr_to_split && split >= nr_to_split) {
-			iput(inode);
-			continue;
-		}
+		if (nr_to_split && split >= nr_to_split)
+			goto leave;
 
-		page = find_lock_page(inode->i_mapping,
+		page = find_get_page(inode->i_mapping,
 				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
 		if (!page)
 			goto drop;
 
+		/* No huge page at the end of the file: nothing to split */
 		if (!PageTransHuge(page)) {
-			unlock_page(page);
 			put_page(page);
 			goto drop;
 		}
 
+		/*
+		 * Leave the inode on the list if we failed to lock
+		 * the page at this time.
+		 *
+		 * Waiting for the lock may lead to deadlock in the
+		 * reclaim path.
+		 */
+		if (!trylock_page(page)) {
+			put_page(page);
+			goto leave;
+		}
+
 		ret = split_huge_page(page);
 		unlock_page(page);
 		put_page(page);
 
-		if (ret) {
-			/* split failed: leave it on the list */
-			iput(inode);
-			continue;
-		}
+		/* If split failed leave the inode on the list */
+		if (ret)
+			goto leave;
 
 		split++;
 drop:
 		list_del_init(&info->shrinklist);
 		removed++;
+leave:
 		iput(inode);
 	}
...
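The comment introduced above states the rule this patch (and the deferred_split_scan() change earlier in this merge) follows: code running in the reclaim path must not sleep waiting for a page lock, because the lock holder may itself be blocked waiting on reclaim. A rough user-space analogue of the trylock-and-defer pattern, not kernel code and with invented names, compiled with -pthread:

#include <pthread.h>
#include <stdio.h>

/* Hypothetical per-item lock standing in for the page lock. */
struct item {
	pthread_mutex_t lock;
	int split_done;
};

/*
 * Reclaim-style worker: never block on a busy item, just skip it and
 * leave it on the list so a later pass can retry, mirroring the
 * trylock_page()/goto leave pattern in the hunk above.
 */
static int try_shrink_one(struct item *it)
{
	if (pthread_mutex_trylock(&it->lock) != 0)
		return 0;		/* busy: defer, do not wait */
	it->split_done = 1;		/* the "split" work */
	pthread_mutex_unlock(&it->lock);
	return 1;
}

int main(void)
{
	struct item it = { PTHREAD_MUTEX_INITIALIZER, 0 };

	printf("first pass shrank: %d\n", try_shrink_one(&it));

	pthread_mutex_lock(&it.lock);	/* someone else holds the lock */
	printf("second pass shrank: %d\n", try_shrink_one(&it));
	pthread_mutex_unlock(&it.lock);
	return 0;
}

The second pass reports 0: the busy item is simply skipped and handled on a later pass, just as the inode here stays on shrinklist via the leave: label instead of the shrinker sleeping on lock_page().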
@@ -1779,6 +1779,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (stat.nr_writeback && stat.nr_writeback == nr_taken)
 		set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
+	/*
+	 * If dirty pages are scanned that are not queued for IO, it
+	 * implies that flushers are not doing their job. This can
+	 * happen when memory pressure pushes dirty pages to the end of
+	 * the LRU before the dirty limits are breached and the dirty
+	 * data has expired. It can also happen when the proportion of
+	 * dirty pages grows not through writes but through memory
+	 * pressure reclaiming all the clean cache. And in some cases,
+	 * the flushers simply cannot keep up with the allocation
+	 * rate. Nudge the flusher threads in case they are asleep.
+	 */
+	if (stat.nr_unqueued_dirty == nr_taken)
+		wakeup_flusher_threads(WB_REASON_VMSCAN);
+
 	/*
 	 * Legacy memcg will stall in page writeback so avoid forcibly
 	 * stalling here.
@@ -1791,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
 		set_bit(PGDAT_CONGESTED, &pgdat->flags);
 
-	/*
-	 * If dirty pages are scanned that are not queued for IO, it
-	 * implies that flushers are not doing their job. This can
-	 * happen when memory pressure pushes dirty pages to the end of
-	 * the LRU before the dirty limits are breached and the dirty
-	 * data has expired. It can also happen when the proportion of
-	 * dirty pages grows not through writes but through memory
-	 * pressure reclaiming all the clean cache. And in some cases,
-	 * the flushers simply cannot keep up with the allocation
-	 * rate. Nudge the flusher threads in case they are asleep, but
-	 * also allow kswapd to start writing pages during reclaim.
-	 */
-	if (stat.nr_unqueued_dirty == nr_taken) {
-		wakeup_flusher_threads(WB_REASON_VMSCAN);
+	/* Allow kswapd to start writing pages during reclaim. */
+	if (stat.nr_unqueued_dirty == nr_taken)
 		set_bit(PGDAT_DIRTY, &pgdat->flags);
-	}
 
 	/*
 	 * If kswapd scans pages marked marked for immediate
...