Commit f36b7534 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "13 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, thp: do not cause memcg oom for thp
  mm/vmscan: wake up flushers for legacy cgroups too
  Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
  mm/shmem: do not wait for lock_page() in shmem_unused_huge_shrink()
  mm/thp: do not wait for lock_page() in deferred_split_scan()
  mm/khugepaged.c: convert VM_BUG_ON() to collapse fail
  x86/mm: implement free pmd/pte page interfaces
  mm/vmalloc: add interfaces to free unmapped page table
  h8300: remove extraneous __BIG_ENDIAN definition
  hugetlbfs: check for pgoff value overflow
  lockdep: fix fs_reclaim warning
  MAINTAINERS: update Mark Fasheh's e-mail
  mm/mempolicy.c: avoid use uninitialized preferred_node
parents 8401c72c 9d3c3354
......@@ -10334,7 +10334,7 @@ F: drivers/oprofile/
F: include/linux/oprofile.h
ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
M: Mark Fasheh <mfasheh@versity.com>
M: Mark Fasheh <mark@fasheh.com>
M: Joel Becker <jlbec@evilplan.org>
L: ocfs2-devel@oss.oracle.com (moderated for non-subscribers)
W: http://ocfs2.wiki.kernel.org
......
......@@ -972,3 +972,13 @@ int pmd_clear_huge(pmd_t *pmdp)
pmd_clear(pmdp);
return 1;
}
int pud_free_pmd_page(pud_t *pud)
{
return pud_none(*pud);
}
int pmd_free_pte_page(pmd_t *pmd)
{
return pmd_none(*pmd);
}
......@@ -2,7 +2,6 @@
#ifndef __H8300_BYTEORDER_H__
#define __H8300_BYTEORDER_H__
#define __BIG_ENDIAN __ORDER_BIG_ENDIAN__
#include <linux/byteorder/big_endian.h>
#endif
......@@ -702,4 +702,52 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
* @pud: Pointer to a PUD.
*
* Context: The pud range has been unmaped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
int pud_free_pmd_page(pud_t *pud)
{
pmd_t *pmd;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_free_pte_page(&pmd[i]))
return 0;
pud_clear(pud);
free_page((unsigned long)pmd);
return 1;
}
/**
* pmd_free_pte_page - Clear pmd entry and free pte page.
* @pmd: Pointer to a PMD.
*
* Context: The pmd range has been unmaped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
int pmd_free_pte_page(pmd_t *pmd)
{
pte_t *pte;
if (pmd_none(*pmd))
return 1;
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
free_page((unsigned long)pte);
return 1;
}
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
......@@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec)
pagevec_reinit(pvec);
}
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
* Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
* value. The extra bit (- 1 in the shift value) is to take the sign
* bit into account.
*/
#define PGOFF_LOFFT_MAX \
(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
......@@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_ops = &hugetlb_vm_ops;
/*
* Offset passed to mmap (before page shift) could have been
* negative when represented as a (l)off_t.
* page based offset in vm_pgoff could be sufficiently large to
* overflow a (l)off_t when converted to byte offset.
*/
if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0)
if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
return -EINVAL;
/* must be huge page aligned */
if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
......
......@@ -983,6 +983,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int pud_free_pmd_page(pud_t *pud);
int pmd_free_pte_page(pmd_t *pmd);
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
......@@ -1008,6 +1010,14 @@ static inline int pmd_clear_huge(pmd_t *pmd)
{
return 0;
}
static inline int pud_free_pmd_page(pud_t *pud)
{
return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd)
{
return 0;
}
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
......
......@@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
unsigned long *end_pfn);
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
unsigned long *out_end_pfn, int *out_nid);
unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
/**
* for_each_mem_pfn_range - early memory pfn range iterator
......
......@@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
if (ioremap_pmd_enabled() &&
((next - addr) == PMD_SIZE) &&
IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
pmd_free_pte_page(pmd)) {
if (pmd_set_huge(pmd, phys_addr + addr, prot))
continue;
}
......@@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
if (ioremap_pud_enabled() &&
((next - addr) == PUD_SIZE) &&
IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
pud_free_pmd_page(pud)) {
if (pud_set_huge(pud, phys_addr + addr, prot))
continue;
}
......
......@@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
......@@ -1316,7 +1317,7 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
}
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
huge_gfp | __GFP_NORETRY, &memcg, true))) {
put_page(new_page);
split_huge_pmd(vma, vmf->pmd, vmf->address);
if (page)
......@@ -2783,11 +2784,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
lock_page(page);
if (!trylock_page(page))
goto next;
/* split_huge_page() removes page from list on success */
if (!split_huge_page(page))
split++;
unlock_page(page);
next:
put_page(page);
}
......
......@@ -18,6 +18,7 @@
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
......@@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode,
struct resv_map *resv_map;
long gbl_reserve;
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
return -EINVAL;
}
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
......
......@@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
VM_BUG_ON_PAGE(PageCompound(page), page);
/* TODO: teach khugepaged to collapse THP mapped with pte */
if (PageCompound(page)) {
result = SCAN_PAGE_COMPOUND;
goto out;
}
VM_BUG_ON_PAGE(!PageAnon(page), page);
/*
......@@ -960,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out_nolock;
}
if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
/* Do not oom kill for khugepaged charges */
if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
&memcg, true))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
......@@ -1319,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm,
goto out;
}
if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
/* Do not oom kill for khugepaged charges */
if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
&memcg, true))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
......
......@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
*out_nid = r->nid;
}
unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
unsigned long max_pfn)
{
struct memblock_type *type = &memblock.memory;
unsigned int right = type->cnt;
unsigned int mid, left = 0;
phys_addr_t addr = PFN_PHYS(++pfn);
do {
mid = (right + left) / 2;
if (addr < type->regions[mid].base)
right = mid;
else if (addr >= (type->regions[mid].base +
type->regions[mid].size))
left = mid + 1;
else {
/* addr is within the region, so pfn is valid */
return pfn;
}
} while (left < right);
if (right == type->cnt)
return -1UL;
else
return PHYS_PFN(type->regions[right].base);
}
/**
* memblock_set_node - set node ID on memblock regions
* @base: base of area to set node ID for
......
......@@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
/* a's ->flags is the same as b's */
if (a->flags & MPOL_F_LOCAL)
return true;
return a->v.preferred_node == b->v.preferred_node;
default:
BUG();
......
......@@ -3596,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
return false;
/* this guy won't enter reclaim */
if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
if (current->flags & PF_MEMALLOC)
return false;
/* We're only interested __GFP_FS allocations for now */
......@@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
if (context != MEMMAP_EARLY)
goto not_early;
if (!early_pfn_valid(pfn)) {
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
* Skip to the pfn preceding the next valid one (or
* end_pfn), such that we hit a valid pfn (or end_pfn)
* on our next iteration of the loop.
*/
pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
#endif
if (!early_pfn_valid(pfn))
continue;
}
if (!early_pfn_in_nid(pfn, nid))
continue;
if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
......
......@@ -493,36 +493,45 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
info = list_entry(pos, struct shmem_inode_info, shrinklist);
inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split) {
iput(inode);
continue;
}
if (nr_to_split && split >= nr_to_split)
goto leave;
page = find_lock_page(inode->i_mapping,
page = find_get_page(inode->i_mapping,
(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
if (!page)
goto drop;
/* No huge page at the end of the file: nothing to split */
if (!PageTransHuge(page)) {
unlock_page(page);
put_page(page);
goto drop;
}
/*
* Leave the inode on the list if we failed to lock
* the page at this time.
*
* Waiting for the lock may lead to deadlock in the
* reclaim path.
*/
if (!trylock_page(page)) {
put_page(page);
goto leave;
}
ret = split_huge_page(page);
unlock_page(page);
put_page(page);
if (ret) {
/* split failed: leave it on the list */
iput(inode);
continue;
}
/* If split failed leave the inode on the list */
if (ret)
goto leave;
split++;
drop:
list_del_init(&info->shrinklist);
removed++;
leave:
iput(inode);
}
......
......@@ -1779,6 +1779,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (stat.nr_writeback && stat.nr_writeback == nr_taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
* happen when memory pressure pushes dirty pages to the end of
* the LRU before the dirty limits are breached and the dirty
* data has expired. It can also happen when the proportion of
* dirty pages grows not through writes but through memory
* pressure reclaiming all the clean cache. And in some cases,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
*/
if (stat.nr_unqueued_dirty == nr_taken)
wakeup_flusher_threads(WB_REASON_VMSCAN);
/*
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling here.
......@@ -1791,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
set_bit(PGDAT_CONGESTED, &pgdat->flags);
/*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
* happen when memory pressure pushes dirty pages to the end of
* the LRU before the dirty limits are breached and the dirty
* data has expired. It can also happen when the proportion of
* dirty pages grows not through writes but through memory
* pressure reclaiming all the clean cache. And in some cases,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep, but
* also allow kswapd to start writing pages during reclaim.
*/
if (stat.nr_unqueued_dirty == nr_taken) {
wakeup_flusher_threads(WB_REASON_VMSCAN);
/* Allow kswapd to start writing pages during reclaim. */
if (stat.nr_unqueued_dirty == nr_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
}
/*
* If kswapd scans pages marked marked for immediate
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment