Commit 70585216 authored by Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "18 patches.

  Subsystems affected by this patch series: mm (memory-failure, swap,
  slub, hugetlb, memory-failure, slub, thp, sparsemem), and coredump"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/sparse: fix check_usemap_section_nr warnings
  mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
  mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
  mm/thp: fix page_address_in_vma() on file THP tails
  mm/thp: fix vma_address() if virtual address below file offset
  mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
  mm/thp: make is_huge_zero_pmd() safe and quicker
  mm/thp: fix __split_huge_pmd_locked() on shmem migration entry
  mm, thp: use head page in __migration_entry_wait()
  mm/slub.c: include swab.h
  crash_core, vmcoreinfo: append 'SECTION_SIZE_BITS' to vmcoreinfo
  mm/memory-failure: make sure wait for page writeback in memory_failure
  mm/hugetlb: expand restore_reserve_on_error functionality
  mm/slub: actually fix freelist pointer vs redzoning
  mm/slub: fix redzoning for small allocations
  mm/slub: clarify verification reporting
  mm/swap: fix pte_same_as_swp() not removing uffd-wp bit when compare
  mm,hwpoison: fix race with hugetlb page allocation
parents 6b00bc63 ccbd6283
@@ -181,7 +181,7 @@ SLUB Debug output
 Here is a sample of slub debug output::
     ====================================================================
-    BUG kmalloc-8: Redzone overwritten
+    BUG kmalloc-8: Right Redzone overwritten
     --------------------------------------------------------------------
     INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
@@ -189,10 +189,10 @@ Here is a sample of slub debug output::
     INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
     INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
-    Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
-    Object 0xc90f6d20: 31 30 31 39 2e 30 30 35                         1019.005
-    Redzone 0xc90f6d28: 00 cc cc cc                                     .
-    Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
+    Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+    Object (0xc90f6d20): 31 30 31 39 2e 30 30 35                         1019.005
+    Redzone (0xc90f6d28): 00 cc cc cc                                     .
+    Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
     [<c010523d>] dump_trace+0x63/0x1eb
     [<c01053df>] show_trace_log_lvl+0x1a/0x2f
...
@@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
         __SetPageUptodate(page);
         error = huge_add_to_page_cache(page, mapping, index);
         if (unlikely(error)) {
+            restore_reserve_on_error(h, &pseudo_vma, addr, page);
             put_page(page);
             mutex_unlock(&hugetlb_fault_mutex_table[hash]);
             goto out;
...
@@ -286,6 +286,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 extern struct page *huge_zero_page;
+extern unsigned long huge_zero_pfn;
 static inline bool is_huge_zero_page(struct page *page)
 {
@@ -294,7 +295,7 @@ static inline bool is_huge_zero_page(struct page *page)
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
-    return is_huge_zero_page(pmd_page(pmd));
+    return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd);
 }
 static inline bool is_huge_zero_pud(pud_t pud)
@@ -440,6 +441,11 @@ static inline bool is_huge_zero_page(struct page *page)
     return false;
 }
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+    return false;
+}
 static inline bool is_huge_zero_pud(pud_t pud)
 {
     return false;
...
@@ -149,6 +149,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                         long freed);
 bool isolate_huge_page(struct page *page, struct list_head *list);
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
 void putback_active_hugepage(struct page *page);
 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
@@ -339,6 +340,11 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
     return false;
 }
+static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+    return 0;
+}
 static inline void putback_active_hugepage(struct page *page)
 {
 }
@@ -604,6 +610,8 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
                 unsigned long address);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
             pgoff_t idx);
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+                unsigned long address, struct page *page);
 /* arch callback */
 int __init __alloc_bootmem_huge_page(struct hstate *h);
...
@@ -1719,6 +1719,7 @@ struct zap_details {
     struct address_space *check_mapping;    /* Check page->mapping if set */
     pgoff_t first_index;                    /* Lowest page->index to unmap */
     pgoff_t last_index;                     /* Highest page->index to unmap */
+    struct page *single_page;               /* Locked page to be unmapped */
 };
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
@@ -1766,6 +1767,7 @@ extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 extern int fixup_user_fault(struct mm_struct *mm,
                 unsigned long address, unsigned int fault_flags,
                 bool *unlocked);
+void unmap_mapping_page(struct page *page);
 void unmap_mapping_pages(struct address_space *mapping,
         pgoff_t start, pgoff_t nr, bool even_cows);
 void unmap_mapping_range(struct address_space *mapping,
@@ -1786,6 +1788,7 @@ static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
     BUG();
     return -EFAULT;
 }
+static inline void unmap_mapping_page(struct page *page) { }
 static inline void unmap_mapping_pages(struct address_space *mapping,
         pgoff_t start, pgoff_t nr, bool even_cows) { }
 static inline void unmap_mapping_range(struct address_space *mapping,
...
@@ -91,6 +91,7 @@ enum ttu_flags {
     TTU_SPLIT_HUGE_PMD      = 0x4,  /* split huge PMD if any */
     TTU_IGNORE_MLOCK        = 0x8,  /* ignore mlock */
+    TTU_SYNC                = 0x10, /* avoid racy checks with PVMW_SYNC */
     TTU_IGNORE_HWPOISON     = 0x20, /* corrupted page is recoverable */
     TTU_BATCH_FLUSH         = 0x40, /* Batch TLB flushes where possible
                                      * and caller guarantees they will
...
@@ -23,6 +23,16 @@
 #define SWP_TYPE_SHIFT  (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
 #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
+/* Clear all flags but only keep swp_entry_t related information */
+static inline pte_t pte_swp_clear_flags(pte_t pte)
+{
+    if (pte_swp_soft_dirty(pte))
+        pte = pte_swp_clear_soft_dirty(pte);
+    if (pte_swp_uffd_wp(pte))
+        pte = pte_swp_clear_uffd_wp(pte);
+    return pte;
+}
 /*
  * Store a type+offset into a swp_entry_t in an arch-independent format
  */
@@ -66,10 +76,7 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 {
     swp_entry_t arch_entry;
-    if (pte_swp_soft_dirty(pte))
-        pte = pte_swp_clear_soft_dirty(pte);
-    if (pte_swp_uffd_wp(pte))
-        pte = pte_swp_clear_uffd_wp(pte);
+    pte = pte_swp_clear_flags(pte);
     arch_entry = __pte_to_swp_entry(pte);
     return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
...
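The idea behind pte_swp_clear_flags() can be restated as a minimal userspace sketch (plain C; the bit layout and names below are invented for illustration, none of this is kernel API): auxiliary status bits are stripped from an encoded value before it is compared or decoded, so two entries that differ only in bookkeeping flags still identify the same swap entry.

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical encoding: low bits hold the payload, two high bits are flags. */
    #define FLAG_SOFT_DIRTY (1ull << 62)
    #define FLAG_UFFD_WP    (1ull << 63)
    #define FLAG_MASK       (FLAG_SOFT_DIRTY | FLAG_UFFD_WP)

    /* Strip bookkeeping flags, keeping only what identifies the entry. */
    static uint64_t entry_clear_flags(uint64_t e)
    {
        return e & ~FLAG_MASK;
    }

    /* Compare two entries while ignoring flag differences. */
    static int entry_same(uint64_t a, uint64_t b)
    {
        return entry_clear_flags(a) == entry_clear_flags(b);
    }

    int main(void)
    {
        uint64_t plain = 0x1234;
        uint64_t wp    = 0x1234 | FLAG_UFFD_WP;

        assert(entry_same(plain, wp));   /* equal once flags are ignored */
        assert(!entry_same(plain, 0x9)); /* genuinely different entries still differ */
        return 0;
    }

The mm/swapfile.c hunk later in this series switches pte_same_as_swp() to the same helper, so a uffd-wp marked swap pte still matches its plain counterpart.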
@@ -464,6 +464,7 @@ static int __init crash_save_vmcoreinfo_init(void)
     VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
     VMCOREINFO_STRUCT_SIZE(mem_section);
     VMCOREINFO_OFFSET(mem_section, section_mem_map);
+    VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
     VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
 #endif
     VMCOREINFO_STRUCT_SIZE(page);
...
@@ -62,6 +62,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
 bool transparent_hugepage_enabled(struct vm_area_struct *vma)
 {
@@ -98,6 +99,7 @@ static bool get_huge_zero_page(void)
         __free_pages(zero_page, compound_order(zero_page));
         goto retry;
     }
+    WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
     /* We take additional reference here. It will be put back by shrinker */
     atomic_set(&huge_zero_refcount, 2);
@@ -147,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
     if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
         struct page *zero_page = xchg(&huge_zero_page, NULL);
         BUG_ON(zero_page == NULL);
+        WRITE_ONCE(huge_zero_pfn, ~0UL);
         __free_pages(zero_page, compound_order(zero_page));
         return HPAGE_PMD_NR;
     }
@@ -2044,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
     count_vm_event(THP_SPLIT_PMD);
     if (!vma_is_anonymous(vma)) {
-        _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+        old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
         /*
          * We are going to unmap this huge page. So
          * just go ahead and zap it
@@ -2053,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
             zap_deposited_table(mm, pmd);
         if (vma_is_special_huge(vma))
             return;
-        page = pmd_page(_pmd);
-        if (!PageDirty(page) && pmd_dirty(_pmd))
-            set_page_dirty(page);
-        if (!PageReferenced(page) && pmd_young(_pmd))
-            SetPageReferenced(page);
-        page_remove_rmap(page, true);
-        put_page(page);
+        if (unlikely(is_pmd_migration_entry(old_pmd))) {
+            swp_entry_t entry;
+
+            entry = pmd_to_swp_entry(old_pmd);
+            page = migration_entry_to_page(entry);
+        } else {
+            page = pmd_page(old_pmd);
+            if (!PageDirty(page) && pmd_dirty(old_pmd))
+                set_page_dirty(page);
+            if (!PageReferenced(page) && pmd_young(old_pmd))
+                SetPageReferenced(page);
+            page_remove_rmap(page, true);
+            put_page(page);
+        }
         add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
         return;
-    } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+    }
+
+    if (is_huge_zero_pmd(*pmd)) {
         /*
          * FIXME: Do we want to invalidate secondary mmu by calling
          * mmu_notifier_invalidate_range() see comments below inside
@@ -2338,17 +2350,17 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 static void unmap_page(struct page *page)
 {
-    enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
+    enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
         TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
-    bool unmap_success;
     VM_BUG_ON_PAGE(!PageHead(page), page);
     if (PageAnon(page))
         ttu_flags |= TTU_SPLIT_FREEZE;
-    unmap_success = try_to_unmap(page, ttu_flags);
-    VM_BUG_ON_PAGE(!unmap_success, page);
+    try_to_unmap(page, ttu_flags);
+
+    VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
 }
 static void remap_page(struct page *page, unsigned int nr)
@@ -2659,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
     struct deferred_split *ds_queue = get_deferred_split_queue(head);
     struct anon_vma *anon_vma = NULL;
     struct address_space *mapping = NULL;
-    int count, mapcount, extra_pins, ret;
+    int extra_pins, ret;
     pgoff_t end;
     VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2718,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
     }
     unmap_page(head);
-    VM_BUG_ON_PAGE(compound_mapcount(head), head);
     /* block interrupt reentry in xa_lock and spinlock */
     local_irq_disable();
@@ -2736,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
     /* Prevent deferred_split_scan() touching ->_refcount */
     spin_lock(&ds_queue->split_queue_lock);
-    count = page_count(head);
-    mapcount = total_mapcount(head);
-    if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
+    if (page_ref_freeze(head, 1 + extra_pins)) {
         if (!list_empty(page_deferred_list(head))) {
             ds_queue->split_queue_len--;
             list_del(page_deferred_list(head));
@@ -2758,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         __split_huge_page(page, list, end);
         ret = 0;
     } else {
-        if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-            pr_alert("total_mapcount: %u, page_count(): %u\n",
-                    mapcount, count);
-            if (PageTail(page))
-                dump_page(head, NULL);
-            dump_page(page, "total_mapcount(head) > 0");
-            BUG();
-        }
         spin_unlock(&ds_queue->split_queue_lock);
-fail:       if (mapping)
+fail:
+        if (mapping)
             xa_unlock(&mapping->i_pages);
         local_irq_enable();
         remap_page(head, thp_nr_pages(head));
...
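Why caching huge_zero_pfn makes is_huge_zero_pmd() both safe and quick: a reader can compare a pmd's pfn against a published value without ever dereferencing huge_zero_page, which the shrinker may be freeing concurrently. A rough userspace analogy, with C11 atomics standing in for WRITE_ONCE()/READ_ONCE(); all names below are illustrative, not kernel code.

    #include <stdatomic.h>
    #include <stdio.h>

    #define NO_PFN (~0UL)

    /* Published by whoever allocates/frees the special page. */
    static _Atomic unsigned long special_pfn = NO_PFN;

    static void publish_special(unsigned long pfn)
    {
        atomic_store_explicit(&special_pfn, pfn, memory_order_relaxed);
    }

    static void retire_special(void)
    {
        /* Reset to the sentinel before the page itself is freed. */
        atomic_store_explicit(&special_pfn, NO_PFN, memory_order_relaxed);
    }

    /* Cheap check: a single load and compare, no pointer dereference. */
    static int is_special_pfn(unsigned long pfn)
    {
        return atomic_load_explicit(&special_pfn, memory_order_relaxed) == pfn;
    }

    int main(void)
    {
        printf("%d\n", is_special_pfn(42));  /* 0: nothing published yet */
        publish_special(42);
        printf("%d\n", is_special_pfn(42));  /* 1 */
        retire_special();
        printf("%d\n", is_special_pfn(42));  /* 0 again, after retirement */
        return 0;
    }

Initialising the cached value to a sentinel (~0UL above, as in the hunk) means the check is false both before the page exists and after it has been retired.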
@@ -2121,12 +2121,18 @@ static void return_unused_surplus_pages(struct hstate *h,
  * be restored when a newly allocated huge page must be freed.  It is
  * to be called after calling vma_needs_reservation to determine if a
  * reservation exists.
+ *
+ * vma_del_reservation is used in error paths where an entry in the reserve
+ * map was created during huge page allocation and must be removed.  It is to
+ * be called after calling vma_needs_reservation to determine if a reservation
+ * exists.
  */
 enum vma_resv_mode {
     VMA_NEEDS_RESV,
     VMA_COMMIT_RESV,
     VMA_END_RESV,
     VMA_ADD_RESV,
+    VMA_DEL_RESV,
 };
 static long __vma_reservation_common(struct hstate *h,
                 struct vm_area_struct *vma, unsigned long addr,
@@ -2170,11 +2176,21 @@ static long __vma_reservation_common(struct hstate *h,
             ret = region_del(resv, idx, idx + 1);
         }
         break;
+    case VMA_DEL_RESV:
+        if (vma->vm_flags & VM_MAYSHARE) {
+            region_abort(resv, idx, idx + 1, 1);
+            ret = region_del(resv, idx, idx + 1);
+        } else {
+            ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+            /* region_add calls of range 1 should never fail. */
+            VM_BUG_ON(ret < 0);
+        }
+        break;
     default:
         BUG();
     }
-    if (vma->vm_flags & VM_MAYSHARE)
+    if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
         return ret;
     /*
      * We know private mapping must have HPAGE_RESV_OWNER set.
@@ -2222,25 +2238,39 @@ static long vma_add_reservation(struct hstate *h,
     return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
 }
+static long vma_del_reservation(struct hstate *h,
+            struct vm_area_struct *vma, unsigned long addr)
+{
+    return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
+}
+
 /*
- * This routine is called to restore a reservation on error paths.  In the
- * specific error paths, a huge page was allocated (via alloc_huge_page)
- * and is about to be freed.  If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set
- * HPageRestoreReserve in the newly allocated page.  When the page is freed
- * via free_huge_page, the global reservation count will be incremented if
- * HPageRestoreReserve is set.  However, free_huge_page can not adjust the
- * reserve map.  Adjust the reserve map here to be consistent with global
- * reserve count adjustments to be made by free_huge_page.
+ * This routine is called to restore reservation information on error paths.
+ * It should ONLY be called for pages allocated via alloc_huge_page(), and
+ * the hugetlb mutex should remain held when calling this routine.
+ *
+ * It handles two specific cases:
+ * 1) A reservation was in place and the page consumed the reservation.
+ *    HPageRestoreReserve is set in the page.
+ * 2) No reservation was in place for the page, so HPageRestoreReserve is
+ *    not set.  However, alloc_huge_page always updates the reserve map.
+ *
+ * In case 1, free_huge_page later in the error path will increment the
+ * global reserve count.  But, free_huge_page does not have enough context
+ * to adjust the reservation map.  This case deals primarily with private
+ * mappings.  Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page.  Make sure the
+ * reserve map indicates there is a reservation present.
+ *
+ * In case 2, simply undo reserve map modifications done by alloc_huge_page.
  */
-static void restore_reserve_on_error(struct hstate *h,
-            struct vm_area_struct *vma, unsigned long address,
-            struct page *page)
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+            unsigned long address, struct page *page)
 {
-    if (unlikely(HPageRestoreReserve(page))) {
-        long rc = vma_needs_reservation(h, vma, address);
-        if (unlikely(rc < 0)) {
+    long rc = vma_needs_reservation(h, vma, address);
+
+    if (HPageRestoreReserve(page)) {
+        if (unlikely(rc < 0))
             /*
              * Rare out of memory condition in reserve map
              * manipulation.  Clear HPageRestoreReserve so that
@@ -2253,16 +2283,57 @@ static void restore_reserve_on_error(struct hstate *h,
              * accounting of reserve counts.
              */
             ClearHPageRestoreReserve(page);
-        } else if (rc) {
-            rc = vma_add_reservation(h, vma, address);
-            if (unlikely(rc < 0))
-                /*
-                 * See above comment about rare out of
-                 * memory condition.
-                 */
-                ClearHPageRestoreReserve(page);
-        } else
-            vma_end_reservation(h, vma, address);
+        else if (rc)
+            (void)vma_add_reservation(h, vma, address);
+        else
+            vma_end_reservation(h, vma, address);
+    } else {
+        if (!rc) {
+            /*
+             * This indicates there is an entry in the reserve map
+             * added by alloc_huge_page.  We know it was added
+             * before the alloc_huge_page call, otherwise
+             * HPageRestoreReserve would be set on the page.
+             * Remove the entry so that a subsequent allocation
+             * does not consume a reservation.
+             */
+            rc = vma_del_reservation(h, vma, address);
+            if (rc < 0)
+                /*
+                 * VERY rare out of memory condition.  Since
+                 * we can not delete the entry, set
+                 * HPageRestoreReserve so that the reserve
+                 * count will be incremented when the page
+                 * is freed.  This reserve will be consumed
+                 * on a subsequent allocation.
+                 */
+                SetHPageRestoreReserve(page);
+        } else if (rc < 0) {
+            /*
+             * Rare out of memory condition from
+             * vma_needs_reservation call.  Memory allocation is
+             * only attempted if a new entry is needed.  Therefore,
+             * this implies there is not an entry in the
+             * reserve map.
+             *
+             * For shared mappings, no entry in the map indicates
+             * no reservation.  We are done.
+             */
+            if (!(vma->vm_flags & VM_MAYSHARE))
+                /*
+                 * For private mappings, no entry indicates
+                 * a reservation is present.  Since we can
+                 * not add an entry, set SetHPageRestoreReserve
+                 * on the page so reserve count will be
+                 * incremented when freed.  This reserve will
+                 * be consumed on a subsequent allocation.
+                 */
+                SetHPageRestoreReserve(page);
+        } else
+            /*
+             * No reservation present, do nothing
+             */
+            vma_end_reservation(h, vma, address);
     }
 }
@@ -4037,6 +4108,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
             spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
             entry = huge_ptep_get(src_pte);
             if (!pte_same(src_pte_old, entry)) {
+                restore_reserve_on_error(h, vma, addr,
+                                new);
                 put_page(new);
                 /* dst_entry won't change as in child */
                 goto again;
@@ -5006,6 +5079,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
     if (vm_shared || is_continue)
         unlock_page(page);
 out_release_nounlock:
+    restore_reserve_on_error(h, dst_vma, dst_addr, page);
     put_page(page);
     goto out;
 }
@@ -5857,6 +5931,21 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
     return ret;
 }
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+    int ret = 0;
+
+    *hugetlb = false;
+    spin_lock_irq(&hugetlb_lock);
+    if (PageHeadHuge(page)) {
+        *hugetlb = true;
+        if (HPageFreed(page) || HPageMigratable(page))
+            ret = get_page_unless_zero(page);
+    }
+    spin_unlock_irq(&hugetlb_lock);
+    return ret;
+}
+
 void putback_active_hugepage(struct page *page)
 {
     spin_lock_irq(&hugetlb_lock);
...
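The two cases documented above can be restated as a toy decision function with the reserve-map operations stubbed out. This is a sketch of the control flow only, not the kernel implementation; 'rc' mirrors what vma_needs_reservation() reports (negative on allocation failure, zero when an entry exists, positive when one would be needed), and the failure path of the delete itself is omitted.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stubs standing in for reserve-map and page-flag operations. */
    static void clear_restore_reserve(void) { puts("clear HPageRestoreReserve"); }
    static void set_restore_reserve(void)   { puts("set HPageRestoreReserve"); }
    static void add_reservation(void)       { puts("re-add reserve map entry"); }
    static void del_reservation(void)       { puts("delete reserve map entry"); }
    static void end_reservation(void)       { puts("end reservation"); }

    static void restore_reserve_on_error_sketch(bool restore_reserve_set,
                                                bool shared, long rc)
    {
        if (restore_reserve_set) {
            /* Case 1: the page consumed a reservation. */
            if (rc < 0)
                clear_restore_reserve();
            else if (rc)
                add_reservation();
            else
                end_reservation();
        } else {
            /* Case 2: alloc_huge_page added an entry that must be undone. */
            if (rc == 0)
                del_reservation();
            else if (rc < 0 && !shared)
                set_restore_reserve();
            else if (rc > 0)
                end_reservation();
        }
    }

    int main(void)
    {
        restore_reserve_on_error_sketch(true, false, 1);   /* private, reservation consumed */
        restore_reserve_on_error_sketch(false, false, 0);  /* stale reserve map entry */
        return 0;
    }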
@@ -384,27 +384,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 /*
- * At what user virtual address is page expected in @vma?
+ * At what user virtual address is page expected in vma?
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
  */
 static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct page *page, struct vm_area_struct *vma)
 {
-    pgoff_t pgoff = page_to_pgoff(page);
-    return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+    pgoff_t pgoff;
+    unsigned long address;
+
+    VM_BUG_ON_PAGE(PageKsm(page), page);    /* KSM page->index unusable */
+    pgoff = page_to_pgoff(page);
+    if (pgoff >= vma->vm_pgoff) {
+        address = vma->vm_start +
+            ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+        /* Check for address beyond vma (or wrapped through 0?) */
+        if (address < vma->vm_start || address >= vma->vm_end)
+            address = -EFAULT;
+    } else if (PageHead(page) &&
+               pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+        /* Test above avoids possibility of wrap to 0 on 32-bit */
+        address = vma->vm_start;
+    } else {
+        address = -EFAULT;
+    }
+    return address;
 }
+/*
+ * Then at what user virtual address will none of the page be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ * If page is a compound head, the entire compound page is considered.
+ */
 static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address_end(struct page *page, struct vm_area_struct *vma)
 {
-    unsigned long start, end;
+    pgoff_t pgoff;
+    unsigned long address;
-    start = __vma_address(page, vma);
-    end = start + thp_size(page) - PAGE_SIZE;
+    VM_BUG_ON_PAGE(PageKsm(page), page);    /* KSM page->index unusable */
+    pgoff = page_to_pgoff(page) + compound_nr(page);
-    /* page should be within @vma mapping range */
-    VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
+    address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+    /* Check for address beyond vma (or wrapped through 0?) */
+    if (address < vma->vm_start || address > vma->vm_end)
-    return max(start, vma->vm_start);
+        address = vma->vm_end;
+    return address;
 }
 static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
...
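The address arithmetic in the new vma_address() can be checked standalone. A small self-contained program (toy types and names, not the kernel helpers) covering the three outcomes: a normal translation, a page wholly outside the vma, and a compound head that starts before the vma but whose tail still overlaps it.

    #include <stdio.h>

    #define PAGE_SHIFT 12UL
    #define EFAULT_ADDR (~0UL)            /* stand-in for returning -EFAULT */

    struct toy_vma {
        unsigned long vm_start, vm_end;   /* user virtual address range */
        unsigned long vm_pgoff;           /* file offset of vm_start, in pages */
    };

    /* Expected user address of a page of 'nr' subpages at file offset 'pgoff'. */
    static unsigned long toy_vma_address(unsigned long pgoff, unsigned long nr,
                                         const struct toy_vma *vma)
    {
        unsigned long address;

        if (pgoff >= vma->vm_pgoff) {
            address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
            if (address < vma->vm_start || address >= vma->vm_end)
                return EFAULT_ADDR;       /* page starts beyond the vma */
            return address;
        }
        /* Head starts before the vma: usable iff some tail page still overlaps it. */
        if (nr > 1 && pgoff + nr - 1 >= vma->vm_pgoff)
            return vma->vm_start;
        return EFAULT_ADDR;
    }

    int main(void)
    {
        struct toy_vma vma = { 0x100000, 0x300000, 16 };   /* maps file pages 16..527 */

        printf("%lx\n", toy_vma_address(20, 1, &vma));     /* 0x104000 */
        printf("%lx\n", toy_vma_address(8, 1, &vma));      /* ffffffffffffffff: before the vma */
        printf("%lx\n", toy_vma_address(8, 512, &vma));    /* 0x100000: THP head before, tail inside */
        return 0;
    }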
@@ -949,6 +949,17 @@ static int page_action(struct page_state *ps, struct page *p,
     return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
 }
+/*
+ * Return true if a page type of a given page is supported by hwpoison
+ * mechanism (while handling could fail), otherwise false.  This function
+ * does not return true for hugetlb or device memory pages, so it's assumed
+ * to be called only in the context where we never have such pages.
+ */
+static inline bool HWPoisonHandlable(struct page *page)
+{
+    return PageLRU(page) || __PageMovable(page);
+}
+
 /**
  * __get_hwpoison_page() - Get refcount for memory error handling:
  * @page: raw error page (hit by memory error)
@@ -959,8 +970,22 @@ static int page_action(struct page_state *ps, struct page *p,
 static int __get_hwpoison_page(struct page *page)
 {
     struct page *head = compound_head(page);
+    int ret = 0;
+    bool hugetlb = false;
+
+    ret = get_hwpoison_huge_page(head, &hugetlb);
+    if (hugetlb)
+        return ret;
+
+    /*
+     * This check prevents from calling get_hwpoison_unless_zero()
+     * for any unsupported type of page in order to reduce the risk of
+     * unexpected races caused by taking a page refcount.
+     */
+    if (!HWPoisonHandlable(head))
+        return 0;
-    if (!PageHuge(head) && PageTransHuge(head)) {
+    if (PageTransHuge(head)) {
         /*
          * Non anonymous thp exists only in allocation/free time. We
          * can't handle such a case correctly, so let's give it up.
@@ -1017,7 +1042,7 @@ static int get_any_page(struct page *p, unsigned long flags)
             ret = -EIO;
         }
     } else {
-        if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+        if (PageHuge(p) || HWPoisonHandlable(p)) {
             ret = 1;
         } else {
             /*
@@ -1527,7 +1552,12 @@ int memory_failure(unsigned long pfn, int flags)
         return 0;
     }
-    if (!PageTransTail(p) && !PageLRU(p))
+    /*
+     * __munlock_pagevec may clear a writeback page's LRU flag without
+     * page_lock. We need wait writeback completion for this page or it
+     * may trigger vfs BUG while evict inode.
+     */
+    if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
         goto identify_page_state;
     /*
...
@@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
             else if (zap_huge_pmd(tlb, vma, pmd, addr))
                 goto next;
             /* fall through */
+        } else if (details && details->single_page &&
+                   PageTransCompound(details->single_page) &&
+                   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+            spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+            /*
+             * Take and drop THP pmd lock so that we cannot return
+             * prematurely, while zap_huge_pmd() has cleared *pmd,
+             * but not yet decremented compound_mapcount().
+             */
+            spin_unlock(ptl);
         }
         /*
          * Here there can be other concurrent MADV_DONTNEED or
          * trans huge page faults running, and if the pmd is
@@ -3236,6 +3247,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
     }
 }
+/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+    struct address_space *mapping = page->mapping;
+    struct zap_details details = { };
+
+    VM_BUG_ON(!PageLocked(page));
+    VM_BUG_ON(PageTail(page));
+
+    details.check_mapping = mapping;
+    details.first_index = page->index;
+    details.last_index = page->index + thp_nr_pages(page) - 1;
+    details.single_page = page;
+
+    i_mmap_lock_write(mapping);
+    if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+        unmap_mapping_range_tree(&mapping->i_mmap, &details);
+    i_mmap_unlock_write(mapping);
+}
+
 /**
  * unmap_mapping_pages() - Unmap pages from processes.
  * @mapping: The address space containing pages to be unmapped.
...
@@ -295,6 +295,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
         goto out;
     page = migration_entry_to_page(entry);
+    page = compound_head(page);
     /*
      * Once page cache replacement of page migration started, page_count
...
@@ -212,23 +212,34 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
             pvmw->ptl = NULL;
         }
     } else if (!pmd_present(pmde)) {
+        /*
+         * If PVMW_SYNC, take and drop THP pmd lock so that we
+         * cannot return prematurely, while zap_huge_pmd() has
+         * cleared *pmd but not decremented compound_mapcount().
+         */
+        if ((pvmw->flags & PVMW_SYNC) &&
+            PageTransCompound(pvmw->page)) {
+            spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+            spin_unlock(ptl);
+        }
         return false;
     }
     if (!map_pte(pvmw))
         goto next_pte;
     while (1) {
+        unsigned long end;
+
         if (check_pte(pvmw))
             return true;
 next_pte:
         /* Seek to next pte only makes sense for THP */
         if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
             return not_found(pvmw);
+        end = vma_address_end(pvmw->page, pvmw->vma);
         do {
             pvmw->address += PAGE_SIZE;
-            if (pvmw->address >= pvmw->vma->vm_end ||
-                pvmw->address >=
-                    __vma_address(pvmw->page, pvmw->vma) +
-                    thp_size(pvmw->page))
+            if (pvmw->address >= end)
                 return not_found(pvmw);
             /* Did we cross page table boundary? */
             if (pvmw->address % PMD_SIZE == 0) {
@@ -266,14 +277,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
         .vma = vma,
         .flags = PVMW_SYNC,
     };
-    unsigned long start, end;
-
-    start = __vma_address(page, vma);
-    end = start + thp_size(page) - PAGE_SIZE;
-    if (unlikely(end < vma->vm_start || start >= vma->vm_end))
+    pvmw.address = vma_address(page, vma);
+    if (pvmw.address == -EFAULT)
         return 0;
-    pvmw.address = max(start, vma->vm_start);
     if (!page_vma_mapped_walk(&pvmw))
         return 0;
     page_vma_mapped_walk_done(&pvmw);
...
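The take-and-drop pmd_lock trick used here (and in zap_pmd_range() above) reduces to a generic pattern: if a racer updates two pieces of state inside one lock, then briefly acquiring and releasing that same lock after observing the first update guarantees the racer's whole critical section has finished before we draw a conclusion from the second. A pthread sketch of that pattern follows; nothing below is kernel API, and it builds with "cc -pthread".

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_int entry_present = 1;
    static atomic_int map_count = 1;

    /* Racer: clears the entry and then the mapcount, all inside one critical section. */
    static void *zap_thread(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        entry_present = 0;
        map_count = 0;
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    /*
     * Checker: if a lockless peek says the entry is gone, take and drop the same
     * lock so the racer's whole critical section is finished before we look at
     * the mapcount; otherwise we could see entry_present == 0 with map_count
     * still 1 and report a stale answer.
     */
    static int still_mapped(void)
    {
        if (!entry_present) {
            pthread_mutex_lock(&lock);
            pthread_mutex_unlock(&lock);
        }
        return map_count != 0;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, zap_thread, NULL);
        printf("still mapped? %d\n", still_mapped());
        pthread_join(t, NULL);
        return 0;
    }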
@@ -135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 {
     pmd_t pmd;
     VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-    VM_BUG_ON(!pmd_present(*pmdp));
-    /* Below assumes pmd_present() is true */
-    VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+    VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+              !pmd_devmap(*pmdp));
     pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
     flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
     return pmd;
...
@@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
  */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
-    unsigned long address;
     if (PageAnon(page)) {
         struct anon_vma *page__anon_vma = page_anon_vma(page);
         /*
@@ -717,15 +716,13 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
         if (!vma->anon_vma || !page__anon_vma ||
             vma->anon_vma->root != page__anon_vma->root)
             return -EFAULT;
-    } else if (page->mapping) {
-        if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
-            return -EFAULT;
-    } else
+    } else if (!vma->vm_file) {
         return -EFAULT;
-    address = __vma_address(page, vma);
-    if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+    } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
         return -EFAULT;
-    return address;
+    }
+    return vma_address(page, vma);
 }
 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
@@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
      */
     mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                 0, vma, vma->vm_mm, address,
-                min(vma->vm_end, address + page_size(page)));
+                vma_address_end(page, vma));
     mmu_notifier_invalidate_range_start(&range);
     while (page_vma_mapped_walk(&pvmw)) {
@@ -1405,6 +1402,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
     struct mmu_notifier_range range;
     enum ttu_flags flags = (enum ttu_flags)(long)arg;
+    /*
+     * When racing against e.g. zap_pte_range() on another cpu,
+     * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+     * try_to_unmap() may return false when it is about to become true,
+     * if page table locking is skipped: use TTU_SYNC to wait for that.
+     */
+    if (flags & TTU_SYNC)
+        pvmw.flags = PVMW_SYNC;
+
     /* munlock has nothing to gain from examining un-locked vmas */
     if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
         return true;
@@ -1426,9 +1432,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
      * Note that the page can not be free in this function as call of
      * try_to_unmap() must hold a reference on the page.
      */
+    range.end = PageKsm(page) ?
+            address + PAGE_SIZE : vma_address_end(page, vma);
     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-                address,
-                min(vma->vm_end, address + page_size(page)));
+                address, range.end);
     if (PageHuge(page)) {
         /*
          * If sharing is possible, start and end will be adjusted
@@ -1777,7 +1784,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
     else
         rmap_walk(page, &rwc);
-    return !page_mapcount(page) ? true : false;
+    /*
+     * When racing against e.g. zap_pte_range() on another cpu,
+     * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+     * try_to_unmap() may return false when it is about to become true,
+     * if page table locking is skipped: use TTU_SYNC to wait for that.
+     */
+    return !page_mapcount(page);
 }
 /**
@@ -1874,6 +1887,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
         struct vm_area_struct *vma = avc->vma;
         unsigned long address = vma_address(page, vma);
+        VM_BUG_ON_VMA(address == -EFAULT, vma);
         cond_resched();
         if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
@@ -1928,6 +1942,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
                 pgoff_start, pgoff_end) {
         unsigned long address = vma_address(page, vma);
+        VM_BUG_ON_VMA(address == -EFAULT, vma);
         cond_resched();
         if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
...
@@ -97,8 +97,7 @@ EXPORT_SYMBOL(kmem_cache_size);
 #ifdef CONFIG_DEBUG_VM
 static int kmem_cache_sanity_check(const char *name, unsigned int size)
 {
-    if (!name || in_interrupt() || size < sizeof(void *) ||
-        size > KMALLOC_MAX_SIZE) {
+    if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
         pr_err("kmem_cache_create(%s) integrity check failed\n", name);
         return -EINVAL;
     }
...
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/bit_spinlock.h>
 #include <linux/interrupt.h>
+#include <linux/swab.h>
 #include <linux/bitops.h>
 #include <linux/slab.h>
 #include "slab.h"
@@ -712,15 +713,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
            p, p - addr, get_freepointer(s, p));
     if (s->flags & SLAB_RED_ZONE)
-        print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+        print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
                       s->red_left_pad);
     else if (p > addr + 16)
         print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
-    print_section(KERN_ERR, "Object ", p,
+    print_section(KERN_ERR, "Object   ", p,
                   min_t(unsigned int, s->object_size, PAGE_SIZE));
     if (s->flags & SLAB_RED_ZONE)
-        print_section(KERN_ERR, "Redzone ", p + s->object_size,
+        print_section(KERN_ERR, "Redzone  ", p + s->object_size,
                       s->inuse - s->object_size);
     off = get_info_end(s);
@@ -732,7 +733,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
     if (off != size_from_object(s))
         /* Beginning of the filler is the free pointer */
-        print_section(KERN_ERR, "Padding ", p + off,
+        print_section(KERN_ERR, "Padding  ", p + off,
                       size_from_object(s) - off);
     dump_stack();
@@ -909,11 +910,11 @@ static int check_object(struct kmem_cache *s, struct page *page,
     u8 *endobject = object + s->object_size;
     if (s->flags & SLAB_RED_ZONE) {
-        if (!check_bytes_and_report(s, page, object, "Redzone",
+        if (!check_bytes_and_report(s, page, object, "Left Redzone",
             object - s->red_left_pad, val, s->red_left_pad))
             return 0;
-        if (!check_bytes_and_report(s, page, object, "Redzone",
+        if (!check_bytes_and_report(s, page, object, "Right Redzone",
             endobject, val, s->inuse - s->object_size))
             return 0;
     } else {
@@ -928,7 +929,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
         if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
             (!check_bytes_and_report(s, page, p, "Poison", p,
                     POISON_FREE, s->object_size - 1) ||
-             !check_bytes_and_report(s, page, p, "Poison",
+             !check_bytes_and_report(s, page, p, "End Poison",
                 p + s->object_size - 1, POISON_END, 1)))
             return 0;
         /*
@@ -3689,7 +3690,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 {
     slab_flags_t flags = s->flags;
     unsigned int size = s->object_size;
-    unsigned int freepointer_area;
     unsigned int order;
     /*
@@ -3698,13 +3698,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
      * the possible location of the free pointer.
      */
     size = ALIGN(size, sizeof(void *));
-    /*
-     * This is the area of the object where a freepointer can be
-     * safely written. If redzoning adds more to the inuse size, we
-     * can't use that portion for writing the freepointer, so
-     * s->offset must be limited within this for the general case.
-     */
-    freepointer_area = size;
 #ifdef CONFIG_SLUB_DEBUG
     /*
@@ -3730,19 +3723,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
     /*
      * With that we have determined the number of bytes in actual use
-     * by the object. This is the potential offset to the free pointer.
+     * by the object and redzoning.
      */
     s->inuse = size;
-    if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
-        s->ctor)) {
+    if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+        ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+        s->ctor) {
         /*
          * Relocate free pointer after the object if it is not
          * permitted to overwrite the first word of the object on
          * kmem_cache_free.
          *
          * This is the case if we do RCU, have a constructor or
-         * destructor or are poisoning the objects.
+         * destructor, are poisoning the objects, or are
+         * redzoning an object smaller than sizeof(void *).
          *
          * The assumption that s->offset >= s->inuse means free
          * pointer is outside of the object is used in the
@@ -3751,13 +3746,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
          */
         s->offset = size;
         size += sizeof(void *);
-    } else if (freepointer_area > sizeof(void *)) {
+    } else {
         /*
          * Store freelist pointer near middle of object to keep
          * it away from the edges of the object to avoid small
         * sized over/underflows from neighboring allocations.
         */
-        s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
+        s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
     }
 #ifdef CONFIG_SLUB_DEBUG
...
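A toy calculation of where calculate_sizes() now places the freelist pointer: appended after the object when a constructor, RCU freeing, poisoning, or a redzoned object smaller than a pointer forbids reusing the object area, otherwise near the middle of the object, aligned down to a pointer boundary. Plain C; the struct, flags, and helper below are invented for illustration and are not the SLUB data structures.

    #include <stdio.h>

    #define ALIGN_UP(x, a)   (((x) + (a) - 1) / (a) * (a))
    #define ALIGN_DOWN(x, a) ((x) / (a) * (a))

    struct toy_cache {
        unsigned int object_size;   /* bytes requested by the cache user */
        int has_ctor, rcu, poison, red_zone;
        unsigned int offset;        /* where the free pointer will live */
        unsigned int size;          /* per-object footprint so far */
    };

    static void place_freepointer(struct toy_cache *s)
    {
        unsigned int size = ALIGN_UP(s->object_size, sizeof(void *));

        if (s->rcu || s->poison || s->has_ctor ||
            (s->red_zone && s->object_size < sizeof(void *))) {
            /* Must not scribble over the (live or poisoned) object: append it. */
            s->offset = size;
            size += sizeof(void *);
        } else {
            /* Keep it away from both edges to survive small overflows. */
            s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
        }
        s->size = size;
    }

    int main(void)
    {
        struct toy_cache small = { .object_size = 4, .red_zone = 1 };
        struct toy_cache big   = { .object_size = 200 };

        place_freepointer(&small);
        place_freepointer(&big);
        printf("4-byte redzoned object: fp at %u, size %u\n", small.offset, small.size);
        printf("200-byte object: fp at %u, size %u\n", big.offset, big.size);
        return 0;
    }

On a 64-bit build this prints an out-of-line pointer at offset 8 for the tiny redzoned object (so the redzone bytes are never overwritten by the freelist) and an in-object pointer at offset 96 for the 200-byte object.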
@@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
     return sizeof(struct mem_section_usage) + usemap_size();
 }
+static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
+{
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+    return __pa_symbol(pgdat);
+#else
+    return __pa(pgdat);
+#endif
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
@@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
      * from the same section as the pgdat where possible to avoid
      * this problem.
      */
-    goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+    goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
     limit = goal + (1UL << PA_SECTION_SHIFT);
     nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
@@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
     }
     usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
-    pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+    pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
     if (usemap_snr == pgdat_snr)
         return;
...
@@ -1900,7 +1900,7 @@ unsigned int count_swap_pages(int type, int free)
 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
 {
-    return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
+    return pte_same(pte_swp_clear_flags(pte), swp_pte);
 }
 /*
...
@@ -167,13 +167,10 @@ void do_invalidatepage(struct page *page, unsigned int offset,
  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
  */
-static void
-truncate_cleanup_page(struct address_space *mapping, struct page *page)
+static void truncate_cleanup_page(struct page *page)
 {
-    if (page_mapped(page)) {
-        unsigned int nr = thp_nr_pages(page);
-        unmap_mapping_pages(mapping, page->index, nr, false);
-    }
+    if (page_mapped(page))
+        unmap_mapping_page(page);
     if (page_has_private(page))
         do_invalidatepage(page, 0, thp_size(page));
@@ -218,7 +215,7 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
     if (page->mapping != mapping)
         return -EIO;
-    truncate_cleanup_page(mapping, page);
+    truncate_cleanup_page(page);
     delete_from_page_cache(page);
     return 0;
 }
@@ -325,7 +322,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
         index = indices[pagevec_count(&pvec) - 1] + 1;
         truncate_exceptional_pvec_entries(mapping, &pvec, indices);
         for (i = 0; i < pagevec_count(&pvec); i++)
-            truncate_cleanup_page(mapping, pvec.pages[i]);
+            truncate_cleanup_page(pvec.pages[i]);
         delete_from_page_cache_batch(mapping, &pvec);
         for (i = 0; i < pagevec_count(&pvec); i++)
             unlock_page(pvec.pages[i]);
@@ -639,6 +636,16 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
                 continue;
             }
+            if (!did_range_unmap && page_mapped(page)) {
+                /*
+                 * If page is mapped, before taking its lock,
+                 * zap the rest of the file in one hit.
+                 */
+                unmap_mapping_pages(mapping, index,
+                        (1 + end - index), false);
+                did_range_unmap = 1;
+            }
+
             lock_page(page);
             WARN_ON(page_to_index(page) != index);
             if (page->mapping != mapping) {
@@ -646,23 +653,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
                 continue;
             }
             wait_on_page_writeback(page);
-            if (page_mapped(page)) {
-                if (!did_range_unmap) {
-                    /*
-                     * Zap the rest of the file in one hit.
-                     */
-                    unmap_mapping_pages(mapping, index,
-                        (1 + end - index), false);
-                    did_range_unmap = 1;
-                } else {
-                    /*
-                     * Just zap this page
-                     */
-                    unmap_mapping_pages(mapping, index,
-                                1, false);
-                }
-            }
+
+            if (page_mapped(page))
+                unmap_mapping_page(page);
             BUG_ON(page_mapped(page));
             ret2 = do_launder_page(mapping, page);
             if (ret2 == 0) {
                 if (!invalidate_complete_page2(mapping, page))
...