Commit 70585216 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "18 patches.

  Subsystems affected by this patch series: mm (memory-failure, swap,
  slub, hugetlb, memory-failure, slub, thp, sparsemem), and coredump"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/sparse: fix check_usemap_section_nr warnings
  mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
  mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
  mm/thp: fix page_address_in_vma() on file THP tails
  mm/thp: fix vma_address() if virtual address below file offset
  mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
  mm/thp: make is_huge_zero_pmd() safe and quicker
  mm/thp: fix __split_huge_pmd_locked() on shmem migration entry
  mm, thp: use head page in __migration_entry_wait()
  mm/slub.c: include swab.h
  crash_core, vmcoreinfo: append 'SECTION_SIZE_BITS' to vmcoreinfo
  mm/memory-failure: make sure wait for page writeback in memory_failure
  mm/hugetlb: expand restore_reserve_on_error functionality
  mm/slub: actually fix freelist pointer vs redzoning
  mm/slub: fix redzoning for small allocations
  mm/slub: clarify verification reporting
  mm/swap: fix pte_same_as_swp() not removing uffd-wp bit when compare
  mm,hwpoison: fix race with hugetlb page allocation
parents 6b00bc63 ccbd6283
......@@ -181,7 +181,7 @@ SLUB Debug output
Here is a sample of slub debug output::
====================================================================
BUG kmalloc-8: Redzone overwritten
BUG kmalloc-8: Right Redzone overwritten
--------------------------------------------------------------------
INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
......@@ -189,10 +189,10 @@ Here is a sample of slub debug output::
INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005
Redzone 0xc90f6d28: 00 cc cc cc .
Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
Object (0xc90f6d20): 31 30 31 39 2e 30 30 35 1019.005
Redzone (0xc90f6d28): 00 cc cc cc .
Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
[<c010523d>] dump_trace+0x63/0x1eb
[<c01053df>] show_trace_log_lvl+0x1a/0x2f
......
......@@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
__SetPageUptodate(page);
error = huge_add_to_page_cache(page, mapping, index);
if (unlikely(error)) {
restore_reserve_on_error(h, &pseudo_vma, addr, page);
put_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out;
......
......@@ -286,6 +286,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
extern struct page *huge_zero_page;
extern unsigned long huge_zero_pfn;
static inline bool is_huge_zero_page(struct page *page)
{
......@@ -294,7 +295,7 @@ static inline bool is_huge_zero_page(struct page *page)
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
return is_huge_zero_page(pmd_page(pmd));
return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd);
}
static inline bool is_huge_zero_pud(pud_t pud)
......@@ -440,6 +441,11 @@ static inline bool is_huge_zero_page(struct page *page)
return false;
}
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
return false;
}
static inline bool is_huge_zero_pud(pud_t pud)
{
return false;
......
......@@ -149,6 +149,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
bool isolate_huge_page(struct page *page, struct list_head *list);
int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
void putback_active_hugepage(struct page *page);
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
......@@ -339,6 +340,11 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
return false;
}
static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
{
return 0;
}
static inline void putback_active_hugepage(struct page *page)
{
}
......@@ -604,6 +610,8 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address);
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
unsigned long address, struct page *page);
/* arch callback */
int __init __alloc_bootmem_huge_page(struct hstate *h);
......
......@@ -1719,6 +1719,7 @@ struct zap_details {
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
struct page *single_page; /* Locked page to be unmapped */
};
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
......@@ -1766,6 +1767,7 @@ extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
extern int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
void unmap_mapping_page(struct page *page);
void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
......@@ -1786,6 +1788,7 @@ static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
BUG();
return -EFAULT;
}
static inline void unmap_mapping_page(struct page *page) { }
static inline void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
......
......@@ -91,6 +91,7 @@ enum ttu_flags {
TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */
TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */
TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible
* and caller guarantees they will
......
......@@ -23,6 +23,16 @@
#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
/* Clear all flags but only keep swp_entry_t related information */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
if (pte_swp_soft_dirty(pte))
pte = pte_swp_clear_soft_dirty(pte);
if (pte_swp_uffd_wp(pte))
pte = pte_swp_clear_uffd_wp(pte);
return pte;
}
/*
* Store a type+offset into a swp_entry_t in an arch-independent format
*/
......@@ -66,10 +76,7 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
{
swp_entry_t arch_entry;
if (pte_swp_soft_dirty(pte))
pte = pte_swp_clear_soft_dirty(pte);
if (pte_swp_uffd_wp(pte))
pte = pte_swp_clear_uffd_wp(pte);
pte = pte_swp_clear_flags(pte);
arch_entry = __pte_to_swp_entry(pte);
return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}
......
......@@ -464,6 +464,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
#endif
VMCOREINFO_STRUCT_SIZE(page);
......
......@@ -62,6 +62,7 @@ static struct shrinker deferred_split_shrinker;
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
......@@ -98,6 +99,7 @@ static bool get_huge_zero_page(void)
__free_pages(zero_page, compound_order(zero_page));
goto retry;
}
WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
......@@ -147,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
WRITE_ONCE(huge_zero_pfn, ~0UL);
__free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
......@@ -2044,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
......@@ -2053,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
zap_deposited_table(mm, pmd);
if (vma_is_special_huge(vma))
return;
page = pmd_page(_pmd);
if (!PageDirty(page) && pmd_dirty(_pmd))
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
page = migration_entry_to_page(entry);
} else {
page = pmd_page(old_pmd);
if (!PageDirty(page) && pmd_dirty(old_pmd))
set_page_dirty(page);
if (!PageReferenced(page) && pmd_young(_pmd))
if (!PageReferenced(page) && pmd_young(old_pmd))
SetPageReferenced(page);
page_remove_rmap(page, true);
put_page(page);
}
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
} else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
}
if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
......@@ -2338,17 +2350,17 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
bool unmap_success;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
unmap_success = try_to_unmap(page, ttu_flags);
VM_BUG_ON_PAGE(!unmap_success, page);
try_to_unmap(page, ttu_flags);
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct page *page, unsigned int nr)
......@@ -2659,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
int extra_pins, ret;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
......@@ -2718,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
......@@ -2736,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
if (page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
......@@ -2758,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__split_huge_page(page, list, end);
ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
mapcount, count);
if (PageTail(page))
dump_page(head, NULL);
dump_page(page, "total_mapcount(head) > 0");
BUG();
}
spin_unlock(&ds_queue->split_queue_lock);
fail: if (mapping)
fail:
if (mapping)
xa_unlock(&mapping->i_pages);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
......
......@@ -2121,12 +2121,18 @@ static void return_unused_surplus_pages(struct hstate *h,
* be restored when a newly allocated huge page must be freed. It is
* to be called after calling vma_needs_reservation to determine if a
* reservation exists.
*
* vma_del_reservation is used in error paths where an entry in the reserve
* map was created during huge page allocation and must be removed. It is to
* be called after calling vma_needs_reservation to determine if a reservation
* exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
VMA_ADD_RESV,
VMA_DEL_RESV,
};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
......@@ -2170,11 +2176,21 @@ static long __vma_reservation_common(struct hstate *h,
ret = region_del(resv, idx, idx + 1);
}
break;
case VMA_DEL_RESV:
if (vma->vm_flags & VM_MAYSHARE) {
region_abort(resv, idx, idx + 1, 1);
ret = region_del(resv, idx, idx + 1);
} else {
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
/* region_add calls of range 1 should never fail. */
VM_BUG_ON(ret < 0);
}
break;
default:
BUG();
}
if (vma->vm_flags & VM_MAYSHARE)
if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
return ret;
/*
* We know private mapping must have HPAGE_RESV_OWNER set.
......@@ -2222,25 +2238,39 @@ static long vma_add_reservation(struct hstate *h,
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
static long vma_del_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
}
/*
* This routine is called to restore a reservation on error paths. In the
* specific error paths, a huge page was allocated (via alloc_huge_page)
* and is about to be freed. If a reservation for the page existed,
* alloc_huge_page would have consumed the reservation and set
* HPageRestoreReserve in the newly allocated page. When the page is freed
* via free_huge_page, the global reservation count will be incremented if
* HPageRestoreReserve is set. However, free_huge_page can not adjust the
* reserve map. Adjust the reserve map here to be consistent with global
* reserve count adjustments to be made by free_huge_page.
*/
static void restore_reserve_on_error(struct hstate *h,
struct vm_area_struct *vma, unsigned long address,
struct page *page)
{
if (unlikely(HPageRestoreReserve(page))) {
* This routine is called to restore reservation information on error paths.
* It should ONLY be called for pages allocated via alloc_huge_page(), and
* the hugetlb mutex should remain held when calling this routine.
*
* It handles two specific cases:
* 1) A reservation was in place and the page consumed the reservation.
* HPageRestoreReserve is set in the page.
* 2) No reservation was in place for the page, so HPageRestoreReserve is
* not set. However, alloc_huge_page always updates the reserve map.
*
* In case 1, free_huge_page later in the error path will increment the
* global reserve count. But, free_huge_page does not have enough context
* to adjust the reservation map. This case deals primarily with private
* mappings. Adjust the reserve map here to be consistent with global
* reserve count adjustments to be made by free_huge_page. Make sure the
* reserve map indicates there is a reservation present.
*
* In case 2, simply undo reserve map modifications done by alloc_huge_page.
*/
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
unsigned long address, struct page *page)
{
long rc = vma_needs_reservation(h, vma, address);
if (unlikely(rc < 0)) {
if (HPageRestoreReserve(page)) {
if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
* manipulation. Clear HPageRestoreReserve so that
......@@ -2253,15 +2283,56 @@ static void restore_reserve_on_error(struct hstate *h,
* accounting of reserve counts.
*/
ClearHPageRestoreReserve(page);
} else if (rc) {
rc = vma_add_reservation(h, vma, address);
if (unlikely(rc < 0))
else if (rc)
(void)vma_add_reservation(h, vma, address);
else
vma_end_reservation(h, vma, address);
} else {
if (!rc) {
/*
* See above comment about rare out of
* memory condition.
* This indicates there is an entry in the reserve map
* added by alloc_huge_page. We know it was added
* before the alloc_huge_page call, otherwise
* HPageRestoreReserve would be set on the page.
* Remove the entry so that a subsequent allocation
* does not consume a reservation.
*/
ClearHPageRestoreReserve(page);
rc = vma_del_reservation(h, vma, address);
if (rc < 0)
/*
* VERY rare out of memory condition. Since
* we can not delete the entry, set
* HPageRestoreReserve so that the reserve
* count will be incremented when the page
* is freed. This reserve will be consumed
* on a subsequent allocation.
*/
SetHPageRestoreReserve(page);
} else if (rc < 0) {
/*
* Rare out of memory condition from
* vma_needs_reservation call. Memory allocation is
* only attempted if a new entry is needed. Therefore,
* this implies there is not an entry in the
* reserve map.
*
* For shared mappings, no entry in the map indicates
* no reservation. We are done.
*/
if (!(vma->vm_flags & VM_MAYSHARE))
/*
* For private mappings, no entry indicates
* a reservation is present. Since we can
* not add an entry, set SetHPageRestoreReserve
* on the page so reserve count will be
* incremented when freed. This reserve will
* be consumed on a subsequent allocation.
*/
SetHPageRestoreReserve(page);
} else
/*
* No reservation present, do nothing
*/
vma_end_reservation(h, vma, address);
}
}
......@@ -4037,6 +4108,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
restore_reserve_on_error(h, vma, addr,
new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
......@@ -5006,6 +5079,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
......@@ -5857,6 +5931,21 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
return ret;
}
int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
{
int ret = 0;
*hugetlb = false;
spin_lock_irq(&hugetlb_lock);
if (PageHeadHuge(page)) {
*hugetlb = true;
if (HPageFreed(page) || HPageMigratable(page))
ret = get_page_unless_zero(page);
}
spin_unlock_irq(&hugetlb_lock);
return ret;
}
void putback_active_hugepage(struct page *page)
{
spin_lock_irq(&hugetlb_lock);
......
......@@ -384,27 +384,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
* At what user virtual address is page expected in @vma?
* At what user virtual address is page expected in vma?
* Returns -EFAULT if all of the page is outside the range of vma.
* If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
__vma_address(struct page *page, struct vm_area_struct *vma)
vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page_to_pgoff(page);
return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
pgoff_t pgoff;
unsigned long address;
VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
} else if (PageHead(page) &&
pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
address = -EFAULT;
}
return address;
}
/*
* Then at what user virtual address will none of the page be found in vma?
* Assumes that vma_address() already returned a good starting address.
* If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
vma_address_end(struct page *page, struct vm_area_struct *vma)
{
unsigned long start, end;
start = __vma_address(page, vma);
end = start + thp_size(page) - PAGE_SIZE;
/* page should be within @vma mapping range */
VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
return max(start, vma->vm_start);
pgoff_t pgoff;
unsigned long address;
VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
pgoff = page_to_pgoff(page) + compound_nr(page);
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address > vma->vm_end)
address = vma->vm_end;
return address;
}
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
......
......@@ -949,6 +949,17 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
/*
* Return true if a page type of a given page is supported by hwpoison
* mechanism (while handling could fail), otherwise false. This function
* does not return true for hugetlb or device memory pages, so it's assumed
* to be called only in the context where we never have such pages.
*/
static inline bool HWPoisonHandlable(struct page *page)
{
return PageLRU(page) || __PageMovable(page);
}
/**
* __get_hwpoison_page() - Get refcount for memory error handling:
* @page: raw error page (hit by memory error)
......@@ -959,8 +970,22 @@ static int page_action(struct page_state *ps, struct page *p,
static int __get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
int ret = 0;
bool hugetlb = false;
ret = get_hwpoison_huge_page(head, &hugetlb);
if (hugetlb)
return ret;
if (!PageHuge(head) && PageTransHuge(head)) {
/*
* This check prevents from calling get_hwpoison_unless_zero()
* for any unsupported type of page in order to reduce the risk of
* unexpected races caused by taking a page refcount.
*/
if (!HWPoisonHandlable(head))
return 0;
if (PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
......@@ -1017,7 +1042,7 @@ static int get_any_page(struct page *p, unsigned long flags)
ret = -EIO;
}
} else {
if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
if (PageHuge(p) || HWPoisonHandlable(p)) {
ret = 1;
} else {
/*
......@@ -1527,7 +1552,12 @@ int memory_failure(unsigned long pfn, int flags)
return 0;
}
if (!PageTransTail(p) && !PageLRU(p))
/*
* __munlock_pagevec may clear a writeback page's LRU flag without
* page_lock. We need wait writeback completion for this page or it
* may trigger vfs BUG while evict inode.
*/
if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
goto identify_page_state;
/*
......
......@@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
} else if (details && details->single_page &&
PageTransCompound(details->single_page) &&
next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
/*
* Take and drop THP pmd lock so that we cannot return
* prematurely, while zap_huge_pmd() has cleared *pmd,
* but not yet decremented compound_mapcount().
*/
spin_unlock(ptl);
}
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
......@@ -3236,6 +3247,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
}
/**
* unmap_mapping_page() - Unmap single page from processes.
* @page: The locked page to be unmapped.
*
* Unmap this page from any userspace process which still has it mmaped.
* Typically, for efficiency, the range of nearby pages has already been
* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
* truncation or invalidation holds the lock on a page, it may find that
* the page has been remapped again: and then uses unmap_mapping_page()
* to unmap it finally.
*/
void unmap_mapping_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct zap_details details = { };
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageTail(page));
details.check_mapping = mapping;
details.first_index = page->index;
details.last_index = page->index + thp_nr_pages(page) - 1;
details.single_page = page;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
}
/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
......
......@@ -295,6 +295,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
goto out;
page = migration_entry_to_page(entry);
page = compound_head(page);
/*
* Once page cache replacement of page migration started, page_count
......
......@@ -212,23 +212,34 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
pvmw->ptl = NULL;
}
} else if (!pmd_present(pmde)) {
/*
* If PVMW_SYNC, take and drop THP pmd lock so that we
* cannot return prematurely, while zap_huge_pmd() has
* cleared *pmd but not decremented compound_mapcount().
*/
if ((pvmw->flags & PVMW_SYNC) &&
PageTransCompound(pvmw->page)) {
spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
spin_unlock(ptl);
}
return false;
}
if (!map_pte(pvmw))
goto next_pte;
while (1) {
unsigned long end;
if (check_pte(pvmw))
return true;
next_pte:
/* Seek to next pte only makes sense for THP */
if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
return not_found(pvmw);
end = vma_address_end(pvmw->page, pvmw->vma);
do {
pvmw->address += PAGE_SIZE;
if (pvmw->address >= pvmw->vma->vm_end ||
pvmw->address >=
__vma_address(pvmw->page, pvmw->vma) +
thp_size(pvmw->page))
if (pvmw->address >= end)
return not_found(pvmw);
/* Did we cross page table boundary? */
if (pvmw->address % PMD_SIZE == 0) {
......@@ -266,14 +277,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
.vma = vma,
.flags = PVMW_SYNC,
};
unsigned long start, end;
start = __vma_address(page, vma);
end = start + thp_size(page) - PAGE_SIZE;
if (unlikely(end < vma->vm_start || start >= vma->vm_end))
pvmw.address = vma_address(page, vma);
if (pvmw.address == -EFAULT)
return 0;
pvmw.address = max(start, vma->vm_start);
if (!page_vma_mapped_walk(&pvmw))
return 0;
page_vma_mapped_walk_done(&pvmw);
......
......@@ -135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(!pmd_present(*pmdp));
/* Below assumes pmd_present() is true */
VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
!pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
......
......@@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
......@@ -717,15 +716,13 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
} else if (page->mapping) {
if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
} else if (!vma->vm_file) {
return -EFAULT;
} else
} else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
return -EFAULT;
address = __vma_address(page, vma);
if (unlikely(address < vma->vm_start || address >= vma->vm_end))
return -EFAULT;
return address;
}
return vma_address(page, vma);
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
......@@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
min(vma->vm_end, address + page_size(page)));
vma_address_end(page, vma));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
......@@ -1405,6 +1402,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and page_remove_rmap(),
* try_to_unmap() may return false when it is about to become true,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
......@@ -1426,9 +1432,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
range.end = PageKsm(page) ?
address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address,
min(vma->vm_end, address + page_size(page)));
address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
......@@ -1777,7 +1784,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
else
rmap_walk(page, &rwc);
return !page_mapcount(page) ? true : false;
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and page_remove_rmap(),
* try_to_unmap() may return false when it is about to become true,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
return !page_mapcount(page);
}
/**
......@@ -1874,6 +1887,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
......@@ -1928,6 +1942,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
......
......@@ -97,8 +97,7 @@ EXPORT_SYMBOL(kmem_cache_size);
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
if (!name || in_interrupt() || size < sizeof(void *) ||
size > KMALLOC_MAX_SIZE) {
if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
return -EINVAL;
}
......
......@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
......@@ -909,11 +910,11 @@ static int check_object(struct kmem_cache *s, struct page *page,
u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, page, object, "Redzone",
if (!check_bytes_and_report(s, page, object, "Left Redzone",
object - s->red_left_pad, val, s->red_left_pad))
return 0;
if (!check_bytes_and_report(s, page, object, "Redzone",
if (!check_bytes_and_report(s, page, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
......@@ -928,7 +929,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->object_size - 1) ||
!check_bytes_and_report(s, page, p, "Poison",
!check_bytes_and_report(s, page, p, "End Poison",
p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
......@@ -3689,7 +3690,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
unsigned int freepointer_area;
unsigned int order;
/*
......@@ -3698,13 +3698,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* the possible location of the free pointer.
*/
size = ALIGN(size, sizeof(void *));
/*
* This is the area of the object where a freepointer can be
* safely written. If redzoning adds more to the inuse size, we
* can't use that portion for writing the freepointer, so
* s->offset must be limited within this for the general case.
*/
freepointer_area = size;
#ifdef CONFIG_SLUB_DEBUG
/*
......@@ -3730,19 +3723,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* With that we have determined the number of bytes in actual use
* by the object. This is the potential offset to the free pointer.
* by the object and redzoning.
*/
s->inuse = size;
if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
s->ctor)) {
if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
s->ctor) {
/*
* Relocate free pointer after the object if it is not
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
* This is the case if we do RCU, have a constructor or
* destructor or are poisoning the objects.
* destructor, are poisoning the objects, or are
* redzoning an object smaller than sizeof(void *).
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
......@@ -3751,13 +3746,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
s->offset = size;
size += sizeof(void *);
} else if (freepointer_area > sizeof(void *)) {
} else {
/*
* Store freelist pointer near middle of object to keep
* it away from the edges of the object to avoid small
* sized over/underflows from neighboring allocations.
*/
s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
}
#ifdef CONFIG_SLUB_DEBUG
......
......@@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
return sizeof(struct mem_section_usage) + usemap_size();
}
static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
{
#ifndef CONFIG_NEED_MULTIPLE_NODES
return __pa_symbol(pgdat);
#else
return __pa(pgdat);
#endif
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
......@@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
......@@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
}
usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
......
......@@ -1900,7 +1900,7 @@ unsigned int count_swap_pages(int type, int free)
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
return pte_same(pte_swp_clear_flags(pte), swp_pte);
}
/*
......
......@@ -167,13 +167,10 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
static void truncate_cleanup_page(struct page *page)
{
if (page_mapped(page)) {
unsigned int nr = thp_nr_pages(page);
unmap_mapping_pages(mapping, page->index, nr, false);
}
if (page_mapped(page))
unmap_mapping_page(page);
if (page_has_private(page))
do_invalidatepage(page, 0, thp_size(page));
......@@ -218,7 +215,7 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
if (page->mapping != mapping)
return -EIO;
truncate_cleanup_page(mapping, page);
truncate_cleanup_page(page);
delete_from_page_cache(page);
return 0;
}
......@@ -325,7 +322,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = indices[pagevec_count(&pvec) - 1] + 1;
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
for (i = 0; i < pagevec_count(&pvec); i++)
truncate_cleanup_page(mapping, pvec.pages[i]);
truncate_cleanup_page(pvec.pages[i]);
delete_from_page_cache_batch(mapping, &pvec);
for (i = 0; i < pagevec_count(&pvec); i++)
unlock_page(pvec.pages[i]);
......@@ -639,30 +636,28 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
continue;
}
lock_page(page);
WARN_ON(page_to_index(page) != index);
if (page->mapping != mapping) {
unlock_page(page);
continue;
}
wait_on_page_writeback(page);
if (page_mapped(page)) {
if (!did_range_unmap) {
if (!did_range_unmap && page_mapped(page)) {
/*
* Zap the rest of the file in one hit.
* If page is mapped, before taking its lock,
* zap the rest of the file in one hit.
*/
unmap_mapping_pages(mapping, index,
(1 + end - index), false);
did_range_unmap = 1;
} else {
/*
* Just zap this page
*/
unmap_mapping_pages(mapping, index,
1, false);
}
lock_page(page);
WARN_ON(page_to_index(page) != index);
if (page->mapping != mapping) {
unlock_page(page);
continue;
}
wait_on_page_writeback(page);
if (page_mapped(page))
unmap_mapping_page(page);
BUG_ON(page_mapped(page));
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment