Commit bdaa78c6 authored by Linus Torvalds

Merge tag 'mm-hotfixes-stable-2022-12-02' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc hotfixes from Andrew Morton:
 "15 hotfixes,  11 marked cc:stable.

  Only three or four of the latter address post-6.0 issues, which is
  hopefully a sign that things are converging"

* tag 'mm-hotfixes-stable-2022-12-02' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  revert "kbuild: fix -Wimplicit-function-declaration in license_is_gpl_compatible"
  Kconfig.debug: provide a little extra FRAME_WARN leeway when KASAN is enabled
  drm/amdgpu: temporarily disable broken Clang builds due to blown stack-frame
  mm/khugepaged: invoke MMU notifiers in shmem/file collapse paths
  mm/khugepaged: fix GUP-fast interaction by sending IPI
  mm/khugepaged: take the right locks for page table retraction
  mm: migrate: fix THP's mapcount on isolation
  mm: introduce arch_has_hw_nonleaf_pmd_young()
  mm: add dummy pmd_young() for architectures not having it
  mm/damon/sysfs: fix wrong empty schemes assumption under online tuning in damon_sysfs_set_schemes()
  tools/vm/slabinfo-gnuplot: use "grep -E" instead of "egrep"
  nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry()
  hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing
  madvise: use zap_page_range_single for madvise dontneed
  mm: replace VM_WARN_ON to pr_warn if the node is offline with __GFP_THISNODE
parents 6647e76a 1d351f18
@@ -490,6 +490,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
 	return pmd;
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_ACCESSED);
...
@@ -622,6 +622,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
 	return pmd;
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_ACCESSED);
...
@@ -600,6 +600,7 @@ static inline int pmd_dirty(pmd_t pmd)
 	return pte_dirty(pmd_pte(pmd));
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
 	return pte_young(pmd_pte(pmd));
...
@@ -763,6 +763,7 @@ static inline int pmd_dirty(pmd_t pmd)
 	return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
 	return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
...
@@ -693,6 +693,7 @@ static inline unsigned long pmd_dirty(pmd_t pmd)
 	return pte_dirty(pte);
 }
 
+#define pmd_young pmd_young
 static inline unsigned long pmd_young(pmd_t pmd)
 {
 	pte_t pte = __pte(pmd_val(pmd));
...
@@ -139,6 +139,7 @@ static inline int pmd_dirty(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_DIRTY;
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
 	return pmd_flags(pmd) & _PAGE_ACCESSED;
@@ -1438,6 +1439,14 @@ static inline bool arch_has_hw_pte_young(void)
 	return true;
 }
 
+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+	return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
 #ifdef CONFIG_PAGE_TABLE_CHECK
 static inline bool pte_user_accessible_page(pte_t pte)
 {
...
@@ -5,6 +5,7 @@ menu "Display Engine Configuration"
 config DRM_AMD_DC
 	bool "AMD DC - Enable new display engine"
 	default y
+	depends on BROKEN || !CC_IS_CLANG || X86_64 || SPARC64 || ARM64
 	select SND_HDA_COMPONENT if SND_HDA_CORE
 	select DRM_AMD_DC_DCN if (X86 || PPC_LONG_DOUBLE_128)
 	help
@@ -12,6 +13,12 @@ config DRM_AMD_DC
 	  support for AMDGPU. This adds required support for Vega and
 	  Raven ASICs.
 
+	  calculate_bandwidth() is presently broken on all !(X86_64 || SPARC64 || ARM64)
+	  architectures built with Clang (all released versions), whereby the stack
+	  frame gets blown up to well over 5k. This would cause an immediate kernel
+	  panic on most architectures. We'll revert this when the following bug report
+	  has been resolved: https://github.com/llvm/llvm-project/issues/41896.
+
 config DRM_AMD_DC_DCN
 	def_bool n
 	help
...
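For readers unfamiliar with the failure mode the help text describes: a function that keeps large aggregates live in a single stack frame can exceed the kernel's per-function stack budget, which is what the FRAME_WARN limit (see the Kconfig.debug hunk further down) reports at build time. A rough, hedged illustration in plain C, not taken from the amdgpu code; the struct and function names are invented:

/*
 * Illustration only: built with "cc -c -Wframe-larger-than=1024 frame.c",
 * GCC and Clang warn that this frame exceeds the limit -- the same class of
 * diagnostic that CONFIG_FRAME_WARN enables for kernel builds.
 */
struct bw_fixed_example { long long value; };

int calculate_bandwidth_example(void)
{
	/* 4 * 64 * sizeof(long long) = 2048 bytes of locals in one frame */
	struct bw_fixed_example results[4][64];
	int i, j, sum = 0;

	for (i = 0; i < 4; i++)
		for (j = 0; j < 64; j++)
			results[i][j].value = i + j;
	for (i = 0; i < 4; i++)
		for (j = 0; j < 64; j++)
			sum += (int)results[i][j].value;
	return sum;
}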
@@ -111,6 +111,13 @@ static void nilfs_dat_commit_free(struct inode *dat,
 	kunmap_atomic(kaddr);
 
 	nilfs_dat_commit_entry(dat, req);
+
+	if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) {
+		nilfs_error(dat->i_sb,
+			    "state inconsistency probably due to duplicate use of vblocknr = %llu",
+			    (unsigned long long)req->pr_entry_nr);
+		return;
+	}
 	nilfs_palloc_commit_free_entry(dat, req);
 }
...
@@ -222,12 +222,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
+void tlb_remove_table_sync_one(void);
+
 #else
 
 #ifdef tlb_needs_table_invalidate
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
...
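The newly exported tlb_remove_table_sync_one() is used by the khugepaged fixes further down: after a page table has been detached with pmdp_collapse_flush(), an IPI to all CPUs waits out lockless GUP-fast walkers (which run with interrupts disabled) before the table is freed or reused. A simplified, hedged sketch of that calling pattern; it omits the locking and MMU-notifier calls the real code needs, and the function name is hypothetical:

static void collapse_pte_table_sketch(struct mm_struct *mm,
				      struct vm_area_struct *vma,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t pmd;

	/* Detach the PTE table from the page table hierarchy. */
	pmd = pmdp_collapse_flush(vma, addr, pmdp);

	/*
	 * GUP-fast walks page tables with IRQs disabled; the IPI broadcast
	 * inside this helper guarantees no such walker still sees the
	 * detached table before we free it.
	 */
	tlb_remove_table_sync_one();

	mm_dec_nr_ptes(mm);
	pte_free(mm, pmd_pgtable(pmd));
}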
@@ -210,6 +210,20 @@ alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct p
 	return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
 }
 
+static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
+{
+	gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN);
+
+	if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN))
+		return;
+
+	if (node_online(this_node))
+		return;
+
+	pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node);
+	dump_stack();
+}
+
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
  * online. For more general interface, see alloc_pages_node().
@@ -218,7 +232,7 @@ static inline struct page *
 __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 {
 	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
-	VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));
+	warn_if_node_offline(nid, gfp_mask);
 
 	return __alloc_pages(gfp_mask, order, nid, NULL);
 }
@@ -227,7 +241,7 @@ static inline
 struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid)
 {
 	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
-	VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid));
+	warn_if_node_offline(nid, gfp);
 
 	return __folio_alloc(gfp, order, nid, NULL);
 }
...
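A hedged usage note, not part of the patch: with this change, an allocation that both insists on a node (__GFP_THISNODE) and suppresses the normal allocation-failure warning (__GFP_NOWARN) still produces a pr_warn() plus stack dump when that node is offline, instead of relying on the CONFIG_DEBUG_VM-only VM_WARN_ON(). The wrapper name below is invented for illustration:

/* Illustration only: request order-0 memory strictly from "nid". */
static struct page *alloc_strictly_on_node_example(int nid)
{
	/*
	 * If nid is offline, warn_if_node_offline() prints
	 * "... allocation from offline node N" plus a stack trace before
	 * the call goes ahead.
	 */
	return __alloc_pages_node(nid, GFP_KERNEL | __GFP_THISNODE | __GFP_NOWARN, 0);
}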
@@ -2,8 +2,6 @@
 #ifndef __LICENSE_H
 #define __LICENSE_H
 
-#include <linux/string.h>
-
 static inline int license_is_gpl_compatible(const char *license)
 {
 	return (strcmp(license, "GPL") == 0
...
@@ -1852,6 +1852,25 @@ static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodem
 	__show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
 }
 
+/*
+ * Parameter block passed down to zap_pte_range in exceptional cases.
+ */
+struct zap_details {
+	struct folio *single_folio;	/* Locked folio to be unmapped */
+	bool even_cows;			/* Zap COWed private pages too? */
+	zap_flags_t zap_flags;		/* Extra flags for zapping */
+};
+
+/*
+ * Whether to drop the pte markers, for example, the uffd-wp information for
+ * file-backed memory. This should only be specified when we will completely
+ * drop the page in the mm, either by truncation or unmapping of the vma. By
+ * default, the flag is not set.
+ */
+#define ZAP_FLAG_DROP_MARKER	((__force zap_flags_t) BIT(0))
+/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
+#define ZAP_FLAG_UNMAP		((__force zap_flags_t) BIT(1))
+
 #ifdef CONFIG_MMU
 extern bool can_do_mlock(void);
 #else
@@ -1869,6 +1888,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		  unsigned long size);
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		    unsigned long size);
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+			   unsigned long size, struct zap_details *details);
 void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
 		struct vm_area_struct *start_vma, unsigned long start,
 		unsigned long end);
@@ -3467,12 +3488,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 }
 #endif
 
-/*
- * Whether to drop the pte markers, for example, the uffd-wp information for
- * file-backed memory. This should only be specified when we will completely
- * drop the page in the mm, either by truncation or unmapping of the vma. By
- * default, the flag is not set.
- */
-#define ZAP_FLAG_DROP_MARKER	((__force zap_flags_t) BIT(0))
-
 #endif /* _LINUX_MM_H */
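A hedged usage sketch, not from the patch: with the prototype exported here, a caller that wants to drop a sub-range of one VMA (as MADV_DONTNEED now does, see the mm/madvise hunk below) can call zap_page_range_single() directly; a NULL details pointer requests the default zapping behaviour. The wrapper name is invented:

static void discard_vma_range_example(struct vm_area_struct *vma,
				      unsigned long start, unsigned long end)
{
	/* start/end must lie within this single VMA. */
	zap_page_range_single(vma, start, end - start, NULL);
}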
@@ -165,6 +165,13 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr)
 	return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
 }
 
+#ifndef pmd_young
+static inline int pmd_young(pmd_t pmd)
+{
+	return 0;
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 extern int ptep_set_access_flags(struct vm_area_struct *vma,
 				 unsigned long address, pte_t *ptep,
@@ -260,6 +267,17 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
+#ifndef arch_has_hw_nonleaf_pmd_young
+/*
+ * Return whether the accessed bit in non-leaf PMD entries is supported on the
+ * local CPU.
+ */
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+	return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
+}
+#endif
+
 #ifndef arch_has_hw_pte_young
 /*
  * Return whether the accessed bit is supported on the local CPU.
...
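Together, the dummy pmd_young() and the runtime arch_has_hw_nonleaf_pmd_young() check let common code such as the multi-gen LRU walker (see the mm/vmscan hunk below) drop its compile-time #ifdefs. A hedged sketch of a caller, with a hypothetical helper name:

/*
 * Illustration only: a non-leaf PMD may be skipped as "cold" only when the
 * local CPU actually maintains the accessed bit in non-leaf entries;
 * otherwise a clear bit carries no information.
 */
static bool pmd_looks_cold_example(pmd_t pmd)
{
	return arch_has_hw_nonleaf_pmd_young() && !pmd_young(pmd);
}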
@@ -399,6 +399,7 @@ config FRAME_WARN
 	default 2048 if GCC_PLUGIN_LATENT_ENTROPY
 	default 2048 if PARISC
 	default 1536 if (!64BIT && XTENSA)
+	default 1280 if KASAN && !64BIT
 	default 1024 if !64BIT
 	default 2048 if 64BIT
 	help
...
@@ -984,29 +984,29 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			goto isolate_fail;
 		}
 
+		/*
+		 * Be careful not to clear PageLRU until after we're
+		 * sure the page is not being freed elsewhere -- the
+		 * page release code relies on it.
+		 */
+		if (unlikely(!get_page_unless_zero(page)))
+			goto isolate_fail;
+
 		/*
 		 * Migration will fail if an anonymous page is pinned in memory,
 		 * so avoid taking lru_lock and isolating it unnecessarily in an
 		 * admittedly racy check.
 		 */
 		mapping = page_mapping(page);
-		if (!mapping && page_count(page) > page_mapcount(page))
-			goto isolate_fail;
+		if (!mapping && (page_count(page) - 1) > total_mapcount(page))
+			goto isolate_fail_put;
 
 		/*
 		 * Only allow to migrate anonymous pages in GFP_NOFS context
 		 * because those do not depend on fs locks.
 		 */
 		if (!(cc->gfp_mask & __GFP_FS) && mapping)
-			goto isolate_fail;
-
-		/*
-		 * Be careful not to clear PageLRU until after we're
-		 * sure the page is not being freed elsewhere -- the
-		 * page release code relies on it.
-		 */
-		if (unlikely(!get_page_unless_zero(page)))
-			goto isolate_fail;
+			goto isolate_fail_put;
 
 		/* Only take pages on LRU: a check now makes later tests safe */
 		if (!PageLRU(page))
...
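The reworked check reasons about pins rather than raw counts: once isolation has taken its own reference via get_page_unless_zero(), an anonymous page or THP with no extra pins satisfies page_count() - 1 == total_mapcount(), where total_mapcount() sums the mapcounts of every subpage of a compound page. Anything above that indicates a pin (e.g. GUP) and migration would fail anyway. A hedged restatement of that check as a standalone helper, name invented for illustration:

/* Illustration only: call with one local reference already held. */
static bool example_anon_page_is_pinned(struct page *page)
{
	return (page_count(page) - 1) > total_mapcount(page);
}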
@@ -2283,12 +2283,54 @@ static struct damos *damon_sysfs_mk_scheme(
 			&wmarks);
 }
 
+static void damon_sysfs_update_scheme(struct damos *scheme,
+		struct damon_sysfs_scheme *sysfs_scheme)
+{
+	struct damon_sysfs_access_pattern *access_pattern =
+		sysfs_scheme->access_pattern;
+	struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+	struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+	struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+	scheme->pattern.min_sz_region = access_pattern->sz->min;
+	scheme->pattern.max_sz_region = access_pattern->sz->max;
+	scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
+	scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
+	scheme->pattern.min_age_region = access_pattern->age->min;
+	scheme->pattern.max_age_region = access_pattern->age->max;
+
+	scheme->action = sysfs_scheme->action;
+
+	scheme->quota.ms = sysfs_quotas->ms;
+	scheme->quota.sz = sysfs_quotas->sz;
+	scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
+	scheme->quota.weight_sz = sysfs_weights->sz;
+	scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
+	scheme->quota.weight_age = sysfs_weights->age;
+
+	scheme->wmarks.metric = sysfs_wmarks->metric;
+	scheme->wmarks.interval = sysfs_wmarks->interval_us;
+	scheme->wmarks.high = sysfs_wmarks->high;
+	scheme->wmarks.mid = sysfs_wmarks->mid;
+	scheme->wmarks.low = sysfs_wmarks->low;
+}
+
 static int damon_sysfs_set_schemes(struct damon_ctx *ctx,
 		struct damon_sysfs_schemes *sysfs_schemes)
 {
-	int i;
+	struct damos *scheme, *next;
+	int i = 0;
+
+	damon_for_each_scheme_safe(scheme, next, ctx) {
+		if (i < sysfs_schemes->nr)
+			damon_sysfs_update_scheme(scheme,
+					sysfs_schemes->schemes_arr[i]);
+		else
+			damon_destroy_scheme(scheme);
+		i++;
+	}
 
-	for (i = 0; i < sysfs_schemes->nr; i++) {
+	for (; i < sysfs_schemes->nr; i++) {
 		struct damos *scheme, *next;
 
 		scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
...
@@ -5206,17 +5206,22 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 
 	__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
-	/*
-	 * Unlock and free the vma lock before releasing i_mmap_rwsem.  When
-	 * the vma_lock is freed, this makes the vma ineligible for pmd
-	 * sharing.  And, i_mmap_rwsem is required to set up pmd sharing.
-	 * This is important as page tables for this unmapped range will
-	 * be asynchrously deleted.  If the page tables are shared, there
-	 * will be issues when accessed by someone else.
-	 */
-	__hugetlb_vma_unlock_write_free(vma);
-
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	if (zap_flags & ZAP_FLAG_UNMAP) {	/* final unmap */
+		/*
+		 * Unlock and free the vma lock before releasing i_mmap_rwsem.
+		 * When the vma_lock is freed, this makes the vma ineligible
+		 * for pmd sharing.  And, i_mmap_rwsem is required to set up
+		 * pmd sharing.  This is important as page tables for this
+		 * unmapped range will be asynchrously deleted.  If the page
+		 * tables are shared, there will be issues when accessed by
+		 * someone else.
+		 */
+		__hugetlb_vma_unlock_write_free(vma);
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
...
@@ -1051,6 +1051,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
 
 	spin_lock(pte_ptl);
 	result = __collapse_huge_page_isolate(vma, address, pte, cc,
@@ -1379,16 +1380,43 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return SCAN_SUCCEED;
 }
 
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ *  - modifying terminal entries (ones that point to a data page, not to another
+ *    page table)
+ *  - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
 static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 				  unsigned long addr, pmd_t *pmdp)
 {
-	spinlock_t *ptl;
 	pmd_t pmd;
+	struct mmu_notifier_range range;
 
 	mmap_assert_write_locked(mm);
-	ptl = pmd_lock(vma->vm_mm, pmdp);
+	if (vma->vm_file)
+		lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+	/*
+	 * All anon_vmas attached to the VMA have the same root and are
+	 * therefore locked by the same lock.
+	 */
+	if (vma->anon_vma)
+		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
+				addr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 	pmd = pmdp_collapse_flush(vma, addr, pmdp);
-	spin_unlock(ptl);
+	tlb_remove_table_sync_one();
+	mmu_notifier_invalidate_range_end(&range);
 	mm_dec_nr_ptes(mm);
 	page_table_check_pte_clear_range(mm, addr, pmd);
 	pte_free(mm, pmd_pgtable(pmd));
@@ -1439,6 +1467,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
 		return SCAN_VMA_CHECK;
 
+	/*
+	 * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+	 * that got written to.  Without this, we'd have to also lock the
+	 * anon_vma if one exists.
+	 */
+	if (vma->anon_vma)
+		return SCAN_VMA_CHECK;
+
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
 	if (userfaultfd_wp(vma))
 		return SCAN_PTE_UFFD_WP;
@@ -1472,6 +1508,20 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}
 
+	/*
+	 * We need to lock the mapping so that from here on, only GUP-fast and
+	 * hardware page walks can access the parts of the page tables that
+	 * we're operating on.
+	 * See collapse_and_free_pmd().
+	 */
+	i_mmap_lock_write(vma->vm_file->f_mapping);
+
+	/*
+	 * This spinlock should be unnecessary: Nobody else should be accessing
+	 * the page tables under spinlock protection here, only
+	 * lockless_pages_from_mm() and the hardware page walker can access page
+	 * tables while all the high-level locks are held in write mode.
+	 */
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 	result = SCAN_FAIL;
 
@@ -1526,6 +1576,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	/* step 4: remove pte entries */
 	collapse_and_free_pmd(mm, vma, haddr, pmd);
 
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
+
 maybe_install_pmd:
 	/* step 5: install pmd entry */
 	result = install_pmd
@@ -1539,6 +1591,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 
 abort:
 	pte_unmap_unlock(start_pte, ptl);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	goto drop_hpage;
 }
 
@@ -1595,7 +1648,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
 		 * An alternative would be drop the check, but check that page
 		 * table is clear before calling pmdp_collapse_flush() under
 		 * ptl. It has higher chance to recover THP for the VMA, but
-		 * has higher cost too.
+		 * has higher cost too. It would also probably require locking
+		 * the anon_vma.
 		 */
 		if (vma->anon_vma) {
 			result = SCAN_PAGE_ANON;
...
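A hedged sketch, not part of the patch, of the lock nesting that the locking note above describes for retracting a page table in a file-backed VMA; error handling and the uffd-wp/anon_vma bail-outs are omitted and the wrapper name is invented:

static void retract_page_table_sketch(struct mm_struct *mm,
				      struct vm_area_struct *vma,
				      unsigned long haddr, pmd_t *pmd)
{
	mmap_assert_write_locked(mm);

	/* Block rmap walks on the file mapping while the table goes away. */
	i_mmap_lock_write(vma->vm_file->f_mapping);
	collapse_and_free_pmd(mm, vma, haddr, pmd);	/* MMU notifiers + IPI inside */
	i_mmap_unlock_write(vma->vm_file->f_mapping);
}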
@@ -772,8 +772,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
  * Application no longer needs these pages.  If the pages are dirty,
  * it's OK to just throw them away.  The app will be more careful about
  * data it wants to keep.  Be sure to free swap resources too.  The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
  * although we could add these pages to a global reuse list for
  * shrink_active_list to pick up before reclaiming other pages.
  *
@@ -790,7 +790,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
 					unsigned long start, unsigned long end)
 {
-	zap_page_range(vma, start, end - start);
+	zap_page_range_single(vma, start, end - start, NULL);
 	return 0;
 }
...
@@ -1341,15 +1341,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 	return ret;
 }
 
-/*
- * Parameter block passed down to zap_pte_range in exceptional cases.
- */
-struct zap_details {
-	struct folio *single_folio;	/* Locked folio to be unmapped */
-	bool even_cows;			/* Zap COWed private pages too? */
-	zap_flags_t zap_flags;		/* Extra flags for zapping */
-};
-
 /* Whether we should zap all COWed (private) pages too */
 static inline bool should_zap_cows(struct zap_details *details)
 {
@@ -1720,7 +1711,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
 {
 	struct mmu_notifier_range range;
 	struct zap_details details = {
-		.zap_flags = ZAP_FLAG_DROP_MARKER,
+		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
 		/* Careful - we need to zap private pages too! */
 		.even_cows = true,
 	};
@@ -1774,19 +1765,27 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 *
 * The range must fit into one VMA.
 */
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *details)
 {
+	const unsigned long end = address + size;
 	struct mmu_notifier_range range;
 	struct mmu_gather tlb;
 
 	lru_add_drain();
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-				address, address + size);
+				address, end);
+	if (is_vm_hugetlb_page(vma))
+		adjust_range_if_pmd_sharing_possible(vma, &range.start,
+						     &range.end);
 	tlb_gather_mmu(&tlb, vma->vm_mm);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
-	unmap_single_vma(&tlb, vma, address, range.end, details);
+	/*
+	 * unmap 'address-end' not 'range.start-range.end' as range
+	 * could have been expanded for hugetlb pmd sharing.
+	 */
+	unmap_single_vma(&tlb, vma, address, end, details);
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_finish_mmu(&tlb);
 }
...
@@ -153,7 +153,7 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -177,8 +177,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
 
 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
-static void tlb_remove_table_sync_one(void) { }
-
 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 {
 	__tlb_remove_table_free(batch);
...
@@ -3987,7 +3987,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
 			goto next;
 
 		if (!pmd_trans_huge(pmd[i])) {
-			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+			if (arch_has_hw_nonleaf_pmd_young() &&
 			    get_cap(LRU_GEN_NONLEAF_YOUNG))
 				pmdp_test_and_clear_young(vma, addr, pmd + i);
 			goto next;
@@ -4085,14 +4085,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
 #endif
 		walk->mm_stats[MM_NONLEAF_TOTAL]++;
 
-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-		if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+		if (arch_has_hw_nonleaf_pmd_young() &&
+		    get_cap(LRU_GEN_NONLEAF_YOUNG)) {
 			if (!pmd_young(val))
 				continue;
 
 			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
 		}
-#endif
 
 		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
 			continue;
@@ -5392,7 +5392,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
 	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
 		caps |= BIT(LRU_GEN_MM_WALK);
 
-	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+	if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
 		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
 	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
...
@@ -150,7 +150,7 @@ do_preprocess()
 	let lines=3
 	out=`basename "$in"`"-slabs-by-loss"
 	`cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\
-		egrep -iv '\-\-|Name|Slabs'\
+		grep -E -iv '\-\-|Name|Slabs'\
 		| awk '{print $1" "$4+$2*$3" "$4}' > "$out"`
 	if [ $? -eq 0 ]; then
 		do_slabs_plotting "$out"
@@ -159,7 +159,7 @@ do_preprocess()
 	let lines=3
 	out=`basename "$in"`"-slabs-by-size"
 	`cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\
-		egrep -iv '\-\-|Name|Slabs'\
+		grep -E -iv '\-\-|Name|Slabs'\
 		| awk '{print $1" "$4" "$4-$2*$3}' > "$out"`
 	if [ $? -eq 0 ]; then
 		do_slabs_plotting "$out"
...