Commit cb085634 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'mm-hotfixes-stable-2023-04-19-16-36' of...

Merge tag 'mm-hotfixes-stable-2023-04-19-16-36' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "22 hotfixes.

  19 are cc:stable and the remainder address issues which were
  introduced during this merge cycle, or aren't considered suitable for
  -stable backporting.

  19 are for MM and the remainder are for other subsystems"

* tag 'mm-hotfixes-stable-2023-04-19-16-36' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (22 commits)
  nilfs2: initialize unused bytes in segment summary blocks
  mm: page_alloc: skip regions with hugetlbfs pages when allocating 1G pages
  mm/mmap: regression fix for unmapped_area{_topdown}
  maple_tree: fix mas_empty_area() search
  maple_tree: make maple state reusable after mas_empty_area_rev()
  mm: kmsan: handle alloc failures in kmsan_ioremap_page_range()
  mm: kmsan: handle alloc failures in kmsan_vmap_pages_range_noflush()
  tools/Makefile: do missed s/vm/mm/
  mm: fix memory leak on mm_init error handling
  mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock
  kernel/sys.c: fix and improve control flow in __sys_setres[ug]id()
  Revert "userfaultfd: don't fail on unrecognized features"
  writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs
  maple_tree: fix a potential memory leak, OOB access, or other unpredictable bug
  tools/mm/page_owner_sort.c: fix TGID output when cull=tg is used
  mailmap: update jtoppins' entry to reference correct email
  mm/mempolicy: fix use-after-free of VMA iterator
  mm/huge_memory.c: warn with pr_warn_ratelimited instead of VM_WARN_ON_ONCE_FOLIO
  mm/mprotect: fix do_mprotect_pkey() return on error
  mm/khugepaged: check again on anon uffd-wp during isolation
  ...
parents 23990b1a ef832747
......@@ -232,6 +232,8 @@ Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com>
John Crispin <john@phrozen.org> <blogic@openwrt.org>
John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
John Stultz <johnstul@us.ibm.com>
<jon.toppins+linux@gmail.com> <jtoppins@cumulusnetworks.com>
<jon.toppins+linux@gmail.com> <jtoppins@redhat.com>
Jordan Crouse <jordan@cosmicpenguin.net> <jcrouse@codeaurora.org>
<josh@joshtriplett.org> <josh@freedesktop.org>
<josh@joshtriplett.org> <josh@kernel.org>
......
......@@ -978,6 +978,16 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
continue;
}
/*
* If wb_tryget fails, the wb has been shutdown, skip it.
*
* Pin @wb so that it stays on @bdi->wb_list. This allows
* continuing iteration from @wb after dropping and
* regrabbing rcu read lock.
*/
if (!wb_tryget(wb))
continue;
/* alloc failed, execute synchronously using on-stack fallback */
work = &fallback_work;
*work = *base_work;
......@@ -986,13 +996,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
work->done = &fallback_work_done;
wb_queue_work(wb, work);
/*
* Pin @wb so that it stays on @bdi->wb_list. This allows
* continuing iteration from @wb after dropping and
* regrabbing rcu read lock.
*/
wb_get(wb);
last_wb = wb;
rcu_read_unlock();
......
......@@ -430,6 +430,23 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
return 0;
}
/**
* nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area
* @sci: segment constructor object
*
* nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of
* the current segment summary block.
*/
static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci)
{
struct nilfs_segsum_pointer *ssp;
ssp = sci->sc_blk_cnt > 0 ? &sci->sc_binfo_ptr : &sci->sc_finfo_ptr;
if (ssp->offset < ssp->bh->b_size)
memset(ssp->bh->b_data + ssp->offset, 0,
ssp->bh->b_size - ssp->offset);
}
static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
{
sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
......@@ -438,6 +455,7 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
* The current segment is filled up
* (internal code)
*/
nilfs_segctor_zeropad_segsum(sci);
sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
return nilfs_segctor_reset_segment_buffer(sci);
}
......@@ -542,6 +560,7 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
goto retry;
}
if (unlikely(required)) {
nilfs_segctor_zeropad_segsum(sci);
err = nilfs_segbuf_extend_segsum(segbuf);
if (unlikely(err))
goto failed;
......@@ -1533,6 +1552,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
sci->sc_stage = prev_stage;
}
nilfs_segctor_zeropad_segsum(sci);
nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
return 0;
......
......@@ -1955,8 +1955,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
/* Ignore unsupported features (userspace built against newer kernel) */
features = uffdio_api.features & UFFD_API_FEATURES;
features = uffdio_api.features;
ret = -EINVAL;
if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
goto err_out;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
......
......@@ -134,11 +134,12 @@ void kmsan_kfree_large(const void *ptr);
* @page_shift: page_shift passed to vmap_range_noflush().
*
* KMSAN maps shadow and origin pages of @pages into contiguous ranges in
* vmalloc metadata address range.
* vmalloc metadata address range. Returns 0 on success, callers must check
* for non-zero return value.
*/
void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages,
unsigned int page_shift);
int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages,
unsigned int page_shift);
/**
* kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
......@@ -159,11 +160,12 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end);
* @page_shift: page_shift argument passed to vmap_range_noflush().
*
* KMSAN creates new metadata pages for the physical pages mapped into the
* virtual memory.
* virtual memory. Returns 0 on success, callers must check for non-zero return
* value.
*/
void kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int page_shift);
int kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int page_shift);
/**
* kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call.
......@@ -281,12 +283,13 @@ static inline void kmsan_kfree_large(const void *ptr)
{
}
static inline void kmsan_vmap_pages_range_noflush(unsigned long start,
unsigned long end,
pgprot_t prot,
struct page **pages,
unsigned int page_shift)
static inline int kmsan_vmap_pages_range_noflush(unsigned long start,
unsigned long end,
pgprot_t prot,
struct page **pages,
unsigned int page_shift)
{
return 0;
}
static inline void kmsan_vunmap_range_noflush(unsigned long start,
......@@ -294,12 +297,12 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start,
{
}
static inline void kmsan_ioremap_page_range(unsigned long start,
unsigned long end,
phys_addr_t phys_addr,
pgprot_t prot,
unsigned int page_shift)
static inline int kmsan_ioremap_page_range(unsigned long start,
unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int page_shift)
{
return 0;
}
static inline void kmsan_iounmap_page_range(unsigned long start,
......
......@@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
......
......@@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
struct cred *new;
int retval;
kuid_t kruid, keuid, ksuid;
bool ruid_new, euid_new, suid_new;
kruid = make_kuid(ns, ruid);
keuid = make_kuid(ns, euid);
......@@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if ((suid != (uid_t) -1) && !uid_valid(ksuid))
return -EINVAL;
old = current_cred();
/* check for no-op */
if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
(euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
uid_eq(keuid, old->fsuid))) &&
(suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
return 0;
ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
!uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
!uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
!uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
if ((ruid_new || euid_new || suid_new) &&
!ns_capable_setid(old->user_ns, CAP_SETUID))
return -EPERM;
new = prepare_creds();
if (!new)
return -ENOMEM;
old = current_cred();
retval = -EPERM;
if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
!uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
goto error;
if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
!uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
goto error;
if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
!uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
goto error;
}
if (ruid != (uid_t) -1) {
new->uid = kruid;
if (!uid_eq(kruid, old->uid)) {
......@@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
struct cred *new;
int retval;
kgid_t krgid, kegid, ksgid;
bool rgid_new, egid_new, sgid_new;
krgid = make_kgid(ns, rgid);
kegid = make_kgid(ns, egid);
......@@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
return -EINVAL;
old = current_cred();
/* check for no-op */
if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
(egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
gid_eq(kegid, old->fsgid))) &&
(sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
return 0;
rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
!gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
!gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
!gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
if ((rgid_new || egid_new || sgid_new) &&
!ns_capable_setid(old->user_ns, CAP_SETGID))
return -EPERM;
new = prepare_creds();
if (!new)
return -ENOMEM;
old = current_cred();
retval = -EPERM;
if (!ns_capable_setid(old->user_ns, CAP_SETGID)) {
if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
!gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
goto error;
if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
!gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
goto error;
if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
!gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
goto error;
}
if (rgid != (gid_t) -1)
new->gid = krgid;
......
......@@ -1303,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
node = mas->alloc;
node->request_count = 0;
while (requested) {
max_req = MAPLE_ALLOC_SLOTS;
if (node->node_count) {
unsigned int offset = node->node_count;
slots = (void **)&node->slot[offset];
max_req -= offset;
} else {
slots = (void **)&node->slot;
}
max_req = MAPLE_ALLOC_SLOTS - node->node_count;
slots = (void **)&node->slot[node->node_count];
max_req = min(requested, max_req);
count = mt_alloc_bulk(gfp, max_req, slots);
if (!count)
goto nomem_bulk;
if (node->node_count == 0) {
node->slot[0]->node_count = 0;
node->slot[0]->request_count = 0;
}
node->node_count += count;
allocated += count;
node = node->slot[0];
node->node_count = 0;
node->request_count = 0;
requested -= count;
}
mas->alloc->total = allocated;
......@@ -4970,7 +4965,8 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min)
* Return: True if found in a leaf, false otherwise.
*
*/
static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
static bool mas_rev_awalk(struct ma_state *mas, unsigned long size,
unsigned long *gap_min, unsigned long *gap_max)
{
enum maple_type type = mte_node_type(mas->node);
struct maple_node *node = mas_mn(mas);
......@@ -5035,8 +5031,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
if (unlikely(ma_is_leaf(type))) {
mas->offset = offset;
mas->min = min;
mas->max = min + gap - 1;
*gap_min = min;
*gap_max = min + gap - 1;
return true;
}
......@@ -5060,10 +5056,10 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
{
enum maple_type type = mte_node_type(mas->node);
unsigned long pivot, min, gap = 0;
unsigned char offset;
unsigned long *gaps;
unsigned long *pivots = ma_pivots(mas_mn(mas), type);
void __rcu **slots = ma_slots(mas_mn(mas), type);
unsigned char offset, data_end;
unsigned long *gaps, *pivots;
void __rcu **slots;
struct maple_node *node;
bool found = false;
if (ma_is_dense(type)) {
......@@ -5071,13 +5067,15 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
return true;
}
gaps = ma_gaps(mte_to_node(mas->node), type);
node = mas_mn(mas);
pivots = ma_pivots(node, type);
slots = ma_slots(node, type);
gaps = ma_gaps(node, type);
offset = mas->offset;
min = mas_safe_min(mas, pivots, offset);
for (; offset < mt_slots[type]; offset++) {
pivot = mas_safe_pivot(mas, pivots, offset, type);
if (offset && !pivot)
break;
data_end = ma_data_end(node, type, pivots, mas->max);
for (; offset <= data_end; offset++) {
pivot = mas_logical_pivot(mas, pivots, offset, type);
/* Not within lower bounds */
if (mas->index > pivot)
......@@ -5312,6 +5310,9 @@ int mas_empty_area(struct ma_state *mas, unsigned long min,
unsigned long *pivots;
enum maple_type mt;
if (min >= max)
return -EINVAL;
if (mas_is_start(mas))
mas_start(mas);
else if (mas->offset >= 2)
......@@ -5366,6 +5367,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
{
struct maple_enode *last = mas->node;
if (min >= max)
return -EINVAL;
if (mas_is_start(mas)) {
mas_start(mas);
mas->offset = mas_data_end(mas);
......@@ -5385,7 +5389,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
mas->index = min;
mas->last = max;
while (!mas_rev_awalk(mas, size)) {
while (!mas_rev_awalk(mas, size, &min, &max)) {
if (last == mas->node) {
if (!mas_rewind_node(mas))
return -EBUSY;
......@@ -5400,17 +5404,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
if (unlikely(mas->offset == MAPLE_NODE_SLOTS))
return -EBUSY;
/*
* mas_rev_awalk() has set mas->min and mas->max to the gap values. If
* the maximum is outside the window we are searching, then use the last
* location in the search.
* mas->max and mas->min is the range of the gap.
* mas->index and mas->last are currently set to the search range.
*/
/* Trim the upper limit to the max. */
if (mas->max <= mas->last)
mas->last = mas->max;
if (max <= mas->last)
mas->last = max;
mas->index = mas->last - size + 1;
return 0;
......
......@@ -507,6 +507,15 @@ static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
static void cgwb_free_rcu(struct rcu_head *rcu_head)
{
struct bdi_writeback *wb = container_of(rcu_head,
struct bdi_writeback, rcu);
percpu_ref_exit(&wb->refcnt);
kfree(wb);
}
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
......@@ -529,11 +538,10 @@ static void cgwb_release_workfn(struct work_struct *work)
list_del(&wb->offline_node);
spin_unlock_irq(&cgwb_lock);
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
bdi_put(bdi);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
kfree_rcu(wb, rcu);
call_rcu(&wb->rcu, cgwb_free_rcu);
}
static void cgwb_release(struct percpu_ref *refcnt)
......
......@@ -1838,10 +1838,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
struct page *page = pfn_swap_entry_to_page(entry);
pmd_t newpmd;
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
if (is_writable_migration_entry(entry)) {
pmd_t newpmd;
/*
* A protection check is difficult so
* just be safe and disable write
......@@ -1855,8 +1855,16 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
newpmd = pmd_swp_mksoft_dirty(newpmd);
if (pmd_swp_uffd_wp(*pmd))
newpmd = pmd_swp_mkuffd_wp(newpmd);
set_pmd_at(mm, addr, pmd, newpmd);
} else {
newpmd = *pmd;
}
if (uffd_wp)
newpmd = pmd_swp_mkuffd_wp(newpmd);
else if (uffd_wp_resolve)
newpmd = pmd_swp_clear_uffd_wp(newpmd);
if (!pmd_same(*pmd, newpmd))
set_pmd_at(mm, addr, pmd, newpmd);
goto unlock;
}
#endif
......@@ -2657,9 +2665,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
is_hzp = is_huge_zero_page(&folio->page);
VM_WARN_ON_ONCE_FOLIO(is_hzp, folio);
if (is_hzp)
if (is_hzp) {
pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
return -EBUSY;
}
if (folio_test_writeback(folio))
return -EBUSY;
......@@ -3251,6 +3260,8 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
if (pmd_uffd_wp(pmdval))
pmdswp = pmd_swp_mkuffd_wp(pmdswp);
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
page_remove_rmap(page, vma, true);
put_page(page);
......
......@@ -572,6 +572,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_NON_PRESENT;
goto out;
}
if (pte_uffd_wp(pteval)) {
result = SCAN_PTE_UFFD_WP;
goto out;
}
page = vm_normal_page(vma, address, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
......
......@@ -148,35 +148,74 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end)
* into the virtual memory. If those physical pages already had shadow/origin,
* those are ignored.
*/
void kmsan_ioremap_page_range(unsigned long start, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int page_shift)
int kmsan_ioremap_page_range(unsigned long start, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int page_shift)
{
gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO;
struct page *shadow, *origin;
unsigned long off = 0;
int nr;
int nr, err = 0, clean = 0, mapped;
if (!kmsan_enabled || kmsan_in_runtime())
return;
return 0;
nr = (end - start) / PAGE_SIZE;
kmsan_enter_runtime();
for (int i = 0; i < nr; i++, off += PAGE_SIZE) {
for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) {
shadow = alloc_pages(gfp_mask, 1);
origin = alloc_pages(gfp_mask, 1);
__vmap_pages_range_noflush(
if (!shadow || !origin) {
err = -ENOMEM;
goto ret;
}
mapped = __vmap_pages_range_noflush(
vmalloc_shadow(start + off),
vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow,
PAGE_SHIFT);
__vmap_pages_range_noflush(
if (mapped) {
err = mapped;
goto ret;
}
shadow = NULL;
mapped = __vmap_pages_range_noflush(
vmalloc_origin(start + off),
vmalloc_origin(start + off + PAGE_SIZE), prot, &origin,
PAGE_SHIFT);
if (mapped) {
__vunmap_range_noflush(
vmalloc_shadow(start + off),
vmalloc_shadow(start + off + PAGE_SIZE));
err = mapped;
goto ret;
}
origin = NULL;
}
/* Page mapping loop finished normally, nothing to clean up. */
clean = 0;
ret:
if (clean > 0) {
/*
* Something went wrong. Clean up shadow/origin pages allocated
* on the last loop iteration, then delete mappings created
* during the previous iterations.
*/
if (shadow)
__free_pages(shadow, 1);
if (origin)
__free_pages(origin, 1);
__vunmap_range_noflush(
vmalloc_shadow(start),
vmalloc_shadow(start + clean * PAGE_SIZE));
__vunmap_range_noflush(
vmalloc_origin(start),
vmalloc_origin(start + clean * PAGE_SIZE));
}
flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
kmsan_leave_runtime();
return err;
}
void kmsan_iounmap_page_range(unsigned long start, unsigned long end)
......
......@@ -216,27 +216,29 @@ void kmsan_free_page(struct page *page, unsigned int order)
kmsan_leave_runtime();
}
void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages,
unsigned int page_shift)
int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages,
unsigned int page_shift)
{
unsigned long shadow_start, origin_start, shadow_end, origin_end;
struct page **s_pages, **o_pages;
int nr, mapped;
int nr, mapped, err = 0;
if (!kmsan_enabled)
return;
return 0;
shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW);
shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW);
if (!shadow_start)
return;
return 0;
nr = (end - start) / PAGE_SIZE;
s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL);
o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL);
if (!s_pages || !o_pages)
if (!s_pages || !o_pages) {
err = -ENOMEM;
goto ret;
}
for (int i = 0; i < nr; i++) {
s_pages[i] = shadow_page_for(pages[i]);
o_pages[i] = origin_page_for(pages[i]);
......@@ -249,10 +251,16 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
s_pages, page_shift);
KMSAN_WARN_ON(mapped);
if (mapped) {
err = mapped;
goto ret;
}
mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
o_pages, page_shift);
KMSAN_WARN_ON(mapped);
if (mapped) {
err = mapped;
goto ret;
}
kmsan_leave_runtime();
flush_tlb_kernel_range(shadow_start, shadow_end);
flush_tlb_kernel_range(origin_start, origin_end);
......@@ -262,6 +270,7 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
ret:
kfree(s_pages);
kfree(o_pages);
return err;
}
/* Allocate metadata for pages allocated at boot time. */
......
......@@ -790,61 +790,50 @@ static int vma_replace_policy(struct vm_area_struct *vma,
return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
/* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
VMA_ITERATOR(vmi, mm, start);
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
struct vm_area_struct *merged;
unsigned long vmstart, vmend;
pgoff_t pgoff;
int err;
prev = vma_prev(&vmi);
vma = vma_find(&vmi, end);
if (WARN_ON(!vma))
vmend = min(end, vma->vm_end);
if (start > vma->vm_start) {
*prev = vma;
vmstart = start;
} else {
vmstart = vma->vm_start;
}
if (mpol_equal(vma_policy(vma), new_pol))
return 0;
if (start > vma->vm_start)
prev = vma;
do {
unsigned long vmstart = max(start, vma->vm_start);
unsigned long vmend = min(end, vma->vm_end);
if (mpol_equal(vma_policy(vma), new_pol))
goto next;
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
anon_vma_name(vma));
if (prev) {
vma = prev;
goto replace;
}
if (vma->vm_start != vmstart) {
err = split_vma(&vmi, vma, vmstart, 1);
if (err)
goto out;
}
if (vma->vm_end != vmend) {
err = split_vma(&vmi, vma, vmend, 0);
if (err)
goto out;
}
replace:
err = vma_replace_policy(vma, new_pol);
pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, new_pol,
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (merged) {
*prev = merged;
return vma_replace_policy(merged, new_pol);
}
if (vma->vm_start != vmstart) {
err = split_vma(vmi, vma, vmstart, 1);
if (err)
goto out;
next:
prev = vma;
} for_each_vma_range(vmi, vma, end);
return err;
}
out:
return err;
if (vma->vm_end != vmend) {
err = split_vma(vmi, vma, vmend, 0);
if (err)
return err;
}
*prev = vma;
return vma_replace_policy(vma, new_pol);
}
/* Set the process memory policy */
......@@ -1259,6 +1248,8 @@ static long do_mbind(unsigned long start, unsigned long len,
nodemask_t *nmask, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct vma_iterator vmi;
struct mempolicy *new;
unsigned long end;
int err;
......@@ -1328,7 +1319,13 @@ static long do_mbind(unsigned long start, unsigned long len,
goto up_out;
}
err = mbind_range(mm, start, end, new);
vma_iter_init(&vmi, mm, start);
prev = vma_prev(&vmi);
for_each_vma_range(vmi, vma, end) {
err = mbind_range(&vmi, vma, &prev, start, end, new);
if (err)
break;
}
if (!err) {
int nr_failed = 0;
......@@ -1489,10 +1486,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
unsigned long, home_node, unsigned long, flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct vm_area_struct *vma, *prev;
struct mempolicy *new, *old;
unsigned long vmstart;
unsigned long vmend;
unsigned long end;
int err = -ENOENT;
VMA_ITERATOR(vmi, mm, start);
......@@ -1521,6 +1516,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
if (end == start)
return 0;
mmap_write_lock(mm);
prev = vma_prev(&vmi);
for_each_vma_range(vmi, vma, end) {
/*
* If any vma in the range got policy other than MPOL_BIND
......@@ -1541,9 +1537,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
}
new->home_node = home_node;
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
err = mbind_range(mm, vmstart, vmend, new);
err = mbind_range(&vmi, vma, &prev, start, end, new);
mpol_put(new);
if (err)
break;
......
......@@ -1518,7 +1518,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
*/
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
unsigned long length, gap;
unsigned long length, gap, low_limit;
struct vm_area_struct *tmp;
MA_STATE(mas, &current->mm->mm_mt, 0, 0);
......@@ -1527,12 +1528,29 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
if (length < info->length)
return -ENOMEM;
if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1,
length))
low_limit = info->low_limit;
retry:
if (mas_empty_area(&mas, low_limit, info->high_limit - 1, length))
return -ENOMEM;
gap = mas.index;
gap += (info->align_offset - gap) & info->align_mask;
tmp = mas_next(&mas, ULONG_MAX);
if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */
if (vm_start_gap(tmp) < gap + length - 1) {
low_limit = tmp->vm_end;
mas_reset(&mas);
goto retry;
}
} else {
tmp = mas_prev(&mas, 0);
if (tmp && vm_end_gap(tmp) > gap) {
low_limit = vm_end_gap(tmp);
mas_reset(&mas);
goto retry;
}
}
return gap;
}
......@@ -1548,7 +1566,8 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
*/
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
unsigned long length, gap;
unsigned long length, gap, high_limit, gap_end;
struct vm_area_struct *tmp;
MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
......@@ -1556,12 +1575,31 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
if (length < info->length)
return -ENOMEM;
if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1,
high_limit = info->high_limit;
retry:
if (mas_empty_area_rev(&mas, info->low_limit, high_limit - 1,
length))
return -ENOMEM;
gap = mas.last + 1 - info->length;
gap -= (gap - info->align_offset) & info->align_mask;
gap_end = mas.last;
tmp = mas_next(&mas, ULONG_MAX);
if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */
if (vm_start_gap(tmp) <= gap_end) {
high_limit = vm_start_gap(tmp);
mas_reset(&mas);
goto retry;
}
} else {
tmp = mas_prev(&mas, 0);
if (tmp && vm_end_gap(tmp) > gap) {
high_limit = tmp->vm_start;
mas_reset(&mas);
goto retry;
}
}
return gap;
}
......
......@@ -838,7 +838,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
tlb_finish_mmu(&tlb);
if (vma_iter_end(&vmi) < end)
if (!error && vma_iter_end(&vmi) < end)
error = -ENOMEM;
out:
......
......@@ -6632,7 +6632,21 @@ static void __build_all_zonelists(void *data)
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
unsigned long flags;
/*
* Explicitly disable this CPU's interrupts before taking seqlock
* to prevent any IRQ handler from calling into the page allocator
* (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
*/
local_irq_save(flags);
/*
* Explicitly disable this CPU's synchronous printk() before taking
* seqlock to prevent any printk() from trying to hold port->lock, for
* tty_insert_flip_string_and_push_buffer() on other CPU might be
* calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
*/
printk_deferred_enter();
write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
......@@ -6671,6 +6685,8 @@ static void __build_all_zonelists(void *data)
}
write_sequnlock(&zonelist_update_seq);
printk_deferred_exit();
local_irq_restore(flags);
}
static noinline void __init
......@@ -9450,6 +9466,9 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
if (PageReserved(page))
return false;
if (PageHuge(page))
return false;
}
return true;
}
......
......@@ -222,7 +222,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
folios_put(fbatch->folios, folio_batch_count(fbatch));
folio_batch_init(fbatch);
folio_batch_reinit(fbatch);
}
static void folio_batch_add_and_move(struct folio_batch *fbatch,
......
......@@ -313,8 +313,8 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
ioremap_max_page_shift);
flush_cache_vmap(addr, end);
if (!err)
kmsan_ioremap_page_range(addr, end, phys_addr, prot,
ioremap_max_page_shift);
err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
ioremap_max_page_shift);
return err;
}
......@@ -605,7 +605,11 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
page_shift);
if (ret)
return ret;
return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}
......
......@@ -39,7 +39,7 @@ help:
@echo ' turbostat - Intel CPU idle stats and freq reporting tool'
@echo ' usb - USB testing tools'
@echo ' virtio - vhost test module'
@echo ' vm - misc vm tools'
@echo ' mm - misc mm tools'
@echo ' wmi - WMI interface examples'
@echo ' x86_energy_perf_policy - Intel energy policy tool'
@echo ''
......@@ -69,7 +69,7 @@ acpi: FORCE
cpupower: FORCE
$(call descend,power/$@)
cgroup counter firewire hv guest bootconfig spi usb virtio vm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE
cgroup counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE
$(call descend,$@)
bpf/%: FORCE
......@@ -118,7 +118,7 @@ kvm_stat: FORCE
all: acpi cgroup counter cpupower gpio hv firewire \
perf selftests bootconfig spi turbostat usb \
virtio vm bpf x86_energy_perf_policy \
virtio mm bpf x86_energy_perf_policy \
tmon freefall iio objtool kvm_stat wmi \
pci debugging tracing thermal thermometer thermal-engine
......@@ -128,7 +128,7 @@ acpi_install:
cpupower_install:
$(call descend,power/$(@:_install=),install)
cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install:
cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install:
$(call descend,$(@:_install=),install)
selftests_install:
......@@ -158,7 +158,7 @@ kvm_stat_install:
install: acpi_install cgroup_install counter_install cpupower_install gpio_install \
hv_install firewire_install iio_install \
perf_install selftests_install turbostat_install usb_install \
virtio_install vm_install bpf_install x86_energy_perf_policy_install \
virtio_install mm_install bpf_install x86_energy_perf_policy_install \
tmon_install freefall_install objtool_install kvm_stat_install \
wmi_install pci_install debugging_install intel-speed-select_install \
tracing_install thermometer_install thermal-engine_install
......@@ -169,7 +169,7 @@ acpi_clean:
cpupower_clean:
$(call descend,power/cpupower,clean)
cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean:
cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean:
$(call descend,$(@:_clean=),clean)
libapi_clean:
......@@ -211,7 +211,7 @@ build_clean:
clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_clean \
perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \
vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
freefall_clean build_clean libbpf_clean libsubcmd_clean \
gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \
intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean
......
......@@ -857,7 +857,7 @@ int main(int argc, char **argv)
if (cull & CULL_PID || filter & FILTER_PID)
fprintf(fout, ", PID %d", list[i].pid);
if (cull & CULL_TGID || filter & FILTER_TGID)
fprintf(fout, ", TGID %d", list[i].pid);
fprintf(fout, ", TGID %d", list[i].tgid);
if (cull & CULL_COMM || filter & FILTER_COMM)
fprintf(fout, ", task_comm_name: %s", list[i].comm);
if (cull & CULL_ALLOCATOR) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment