Commit 95607ad9 authored by Linus Torvalds

Merge tag 'mm-hotfixes-stable-2022-08-22' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "Thirteen fixes, almost all for MM.

  Seven of these are cc:stable and the remainder fix up the changes
  which went into this -rc cycle"

* tag 'mm-hotfixes-stable-2022-08-22' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  kprobes: don't call disarm_kprobe() for disabled kprobes
  mm/shmem: shmem_replace_page() remember NR_SHMEM
  mm/shmem: tmpfs fallocate use file_modified()
  mm/shmem: fix chattr fsflags support in tmpfs
  mm/hugetlb: support write-faults in shared mappings
  mm/hugetlb: fix hugetlb not supporting softdirty tracking
  mm/uffd: reset write protection when unregister with wp-mode
  mm/smaps: don't access young/dirty bit if pte unpresent
  mm: add DEVICE_ZONE to FOR_ALL_ZONES
  kernel/sys_ni: add compat entry for fadvise64_64
  mm/gup: fix FOLL_FORCE COW security issue and remove FOLL_COW
  Revert "zram: remove double compression logic"
  get_maintainer: add Alan to .get_maintainer.ignore
parents 6234806f 9c80e799
+Alan Cox <alan@lxorguk.ukuu.org.uk>
+Alan Cox <root@hraefn.swansea.linux.org.uk>
 Christoph Hellwig <hch@lst.de>
 Marc Gonzalez <marc.w.gonzalez@free.fr>
@@ -1146,14 +1146,15 @@ static ssize_t bd_stat_show(struct device *dev,
 static ssize_t debug_stat_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	int version = 2;
+	int version = 1;
 	struct zram *zram = dev_to_zram(dev);
 	ssize_t ret;
 	down_read(&zram->init_lock);
 	ret = scnprintf(buf, PAGE_SIZE,
-			"version: %d\n%8llu\n",
+			"version: %d\n%8llu %8llu\n",
 			version,
+			(u64)atomic64_read(&zram->stats.writestall),
 			(u64)atomic64_read(&zram->stats.miss_free));
 	up_read(&zram->init_lock);
@@ -1351,7 +1352,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 {
 	int ret = 0;
 	unsigned long alloced_pages;
-	unsigned long handle = 0;
+	unsigned long handle = -ENOMEM;
 	unsigned int comp_len = 0;
 	void *src, *dst, *mem;
 	struct zcomp_strm *zstrm;
@@ -1369,6 +1370,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 	}
 	kunmap_atomic(mem);
+compress_again:
 	zstrm = zcomp_stream_get(zram->comp);
 	src = kmap_atomic(page);
 	ret = zcomp_compress(zstrm, src, &comp_len);
@@ -1377,20 +1379,39 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 	if (unlikely(ret)) {
 		zcomp_stream_put(zram->comp);
 		pr_err("Compression failed! err=%d\n", ret);
+		zs_free(zram->mem_pool, handle);
 		return ret;
 	}
 	if (comp_len >= huge_class_size)
 		comp_len = PAGE_SIZE;
-	handle = zs_malloc(zram->mem_pool, comp_len,
-			__GFP_KSWAPD_RECLAIM |
-			__GFP_NOWARN |
-			__GFP_HIGHMEM |
-			__GFP_MOVABLE);
+	/*
+	 * handle allocation has 2 paths:
+	 * a) fast path is executed with preemption disabled (for
+	 *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
+	 *  since we can't sleep;
+	 * b) slow path enables preemption and attempts to allocate
+	 *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
+	 *  put per-cpu compression stream and, thus, to re-do
+	 *  the compression once handle is allocated.
+	 *
+	 * if we have a 'non-null' handle here then we are coming
+	 * from the slow path and handle has already been allocated.
+	 */
+	if (IS_ERR((void *)handle))
+		handle = zs_malloc(zram->mem_pool, comp_len,
+				__GFP_KSWAPD_RECLAIM |
+				__GFP_NOWARN |
+				__GFP_HIGHMEM |
+				__GFP_MOVABLE);
 	if (IS_ERR((void *)handle)) {
 		zcomp_stream_put(zram->comp);
+		atomic64_inc(&zram->stats.writestall);
+		handle = zs_malloc(zram->mem_pool, comp_len,
+				GFP_NOIO | __GFP_HIGHMEM |
+				__GFP_MOVABLE);
+		if (!IS_ERR((void *)handle))
+			goto compress_again;
 		return PTR_ERR((void *)handle);
 	}
@@ -1948,6 +1969,7 @@ static int zram_add(void)
 	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
 		blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
+	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
 	ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
 	if (ret)
 		goto out_cleanup_disk;
...
@@ -81,6 +81,7 @@ struct zram_stats {
 	atomic64_t huge_pages_since;	/* no. of huge pages since zram set up */
 	atomic64_t pages_stored;	/* no. of pages currently stored */
 	atomic_long_t max_used_pages;	/* no. of maximum pages stored */
+	atomic64_t writestall;		/* no. of write slow paths */
 	atomic64_t miss_free;		/* no. of missed free */
 #ifdef CONFIG_ZRAM_WRITEBACK
 	atomic64_t bd_count;		/* no. of pages in backing device */
...
@@ -527,10 +527,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
-	bool migration = false;
+	bool migration = false, young = false, dirty = false;
 	if (pte_present(*pte)) {
 		page = vm_normal_page(vma, addr, *pte);
+		young = pte_young(*pte);
+		dirty = pte_dirty(*pte);
 	} else if (is_swap_pte(*pte)) {
 		swp_entry_t swpent = pte_to_swp_entry(*pte);
@@ -560,8 +562,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	if (!page)
 		return;
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
-		      locked, migration);
+	smaps_account(mss, page, false, young, dirty, locked, migration);
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
...
@@ -1601,6 +1601,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
 		}
+		/* Reset ptes for the whole vma range if wr-protected */
+		if (userfaultfd_wp(vma))
+			uffd_wp_range(mm, vma, start, vma_end - start, false);
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
...
@@ -2885,7 +2885,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
 #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
-#define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
 #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
 #define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
...
@@ -29,15 +29,10 @@ struct shmem_inode_info {
 	struct inode		vfs_inode;
 };
 #define SHMEM_FL_USER_VISIBLE		FS_FL_USER_VISIBLE
-#define SHMEM_FL_USER_MODIFIABLE	FS_FL_USER_MODIFIABLE
-#define SHMEM_FL_INHERITED		FS_FL_USER_MODIFIABLE
-/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
-/* Flags that are appropriate for non-directories/regular files. */
-#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define SHMEM_FL_USER_MODIFIABLE \
+	(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL)
+#define SHMEM_FL_INHERITED		(FS_NODUMP_FL | FS_NOATIME_FL)
 struct shmem_sb_info {
 	unsigned long max_blocks;   /* How many blocks are allowed */
...
@@ -73,6 +73,8 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, atomic_t *mmap_changing);
+extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+			  unsigned long start, unsigned long len, bool enable_wp);
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
...
@@ -20,12 +20,19 @@
 #define HIGHMEM_ZONE(xx)
 #endif
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
+#ifdef CONFIG_ZONE_DEVICE
+#define DEVICE_ZONE(xx) xx##_DEVICE,
+#else
+#define DEVICE_ZONE(xx)
+#endif
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, \
+	HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx)
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
-		FOR_ALL_ZONES(PGALLOC),
-		FOR_ALL_ZONES(ALLOCSTALL),
-		FOR_ALL_ZONES(PGSCAN_SKIP),
+		FOR_ALL_ZONES(PGALLOC)
+		FOR_ALL_ZONES(ALLOCSTALL)
+		FOR_ALL_ZONES(PGSCAN_SKIP)
 		PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
...
@@ -1707,11 +1707,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
 	/* Try to disarm and disable this/parent probe */
 	if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
 		/*
-		 * If 'kprobes_all_disarmed' is set, 'orig_p'
-		 * should have already been disarmed, so
-		 * skip unneed disarming process.
+		 * Don't be lazy here. Even if 'kprobes_all_disarmed'
+		 * is false, 'orig_p' might not have been armed yet.
+		 * Note arm_all_kprobes() __tries__ to arm all kprobes
+		 * on the best effort basis.
 		 */
-		if (!kprobes_all_disarmed) {
+		if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
 			ret = disarm_kprobe(orig_p, true);
 			if (ret) {
 				p->flags &= ~KPROBE_FLAG_DISABLED;
...
@@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self);
 /* mm/fadvise.c */
 COND_SYSCALL(fadvise64_64);
+COND_SYSCALL_COMPAT(fadvise64_64);
 /* mm/, CONFIG_MMU only */
 COND_SYSCALL(swapon);
...
@@ -478,14 +478,42 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 	return -EEXIST;
 }
-/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
+					struct vm_area_struct *vma,
+					unsigned int flags)
 {
-	return pte_write(pte) ||
-		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+	/* If the pte is writable, we can write to the page. */
+	if (pte_write(pte))
+		return true;
+	/* Maybe FOLL_FORCE is set to override it? */
+	if (!(flags & FOLL_FORCE))
+		return false;
+	/* But FOLL_FORCE has no effect on shared mappings */
+	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+		return false;
+	/* ... or read-only private ones */
+	if (!(vma->vm_flags & VM_MAYWRITE))
+		return false;
+	/* ... or already writable ones that just need to take a write fault */
+	if (vma->vm_flags & VM_WRITE)
+		return false;
+	/*
+	 * See can_change_pte_writable(): we broke COW and could map the page
+	 * writable if we have an exclusive anonymous page ...
+	 */
+	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+		return false;
+	/* ... and a write-fault isn't required for other reasons. */
+	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+		return false;
+	return !userfaultfd_pte_wp(vma, pte);
 }
 static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -528,12 +556,19 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	}
 	if ((flags & FOLL_NUMA) && pte_protnone(pte))
 		goto no_page;
-	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
-		pte_unmap_unlock(ptep, ptl);
-		return NULL;
-	}
 	page = vm_normal_page(vma, address, pte);
+	/*
+	 * We only care about anon pages in can_follow_write_pte() and don't
+	 * have to worry about pte_devmap() because they are never anon.
+	 */
+	if ((flags & FOLL_WRITE) &&
+	    !can_follow_write_pte(pte, page, vma, flags)) {
+		page = NULL;
+		goto out;
+	}
 	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
 		/*
 		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
@@ -986,17 +1021,6 @@ static int faultin_page(struct vm_area_struct *vma,
 		return -EBUSY;
 	}
-	/*
-	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
-	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
-	 * can thus safely do subsequent page lookups as if they were reads.
-	 * But only do so when looping for pte_write is futile: in some cases
-	 * userspace may also be wanting to write to the gotten user page,
-	 * which a read fault here might prevent (a readonly page might get
-	 * reCOWed by userspace write).
-	 */
-	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
-		*flags |= FOLL_COW;
 	return 0;
 }
...
@@ -1040,12 +1040,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 	assert_spin_locked(pmd_lockptr(mm, pmd));
-	/*
-	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
-	 * not be in this function with `flags & FOLL_COW` set.
-	 */
-	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
 	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 			 (FOLL_PIN | FOLL_GET)))
@@ -1395,14 +1389,42 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 	return VM_FAULT_FALLBACK;
 }
-/*
- * FOLL_FORCE can write to even unwritable pmd's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+					struct vm_area_struct *vma,
+					unsigned int flags)
 {
-	return pmd_write(pmd) ||
-		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+	/* If the pmd is writable, we can write to the page. */
+	if (pmd_write(pmd))
+		return true;
+	/* Maybe FOLL_FORCE is set to override it? */
+	if (!(flags & FOLL_FORCE))
+		return false;
+	/* But FOLL_FORCE has no effect on shared mappings */
+	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+		return false;
+	/* ... or read-only private ones */
+	if (!(vma->vm_flags & VM_MAYWRITE))
+		return false;
+	/* ... or already writable ones that just need to take a write fault */
+	if (vma->vm_flags & VM_WRITE)
+		return false;
+	/*
+	 * See can_change_pte_writable(): we broke COW and could map the page
+	 * writable if we have an exclusive anonymous page ...
+	 */
+	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+		return false;
+	/* ... and a write-fault isn't required for other reasons. */
+	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+		return false;
+	return !userfaultfd_huge_pmd_wp(vma, pmd);
 }
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1411,12 +1433,16 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned int flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct page *page = NULL;
+	struct page *page;
 	assert_spin_locked(pmd_lockptr(mm, pmd));
-	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
-		goto out;
+	page = pmd_page(*pmd);
+	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+	if ((flags & FOLL_WRITE) &&
+	    !can_follow_write_pmd(*pmd, page, vma, flags))
+		return NULL;
 	/* Avoid dumping huge zero page */
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
@@ -1424,10 +1450,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	/* Full NUMA hinting faults to serialise migration in fault paths */
 	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
-		goto out;
-	page = pmd_page(*pmd);
-	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+		return NULL;
 	if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
 		return ERR_PTR(-EMLINK);
@@ -1444,7 +1467,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
-out:
 	return page;
 }
...
@@ -5241,6 +5241,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 	VM_BUG_ON(unshare && (flags & FOLL_WRITE));
 	VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
+	/*
+	 * hugetlb does not support FOLL_FORCE-style write faults that keep the
+	 * PTE mapped R/O such as maybe_mkwrite() would do.
+	 */
+	if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
+		return VM_FAULT_SIGSEGV;
+	/* Let's take out MAP_SHARED mappings first. */
+	if (vma->vm_flags & VM_MAYSHARE) {
+		if (unlikely(unshare))
+			return 0;
+		set_huge_ptep_writable(vma, haddr, ptep);
+		return 0;
+	}
 	pte = huge_ptep_get(ptep);
 	old_page = pte_page(pte);
@@ -5781,12 +5796,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * If we are going to COW/unshare the mapping later, we examine the
 	 * pending reservations for this page now. This will ensure that any
 	 * allocations necessary to record that reservation occur outside the
-	 * spinlock. For private mappings, we also lookup the pagecache
-	 * page now as it is used to determine if a reservation has been
-	 * consumed.
+	 * spinlock. Also lookup the pagecache page now as it is used to
+	 * determine if a reservation has been consumed.
 	 */
 	if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
-	    !huge_pte_write(entry)) {
+	    !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, haddr) < 0) {
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
@@ -5794,9 +5808,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Just decrements count, does not deallocate */
 		vma_end_reservation(h, vma, haddr);
-		if (!(vma->vm_flags & VM_MAYSHARE))
-			pagecache_page = hugetlbfs_pagecache_page(h,
-								vma, haddr);
+		pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
 	}
 	ptl = huge_pte_lock(h, mm, ptep);
...
@@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
 	    pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
 		return 0;
-	/* Do we need to track softdirty? */
-	if (vma_soft_dirty_enabled(vma))
+	/*
+	 * Do we need to track softdirty? hugetlb does not support softdirty
+	 * tracking yet.
+	 */
+	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
 		return 1;
 	/* Specialty mapping? */
...
@@ -1659,7 +1659,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 		new = page_folio(newpage);
 		mem_cgroup_migrate(old, new);
 		__inc_lruvec_page_state(newpage, NR_FILE_PAGES);
+		__inc_lruvec_page_state(newpage, NR_SHMEM);
 		__dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
+		__dec_lruvec_page_state(oldpage, NR_SHMEM);
 	}
 	xa_unlock_irq(&swap_mapping->i_pages);
@@ -2281,16 +2283,34 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
-/* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned shmem_mask_flags(umode_t mode, __u32 flags)
-{
-	if (S_ISDIR(mode))
-		return flags;
-	else if (S_ISREG(mode))
-		return flags & SHMEM_REG_FLMASK;
-	else
-		return flags & SHMEM_OTHER_FLMASK;
-}
+#ifdef CONFIG_TMPFS_XATTR
+static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+/*
+ * chattr's fsflags are unrelated to extended attributes,
+ * but tmpfs has chosen to enable them under the same config option.
+ */
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+{
+	unsigned int i_flags = 0;
+	if (fsflags & FS_NOATIME_FL)
+		i_flags |= S_NOATIME;
+	if (fsflags & FS_APPEND_FL)
+		i_flags |= S_APPEND;
+	if (fsflags & FS_IMMUTABLE_FL)
+		i_flags |= S_IMMUTABLE;
+	/*
+	 * But FS_NODUMP_FL does not require any action in i_flags.
+	 */
+	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
+}
+#else
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+{
+}
+#define shmem_initxattrs NULL
+#endif
 static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
 				     umode_t mode, dev_t dev, unsigned long flags)
@@ -2319,7 +2339,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
 		info->i_crtime = inode->i_mtime;
 		info->fsflags = (dir == NULL) ? 0 :
 			SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
-		info->fsflags = shmem_mask_flags(mode, info->fsflags);
+		if (info->fsflags)
+			shmem_set_inode_flags(inode, info->fsflags);
 		INIT_LIST_HEAD(&info->shrinklist);
 		INIT_LIST_HEAD(&info->swaplist);
 		simple_xattrs_init(&info->xattrs);
@@ -2468,12 +2489,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_short_symlink_operations;
-#ifdef CONFIG_TMPFS_XATTR
-static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
-#else
-#define shmem_initxattrs NULL
-#endif
 static int
 shmem_write_begin(struct file *file, struct address_space *mapping,
 		  loff_t pos, unsigned len,
@@ -2826,12 +2841,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
 		i_size_write(inode, offset + len);
-	inode->i_ctime = current_time(inode);
 undone:
 	spin_lock(&inode->i_lock);
 	inode->i_private = NULL;
 	spin_unlock(&inode->i_lock);
 out:
+	if (!error)
+		file_modified(file);
 	inode_unlock(inode);
 	return error;
 }
@@ -3179,18 +3195,13 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns,
 	if (fileattr_has_fsx(fa))
 		return -EOPNOTSUPP;
+	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
+		return -EOPNOTSUPP;
 	info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
 		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
-	inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME);
-	if (info->fsflags & FS_APPEND_FL)
-		inode->i_flags |= S_APPEND;
-	if (info->fsflags & FS_IMMUTABLE_FL)
-		inode->i_flags |= S_IMMUTABLE;
-	if (info->fsflags & FS_NOATIME_FL)
-		inode->i_flags |= S_NOATIME;
+	shmem_set_inode_flags(inode, info->fsflags);
 	inode->i_ctime = current_time(inode);
 	return 0;
 }
...
@@ -703,14 +703,29 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
 			     mmap_changing, 0);
 }
+void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+		   unsigned long start, unsigned long len, bool enable_wp)
+{
+	struct mmu_gather tlb;
+	pgprot_t newprot;
+	if (enable_wp)
+		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+	else
+		newprot = vm_get_page_prot(dst_vma->vm_flags);
+	tlb_gather_mmu(&tlb, dst_mm);
+	change_protection(&tlb, dst_vma, start, start + len, newprot,
+			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+	tlb_finish_mmu(&tlb);
+}
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 			unsigned long len, bool enable_wp,
 			atomic_t *mmap_changing)
 {
 	struct vm_area_struct *dst_vma;
 	unsigned long page_mask;
-	struct mmu_gather tlb;
-	pgprot_t newprot;
 	int err;
 	/*
@@ -750,15 +765,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 		goto out_unlock;
 	}
-	if (enable_wp)
-		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
-	else
-		newprot = vm_get_page_prot(dst_vma->vm_flags);
-	tlb_gather_mmu(&tlb, dst_mm);
-	change_protection(&tlb, dst_vma, start, start + len, newprot,
-			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
-	tlb_finish_mmu(&tlb);
+	uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
 	err = 0;
 out_unlock:
...
@@ -1168,8 +1168,15 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 #define TEXT_FOR_HIGHMEM(xx)
 #endif
+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
-					TEXT_FOR_HIGHMEM(xx) xx "_movable",
+					TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+					TEXT_FOR_DEVICE(xx)
 const char * const vmstat_text[] = {
 	/* enum zone_stat_item counters */
...