// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_COLLAPSE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags on region of a vma, splitting it or merging it as
 * necessary.  Must be called with mmap_lock held for writing;
 * Caller should ensure anon_name stability by raising its refcount even when
 * anon_name belongs to a valid vma because this function might free that vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	VMA_ITERATOR(vmi, mm, start);

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
				    anon_name);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;

	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vm_flags_reset(vma, new_flags);
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;

		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, addr, &splug);
		if (page)
			put_page(page);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
	.walk_lock		= PGWALK_RDLOCK,
};

static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct page *page;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		if (!xa_is_value(page))
			continue;
		entry = radix_to_swp_entry(page);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		xas_pause(&xas);
		rcu_read_unlock();

		page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					     vma, addr, &splug);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}
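
/*
 * Illustrative userspace sketch (not part of this kernel file): the
 * MADV_WILLNEED path above is what services an application prefetch
 * hint on a mapped file, e.g.:
 *
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	if (p != MAP_FAILED)
 *		madvise(p, len, MADV_WILLNEED);	// schedule readahead
 *
 * fd and len are assumptions for the sketch; the call only schedules I/O
 * and returns without waiting for the pages to arrive.
 */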

static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;

	if (fatal_signal_pending(current))
		return -EINTR;

	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pfn_folio(pmd_pfn(orig_pmd));

		/* Do not interfere with other mappings of this folio */
		if (folio_estimated_sharers(folio) != 1)
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * Creating a THP page is expensive so split it only if we
		 * are sure it's worth it. Split it if we are the only owner.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_estimated_sharers(folio) != 1)
				break;
			if (pageout_anon_only_filter && !folio_test_anon(folio))
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/*
		 * Do not interfere with other mappings of this folio, and
		 * skip non-LRU folios.
		 */
		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a folio to accelerate its reclaim.
		 * The VM can't reclaim the folio unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out even if this process is neither
	 * the owner nor write-capable of the file. We therefore still allow
	 * private file mappings, so their dirty anon pages can be paged out.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}
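
/*
 * Illustrative userspace sketch (not part of this kernel file): the two
 * reclaim hints implemented above are typically issued by a memory
 * manager that knows a region is idle, e.g.:
 *
 *	#include <sys/mman.h>
 *
 *	madvise(addr, len, MADV_COLD);		// deactivate, reclaim later
 *	madvise(addr, len, MADV_PAGEOUT);	// reclaim right away
 *
 * addr/len are assumptions describing an existing mapping; both hints
 * keep the contents intact, unlike MADV_DONTNEED or MADV_FREE.
 */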

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent a swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				nr_swap--;
				free_swap_and_cache(entry);
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If pmd isn't transhuge but the folio is large and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_estimated_sharers(folio) != 1)
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If folio is shared with others, we mustn't clear
			 * the folio's dirty flag.
			 */
			if (folio_mapcount(folio) != 1) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so for
			 * portability, remap the pte as old|clean after
			 * clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
	.walk_lock		= PGWALK_RDLOCK,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}
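
/*
 * Illustrative userspace sketch (not part of this kernel file): MADV_FREE
 * is the lazy-free hint allocators use to return anonymous memory without
 * paying for an immediate unmap, e.g.:
 *
 *	#include <sys/mman.h>
 *
 *	// Block is free from the allocator's point of view; the kernel may
 *	// reclaim it under pressure, and a later write reuses it in place.
 *	madvise(block, block_len, MADV_FREE);
 *
 * block/block_len are assumptions and must lie in a private anonymous
 * mapping, the only case madvise_free_single_vma() accepts.
 */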

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range_single(vma, start, end - start, NULL);
	return 0;
}

static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	/*
	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
	 * boundaries, and may be unaware that this VMA uses huge pages.
	 * Avoid unexpected data loss by rounding down the number of
	 * huge pages freed.
	 */
	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

	return true;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	if (start == end)
		return 0;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = vma_lookup(mm, start);
		if (!vma)
			return -ENOMEM;
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long tmp_end;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/*
		 * We might have temporarily dropped the lock. For example,
		 * our VMA might have been split.
		 */
		if (!vma || start >= vma->vm_end) {
			vma = vma_lookup(mm, start);
			if (!vma)
				return -ENOMEM;
		}

		tmp_end = min_t(unsigned long, end, vma->vm_end);
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_vma_page_range(vma, start, tmp_end, write,
					       &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM:
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}
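
/*
 * Illustrative userspace sketch (not part of this kernel file): the
 * populate hints handled above let an application prefault a range ahead
 * of a latency-sensitive phase, e.g.:
 *
 *	#include <sys/mman.h>
 *
 *	madvise(buf, buf_len, MADV_POPULATE_READ);	// prefault for reading
 *	madvise(buf, buf_len, MADV_POPULATE_WRITE);	// prefault for writing
 *
 * buf/buf_len are assumptions; errors mirror the -EINVAL/-EHWPOISON/-EFAULT
 * cases translated in madvise_populate() above.
 */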

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if (!vma_is_shared_maywrite(vma))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return madvise_populate(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case MADV_COLLAPSE:
		return madvise_collapse(vma, prev, start, end);
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif
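
/*
 * Illustrative userspace sketch (not part of this kernel file): the error
 * injection path above backs memory-failure testing, e.g.:
 *
 *	#include <sys/mman.h>
 *
 *	// Needs CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE.
 *	madvise(page, pagesize, MADV_HWPOISON);		// simulate a hard error
 *	madvise(page, pagesize, MADV_SOFT_OFFLINE);	// migrate, then offline
 *
 * page/pagesize are assumptions for the sketch.
 */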

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
	case MADV_COLLAPSE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
	case MADV_COLLAPSE:
		return true;
	default:
		return false;
	}
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = find_vma(mm, prev->vm_end);
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file && !vma_is_anon_shmem(vma))
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
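
/*
 * Illustrative userspace sketch (not part of this kernel file):
 * madvise_set_anon_name() above is reached through prctl() rather than
 * madvise(), e.g.:
 *
 *	#include <sys/prctl.h>
 *	#include <linux/prctl.h>
 *
 *	// The region then shows up as [anon:thread stacks] in /proc/<pid>/maps
 *	// (needs CONFIG_ANON_VMA_NAME).
 *	prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
 *	      (unsigned long)addr, len, (unsigned long)"thread stacks");
 *
 * addr/len are assumptions describing an existing anonymous mapping.
 */
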
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *              range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	start = untagged_addr_remote(mm, start);
	end = start + len;

	blk_start_plug(&plug);
	error = madvise_walk_vmas(mm, start, end, behavior,
			madvise_vma_behavior);
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}
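
/*
 * Illustrative userspace sketch (not part of this kernel file): a typical
 * caller of the syscall defined above, releasing an anonymous range it has
 * finished with:
 *
 *	#include <sys/mman.h>
 *
 *	if (madvise(arena, arena_len, MADV_DONTNEED))	// next touch reads zeroes
 *		perror("madvise");
 *
 * arena/arena_len are assumptions; arena must be page-aligned, while the
 * length is rounded up to a page boundary by do_madvise().
 */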

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (!capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
					iter_iov_len(&iter), behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iter_iov_len(&iter));
	}

	ret = (total_len - iov_iter_count(&iter)) ? : ret;

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}
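
/*
 * Illustrative userspace sketch (not part of this kernel file): reclaiming
 * memory on behalf of another process via the syscall defined above, given
 * a pidfd for the target (e.g. from pidfd_open()):
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	struct iovec iov = {
 *		.iov_base = (void *)remote_addr,	// address in the target
 *		.iov_len  = remote_len,
 *	};
 *	long ret = syscall(SYS_process_madvise, pidfd, &iov, 1, MADV_PAGEOUT, 0);
 *
 * remote_addr, remote_len and pidfd are assumptions; the caller needs
 * CAP_SYS_NICE plus PTRACE_MODE_READ on the target, and only the hints
 * accepted by process_madvise_behavior_valid() are allowed.
 */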