// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

#ifndef arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
	/*
	 * Architectures that don't have a hardware access flag need to
	 * implement their own helper. By default, "true" means a page
	 * fault will be taken on an old pte.
	 */
	return true;
}
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
	trace_rss_stat(mm, member, count);
}
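
/*
 * With SPLIT_RSS_COUNTING, rss deltas are accumulated in current->rss_stat
 * and folded back into the mm counters by sync_mm_rss(), either explicitly
 * or after TASK_RSS_EVENTS_THRESH page-fault events.
 */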

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * Page table pages are added to the mmu_gather with PAGE_SIZE
	 * granularity (see pte_free_tlb()), so flush the TLB if the
	 * gather was batching a different page size.
	 */
	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}

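/*
 * Allocate a new pte page table for @pmd.  If another thread populated
 * the pmd while we were allocating, the new page is simply freed again.
 */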
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_rmb() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;
		if (pte_devmap(pte))
			return NULL;

		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (pmd_devmap(pmd))
		return NULL;
	if (is_huge_zero_pmd(pmd))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

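/*
 * Copy a pte that is not backed by a present page: a swap, migration or
 * device-private entry.  Returns the swap entry value when the caller
 * must allocate a swap count continuation before retrying, 0 otherwise.
 */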
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;
	swp_entry_t entry = pte_to_swp_entry(pte);

	if (likely(!non_swap_entry(entry))) {
		if (swap_duplicate(entry) < 0)
			return entry.val;

		/* make sure dst_mm is on swapoff's mmlist. */
		if (unlikely(list_empty(&dst_mm->mmlist))) {
			spin_lock(&mmlist_lock);
			if (list_empty(&dst_mm->mmlist))
				list_add(&dst_mm->mmlist,
						&src_mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		rss[MM_SWAPENTS]++;
	} else if (is_migration_entry(entry)) {
		page = migration_entry_to_page(entry);

		rss[mm_counter(page)]++;

		if (is_write_migration_entry(entry) &&
				is_cow_mapping(vm_flags)) {
			/*
			 * COW mappings require pages in both
			 * parent and child to be set to read.
			 */
			make_migration_entry_read(&entry);
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(*src_pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(*src_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_private_entry(entry)) {
		page = device_private_entry_to_page(entry);

		/*
		 * Update rss count even for unaddressable pages, as
		 * they should be treated just like normal pages in this
		 * respect.
		 *
		 * We will likely want to have some new rss counters
		 * for unaddressable pages, at some point. But for now
		 * keep things as they are.
		 */
		get_page(page);
		rss[mm_counter(page)]++;
		page_dup_rmap(page, false);

		/*
		 * We do not preserve soft-dirty information, because so
		 * far, checkpoint/restore is the only feature that
		 * requires that. And checkpoint/restore does not work
		 * when a device driver is involved (you cannot easily
		 * save and restore device driver state).
		 */
		if (is_write_device_private_entry(entry) &&
		    is_cow_mapping(vm_flags)) {
			make_device_private_entry_read(&entry);
			pte = swp_entry_to_pte(entry);
			if (pte_swp_uffd_wp(*src_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	}
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

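/*
 * Copy a present pte into the child: write-protect both copies for COW
 * mappings, mark the child's copy clean for shared mappings, and take a
 * reference plus an rmap dup on the page backing a normal mapping.
 */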
static inline void
copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	/*
	 * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
	 * does not have the VM_UFFD_WP, which means that the uffd
	 * fork event is not enabled.
	 */
	if (!(vm_flags & VM_UFFD_WP))
		pte = pte_clear_uffd_wp(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	}

	set_pte_at(dst_mm, addr, dst_pte, pte);
}

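/*
 * Copy one pte page table's worth of entries from @src_pmd to @dst_pmd,
 * periodically dropping both pte locks so that we neither hold them for
 * too long nor block a pending reschedule.
 */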
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		if (unlikely(!pte_present(*src_pte))) {
			entry.val = copy_nonpresent_pte(dst_mm, src_mm,
							dst_pte, src_pte,
							vma, addr, rss);
			if (entry.val)
				break;
			progress += 8;
			continue;
		}
		copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
				 vma, addr, rss);
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

873
	arch_leave_lazy_mmu_mode();
874
	spin_unlock(src_ptl);
875
	pte_unmap(orig_src_pte);
KAMEZAWA Hiroyuki's avatar
877
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
878
	cond_resched();
879 880 881 882 883 884

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	struct mmu_notifier_range range;
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);

	if (is_cow) {
		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
					0, vma, src_mm, addr, end);
		mmu_notifier_invalidate_range_start(&range);
	}

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(&range);
	return ret;
}

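/*
 * Unmap one pte page table's worth of entries, batching page frees in
 * the mmu_gather.  A TLB flush is forced before the pte lock is dropped
 * when a dirty file page was unmapped or the gather batch filled up.
 */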
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_change_page_size(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent))
			continue;

		if (need_resched())
			break;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page_rmapping(page))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;

			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}

		entry = pte_to_swp_entry(ptent);
		if (is_device_private_entry(entry)) {
			struct page *page = device_private_entry_to_page(entry);

			if (unlikely(details && details->check_mapping)) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping !=
				    page_rmapping(page))
					continue;
			}

			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			put_page(page);
			continue;
		}

		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;

		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);
			rss[mm_counter(page)]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu(tlb);
	}

	if (addr != end) {
		cond_resched();
		goto again;
	}

	return addr;
}

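/*
 * Walk the pmds under one pud: huge pmds are split or zapped as a unit,
 * everything else is handed down to zap_pte_range().
 */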
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_lock in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
1228 1229
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
1230
				mmap_assert_locked(tlb->mm);
1231 1232 1233 1234 1235
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}

void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}


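/*
 * Unmap the part of a single vma that overlaps [start_addr, end_addr),
 * handling the hugetlb and PFNMAP special cases.
 */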
static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start_addr' and `end_addr' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
				start_addr, end_addr);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(&range);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				start, start + size);
	tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, range.end, NULL);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, start, range.end);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address, address + size);
	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	unmap_single_vma(&tlb, vma, address, range.end, details);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, address, range.end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    !(vma->vm_flags & VM_PFNMAP))
		return;

	zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

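/*
 * Walk down to the pmd covering @addr, allocating any missing
 * intermediate tables.  Returns NULL if an allocation fails.
 */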
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pmd;
}

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pmd_t *pmd = walk_to_pmd(mm, addr);

	if (!pmd)
		return NULL;
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}

static int validate_page_before_insert(struct page *page)
{
	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
		return -EINVAL;
	flush_dcache_page(page);
	return 0;
}

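/*
 * Map @page at @addr with the pte lock already held.  Returns -EBUSY if
 * a pte is already installed there.
 */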
static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	if (!pte_none(*pte))
		return -EBUSY;
	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));
	return 0;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = validate_page_before_insert(page);
	if (retval)
		goto out;
	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

#ifdef pte_index
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	int err;

	if (!page_count(page))
		return -EINVAL;
	err = validate_page_before_insert(page);
	if (err)
		return err;
	return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}

/* insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop. Arch *must* define pte_index.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num, pgprot_t prot)
{
	pmd_t *pmd = NULL;
	pte_t *start_pte, *pte;
	spinlock_t *pte_lock;
	struct mm_struct *const mm = vma->vm_mm;
	unsigned long curr_page_idx = 0;
	unsigned long remaining_pages_total = *num;
	unsigned long pages_to_write_in_pmd;
	int ret;
more:
	ret = -EFAULT;
	pmd = walk_to_pmd(mm, addr);
	if (!pmd)
		goto out;

	pages_to_write_in_pmd = min_t(unsigned long,
		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

	/* Allocate the PTE if necessary; takes PMD lock once only. */
	ret = -ENOMEM;
	if (pte_alloc(mm, pmd))
		goto out;

	while (pages_to_write_in_pmd) {
		int pte_idx = 0;
		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
			int err = insert_page_in_batch_locked(mm, pte,
				addr, pages[curr_page_idx], prot);
			if (unlikely(err)) {
				pte_unmap_unlock(start_pte, pte_lock);
				ret = err;
				remaining_pages_total -= pte_idx;
				goto out;
			}
			addr += PAGE_SIZE;
			++curr_page_idx;
		}
		pte_unmap_unlock(start_pte, pte_lock);
		pages_to_write_in_pmd -= batch_size;
		remaining_pages_total -= batch_size;
	}
	if (remaining_pages_total)
		goto more;
	ret = 0;
out:
	*num = remaining_pages_total;
	return ret;
}
#endif  /* ifdef pte_index */

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num)
{
#ifdef pte_index
	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;

	if (addr < vma->vm_start || end_addr >= vma->vm_end)
		return -EFAULT;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	/* Defer page refcount checking till we're about to map that page. */
	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
	unsigned long idx = 0, pgcount = *num;
	int err = -EINVAL;

	for (; idx < pgcount; ++idx) {
		err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
		if (err)
			break;
	}
	*num = pgcount - idx;
	return err;
#endif  /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);
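/*
 * Illustrative sketch, not built here: how a hypothetical driver's ->mmap()
 * handler might batch-map an array of pages it already holds references to.
 * The names my_dev_mmap, my_pages and my_npages are invented for the example;
 * the in/out convention of *num follows the kernel-doc above.
 *
 *	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long num = my_npages;	// in: pages to map
 *		int err;
 *
 *		err = vm_insert_pages(vma, vma->vm_start, my_pages, &num);
 *		// on return, num is the count of pages that were NOT mapped
 *		if (err)
 *			pr_warn("left %lu pages unmapped (%d)\n", num, err);
 *		return err;
 *	}
 */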

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 *
 * Return: 0 on success and error code otherwise.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num, unsigned long offset)
{
	unsigned long count = vma_pages(vma);
	unsigned long uaddr = vma->vm_start;
	int ret, i;

	/* Fail if the user requested offset is beyond the end of the object */
	if (offset >= num)
		return -ENXIO;

	/* Fail if the user requested size exceeds available object size */
	if (count > num - offset)
		return -ENXIO;

	for (i = 0; i < count; i++) {
		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
		if (ret < 0)
			return ret;
		uaddr += PAGE_SIZE;
	}

	return 0;
}

/**
 * vm_map_pages - map a range of kernel pages starting at a non-zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map a range of kernel pages starting at offset 0
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for the drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);

static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, entry;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return VM_FAULT_OOM;
	if (!pte_none(*pte)) {
		if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page.  In the mkwrite
			 * case we are creating a writable PTE for a shared
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
			 */
			if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
				goto out_unlock;
			}
			entry = pte_mkyoung(*pte);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
				update_mmu_cache(vma, addr, pte);
		}
		goto out_unlock;
	}

	/* Ok, finally just insert the thing.. */
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));

	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}

	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

out_unlock:
	pte_unmap_unlock(pte, ptl);
	return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * See vmf_insert_mixed_prot() for a discussion of the implication of using
 * a value of @pgprot different from that of @vma->vm_page_prot.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (!pfn_modify_allowed(pfn, pgprot))
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
			false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);
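/*
 * Illustrative sketch, not built here: a minimal vm_ops->fault handler for a
 * hypothetical device exposing a single page of MMIO at my_phys_addr.  As the
 * kernel-doc above says, the handler simply returns the value of
 * vmf_insert_pfn(); my_phys_addr and my_dev_vm_ops are invented names, and
 * the vma is assumed to have been set up as VM_PFNMAP at mmap time.
 *
 *	static vm_fault_t my_dev_fault(struct vm_fault *vmf)
 *	{
 *		return vmf_insert_pfn(vmf->vma, vmf->address,
 *				      my_phys_addr >> PAGE_SHIFT);
 *	}
 *
 *	static const struct vm_operations_struct my_dev_vm_ops = {
 *		.fault = my_dev_fault,
 *	};
 */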

static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
	/* these checks mirror the abort conditions in vm_normal_page */
	if (vma->vm_flags & VM_MIXEDMAP)
		return true;
	if (pfn_t_devmap(pfn))
		return true;
	if (pfn_t_special(pfn))
		return true;
	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
		return true;
	return false;
}

static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn, pgprot_t pgprot,
		bool mkwrite)
{
	int err;

	BUG_ON(!vm_mixed_ok(vma, pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
		return VM_FAULT_SIGBUS;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
	 * without pte special, it would there be refcounted as a normal page.
	 */
	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
		struct page *page;

		/*
		 * At this point we are committed to insert_page()
		 * regardless of whether the caller specified flags that
		 * result in pfn_t_has_page() == false.
		 */
		page = pfn_to_page(pfn_t_to_pfn(pfn));
		err = insert_page(vma, addr, page, pgprot);
	} else {
		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
	}

	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_mixed(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * Typically this function should be used by drivers to set caching- and
 * encryption bits different than those of @vma->vm_page_prot, because
 * the caching- or encryption mode may not be known at mmap() time.
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
				 pfn_t pfn, pgprot_t pgprot)
{
	return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
		pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 *  If the insertion of PTE failed because someone else already added a
 *  different entry in the mean time, we treat that as success as we assume
 *  the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

/*
 * Maps a range of physical memory into the requested pages. The old
 * mappings are removed. Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;
	int err = 0;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		if (!pfn_modify_allowed(pfn, prot)) {
			err = -EACCES;
			break;
		}
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return err;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		err = remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long remap_pfn = pfn;
	int err;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
		return -EINVAL;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_p4d_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);
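/*
 * Illustrative sketch, not built here: the classic use of remap_pfn_range()
 * from a hypothetical driver's ->mmap() handler, mapping a physically
 * contiguous buffer described by the invented names my_buf_phys/my_buf_size.
 * The whole VMA is mapped in one call while the mmap() path holds mmap_lock
 * for write.
 *
 *	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		if (size > my_buf_size)
 *			return -EINVAL;
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       my_buf_phys >> PAGE_SHIFT,
 *				       size, vma->vm_page_prot);
 *	}
 */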

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to set up
 * write-combining or similar caching behaviour.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);
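/*
 * Illustrative sketch, not built here: with vm_iomap_memory() the same kind
 * of ->mmap() handler shrinks to a one-liner, because the offset and length
 * checks are derived from the vma itself.  my_bar_start and my_bar_len are
 * invented names for a device aperture (physical address and size in bytes).
 *
 *	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_iomap_memory(vma, my_bar_start, my_bar_len);
 *	}
 */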

static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (create) {
		pte = (mm == &init_mm) ?
			pte_alloc_kernel_track(pmd, addr, mask) :
			pte_alloc_map_lock(mm, pmd, addr, &ptl);
		if (!pte)
			return -ENOMEM;
	} else {
		pte = (mm == &init_mm) ?
			pte_offset_kernel(pmd, addr) :
			pte_offset_map_lock(mm, pmd, addr, &ptl);
	}

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	do {
		if (create || !pte_none(*pte)) {
			err = fn(pte++, addr, data);
			if (err)
				break;
		}
	} while (addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	BUG_ON(pud_huge(*pud));

	if (create) {
		pmd = pmd_alloc_track(mm, pud, addr, mask);
		if (!pmd)
			return -ENOMEM;
	} else {
		pmd = pmd_offset(pud, addr);
	}
	do {
		next = pmd_addr_end(addr, end);
		if (create || !pmd_none_or_clear_bad(pmd)) {
			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (pmd++, addr = next, addr != end);
	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	if (create) {
		pud = pud_alloc_track(mm, p4d, addr, mask);
		if (!pud)
			return -ENOMEM;
	} else {
		pud = pud_offset(p4d, addr);
	}
	do {
		next = pud_addr_end(addr, end);
		if (create || !pud_none_or_clear_bad(pud)) {
			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (pud++, addr = next, addr != end);
	return err;
}

static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int err = 0;

	if (create) {
		p4d = p4d_alloc_track(mm, pgd, addr, mask);
		if (!p4d)
			return -ENOMEM;
	} else {
		p4d = p4d_offset(pgd, addr);
	}
	do {
		next = p4d_addr_end(addr, end);
		if (create || !p4d_none_or_clear_bad(p4d)) {
			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (p4d++, addr = next, addr != end);
	return err;
}

static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn,
				 void *data, bool create)
{
	pgd_t *pgd;
	unsigned long start = addr, next;
	unsigned long end = addr + size;
	pgtbl_mod_mask mask = 0;
	int err = 0;

	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (!create && pgd_none_or_clear_bad(pgd))
			continue;
		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, start + size);

	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table where it exists.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, false);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
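/*
 * Illustrative sketch, not built here: a pte_fn_t callback that counts how
 * many leaf entries are populated over a range, without allocating missing
 * page tables.  For a user mm the callback runs under the page-table lock and
 * so must not sleep; my_count_pte and nr are invented names.
 *
 *	static int my_count_pte(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		unsigned long *nr = data;
 *
 *		if (!pte_none(*pte))
 *			(*nr)++;
 *		return 0;
 *	}
 *
 *	// unsigned long nr = 0;
 *	// apply_to_existing_page_range(mm, addr, size, my_count_pte, &nr);
 */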

/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

static inline bool cow_user_page(struct page *dst, struct page *src,
				 struct vm_fault *vmf)
{
	bool ret;
	void *kaddr;
	void __user *uaddr;
	bool locked = false;
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = vmf->address;

	if (likely(src)) {
		copy_user_highpage(dst, src, addr, vma);
		return true;
	}

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	kaddr = kmap_atomic(dst);
	uaddr = (void __user *)(addr & PAGE_MASK);

	/*
	 * On architectures with software "accessed" bits, we would
	 * take a double page fault, so mark it accessed here.
	 */
	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
		pte_t entry;

		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		locked = true;
		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
			/*
			 * Another thread has already handled the fault;
			 * just update the local TLB and bail out.
			 */
			update_mmu_tlb(vma, addr, vmf->pte);
			ret = false;
			goto pte_unlock;
		}

		entry = pte_mkyoung(vmf->orig_pte);
		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
			update_mmu_cache(vma, addr, vmf->pte);
	}

	/*
	 * This really shouldn't fail, because the page is there
	 * in the page tables. But it might just be unreadable,
	 * in which case we just give up and fill the result with
	 * zeroes.
	 */
	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
		if (locked)
			goto warn;

		/* Re-validate under PTL if the page is still mapped */
		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		locked = true;
		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
			/* The PTE changed under us, update local tlb */
			update_mmu_tlb(vma, addr, vmf->pte);
			ret = false;
			goto pte_unlock;
		}

		/*
		 * The same page can be mapped back since last copy attempt.
		 * Try to copy again under PTL.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
			/*
			 * Give a warn in case there can be some obscure
			 * use-case
			 */
warn:
			WARN_ON_ONCE(1);
			clear_page(kaddr);
		}
	}

	ret = true;

pte_unlock:
	if (locked)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	kunmap_atomic(kaddr);
	flush_dcache_page(dst);

	return ret;
}

static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
	struct file *vm_file = vma->vm_file;

	if (vm_file)
		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	/*
	 * Special mappings (e.g. VDSO) do not have any file so fake
	 * a default GFP_KERNEL for them.
	 */
	return GFP_KERNEL;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct page *page = vmf->page;
	unsigned int old_flags = vmf->flags;

	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

	if (vmf->vma->vm_file &&
	    IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
		return VM_FAULT_SIGBUS;

	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
	/* Restore original flags so that caller is not surprised */
	vmf->flags = old_flags;
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0; /* retry */
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}

/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping;
	struct page *page = vmf->page;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = set_page_dirty(page);
	VM_BUG_ON_PAGE(PageAnon(page), page);
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page().   The address_space itself remains
	 * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
	 * release semantics to prevent the compiler from undoing this copying.
	 */
	mapping = page_rmapping(page);
	unlock_page(page);

	if (!page_mkwrite)
		file_update_time(vma->vm_file);

	/*
	 * Throttle page dirtying rate down to writeback speed.
	 *
	 * mapping may be NULL here because some device drivers do not
	 * set page.mapping but still dirty their pages
	 *
	 * Drop the mmap_lock before waiting on IO, if we can. The file
	 * is pinning the mapping, as per above.
	 */
	if ((dirtied || page_mkwrite) && mapping) {
		struct file *fpin;

		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
		balance_dirty_pages_ratelimited(mapping);
		if (fpin) {
			fput(fpin);
			return VM_FAULT_RETRY;
		}
	}

	return 0;
}

/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = vmf->page;
	pte_t entry;
	/*
	 * Clear the pages cpupid information as the existing
	 * information potentially belongs to a now completely
	 * unrelated process.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	count_vm_event(PGREUSE);
}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	struct mmu_notifier_range range;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				vmf->address);
		if (!new_page)
			goto oom;

		if (!cow_user_page(new_page, old_page, vmf)) {
			/*
			 * COW failed, if the fault was solved by other,
			 * it's fine. If not, userspace would re-fault on
			 * the same address and we will handle the fault
			 * from the second attempt.
			 */
			put_page(new_page);
			if (old_page)
				put_page(old_page);
			return 0;
		}
	}

	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);

	__SetPageUptodate(new_page);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = pte_sw_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
		lru_cache_add_inactive_or_unevictable(new_page, vma);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptp_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page, false);
		}

		/* Free the old page.. */
		new_page = old_page;
		page_copied = 1;
	} else {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		put_page(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);
	return VM_FAULT_OOM;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
 * we acquired PTE lock.
 */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}
	wp_page_reuse(vmf);
	return 0;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		vm_fault_t ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf);
	}
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
}

static vm_fault_t wp_page_shared(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = VM_FAULT_WRITE;

	get_page(vmf->page);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		vm_fault_t tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			unlock_page(vmf->page);
			put_page(vmf->page);
			return tmp;
		}
	} else {
		wp_page_reuse(vmf);
		lock_page(vmf->page);
	}
	ret |= fault_dirty_shared_page(vmf);
	put_page(vmf->page);

	return ret;
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return handle_userfault(vmf, VM_UFFD_WP);
	}

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	if (!vmf->page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(vmf);

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return wp_page_copy(vmf);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(vmf->page)) {
		struct page *page = vmf->page;

		/* PageKsm() doesn't necessarily raise the page refcount */
		if (PageKsm(page) || page_count(page) != 1)
			goto copy;
		if (!trylock_page(page))
			goto copy;
		if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
			unlock_page(page);
			goto copy;
		}
		/*
		 * Ok, we've got the only map reference, and the only
		 * page count reference, and the page is locked,
		 * it's dark out, and we're wearing sunglasses. Hit it.
		 */
		wp_page_reuse(vmf);
		unlock_page(page);
		return VM_FAULT_WRITE;
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(vmf);
	}
copy:
	/*
	 * Ok, we need to copy. Oh, well..
	 */
	get_page(vmf->page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return wp_page_copy(vmf);
}

static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {

		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		zba = details->first_index;
		if (zba < vba)
			zba = vba;
		zea = details->last_index;
		if (zea > vea)
			zea = vea;

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
		pgoff_t nr, bool even_cows)
{
	struct zap_details details = { };

	details.check_mapping = even_cows ? NULL : mapping;
	details.first_index = start;
	details.last_index = start + nr - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	pgoff_t hba = holebegin >> PAGE_SHIFT;
	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);
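/*
 * Illustrative sketch, not built here: how a hypothetical filesystem might
 * shoot down user mappings when punching a hole, before freeing the backing
 * blocks.  offset and len are byte values; passing 1 for even_cows also drops
 * private COWed copies, as described above.  The ordering below is only a
 * sketch, not a statement about any particular filesystem's locking.
 *
 *	unmap_mapping_range(inode->i_mapping, offset, len, 1);
 *	truncate_inode_pages_range(inode->i_mapping, offset,
 *				   offset + len - 1);
 */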

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL, *swapcache;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	vm_fault_t ret = 0;
	void *shadow = NULL;

	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
		goto out;

	entry = pte_to_swp_entry(vmf->orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(vma->vm_mm, vmf->pmd,
					     vmf->address);
		} else if (is_device_private_entry(entry)) {
			vmf->page = device_private_entry_to_page(entry);
			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}


	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry, vma, vmf->address);
	swapcache = page;

	if (!page) {
		struct swap_info_struct *si = swp_swap_info(entry);

		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
		    __swap_count(entry) == 1) {
			/* skip swapcache */
			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
							vmf->address);
			if (page) {
				int err;

				__SetPageLocked(page);
				__SetPageSwapBacked(page);
				set_page_private(page, entry.val);

				/* Tell memcg to use swap ownership records */
				SetPageSwapCache(page);
				err = mem_cgroup_charge(page, vma->vm_mm,
							GFP_KERNEL);
				ClearPageSwapCache(page);
				if (err) {
					ret = VM_FAULT_OOM;
3149
					goto out_page;
3150
				}
3151

3152 3153 3154
				shadow = get_shadow_from_swap_cache(entry);
				if (shadow)
					workingset_refault(page, shadow);
3155

3156
				lru_cache_add(page);
3157 3158
				swap_readpage(page, true);
			}
3159
		} else {
3160 3161
			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						vmf);
3162
			swapcache = page;
3163 3164
		}

Linus Torvalds's avatar
Linus Torvalds committed
3165 3166
		if (!page) {
			/*
3167 3168
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
Linus Torvalds's avatar
Linus Torvalds committed
3169
			 */
3170 3171
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
3172
			if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
Linus Torvalds's avatar
Linus Torvalds committed
3173
				ret = VM_FAULT_OOM;
3174
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3175
			goto unlock;
Linus Torvalds's avatar
Linus Torvalds committed
3176 3177 3178 3179
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
3180
		count_vm_event(PGMAJFAULT);
3181
		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
3182
	} else if (PageHWPoison(page)) {
3183 3184 3185 3186
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
3187 3188
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3189
		goto out_release;
Linus Torvalds's avatar
Linus Torvalds committed
3190 3191
	}

3192
	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
Rik van Riel's avatar
Rik van Riel committed
3193

3194
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3195 3196 3197 3198
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}
3199

3200
	/*
3201 3202 3203 3204
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
3205
	 */
3206 3207
	if (unlikely((!PageSwapCache(page) ||
			page_private(page) != entry.val)) && swapcache)
3208 3209
		goto out_page;

3210
	page = ksm_might_need_to_copy(page, vma, vmf->address);
3211 3212 3213 3214
	if (unlikely(!page)) {
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
3215 3216
	}

3217
	cgroup_throttle_swaprate(page, GFP_KERNEL);
3218

Linus Torvalds's avatar
Linus Torvalds committed
3219
	/*
3220
	 * Back out if somebody else already faulted in this pte.
Linus Torvalds's avatar
Linus Torvalds committed
3221
	 */
3222 3223
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
3224
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3225 3226 3227 3228 3229
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
Linus Torvalds's avatar
Linus Torvalds committed
3230 3231
	}

3232 3233 3234 3235 3236 3237 3238 3239 3240
	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is counted on swap but not yet in mapcount i.e.
	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
	 * must be called after the swap_free(), or it will never succeed.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
3241

3242 3243
	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
Linus Torvalds's avatar
Linus Torvalds committed
3244
	pte = mk_pte(page, vma->vm_page_prot);
3245
	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
Linus Torvalds's avatar
Linus Torvalds committed
3246
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3247
		vmf->flags &= ~FAULT_FLAG_WRITE;
3248
		ret |= VM_FAULT_WRITE;
3249
		exclusive = RMAP_EXCLUSIVE;
Linus Torvalds's avatar
Linus Torvalds committed
3250 3251
	}
	flush_icache_page(vma, page);
3252
	if (pte_swp_soft_dirty(vmf->orig_pte))
3253
		pte = pte_mksoft_dirty(pte);
3254 3255 3256 3257
	if (pte_swp_uffd_wp(vmf->orig_pte)) {
		pte = pte_mkuffd_wp(pte);
		pte = pte_wrprotect(pte);
	}
3258
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3259
	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3260
	vmf->orig_pte = pte;
3261 3262 3263

	/* ksm created a completely new copy */
	if (unlikely(page != swapcache && swapcache)) {
3264
		page_add_new_anon_rmap(page, vma, vmf->address, false);
3265
		lru_cache_add_inactive_or_unevictable(page, vma);
3266 3267
	} else {
		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3268
	}
Linus Torvalds's avatar
Linus Torvalds committed
3269

3270
	swap_free(entry);
3271 3272
	if (mem_cgroup_swap_full(page) ||
	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3273
		try_to_free_swap(page);
3274
	unlock_page(page);
3275
	if (page != swapcache && swapcache) {
3276 3277 3278 3279 3280 3281 3282 3283 3284
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
		unlock_page(swapcache);
3285
		put_page(swapcache);
3286
	}
3287

3288
	if (vmf->flags & FAULT_FLAG_WRITE) {
3289
		ret |= do_wp_page(vmf);
3290 3291
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
Linus Torvalds's avatar
Linus Torvalds committed
3292 3293 3294 3295
		goto out;
	}

	/* No need to invalidate - it was non-present before */
3296
	update_mmu_cache(vma, vmf->address, vmf->pte);
3297
unlock:
3298
	pte_unmap_unlock(vmf->pte, vmf->ptl);
Linus Torvalds's avatar
Linus Torvalds committed
3299 3300
out:
	return ret;
3301
out_nomap:
3302
	pte_unmap_unlock(vmf->pte, vmf->ptl);
3303
out_page:
3304
	unlock_page(page);
3305
out_release:
3306
	put_page(page);
3307
	if (page != swapcache && swapcache) {
3308
		unlock_page(swapcache);
3309
		put_page(swapcache);
3310
	}
3311
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
3312 3313 3314
}

/*
3315
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
3316
 * but allow concurrent faults), and pte mapped but not yet locked.
3317
 * We return with mmap_lock still held, but pte unmapped and unlocked.
Linus Torvalds's avatar
Linus Torvalds committed
3318
 */
3319
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
Linus Torvalds's avatar
Linus Torvalds committed
3320
{
3321
	struct vm_area_struct *vma = vmf->vma;
3322
	struct page *page;
3323
	vm_fault_t ret = 0;
Linus Torvalds's avatar
Linus Torvalds committed
3324 3325
	pte_t entry;

3326 3327 3328 3329
	/* File mapping without ->vm_ops ? */
	if (vma->vm_flags & VM_SHARED)
		return VM_FAULT_SIGBUS;

3330 3331 3332 3333 3334
	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
3335
	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
3336 3337
	 * parallel threads are excluded by other means.
	 *
3338
	 * Here we only have mmap_read_lock(mm).
3339
	 */
3340
	if (pte_alloc(vma->vm_mm, vmf->pmd))
3341 3342 3343
		return VM_FAULT_OOM;

	/* See the comment in pte_alloc_one_map() */
3344
	if (unlikely(pmd_trans_unstable(vmf->pmd)))
3345 3346
		return 0;

3347
	/* Use the zero-page for reads */
3348
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3349
			!mm_forbids_zeropage(vma->vm_mm)) {
3350
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3351
						vma->vm_page_prot));
3352 3353
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				vmf->address, &vmf->ptl);
3354 3355
		if (!pte_none(*vmf->pte)) {
			update_mmu_tlb(vma, vmf->address, vmf->pte);
Hugh Dickins's avatar
Hugh Dickins committed
3356
			goto unlock;
3357
		}
3358 3359 3360
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock;
3361 3362
		/* Deliver the page fault to userland, check inside PT lock */
		if (userfaultfd_missing(vma)) {
3363 3364
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_MISSING);
3365
		}
Hugh Dickins's avatar
Hugh Dickins committed
3366 3367 3368
		goto setpte;
	}

Nick Piggin's avatar
Nick Piggin committed
3369 3370 3371
	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
3372
	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
Nick Piggin's avatar
Nick Piggin committed
3373 3374
	if (!page)
		goto oom;
3375

3376
	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
3377
		goto oom_free_page;
3378
	cgroup_throttle_swaprate(page, GFP_KERNEL);
3379

3380 3381
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
3382
	 * preceding stores to the page contents become visible before
3383 3384
	 * the set_pte_at() write.
	 */
Nick Piggin's avatar
Nick Piggin committed
3385
	__SetPageUptodate(page);
3386

Nick Piggin's avatar
Nick Piggin committed
3387
	entry = mk_pte(page, vma->vm_page_prot);
3388
	entry = pte_sw_mkyoung(entry);
Hugh Dickins's avatar
Hugh Dickins committed
3389 3390
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));
Linus Torvalds's avatar
Linus Torvalds committed
3391

3392 3393
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
3394 3395
	if (!pte_none(*vmf->pte)) {
		update_mmu_cache(vma, vmf->address, vmf->pte);
Nick Piggin's avatar
Nick Piggin committed
3396
		goto release;
3397
	}
Hugh Dickins's avatar
Hugh Dickins committed
3398

3399 3400 3401 3402
	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto release;

3403 3404
	/* Deliver the page fault to userland, check inside PT lock */
	if (userfaultfd_missing(vma)) {
3405
		pte_unmap_unlock(vmf->pte, vmf->ptl);
3406
		put_page(page);
3407
		return handle_userfault(vmf, VM_UFFD_MISSING);
3408 3409
	}

3410
	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3411
	page_add_new_anon_rmap(page, vma, vmf->address, false);
3412
	lru_cache_add_inactive_or_unevictable(page, vma);
Hugh Dickins's avatar
Hugh Dickins committed
3413
setpte:
3414
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
Linus Torvalds's avatar
Linus Torvalds committed
3415 3416

	/* No need to invalidate - it was non-present before */
3417
	update_mmu_cache(vma, vmf->address, vmf->pte);
3418
unlock:
3419
	pte_unmap_unlock(vmf->pte, vmf->ptl);
3420
	return ret;
3421
release:
3422
	put_page(page);
3423
	goto unlock;
3424
oom_free_page:
3425
	put_page(page);
3426
oom:
Linus Torvalds's avatar
Linus Torvalds committed
3427 3428 3429
	return VM_FAULT_OOM;
}

3430
/*
3431
 * The mmap_lock must have been held on entry, and may have been
3432 3433 3434
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 */
3435
static vm_fault_t __do_fault(struct vm_fault *vmf)
3436
{
3437
	struct vm_area_struct *vma = vmf->vma;
3438
	vm_fault_t ret;
3439

3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461
	/*
	 * Preallocate pte before we take page_lock because this might lead to
	 * deadlocks for memcg reclaim which waits for pages under writeback:
	 *				lock_page(A)
	 *				SetPageWriteback(A)
	 *				unlock_page(A)
	 * lock_page(B)
	 *				lock_page(B)
	 * pte_alloc_pne
	 *   shrink_page_list
	 *     wait_on_page_writeback(A)
	 *				SetPageWriteback(B)
	 *				unlock_page(B)
	 *				# flush A, B to clear the writeback
	 */
	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

3462
	ret = vma->vm_ops->fault(vmf);
3463
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3464
			    VM_FAULT_DONE_COW)))
3465
		return ret;
3466

3467
	if (unlikely(PageHWPoison(vmf->page))) {
3468
		if (ret & VM_FAULT_LOCKED)
3469 3470
			unlock_page(vmf->page);
		put_page(vmf->page);
Jan Kara's avatar
Jan Kara committed
3471
		vmf->page = NULL;
3472 3473 3474 3475
		return VM_FAULT_HWPOISON;
	}

	if (unlikely(!(ret & VM_FAULT_LOCKED)))
3476
		lock_page(vmf->page);
3477
	else
3478
		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3479 3480 3481 3482

	return ret;
}

3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493
/*
 * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
 * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
 * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
 * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
 */
static int pmd_devmap_trans_unstable(pmd_t *pmd)
{
	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
}

3494
static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3495
{
3496
	struct vm_area_struct *vma = vmf->vma;
3497

3498
	if (!pmd_none(*vmf->pmd))
3499
		goto map_pte;
3500 3501 3502 3503
	if (vmf->prealloc_pte) {
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_none(*vmf->pmd))) {
			spin_unlock(vmf->ptl);
3504 3505 3506
			goto map_pte;
		}

3507
		mm_inc_nr_ptes(vma->vm_mm);
3508 3509
		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
		spin_unlock(vmf->ptl);
3510
		vmf->prealloc_pte = NULL;
3511
	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
3512 3513 3514 3515 3516
		return VM_FAULT_OOM;
	}
map_pte:
	/*
	 * If a huge pmd materialized under us just retry later.  Use
3517 3518 3519 3520 3521 3522 3523 3524
	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
	 * running immediately after a huge pmd fault in a different thread of
	 * this mm, in turn leading to a misleading pmd_trans_huge() retval.
	 * All we have to ensure is that it is a regular pmd that we can walk
	 * with pte_offset_map() and we can do that through an atomic read in
	 * C, which is what pmd_trans_unstable() provides.
3525
	 */
3526
	if (pmd_devmap_trans_unstable(vmf->pmd))
3527 3528
		return VM_FAULT_NOPAGE;

3529 3530 3531 3532 3533 3534 3535 3536 3537
	/*
	 * At this point we know that our vmf->pmd points to a page of ptes
	 * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
	 * for the duration of the fault.  If a racing MADV_DONTNEED runs and
	 * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
	 * be valid and we will re-check to make sure the vmf->pte isn't
	 * pte_none() under vmf->ptl protection when we return to
	 * alloc_set_pte().
	 */
3538 3539
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
3540 3541 3542
	return 0;
}

3543
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3544
static void deposit_prealloc_pte(struct vm_fault *vmf)
3545
{
3546
	struct vm_area_struct *vma = vmf->vma;
3547

3548
	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3549 3550 3551 3552
	/*
	 * We are going to consume the prealloc table,
	 * count that as nr_ptes.
	 */
3553
	mm_inc_nr_ptes(vma->vm_mm);
3554
	vmf->prealloc_pte = NULL;
3555 3556
}

3557
static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3558
{
3559 3560 3561
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3562
	pmd_t entry;
3563 3564
	int i;
	vm_fault_t ret;
3565 3566 3567 3568 3569 3570 3571

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;

	ret = VM_FAULT_FALLBACK;
	page = compound_head(page);

3572 3573 3574 3575
	/*
	 * Archs like ppc64 need additonal space to store information
	 * related to pte entry. Use the preallocated table for that.
	 */
3576
	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3577
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3578
		if (!vmf->prealloc_pte)
3579 3580 3581 3582
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

3583 3584
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd)))
3585 3586 3587 3588 3589 3590 3591
		goto out;

	for (i = 0; i < HPAGE_PMD_NR; i++)
		flush_icache_page(vma, page + i);

	entry = mk_huge_pmd(page, vma->vm_page_prot);
	if (write)
3592
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3593

3594
	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3595
	page_add_file_rmap(page, true);
3596 3597 3598 3599
	/*
	 * deposit and withdraw with pmd lock held
	 */
	if (arch_needs_pgtable_deposit())
3600
		deposit_prealloc_pte(vmf);
3601

3602
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3603

3604
	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3605 3606 3607

	/* fault is handled */
	ret = 0;
3608
	count_vm_event(THP_FILE_MAPPED);
3609
out:
3610
	spin_unlock(vmf->ptl);
3611 3612 3613
	return ret;
}
#else
3614
static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3615 3616 3617 3618 3619 3620
{
	BUILD_BUG();
	return 0;
}
#endif

3621
/**
3622 3623
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping. If needed, the fucntion allocates page table or use pre-allocated.
3624
 *
3625
 * @vmf: fault environment
3626 3627
 * @page: page to map
 *
3628 3629
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
3630 3631 3632
 *
 * Target users are page handler itself and implementations of
 * vm_ops->map_pages.
3633 3634
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
3635
 */
3636
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
3637
{
3638 3639
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
3640
	pte_t entry;
3641
	vm_fault_t ret;
3642

3643
	if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
3644
		ret = do_set_pmd(vmf, page);
3645
		if (ret != VM_FAULT_FALLBACK)
Hugh Dickins's avatar
Hugh Dickins committed
3646
			return ret;
3647
	}
3648

3649 3650
	if (!vmf->pte) {
		ret = pte_alloc_one_map(vmf);
3651
		if (ret)
Hugh Dickins's avatar
Hugh Dickins committed
3652
			return ret;
3653 3654 3655
	}

	/* Re-check under ptl */
3656 3657
	if (unlikely(!pte_none(*vmf->pte))) {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
Hugh Dickins's avatar
Hugh Dickins committed
3658
		return VM_FAULT_NOPAGE;
3659
	}
3660

3661 3662
	flush_icache_page(vma, page);
	entry = mk_pte(page, vma->vm_page_prot);
3663
	entry = pte_sw_mkyoung(entry);
3664 3665
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3666 3667
	/* copy-on-write page */
	if (write && !(vma->vm_flags & VM_SHARED)) {
3668
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3669
		page_add_new_anon_rmap(page, vma, vmf->address, false);
3670
		lru_cache_add_inactive_or_unevictable(page, vma);
3671
	} else {
3672
		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3673
		page_add_file_rmap(page, false);
3674
	}
3675
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3676 3677

	/* no need to invalidate: a not-present page won't be cached */
3678
	update_mmu_cache(vma, vmf->address, vmf->pte);
3679

Hugh Dickins's avatar
Hugh Dickins committed
3680
	return 0;
3681 3682
}

3683 3684 3685 3686 3687 3688 3689 3690 3691

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
3692
 * addition.
3693 3694 3695
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
3696 3697
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
3698
 */
3699
vm_fault_t finish_fault(struct vm_fault *vmf)
3700 3701
{
	struct page *page;
3702
	vm_fault_t ret = 0;
3703 3704 3705 3706 3707 3708 3709

	/* Did we COW the page? */
	if ((vmf->flags & FAULT_FLAG_WRITE) &&
	    !(vmf->vma->vm_flags & VM_SHARED))
		page = vmf->cow_page;
	else
		page = vmf->page;
3710 3711 3712 3713 3714 3715 3716 3717

	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
	if (!(vmf->vma->vm_flags & VM_SHARED))
		ret = check_stable_address_space(vmf->vma->vm_mm);
	if (!ret)
3718
		ret = alloc_set_pte(vmf, page);
3719 3720 3721 3722 3723
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}

3724 3725
static unsigned long fault_around_bytes __read_mostly =
	rounddown_pow_of_two(65536);
3726 3727 3728

#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
3729
{
3730
	*val = fault_around_bytes;
3731 3732 3733
	return 0;
}

3734
/*
3735 3736
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
3737
 */
3738
static int fault_around_bytes_set(void *data, u64 val)
3739
{
3740
	if (val / PAGE_SIZE > PTRS_PER_PTE)
3741
		return -EINVAL;
3742 3743 3744 3745
	if (val > PAGE_SIZE)
		fault_around_bytes = rounddown_pow_of_two(val);
	else
		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
3746 3747
	return 0;
}
3748
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3749
		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3750 3751 3752

static int __init fault_around_debugfs(void)
{
3753 3754
	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
				   &fault_around_bytes_fops);
3755 3756 3757 3758
	return 0;
}
late_initcall(fault_around_debugfs);
#endif
3759

3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774
/*
 * do_fault_around() tries to map few pages around the fault address. The hope
 * is that the pages will be needed soon and this will lower the number of
 * faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken. In the split ptlock
 * case the page table lock only protects only those entries which belong to
 * the page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call map_pages()
 * only once.
 *
3775 3776 3777
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
3778
 *
3779 3780 3781 3782
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
3783
 */
3784
static vm_fault_t do_fault_around(struct vm_fault *vmf)
3785
{
3786
	unsigned long address = vmf->address, nr_pages, mask;
3787
	pgoff_t start_pgoff = vmf->pgoff;
3788
	pgoff_t end_pgoff;
3789 3790
	int off;
	vm_fault_t ret = 0;
3791

3792
	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3793 3794
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

3795 3796
	vmf->address = max(address & mask, vmf->vma->vm_start);
	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3797
	start_pgoff -= off;
3798 3799

	/*
3800 3801
	 *  end_pgoff is either the end of the page table, the end of
	 *  the vma or nr_pages from start_pgoff, depending what is nearest.
3802
	 */
3803
	end_pgoff = start_pgoff -
3804
		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3805
		PTRS_PER_PTE - 1;
3806
	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3807
			start_pgoff + nr_pages - 1);
3808

3809
	if (pmd_none(*vmf->pmd)) {
3810
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3811
		if (!vmf->prealloc_pte)
3812
			goto out;
3813
		smp_wmb(); /* See comment in __pte_alloc() */
3814 3815
	}

3816
	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3817 3818

	/* Huge page is mapped? Page fault is solved */
3819
	if (pmd_trans_huge(*vmf->pmd)) {
3820 3821 3822 3823 3824
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/* ->map_pages() haven't done anything useful. Cold page cache? */
3825
	if (!vmf->pte)
3826 3827 3828
		goto out;

	/* check if the page fault is solved */
3829 3830
	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
	if (!pte_none(*vmf->pte))
3831
		ret = VM_FAULT_NOPAGE;
3832
	pte_unmap_unlock(vmf->pte, vmf->ptl);
3833
out:
3834 3835
	vmf->address = address;
	vmf->pte = NULL;
3836
	return ret;
3837 3838
}

3839
static vm_fault_t do_read_fault(struct vm_fault *vmf)
3840
{
3841
	struct vm_area_struct *vma = vmf->vma;
3842
	vm_fault_t ret = 0;
3843 3844 3845 3846 3847 3848

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
3849
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3850
		ret = do_fault_around(vmf);
3851 3852
		if (ret)
			return ret;
3853
	}
3854

Jan Kara's avatar
Jan Kara committed
3855
	ret = __do_fault(vmf);
3856 3857 3858
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

3859
	ret |= finish_fault(vmf);
Jan Kara's avatar
Jan Kara committed
3860
	unlock_page(vmf->page);
3861
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
Jan Kara's avatar
Jan Kara committed
3862
		put_page(vmf->page);
3863 3864 3865
	return ret;
}

3866
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
3867
{
3868
	struct vm_area_struct *vma = vmf->vma;
3869
	vm_fault_t ret;
3870 3871 3872 3873

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

Jan Kara's avatar
Jan Kara committed
3874 3875
	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (!vmf->cow_page)
3876 3877
		return VM_FAULT_OOM;

3878
	if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
Jan Kara's avatar
Jan Kara committed
3879
		put_page(vmf->cow_page);
3880 3881
		return VM_FAULT_OOM;
	}
3882
	cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
3883

Jan Kara's avatar
Jan Kara committed
3884
	ret = __do_fault(vmf);
3885 3886
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
3887 3888
	if (ret & VM_FAULT_DONE_COW)
		return ret;
3889

3890
	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
Jan Kara's avatar
Jan Kara committed
3891
	__SetPageUptodate(vmf->cow_page);
3892

3893
	ret |= finish_fault(vmf);
3894 3895
	unlock_page(vmf->page);
	put_page(vmf->page);
3896 3897
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
3898 3899
	return ret;
uncharge_out:
Jan Kara's avatar
Jan Kara committed
3900
	put_page(vmf->cow_page);
3901 3902 3903
	return ret;
}

3904
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
Linus Torvalds's avatar
Linus Torvalds committed
3905
{
3906
	struct vm_area_struct *vma = vmf->vma;
3907
	vm_fault_t ret, tmp;
3908

Jan Kara's avatar
Jan Kara committed
3909
	ret = __do_fault(vmf);
3910
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3911
		return ret;
Linus Torvalds's avatar
Linus Torvalds committed
3912 3913

	/*
3914 3915
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
Linus Torvalds's avatar
Linus Torvalds committed
3916
	 */
3917
	if (vma->vm_ops->page_mkwrite) {
Jan Kara's avatar
Jan Kara committed
3918
		unlock_page(vmf->page);
3919
		tmp = do_page_mkwrite(vmf);
3920 3921
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
Jan Kara's avatar
Jan Kara committed
3922
			put_page(vmf->page);
3923
			return tmp;
3924
		}
3925 3926
	}

3927
	ret |= finish_fault(vmf);
3928 3929
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
					VM_FAULT_RETRY))) {
Jan Kara's avatar
Jan Kara committed
3930 3931
		unlock_page(vmf->page);
		put_page(vmf->page);
3932
		return ret;
Linus Torvalds's avatar
Linus Torvalds committed
3933
	}
Nick Piggin's avatar
Nick Piggin committed
3934

3935
	ret |= fault_dirty_shared_page(vmf);
3936
	return ret;
3937
}
3938

3939
/*
3940
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
3941
 * but allow concurrent faults).
3942
 * The mmap_lock may have been released depending on flags and our
3943
 * return value.  See filemap_fault() and __lock_page_or_retry().
3944
 * If mmap_lock is released, vma may become invalid (for example
3945
 * by other thread calling munmap()).
3946
 */
3947
static vm_fault_t do_fault(struct vm_fault *vmf)
3948
{
3949
	struct vm_area_struct *vma = vmf->vma;
3950
	struct mm_struct *vm_mm = vma->vm_mm;
3951
	vm_fault_t ret;
3952

3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982
	/*
	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
	 */
	if (!vma->vm_ops->fault) {
		/*
		 * If we find a migration pmd entry or a none pmd entry, which
		 * should never happen, return SIGBUS
		 */
		if (unlikely(!pmd_present(*vmf->pmd)))
			ret = VM_FAULT_SIGBUS;
		else {
			vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
						       vmf->pmd,
						       vmf->address,
						       &vmf->ptl);
			/*
			 * Make sure this is not a temporary clearing of pte
			 * by holding ptl and checking again. A R/M/W update
			 * of pte involves: take ptl, clearing the pte so that
			 * we don't have concurrent modification by hardware
			 * followed by an update.
			 */
			if (unlikely(pte_none(*vmf->pte)))
				ret = VM_FAULT_SIGBUS;
			else
				ret = VM_FAULT_NOPAGE;

			pte_unmap_unlock(vmf->pte, vmf->ptl);
		}
	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
Hugh Dickins's avatar
Hugh Dickins committed
3983 3984 3985 3986 3987 3988 3989 3990
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf);
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
3991
		pte_free(vm_mm, vmf->prealloc_pte);
3992
		vmf->prealloc_pte = NULL;
Hugh Dickins's avatar
Hugh Dickins committed
3993 3994
	}
	return ret;
3995 3996
}

3997
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3998 3999
				unsigned long addr, int page_nid,
				int *flags)
4000 4001 4002 4003
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
4004
	if (page_nid == numa_node_id()) {
4005
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
4006 4007
		*flags |= TNF_FAULT_LOCAL;
	}
4008 4009 4010 4011

	return mpol_misplaced(page, vma, addr);
}

4012
static vm_fault_t do_numa_page(struct vm_fault *vmf)
4013
{
4014
	struct vm_area_struct *vma = vmf->vma;
4015
	struct page *page = NULL;
4016
	int page_nid = NUMA_NO_NODE;
4017
	int last_cpupid;
4018
	int target_nid;
4019
	bool migrated = false;
4020
	pte_t pte, old_pte;
4021
	bool was_writable = pte_savedwrite(vmf->orig_pte);
4022
	int flags = 0;
4023 4024

	/*
Tobin C Harding's avatar
Tobin C Harding committed
4025 4026 4027 4028
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 */
4029 4030
	vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
4031
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4032
		pte_unmap_unlock(vmf->pte, vmf->ptl);
4033 4034 4035
		goto out;
	}

4036 4037 4038 4039
	/*
	 * Make it present again, Depending on how arch implementes non
	 * accessible ptes, some can allow access by kernel mode.
	 */
4040 4041
	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);
4042
	pte = pte_mkyoung(pte);
4043 4044
	if (was_writable)
		pte = pte_mkwrite(pte);
4045
	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
4046
	update_mmu_cache(vma, vmf->address, vmf->pte);
4047

4048
	page = vm_normal_page(vma, vmf->address, pte);
4049
	if (!page) {
4050
		pte_unmap_unlock(vmf->pte, vmf->ptl);
4051 4052 4053
		return 0;
	}

4054 4055
	/* TODO: handle PTE-mapped THP */
	if (PageCompound(page)) {
4056
		pte_unmap_unlock(vmf->pte, vmf->ptl);
4057 4058 4059
		return 0;
	}

4060
	/*
4061 4062 4063 4064 4065 4066
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
4067
	 */
4068
	if (!pte_write(pte))
4069 4070
		flags |= TNF_NO_GROUP;

4071 4072 4073 4074 4075 4076 4077
	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

4078
	last_cpupid = page_cpupid_last(page);
4079
	page_nid = page_to_nid(page);
4080
	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
4081
			&flags);
4082
	pte_unmap_unlock(vmf->pte, vmf->ptl);
4083
	if (target_nid == NUMA_NO_NODE) {
4084 4085 4086 4087 4088
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
4089
	migrated = migrate_misplaced_page(page, vma, target_nid);
4090
	if (migrated) {
4091
		page_nid = target_nid;
4092
		flags |= TNF_MIGRATED;
4093 4094
	} else
		flags |= TNF_MIGRATE_FAIL;
4095 4096

out:
4097
	if (page_nid != NUMA_NO_NODE)
4098
		task_numa_fault(last_cpupid, page_nid, 1, flags);
4099 4100 4101
	return 0;
}

4102
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
4103
{
4104
	if (vma_is_anonymous(vmf->vma))
4105
		return do_huge_pmd_anonymous_page(vmf);
4106
	if (vmf->vma->vm_ops->huge_fault)
4107
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4108 4109 4110
	return VM_FAULT_FALLBACK;
}

4111
/* `inline' is required to avoid gcc 4.1.2 build error */
4112
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
4113
{
4114
	if (vma_is_anonymous(vmf->vma)) {
4115
		if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
4116
			return handle_userfault(vmf, VM_UFFD_WP);
4117
		return do_huge_pmd_wp_page(vmf, orig_pmd);
4118
	}
4119 4120 4121 4122 4123 4124
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	}
4125

4126
	/* COW or write-notify handled on pte level: split pmd. */
4127
	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
4128

4129 4130 4131
	return VM_FAULT_FALLBACK;
}

4132
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
4133
{
4134 4135
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4136 4137
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
4138 4139 4140 4141 4142 4143 4144 4145 4146 4147
		goto split;
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	}
split:
	/* COW or write-notify not handled on PUD level: split pud.*/
	__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
4148 4149 4150 4151
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	return VM_FAULT_FALLBACK;
}

4152
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
4153 4154 4155 4156 4157 4158
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		return VM_FAULT_FALLBACK;
	if (vmf->vma->vm_ops->huge_fault)
4159
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4160 4161 4162 4163
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	return VM_FAULT_FALLBACK;
}

Linus Torvalds's avatar
Linus Torvalds committed
4164 4165 4166 4167 4168 4169 4170 4171 4172
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
4173
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
4174
 * concurrent faults).
4175
 *
4176
 * The mmap_lock may have been released depending on flags and our return value.
4177
 * See filemap_fault() and __lock_page_or_retry().
Linus Torvalds's avatar
Linus Torvalds committed
4178
 */
4179
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
Linus Torvalds's avatar
Linus Torvalds committed
4180 4181 4182
{
	pte_t entry;

4183
	if (unlikely(pmd_none(*vmf->pmd))) {
4184 4185 4186 4187 4188 4189
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
4190
		vmf->pte = NULL;
4191 4192
	} else {
		/* See comment in pte_alloc_one_map() */
4193
		if (pmd_devmap_trans_unstable(vmf->pmd))
4194 4195 4196 4197
			return 0;
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
4198
		 * mmap_lock read mode and khugepaged takes it in write mode.
4199 4200
		 * So now it's safe to run pte_offset_map().
		 */
4201
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4202
		vmf->orig_pte = *vmf->pte;
4203 4204 4205 4206

		/*
		 * some architectures can have larger ptes than wordsize,
		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
4207 4208 4209
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
4210 4211 4212
		 * ptl lock held. So here a barrier will do.
		 */
		barrier();
4213
		if (pte_none(vmf->orig_pte)) {
4214 4215
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
4216
		}
Linus Torvalds's avatar
Linus Torvalds committed
4217 4218
	}

4219 4220 4221
	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf);
4222
		else
4223
			return do_fault(vmf);
4224 4225
	}

4226 4227
	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);
4228

4229 4230
	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);
4231

4232 4233
	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
4234
	entry = vmf->orig_pte;
4235 4236
	if (unlikely(!pte_same(*vmf->pte, entry))) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
4237
		goto unlock;
4238
	}
4239
	if (vmf->flags & FAULT_FLAG_WRITE) {
4240
		if (!pte_write(entry))
4241
			return do_wp_page(vmf);
Linus Torvalds's avatar
Linus Torvalds committed
4242 4243 4244
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
4245 4246 4247
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4248
	} else {
4249 4250 4251
		/* Skip spurious TLB flush for retried page fault */
		if (vmf->flags & FAULT_FLAG_TRIED)
			goto unlock;
4252 4253 4254 4255 4256 4257
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
4258 4259
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4260
	}
4261
unlock:
4262
	pte_unmap_unlock(vmf->pte, vmf->ptl);
Nick Piggin's avatar
Nick Piggin committed
4263
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
4264 4265 4266 4267
}

/*
 * By the time we get here, we already hold the mm semaphore
4268
 *
4269
 * The mmap_lock may have been released depending on flags and our
4270
 * return value.  See filemap_fault() and __lock_page_or_retry().
Linus Torvalds's avatar
Linus Torvalds committed
4271
 */
4272 4273
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
Linus Torvalds's avatar
Linus Torvalds committed
4274
{
4275
	struct vm_fault vmf = {
4276
		.vma = vma,
4277
		.address = address & PAGE_MASK,
4278
		.flags = flags,
4279
		.pgoff = linear_page_index(vma, address),
4280
		.gfp_mask = __get_fault_gfp_mask(vma),
4281
	};
4282
	unsigned int dirty = flags & FAULT_FLAG_WRITE;
4283
	struct mm_struct *mm = vma->vm_mm;
Linus Torvalds's avatar
Linus Torvalds committed
4284
	pgd_t *pgd;
4285
	p4d_t *p4d;
4286
	vm_fault_t ret;
Linus Torvalds's avatar
Linus Torvalds committed
4287 4288

	pgd = pgd_offset(mm, address);
4289 4290 4291
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;
4292

4293
	vmf.pud = pud_alloc(mm, p4d, address);
4294
	if (!vmf.pud)
4295
		return VM_FAULT_OOM;
4296
retry_pud:
4297
	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pud_t orig_pud = *vmf.pud;

		barrier();
		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

			/* NUMA case for anonymous PUDs would go here */

4309
			if (dirty && !pud_write(orig_pud)) {
4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320
				ret = wp_huge_pud(&vmf, orig_pud);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pud_set_accessed(&vmf, orig_pud);
				return 0;
			}
		}
	}

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4321
	if (!vmf.pmd)
4322
		return VM_FAULT_OOM;
4323 4324 4325 4326 4327

	/* Huge pud page fault raced with pmd_alloc? */
	if (pud_trans_unstable(vmf.pud))
		goto retry_pud;

4328
	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
4329
		ret = create_huge_pmd(&vmf);
4330 4331
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
4332
	} else {
4333
		pmd_t orig_pmd = *vmf.pmd;
4334

4335
		barrier();
4336 4337 4338 4339 4340 4341 4342
		if (unlikely(is_swap_pmd(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					  !is_pmd_migration_entry(orig_pmd));
			if (is_pmd_migration_entry(orig_pmd))
				pmd_migration_entry_wait(mm, vmf.pmd);
			return 0;
		}
4343
		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
4344
			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
4345
				return do_huge_pmd_numa_page(&vmf, orig_pmd);
4346

4347
			if (dirty && !pmd_write(orig_pmd)) {
4348
				ret = wp_huge_pmd(&vmf, orig_pmd);
4349 4350
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
4351
			} else {
4352
				huge_pmd_set_accessed(&vmf, orig_pmd);
4353
				return 0;
4354
			}
4355 4356 4357
		}
	}

4358
	return handle_pte_fault(&vmf);
Linus Torvalds's avatar
Linus Torvalds committed
4359 4360
}

4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402
/**
 * mm_account_fault - Do page fault accountings
 *
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accountings.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct pt_regs *regs,
				    unsigned long address, unsigned int flags,
				    vm_fault_t ret)
{
	bool major;

	/*
	 * We don't do accounting for some specific faults:
	 *
	 * - Unsuccessful faults (e.g. when the address wasn't valid).  That
	 *   includes arch_vma_access_permitted() failing before reaching here.
	 *   So this is not a "this many hardware page faults" counter.  We
	 *   should use the hw profiling for that.
	 *
	 * - Incomplete faults (VM_FAULT_RETRY).  They will only be counted
	 *   once they're completed.
	 */
	if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
		return;

	/*
	 * We define the fault as a major fault when the final successful fault
	 * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
	 * handle it immediately previously).
	 */
	major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);

4403 4404 4405 4406 4407
	if (major)
		current->maj_flt++;
	else
		current->min_flt++;

4408
	/*
4409 4410 4411
	 * If the fault is done for GUP, regs will be NULL.  We only do the
	 * accounting for the per thread fault counters who triggered the
	 * fault, and we skip the perf event updates.
4412 4413 4414 4415
	 */
	if (!regs)
		return;

4416
	if (major)
4417
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
4418
	else
4419 4420 4421
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}

4422 4423 4424
/*
 * By the time we get here, we already hold the mm semaphore
 *
4425
 * The mmap_lock may have been released depending on flags and our
4426 4427
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
4428
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4429
			   unsigned int flags, struct pt_regs *regs)
4430
{
4431
	vm_fault_t ret;
4432 4433 4434 4435

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
4436
	count_memcg_event_mm(vma->vm_mm, PGFAULT);
4437 4438 4439 4440

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

4441 4442 4443 4444 4445
	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;

4446 4447 4448 4449 4450
	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
4451
		mem_cgroup_enter_user_fault();
4452

4453 4454 4455 4456
	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);
4457

4458
	if (flags & FAULT_FLAG_USER) {
4459
		mem_cgroup_exit_user_fault();
Tobin C Harding's avatar
Tobin C Harding committed
4460 4461 4462 4463 4464 4465 4466 4467
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
4468
	}
4469

4470 4471
	mm_account_fault(regs, address, flags, ret);

4472 4473
	return ret;
}
4474
EXPORT_SYMBOL_GPL(handle_mm_fault);
4475

4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498
#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	p4d_t *new = p4d_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		p4d_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

Linus Torvalds's avatar
Linus Torvalds committed
4499 4500 4501
#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
4502
 * We've already handled the fast-path in-line.
Linus Torvalds's avatar
Linus Torvalds committed
4503
 */
4504
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
Linus Torvalds's avatar
Linus Torvalds committed
4505
{
4506 4507
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
4508
		return -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
4509

4510 4511
	smp_wmb(); /* See comment in __pte_alloc */

4512
	spin_lock(&mm->page_table_lock);
4513 4514
	if (!p4d_present(*p4d)) {
		mm_inc_nr_puds(mm);
4515
		p4d_populate(mm, p4d, new);
4516
	} else	/* Another has populated it */
4517
		pud_free(mm, new);
4518
	spin_unlock(&mm->page_table_lock);
4519
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
4520 4521 4522 4523 4524 4525
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
4526
 * We've already handled the fast-path in-line.
Linus Torvalds's avatar
Linus Torvalds committed
4527
 */
4528
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
Linus Torvalds's avatar
Linus Torvalds committed
4529
{
4530
	spinlock_t *ptl;
4531 4532
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
4533
		return -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
4534

4535 4536
	smp_wmb(); /* See comment in __pte_alloc */

4537
	ptl = pud_lock(mm, pud);
4538 4539
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
4540
		pud_populate(mm, pud, new);
4541
	} else	/* Another has populated it */
4542
		pmd_free(mm, new);
4543
	spin_unlock(ptl);
4544
	return 0;
4545
}
Linus Torvalds's avatar
Linus Torvalds committed
4546 4547
#endif /* __PAGETABLE_PMD_FOLDED */

Ross Zwisler's avatar
Ross Zwisler committed
4548
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4549
			    struct mmu_notifier_range *range,
4550
			    pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
Johannes Weiner's avatar
Johannes Weiner committed
4551 4552
{
	pgd_t *pgd;
4553
	p4d_t *p4d;
Johannes Weiner's avatar
Johannes Weiner committed
4554 4555 4556 4557 4558 4559 4560 4561
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

4562 4563 4564 4565 4566
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
		goto out;

	pud = pud_offset(p4d, address);
Johannes Weiner's avatar
Johannes Weiner committed
4567 4568 4569 4570
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
4571
	VM_BUG_ON(pmd_trans_huge(*pmd));
Johannes Weiner's avatar
Johannes Weiner committed
4572

Ross Zwisler's avatar
Ross Zwisler committed
4573 4574 4575 4576
	if (pmd_huge(*pmd)) {
		if (!pmdpp)
			goto out;

4577
		if (range) {
4578
			mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4579 4580
						NULL, mm, address & PMD_MASK,
						(address & PMD_MASK) + PMD_SIZE);
4581
			mmu_notifier_invalidate_range_start(range);
4582
		}
Ross Zwisler's avatar
Ross Zwisler committed
4583 4584 4585 4586 4587 4588
		*ptlp = pmd_lock(mm, pmd);
		if (pmd_huge(*pmd)) {
			*pmdpp = pmd;
			return 0;
		}
		spin_unlock(*ptlp);
4589 4590
		if (range)
			mmu_notifier_invalidate_range_end(range);
Ross Zwisler's avatar
Ross Zwisler committed
4591 4592 4593
	}

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
Johannes Weiner's avatar
Johannes Weiner committed
4594 4595
		goto out;

4596
	if (range) {
4597
		mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4598 4599
					address & PAGE_MASK,
					(address & PAGE_MASK) + PAGE_SIZE);
4600
		mmu_notifier_invalidate_range_start(range);
4601
	}
Johannes Weiner's avatar
Johannes Weiner committed
4602 4603 4604 4605 4606 4607 4608
	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
4609 4610
	if (range)
		mmu_notifier_invalidate_range_end(range);
Johannes Weiner's avatar
Johannes Weiner committed
4611 4612 4613 4614
out:
	return -EINVAL;
}

4615 4616
static inline int follow_pte(struct mm_struct *mm, unsigned long address,
			     pte_t **ptepp, spinlock_t **ptlp)
4617 4618 4619 4620 4621
{
	int res;

	/* (void) is needed to make gcc happy */
	(void) __cond_lock(*ptlp,
4622
			   !(res = __follow_pte_pmd(mm, address, NULL,
4623
						    ptepp, NULL, ptlp)));
Ross Zwisler's avatar
Ross Zwisler committed
4624 4625 4626 4627
	return res;
}

int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4628 4629
		   struct mmu_notifier_range *range,
		   pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
Ross Zwisler's avatar
Ross Zwisler committed
4630 4631 4632 4633 4634
{
	int res;

	/* (void) is needed to make gcc happy */
	(void) __cond_lock(*ptlp,
4635
			   !(res = __follow_pte_pmd(mm, address, range,
4636
						    ptepp, pmdpp, ptlp)));
4637 4638
	return res;
}
Ross Zwisler's avatar
Ross Zwisler committed
4639
EXPORT_SYMBOL(follow_pte_pmd);
4640

Johannes Weiner's avatar
Johannes Weiner committed
4641 4642 4643 4644 4645 4646 4647 4648
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
4649
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
Johannes Weiner's avatar
Johannes Weiner committed
4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	int ret = -EINVAL;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return ret;

	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
	if (ret)
		return ret;
	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(follow_pfn);

4670
#ifdef CONFIG_HAVE_IOREMAP_PROT
4671 4672 4673
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
4674
{
4675
	int ret = -EINVAL;
4676 4677 4678
	pte_t *ptep, pte;
	spinlock_t *ptl;

4679 4680
	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;
4681

4682
	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4683
		goto out;
4684
	pte = *ptep;
4685

4686
	if ((flags & FOLL_WRITE) && !pte_write(pte))
4687 4688 4689
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
4690
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4691

4692
	ret = 0;
4693 4694 4695
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
4696
	return ret;
4697 4698 4699 4700 4701 4702 4703
}

int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write)
{
	resource_size_t phys_addr;
	unsigned long prot = 0;
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
4704
	void __iomem *maddr;
4705 4706
	int offset = addr & (PAGE_SIZE-1);

4707
	if (follow_phys(vma, addr, write, &prot, &phys_addr))
4708 4709
		return -EINVAL;

4710
	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4711 4712 4713
	if (!maddr)
		return -ENOMEM;

4714 4715 4716 4717 4718 4719 4720 4721
	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	iounmap(maddr);

	return len;
}
4722
EXPORT_SYMBOL_GPL(generic_access_phys);
4723 4724
#endif

/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;
	int write = gup_flags & FOLL_WRITE;

	if (mmap_read_lock_killable(mm))
		return 0;

	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages_remote(mm, addr, 1,
				gup_flags, &page, &vma, NULL);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
		} else {
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			put_page(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	mmap_read_unlock(mm);

	return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
}

/*
 * Access another process' address space.
 * The source/target buffer must be in kernel space.
 * Do not walk the page table directly; use get_user_pages().
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);

	mmput(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(access_process_vm);
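/*
 * Illustrative sketch (assumption, not taken from this file): reading a
 * few bytes from another task, roughly what the ptrace() peek path does.
 * example_peek() is a hypothetical helper used only for illustration.
 *
 *	static int example_peek(struct task_struct *tsk, unsigned long addr,
 *				void *out, int len)
 *	{
 *		int copied;
 *
 *		copied = access_process_vm(tsk, addr, out, len, FOLL_FORCE);
 *		return copied == len ? 0 : -EFAULT;
 *	}
 */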

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * We might be running from an atomic context, so we cannot sleep.
	 */
	if (!mmap_read_trylock(mm))
		return;

	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_NOWAIT);
		if (buf) {
			char *p;

			p = file_path(f, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_lock.  This is safe because kernel memory doesn't
	 * get paged out, so we'll never actually fault, and the
	 * annotations below will generate false positives.
	 */
	if (uaccess_kernel())
		return;
	if (pagefault_disabled())
		return;
	__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
	if (current->mm)
		might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
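/*
 * Illustrative sketch (assumption, not taken from this file): the
 * annotation above is what makes a user copy under a spinlock produce a
 * "sleeping function called from invalid context" splat on debug
 * kernels, since copy_to_user() calls might_fault():
 *
 *	spin_lock(&example_lock);		// hypothetical lock
 *	ret = copy_to_user(ubuf, kbuf, len);	// might_fault() fires here
 *	spin_unlock(&example_lock);
 */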

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
static inline void process_huge_page(
	unsigned long addr_hint, unsigned int pages_per_huge_page,
	void (*process_subpage)(unsigned long addr, int idx, void *arg),
	void *arg)
{
	int i, n, base, l;
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	/* Process target subpage last to keep its cache lines hot */
	might_sleep();
	n = (addr_hint - addr) / PAGE_SIZE;
	if (2 * n <= pages_per_huge_page) {
		/* If target subpage in first half of huge page */
		base = 0;
		l = n;
		/* Process subpages at the end of huge page */
		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
			cond_resched();
			process_subpage(addr + i * PAGE_SIZE, i, arg);
		}
	} else {
		/* If target subpage in second half of huge page */
		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
		l = pages_per_huge_page - n;
		/* Process subpages at the beginning of huge page */
		for (i = 0; i < base; i++) {
			cond_resched();
			process_subpage(addr + i * PAGE_SIZE, i, arg);
		}
	}
	/*
	 * Process remaining subpages in left-right-left-right pattern
	 * towards the target subpage
	 */
	for (i = 0; i < l; i++) {
		int left_idx = base + i;
		int right_idx = base + 2 * l - 1 - i;

		cond_resched();
		process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
		cond_resched();
		process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
	}
}
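/*
 * Illustrative worked example (not from the original source): with
 * pages_per_huge_page == 8 and addr_hint pointing into subpage 2, n == 2,
 * so the tail subpages are processed first (7, 6, 5, 4) and the remainder
 * in the order 0, 3, 1, 2 -- i.e. the target subpage 2 is touched last,
 * keeping its cache lines the hottest when the fault returns.
 */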

static void clear_gigantic_page(struct page *page,
				unsigned long addr,
				unsigned int pages_per_huge_page)
{
	int i;
	struct page *p = page;

	might_sleep();
	for (i = 0; i < pages_per_huge_page;
	     i++, p = mem_map_next(p, page, i)) {
		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}

static void clear_subpage(unsigned long addr, int idx, void *arg)
{
	struct page *page = arg;

	clear_user_highpage(page + idx, addr);
}

void clear_huge_page(struct page *page,
		     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, pages_per_huge_page);
		return;
	}

	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
}
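/*
 * Note (added for clarity, not from the original source): callers pass
 * the faulting address as @addr_hint, so the subpage the fault touched
 * is cleared last by process_huge_page() and stays cache-hot.  Gigantic
 * pages can span discontiguous sections of the mem_map, hence the
 * mem_map_next() walk in clear_gigantic_page().
 */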

static void copy_user_gigantic_page(struct page *dst, struct page *src,
				    unsigned long addr,
				    struct vm_area_struct *vma,
				    unsigned int pages_per_huge_page)
{
	int i;
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page; ) {
		cond_resched();
		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

struct copy_subpage_arg {
	struct page *dst;
	struct page *src;
	struct vm_area_struct *vma;
};

static void copy_subpage(unsigned long addr, int idx, void *arg)
{
	struct copy_subpage_arg *copy_arg = arg;

	copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
			   addr, copy_arg->vma);
}

void copy_user_huge_page(struct page *dst, struct page *src,
			 unsigned long addr_hint, struct vm_area_struct *vma,
			 unsigned int pages_per_huge_page)
{
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
	struct copy_subpage_arg arg = {
		.dst = dst,
		.src = src,
		.vma = vma,
	};

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		copy_user_gigantic_page(dst, src, addr, vma,
					pages_per_huge_page);
		return;
	}

	process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}

/*
 * Copy data from user space into a huge page, one subpage at a time.  When
 * @allow_pagefault is false the subpages are mapped with kmap_atomic(), so
 * the copy runs with page faults disabled and may be short.  Returns the
 * number of bytes that could not be copied (0 on success).
 */
long copy_huge_page_from_user(struct page *dst_page,
				const void __user *usr_src,
				unsigned int pages_per_huge_page,
				bool allow_pagefault)
{
	void *src = (void *)usr_src;
	void *page_kaddr;
	unsigned long i, rc = 0;
	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;

	for (i = 0; i < pages_per_huge_page; i++) {
		if (allow_pagefault)
			page_kaddr = kmap(dst_page + i);
		else
			page_kaddr = kmap_atomic(dst_page + i);
		rc = copy_from_user(page_kaddr,
				(const void __user *)(src + i * PAGE_SIZE),
				PAGE_SIZE);
		if (allow_pagefault)
			kunmap(dst_page + i);
		else
			kunmap_atomic(page_kaddr);

		ret_val -= (PAGE_SIZE - rc);
		if (rc)
			break;

		cond_resched();
	}
	return ret_val;
}
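/*
 * Illustrative note (assumption, not taken from this file): the
 * userfaultfd UFFDIO_COPY path for huge pages first calls this with
 * allow_pagefault == false while holding its locks; if the atomic copy
 * comes up short, it drops the locks and retries with
 * allow_pagefault == true, e.g.:
 *
 *	if (copy_huge_page_from_user(page, src, nr_subpages, false)) {
 *		// drop locks, then:
 *		copy_huge_page_from_user(page, src, nr_subpages, true);
 *	}
 */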
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

void __init ptlock_cache_init(void)
{
	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
			SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

void ptlock_free(struct page *page)
{
	kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
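/*
 * Illustrative note (assumption, not taken from this file): these helpers
 * only exist when spinlock_t is too large to live inside struct page
 * (e.g. with lockdep enabled), in which case ptlock_init(), called from
 * pgtable_pte_page_ctor(), allocates the split PTE lock here instead of
 * embedding it:
 *
 *	if (!pgtable_pte_page_ctor(page)) {	// may call ptlock_alloc()
 *		__free_page(page);
 *		return NULL;
 *	}
 */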