// SPDX-License-Identifier: GPL-2.0
/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>

#include "internal.h"
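
/*
 * Page-table walking helpers for the move: get_old_pud()/get_old_pmd() look
 * up entries in the source mapping without allocating anything, while
 * alloc_new_pud()/alloc_new_pmd() allocate the corresponding levels at the
 * destination address.
 */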

static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none_or_clear_bad(p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	return pud;
}

static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = get_old_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}

static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;

	return pud_alloc(mm, p4d, addr);
}

static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = alloc_new_pud(mm, vma, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}
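
/*
 * Helpers for the rmap locks taken while page tables are moved: the i_mmap
 * lock of a file mapping is always taken before the anon_vma lock, and
 * drop_rmap_locks() releases them in the reverse order.
 */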

static void take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}

static void drop_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->anon_vma)
		anon_vma_unlock_write(vma->anon_vma);
	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
	/*
	 * Set soft dirty bit so we can notice
	 * in userspace the ptes were moved.
	 */
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (pte_present(pte))
		pte = pte_mksoft_dirty(pte);
	else if (is_swap_pte(pte))
		pte = pte_swp_mksoft_dirty(pte);
#endif
	return pte;
}
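
/*
 * Example of how this is observed from userspace (illustrative, not kernel
 * code): a checkpointing tool such as CRIU clears soft-dirty via
 * /proc/<pid>/clear_refs and can then see the moved ptes reported as
 * soft-dirty again in bit 55 of /proc/<pid>/pagemap.
 */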

static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr, bool need_rmap_locks)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long len = old_end - old_addr;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	new_pte = pte_offset_map(new_pmd, new_addr);
	new_ptl = pte_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		if (pte_none(*old_pte))
			continue;

		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with page_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(pte))
			force_flush = true;
		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);
		set_pte_at(mm, new_addr, new_pte, pte);
	}

	arch_leave_lazy_mmu_mode();
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
	if (need_rmap_locks)
		drop_rmap_locks(vma);
}
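
/*
 * An architecture can override arch_supports_page_table_move() (by providing
 * its own definition of the macro) when moving whole PMD/PUD entries needs
 * extra care; the generic fallback below simply keys off the
 * CONFIG_HAVE_MOVE_PMD/PUD options.
 */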

#ifndef arch_supports_page_table_move
#define arch_supports_page_table_move arch_supports_page_table_move
static inline bool arch_supports_page_table_move(void)
{
	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
}
#endif

#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pmd_t pmd;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 *
	 * However, there's a case during execve() where we use mremap
	 * to move the initial stack, and in that case the target area
	 * may overlap the source area (always moving down).
	 *
	 * If everything is PMD-aligned, that works fine, as moving
	 * each pmd down will clear the source pmd. But if we first
	 * have a few 4kB-only pages that get moved down, and then
	 * hit the "now the rest is PMD-aligned, let's do everything
	 * one pmd at a time", we will still have the old (now empty
	 * of any 4kB pages, but still there) PMD in the page table
	 * tree.
	 *
	 * Warn on it once - because we really should try to figure
	 * out how to do this better - but then say "I won't move
	 * this pmd".
	 *
	 * One alternative might be to just unmap the target pmd at
	 * this point, and verify that it really is empty. We'll see.
	 */
	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
	new_ptl = pmd_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pmd */
	pmd = *old_pmd;
	pmd_clear(old_pmd);

	VM_BUG_ON(!pmd_none(*new_pmd));

	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static inline bool move_normal_pmd(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
		pmd_t *new_pmd)
{
	return false;
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(vma->vm_mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	pud_populate(mm, new_pud, pud_pgtable(pud));
	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static inline bool move_normal_pud(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
		pud_t *new_pud)
{
	return false;
}
#endif

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(vma->vm_mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	/* Set the new pud */
	/* mark soft_dirty when we add pud level soft dirty support */
	set_pud_at(mm, new_addr, new_pud, pud);
	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	WARN_ON_ONCE(1);
	return false;

}
#endif

enum pgt_entry {
	NORMAL_PMD,
	HPAGE_PMD,
	NORMAL_PUD,
	HPAGE_PUD,
};

/*
 * Returns an extent of the corresponding size for the pgt_entry specified if
 * valid. Else returns a smaller extent bounded by the end of the source and
 * destination pgt_entry.
 */
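
/*
 * Worked example (assuming 4K pages and 2M PMDs): for old_addr == 0x1ff000,
 * old_end == 0x601000 and new_addr == 0x400000, a NORMAL_PMD request yields
 * 0x1000, since only one page remains before the source crosses the PMD
 * boundary at 0x200000; the destination bound (0x600000 - 0x400000) does
 * not shrink it further.
 */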
static __always_inline unsigned long get_extent(enum pgt_entry entry,
			unsigned long old_addr, unsigned long old_end,
			unsigned long new_addr)
{
	unsigned long next, extent, mask, size;

	switch (entry) {
	case HPAGE_PMD:
	case NORMAL_PMD:
		mask = PMD_MASK;
		size = PMD_SIZE;
		break;
	case HPAGE_PUD:
	case NORMAL_PUD:
		mask = PUD_MASK;
		size = PUD_SIZE;
		break;
	default:
		BUILD_BUG();
		break;
	}

	next = (old_addr + size) & mask;
	/* even if next overflowed, extent below will be ok */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;
	next = (new_addr + size) & mask;
	if (extent > next - new_addr)
		extent = next - new_addr;
	return extent;
}

/*
 * Attempts to speedup the move by moving entry at the level corresponding to
 * pgt_entry. Returns true if the move was successful, else false.
 */
static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
			unsigned long old_addr, unsigned long new_addr,
			void *old_entry, void *new_entry, bool need_rmap_locks)
{
	bool moved = false;

	/* See comment in move_ptes() */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	switch (entry) {
	case NORMAL_PMD:
		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
					new_entry);
		break;
	case NORMAL_PUD:
		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
					new_entry);
		break;
	case HPAGE_PMD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pmd(vma, old_addr, new_addr, old_entry,
				      new_entry);
		break;
	case HPAGE_PUD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pud(vma, old_addr, new_addr, old_entry,
				      new_entry);
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	if (need_rmap_locks)
		drop_rmap_locks(vma);

	return moved;
}
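
/*
 * move_page_tables() - move the entries covering [old_addr, old_addr + len)
 * from @vma to @new_vma at @new_addr, preferring whole-PUD/PMD moves where
 * the configuration allows them and falling back to copying individual
 * PTEs. Returns how many bytes of the range were moved.
 */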

unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks)
{
	unsigned long extent, old_end;
	struct mmu_notifier_range range;
	pmd_t *old_pmd, *new_pmd;
	pud_t *old_pud, *new_pud;

	if (!len)
		return 0;

	old_end = old_addr + len;

	if (is_vm_hugetlb_page(vma))
		return move_hugetlb_page_tables(vma, new_vma, old_addr,
						new_addr, len);

	flush_cache_range(vma, old_addr, old_end);
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
				old_addr, old_end);
	mmu_notifier_invalidate_range_start(&range);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		/*
		 * If extent is PUD-sized try to speed up the move by moving at the
		 * PUD level if possible.
		 */
		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);

		old_pud = get_old_pud(vma->vm_mm, old_addr);
		if (!old_pud)
			continue;
		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
		if (!new_pud)
			break;
		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
			if (extent == HPAGE_PUD_SIZE) {
				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
					       old_pud, new_pud, need_rmap_locks);
				/* We ignore and continue on error? */
				continue;
			}
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {

			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
					   old_pud, new_pud, true))
				continue;
		}

		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
		if (!new_pmd)
			break;
		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
		    pmd_devmap(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE &&
			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, need_rmap_locks))
				continue;
			split_huge_pmd(vma, old_pmd, old_addr);
			if (pmd_trans_unstable(old_pmd))
				continue;
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
			   extent == PMD_SIZE) {
			/*
			 * If the extent is PMD-sized, try to speed the move by
			 * moving at the PMD level if possible.
			 */
			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, true))
				continue;
		}

		if (pte_alloc(new_vma->vm_mm, new_pmd))
			break;
		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
			  new_pmd, new_addr, need_rmap_locks);
	}

	mmu_notifier_invalidate_range_end(&range);

	return len + old_addr - old_end;	/* how much done */
}
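
/*
 * move_vma() - relocate a mapping: create the new vma with copy_vma(), move
 * the page tables across, fix up memory accounting and the locked/hiwater
 * statistics, and (unless MREMAP_DONTUNMAP was requested) unmap the old
 * range. Returns the new address or a negative error code.
 */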

static unsigned long move_vma(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_len, unsigned long new_addr,
		bool *locked, unsigned long flags,
		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
	long to_account = new_len - old_len;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	unsigned long vm_flags = vma->vm_flags;
	unsigned long new_pgoff;
	unsigned long moved_len;
	unsigned long account_start = 0;
	unsigned long account_end = 0;
	unsigned long hiwater_vm;
	int err = 0;
	bool need_rmap_locks;
	struct vma_iterator vmi;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (unlikely(flags & MREMAP_DONTUNMAP))
		to_account = new_len;

	if (vma->vm_ops && vma->vm_ops->may_split) {
		if (vma->vm_start != old_addr)
			err = vma->vm_ops->may_split(vma, old_addr);
		if (!err && vma->vm_end != old_addr + old_len)
			err = vma->vm_ops->may_split(vma, old_addr + old_len);
		if (err)
			return err;
	}

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped.  But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
						MADV_UNMERGEABLE, &vm_flags);
	if (err)
		return err;

	if (vm_flags & VM_ACCOUNT) {
		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
			return -ENOMEM;
	}

	vma_start_write(vma);
	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma) {
		if (vm_flags & VM_ACCOUNT)
			vm_unacct_memory(to_account >> PAGE_SHIFT);
		return -ENOMEM;
	}

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				     need_rmap_locks);
	if (moved_len < old_len) {
		err = -ENOMEM;
	} else if (vma->vm_ops && vma->vm_ops->mremap) {
		err = vma->vm_ops->mremap(new_vma);
	}

	if (unlikely(err)) {
		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
				 true);
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = err;
	} else {
		mremap_userfaultfd_prep(new_vma, uf);
	}

	if (is_vm_hugetlb_page(vma)) {
		clear_vma_resv_huge_pages(vma);
	}

	/* Conceal VM_ACCOUNT so old reservation is not undone */
	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
		vm_flags_clear(vma, VM_ACCOUNT);
		if (vma->vm_start < old_addr)
			account_start = vma->vm_start;
		if (vma->vm_end > old_addr + old_len)
			account_end = vma->vm_end;
	}

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;
	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

	/* Tell pfnmap tracking that the mapping has moved from this vma */
	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn_clear(vma);

	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
		vm_flags_clear(vma, VM_LOCKED_MASK);

		/*
		 * anon_vma links of the old vma are no longer needed after its page
		 * table has been moved.
		 */
		if (new_vma != vma && vma->vm_start == old_addr &&
			vma->vm_end == (old_addr + old_len))
			unlink_anon_vmas(vma);

		/* Because we won't unmap we don't need to touch locked_vm */
		return new_addr;
	}

	vma_iter_init(&vmi, mm, old_addr);
	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
			vm_acct_memory(old_len >> PAGE_SHIFT);
		account_start = account_end = 0;
	}

	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += new_len >> PAGE_SHIFT;
		*locked = true;
	}

	mm->hiwater_vm = hiwater_vm;

	/* Restore VM_ACCOUNT if one or two pieces of vma left */
	if (account_start) {
		vma = vma_prev(&vmi);
		vm_flags_set(vma, VM_ACCOUNT);
	}

	if (account_end) {
		vma = vma_next(&vmi);
		vm_flags_set(vma, VM_ACCOUNT);
	}
	}

	return new_addr;
}
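
/*
 * vma_to_resize() - look up and sanity-check the vma at @addr for resizing
 * from old_len to new_len: the old range must lie within a single vma, the
 * vma must be allowed to grow, and the mlock and commit limits must not be
 * exceeded. Returns the vma or an ERR_PTR().
 */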

static struct vm_area_struct *vma_to_resize(unsigned long addr,
	unsigned long old_len, unsigned long new_len, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long pgoff;

	vma = vma_lookup(mm, addr);
	if (!vma)
		return ERR_PTR(-EFAULT);

	/*
	 * !old_len is a special case where an attempt is made to 'duplicate'
	 * a mapping.  This makes no sense for private mappings as it will
	 * instead create a fresh/new mapping unrelated to the original.  This
	 * is contrary to the basic idea of mremap which creates new mappings
	 * based on the original.  There are no known use cases for this
	 * behavior.  As a result, fail such attempts.
	 */
	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
		return ERR_PTR(-EINVAL);
	}

	if ((flags & MREMAP_DONTUNMAP) &&
			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
		return ERR_PTR(-EINVAL);

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		return ERR_PTR(-EFAULT);

	if (new_len == old_len)
		return vma;

	/* Need to be careful about a growing mapping */
	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
		return ERR_PTR(-EINVAL);

	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
		return ERR_PTR(-EFAULT);

	if (!mlock_future_check(mm, vma->vm_flags, new_len - old_len))
		return ERR_PTR(-EAGAIN);

	if (!may_expand_vm(mm, vma->vm_flags,
				(new_len - old_len) >> PAGE_SHIFT))
		return ERR_PTR(-ENOMEM);

	return vma;
}
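
/*
 * mremap_to() - handle the MREMAP_FIXED/MREMAP_DONTUNMAP path: validate the
 * requested destination, unmap whatever currently occupies it (for
 * MREMAP_FIXED), shrink the source if it is larger than new_len, and then
 * hand the actual move off to move_vma().
 */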

static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
		unsigned long new_addr, unsigned long new_len, bool *locked,
		unsigned long flags, struct vm_userfaultfd_ctx *uf,
		struct list_head *uf_unmap_early,
		struct list_head *uf_unmap)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long map_flags = 0;

	if (offset_in_page(new_addr))
		goto out;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
		goto out;

	/* Ensure the old/new locations do not overlap */
	if (addr + old_len > new_addr && new_addr + new_len > addr)
		goto out;

	/*
	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
	 * it will bail out at the very beginning.
	 * That is a problem if we have already unmapped the regions here
	 * (new_addr, and old_addr), because userspace will not know the
	 * state of the vmas after it gets -ENOMEM.
	 * So, to avoid such a scenario we can pre-compute if the whole
	 * operation has high chances to succeed map-wise.
	 * Worst-scenario case is when both vmas (new_addr and old_addr) get
	 * split in 3 before unmapping it.
	 * That means 2 more maps (1 for each) to the ones we already hold.
	 * Check whether current map count plus 2 still leads us to 4 maps below
	 * the threshold, otherwise return -ENOMEM here to be more safe.
	 */
	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (flags & MREMAP_FIXED) {
		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
		if (ret)
			goto out;
	}

	if (old_len > new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
		if (ret)
			goto out;
		old_len = new_len;
	}

	vma = vma_to_resize(addr, old_len, new_len, flags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
	if (flags & MREMAP_DONTUNMAP &&
		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
		ret = -ENOMEM;
		goto out;
	}

	if (flags & MREMAP_FIXED)
		map_flags |= MAP_FIXED;

	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
	if (IS_ERR_VALUE(ret))
		goto out;

	/* We got a new mapping */
	if (!(flags & MREMAP_FIXED))
		new_addr = ret;

	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
		       uf_unmap);

out:
	return ret;
}
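
/*
 * vma_expandable() - check whether @vma can simply grow in place by @delta
 * bytes: the extended range must not overflow, must not run into the next
 * mapping, and must still be acceptable to get_unmapped_area().
 */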

static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;

	if (end < vma->vm_end) /* overflow */
		return 0;
	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
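
/*
 * Userspace usage sketch (illustrative only, not part of this file): grow an
 * anonymous mapping, letting the kernel move it if it cannot be expanded in
 * place:
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	void *q = mremap(p, 4096, 2 * 4096, MREMAP_MAYMOVE);
 *	if (q == MAP_FAILED)
 *		perror("mremap");
 */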
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	bool locked = false;
	bool downgraded = false;
	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
	LIST_HEAD(uf_unmap_early);
	LIST_HEAD(uf_unmap);

	/*
	 * There is a deliberate asymmetry here: we strip the pointer tag
	 * from the old address but leave the new address alone. This is
	 * for consistency with mmap(), where we prevent the creation of
	 * aliasing mappings in userspace by leaving the tag bits of the
	 * mapping address intact. A non-zero tag will cause the subsequent
	 * range checks to reject the address as invalid.
	 *
	 * See Documentation/arm64/tagged-address-abi.rst for more information.
	 */
	addr = untagged_addr(addr);

	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
		return ret;

	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
		return ret;

	/*
	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
	 * in the process.
	 */
	if (flags & MREMAP_DONTUNMAP &&
			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
		return ret;

	if (offset_in_page(addr))
		return ret;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);

	/*
	 * We allow a zero old-len as a special case
	 * for DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!new_len)
		return ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	vma = vma_lookup(mm, addr);
	if (!vma) {
		ret = -EFAULT;
		goto out;
	}

	if (is_vm_hugetlb_page(vma)) {
		struct hstate *h __maybe_unused = hstate_vma(vma);

		old_len = ALIGN(old_len, huge_page_size(h));
		new_len = ALIGN(new_len, huge_page_size(h));

		/* addrs must be huge page aligned */
		if (addr & ~huge_page_mask(h))
			goto out;
		if (new_addr & ~huge_page_mask(h))
			goto out;

		/*
		 * Don't allow remap expansion, because the underlying hugetlb
		 * reservation is not yet capable of handling a split reservation.
		 */
		if (new_len > old_len)
			goto out;
	}

	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
		ret = mremap_to(addr, old_len, new_addr, new_len,
				&locked, flags, &uf, &uf_unmap_early,
				&uf_unmap);
		goto out;
	}

	/*
	 * Always allow a shrinking remap: that just unmaps
	 * the unnecessary pages..
	 * do_vmi_munmap does all the needed commit accounting, and
	 * downgrades mmap_lock to read if so directed.
	 */
	if (old_len >= new_len) {
		int retval;
		VMA_ITERATOR(vmi, mm, addr + new_len);

		retval = do_vmi_munmap(&vmi, mm, addr + new_len,
				       old_len - new_len, &uf_unmap, true);
		/* Returning 1 indicates mmap_lock is downgraded to read. */
		if (retval == 1) {
			downgraded = true;
		} else if (retval < 0 && old_len != new_len) {
			ret = retval;
			goto out;
		}

		ret = addr;
		goto out;
	}

	/*
	 * Ok, we need to grow..
	 */
	vma = vma_to_resize(addr, old_len, new_len, flags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* old_len exactly to the end of the area..
	 */
	if (old_len == vma->vm_end - addr) {
		/* can we just expand the current mapping? */
		if (vma_expandable(vma, new_len - old_len)) {
			long pages = (new_len - old_len) >> PAGE_SHIFT;
			unsigned long extension_start = addr + old_len;
			unsigned long extension_end = addr + new_len;
			pgoff_t extension_pgoff = vma->vm_pgoff +
				((extension_start - vma->vm_start) >> PAGE_SHIFT);
			VMA_ITERATOR(vmi, mm, extension_start);

			if (vma->vm_flags & VM_ACCOUNT) {
				if (security_vm_enough_memory_mm(mm, pages)) {
					ret = -ENOMEM;
					goto out;
				}
			}

			/*
			 * Function vma_merge() is called on the extension we
			 * are adding to the already existing vma, vma_merge()
			 * will merge this extension with the already existing
			 * vma (expand operation itself) and possibly also with
			 * the next vma if it becomes adjacent to the expanded
			 * vma and otherwise compatible.
			 */
			vma = vma_merge(&vmi, mm, vma, extension_start,
				extension_end, vma->vm_flags, vma->anon_vma,
				vma->vm_file, extension_pgoff, vma_policy(vma),
				vma->vm_userfaultfd_ctx, anon_vma_name(vma));
			if (!vma) {
				vm_unacct_memory(pages);
				ret = -ENOMEM;
				goto out;
			}

			vm_stat_account(mm, vma->vm_flags, pages);
			if (vma->vm_flags & VM_LOCKED) {
				mm->locked_vm += pages;
				locked = true;
				new_addr = addr;
			}
			ret = addr;
			goto out;
		}
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * we need to create a new one and move it..
	 */
	ret = -ENOMEM;
	if (flags & MREMAP_MAYMOVE) {
		unsigned long map_flags = 0;
		if (vma->vm_flags & VM_MAYSHARE)
			map_flags |= MAP_SHARED;

		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
					vma->vm_pgoff +
					((addr - vma->vm_start) >> PAGE_SHIFT),
					map_flags);
		if (IS_ERR_VALUE(new_addr)) {
			ret = new_addr;
			goto out;
		}

		ret = move_vma(vma, addr, old_len, new_len, new_addr,
			       &locked, flags, &uf, &uf_unmap);
	}
out:
	if (offset_in_page(ret))
		locked = false;
	if (downgraded)
		mmap_read_unlock(current->mm);
	else
		mmap_write_unlock(current->mm);
	if (locked && new_len > old_len)
		mm_populate(new_addr + old_len, new_len - old_len);
	userfaultfd_unmap_complete(mm, &uf_unmap_early);
	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
	userfaultfd_unmap_complete(mm, &uf_unmap);
	return ret;
}