// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/ksm.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "mm_slot.h"

enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_PMD_NONE,
	SCAN_PMD_MAPPED,
	SCAN_EXCEED_NONE_PTE,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_EXCEED_SHARED_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PTE_UFFD_WP,
	SCAN_PTE_MAPPED_HUGEPAGE,
	SCAN_PAGE_RO,
	SCAN_LACK_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_TRUNCATED,
	SCAN_PAGE_HAS_PRIVATE,
	SCAN_STORE_FAILED,
	SCAN_COPY_MC,
	SCAN_PAGE_FILLED,
};
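/*
 * These SCAN_* codes are what the scan/collapse paths report back and what
 * the huge_memory tracepoints included below record, so they are the main
 * clue for why a particular collapse attempt was given up on.
 */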

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);

/* default scan 8*512 ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * By default, collapse hugepages if there is at least one pte mapped, as
 * would have happened if the vma had been large enough during the page
 * fault.
 *
 * Note that these are only respected if collapse was initiated by khugepaged.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __read_mostly;

struct collapse_control {
	bool is_khugepaged;

	/* Num pages scanned per node */
	u32 node_load[MAX_NUMNODES];

	/* nodemask for allocation fallback */
	nodemask_t alloc_nmask;
};

/**
 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 */
struct khugepaged_mm_slot {
	struct mm_slot slot;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct khugepaged_mm_slot *mm_slot;
	unsigned long address;
};

static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

#ifdef CONFIG_SYSFS
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned int msecs;
	int err;

	err = kstrtouint(buf, 10, &msecs);
	if (err)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR_RW(scan_sleep_millisecs);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned int msecs;
	int err;

	err = kstrtouint(buf, 10, &msecs);
	if (err)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR_RW(alloc_sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	unsigned int pages;
	int err;

	err = kstrtouint(buf, 10, &pages);
	if (err || !pages)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR_RW(pages_to_scan);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR_RW(defrag);

/*
 * max_ptes_none controls if khugepaged should collapse hugepages over
 * any unmapped ptes in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
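/*
 * For example, with 4K base pages (HPAGE_PMD_NR == 512) the default of
 * HPAGE_PMD_NR - 1 = 511 allows all but one pte to be unmapped, so a single
 * populated pte in a PMD-aligned range can already qualify it for collapse
 * (the other scan limits still apply).
 */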
static ssize_t max_ptes_none_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t max_ptes_none_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR_RW(max_ptes_none);

static ssize_t max_ptes_swap_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}

static ssize_t max_ptes_swap_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_swap;

	err  = kstrtoul(buf, 10, &max_ptes_swap);
	if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_swap = max_ptes_swap;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_swap_attr =
	__ATTR_RW(max_ptes_swap);

static ssize_t max_ptes_shared_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}

static ssize_t max_ptes_shared_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_shared;

	err  = kstrtoul(buf, 10, &max_ptes_shared);
	if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_shared = max_ptes_shared;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_shared_attr =
	__ATTR_RW(max_ptes_shared);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	&khugepaged_max_ptes_shared_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};
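/*
 * The group is attached to the transparent_hugepage kobject, so these knobs
 * normally show up under
 * /sys/kernel/mm/transparent_hugepage/khugepaged/<name>, e.g.:
 *
 *	echo 4096 > /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
 */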
#endif /* CONFIG_SYSFS */

int hugepage_madvise(struct vm_area_struct *vma,
		     unsigned long *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes good for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
		 */
		khugepaged_enter_vma(vma, *vm_flags);
		break;
	case MADV_NOHUGEPAGE:
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged,
		 * e.g. if it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}

int __init khugepaged_init(void)
{
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct khugepaged_mm_slot),
					  __alignof__(struct khugepaged_mm_slot),
					  0, NULL);
	if (!mm_slot_cache)
		return -ENOMEM;

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

	return 0;
}

void __init khugepaged_destroy(void)
{
	kmem_cache_destroy(mm_slot_cache);
}

static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

void __khugepaged_enter(struct mm_struct *mm)
{
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	int wakeup;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
		return;

	mm_slot = mm_slot_alloc(mm_slot_cache);
	if (!mm_slot)
		return;

	slot = &mm_slot->slot;

	spin_lock(&khugepaged_mm_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	mmgrab(mm);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);
}

void khugepaged_enter_vma(struct vm_area_struct *vma,
			  unsigned long vm_flags)
{
	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
	    hugepage_flags_enabled()) {
		if (hugepage_vma_check(vma, vm_flags, false, false, true))
			__khugepaged_enter(vma->vm_mm);
	}
}

void __khugepaged_exit(struct mm_struct *mm)
{
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	slot = mm_slot_lookup(mm_slots_hash, mm);
	mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hash_del(&slot->hash);
		list_del(&slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		mm_slot_free(mm_slot_cache, mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * hpage_collapse_test_exit() (which is guaranteed to run
		 * under mmap_lock read mode). Stop here (after we return, all
		 * pagetables will be destroyed) until khugepaged has finished
		 * working on the pagetables under the mmap_lock.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}

static void release_pte_folio(struct folio *folio)
{
	node_stat_mod_folio(folio,
			NR_ISOLATED_ANON + folio_is_file_lru(folio),
			-folio_nr_pages(folio));
	folio_unlock(folio);
	folio_putback_lru(folio);
}

static void release_pte_page(struct page *page)
{
	release_pte_folio(page_folio(page));
}

static void release_pte_pages(pte_t *pte, pte_t *_pte,
		struct list_head *compound_pagelist)
{
	struct folio *folio, *tmp;

	while (--_pte >= pte) {
		pte_t pteval = ptep_get(_pte);
		unsigned long pfn;

		if (pte_none(pteval))
			continue;
		pfn = pte_pfn(pteval);
		if (is_zero_pfn(pfn))
			continue;
		folio = pfn_folio(pfn);
		if (folio_test_large(folio))
			continue;
		release_pte_folio(folio);
	}

	list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
		list_del(&folio->lru);
		release_pte_folio(folio);
	}
}

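/*
 * A page is only safe to collapse if nothing else holds an extra reference:
 * the expected refcount is its total mapcount, plus one reference per
 * subpage while it still sits in the swap cache.  Anything beyond that
 * points at a GUP pin or some other external user.
 */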
static bool is_refcount_suitable(struct page *page)
{
	int expected_refcount;

	expected_refcount = total_mapcount(page);
	if (PageSwapCache(page))
		expected_refcount += compound_nr(page);

	return page_count(page) == expected_refcount;
}

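/*
 * Walk the HPAGE_PMD_NR ptes at @pte, enforce the max_ptes_none and
 * max_ptes_shared limits, then lock and LRU-isolate every mapped page.
 * On SCAN_SUCCEED the pages are left locked and isolated (large folios
 * queued on @compound_pagelist); on failure everything is released again.
 */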
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
					unsigned long address,
					pte_t *pte,
					struct collapse_control *cc,
					struct list_head *compound_pagelist)
{
	struct page *page = NULL;
	struct folio *folio = NULL;
	pte_t *_pte;
	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
	bool writable = false;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pte_t pteval = ptep_get(_pte);
		if (pte_none(pteval) || (pte_present(pteval) &&
				is_zero_pfn(pte_pfn(pteval)))) {
			++none_or_zero;
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out;
			}
		}
		if (!pte_present(pteval)) {
			result = SCAN_PTE_NON_PRESENT;
			goto out;
		}
		if (pte_uffd_wp(pteval)) {
			result = SCAN_PTE_UFFD_WP;
			goto out;
		}
		page = vm_normal_page(vma, address, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out;
		}

		folio = page_folio(page);
		VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);

		if (page_mapcount(page) > 1) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out;
			}
		}

		if (folio_test_large(folio)) {
			struct folio *f;

			/*
			 * Check if we have dealt with the compound page
			 * already
			 */
			list_for_each_entry(f, compound_pagelist, lru) {
				if (folio == f)
					goto next;
			}
		}

		/*
		 * We can do it before isolate_lru_page because the
		 * page can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!folio_trylock(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * The page table that maps the page has been already unlinked
		 * from the page table tree and this process cannot get
		 * an additional pin on the page.
		 *
		 * New pins can come later if the page is shared across fork,
		 * but not from this process. The other process cannot write to
		 * the page, only trigger CoW.
		 */
		if (!is_refcount_suitable(&folio->page)) {
			folio_unlock(folio);
			result = SCAN_PAGE_COUNT;
			goto out;
		}

		/*
		 * Isolate the page to avoid collapsing an hugepage
		 * currently in use by the VM.
		 */
		if (!folio_isolate_lru(folio)) {
			folio_unlock(folio);
			result = SCAN_DEL_PAGE_LRU;
			goto out;
		}
		node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

		if (folio_test_large(folio))
			list_add_tail(&folio->lru, compound_pagelist);
next:
		/*
		 * If collapse was initiated by khugepaged, check that there is
		 * enough young ptes to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
								     address)))
			referenced++;

		if (pte_write(pteval))
			writable = true;
	}

	if (unlikely(!writable)) {
		result = SCAN_PAGE_RO;
	} else if (unlikely(cc->is_khugepaged && !referenced)) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
		trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
						    referenced, writable, result);
		return result;
	}
out:
	release_pte_pages(pte, _pte, compound_pagelist);
	trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
					    referenced, writable, result);
	return result;
}

static void __collapse_huge_page_copy_succeeded(pte_t *pte,
						struct vm_area_struct *vma,
						unsigned long address,
						spinlock_t *ptl,
						struct list_head *compound_pagelist)
{
	struct page *src_page;
	struct page *tmp;
	pte_t *_pte;
	pte_t pteval;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pteval = ptep_get(_pte);
		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
			if (is_zero_pfn(pte_pfn(pteval))) {
				/*
				 * ptl mostly unnecessary.
				 */
				spin_lock(ptl);
				ptep_clear(vma->vm_mm, address, _pte);
				spin_unlock(ptl);
				ksm_might_unmap_zero_page(vma->vm_mm, pteval);
			}
		} else {
			src_page = pte_page(pteval);
			if (!PageCompound(src_page))
				release_pte_page(src_page);
			/*
			 * ptl mostly unnecessary, but preempt has to
			 * be disabled to update the per-cpu stats
			 * inside page_remove_rmap().
			 */
			spin_lock(ptl);
			ptep_clear(vma->vm_mm, address, _pte);
			page_remove_rmap(src_page, vma, false);
			spin_unlock(ptl);
			free_page_and_swap_cache(src_page);
		}
	}

	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
		list_del(&src_page->lru);
		mod_node_page_state(page_pgdat(src_page),
				    NR_ISOLATED_ANON + page_is_file_lru(src_page),
				    -compound_nr(src_page));
		unlock_page(src_page);
		free_swap_cache(src_page);
		putback_lru_page(src_page);
	}
}

static void __collapse_huge_page_copy_failed(pte_t *pte,
					     pmd_t *pmd,
					     pmd_t orig_pmd,
					     struct vm_area_struct *vma,
					     struct list_head *compound_pagelist)
{
	spinlock_t *pmd_ptl;

	/*
	 * Re-establish the PMD to point to the original page table
	 * entry. Restoring PMD needs to be done prior to releasing
	 * pages. Since pages are still isolated and locked here,
	 * acquiring anon_vma_lock_write is unnecessary.
	 */
	pmd_ptl = pmd_lock(vma->vm_mm, pmd);
	pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
	spin_unlock(pmd_ptl);
	/*
	 * Release both raw and compound pages isolated
	 * in __collapse_huge_page_isolate.
	 */
	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
}

/*
 * __collapse_huge_page_copy - attempts to copy memory contents from raw
 * pages to a hugepage. Cleans up the raw pages if copying succeeds;
 * otherwise restores the original page table and releases isolated raw pages.
 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
 *
 * @pte: starting of the PTEs to copy from
 * @page: the new hugepage to copy contents to
 * @pmd: pointer to the new hugepage's PMD
 * @orig_pmd: the original raw pages' PMD
 * @vma: the original raw pages' virtual memory area
 * @address: starting address to copy
 * @ptl: lock on raw pages' PTEs
 * @compound_pagelist: list that stores compound pages
 */
static int __collapse_huge_page_copy(pte_t *pte,
				     struct page *page,
				     pmd_t *pmd,
				     pmd_t orig_pmd,
				     struct vm_area_struct *vma,
				     unsigned long address,
				     spinlock_t *ptl,
				     struct list_head *compound_pagelist)
{
	struct page *src_page;
	pte_t *_pte;
	pte_t pteval;
	unsigned long _address;
	int result = SCAN_SUCCEED;

	/*
	 * Copying pages' contents is subject to memory poison at any iteration.
	 */
	for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
	     _pte++, page++, _address += PAGE_SIZE) {
		pteval = ptep_get(_pte);
		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			clear_user_highpage(page, _address);
			continue;
		}
		src_page = pte_page(pteval);
		if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) {
			result = SCAN_COPY_MC;
			break;
		}
	}

	if (likely(result == SCAN_SUCCEED))
		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
						    compound_pagelist);
	else
		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
						 compound_pagelist);

	return result;
}

static void khugepaged_alloc_sleep(void)
{
	DEFINE_WAIT(wait);

	add_wait_queue(&khugepaged_wait, &wait);
	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
	schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
	remove_wait_queue(&khugepaged_wait, &wait);
}

struct collapse_control khugepaged_collapse_control = {
	.is_khugepaged = true,
};

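/*
 * Abort the scan if pages have already been seen on a node that is further
 * than node_reclaim_distance away from @nid: with node_reclaim_mode enabled,
 * collapsing such a range would pull memory across a reclaim boundary the
 * allocator is otherwise trying to respect.
 */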
static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
	int i;

	/*
	 * If node_reclaim_mode is disabled, then no extra effort is made to
	 * allocate memory locally.
	 */
	if (!node_reclaim_enabled())
		return false;

	/* If there is a count for this node already, it must be acceptable */
	if (cc->node_load[nid])
		return false;

	for (i = 0; i < MAX_NUMNODES; i++) {
		if (!cc->node_load[i])
			continue;
		if (node_distance(nid, i) > node_reclaim_distance)
			return true;
	}
	return false;
}

#define khugepaged_defrag()					\
	(transparent_hugepage_flags &				\
	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))

/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
	return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}

#ifdef CONFIG_NUMA
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
	int nid, target_node = 0, max_value = 0;

	/* find first node with max normal pages hit */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (cc->node_load[nid] > max_value) {
			max_value = cc->node_load[nid];
			target_node = nid;
		}

	for_each_online_node(nid) {
		if (max_value == cc->node_load[nid])
			node_set(nid, cc->alloc_nmask);
	}

	return target_node;
}
#else
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
	return 0;
}
#endif

static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
				      nodemask_t *nmask)
{
	*hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
	if (unlikely(!*hpage)) {
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		return false;
	}

	folio_prep_large_rmappable((struct folio *)*hpage);
	count_vm_event(THP_COLLAPSE_ALLOC);
	return true;
}

/*
 * If the mmap_lock was temporarily dropped, revalidate the vma
 * after re-taking it.
 * Returns enum scan_result value.
 */

static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
				   bool expect_anon,
				   struct vm_area_struct **vmap,
				   struct collapse_control *cc)
{
	struct vm_area_struct *vma;

	if (unlikely(hpage_collapse_test_exit(mm)))
		return SCAN_ANY_PROCESS;

	*vmap = vma = find_vma(mm, address);
	if (!vma)
		return SCAN_VMA_NULL;

	if (!transhuge_vma_suitable(vma, address))
		return SCAN_ADDRESS_RANGE;
	if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
				cc->is_khugepaged))
		return SCAN_VMA_CHECK;
	/*
	 * Anon VMA expected, the address may be unmapped then
	 * remapped to file after khugepaged re-acquired the mmap_lock.
	 *
	 * hugepage_vma_check may return true for qualified file
	 * vmas.
	 */
	if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
		return SCAN_PAGE_ANON;
	return SCAN_SUCCEED;
}

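/*
 * Look up the pmd for @address and classify it: SCAN_SUCCEED for a pte
 * table we can work on, SCAN_PMD_MAPPED if the range is already backed by
 * a huge pmd, and SCAN_PMD_NONE/SCAN_PMD_NULL for empty or unusable
 * entries.
 */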
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
				   unsigned long address,
				   pmd_t **pmd)
{
	pmd_t pmde;

	*pmd = mm_find_pmd(mm, address);
	if (!*pmd)
		return SCAN_PMD_NULL;

	pmde = pmdp_get_lockless(*pmd);
	if (pmd_none(pmde))
		return SCAN_PMD_NONE;
	if (!pmd_present(pmde))
		return SCAN_PMD_NULL;
	if (pmd_trans_huge(pmde))
		return SCAN_PMD_MAPPED;
	if (pmd_devmap(pmde))
		return SCAN_PMD_NULL;
	if (pmd_bad(pmde))
		return SCAN_PMD_NULL;
	return SCAN_SUCCEED;
}

static int check_pmd_still_valid(struct mm_struct *mm,
				 unsigned long address,
				 pmd_t *pmd)
{
	pmd_t *new_pmd;
	int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);

	if (result != SCAN_SUCCEED)
		return result;
	if (new_pmd != pmd)
		return SCAN_FAIL;
	return SCAN_SUCCEED;
}

/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
 *
 * Called and returns without pte mapped or spinlocks held.
 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
 */
static int __collapse_huge_page_swapin(struct mm_struct *mm,
				       struct vm_area_struct *vma,
				       unsigned long haddr, pmd_t *pmd,
				       int referenced)
{
	int swapped_in = 0;
	vm_fault_t ret = 0;
	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
	int result;
	pte_t *pte = NULL;
	spinlock_t *ptl;

	for (address = haddr; address < end; address += PAGE_SIZE) {
		struct vm_fault vmf = {
			.vma = vma,
			.address = address,
			.pgoff = linear_page_index(vma, address),
			.flags = FAULT_FLAG_ALLOW_RETRY,
			.pmd = pmd,
		};

		if (!pte++) {
			pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
			if (!pte) {
				mmap_read_unlock(mm);
				result = SCAN_PMD_NULL;
				goto out;
			}
		}

		vmf.orig_pte = ptep_get_lockless(pte);
		if (!is_swap_pte(vmf.orig_pte))
			continue;

		vmf.pte = pte;
		vmf.ptl = ptl;
		ret = do_swap_page(&vmf);
		/* Which unmaps pte (after perhaps re-checking the entry) */
		pte = NULL;

		/*
		 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
		 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
		 * we do not retry here; the swap entry will remain in the
		 * pagetable, resulting in a later failure.
		 */
		if (ret & VM_FAULT_RETRY) {
			/* Likely, but not guaranteed, that page lock failed */
			result = SCAN_PAGE_LOCK;
			goto out;
		}
		if (ret & VM_FAULT_ERROR) {
			mmap_read_unlock(mm);
			result = SCAN_FAIL;
			goto out;
		}
		swapped_in++;
	}

	if (pte)
		pte_unmap(pte);

	/* Drain LRU cache to remove extra pin on the swapped in pages */
	if (swapped_in)
		lru_add_drain();

	result = SCAN_SUCCEED;
out:
	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
	return result;
}

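/*
 * Allocate the huge page on the node the scan saw most often (see
 * cc->node_load[]) and charge it to mm's memcg.  A SCAN_* status is
 * returned rather than a bare errno, matching what the callers feed into
 * the tracepoints.
 */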
static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
			      struct collapse_control *cc)
{
	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
		     GFP_TRANSHUGE);
	int node = hpage_collapse_find_target_node(cc);
	struct folio *folio;

	if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
		return SCAN_ALLOC_HUGE_PAGE_FAIL;

	folio = page_folio(*hpage);
	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
		folio_put(folio);
		*hpage = NULL;
		return SCAN_CGROUP_CHARGE_FAIL;
	}
	count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);

	return SCAN_SUCCEED;
}

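/*
 * Collapse the HPAGE_PMD_SIZE range at @address into a freshly allocated
 * huge page.  Rough sequence: drop mmap_lock for the allocation, retake it
 * read-side to revalidate the vma and swap in any missing ptes, then retake
 * it write-side to unlink the pte table, isolate and copy the pages, and
 * finally install the huge pmd.  Returns with mmap_lock released.
 */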
static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
			      int referenced, int unmapped,
			      struct collapse_control *cc)
{
	LIST_HEAD(compound_pagelist);
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct page *hpage;
	spinlock_t *pmd_ptl, *pte_ptl;
	int result = SCAN_FAIL;
	struct vm_area_struct *vma;
	struct mmu_notifier_range range;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/*
	 * Before allocating the hugepage, release the mmap_lock read lock.
	 * The allocation can take potentially a long time if it involves
	 * sync compaction, and we do not need to hold the mmap_lock during
	 * that. We will recheck the vma after taking it again in write mode.
	 */
	mmap_read_unlock(mm);

	result = alloc_charge_hpage(&hpage, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out_nolock;

	mmap_read_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	if (unmapped) {
		/*
		 * __collapse_huge_page_swapin will return with mmap_lock
		 * released when it fails. So we jump out_nolock directly in
		 * that case.  Continuing to collapse causes inconsistency.
		 */
		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
						     referenced);
		if (result != SCAN_SUCCEED)
			goto out_nolock;
	}

	mmap_read_unlock(mm);
	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 */
	mmap_write_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED)
		goto out_up_write;
	/* check if the pmd is still valid */
	result = check_pmd_still_valid(mm, address, pmd);
	if (result != SCAN_SUCCEED)
		goto out_up_write;

	vma_start_write(vma);
	anon_vma_lock_write(vma->anon_vma);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
				address + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
	/*
	 * This removes any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address to
	 * avoid the risk of CPU bugs in that area.
	 *
	 * Parallel fast GUP is fine since fast GUP will back off when
	 * it detects PMD is changed.
	 */
	_pmd = pmdp_collapse_flush(vma, address, pmd);
	spin_unlock(pmd_ptl);
	mmu_notifier_invalidate_range_end(&range);
	tlb_remove_table_sync_one();

	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
	if (pte) {
		result = __collapse_huge_page_isolate(vma, address, pte, cc,
						      &compound_pagelist);
		spin_unlock(pte_ptl);
	} else {
		result = SCAN_PMD_NULL;
	}

	if (unlikely(result != SCAN_SUCCEED)) {
		if (pte)
			pte_unmap(pte);
		spin_lock(pmd_ptl);
		BUG_ON(!pmd_none(*pmd));
		/*
		 * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * points to regular pagetables. Use pmd_populate for that
		 */
		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
		spin_unlock(pmd_ptl);
		anon_vma_unlock_write(vma->anon_vma);
		goto out_up_write;
	}

	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
	 */
	anon_vma_unlock_write(vma->anon_vma);

	result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
					   vma, address, pte_ptl,
					   &compound_pagelist);
	pte_unmap(pte);
	if (unlikely(result != SCAN_SUCCEED))
		goto out_up_write;

	/*
	 * spin_lock() below is not the equivalent of smp_wmb(), but
	 * the smp_wmb() inside __SetPageUptodate() can be reused to
	 * keep the copy_huge_page writes from becoming visible after
	 * the set_pmd_at() write.
	 */
	__SetPageUptodate(hpage);
	pgtable = pmd_pgtable(_pmd);

	_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

	spin_lock(pmd_ptl);
	BUG_ON(!pmd_none(*pmd));
	page_add_new_anon_rmap(hpage, vma, address);
	lru_cache_add_inactive_or_unevictable(hpage, vma);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, address, pmd, _pmd);
	update_mmu_cache_pmd(vma, address, pmd);
	spin_unlock(pmd_ptl);

	hpage = NULL;

	result = SCAN_SUCCEED;
out_up_write:
	mmap_write_unlock(mm);
out_nolock:
	if (hpage)
		put_page(hpage);
	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
	return result;
}

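/*
 * Scan one pmd range and decide whether it is worth collapsing: count none,
 * swap and shared ptes against the configured limits, make sure every page
 * is an LRU-resident, unpinned anon page, and record the originating nodes
 * in cc->node_load[].  If the range qualifies, collapse_huge_page() is
 * called and *mmap_locked is cleared, since it returns with mmap_lock
 * dropped.
 */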
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
				   struct vm_area_struct *vma,
				   unsigned long address, bool *mmap_locked,
				   struct collapse_control *cc)
{
	pmd_t *pmd;
	pte_t *pte, *_pte;
	int result = SCAN_FAIL, referenced = 0;
	int none_or_zero = 0, shared = 0;
	struct page *page = NULL;
	struct folio *folio = NULL;
	unsigned long _address;
	spinlock_t *ptl;
	int node = NUMA_NO_NODE, unmapped = 0;
	bool writable = false;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED)
		goto out;

	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!pte) {
		result = SCAN_PMD_NULL;
		goto out;
	}

	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, _address += PAGE_SIZE) {
		pte_t pteval = ptep_get(_pte);
		if (is_swap_pte(pteval)) {
			++unmapped;
			if (!cc->is_khugepaged ||
			    unmapped <= khugepaged_max_ptes_swap) {
				/*
				 * Always be strict with uffd-wp
				 * enabled swap entries.  Please see
				 * comment below for pte_uffd_wp().
				 */
				if (pte_swp_uffd_wp_any(pteval)) {
					result = SCAN_PTE_UFFD_WP;
					goto out_unmap;
				}
				continue;
			} else {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				goto out_unmap;
			}
		}
		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			++none_or_zero;
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out_unmap;
			}
		}
		if (pte_uffd_wp(pteval)) {
			/*
			 * Don't collapse the page if any of the small
			 * PTEs are armed with uffd write protection.
			 * Here we can also mark the new huge pmd as
			 * write protected if any of the small ones is
			 * marked but that could bring unknown
			 * userfault messages that falls outside of
			 * the registered range.  So, just be simple.
			 */
			result = SCAN_PTE_UFFD_WP;
			goto out_unmap;
		}
		if (pte_write(pteval))
			writable = true;

		page = vm_normal_page(vma, _address, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out_unmap;
		}

		if (page_mapcount(page) > 1) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out_unmap;
			}
		}

		folio = page_folio(page);
		/*
		 * Record which node the original page is from and save this
		 * information to cc->node_load[].
		 * Khugepaged will allocate the hugepage from the node that has
		 * the max hit record.
		 */
		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			goto out_unmap;
		}
		cc->node_load[node]++;
		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			goto out_unmap;
		}
		if (folio_test_locked(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out_unmap;
		}
		if (!folio_test_anon(folio)) {
			result = SCAN_PAGE_ANON;
			goto out_unmap;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * Here the check may be racy:
		 * it may see total_mapcount > refcount in some cases.
		 * Such cases are ephemeral, though, and we can always retry
		 * the collapse later.  It may also report a false positive if
		 * the page has excessive GUP pins (i.e. 512), but the same
		 * check will be done again later, so the risk seems low.
		 */
		if (!is_refcount_suitable(&folio->page)) {
			result = SCAN_PAGE_COUNT;
			goto out_unmap;
		}

		/*
		 * If collapse was initiated by khugepaged, check that there is
		 * enough young ptes to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
								     address)))
			referenced++;
	}
	if (!writable) {
		result = SCAN_PAGE_RO;
	} else if (cc->is_khugepaged &&
		   (!referenced ||
		    (unmapped && referenced < HPAGE_PMD_NR / 2))) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
	}
out_unmap:
	pte_unmap_unlock(pte, ptl);
	if (result == SCAN_SUCCEED) {
		result = collapse_huge_page(mm, address, referenced,
					    unmapped, cc);
		/* collapse_huge_page will return with the mmap_lock released */
		*mmap_locked = false;
	}
out:
	trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
				     none_or_zero, result, unmapped);
	return result;
}

static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
{
	struct mm_slot *slot = &mm_slot->slot;
	struct mm_struct *mm = slot->mm;

	lockdep_assert_held(&khugepaged_mm_lock);

	if (hpage_collapse_test_exit(mm)) {
		/* free mm_slot */
		hash_del(&slot->hash);
		list_del(&slot->mm_node);

		/*
		 * Not strictly needed because the mm exited already.
		 *
		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		 */

		/* khugepaged_mm_lock actually not necessary for the below */
		mm_slot_free(mm_slot_cache, mm_slot);
		mmdrop(mm);
	}
}

#ifdef CONFIG_SHMEM
/* hpage must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
			pmd_t *pmdp, struct page *hpage)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = addr,
		.flags = 0,
		.pmd = pmdp,
	};

	VM_BUG_ON(!PageTransHuge(hpage));
	mmap_assert_locked(vma->vm_mm);

	if (do_set_pmd(&vmf, hpage))
		return SCAN_FAIL;

	get_page(hpage);
	return SCAN_SUCCEED;
}

/**
 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
 * address haddr.
 *
 * @mm: process address space where collapse happens
 * @addr: THP collapse address
 * @install_pmd: If a huge PMD should be installed
 *
 * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can refault in with
 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
 */
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
			    bool install_pmd)
{
	struct mmu_notifier_range range;
	bool notified = false;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	struct vm_area_struct *vma = vma_lookup(mm, haddr);
	struct page *hpage;
	pte_t *start_pte, *pte;
	pmd_t *pmd, pgt_pmd;
	spinlock_t *pml = NULL, *ptl;
	int nr_ptes = 0, result = SCAN_FAIL;
	int i;

	mmap_assert_locked(mm);

	/* First check VMA found, in case page tables are being torn down */
	if (!vma || !vma->vm_file ||
	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
		return SCAN_VMA_CHECK;

	/* Fast check before locking page if already PMD-mapped */
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	if (result == SCAN_PMD_MAPPED)
		return result;

	/*
	 * If we are here, we've succeeded in replacing all the native pages
	 * in the page cache with a single hugepage. If a mm were to fault-in
	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
	 * analogously elide sysfs THP settings here.
	 */
	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
		return SCAN_VMA_CHECK;

	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
	if (userfaultfd_wp(vma))
		return SCAN_PTE_UFFD_WP;

	hpage = find_lock_page(vma->vm_file->f_mapping,
			       linear_page_index(vma, haddr));
	if (!hpage)
		return SCAN_PAGE_NULL;

	if (!PageHead(hpage)) {
		result = SCAN_FAIL;
		goto drop_hpage;
	}

	if (compound_order(hpage) != HPAGE_PMD_ORDER) {
		result = SCAN_PAGE_COMPOUND;
		goto drop_hpage;
	}

	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	switch (result) {
	case SCAN_SUCCEED:
		break;
	case SCAN_PMD_NONE:
		/*
		 * All pte entries have been removed and pmd cleared.
		 * Skip all the pte checks and just update the pmd mapping.
		 */
		goto maybe_install_pmd;
	default:
		goto drop_hpage;
	}

	result = SCAN_FAIL;
	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto drop_hpage;

	/* step 1: check all mapped PTEs are to the right huge page */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		/* empty pte, skip */
		if (pte_none(ptent))
			continue;

		/* page swapped out, abort */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
			page = NULL;
		/*
		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
		 * page table, but the new page will not be a subpage of hpage.
		 */
		if (hpage + i != page)
			goto abort;
	}

	pte_unmap_unlock(start_pte, ptl);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	notified = true;

	/*
	 * pmd_lock covers a wider range than ptl, and (if split from mm's
	 * page_table_lock) ptl nests inside pml. The less time we hold pml,
	 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
	 * inserts a valid as-if-COWed PTE without even looking up page cache.
	 * So page lock of hpage does not protect from it, so we must not drop
	 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
	 */
	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
		pml = pmd_lock(mm, pmd);

	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto abort;
	if (!pml)
		spin_lock(ptl);
	else if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	/* step 2: clear page table and adjust rmap */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * We dropped ptl after the first scan, to do the mmu_notifier:
		 * page lock stops more PTEs of the hpage being faulted in, but
		 * does not stop write faults COWing anon copies from existing
		 * PTEs; and does not stop those being swapped out or migrated.
		 */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}
		page = vm_normal_page(vma, addr, ptent);
		if (hpage + i != page)
			goto abort;

		/*
		 * Must clear entry, or a racing truncate may re-remove it.
		 * TLB flush can be left until pmdp_collapse_flush() does it.
		 * PTE dirty? Shmem page is already dirty; file is read-only.
		 */
		ptep_clear(mm, addr, pte);
		page_remove_rmap(page, vma, false);
		nr_ptes++;
	}

	pte_unmap(start_pte);
	if (!pml)
		spin_unlock(ptl);

	/* step 3: set proper refcount and mm_counters. */
	if (nr_ptes) {
		page_ref_sub(hpage, nr_ptes);
		add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
	}

	/* step 4: remove empty page table */
	if (!pml) {
		pml = pmd_lock(mm, pmd);
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
	}
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();
	if (ptl != pml)
		spin_unlock(ptl);
	spin_unlock(pml);

	mmu_notifier_invalidate_range_end(&range);

	mm_dec_nr_ptes(mm);
	page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
	pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
	/* step 5: install pmd entry */
	result = install_pmd
			? set_huge_pmd(vma, haddr, pmd, hpage)
			: SCAN_SUCCEED;
	goto drop_hpage;
abort:
	if (nr_ptes) {
		flush_tlb_mm(mm);
		page_ref_sub(hpage, nr_ptes);
		add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
	}
	if (start_pte)
		pte_unmap_unlock(start_pte, ptl);
	if (pml && pml != ptl)
		spin_unlock(pml);
	if (notified)
		mmu_notifier_invalidate_range_end(&range);
drop_hpage:
	unlock_page(hpage);
	put_page(hpage);
	return result;
}

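/*
 * After a file range has been collapsed, walk every vma that maps @pgoff
 * and try to free the now-redundant pte table so the next fault can map
 * the range with a huge pmd.  MAP_PRIVATE vmas that gained an anon_vma and
 * uffd-wp registered vmas are skipped, and both checks are repeated under
 * the page table locks to close the race with a concurrent userfaultfd
 * ioctl.
 */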
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		struct mmu_notifier_range range;
		struct mm_struct *mm;
		unsigned long addr;
		pmd_t *pmd, pgt_pmd;
		spinlock_t *pml;
		spinlock_t *ptl;
		bool skipped_uffd = false;

		/*
		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
		 * got written to. These VMAs are likely not worth removing
		 * page tables from, as PMD-mapping is likely to be split later.
		 */
		if (READ_ONCE(vma->anon_vma))
			continue;

		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (addr & ~HPAGE_PMD_MASK ||
		    vma->vm_end < addr + HPAGE_PMD_SIZE)
			continue;

		mm = vma->vm_mm;
		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
			continue;

		if (hpage_collapse_test_exit(mm))
			continue;
		/*
		 * When a vma is registered with uffd-wp, we cannot recycle
		 * the page table because there may be pte markers installed.
		 * Other vmas can still have the same file mapped hugely, but
		 * skip this one: it will always be mapped in small page size
		 * for uffd-wp registered ranges.
		 */
		if (userfaultfd_wp(vma))
			continue;

		/* PTEs were notified when unmapped; but now for the PMD? */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
					addr, addr + HPAGE_PMD_SIZE);
		mmu_notifier_invalidate_range_start(&range);

		pml = pmd_lock(mm, pmd);
		ptl = pte_lockptr(mm, pmd);
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

		/*
		 * Huge page lock is still held, so normally the page table
		 * must remain empty; and we have already skipped anon_vma
		 * and userfaultfd_wp() vmas.  But since the mmap_lock is not
		 * held, it is still possible for a racing userfaultfd_ioctl()
		 * to have inserted ptes or markers.  Now that we hold ptlock,
		 * repeating the anon_vma check protects from one category,
		 * and repeating the userfaultfd_wp() check from another.
		 */
		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
			skipped_uffd = true;
		} else {
			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
			pmdp_get_lockless_sync();
		}

		if (ptl != pml)
			spin_unlock(ptl);
		spin_unlock(pml);

		mmu_notifier_invalidate_range_end(&range);

		if (!skipped_uffd) {
			mm_dec_nr_ptes(mm);
			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
		}
	}
	i_mmap_unlock_read(mapping);
}

/**
 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
 *
 * @mm: process address space where collapse happens
 * @addr: virtual collapse start address
 * @file: file that the collapse operates on
 * @start: collapse start page offset within @file
 * @cc: collapse context and scratchpad
 *
 * Basic scheme is simple, details are more complex:
 *  - allocate and lock a new huge page;
 *  - scan page cache, locking old pages
 *    + swap/gup in pages if necessary;
 *  - copy data to new page
 *  - handle shmem holes
 *    + re-validate that holes weren't filled by someone else
 *    + check for userfaultfd
 *  - finalize updates to the page cache;
 *  - if replacing succeeds:
 *    + unlock huge page;
 *    + free old pages;
 *  - if replacing failed:
 *    + unlock old pages
 *    + unlock and free huge page;
 */
static int collapse_file(struct mm_struct *mm, unsigned long addr,
			 struct file *file, pgoff_t start,
			 struct collapse_control *cc)
{
	struct address_space *mapping = file->f_mapping;
	struct page *hpage;
	struct page *page;
	struct page *tmp;
	struct folio *folio;
	pgoff_t index = 0, end = start + HPAGE_PMD_NR;
	LIST_HEAD(pagelist);
	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
	int nr_none = 0, result = SCAN_SUCCEED;
	bool is_shmem = shmem_file(file);
1803
	int nr = 0;
1804

1805
	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
1806 1807
	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

1808
	result = alloc_charge_hpage(&hpage, mm, cc);
1809
	if (result != SCAN_SUCCEED)
1810 1811
		goto out;

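	/*
	 * The new huge page is kept locked (and, for shmem, marked
	 * swap-backed) for the whole operation; it is only unlocked once the
	 * page cache points at it, or on rollback.
	 */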
	__SetPageLocked(hpage);
	if (is_shmem)
		__SetPageSwapBacked(hpage);
	hpage->index = start;
	hpage->mapping = mapping;

	/*
	 * Ensure we have slots for all the pages in the range.  This is
	 * almost certainly a no-op because most of the pages must be present.
	 */
	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (!xas_error(&xas))
			break;
		xas_unlock_irq(&xas);
		if (!xas_nomem(&xas, GFP_KERNEL)) {
			result = SCAN_FAIL;
			goto rollback;
		}
	} while (1);

	for (index = start; index < end; index++) {
		xas_set(&xas, index);
		page = xas_load(&xas);

		VM_BUG_ON(index != xas.xa_index);
		if (is_shmem) {
			if (!page) {
				/*
				 * Stop if extent has been truncated or
				 * hole-punched, and is now completely
				 * empty.
				 */
				if (index == start) {
					if (!xas_next_entry(&xas, end - 1)) {
						result = SCAN_TRUNCATED;
						goto xa_locked;
					}
				}
				nr_none++;
				continue;
			}

			if (xa_is_value(page) || !PageUptodate(page)) {
				xas_unlock_irq(&xas);
				/* swap in or instantiate fallocated page */
				if (shmem_get_folio(mapping->host, index,
						&folio, SGP_NOALLOC)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
				/* drain lru cache to help folio_isolate_lru() */
				lru_add_drain();
				page = folio_file_page(folio, index);
			} else if (trylock_page(page)) {
				get_page(page);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		} else {	/* !is_shmem */
			if (!page || xa_is_value(page)) {
				xas_unlock_irq(&xas);
				page_cache_sync_readahead(mapping, &file->f_ra,
							  file, index,
							  end - index);
				/* drain lru cache to help folio_isolate_lru() */
				lru_add_drain();
				page = find_lock_page(mapping, index);
				if (unlikely(page == NULL)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
			} else if (PageDirty(page)) {
				/*
				 * khugepaged only works on read-only fd,
				 * so this page is dirty because it hasn't
				 * been flushed since first write. There
				 * won't be new dirty pages.
				 *
				 * Trigger async flush here and hope the
				 * writeback is done when khugepaged
				 * revisits this page.
				 *
				 * This is a one-off situation. We are not
				 * forcing writeback in loop.
				 */
				xas_unlock_irq(&xas);
				filemap_flush(mapping);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (PageWriteback(page)) {
				xas_unlock_irq(&xas);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (trylock_page(page)) {
				get_page(page);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
			}
		}

		/*
		 * The page must be locked, so we can drop the i_pages lock
		 * without racing with truncate.
		 */
		VM_BUG_ON_PAGE(!PageLocked(page), page);

		/* make sure the page is up to date */
		if (unlikely(!PageUptodate(page))) {
			result = SCAN_FAIL;
			goto out_unlock;
		}

		/*
		 * If file was truncated then extended, or hole-punched, before
		 * we locked the first page, then a THP might be there already.
		 * This will be discovered on the first iteration.
		 */
		if (PageTransCompound(page)) {
			struct page *head = compound_head(page);

			result = compound_order(head) == HPAGE_PMD_ORDER &&
					head->index == start
					/* Maybe PMD-mapped */
					? SCAN_PTE_MAPPED_HUGEPAGE
					: SCAN_PAGE_COMPOUND;
			goto out_unlock;
		}

		folio = page_folio(page);

		if (folio_mapping(folio) != mapping) {
			result = SCAN_TRUNCATED;
			goto out_unlock;
		}

		if (!is_shmem && (folio_test_dirty(folio) ||
				  folio_test_writeback(folio))) {
			/*
			 * khugepaged only works on read-only fd, so this
			 * page is dirty because it hasn't been flushed
			 * since first write.
			 */
			result = SCAN_FAIL;
			goto out_unlock;
		}

		if (!folio_isolate_lru(folio)) {
			result = SCAN_DEL_PAGE_LRU;
			goto out_unlock;
		}

		if (!filemap_release_folio(folio, GFP_KERNEL)) {
			result = SCAN_PAGE_HAS_PRIVATE;
			folio_putback_lru(folio);
			goto out_unlock;
		}

		if (folio_mapped(folio))
			try_to_unmap(folio,
					TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);

		xas_lock_irq(&xas);

		VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);

		/*
		 * We control three references to the page:
		 *  - we hold a pin on it;
		 *  - one reference from page cache;
		 *  - one from folio_isolate_lru();
		 * If those are the only references, then any new usage of the
		 * page will have to fetch it from the page cache. That requires
		 * locking the page to handle truncate, so any new usage will be
		 * blocked until we unlock page after collapse/during rollback.
		 */
		if (page_count(page) != 3) {
			result = SCAN_PAGE_COUNT;
			xas_unlock_irq(&xas);
			putback_lru_page(page);
			goto out_unlock;
		}

		/*
		 * Accumulate the pages that are being collapsed.
		 */
		list_add_tail(&page->lru, &pagelist);
		continue;
out_unlock:
		unlock_page(page);
		put_page(page);
		goto xa_unlocked;
	}

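	/*
	 * Every present page in the range is now locked, isolated from the
	 * LRU and queued on "pagelist"; holes are accounted in nr_none.
	 */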
	if (!is_shmem) {
		filemap_nr_thps_inc(mapping);
		/*
		 * Paired with smp_mb() in do_dentry_open() to ensure
		 * i_writecount is up to date and the update to nr_thps is
		 * visible. Ensures the page cache will be truncated if the
		 * file is opened writable.
		 */
		smp_mb();
		if (inode_is_open_for_write(mapping->host)) {
			result = SCAN_FAIL;
			filemap_nr_thps_dec(mapping);
		}
	}

xa_locked:
	xas_unlock_irq(&xas);
xa_unlocked:

	/*
	 * If collapse is successful, flush must be done now before copying.
	 * If collapse is unsuccessful, does flush actually need to be done?
	 * Do it anyway, to clear the state.
	 */
	try_to_unmap_flush();

	if (result == SCAN_SUCCEED && nr_none &&
	    !shmem_charge(mapping->host, nr_none))
		result = SCAN_FAIL;
	if (result != SCAN_SUCCEED) {
		nr_none = 0;
		goto rollback;
	}

	/*
	 * The old pages are locked, so they won't change anymore.
	 */
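	/*
	 * Copy each old page into its slot in the new huge page; gaps in the
	 * list (the nr_none holes) are filled with zeroed subpages, and a
	 * failed machine-check-safe copy aborts the collapse.
	 */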
	index = start;
	list_for_each_entry(page, &pagelist, lru) {
		while (index < page->index) {
			clear_highpage(hpage + (index % HPAGE_PMD_NR));
			index++;
		}
		if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
			result = SCAN_COPY_MC;
			goto rollback;
		}
		index++;
	}
	while (index < end) {
		clear_highpage(hpage + (index % HPAGE_PMD_NR));
		index++;
	}

	if (nr_none) {
		struct vm_area_struct *vma;
		int nr_none_check = 0;

		i_mmap_lock_read(mapping);
		xas_lock_irq(&xas);

		xas_set(&xas, start);
		for (index = start; index < end; index++) {
			if (!xas_next(&xas)) {
				xas_store(&xas, XA_RETRY_ENTRY);
				if (xas_error(&xas)) {
					result = SCAN_STORE_FAILED;
					goto immap_locked;
				}
				nr_none_check++;
			}
		}

		if (nr_none != nr_none_check) {
			result = SCAN_PAGE_FILLED;
			goto immap_locked;
		}

		/*
		 * If userspace observed a missing page in a VMA with a MODE_MISSING
		 * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
		 * page. If so, we need to roll back to avoid suppressing such an
		 * event. Wp/minor userfaultfds don't give userspace any
		 * guarantee that the kernel won't fill a missing page with a
		 * zero page, so they don't matter here.
		 *
		 * Any userfaultfds registered after this point will not be able to
		 * observe any missing pages due to the previously inserted retry
		 * entries.
		 */
		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
			if (userfaultfd_missing(vma)) {
				result = SCAN_EXCEED_NONE_PTE;
				goto immap_locked;
			}
		}

immap_locked:
		i_mmap_unlock_read(mapping);
		if (result != SCAN_SUCCEED) {
			xas_set(&xas, start);
			for (index = start; index < end; index++) {
				if (xas_next(&xas) == XA_RETRY_ENTRY)
					xas_store(&xas, NULL);
			}

			xas_unlock_irq(&xas);
			goto rollback;
		}
	} else {
		xas_lock_irq(&xas);
	}

	nr = thp_nr_pages(hpage);
	if (is_shmem)
		__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
	else
		__mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);

	if (nr_none) {
		__mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
		/* nr_none is always 0 for non-shmem. */
		__mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
	}

	/*
	 * Mark hpage as uptodate before inserting it into the page cache so
	 * that it isn't mistaken for a fallocated but unwritten page.
	 */
	folio = page_folio(hpage);
	folio_mark_uptodate(folio);
	folio_ref_add(folio, HPAGE_PMD_NR - 1);

	if (is_shmem)
		folio_mark_dirty(folio);
	folio_add_lru(folio);

	/* Join all the small entries into a single multi-index entry. */
	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
	xas_store(&xas, hpage);
	WARN_ON_ONCE(xas_error(&xas));
	xas_unlock_irq(&xas);

	/*
	 * Remove pte page tables, so we can re-fault the page as huge.
	 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
	 */
	retract_page_tables(mapping, start);
	if (cc && !cc->is_khugepaged)
		result = SCAN_PTE_MAPPED_HUGEPAGE;
	unlock_page(hpage);

	/*
	 * The collapse has succeeded, so free the old pages.
	 */
	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
		list_del(&page->lru);
		page->mapping = NULL;
		ClearPageActive(page);
		ClearPageUnevictable(page);
		unlock_page(page);
		folio_put_refs(page_folio(page), 3);
	}

	goto out;

rollback:
	/* Something went wrong: roll back page cache changes */
	if (nr_none) {
		xas_lock_irq(&xas);
		mapping->nrpages -= nr_none;
		xas_unlock_irq(&xas);
		shmem_uncharge(mapping->host, nr_none);
	}

	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
		list_del(&page->lru);
		unlock_page(page);
		putback_lru_page(page);
		put_page(page);
	}
	/*
	 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
	 * file only. This undo is not needed unless failure is
	 * due to SCAN_COPY_MC.
	 */
	if (!is_shmem && result == SCAN_COPY_MC) {
		filemap_nr_thps_dec(mapping);
		/*
		 * Paired with smp_mb() in do_dentry_open() to
		 * ensure the update to nr_thps is visible.
		 */
		smp_mb();
	}

	hpage->mapping = NULL;

	unlock_page(hpage);
	put_page(hpage);
out:
	VM_BUG_ON(!list_empty(&pagelist));
	trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
	return result;
}

static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	struct page *page = NULL;
	struct address_space *mapping = file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, start);
	int present, swap;
	int node = NUMA_NO_NODE;
	int result = SCAN_SUCCEED;

	present = 0;
	swap = 0;
	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	rcu_read_lock();
	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
		if (xas_retry(&xas, page))
			continue;

		if (xa_is_value(page)) {
			++swap;
			if (cc->is_khugepaged &&
			    swap > khugepaged_max_ptes_swap) {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				break;
			}
			continue;
		}

		/*
		 * TODO: khugepaged should compact smaller compound pages
		 * into a PMD sized page
		 */
		if (PageTransCompound(page)) {
			struct page *head = compound_head(page);

			result = compound_order(head) == HPAGE_PMD_ORDER &&
					head->index == start
					/* Maybe PMD-mapped */
					? SCAN_PTE_MAPPED_HUGEPAGE
					: SCAN_PAGE_COMPOUND;
			/*
			 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
			 * by the caller won't touch the page cache, and so
			 * it's safe to skip LRU and refcount checks before
			 * returning.
			 */
			break;
		}

		node = page_to_nid(page);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			break;
		}
		cc->node_load[node]++;

		if (!PageLRU(page)) {
			result = SCAN_PAGE_LRU;
			break;
		}

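		/*
		 * Expect one reference from the page cache plus one per PTE
		 * mapping (and one for private data, if any); anything more
		 * means another user holds the page and collapse would fail
		 * on the refcount check later anyway.
		 */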
		if (page_count(page) !=
		    1 + page_mapcount(page) + page_has_private(page)) {
			result = SCAN_PAGE_COUNT;
			break;
		}

		/*
		 * We probably should check if the page is referenced here, but
		 * nobody would transfer pte_young() to PageReferenced() for us.
		 * And rmap walk here is just too costly...
		 */

		present++;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	if (result == SCAN_SUCCEED) {
		if (cc->is_khugepaged &&
		    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
			result = SCAN_EXCEED_NONE_PTE;
			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
		} else {
			result = collapse_file(mm, addr, file, start, cc);
		}
	}

	trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
	return result;
}
#else
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	BUILD_BUG();
}
#endif

static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
					    struct collapse_control *cc)
	__releases(&khugepaged_mm_lock)
	__acquires(&khugepaged_mm_lock)
{
	struct vma_iterator vmi;
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int progress = 0;

	VM_BUG_ON(!pages);
	lockdep_assert_held(&khugepaged_mm_lock);
	*result = SCAN_FAIL;

	if (khugepaged_scan.mm_slot) {
		mm_slot = khugepaged_scan.mm_slot;
		slot = &mm_slot->slot;
	} else {
		slot = list_entry(khugepaged_scan.mm_head.next,
				     struct mm_slot, mm_node);
		mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
		khugepaged_scan.address = 0;
		khugepaged_scan.mm_slot = mm_slot;
	}
	spin_unlock(&khugepaged_mm_lock);

	mm = slot->mm;
	/*
	 * Don't wait for semaphore (to avoid long wait times).  Just move to
	 * the next mm on the list.
	 */
	vma = NULL;
	if (unlikely(!mmap_read_trylock(mm)))
		goto breakouterloop_mmap_lock;

	progress++;
	if (unlikely(hpage_collapse_test_exit(mm)))
		goto breakouterloop;

	vma_iter_init(&vmi, mm, khugepaged_scan.address);
	for_each_vma(vmi, vma) {
		unsigned long hstart, hend;

		cond_resched();
		if (unlikely(hpage_collapse_test_exit(mm))) {
			progress++;
			break;
		}
		if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
skip:
			progress++;
			continue;
		}
		hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
		hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
		if (khugepaged_scan.address > hend)
			goto skip;
		if (khugepaged_scan.address < hstart)
			khugepaged_scan.address = hstart;
		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

		while (khugepaged_scan.address < hend) {
			bool mmap_locked = true;

			cond_resched();
			if (unlikely(hpage_collapse_test_exit(mm)))
				goto breakouterloop;

			VM_BUG_ON(khugepaged_scan.address < hstart ||
				  khugepaged_scan.address + HPAGE_PMD_SIZE >
				  hend);
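			/*
			 * File-backed (and shmem) VMAs are scanned through
			 * the page cache, which means dropping mmap_lock;
			 * anonymous VMAs are scanned via their page tables.
			 */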
			if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
				struct file *file = get_file(vma->vm_file);
				pgoff_t pgoff = linear_page_index(vma,
						khugepaged_scan.address);

				mmap_read_unlock(mm);
				mmap_locked = false;
				*result = hpage_collapse_scan_file(mm,
					khugepaged_scan.address, file, pgoff, cc);
				fput(file);
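				/*
				 * The file already has a THP in its page
				 * cache: retake mmap_lock and try to map it
				 * with a PMD in this mm right away.
				 */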
				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
					mmap_read_lock(mm);
					if (hpage_collapse_test_exit(mm))
						goto breakouterloop;
					*result = collapse_pte_mapped_thp(mm,
						khugepaged_scan.address, false);
					if (*result == SCAN_PMD_MAPPED)
						*result = SCAN_SUCCEED;
					mmap_read_unlock(mm);
				}
			} else {
				*result = hpage_collapse_scan_pmd(mm, vma,
					khugepaged_scan.address, &mmap_locked, cc);
			}

			if (*result == SCAN_SUCCEED)
				++khugepaged_pages_collapsed;

			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
			progress += HPAGE_PMD_NR;
			if (!mmap_locked)
				/*
				 * We released mmap_lock so break loop.  Note
				 * that we drop mmap_lock before all hugepage
				 * allocations, so if allocation fails, we are
				 * guaranteed to break here and report the
				 * correct result back to caller.
				 */
				goto breakouterloop_mmap_lock;
			if (progress >= pages)
				goto breakouterloop;
		}
	}
breakouterloop:
	mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_lock:

	spin_lock(&khugepaged_mm_lock);
	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
	/*
	 * Release the current mm_slot if this mm is about to die, or
	 * if we scanned all vmas of this mm.
	 */
	if (hpage_collapse_test_exit(mm) || !vma) {
		/*
		 * Make sure that if mm_users is reaching zero while
		 * khugepaged runs here, khugepaged_exit will find
		 * mm_slot not pointing to the exiting mm.
		 */
		if (slot->mm_node.next != &khugepaged_scan.mm_head) {
			slot = list_entry(slot->mm_node.next,
					  struct mm_slot, mm_node);
			khugepaged_scan.mm_slot =
				mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
			khugepaged_scan.address = 0;
		} else {
			khugepaged_scan.mm_slot = NULL;
			khugepaged_full_scans++;
		}

		collect_mm_slot(mm_slot);
	}

	return progress;
}

static int khugepaged_has_work(void)
{
	return !list_empty(&khugepaged_scan.mm_head) &&
		hugepage_flags_enabled();
}

static int khugepaged_wait_event(void)
{
	return !list_empty(&khugepaged_scan.mm_head) ||
		kthread_should_stop();
}

static void khugepaged_do_scan(struct collapse_control *cc)
{
	unsigned int progress = 0, pass_through_head = 0;
	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
	bool wait = true;
	int result = SCAN_SUCCEED;

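	/*
	 * Drain the per-CPU LRU-add caches up front so that pages this scan
	 * pass tries to isolate are actually on the LRU lists.
	 */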
	lru_add_drain_all();

	while (true) {
		cond_resched();

		if (unlikely(kthread_should_stop() || try_to_freeze()))
			break;

		spin_lock(&khugepaged_mm_lock);
		if (!khugepaged_scan.mm_slot)
			pass_through_head++;
		if (khugepaged_has_work() &&
		    pass_through_head < 2)
			progress += khugepaged_scan_mm_slot(pages - progress,
							    &result, cc);
		else
			progress = pages;
		spin_unlock(&khugepaged_mm_lock);

		if (progress >= pages)
			break;

		if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
			/*
			 * If we fail to allocate the first time, try to sleep
			 * for a while.  When it happens again, cancel the scan.
			 */
			if (!wait)
				break;
			wait = false;
			khugepaged_alloc_sleep();
		}
	}
}

static bool khugepaged_should_wakeup(void)
{
	return kthread_should_stop() ||
	       time_after_eq(jiffies, khugepaged_sleep_expire);
}

static void khugepaged_wait_work(void)
{
	if (khugepaged_has_work()) {
		const unsigned long scan_sleep_jiffies =
			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);

		if (!scan_sleep_jiffies)
			return;

		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
		wait_event_freezable_timeout(khugepaged_wait,
					     khugepaged_should_wakeup(),
					     scan_sleep_jiffies);
		return;
	}

	if (hugepage_flags_enabled())
		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}

static int khugepaged(void *none)
{
	struct khugepaged_mm_slot *mm_slot;

	set_freezable();
	set_user_nice(current, MAX_NICE);

	while (!kthread_should_stop()) {
		khugepaged_do_scan(&khugepaged_collapse_control);
		khugepaged_wait_work();
	}

	spin_lock(&khugepaged_mm_lock);
	mm_slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
	if (mm_slot)
		collect_mm_slot(mm_slot);
	spin_unlock(&khugepaged_mm_lock);
	return 0;
}

static void set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	if (!hugepage_flags_enabled()) {
		calculate_min_free_kbytes();
		goto update_wmarks;
	}

	for_each_populated_zone(zone) {
		/*
		 * We don't need to worry about fragmentation of
		 * ZONE_MOVABLE since it only has movable pages.
		 */
		if (zone_idx(zone) > gfp_zone(GFP_USER))
			continue;

		nr_zones++;
	}

	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types.  There are 3
	 * MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* don't ever allow reserving more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes) {
		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
				min_free_kbytes, recommended_min);

		min_free_kbytes = recommended_min;
	}

update_wmarks:
	setup_per_zone_wmarks();
}

int start_stop_khugepaged(void)
{
	int err = 0;

	mutex_lock(&khugepaged_mutex);
	if (hugepage_flags_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
			goto fail;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}
	set_recommended_min_free_kbytes();
fail:
	mutex_unlock(&khugepaged_mutex);
	return err;
}

void khugepaged_min_free_kbytes_update(void)
{
	mutex_lock(&khugepaged_mutex);
	if (hugepage_flags_enabled() && khugepaged_thread)
		set_recommended_min_free_kbytes();
	mutex_unlock(&khugepaged_mutex);
}

bool current_is_khugepaged(void)
{
	return kthread_func(current) == khugepaged;
}

static int madvise_collapse_errno(enum scan_result r)
{
	/*
	 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
	 * actionable feedback to caller, so they may take an appropriate
	 * fallback measure depending on the nature of the failure.
	 */
	switch (r) {
	case SCAN_ALLOC_HUGE_PAGE_FAIL:
		return -ENOMEM;
	case SCAN_CGROUP_CHARGE_FAIL:
	case SCAN_EXCEED_NONE_PTE:
		return -EBUSY;
	/* Resource temporarily unavailable - trying again might succeed */
	case SCAN_PAGE_COUNT:
	case SCAN_PAGE_LOCK:
	case SCAN_PAGE_LRU:
	case SCAN_DEL_PAGE_LRU:
	case SCAN_PAGE_FILLED:
		return -EAGAIN;
	/*
	 * Other: Trying again likely not to succeed / error intrinsic to
	 * specified memory range. khugepaged likely won't be able to collapse
	 * either.
	 */
	default:
		return -EINVAL;
	}
}

int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
		     unsigned long start, unsigned long end)
{
	struct collapse_control *cc;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long hstart, hend, addr;
	int thps = 0, last_fail = SCAN_FAIL;
	bool mmap_locked = true;

	BUG_ON(vma->vm_start > start);
	BUG_ON(vma->vm_end < end);

	*prev = vma;

	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
		return -EINVAL;

	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
	if (!cc)
		return -ENOMEM;
	cc->is_khugepaged = false;

	mmgrab(mm);
	lru_add_drain_all();

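	/* Only PMD-aligned, PMD-sized subranges of [start, end) can collapse. */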
	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = end & HPAGE_PMD_MASK;

	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
		int result = SCAN_FAIL;

		if (!mmap_locked) {
			cond_resched();
			mmap_read_lock(mm);
			mmap_locked = true;
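			/*
			 * The VMA may have changed or gone away while
			 * mmap_lock was dropped, so look it up again and
			 * re-check it before continuing at this address.
			 */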
			result = hugepage_vma_revalidate(mm, addr, false, &vma,
							 cc);
			if (result != SCAN_SUCCEED) {
				last_fail = result;
				goto out_nolock;
			}

			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
		}
		mmap_assert_locked(mm);
		memset(cc->node_load, 0, sizeof(cc->node_load));
		nodes_clear(cc->alloc_nmask);
		if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
			struct file *file = get_file(vma->vm_file);
			pgoff_t pgoff = linear_page_index(vma, addr);

			mmap_read_unlock(mm);
			mmap_locked = false;
			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
							  cc);
			fput(file);
		} else {
			result = hpage_collapse_scan_pmd(mm, vma, addr,
							 &mmap_locked, cc);
		}
		if (!mmap_locked)
			*prev = NULL;  /* Tell caller we dropped mmap_lock */

handle_result:
		switch (result) {
		case SCAN_SUCCEED:
		case SCAN_PMD_MAPPED:
			++thps;
			break;
		case SCAN_PTE_MAPPED_HUGEPAGE:
			BUG_ON(mmap_locked);
			BUG_ON(*prev);
			mmap_read_lock(mm);
			result = collapse_pte_mapped_thp(mm, addr, true);
			mmap_read_unlock(mm);
			goto handle_result;
		/* Whitelisted set of results where continuing OK */
		case SCAN_PMD_NULL:
		case SCAN_PTE_NON_PRESENT:
		case SCAN_PTE_UFFD_WP:
		case SCAN_PAGE_RO:
		case SCAN_LACK_REFERENCED_PAGE:
		case SCAN_PAGE_NULL:
		case SCAN_PAGE_COUNT:
		case SCAN_PAGE_LOCK:
		case SCAN_PAGE_COMPOUND:
		case SCAN_PAGE_LRU:
		case SCAN_DEL_PAGE_LRU:
			last_fail = result;
			break;
		default:
			last_fail = result;
			/* Other error, exit */
			goto out_maybelock;
		}
	}

out_maybelock:
	/* Caller expects us to hold mmap_lock on return */
	if (!mmap_locked)
		mmap_read_lock(mm);
out_nolock:
	mmap_assert_locked(mm);
	mmdrop(mm);
	kfree(cc);

	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
			: madvise_collapse_errno(last_fail);
}