// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>
#include <linux/bootmem_info.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820/api.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/set_memory.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>
#include <asm/ftrace.h>

#include "mm_internal.h"

#include "ident_map.c"

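/*
 * Helpers used below: when 'init' is true (while the initial direct mapping
 * is being built), pick the *_safe variant of a page-table setter, which
 * checks that a present entry is not silently overwritten; otherwise use the
 * plain setter.
 */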
#define DEFINE_POPULATE(fname, type1, type2, init)		\
static inline void fname##_init(struct mm_struct *mm,		\
		type1##_t *arg1, type2##_t *arg2, bool init)	\
{								\
	if (init)						\
		fname##_safe(mm, arg1, arg2);			\
	else							\
		fname(mm, arg1, arg2);				\
}

DEFINE_POPULATE(p4d_populate, p4d, pud, init)
DEFINE_POPULATE(pgd_populate, pgd, p4d, init)
DEFINE_POPULATE(pud_populate, pud, pmd, init)
DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init)

#define DEFINE_ENTRY(type1, type2, init)			\
static inline void set_##type1##_init(type1##_t *arg1,		\
			type2##_t arg2, bool init)		\
{								\
	if (init)						\
		set_##type1##_safe(arg1, arg2);			\
	else							\
		set_##type1(arg1, arg2);			\
}

DEFINE_ENTRY(p4d, p4d, init)
DEFINE_ENTRY(pud, pud, init)
DEFINE_ENTRY(pmd, pmd, init)
DEFINE_ENTRY(pte, pte, init)


/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */

/* Bits supported by the hardware: */
pteval_t __supported_pte_mask __read_mostly = ~0;
/* Bits allowed in normal kernel mappings: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
EXPORT_SYMBOL(__default_kernel_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control the non-executable heap for 32-bit processes.
 * To control the stack too, use noexec=off.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

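/*
 * With 5-level paging the kernel address range is covered by PGD entries:
 * copy any kernel PGD entries that are new in [start, end] into every page
 * table on pgd_list.
 */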
static void sync_global_pgds_l5(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		const pgd_t *pgd_ref = pgd_offset_k(addr);
		struct page *page;

		/* Check for overflow */
		if (addr < start)
			break;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

static void sync_global_pgds_l4(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		pgd_t *pgd_ref = pgd_offset_k(addr);
		const p4d_t *p4d_ref;
		struct page *page;

		/*
		 * With folded p4d, pgd_none() is always false, we need to
		 * handle synchronization on p4d level.
		 */
		MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
		p4d_ref = p4d_offset(pgd_ref, addr);

		if (p4d_none(*p4d_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			p4d_t *p4d;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			p4d = p4d_offset(pgd, addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
				BUG_ON(p4d_pgtable(*p4d)
				       != p4d_pgtable(*p4d_ref));

			if (p4d_none(*p4d))
				set_p4d(p4d, *p4d_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * When memory is added, make sure all process MMs have suitable PGD
 * entries in the local PGD-level page.
 */
static void sync_global_pgds(unsigned long start, unsigned long end)
{
	if (pgtable_l5_enabled())
		sync_global_pgds_l5(start, end);
	else
		sync_global_pgds_l4(start, end);
}

/*
 * NOTE: This function is marked __ref because it calls the __init function
 * memblock_alloc(). It's safe to do so ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

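/*
 * fill_p4d()/fill_pud()/fill_pmd()/fill_pte(): allocate the next page-table
 * level via spp_getpage() if it is not present yet, and return the entry
 * that covers vaddr.
 */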
static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		p4d_t *p4d = (p4d_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, p4d);
		if (p4d != p4d_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       p4d, p4d_offset(pgd, 0));
	}
	return p4d_offset(pgd, vaddr);
}

static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
{
	if (p4d_none(*p4d)) {
		pud_t *pud = (pud_t *)spp_getpage();
		p4d_populate(&init_mm, p4d, pud);
		if (pud != pud_offset(p4d, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pud, pud_offset(p4d, 0));
	}
	return pud_offset(p4d, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #03!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
{
	pmd_t *pmd = fill_pmd(pud, vaddr);
	pte_t *pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	flush_tlb_one_kernel(vaddr);
}

void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
{
	p4d_t *p4d = p4d_page + p4d_index(vaddr);
	pud_t *pud = fill_pud(p4d, vaddr);

	__set_pte_vaddr(pud, vaddr, new_pte);
}

void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud = pud_page + pud_index(vaddr);

	__set_pte_vaddr(pud, vaddr, new_pte);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	p4d_t *p4d_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}

	p4d_page = p4d_offset(pgd, 0);
	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
}

pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	p4d = fill_p4d(pgd, vaddr);
	pud = fill_pud(p4d, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
					enum page_cache_mode cache)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pgprot_t prot;

	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
		protval_4k_2_large(cachemode2protval(cache));
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			p4d = (p4d_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
		if (p4d_none(*p4d)) {
			pud = (pud_t *) spp_getpage();
			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(p4d, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_base holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	/*
	 * Native path, max_pfn_mapped is not set yet.
	 * Xen has valid max_pfn_mapped set in
	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
	 */
	if (max_pfn_mapped)
		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

/*
 * Create PTE level page table mapping for physical addresses.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
	      pgprot_t prot, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	pte_t *pte;
	int i;

	pte = pte_page + pte_index(paddr);
	i = pte_index(paddr);

	for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
		paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pte_init(pte, __pte(0), init);
			continue;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume that whoever pre-set up
		 * these mappings knew what they were doing.
		 */
		if (!pte_none(*pte)) {
			if (!after_bootmem)
				pages++;
			continue;
		}

		if (0)
			pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return paddr_last;
}

/*
 * Create PMD level page table mapping for physical addresses. The virtual
 * and physical address have to be aligned at this level.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t prot, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;

	int i = pmd_index(paddr);

	for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
		pmd_t *pmd = pmd_page + pmd_index(paddr);
		pte_t *pte;
		pgprot_t new_prot = prot;

		paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pmd_init(pmd, __pmd(0), init);
			continue;
		}

		if (!pmd_none(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = (pte_t *)pmd_page_vaddr(*pmd);
				paddr_last = phys_pte_init(pte, paddr,
							   paddr_end, prot,
							   init);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte_init((pte_t *)pmd,
				     pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
					     __pgprot(pgprot_val(prot) | _PAGE_PSE)),
				     init);
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pte = alloc_low_page();
		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel_init(&init_mm, pmd, pte, init);
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return paddr_last;
}

/*
 * Create PUD level page table mapping for physical addresses. The virtual
 * and physical address do not have to be aligned at this level. KASLR can
 * randomize virtual addresses up to this level.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t _prot, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = pud_index(vaddr);

	for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
		pud_t *pud;
		pmd_t *pmd;
		pgprot_t prot = _prot;

		vaddr = (unsigned long)__va(paddr);
		pud = pud_page + pud_index(vaddr);
		paddr_next = (paddr & PUD_MASK) + PUD_SIZE;

		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pud_init(pud, __pud(0), init);
			continue;
		}

		if (!pud_none(*pud)) {
			if (!pud_large(*pud)) {
				pmd = pmd_offset(pud, 0);
				paddr_last = phys_pmd_init(pmd, paddr,
							   paddr_end,
							   page_size_mask,
							   prot, init);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);

			prot = __pgprot(pgprot_val(prot) | __PAGE_KERNEL_LARGE);

			set_pte_init((pte_t *)pud,
				     pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
					     prot),
				     init);
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pmd = alloc_low_page();
		paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
					   page_size_mask, prot, init);

		spin_lock(&init_mm.page_table_lock);
		pud_populate_init(&init_mm, pud, pmd, init);
		spin_unlock(&init_mm.page_table_lock);
	}

	update_page_count(PG_LEVEL_1G, pages);

	return paddr_last;
}

static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t prot, bool init)
{
	unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;

	paddr_last = paddr_end;
	vaddr = (unsigned long)__va(paddr);
	vaddr_end = (unsigned long)__va(paddr_end);

	if (!pgtable_l5_enabled())
		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
				     page_size_mask, prot, init);

	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		p4d_t *p4d = p4d_page + p4d_index(vaddr);
		pud_t *pud;

		vaddr_next = (vaddr & P4D_MASK) + P4D_SIZE;
		paddr = __pa(vaddr);

		if (paddr >= paddr_end) {
			paddr_next = __pa(vaddr_next);
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_p4d_init(p4d, __p4d(0), init);
			continue;
		}

		if (!p4d_none(*p4d)) {
			pud = pud_offset(p4d, 0);
			paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
					page_size_mask, prot, init);
			continue;
		}

		pud = alloc_low_page();
		paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
					   page_size_mask, prot, init);

		spin_lock(&init_mm.page_table_lock);
		p4d_populate_init(&init_mm, p4d, pud, init);
		spin_unlock(&init_mm.page_table_lock);
	}

	return paddr_last;
}

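/*
 * Worker for kernel_physical_mapping_init() and
 * kernel_physical_mapping_change(): walk [paddr_start, paddr_end) one PGD
 * range at a time and build the direct mapping, using the *_safe setters
 * only when 'init' is true.
 */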
static unsigned long __meminit
__kernel_physical_mapping_init(unsigned long paddr_start,
			       unsigned long paddr_end,
			       unsigned long page_size_mask,
			       pgprot_t prot, bool init)
{
	bool pgd_changed = false;
	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;

	paddr_last = paddr_end;
	vaddr = (unsigned long)__va(paddr_start);
	vaddr_end = (unsigned long)__va(paddr_end);
	vaddr_start = vaddr;

	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		pgd_t *pgd = pgd_offset_k(vaddr);
		p4d_t *p4d;

		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;

		if (pgd_val(*pgd)) {
			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
						   __pa(vaddr_end),
						   page_size_mask,
						   prot, init);
			continue;
		}

		p4d = alloc_low_page();
		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
					   page_size_mask, prot, init);

		spin_lock(&init_mm.page_table_lock);
		if (pgtable_l5_enabled())
			pgd_populate_init(&init_mm, pgd, p4d, init);
		else
			p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr),
					  (pud_t *) p4d, init);

		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(vaddr_start, vaddr_end - 1);

	return paddr_last;
}


/*
 * Create page table mapping for the physical memory for specific physical
 * addresses. Note that it can only be used to populate non-present entries.
 * The virtual and physical addresses have to be aligned on PMD level
 * down. It returns the last physical address mapped.
 */
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
			     unsigned long paddr_end,
			     unsigned long page_size_mask, pgprot_t prot)
{
	return __kernel_physical_mapping_init(paddr_start, paddr_end,
					      page_size_mask, prot, true);
}

/*
 * This function is similar to kernel_physical_mapping_init() above with the
 * exception that it uses set_{pud,pmd}() instead of the set_{pud,pte}_safe()
 * when updating the mapping. The caller is responsible for flushing the TLBs
 * after the function returns.
 */
unsigned long __meminit
kernel_physical_mapping_change(unsigned long paddr_start,
			       unsigned long paddr_end,
			       unsigned long page_size_mask)
{
	return __kernel_physical_mapping_init(paddr_start, paddr_end,
					      page_size_mask, PAGE_KERNEL,
					      false);
}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
	memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
}
#endif

void __init paging_init(void)
{
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_MEMORY);
	node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which has not yet been memset to PAGE_UNUSED,
 * ranges from unused_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_pmd_start __meminitdata;

static void __meminit vmemmap_flush_unused_pmd(void)
{
	if (!unused_pmd_start)
		return;
	/*
	 * Clears (unused_pmd_start, PMD_END]
	 */
	memset((void *)unused_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
	unused_pmd_start = 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/* Returns true if the PMD is completely unused and thus it can be freed */
static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);

	/*
	 * Flush the unused range cache to ensure that memchr_inv() will work
	 * for the whole range.
	 */
	vmemmap_flush_unused_pmd();
	memset((void *)addr, PAGE_UNUSED, end - addr);

	return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
}
#endif

static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed when removing some other adjacent memmap (just in
	 * case the first memmap never gets initialized e.g., because the memory
	 * block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_pmd_start == start) {
		if (likely(IS_ALIGNED(end, PMD_SIZE)))
			unused_pmd_start = 0;
		else
			unused_pmd_start = end;
		return;
	}

	/*
	 * If the range does not contiguously follow the previous one, make sure
	 * to mark the unused range of the previous one so it can be removed.
	 */
	vmemmap_flush_unused_pmd();
	__vmemmap_use_sub_pmd(start);
}


static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	vmemmap_flush_unused_pmd();

	/*
	 * Could be our memmap page is filled with PAGE_UNUSED already from a
	 * previous remove. Make sure to reset it.
	 */
	__vmemmap_use_sub_pmd(start);

	/*
	 * Mark with PAGE_UNUSED the unused parts of the new memmap range
	 */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)start, PAGE_UNUSED,
			start - ALIGN_DOWN(start, PMD_SIZE));

	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD where the
	 * unused range begins.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_pmd_start = end;
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
	      struct mhp_params *params)
{
	int ret;

	ret = __add_pages(nid, start_pfn, nr_pages, params);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
				  nr_pages << PAGE_SHIFT);

	return ret;
}

int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_params *params)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	init_memory_mapping(start, start + size, params->pgprot);

	return add_pages(nid, start_pfn, nr_pages, params);
}

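/*
 * Free a page-table page: pages still carrying the bootmem reserved flag are
 * released through their bootmem info (or free_reserved_page()), everything
 * else goes back to the page allocator.
 */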
static void __meminit free_pagetable(struct page *page, int order)
{
	unsigned long magic;
	unsigned int nr_pages = 1 << order;

	/* bootmem page has reserved flag */
	if (PageReserved(page)) {
		__ClearPageReserved(page);

		magic = page->index;
		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			while (nr_pages--)
				put_page_bootmem(page++);
		} else
			while (nr_pages--)
				free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
}

static void __meminit free_hugepage_table(struct page *page,
		struct vmem_altmap *altmap)
{
	if (altmap)
		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
	else
		free_pagetable(page, get_order(PMD_SIZE));
}

static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	/* free a PTE table */
	free_pagetable(pmd_page(*pmd), 0);
	spin_lock(&init_mm.page_table_lock);
	pmd_clear(pmd);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	/* free a PMD table */
	free_pagetable(pud_page(*pud), 0);
	spin_lock(&init_mm.page_table_lock);
	pud_clear(pud);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	/* free a PUD table */
	free_pagetable(p4d_page(*p4d), 0);
	spin_lock(&init_mm.page_table_lock);
	p4d_clear(p4d);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
		 bool direct)
{
	unsigned long next, pages = 0;
	pte_t *pte;
	phys_addr_t phys_addr;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		/*
		 * We mapped [0,1G) memory as identity mapping when
		 * initializing, in arch/x86/kernel/head_64.S. These
		 * pagetables cannot be removed.
		 */
		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
		if (phys_addr < (phys_addr_t)0x40000000)
			return;

		if (!direct)
			free_pagetable(pte_page(*pte), 0);

		spin_lock(&init_mm.page_table_lock);
		pte_clear(&init_mm, addr, pte);
		spin_unlock(&init_mm.page_table_lock);

		/* For non-direct mapping, pages means nothing. */
		pages++;
	}

	/* Call free_pte_table() in remove_pmd_table(). */
	flush_tlb_all();
	if (direct)
		update_page_count(PG_LEVEL_4K, -pages);
}

static void __meminit
remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
		 bool direct, struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_large(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				if (!direct)
					free_hugepage_table(pmd_page(*pmd),
							    altmap);

				spin_lock(&init_mm.page_table_lock);
				pmd_clear(pmd);
				spin_unlock(&init_mm.page_table_lock);
				pages++;
			}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
			else if (vmemmap_pmd_is_unused(addr, next)) {
					free_hugepage_table(pmd_page(*pmd),
							    altmap);
					spin_lock(&init_mm.page_table_lock);
					pmd_clear(pmd);
					spin_unlock(&init_mm.page_table_lock);
			}
#endif
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct);
		free_pte_table(pte_base, pmd);
	}

	/* Call free_pmd_table() in remove_pud_table(). */
	if (direct)
		update_page_count(PG_LEVEL_2M, -pages);
}

static void __meminit
remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
		 struct vmem_altmap *altmap, bool direct)
{
	unsigned long next, pages = 0;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_large(*pud) &&
		    IS_ALIGNED(addr, PUD_SIZE) &&
		    IS_ALIGNED(next, PUD_SIZE)) {
			spin_lock(&init_mm.page_table_lock);
			pud_clear(pud);
			spin_unlock(&init_mm.page_table_lock);
			pages++;
			continue;
		}

		pmd_base = pmd_offset(pud, 0);
		remove_pmd_table(pmd_base, addr, next, direct, altmap);
		free_pmd_table(pmd_base, pud);
	}

	if (direct)
		update_page_count(PG_LEVEL_1G, -pages);
}

static void __meminit
remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
		 struct vmem_altmap *altmap, bool direct)
{
	unsigned long next, pages = 0;
	pud_t *pud_base;
	p4d_t *p4d;

	p4d = p4d_start + p4d_index(addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);

		if (!p4d_present(*p4d))
			continue;

		BUILD_BUG_ON(p4d_large(*p4d));

		pud_base = pud_offset(p4d, 0);
		remove_pud_table(pud_base, addr, next, altmap, direct);
		/*
		 * For 4-level page tables we do not want to free PUDs, but in the
		 * 5-level case we should free them. This code will have to change
		 * to adapt for boot-time switching between 4 and 5 level page tables.
		 */
		if (pgtable_l5_enabled())
			free_pud_table(pud_base, p4d);
	}

	if (direct)
		update_page_count(PG_LEVEL_512G, -pages);
}

/* start and end are both virtual addresses. */
static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct,
		struct vmem_altmap *altmap)
{
	unsigned long next;
	unsigned long addr;
	pgd_t *pgd;
	p4d_t *p4d;

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		p4d = p4d_offset(pgd, 0);
		remove_p4d_table(p4d, addr, next, altmap, direct);
	}

	flush_tlb_all();
}

void __ref vmemmap_free(unsigned long start, unsigned long end,
		struct vmem_altmap *altmap)
{
	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));

	remove_pagetable(start, end, false, altmap);
}

static void __meminit
kernel_physical_mapping_remove(unsigned long start, unsigned long end)
{
	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	remove_pagetable(start, end, true, NULL);
}

void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	__remove_pages(start_pfn, nr_pages, altmap);
	kernel_physical_mapping_remove(start, start + size);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

static void __init register_page_bootmem_info(void)
{
#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)
	int i;

	for_each_online_node(i)
		register_page_bootmem_info_node(NODE_DATA(i));
#endif
}

/*
 * Pre-allocates page-table pages for the vmalloc area in the kernel page-table.
 * Only the level which needs to be synchronized between all page-tables is
 * allocated because the synchronization can be expensive.
 */
static void __init preallocate_vmalloc_pages(void)
{
	unsigned long addr;
	const char *lvl;

	for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		pgd_t *pgd = pgd_offset_k(addr);
		p4d_t *p4d;
		pud_t *pud;

		lvl = "p4d";
		p4d = p4d_alloc(&init_mm, pgd, addr);
		if (!p4d)
			goto failed;

		if (pgtable_l5_enabled())
			continue;

		/*
		 * The goal here is to allocate all possibly required
		 * hardware page tables pointed to by the top hardware
		 * level.
		 *
		 * On 4-level systems, the P4D layer is folded away and
		 * the above code does no preallocation.  Below, go down
		 * to the pud _software_ level to ensure the second
		 * hardware level is allocated on 4-level systems too.
		 */
		lvl = "pud";
		pud = pud_alloc(&init_mm, p4d, addr);
		if (!pud)
			goto failed;
	}

	return;

failed:

	/*
	 * The pages have to be there now or they will be missing in
	 * process page-tables later.
	 */
	panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
}

void __init mem_init(void)
{
	pci_iommu_alloc();

	/* clear_bss() has already cleared the empty_zero_page */

	/* this will put all memory onto the freelists */
	memblock_free_all();
	after_bootmem = 1;
	x86_init.hyper.init_after_bootmem();

	/*
	 * Must be done after boot memory is put on freelist, because here we
	 * might set fields in deferred struct pages that have not yet been
	 * initialized, and memblock_free_all() initializes all the reserved
	 * deferred pages for us.
	 */
	register_page_bootmem_info();

	/* Register memory areas for /proc/kcore */
	if (get_gate_vma(&init_mm))
		kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);

	preallocate_vmalloc_pages();
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	/*
	 * More CPUs always led to greater speedups on tested systems, up to
	 * all the nodes' CPUs.  Use all since the system is otherwise idle
	 * now.
	 */
	return max_t(int, cpumask_weight(node_cpumask), 1);
}
#endif

int kernel_set_to_readonly;

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
	unsigned long end = (unsigned long)__end_rodata_hpage_align;
	unsigned long text_end = PFN_ALIGN(_etext);
	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
	unsigned long all_end;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata/data/bss/brk section (but not the kernel text!)
	 * should also be not-executable.
	 *
	 * We align all_end to PMD_SIZE because the existing mapping
	 * is a full PMD. If we aligned _brk_end to PAGE_SIZE we would
	 * split the PMD, and the remainder between _brk_end and the end
	 * of the PMD would remain mapped executable.
	 *
	 * Any PMD which was set up after the one which covers _brk_end
	 * has been zapped already via cleanup_highmap().
	 */
	all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
	set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);

	set_ftrace_ops_ro();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_kernel_image_pages("unused kernel image (text/rodata gap)",
				(void *)text_end, (void *)rodata_start);
	free_kernel_image_pages("unused kernel image (rodata/data gap)",
				(void *)rodata_end, (void *)_sdata);

	debug_checkwx();
}

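/*
 * Return non-zero if 'addr' is a canonical kernel virtual address that is
 * backed by a present mapping, at whatever level (1G, 2M or 4K) maps it.
 */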
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return 0;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return pfn_valid(pud_pfn(*pud));

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * Block size is the minimum amount of memory which can be hotplugged or
 * hotremoved. It must be power of two and must be equal or larger than
 * MIN_MEMORY_BLOCK_SIZE.
 */
#define MAX_BLOCK_SIZE (2UL << 30)

/* Amount of ram needed to start using large blocks */
#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30)

/* Adjustable memory block size */
static unsigned long set_memory_block_size;
int __init set_memory_block_size_order(unsigned int order)
{
	unsigned long size = 1UL << order;

	if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE)
		return -EINVAL;

	set_memory_block_size = size;
	return 0;
}

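/*
 * Pick the memory block size used for hotplug: a size set via
 * set_memory_block_size_order() wins; otherwise choose between
 * MIN_MEMORY_BLOCK_SIZE and MAX_BLOCK_SIZE based on the amount of boot
 * memory, whether we run on a hypervisor, and the alignment of the end of
 * boot memory.
 */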
static unsigned long probe_memory_block_size(void)
{
	unsigned long boot_mem_end = max_pfn << PAGE_SHIFT;
	unsigned long bz;

	/* If memory block size has been set, then use it */
	bz = set_memory_block_size;
	if (bz)
		goto done;

	/* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */
	if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) {
		bz = MIN_MEMORY_BLOCK_SIZE;
		goto done;
	}

	/*
	 * Use max block size to minimize overhead on bare metal, where
	 * alignment for memory hotplug isn't a concern.
	 */
	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		bz = MAX_BLOCK_SIZE;
		goto done;
	}

	/* Find the largest allowed block size that aligns to memory end */
	for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
		if (IS_ALIGNED(boot_mem_end, bz))
			break;
	}
done:
	pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);

	return bz;
}

static unsigned long memory_block_size_probed;
unsigned long memory_block_size_bytes(void)
{
	if (!memory_block_size_probed)
		memory_block_size_probed = probe_memory_block_size();

	return memory_block_size_probed;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

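/*
 * Populate the vmemmap for [start, end) with 2MB pages where possible.  When
 * no PMD-sized block can be allocated, fall back to base pages, except for
 * altmap allocations, which have no fallback.
 */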
static int __meminit vmemmap_populate_hugepages(unsigned long start,
		unsigned long end, int node, struct vmem_altmap *altmap)
{
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		p4d = vmemmap_p4d_populate(pgd, addr, node);
		if (!p4d)
			return -ENOMEM;

		pud = vmemmap_pud_populate(p4d, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			void *p;

			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
			if (p) {
				pte_t entry;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;

				if (!IS_ALIGNED(addr, PMD_SIZE) ||
				    !IS_ALIGNED(next, PMD_SIZE))
					vmemmap_use_new_sub_pmd(addr, next);

				continue;
			} else if (altmap)
				return -ENOMEM; /* no fallback */
		} else if (pmd_large(*pmd)) {
			vmemmap_verify((pte_t *)pmd, node, addr, next);
			vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		if (vmemmap_populate_basepages(addr, next, node, NULL))
			return -ENOMEM;
	}
	return 0;
}

int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
		struct vmem_altmap *altmap)
{
	int err;

	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));

	if (end - start < PAGES_PER_SECTION * sizeof(struct page))
		err = vmemmap_populate_basepages(start, end, node, NULL);
	else if (boot_cpu_has(X86_FEATURE_PSE))
		err = vmemmap_populate_hugepages(start, end, node, altmap);
	else if (altmap) {
		pr_err_once("%s: no cpu support for altmap allocations\n",
				__func__);
		err = -ENOMEM;
	} else
		err = vmemmap_populate_basepages(start, end, node, NULL);
	if (!err)
		sync_global_pgds(start, end - 1);
	return err;
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
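/*
 * Mark the page-table pages that back this memmap section with bootmem info
 * (SECTION_INFO / MIX_SECTION_INFO) so that they can be recognised when the
 * section is removed.
 */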
void register_page_bootmem_memmap(unsigned long section_nr,
				  struct page *start_page, unsigned long nr_pages)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + nr_pages);
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	unsigned int nr_pmd_pages;
	struct page *page;

	for (; addr < end; addr = next) {
		pte_t *pte = NULL;

		pgd = pgd_offset_k(addr);
		if (pgd_none(*pgd)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);

		p4d = p4d_offset(pgd, addr);
		if (p4d_none(*p4d)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);

		pud = pud_offset(p4d, addr);
		if (pud_none(*pud)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);

		if (!boot_cpu_has(X86_FEATURE_PSE)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd))
				continue;
			get_page_bootmem(section_nr, pmd_page(*pmd),
					 MIX_SECTION_INFO);

			pte = pte_offset_kernel(pmd, addr);
			if (pte_none(*pte))
				continue;
			get_page_bootmem(section_nr, pte_page(*pte),
					 SECTION_INFO);
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd))
				continue;

			nr_pmd_pages = 1 << get_order(PMD_SIZE);
			page = pmd_page(*pmd);
			while (nr_pmd_pages--)
				get_page_bootmem(section_nr, page++,
						 SECTION_INFO);
		}
	}
}
#endif

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif