e820.c 35.7 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
 * Low level x86 E820 memory map handling functions.
4
 *
5 6
 * The firmware and bootloader pass us the "E820 table", which is the primary
 * physical memory layout description available about x86 systems.
7
 *
8 9 10
 * The kernel takes the E820 memory layout and optionally modifies it with
 * quirks and other tweaks, and feeds that into the generic Linux memory
 * allocation code routines via a platform independent interface (memblock, etc.).
11
 */
12
#include <linux/crash_dump.h>
13
#include <linux/memblock.h>
14
#include <linux/suspend.h>
15
#include <linux/acpi.h>
16
#include <linux/firmware-map.h>
17
#include <linux/sort.h>
18
#include <linux/memory_hotplug.h>
19

20
#include <asm/e820/api.h>
21 22
#include <asm/setup.h>

23
/*
24
 * We organize the E820 table into three main data structures:
25
 *
26 27 28 29
 * - 'e820_table_firmware': the original firmware version passed to us by the
 *   bootloader - not modified by the kernel. It is composed of two parts:
 *   the first 128 E820 memory entries in boot_params.e820_table and the remaining
 *   (if any) entries of the SETUP_E820_EXT nodes. We use this to:
30 31 32 33 34 35 36
 *
 *       - inform the user about the firmware's notion of memory layout
 *         via /sys/firmware/memmap
 *
 *       - the hibernation code uses it to generate a kernel-independent MD5
 *         fingerprint of the physical memory layout of a system.
 *
37 38 39 40 41 42 43 44
 * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
 *   passed to us by the bootloader - the major difference between
 *   e820_table_firmware[] and this one is that, the latter marks the setup_data
 *   list created by the EFI boot stub as reserved, so that kexec can reuse the
 *   setup_data information in the second kernel. Besides, e820_table_kexec[]
 *   might also be modified by the kexec itself to fake a mptable.
 *   We use this to:
 *
45
 *       - kexec, which is a bootloader in disguise, uses the original E820
46
 *         layout to pass to the kexec-ed kernel. This way the original kernel
47
 *         can have a restricted E820 map while the kexec()-ed kexec-kernel
48 49
 *         can have access to full memory - etc.
 *
50
 * - 'e820_table': this is the main E820 table that is massaged by the
51 52 53
 *   low level x86 platform code, or modified by boot parameters, before
 *   passed on to higher level MM layers.
 *
54
 * Once the E820 map has been converted to the standard Linux memory layout
55 56 57
 * information its role stops - modifying it has no effect and does not get
 * re-propagated. So its main role is as temporary bootstrap storage of firmware
 * specific memory layout data during early bootup.
58
 */
59
static struct e820_table e820_table_init		__initdata;
60
static struct e820_table e820_table_kexec_init		__initdata;
61
static struct e820_table e820_table_firmware_init	__initdata;
62 63

struct e820_table *e820_table __refdata			= &e820_table_init;
64
struct e820_table *e820_table_kexec __refdata		= &e820_table_kexec_init;
65
struct e820_table *e820_table_firmware __refdata	= &e820_table_firmware_init;
66 67 68 69 70 71 72 73 74 75 76

/*
 * For PCI or other memory-mapped resources.
 *
 * Starts out as a placeholder value; e820__setup_pci_gap() overwrites it
 * with the start address of the gap it selected. Only exported to modules
 * when CONFIG_PCI is enabled.
 */
unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif

/*
 * This function checks if any part of the range <start,end> is mapped
 * with type.
 */
77 78
static bool _e820__mapped_any(struct e820_table *table,
			      u64 start, u64 end, enum e820_type type)
79 80 81
{
	int i;

82 83
	for (i = 0; i < table->nr_entries; i++) {
		struct e820_entry *entry = &table->entries[i];
84

85
		if (type && entry->type != type)
86
			continue;
87
		if (entry->addr >= end || entry->addr + entry->size <= start)
88
			continue;
89
		return true;
90
	}
91
	return false;
92
}
93 94 95 96 97 98 99 100 101 102 103

/*
 * Same check as e820__mapped_any(), but performed against the
 * firmware-provided table (e820_table_firmware).
 */
bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
{
	struct e820_table *firmware = e820_table_firmware;

	return _e820__mapped_any(firmware, start, end, type);
}
EXPORT_SYMBOL_GPL(e820__mapped_raw_any);

/*
 * Check whether any part of [start, end) is mapped with 'type' in the
 * main kernel table (e820_table).
 */
bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
{
	struct e820_table *kernel = e820_table;

	return _e820__mapped_any(kernel, start, end, type);
}
EXPORT_SYMBOL_GPL(e820__mapped_any);
105 106

/*
107
 * This function checks if the entire <start,end> range is mapped with 'type'.
108
 *
109 110
 * Note: this function only works correctly once the E820 table is sorted and
 * not-overlapping (at least for the range specified), which is the case normally.
111
 */
112 113
static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
					     enum e820_type type)
114 115 116
{
	int i;

117
	for (i = 0; i < e820_table->nr_entries; i++) {
118
		struct e820_entry *entry = &e820_table->entries[i];
119

120
		if (type && entry->type != type)
121
			continue;
122 123

		/* Is the region (part) in overlap with the current region? */
124
		if (entry->addr >= end || entry->addr + entry->size <= start)
125 126
			continue;

127 128 129
		/*
		 * If the region is at the beginning of <start,end> we move
		 * 'start' to the end of the region since it's ok until there
130
		 */
131 132
		if (entry->addr <= start)
			start = entry->addr + entry->size;
133

134
		/*
135 136
		 * If 'start' is now at or beyond 'end', we're done, full
		 * coverage of the desired range exists:
137 138
		 */
		if (start >= end)
139
			return entry;
140
	}
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160

	return NULL;
}

/*
 * Boolean front end of __e820__mapped_all(): true if the entire range
 * <start,end> is mapped with 'type'.
 */
bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
{
	struct e820_entry *covering = __e820__mapped_all(start, end, type);

	return covering != NULL;
}

/*
 * Look up the type associated with the range <start,end>.
 *
 * Returns the type of the entry that completes the coverage of the range,
 * or -EINVAL when the range is not fully covered by the E820 table.
 */
int e820__get_entry_type(u64 start, u64 end)
{
	struct e820_entry *entry = __e820__mapped_all(start, end, 0);

	if (!entry)
		return -EINVAL;

	return entry->type;
}

/*
164
 * Add a memory region to the kernel E820 map.
165
 */
166
static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
167
{
168
	int x = table->nr_entries;
169

170
	if (x >= ARRAY_SIZE(table->entries)) {
171 172
		pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
		       start, start + size - 1);
173 174 175
		return;
	}

176 177 178 179
	table->entries[x].addr = start;
	table->entries[x].size = size;
	table->entries[x].type = type;
	table->nr_entries++;
Yinghai Lu's avatar
Yinghai Lu committed
180 181
}

182
/* Add a memory region to the main kernel E820 table (e820_table). */
void __init e820__range_add(u64 start, u64 size, enum e820_type type)
{
	struct e820_table *kernel = e820_table;

	__e820__range_add(kernel, start, size, type);
}

187
/*
 * Continue the current log line (pr_cont) with a human-readable name for
 * 'type'. Types without a dedicated name are printed numerically.
 */
static void __init e820_print_type(enum e820_type type)
{
	switch (type) {
	case E820_TYPE_RAM:
	case E820_TYPE_RESERVED_KERN:
		pr_cont("usable");
		break;
	case E820_TYPE_RESERVED:
		pr_cont("reserved");
		break;
	case E820_TYPE_ACPI:
		pr_cont("ACPI data");
		break;
	case E820_TYPE_NVS:
		pr_cont("ACPI NVS");
		break;
	case E820_TYPE_UNUSABLE:
		pr_cont("unusable");
		break;
	case E820_TYPE_PMEM:
	case E820_TYPE_PRAM:
		pr_cont("persistent (type %u)", type);
		break;
	default:
		pr_cont("type %u", type);
		break;
	}
}

202
/*
 * Log every entry of the main E820 table, one per line, as
 * "<who>: [mem start-end] <type>". The printed end address is inclusive.
 */
void __init e820__print_table(char *who)
{
	int idx;

	for (idx = 0; idx < e820_table->nr_entries; idx++) {
		struct e820_entry *entry = &e820_table->entries[idx];

		pr_info("%s: [mem %#018Lx-%#018Lx] ",
			who,
			entry->addr,
			entry->addr + entry->size - 1);

		e820_print_type(entry->type);
		pr_cont("\n");
	}
}

/*
218
 * Sanitize an E820 map.
219
 *
220
 * Some E820 layouts include overlapping entries. The following
221
 * replaces the original E820 map with a new one, removing overlaps,
222 223
 * and resolving conflicting memory types in favor of highest
 * numbered type.
224
 *
225 226 227
 * The input parameter 'entries' points to an array of 'struct
 * e820_entry' which on entry has elements in the range [0, *nr_entries)
 * valid, and which has space for up to max_nr_entries entries.
228
 * On return, the resulting sanitized E820 map entries will be
229
 * overwritten in the same location, starting at 'entries'.
230
 *
231 232 233 234
 * The integer pointed to by nr_entries must be valid on entry (the
 * current number of valid entries located at 'entries'). If the
 * sanitizing succeeds the *nr_entries will be updated with the new
 * number of valid entries (something no more than max_nr_entries).
235
 *
236
 * The return value from e820__update_table() is zero if it
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
 * successfully 'sanitized' the map entries passed in, and is -1
 * if it did nothing, which can happen if either of (1) it was
 * only passed one map entry, or (2) any of the input map entries
 * were invalid (start + size < start, meaning that the size was
 * so big the described memory range wrapped around through zero.)
 *
 *	Visually we're performing the following
 *	(1,2,3,4 = memory types)...
 *
 *	Sample memory map (w/overlaps):
 *	   ____22__________________
 *	   ______________________4_
 *	   ____1111________________
 *	   _44_____________________
 *	   11111111________________
 *	   ____________________33__
 *	   ___________44___________
 *	   __________33333_________
 *	   ______________22________
 *	   ___________________2222_
 *	   _________111111111______
 *	   _____________________11_
 *	   _________________4______
 *
 *	Sanitized equivalent (no overlap):
 *	   1_______________________
 *	   _44_____________________
 *	   ___1____________________
 *	   ____22__________________
 *	   ______11________________
 *	   _________1______________
 *	   __________3_____________
 *	   ___________44___________
 *	   _____________33_________
 *	   _______________2________
 *	   ________________1_______
 *	   _________________4______
 *	   ___________________2____
 *	   ____________________33__
 *	   ______________________4_
277
 */
278
/*
 * One change point of the E820 map: either the start or the end address
 * of an entry. e820__update_table() records two of these for every
 * non-empty entry and sorts them by address.
 */
struct change_member {
	/* Pointer to the original entry: */
	struct e820_entry	*entry;
	/* Address for this change point: */
	unsigned long long	addr;
};

285 286 287 288 289
/*
 * Scratch storage for e820__update_table(). Every non-empty entry
 * contributes two change points (its start and its end), hence the
 * doubled size of the two change-point arrays.
 */
static struct change_member	change_point_list[2*E820_MAX_ENTRIES]	__initdata;
static struct change_member	*change_point[2*E820_MAX_ENTRIES]	__initdata;
static struct e820_entry	*overlap_list[E820_MAX_ENTRIES]		__initdata;
static struct e820_entry	new_entries[E820_MAX_ENTRIES]		__initdata;

290 291 292 293 294 295 296
/*
 * sort() comparison callback for change_point[]: 'a' and 'b' point at two
 * elements of that array (i.e. at pointers to struct change_member).
 *
 * Ordering is by address first. For equal addresses, a change point that
 * marks the end of its region sorts after one that marks a start.
 */
static int __init cpcompare(const void *a, const void *b)
{
	const struct change_member *lhs = *(struct change_member * const *)a;
	const struct change_member *rhs = *(struct change_member * const *)b;
	int lhs_is_end, rhs_is_end;

	if (lhs->addr > rhs->addr)
		return 1;
	if (lhs->addr < rhs->addr)
		return -1;

	/* A change point is an 'end' if it is not its entry's start address: */
	lhs_is_end = lhs->addr != lhs->entry->addr;
	rhs_is_end = rhs->addr != rhs->entry->addr;

	return lhs_is_end - rhs_is_end;
}
306

307
/*
 * Sanitize 'table' in place: sort it, remove overlaps and resolve
 * conflicting types in favor of the highest numbered type.
 *
 * Returns 0 when the table was rewritten, or -1 when nothing was done
 * because the table has fewer than two entries or contains an entry
 * whose addr + size wraps around.
 */
int __init e820__update_table(struct e820_table *table)
{
	struct e820_entry *entries = table->entries;
	u32 max_nr_entries = ARRAY_SIZE(table->entries);
	enum e820_type cur_type, prev_type = 0;	/* 0: undefined memory type */
	unsigned long long prev_addr = 0;	/* Start of the entry being built */
	u32 out_nr = 0;				/* Entries emitted to new_entries[] */
	u32 active_nr = 0;			/* Entries currently in overlap_list[] */
	u32 cp_nr = 0;				/* Change points recorded */
	u32 i, cp_idx;

	/* If there's only one memory region, don't bother: */
	if (table->nr_entries < 2)
		return -1;

	BUG_ON(table->nr_entries > max_nr_entries);

	/* Bail out if we find any unreasonable addresses in the map: */
	for (i = 0; i < table->nr_entries; i++) {
		if (entries[i].addr + entries[i].size < entries[i].addr)
			return -1;
	}

	/* Create pointers for initial change-point information (for sorting): */
	for (i = 0; i < 2 * table->nr_entries; i++)
		change_point[i] = &change_point_list[i];

	/*
	 * Record all known change-points (starting and ending addresses),
	 * omitting empty memory regions:
	 */
	for (i = 0; i < table->nr_entries; i++) {
		struct e820_entry *entry = &entries[i];

		if (entry->size == 0)
			continue;

		change_point[cp_nr]->addr = entry->addr;
		change_point[cp_nr]->entry = entry;
		cp_nr++;

		change_point[cp_nr]->addr = entry->addr + entry->size;
		change_point[cp_nr]->entry = entry;
		cp_nr++;
	}

	/* Sort change-point list by memory addresses (low -> high): */
	sort(change_point, cp_nr, sizeof(*change_point), cpcompare, NULL);

	/* Loop through change-points, determining effect on the new map: */
	for (cp_idx = 0; cp_idx < cp_nr; cp_idx++) {
		struct change_member *cp = change_point[cp_idx];

		/* Keep track of all overlapping entries */
		if (cp->addr == cp->entry->addr) {
			/* A start point: the entry becomes active (> 1 active implies an overlap) */
			overlap_list[active_nr++] = cp->entry;
		} else {
			/* An end point: drop the entry (order independent, so swap with last): */
			for (i = 0; i < active_nr; i++) {
				if (overlap_list[i] == cp->entry)
					overlap_list[i] = overlap_list[active_nr - 1];
			}
			active_nr--;
		}

		/*
		 * If there are overlapping entries, decide which
		 * "type" to use (larger value takes precedence --
		 * 1=usable, 2,3,4,4+=unusable)
		 */
		cur_type = 0;
		for (i = 0; i < active_nr; i++) {
			if (overlap_list[i]->type > cur_type)
				cur_type = overlap_list[i]->type;
		}

		/*
		 * No type transition means nothing to emit here, except that a
		 * change point inside E820_TYPE_PRAM always starts a new entry:
		 */
		if (cur_type == prev_type && cur_type != E820_TYPE_PRAM)
			continue;

		if (prev_type != 0) {
			/* Close the entry that was being built: */
			new_entries[out_nr].size = cp->addr - prev_addr;

			/*
			 * Move forward only if the new size was non-zero, and
			 * stop entirely once there is no space left for new entries:
			 */
			if (new_entries[out_nr].size != 0 && ++out_nr >= max_nr_entries)
				break;
		}

		if (cur_type != 0) {
			/* Open a new entry at this change point: */
			new_entries[out_nr].addr = cp->addr;
			new_entries[out_nr].type = cur_type;
			prev_addr = cp->addr;
		}

		prev_type = cur_type;
	}

	/* Copy the new entries into the original location: */
	memcpy(entries, new_entries, out_nr * sizeof(*entries));
	table->nr_entries = out_nr;

	return 0;
}

407
/*
 * Add 'nr_entries' boot-protocol entries to the main E820 table.
 *
 * Returns 0 on success. Returns -1 as soon as a non-empty entry whose end
 * address overflows 64 bits is found; entries before it have already been
 * added, the offending entry and everything after it are not.
 */
static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
{
	u32 idx;

	for (idx = 0; idx < nr_entries; idx++) {
		struct boot_e820_entry *entry = &entries[idx];
		u64 start = entry->addr;
		u64 size = entry->size;
		u64 last = start + size - 1;
		u32 type = entry->type;

		/* Give up on 64-bit overflow (empty entries are exempt): */
		if (likely(size) && start > last)
			return -1;

		e820__range_add(start, size, type);
	}

	return 0;
}

429
/*
430
 * Copy the BIOS E820 map into a safe place.
431 432 433 434 435 436 437
 *
 * Sanity-check it while we're at it..
 *
 * If we're lucky and live on a modern system, the setup code
 * will have given us a memory map that we can use to properly
 * set up memory.  If we aren't, we'll fake a memory map.
 */
438
static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
439 440
{
	/* Only one memory region (or negative)? Ignore it */
441
	if (nr_entries < 2)
442 443
		return -1;

444
	return __append_e820_table(entries, nr_entries);
445 446
}

447
static u64 __init
448
__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
449
{
450
	u64 end;
Yinghai Lu's avatar
Yinghai Lu committed
451
	unsigned int i;
452 453 454 455
	u64 real_updated_size = 0;

	BUG_ON(old_type == new_type);

456 457 458
	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

459
	end = start + size;
460
	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
461
	e820_print_type(old_type);
462
	pr_cont(" ==> ");
463
	e820_print_type(new_type);
464
	pr_cont("\n");
465

466
	for (i = 0; i < table->nr_entries; i++) {
467
		struct e820_entry *entry = &table->entries[i];
468
		u64 final_start, final_end;
469
		u64 entry_end;
470

471
		if (entry->type != old_type)
472
			continue;
473

474
		entry_end = entry->addr + entry->size;
475 476

		/* Completely covered by new range? */
477 478 479
		if (entry->addr >= start && entry_end <= end) {
			entry->type = new_type;
			real_updated_size += entry->size;
480 481
			continue;
		}
482

483
		/* New range is completely covered? */
484
		if (entry->addr < start && entry_end > end) {
485 486
			__e820__range_add(table, start, size, new_type);
			__e820__range_add(table, end, entry_end - end, entry->type);
487
			entry->size = start - entry->addr;
488 489 490 491
			real_updated_size += size;
			continue;
		}

492
		/* Partially covered: */
493 494
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
495 496
		if (final_start >= final_end)
			continue;
497

498
		__e820__range_add(table, final_start, final_end - final_start, new_type);
499

500
		real_updated_size += final_end - final_start;
501

Yinghai Lu's avatar
Yinghai Lu committed
502
		/*
503 504
		 * Left range could be head or tail, so need to update
		 * its size first:
Yinghai Lu's avatar
Yinghai Lu committed
505
		 */
506 507
		entry->size -= final_end - final_start;
		if (entry->addr < final_start)
508
			continue;
509

510
		entry->addr = final_end;
511 512 513 514
	}
	return real_updated_size;
}

515
/*
 * Retype [start, start + size) from 'old_type' to 'new_type' in the main
 * kernel table. Returns the number of bytes that were retyped.
 */
u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
	struct e820_table *kernel = e820_table;

	return __e820__range_update(kernel, start, size, old_type, new_type);
}

520
/*
 * Retype [start, start + size) from 'old_type' to 'new_type' in the kexec
 * table. Returns the number of bytes that were retyped.
 */
static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
{
	struct e820_table *kexec = e820_table_kexec;

	return __e820__range_update(kexec, start, size, old_type, new_type);
}

525
/* Remove a range of memory from the E820 table: */
526
u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
Yinghai Lu's avatar
Yinghai Lu committed
527 528
{
	int i;
529
	u64 end;
Yinghai Lu's avatar
Yinghai Lu committed
530 531
	u64 real_removed_size = 0;

532 533 534
	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

535
	end = start + size;
536
	printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
537
	if (check_type)
538
		e820_print_type(old_type);
539
	pr_cont("\n");
540

541
	for (i = 0; i < e820_table->nr_entries; i++) {
542
		struct e820_entry *entry = &e820_table->entries[i];
Yinghai Lu's avatar
Yinghai Lu committed
543
		u64 final_start, final_end;
544
		u64 entry_end;
Yinghai Lu's avatar
Yinghai Lu committed
545

546
		if (check_type && entry->type != old_type)
Yinghai Lu's avatar
Yinghai Lu committed
547
			continue;
548

549
		entry_end = entry->addr + entry->size;
550 551

		/* Completely covered? */
552 553
		if (entry->addr >= start && entry_end <= end) {
			real_removed_size += entry->size;
554
			memset(entry, 0, sizeof(*entry));
Yinghai Lu's avatar
Yinghai Lu committed
555 556
			continue;
		}
557

558
		/* Is the new range completely covered? */
559
		if (entry->addr < start && entry_end > end) {
560
			e820__range_add(end, entry_end - end, entry->type);
561
			entry->size = start - entry->addr;
562 563 564 565
			real_removed_size += size;
			continue;
		}

566
		/* Partially covered: */
567 568
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
Yinghai Lu's avatar
Yinghai Lu committed
569 570
		if (final_start >= final_end)
			continue;
571

Yinghai Lu's avatar
Yinghai Lu committed
572 573
		real_removed_size += final_end - final_start;

574
		/*
575 576
		 * Left range could be head or tail, so need to update
		 * the size first:
577
		 */
578 579
		entry->size -= final_end - final_start;
		if (entry->addr < final_start)
Yinghai Lu's avatar
Yinghai Lu committed
580
			continue;
581

582
		entry->addr = final_end;
Yinghai Lu's avatar
Yinghai Lu committed
583 584 585 586
	}
	return real_removed_size;
}

587
/*
 * Sanitize the main E820 table and, if e820__update_table() reported
 * success, log the resulting map.
 */
void __init e820__update_table_print(void)
{
	if (e820__update_table(e820_table) == 0) {
		pr_info("modified physical RAM map:\n");
		e820__print_table("modified");
	}
}
595

596
/* Sanitize the kexec E820 table; the result code is deliberately unused. */
static void __init e820__update_table_kexec(void)
{
	struct e820_table *kexec = e820_table_kexec;

	e820__update_table(kexec);
}
600

601
#define MAX_GAP_END 0x100000000ull
602

603
/*
604
 * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
605
 */
606
static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
607
{
608
	unsigned long long last = MAX_GAP_END;
609
	int i = e820_table->nr_entries;
610 611 612
	int found = 0;

	while (--i >= 0) {
613 614
		unsigned long long start = e820_table->entries[i].addr;
		unsigned long long end = start + e820_table->entries[i].size;
615 616 617

		/*
		 * Since "last" is at most 4GB, we know we'll
618
		 * fit in 32 bits if this condition is true:
619 620 621 622
		 */
		if (last > end) {
			unsigned long gap = last - end;

623 624 625
			if (gap >= *gapsize) {
				*gapsize = gap;
				*gapstart = end;
626 627 628 629 630 631
				found = 1;
			}
		}
		if (start < last)
			last = start;
	}
632 633 634 635
	return found;
}

/*
636 637 638 639 640
 * Search for the biggest gap in the low 32 bits of the E820
 * memory space. We pass this space to the PCI subsystem, so
 * that it can assign MMIO resources for hotplug or
 * unconfigured devices in.
 *
641 642
 * Hopefully the BIOS let enough space left.
 */
643
__init void e820__setup_pci_gap(void)
644
{
645
	unsigned long gapstart, gapsize;
646 647 648
	int found;

	gapsize = 0x400000;
649
	found  = e820_search_gap(&gapstart, &gapsize);
650 651

	if (!found) {
652
#ifdef CONFIG_X86_64
Yinghai Lu's avatar
Yinghai Lu committed
653
		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
654 655
		pr_err("Cannot find an available gap in the 32-bit address range\n");
		pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
656 657
#else
		gapstart = 0x10000000;
658
#endif
659
	}
660 661

	/*
662
	 * e820__reserve_resources_late() protects stolen RAM already:
663
	 */
664
	pci_mem_start = gapstart;
665

666 667
	pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
		gapstart, gapstart + gapsize - 1);
668 669
}

670 671 672
/*
 * Called late during init, in free_initmem().
 *
673
 * Initial e820_table and e820_table_kexec are largish __initdata arrays.
674 675 676 677 678 679 680
 *
 * Copy them to a (usually much smaller) dynamically allocated area that is
 * sized precisely after the number of e820 entries.
 *
 * This is done after we've performed all the fixes and tweaks to the tables.
 * All functions which modify them are __init functions, which won't exist
 * after free_initmem().
681
 */
682
__init void e820__reallocate_tables(void)
683
{
684
	struct e820_table *n;
685 686
	int size;

687
	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
688
	n = kmemdup(e820_table, size, GFP_KERNEL);
689
	BUG_ON(!n);
690
	e820_table = n;
691

692
	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
693
	n = kmemdup(e820_table_kexec, size, GFP_KERNEL);
694
	BUG_ON(!n);
695
	e820_table_kexec = n;
696 697

	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
698
	n = kmemdup(e820_table_firmware, size, GFP_KERNEL);
699 700
	BUG_ON(!n);
	e820_table_firmware = n;
701 702
}

703 704 705 706 707
/*
 * Because of the small fixed size of struct boot_params, only the first
 * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
 * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
 * struct setup_data, which is parsed here.
708
 */
709
void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
710 711
{
	int entries;
712
	struct boot_e820_entry *extmap;
713
	struct setup_data *sdata;
714

715
	sdata = early_memremap(phys_addr, data_len);
716
	entries = sdata->len / sizeof(*extmap);
717
	extmap = (struct boot_e820_entry *)(sdata->data);
718

719
	__append_e820_table(extmap, entries);
720
	e820__update_table(e820_table);
721

722
	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
723
	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
724

725
	early_memunmap(sdata, data_len);
726
	pr_info("extended physical RAM map:\n");
727
	e820__print_table("extended");
728 729
}

730
/*
731
 * Find the ranges of physical addresses that do not correspond to
732
 * E820 RAM areas and register the corresponding pages as 'nosave' for
733
 * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
734
 *
735
 * This function requires the E820 map to be sorted and without any
736
 * overlapping entries.
737
 */
738
void __init e820__register_nosave_regions(unsigned long limit_pfn)
739 740
{
	int i;
741
	unsigned long pfn = 0;
742

743
	for (i = 0; i < e820_table->nr_entries; i++) {
744
		struct e820_entry *entry = &e820_table->entries[i];
745

746 747
		if (pfn < PFN_UP(entry->addr))
			register_nosave_region(pfn, PFN_UP(entry->addr));
748

749
		pfn = PFN_DOWN(entry->addr + entry->size);
750

751
		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
752
			register_nosave_region(PFN_UP(entry->addr), pfn);
753 754 755 756 757

		if (pfn >= limit_pfn)
			break;
	}
}
758

Huang Ying's avatar
Huang Ying committed
759
#ifdef CONFIG_ACPI
/*
 * Register ACPI NVS memory regions, so that we can save/restore them during
 * hibernation and the subsequent resume.
 *
 * Runs as a core_initcall and always returns 0.
 */
static int __init e820__register_nvs_regions(void)
{
	int idx;

	for (idx = 0; idx < e820_table->nr_entries; idx++) {
		struct e820_entry *entry = &e820_table->entries[idx];

		if (entry->type != E820_TYPE_NVS)
			continue;

		acpi_nvs_register(entry->addr, entry->size);
	}

	return 0;
}
core_initcall(e820__register_nvs_regions);
#endif

Yinghai Lu's avatar
Yinghai Lu committed
780
/*
781 782
 * Allocate the requested number of bytes with the requsted alignment
 * and return (the physical address) to the caller. Also register this
783
 * range in the 'kexec' E820 table as a reserved range.
784 785 786
 *
 * This allows kexec to fake a new mptable, as if it came from the real
 * system.
Yinghai Lu's avatar
Yinghai Lu committed
787
 */
788
u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
Yinghai Lu's avatar
Yinghai Lu committed
789 790 791
{
	u64 addr;

792
	addr = memblock_phys_alloc(size, align);
793
	if (addr) {
794
		e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
795
		pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
796
		e820__update_table_kexec();
797
	}
Yinghai Lu's avatar
Yinghai Lu committed
798 799 800 801

	return addr;
}

802 803 804 805 806 807 808
/*
 * MAX_ARCH_PFN: upper bound for page frame numbers. It is used both as the
 * search limit and as the final clamp in e820_end_pfn().
 *
 * Every variant is fully parenthesized so that the macro can be used
 * safely inside larger expressions.
 */
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
# else
#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
# endif
#else /* CONFIG_X86_32 */
# define MAX_ARCH_PFN		(MAXMEM >> PAGE_SHIFT)
#endif

/*
 * Find the highest page frame number we have available
 */
815
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
816
{
817 818
	int i;
	unsigned long last_pfn = 0;
819 820
	unsigned long max_arch_pfn = MAX_ARCH_PFN;

821
	for (i = 0; i < e820_table->nr_entries; i++) {
822
		struct e820_entry *entry = &e820_table->entries[i];
823
		unsigned long start_pfn;
824 825
		unsigned long end_pfn;

826
		if (entry->type != type)
827 828
			continue;

829 830
		start_pfn = entry->addr >> PAGE_SHIFT;
		end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
831 832 833 834 835 836 837

		if (start_pfn >= limit_pfn)
			continue;
		if (end_pfn > limit_pfn) {
			last_pfn = limit_pfn;
			break;
		}
838 839 840
		if (end_pfn > last_pfn)
			last_pfn = end_pfn;
	}
841 842 843 844

	if (last_pfn > max_arch_pfn)
		last_pfn = max_arch_pfn;

845 846
	pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
		last_pfn, max_arch_pfn);
847 848
	return last_pfn;
}
849

850
/* Highest E820_TYPE_RAM page frame number, limited only by MAX_ARCH_PFN. */
unsigned long __init e820__end_of_ram_pfn(void)
{
	unsigned long limit_pfn = MAX_ARCH_PFN;

	return e820_end_pfn(limit_pfn, E820_TYPE_RAM);
}
854

855
/* Highest E820_TYPE_RAM page frame number below the 32-bit (4GB) boundary. */
unsigned long __init e820__end_of_low_ram_pfn(void)
{
	unsigned long limit_pfn = 1UL << (32 - PAGE_SHIFT);

	return e820_end_pfn(limit_pfn, E820_TYPE_RAM);
}
859

860
/*
 * Print 'msg' through early_printk() and then panic with the same text.
 *
 * 'msg' is passed as an argument to a "%s" format rather than as the
 * format string itself, so that any '%' characters it contains are
 * printed literally instead of being interpreted as conversions.
 */
static void __init early_panic(char *msg)
{
	early_printk("%s", msg);
	panic("%s", msg);
}

866 867
/*
 * Set to 1 by the "mem=" and "memmap=" parsers once the user has altered
 * the memory map; checked by e820__finish_early_params().
 */
static int userdef __initdata;

868
/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
869 870 871 872 873 874 875 876
static int __init parse_memopt(char *p)
{
	u64 mem_size;

	if (!p)
		return -EINVAL;

	if (!strcmp(p, "nopentium")) {
877
#ifdef CONFIG_X86_32
878 879
		setup_clear_cpu_cap(X86_FEATURE_PSE);
		return 0;
880
#else
881
		pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
882
		return -EINVAL;
883
#endif
884
	}
885

886
	userdef = 1;
887
	mem_size = memparse(p, &p);
888 889

	/* Don't remove all memory when getting "mem={invalid}" parameter: */
890 891
	if (mem_size == 0)
		return -EINVAL;
892

893
	e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
894

895 896 897 898
#ifdef CONFIG_MEMORY_HOTPLUG
	max_mem_size = mem_size;
#endif

899 900 901 902
	return 0;
}
early_param("mem", parse_memopt);

903
/*
 * Parse one comma-separated element of the "memmap=" boot option:
 *
 *   exactmap              - discard the current E820 table
 *   size@start            - add a RAM region
 *   size#start            - add an ACPI data region
 *   size$start            - add a reserved region
 *   size!start            - add a PRAM region
 *   size%start[-from][+to] - retype, add or remove a region, depending on
 *                           which of 'from' and 'to' are given
 *   size                  - remove all RAM above 'size'
 *
 * Returns 0 on success, -EINVAL on a malformed element.
 */
static int __init parse_memmap_one(char *p)
{
	u64 start_at, mem_size;
	char *size_str;

	if (!p)
		return -EINVAL;

	if (strncmp(p, "exactmap", 8) == 0) {
#ifdef CONFIG_CRASH_DUMP
		/*
		 * If we are doing a crash dump, we still need to know
		 * the real memory size before the original memory map is
		 * reset.
		 */
		saved_max_pfn = e820__end_of_ram_pfn();
#endif
		e820_table->nr_entries = 0;
		userdef = 1;
		return 0;
	}

	size_str = p;
	mem_size = memparse(p, &p);
	/* memparse() consumed nothing: there was no size to parse. */
	if (p == size_str)
		return -EINVAL;

	userdef = 1;

	switch (*p) {
	case '@':
		start_at = memparse(p + 1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_RAM);
		break;
	case '#':
		start_at = memparse(p + 1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
		break;
	case '$':
		start_at = memparse(p + 1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
		break;
	case '!':
		start_at = memparse(p + 1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
		break;
	case '%': {
		enum e820_type from = 0, to = 0;

		start_at = memparse(p + 1, &p);
		if (*p == '-')
			from = simple_strtoull(p + 1, &p, 0);
		if (*p == '+')
			to = simple_strtoull(p + 1, &p, 0);
		if (*p != '\0')
			return -EINVAL;

		if (from && to)
			e820__range_update(start_at, mem_size, from, to);
		else if (to)
			e820__range_add(start_at, mem_size, to);
		else if (from)
			e820__range_remove(start_at, mem_size, from, 1);
		else
			e820__range_remove(start_at, mem_size, 0, 0);
		break;
	}
	default:
		e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
		break;
	}

	/* Anything left over after the element means it was malformed: */
	return *p == '\0' ? 0 : -EINVAL;
}
967

968 969 970 971 972 973 974 975 976 977 978 979 980 981
static int __init parse_memmap_opt(char *str)
{
	while (str) {
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;

		parse_memmap_one(str);
		str = k;
	}

	return 0;
}
982 983
early_param("memmap", parse_memmap_opt);

984 985 986 987 988 989
/*
 * Reserve all entries from the bootloader's extensible data nodes list,
 * because if present we are going to use it later on to fetch e820
 * entries from it.
 *
 * Walks the setup_data list starting at boot_params.hdr.setup_data and, for
 * each node, converts the E820_TYPE_RAM parts of the header plus payload
 * range to E820_TYPE_RESERVED_KERN in both e820_table and the kexec table.
 * Both tables are then sanitized and e820_table is printed.
 *
 * Does nothing at all if the list is empty.
 */
void __init e820__reserve_setup_data(void)
{
	struct setup_data *data;
	u64 pa_data;

	pa_data = boot_params.hdr.setup_data;
	if (!pa_data)
		return;

	while (pa_data) {
		data = early_memremap(pa_data, sizeof(*data));
		if (!data) {
			/*
			 * The node header could not be mapped, so neither its
			 * length nor the next pointer can be read. Stop walking
			 * the list instead of dereferencing NULL; nodes handled
			 * so far are still sanitized and printed below.
			 */
			pr_warn("e820: failed to memremap setup_data entry\n");
			break;
		}

		e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
		e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);

		/* Fetch the link before the mapping goes away: */
		pa_data = data->next;
		early_memunmap(data, sizeof(*data));
	}

	e820__update_table(e820_table);
	e820__update_table(e820_table_kexec);

	pr_info("extended physical RAM map:\n");
	e820__print_table("reserve setup_data");
}

1013 1014 1015 1016 1017 1018
/*
 * Called after parse_early_param(), once early parameters (such as mem=)
 * have been processed. If any of them set 'userdef', the E820 table was
 * filled in by the parameter callbacks but has not been sorted or printed
 * yet - do both here, and panic if the table cannot be sanitized.
 */
void __init e820__finish_early_params(void)
{
	/* Nothing to do unless the command line changed the map: */
	if (!userdef)
		return;

	if (e820__update_table(e820_table) < 0)
		early_panic("Invalid user supplied memory map");

	pr_info("user-defined physical RAM map:\n");
	e820__print_table("user");
}
1028

1029
/*
 * Map the type of an E820 entry to a human-readable name.
 * E820_TYPE_RESERVED_KERN is reported with the same name as E820_TYPE_RAM;
 * any type not listed yields "Unknown E820 type".
 */
static const char *__init e820_type_to_string(struct e820_entry *entry)
{
	const char *name;

	switch (entry->type) {
	case E820_TYPE_RESERVED_KERN:
	case E820_TYPE_RAM:
		name = "System RAM";
		break;
	case E820_TYPE_ACPI:
		name = "ACPI Tables";
		break;
	case E820_TYPE_NVS:
		name = "ACPI Non-volatile Storage";
		break;
	case E820_TYPE_UNUSABLE:
		name = "Unusable memory";
		break;
	case E820_TYPE_PRAM:
		name = "Persistent Memory (legacy)";
		break;
	case E820_TYPE_PMEM:
		name = "Persistent Memory";
		break;
	case E820_TYPE_RESERVED:
		name = "Reserved";
		break;
	default:
		name = "Unknown E820 type";
		break;
	}

	return name;
}

1044
/*
 * Map the type of an E820 entry to resource flags:
 * E820_TYPE_RAM and E820_TYPE_RESERVED_KERN become IORESOURCE_SYSTEM_RAM,
 * every other type (ACPI, NVS, unusable, PRAM, PMEM, reserved and anything
 * unrecognized) becomes IORESOURCE_MEM.
 */
static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
{
	if (entry->type == E820_TYPE_RAM ||
	    entry->type == E820_TYPE_RESERVED_KERN)
		return IORESOURCE_SYSTEM_RAM;

	return IORESOURCE_MEM;
}

1059
/*
 * Map the type of an E820 entry to an IORES_DESC_* resource descriptor.
 * Types without a dedicated descriptor (RAM, RESERVED_KERN, UNUSABLE and
 * anything unrecognized) map to IORES_DESC_NONE.
 */
static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
{
	unsigned long desc;

	switch (entry->type) {
	case E820_TYPE_ACPI:
		desc = IORES_DESC_ACPI_TABLES;
		break;
	case E820_TYPE_NVS:
		desc = IORES_DESC_ACPI_NV_STORAGE;
		break;
	case E820_TYPE_PMEM:
		desc = IORES_DESC_PERSISTENT_MEMORY;
		break;
	case E820_TYPE_PRAM:
		desc = IORES_DESC_PERSISTENT_MEMORY_LEGACY;
		break;
	case E820_TYPE_RESERVED:
		desc = IORES_DESC_RESERVED;
		break;
	case E820_TYPE_RESERVED_KERN:
	case E820_TYPE_RAM:
	case E820_TYPE_UNUSABLE:
	default:
		desc = IORES_DESC_NONE;
		break;
	}

	return desc;
}

1074
/*
 * Decide whether the resource built for an E820 entry should be marked
 * busy (and inserted) right away.
 *
 * Returns true for any resource starting below 1MB, and otherwise for
 * every type except E820_TYPE_RESERVED, E820_TYPE_PRAM and E820_TYPE_PMEM.
 */
static bool __init do_mark_busy(enum e820_type type, struct resource *res)
{
	/* The legacy BIOS/DOS ROM-shadow + MMIO region is always busy: */
	if (res->start < (1ULL<<20))
		return true;

	/*
	 * Reserved and persistent memory ranges are the only ones left
	 * non-busy here; RAM, RESERVED_KERN, ACPI, NVS, UNUSABLE and any
	 * unknown type are marked busy.
	 */
	return type != E820_TYPE_RESERVED &&
	       type != E820_TYPE_PRAM &&
	       type != E820_TYPE_PMEM;
}

1099
/*
1100
 * Mark E820 reserved areas as busy for the resource manager:
1101
 */
1102

1103
static struct resource __initdata *e820_res;
1104

1105
void __init e820__reserve_resources(void)
1106 1107
{
	int i;
1108
	struct resource *res;
1109
	u64 end;
1110

1111 1112
	res = memblock_alloc(sizeof(*res) * e820_table->nr_entries,
			     SMP_CACHE_BYTES);
1113 1114 1115
	if (!res)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      sizeof(*res) * e820_table->nr_entries);
1116
	e820_res = res;
1117

1118
	for (i = 0; i < e820_table->nr_entries; i++) {
1119 1120 1121
		struct e820_entry *entry = e820_table->entries + i;

		end = entry->addr + entry->size - 1;
1122
		if (end != (resource_size_t)end) {
1123 1124 1125
			res++;
			continue;
		}
1126 1127 1128 1129 1130
		res->start = entry->addr;
		res->end   = end;
		res->name  = e820_type_to_string(entry);
		res->flags = e820_type_to_iomem_type(entry);
		res->desc  = e820_type_to_iores_desc(entry);
1131 1132

		/*
1133 1134 1135
		 * Don't register the region that could be conflicted with
		 * PCI device BAR resources and insert them later in
		 * pcibios_resource_survey():
1136
		 */
1137
		if (do_mark_busy(entry->type, res)) {
1138
			res->flags |= IORESOURCE_BUSY;
1139
			insert_resource(&iomem_resource, res);
1140
		}
1141 1142
		res++;
	}
1143

1144 1145 1146
	/* Expose the bootloader-provided memory layout to the sysfs. */
	for (i = 0; i < e820_table_firmware->nr_entries; i++) {
		struct e820_entry *entry = e820_table_firmware->entries + i;
1147

1148
		firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
1149
	}
1150 1151
}

1152 1153 1154
/*
 * How much should we pad the end of RAM, depending on where it is?
 */
1155
static unsigned long __init ram_alignment(resource_size_t pos)
1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
{
	unsigned long mb = pos >> 20;

	/* To 64kB in the first megabyte */
	if (!mb)
		return 64*1024;

	/* To 1MB in the first 16MB */
	if (mb < 16)
		return 1024*1024;

1167 1168
	/* To 64MB for anything above that */
	return 64*1024*1024;
1169 1170
}

1171 1172
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)

1173
void __init e820__reserve_resources_late(void)
1174 1175 1176 1177 1178
{
	int i;
	struct resource *res;

	res = e820_res;
1179
	for (i = 0; i < e820_table->nr_entries; i++) {
1180
		if (!res->parent && res->end)
1181
			insert_resource_expand_to_fit(&iomem_resource, res);
1182 1183
		res++;
	}
1184 1185

	/*
1186
	 * Try to bump up RAM regions to reasonable boundaries, to
1187 1188
	 * avoid stolen RAM:
	 */
1189 1190
	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
1191
		u64 start, end;
1192

1193
		if (entry->type != E820_TYPE_RAM)
1194
			continue;
1195

1196
		start = entry->addr + entry->size;
1197 1198 1199 1200
		end = round_up(start, ram_alignment(start)) - 1;
		if (end > MAX_RESOURCE_SIZE)
			end = MAX_RESOURCE_SIZE;
		if (start >= end)
1201
			continue;
1202

1203
		printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
1204
		reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
1205
	}
1206 1207
}

1208 1209 1210
/*
 * Pass the firmware (bootloader) E820 map to the kernel and process it:
 */
1211
char *__init e820__memory_setup_default(void)
1212 1213
{
	char *who = "BIOS-e820";
1214

1215 1216 1217 1218 1219 1220
	/*
	 * Try to copy the BIOS-supplied E820-map.
	 *
	 * Otherwise fake a memory map; one section from 0k->640k,
	 * the next section from 1mb->appropriate_mem_k
	 */
1221
	if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
1222
		u64 mem_size;
1223

1224 1225
		/* Compare results from other methods and take the one that gives more RAM: */
		if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
1226 1227 1228 1229 1230 1231 1232
			mem_size = boot_params.screen_info.ext_mem_k;
			who = "BIOS-88";
		} else {
			mem_size = boot_params.alt_mem_k;
			who = "BIOS-e801";
		}

1233
		e820_table->nr_entries = 0;
1234 1235
		e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
		e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
1236 1237
	}

1238 1239 1240
	/* We just appended a lot of ranges, sanitize the table: */
	e820__update_table(e820_table);

1241 1242 1243
	return who;
}

1244 1245 1246 1247 1248 1249
/*
 * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
 * E820 map - with an optional platform quirk available for virtual platforms
 * to override this method of boot environment processing.
 *
 * The resulting e820_table is then copied into the kexec and firmware
 * tables and printed, labelled with the string the setup hook returned.
 */
void __init e820__memory_setup(void)
{
	char *source;

	/* This is a firmware interface ABI - make sure we don't break it: */
	BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);

	source = x86_init.resources.memory_setup();

	/* Snapshot the freshly built table into the two derived copies: */
	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));

	pr_info("BIOS-provided physical RAM map:\n");
	e820__print_table(source);
}
1264

1265
/*
 * Feed the usable parts of e820_table into memblock: every entry of type
 * E820_TYPE_RAM or E820_TYPE_RESERVED_KERN whose end fits in
 * resource_size_t is added, then partial pages are trimmed and the
 * resulting memblock state is dumped.
 */
void __init e820__memblock_setup(void)
{
	int idx;

	/*
	 * The bootstrap memblock region count maximum is 128 entries
	 * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
	 * than that - so allow memblock resizing.
	 *
	 * This is safe, because this call happens pretty late during x86 setup,
	 * so we know about reserved memory regions already. (This is important
	 * so that memblock resizing does no stomp over reserved areas.)
	 */
	memblock_allow_resize();

	for (idx = 0; idx < e820_table->nr_entries; idx++) {
		struct e820_entry *entry = e820_table->entries + idx;
		u64 end = entry->addr + entry->size;

		/* Skip ranges whose end does not fit in resource_size_t: */
		if (end != (resource_size_t)end)
			continue;

		if (entry->type == E820_TYPE_RAM ||
		    entry->type == E820_TYPE_RESERVED_KERN)
			memblock_add(entry->addr, entry->size);
	}

	/* Throw away partial pages: */
	memblock_trim_memory(PAGE_SIZE);

	memblock_dump_all();
}