Commit 0bd4b96d authored by David Hildenbrand, committed by Michael Ellerman

powernv/memtrace: don't abuse memory hot(un)plug infrastructure for memory allocations

Let's use alloc_contig_pages() for allocating memory and remove the
linear mapping manually via arch_remove_linear_mapping(). Mark all pages
PG_offline, such that they will definitely not get touched - e.g.,
when hibernating. When freeing memory, try to revert what we did.
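
In essence, the new allocate/free paths boil down to the following
(condensed sketch of the code added below; error handling and the
memtrace bookkeeping around it are omitted):

  /* Allocate: grab contiguous pages, hide them, drop the linear mapping. */
  page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
                            __GFP_NOWARN, nid, NULL);
  start_pfn = page_to_pfn(page);
  memtrace_clear_range(start_pfn, nr_pages);
  for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
          __SetPageOffline(pfn_to_page(pfn));
  arch_remove_linear_mapping(PFN_PHYS(start_pfn), size);

  /* Free: recreate the linear mapping, unhide the pages, return them. */
  arch_create_linear_mapping(nid, start, size, &params); /* params: PAGE_KERNEL */
  for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
          __ClearPageOffline(pfn_to_page(pfn));
  free_contig_range(start_pfn, nr_pages);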

The original idea was discussed in:
 https://lkml.kernel.org/r/48340e96-7e6b-736f-9e23-d3111b915b6e@redhat.com

This is similar to CONFIG_DEBUG_PAGEALLOC handling on other
architectures, whereby only single pages are unmapped from the linear
mapping. Let's mimic what memory hot(un)plug would do with the linear
mapping.

We now need MEMORY_HOTPLUG and CONTIG_ALLOC as dependencies. Add a TODO
that we want to use __GFP_ZERO for clearing once alloc_contig_pages()
understands that.
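
For example (hypothetical, once alloc_contig_pages() supports the flag),
the explicit clearing could then be dropped in favour of something like:

  page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_ZERO |
                            __GFP_THISNODE | __GFP_NOWARN, nid, NULL);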

Tested in QEMU/TCG with 10 GiB of main memory:
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  105.903043][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  145.042493][ T1080] radix-mmu: Mapped 0x0000000080000000-0x00000000c0000000 with 64.0 KiB pages
  [  145.049019][ T1080] memtrace: Freed trace memory back on node 0
  [  145.333960][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x80000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  213.606916][ T1080] radix-mmu: Mapped 0x0000000080000000-0x00000000c0000000 with 64.0 KiB pages
  [  213.613855][ T1080] memtrace: Freed trace memory back on node 0
  [  214.185094][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x100000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  234.874872][ T1080] radix-mmu: Mapped 0x0000000080000000-0x0000000100000000 with 64.0 KiB pages
  [  234.886974][ T1080] memtrace: Freed trace memory back on node 0
  [  234.890153][ T1080] memtrace: Failed to allocate trace memory on node 0
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  259.490196][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000

I also made sure allocated memory is properly zeroed.

Note 1: We currently won't be allocating from ZONE_MOVABLE - because our
	pages are not movable. However, as we don't run with any memory
	hot(un)plug mechanism around, we could make an exception to
	increase the chance of allocations succeeding.

Note 2: PG_reserved isn't sufficient. E.g., kernel_page_present() used
	along with PG_reserved in hibernation code will always return
	"true" on powerpc, resulting in the pages getting touched. It's
	also too generic - e.g., it indicates boot allocations.
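
	For reference, the relevant check in the hibernation code
	(kernel/power/snapshot.c:saveable_page(), paraphrased rather than
	verbatim) looks roughly like:

	  if (PageOffline(page))
	          return NULL;  /* never saved/touched */
	  if (PageReserved(page) &&
	      (!kernel_page_present(page) || pfn_is_nosave(pfn)))
	          return NULL;  /* not taken on powerpc: kernel_page_present() is true */

	So PG_reserved alone does not prevent the pages from being saved,
	while PG_offline does.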

Note 3: For now, we keep using memory_block_size_bytes() as minimum
	granularity.
Suggested-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20201111145322.15793-9-david@redhat.com
parent ca2c36ca
@@ -27,11 +27,11 @@ config OPAL_PRD
 	  recovery diagnostics on OpenPower machines
 
 config PPC_MEMTRACE
-	bool "Enable removal of RAM from kernel mappings for tracing"
-	depends on PPC_POWERNV && MEMORY_HOTREMOVE
+	bool "Enable runtime allocation of RAM for tracing"
+	depends on PPC_POWERNV && MEMORY_HOTPLUG && CONTIG_ALLOC
 	help
-	  Enabling this option allows for the removal of memory (RAM)
-	  from the kernel mappings to be used for hardware tracing.
+	  Enabling this option allows for runtime allocation of memory (RAM)
+	  for hardware tracing.
 
 config PPC_VAS
 	bool "IBM Virtual Accelerator Switchboard (VAS)"
@@ -51,33 +51,12 @@ static const struct file_operations memtrace_fops = {
 	.open	= simple_open,
 };
 
-static int check_memblock_online(struct memory_block *mem, void *arg)
-{
-	if (mem->state != MEM_ONLINE)
-		return -1;
-
-	return 0;
-}
-
-static int change_memblock_state(struct memory_block *mem, void *arg)
-{
-	unsigned long state = (unsigned long)arg;
-
-	mem->state = state;
-
-	return 0;
-}
-
 static void memtrace_clear_range(unsigned long start_pfn,
 				 unsigned long nr_pages)
 {
 	unsigned long pfn;
 
-	/*
-	 * As pages are offline, we cannot trust the memmap anymore. As HIGHMEM
-	 * does not apply, avoid passing around "struct page" and use
-	 * clear_page() instead directly.
-	 */
+	/* As HIGHMEM does not apply, use clear_page() directly. */
 	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
 		if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
 			cond_resched();
@@ -85,72 +64,39 @@ static void memtrace_clear_range(unsigned long start_pfn,
 	}
 }
 
-/* called with device_hotplug_lock held */
-static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
-{
-	const unsigned long start = PFN_PHYS(start_pfn);
-	const unsigned long size = PFN_PHYS(nr_pages);
-
-	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
-		return false;
-
-	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
-			   change_memblock_state);
-
-	if (offline_pages(start_pfn, nr_pages)) {
-		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
-				   change_memblock_state);
-		return false;
-	}
-
-	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
-			   change_memblock_state);
-
-	return true;
-}
-
 static u64 memtrace_alloc_node(u32 nid, u64 size)
 {
-	u64 start_pfn, end_pfn, nr_pages, pfn;
-	u64 base_pfn;
-	u64 bytes = memory_block_size_bytes();
-
-	if (!node_spanned_pages(nid))
-		return 0;
-
-	start_pfn = node_start_pfn(nid);
-	end_pfn = node_end_pfn(nid);
-	nr_pages = size >> PAGE_SHIFT;
-
-	/* Trace memory needs to be aligned to the size */
-	end_pfn = round_down(end_pfn - nr_pages, nr_pages);
-
-	lock_device_hotplug();
-	for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
-		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
-			/*
-			 * Clear the range while we still have a linear
-			 * mapping.
-			 */
-			memtrace_clear_range(base_pfn, nr_pages);
-			/*
-			 * Remove memory in memory block size chunks so that
-			 * iomem resources are always split to the same size and
-			 * we never try to remove memory that spans two iomem
-			 * resources.
-			 */
-			end_pfn = base_pfn + nr_pages;
-			for (pfn = base_pfn; pfn < end_pfn; pfn += bytes >> PAGE_SHIFT) {
-				__remove_memory(nid, pfn << PAGE_SHIFT, bytes);
-			}
-			unlock_device_hotplug();
-			return base_pfn << PAGE_SHIFT;
-		}
-	}
-	unlock_device_hotplug();
-
-	return 0;
+	const unsigned long nr_pages = PHYS_PFN(size);
+	unsigned long pfn, start_pfn;
+	struct page *page;
+
+	/*
+	 * Trace memory needs to be aligned to the size, which is guaranteed
+	 * by alloc_contig_pages().
+	 */
+	page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
+				  __GFP_NOWARN, nid, NULL);
+	if (!page)
+		return 0;
+	start_pfn = page_to_pfn(page);
+
+	/*
+	 * Clear the range while we still have a linear mapping.
+	 *
+	 * TODO: use __GFP_ZERO with alloc_contig_pages() once supported.
+	 */
+	memtrace_clear_range(start_pfn, nr_pages);
+
+	/*
+	 * Set pages PageOffline(), to indicate that nobody (e.g., hibernation,
+	 * dumping, ...) should be touching these pages.
+	 */
+	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
+		__SetPageOffline(pfn_to_page(pfn));
+
+	arch_remove_linear_mapping(PFN_PHYS(start_pfn), size);
+
+	return PFN_PHYS(start_pfn);
 }
 
 static int memtrace_init_regions_runtime(u64 size)
@@ -220,16 +166,30 @@ static int memtrace_init_debugfs(void)
 	return ret;
 }
 
-static int online_mem_block(struct memory_block *mem, void *arg)
+static int memtrace_free(int nid, u64 start, u64 size)
 {
-	return device_online(&mem->dev);
+	struct mhp_params params = { .pgprot = PAGE_KERNEL };
+	const unsigned long nr_pages = PHYS_PFN(size);
+	const unsigned long start_pfn = PHYS_PFN(start);
+	unsigned long pfn;
+	int ret;
+
+	ret = arch_create_linear_mapping(nid, start, size, &params);
+	if (ret)
+		return ret;
+
+	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
+		__ClearPageOffline(pfn_to_page(pfn));
+
+	free_contig_range(start_pfn, nr_pages);
+	return 0;
 }
 
 /*
- * Iterate through the chunks of memory we have removed from the kernel
- * and attempt to add them back to the kernel.
+ * Iterate through the chunks of memory we allocated and attempt to expose
+ * them back to the kernel.
 */
-static int memtrace_online(void)
+static int memtrace_free_regions(void)
 {
 	int i, ret = 0;
 	struct memtrace_entry *ent;
@@ -237,7 +197,7 @@ static int memtrace_online(void)
 	for (i = memtrace_array_nr - 1; i >= 0; i--) {
 		ent = &memtrace_array[i];
 
-		/* We have onlined this chunk previously */
+		/* We have freed this chunk previously */
 		if (ent->nid == NUMA_NO_NODE)
 			continue;
@@ -247,30 +207,25 @@ static int memtrace_online(void)
 			ent->mem = 0;
 		}
 
-		if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) {
-			pr_err("Failed to add trace memory to node %d\n",
+		if (memtrace_free(ent->nid, ent->start, ent->size)) {
+			pr_err("Failed to free trace memory on node %d\n",
 				ent->nid);
 			ret += 1;
 			continue;
 		}
 
-		lock_device_hotplug();
-		walk_memory_blocks(ent->start, ent->size, NULL,
-				   online_mem_block);
-		unlock_device_hotplug();
-
 		/*
-		 * Memory was added successfully so clean up references to it
-		 * so on reentry we can tell that this chunk was added.
+		 * Memory was freed successfully so clean up references to it
+		 * so on reentry we can tell that this chunk was freed.
 		 */
 		debugfs_remove_recursive(ent->dir);
-		pr_info("Added trace memory back to node %d\n", ent->nid);
+		pr_info("Freed trace memory back on node %d\n", ent->nid);
 		ent->size = ent->start = ent->nid = NUMA_NO_NODE;
 	}
 
 	if (ret)
 		return ret;
 
-	/* If all chunks of memory were added successfully, reset globals */
+	/* If all chunks of memory were freed successfully, reset globals */
 	kfree(memtrace_array);
 	memtrace_array = NULL;
 	memtrace_size = 0;
@@ -295,18 +250,16 @@ static int memtrace_enable_set(void *data, u64 val)
 	mutex_lock(&memtrace_mutex);
 
-	/* Re-add/online previously removed/offlined memory */
-	if (memtrace_size) {
-		if (memtrace_online())
-			goto out_unlock;
-	}
+	/* Free all previously allocated memory. */
+	if (memtrace_size && memtrace_free_regions())
+		goto out_unlock;
 
 	if (!val) {
 		rc = 0;
 		goto out_unlock;
 	}
 
-	/* Offline and remove memory */
+	/* Allocate memory. */
 	if (memtrace_init_regions_runtime(val))
 		goto out_unlock;