Commit 033fbae9 authored by Dan Williams

mm: ZONE_DEVICE for "device memory"

While pmem is usable as a block device or via DAX mappings to userspace
there are several usage scenarios that can not target pmem due to its
lack of struct page coverage. In preparation for "hot plugging" pmem
into the vmemmap add ZONE_DEVICE as a new zone to tag these pages
separately from the ones that are subject to standard page allocations.
Importantly "device memory" can be removed at will by userspace
unbinding the driver of the device.

Having a separate zone prevents allocation and otherwise marks these
pages that are distinct from typical uniform memory.  Device memory has
different lifetime and performance characteristics than RAM.  However,
since we have run out of ZONES_SHIFT bits this functionality currently
depends on sacrificing ZONE_DMA.

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Jerome Glisse <j.glisse@gmail.com>
[hch: various simplifications in the arch interface]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
parent 012dcef3
...@@ -645,7 +645,7 @@ mem_init (void) ...@@ -645,7 +645,7 @@ mem_init (void)
} }
#ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{ {
pg_data_t *pgdat; pg_data_t *pgdat;
struct zone *zone; struct zone *zone;
...@@ -656,7 +656,7 @@ int arch_add_memory(int nid, u64 start, u64 size) ...@@ -656,7 +656,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
pgdat = NODE_DATA(nid); pgdat = NODE_DATA(nid);
zone = pgdat->node_zones + zone = pgdat->node_zones +
zone_for_memory(nid, start, size, ZONE_NORMAL); zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
ret = __add_pages(nid, zone, start_pfn, nr_pages); ret = __add_pages(nid, zone, start_pfn, nr_pages);
if (ret) if (ret)
......
...@@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start) ...@@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start)
} }
#endif #endif
int arch_add_memory(int nid, u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{ {
struct pglist_data *pgdata; struct pglist_data *pgdata;
struct zone *zone; struct zone *zone;
...@@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size) ...@@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
/* this should work for most non-highmem platforms */ /* this should work for most non-highmem platforms */
zone = pgdata->node_zones + zone = pgdata->node_zones +
zone_for_memory(nid, start, size, 0); zone_for_memory(nid, start, size, 0, for_device);
return __add_pages(nid, zone, start_pfn, nr_pages); return __add_pages(nid, zone, start_pfn, nr_pages);
} }
......
...@@ -168,7 +168,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) ...@@ -168,7 +168,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
#endif #endif
#ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{ {
unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
unsigned long start_pfn = PFN_DOWN(start); unsigned long start_pfn = PFN_DOWN(start);
......
...@@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) ...@@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
#endif #endif
#ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{ {
pg_data_t *pgdat; pg_data_t *pgdat;
unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long start_pfn = start >> PAGE_SHIFT;
...@@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size) ...@@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size)
/* We only have ZONE_NORMAL, so this is easy.. */ /* We only have ZONE_NORMAL, so this is easy.. */
ret = __add_pages(nid, pgdat->node_zones + ret = __add_pages(nid, pgdat->node_zones +
zone_for_memory(nid, start, size, ZONE_NORMAL), zone_for_memory(nid, start, size, ZONE_NORMAL,
for_device),
start_pfn, nr_pages); start_pfn, nr_pages);
if (unlikely(ret)) if (unlikely(ret))
printk("%s: Failed, __add_pages() == %d\n", __func__, ret); printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
......
...@@ -863,7 +863,7 @@ void __init mem_init(void) ...@@ -863,7 +863,7 @@ void __init mem_init(void)
* memory to the highmem for now. * memory to the highmem for now.
*/ */
#ifndef CONFIG_NEED_MULTIPLE_NODES #ifndef CONFIG_NEED_MULTIPLE_NODES
int arch_add_memory(u64 start, u64 size) int arch_add_memory(u64 start, u64 size, bool for_device)
{ {
struct pglist_data *pgdata = &contig_page_data; struct pglist_data *pgdata = &contig_page_data;
struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
......
...@@ -822,11 +822,11 @@ void __init mem_init(void) ...@@ -822,11 +822,11 @@ void __init mem_init(void)
} }
#ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{ {
struct pglist_data *pgdata = NODE_DATA(nid); struct pglist_data *pgdata = NODE_DATA(nid);
struct zone *zone = pgdata->node_zones + struct zone *zone = pgdata->node_zones +
zone_for_memory(nid, start, size, ZONE_HIGHMEM); zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT;
......
...@@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) ...@@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
* Memory is added always to NORMAL zone. This means you will never get * Memory is added always to NORMAL zone. This means you will never get
* additional DMA/DMA32 memory. * additional DMA/DMA32 memory.
*/ */
int arch_add_memory(int nid, u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{ {
struct pglist_data *pgdat = NODE_DATA(nid); struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + struct zone *zone = pgdat->node_zones +
zone_for_memory(nid, start, size, ZONE_NORMAL); zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT;
int ret; int ret;
......
...@@ -266,8 +266,9 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} ...@@ -266,8 +266,9 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
void *arg, int (*func)(struct memory_block *, void *)); void *arg, int (*func)(struct memory_block *, void *));
extern int add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size);
extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
extern int arch_add_memory(int nid, u64 start, u64 size); bool for_device);
extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern bool is_memblock_offlined(struct memory_block *mem); extern bool is_memblock_offlined(struct memory_block *mem);
extern void remove_memory(int nid, u64 start, u64 size); extern void remove_memory(int nid, u64 start, u64 size);
......
...@@ -319,7 +319,11 @@ enum zone_type { ...@@ -319,7 +319,11 @@ enum zone_type {
ZONE_HIGHMEM, ZONE_HIGHMEM,
#endif #endif
ZONE_MOVABLE, ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES __MAX_NR_ZONES
}; };
#ifndef __GENERATING_BOUNDS_H #ifndef __GENERATING_BOUNDS_H
...@@ -794,6 +798,25 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) ...@@ -794,6 +798,25 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
} }
static inline int zone_id(const struct zone *zone)
{
struct pglist_data *pgdat = zone->zone_pgdat;
return zone - pgdat->node_zones;
}
#ifdef CONFIG_ZONE_DEVICE
static inline bool is_dev_zone(const struct zone *zone)
{
return zone_id(zone) == ZONE_DEVICE;
}
#else
static inline bool is_dev_zone(const struct zone *zone)
{
return false;
}
#endif
#include <linux/memory_hotplug.h> #include <linux/memory_hotplug.h>
extern struct mutex zonelists_mutex; extern struct mutex zonelists_mutex;
......
...@@ -654,3 +654,20 @@ config DEFERRED_STRUCT_PAGE_INIT ...@@ -654,3 +654,20 @@ config DEFERRED_STRUCT_PAGE_INIT
when kswapd starts. This has a potential performance impact on when kswapd starts. This has a potential performance impact on
processes running early in the lifetime of the system until kswapd processes running early in the lifetime of the system until kswapd
finishes the initialisation. finishes the initialisation.
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support" if EXPERT
default !ZONE_DMA
depends on !ZONE_DMA
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on X86_64 #arch_add_memory() comprehends device memory
help
Device memory hotplug support allows for establishing pmem,
or other device driver discovered memory regions, in the
memmap. This allows pfn_to_page() lookups of otherwise
"device-physical" addresses which is needed for using a DAX
mapping in an O_DIRECT operation, among other things.
If FS_DAX is enabled, then say Y.
...@@ -770,7 +770,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, ...@@ -770,7 +770,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
start = phys_start_pfn << PAGE_SHIFT; start = phys_start_pfn << PAGE_SHIFT;
size = nr_pages * PAGE_SIZE; size = nr_pages * PAGE_SIZE;
ret = release_mem_region_adjustable(&iomem_resource, start, size);
/* in the ZONE_DEVICE case device driver owns the memory region */
if (!is_dev_zone(zone))
ret = release_mem_region_adjustable(&iomem_resource, start, size);
if (ret) { if (ret) {
resource_size_t endres = start + size - 1; resource_size_t endres = start + size - 1;
...@@ -1207,8 +1210,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size) ...@@ -1207,8 +1210,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size)
return 0; return 0;
} }
int zone_for_memory(int nid, u64 start, u64 size, int zone_default) int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
bool for_device)
{ {
#ifdef CONFIG_ZONE_DEVICE
if (for_device)
return ZONE_DEVICE;
#endif
if (should_add_memory_movable(nid, start, size)) if (should_add_memory_movable(nid, start, size))
return ZONE_MOVABLE; return ZONE_MOVABLE;
...@@ -1249,7 +1257,7 @@ int __ref add_memory(int nid, u64 start, u64 size) ...@@ -1249,7 +1257,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
} }
/* call arch's memory hotadd */ /* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size); ret = arch_add_memory(nid, start, size, false);
if (ret < 0) if (ret < 0)
goto error; goto error;
......
...@@ -207,6 +207,9 @@ static char * const zone_names[MAX_NR_ZONES] = { ...@@ -207,6 +207,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
"HighMem", "HighMem",
#endif #endif
"Movable", "Movable",
#ifdef CONFIG_ZONE_DEVICE
"Device",
#endif
}; };
int min_free_kbytes = 1024; int min_free_kbytes = 1024;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment