Commit 8fc5c735 authored by Dan Williams

acpi/nfit, device-dax: Identify differentiated memory with a unique numa-node

Persistent memory, as described by the ACPI NFIT (NVDIMM Firmware
Interface Table), is the first known instance of a memory range
described by a unique "target" proximity domain. Distinguishing
"initiator" and "target" proximity domains is the approach that the ACPI
HMAT (Heterogeneous Memory Attributes Table) uses to describe the unique
performance properties of a memory range relative to a given initiator
(e.g. CPU or DMA device).

Currently the numa-node for a /dev/pmemX block-device or /dev/daxX.Y
char-device follows the traditional notion of 'numa-node' where the
attribute conveys the closest online numa-node. That numa-node attribute
is useful for cpu-binding and memory-binding processes *near* the
device. However, when the memory range backing a 'pmem', or 'dax' device
is onlined (memory hot-add) the memory-only-numa-node representing that
address needs to be differentiated from the set of online nodes. In
other words, the numa-node association of the device depends on whether
you can bind processes *near* the cpu-numa-node in the offline
device-case, or bind processes *on* the memory-range directly after the
backing address range is onlined.

Allow for the case that platform firmware describes persistent memory
with a unique proximity domain, i.e. when it is distinct from the
proximity of DRAM and CPUs that are on the same socket. Plumb the Linux
numa-node translation of that proximity through the libnvdimm region
device to namespaces that are in device-dax mode. With this in place the
proposed kmem driver [1] can optionally discover a unique numa-node
number for the address range as it transitions the memory from an
offline state managed by a device-driver to an online memory range
managed by the core-mm.

[1]: https://lore.kernel.org/lkml/20181022201317.8558C1D8@viggo.jf.intel.com

Reported-by: Fan Du <fan.du@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
parent 730926c3
...@@ -236,6 +236,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) ...@@ -236,6 +236,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
memset(&ndr_desc, 0, sizeof(ndr_desc)); memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.attr_groups = region_attr_groups; ndr_desc.attr_groups = region_attr_groups;
ndr_desc.numa_node = dev_to_node(&p->pdev->dev); ndr_desc.numa_node = dev_to_node(&p->pdev->dev);
ndr_desc.target_node = ndr_desc.numa_node;
ndr_desc.res = &p->res; ndr_desc.res = &p->res;
ndr_desc.of_node = p->dn; ndr_desc.of_node = p->dn;
ndr_desc.provider_data = p; ndr_desc.provider_data = p;
......
...@@ -2869,11 +2869,15 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ...@@ -2869,11 +2869,15 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
ndr_desc->res = &res; ndr_desc->res = &res;
ndr_desc->provider_data = nfit_spa; ndr_desc->provider_data = nfit_spa;
ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) {
ndr_desc->numa_node = acpi_map_pxm_to_online_node( ndr_desc->numa_node = acpi_map_pxm_to_online_node(
spa->proximity_domain); spa->proximity_domain);
else ndr_desc->target_node = acpi_map_pxm_to_node(
spa->proximity_domain);
} else {
ndr_desc->numa_node = NUMA_NO_NODE; ndr_desc->numa_node = NUMA_NO_NODE;
ndr_desc->target_node = NUMA_NO_NODE;
}
/* /*
* Persistence domain bits are hierarchical, if * Persistence domain bits are hierarchical, if
......
...@@ -84,6 +84,7 @@ int acpi_map_pxm_to_node(int pxm) ...@@ -84,6 +84,7 @@ int acpi_map_pxm_to_node(int pxm)
return node; return node;
} }
EXPORT_SYMBOL(acpi_map_pxm_to_node);
/** /**
* acpi_map_pxm_to_online_node - Map proximity ID to online node * acpi_map_pxm_to_online_node - Map proximity ID to online node
......
...@@ -214,7 +214,7 @@ static void dax_region_unregister(void *region) ...@@ -214,7 +214,7 @@ static void dax_region_unregister(void *region)
} }
struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct dax_region *alloc_dax_region(struct device *parent, int region_id,
struct resource *res, unsigned int align, struct resource *res, int target_node, unsigned int align,
unsigned long pfn_flags) unsigned long pfn_flags)
{ {
struct dax_region *dax_region; struct dax_region *dax_region;
...@@ -244,6 +244,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, ...@@ -244,6 +244,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
dax_region->id = region_id; dax_region->id = region_id;
dax_region->align = align; dax_region->align = align;
dax_region->dev = parent; dax_region->dev = parent;
dax_region->target_node = target_node;
if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
kfree(dax_region); kfree(dax_region);
return NULL; return NULL;
...@@ -348,6 +349,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, ...@@ -348,6 +349,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
dev_dax->dax_dev = dax_dev; dev_dax->dax_dev = dax_dev;
dev_dax->region = dax_region; dev_dax->region = dax_region;
dev_dax->target_node = dax_region->target_node;
kref_get(&dax_region->kref); kref_get(&dax_region->kref);
inode = dax_inode(dax_dev); inode = dax_inode(dax_dev);
......
...@@ -10,7 +10,8 @@ struct dax_device; ...@@ -10,7 +10,8 @@ struct dax_device;
struct dax_region; struct dax_region;
void dax_region_put(struct dax_region *dax_region); void dax_region_put(struct dax_region *dax_region);
struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct dax_region *alloc_dax_region(struct device *parent, int region_id,
struct resource *res, unsigned int align, unsigned long flags); struct resource *res, int target_node, unsigned int align,
unsigned long flags);
enum dev_dax_subsys { enum dev_dax_subsys {
DEV_DAX_BUS, DEV_DAX_BUS,
......
...@@ -26,6 +26,7 @@ void dax_bus_exit(void); ...@@ -26,6 +26,7 @@ void dax_bus_exit(void);
/** /**
* struct dax_region - mapping infrastructure for dax devices * struct dax_region - mapping infrastructure for dax devices
* @id: kernel-wide unique region for a memory range * @id: kernel-wide unique region for a memory range
* @target_node: effective numa node if this memory range is onlined
* @kref: to pin while other agents have a need to do lookups * @kref: to pin while other agents have a need to do lookups
* @dev: parent device backing this region * @dev: parent device backing this region
* @align: allocation and mapping alignment for child dax devices * @align: allocation and mapping alignment for child dax devices
...@@ -34,6 +35,7 @@ void dax_bus_exit(void); ...@@ -34,6 +35,7 @@ void dax_bus_exit(void);
*/ */
struct dax_region { struct dax_region {
int id; int id;
int target_node;
struct kref kref; struct kref kref;
struct device *dev; struct device *dev;
unsigned int align; unsigned int align;
...@@ -46,6 +48,7 @@ struct dax_region { ...@@ -46,6 +48,7 @@ struct dax_region {
* data while the device is activated in the driver. * data while the device is activated in the driver.
* @region - parent region * @region - parent region
* @dax_dev - core dax functionality * @dax_dev - core dax functionality
* @target_node: effective numa node if dev_dax memory range is onlined
* @dev - device core * @dev - device core
* @pgmap - pgmap for memmap setup / lifetime (driver owned) * @pgmap - pgmap for memmap setup / lifetime (driver owned)
* @ref: pgmap reference count (driver owned) * @ref: pgmap reference count (driver owned)
...@@ -54,6 +57,7 @@ struct dax_region { ...@@ -54,6 +57,7 @@ struct dax_region {
struct dev_dax { struct dev_dax {
struct dax_region *region; struct dax_region *region;
struct dax_device *dax_dev; struct dax_device *dax_dev;
int target_node;
struct device dev; struct device dev;
struct dev_pagemap pgmap; struct dev_pagemap pgmap;
struct percpu_ref ref; struct percpu_ref ref;
......
...@@ -20,6 +20,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) ...@@ -20,6 +20,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
struct nd_namespace_common *ndns; struct nd_namespace_common *ndns;
struct nd_dax *nd_dax = to_nd_dax(dev); struct nd_dax *nd_dax = to_nd_dax(dev);
struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
struct nd_region *nd_region = to_nd_region(dev->parent);
ndns = nvdimm_namespace_common_probe(dev); ndns = nvdimm_namespace_common_probe(dev);
if (IS_ERR(ndns)) if (IS_ERR(ndns))
...@@ -52,7 +53,8 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) ...@@ -52,7 +53,8 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
memcpy(&res, &pgmap.res, sizeof(res)); memcpy(&res, &pgmap.res, sizeof(res));
res.start += offset; res.start += offset;
dax_region = alloc_dax_region(dev, region_id, &res, dax_region = alloc_dax_region(dev, region_id, &res,
le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP); nd_region->target_node, le32_to_cpu(pfn_sb->align),
PFN_DEV|PFN_MAP);
if (!dax_region) if (!dax_region)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
......
...@@ -47,6 +47,7 @@ static int e820_register_one(struct resource *res, void *data) ...@@ -47,6 +47,7 @@ static int e820_register_one(struct resource *res, void *data)
ndr_desc.res = res; ndr_desc.res = res;
ndr_desc.attr_groups = e820_pmem_region_attribute_groups; ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
ndr_desc.numa_node = e820_range_to_nid(res->start); ndr_desc.numa_node = e820_range_to_nid(res->start);
ndr_desc.target_node = ndr_desc.numa_node;
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
return -ENXIO; return -ENXIO;
......
...@@ -153,7 +153,7 @@ struct nd_region { ...@@ -153,7 +153,7 @@ struct nd_region {
u16 ndr_mappings; u16 ndr_mappings;
u64 ndr_size; u64 ndr_size;
u64 ndr_start; u64 ndr_start;
int id, num_lanes, ro, numa_node; int id, num_lanes, ro, numa_node, target_node;
void *provider_data; void *provider_data;
struct kernfs_node *bb_state; struct kernfs_node *bb_state;
struct badblocks bb; struct badblocks bb;
......
...@@ -68,6 +68,7 @@ static int of_pmem_region_probe(struct platform_device *pdev) ...@@ -68,6 +68,7 @@ static int of_pmem_region_probe(struct platform_device *pdev)
memset(&ndr_desc, 0, sizeof(ndr_desc)); memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.attr_groups = region_attr_groups; ndr_desc.attr_groups = region_attr_groups;
ndr_desc.numa_node = dev_to_node(&pdev->dev); ndr_desc.numa_node = dev_to_node(&pdev->dev);
ndr_desc.target_node = ndr_desc.numa_node;
ndr_desc.res = &pdev->resource[i]; ndr_desc.res = &pdev->resource[i];
ndr_desc.of_node = np; ndr_desc.of_node = np;
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
......
...@@ -1065,6 +1065,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, ...@@ -1065,6 +1065,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
nd_region->flags = ndr_desc->flags; nd_region->flags = ndr_desc->flags;
nd_region->ro = ro; nd_region->ro = ro;
nd_region->numa_node = ndr_desc->numa_node; nd_region->numa_node = ndr_desc->numa_node;
nd_region->target_node = ndr_desc->target_node;
ida_init(&nd_region->ns_ida); ida_init(&nd_region->ns_ida);
ida_init(&nd_region->btt_ida); ida_init(&nd_region->btt_ida);
ida_init(&nd_region->pfn_ida); ida_init(&nd_region->pfn_ida);
......
...@@ -400,12 +400,17 @@ extern bool acpi_osi_is_win8(void); ...@@ -400,12 +400,17 @@ extern bool acpi_osi_is_win8(void);
#ifdef CONFIG_ACPI_NUMA #ifdef CONFIG_ACPI_NUMA
int acpi_map_pxm_to_online_node(int pxm); int acpi_map_pxm_to_online_node(int pxm);
int acpi_map_pxm_to_node(int pxm);
int acpi_get_node(acpi_handle handle); int acpi_get_node(acpi_handle handle);
#else #else
static inline int acpi_map_pxm_to_online_node(int pxm) static inline int acpi_map_pxm_to_online_node(int pxm)
{ {
return 0; return 0;
} }
static inline int acpi_map_pxm_to_node(int pxm)
{
return 0;
}
static inline int acpi_get_node(acpi_handle handle) static inline int acpi_get_node(acpi_handle handle)
{ {
return 0; return 0;
......
...@@ -128,6 +128,7 @@ struct nd_region_desc { ...@@ -128,6 +128,7 @@ struct nd_region_desc {
void *provider_data; void *provider_data;
int num_lanes; int num_lanes;
int numa_node; int numa_node;
int target_node;
unsigned long flags; unsigned long flags;
struct device_node *of_node; struct device_node *of_node;
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment