Commit 55d130a5 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] ppc64: NUMA fixes

From: Anton Blanchard <anton@samba.org>

From: Anton Blanchard, Dave Hansen and Olof Johansson:

Fix multiple bugs in the ppc64 NUMA topology probe code.

- We were using HW cpu numbers instead of logical ones.  615, 630, 650,
  some 670 and some 690 SMP will all fail to boot without this patch.

- The old code would BUG() when it got confused (more NUMA zones than the
  kernel is configured for etc).

- The common depth calculation was incorrect.  Dave found an OF property
  that gives us exactly what we want.

- Things were broken on SMT machines.

The new code should work on those broken systems and should no longer BUG()
but fall back to a flat topology when it gets confused.
parent 2c725188
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include <asm/abs_addr.h> #include <asm/abs_addr.h>
#if 1 #if 1
#define dbg(args...) udbg_printf(args) #define dbg(args...) printk(KERN_INFO args)
#else #else
#define dbg(args...) #define dbg(args...)
#endif #endif
...@@ -56,16 +56,136 @@ static inline void map_cpu_to_node(int cpu, int node) ...@@ -56,16 +56,136 @@ static inline void map_cpu_to_node(int cpu, int node)
} }
} }
/*
 * Map a logical cpu number to its device tree node.
 *
 * Matches either the "ibm,ppc-interrupt-server#s" property (SMT/threaded
 * cpus list one entry per hardware thread) or, failing that, the first
 * "reg" cell against the hardware cpu id.  Returns the matching node with
 * a reference held (caller must of_node_put()), or NULL on no match.
 */
static struct device_node * __init find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/* Try interrupt server first */
		interrupt_server = (unsigned int *)get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		if (interrupt_server && (len > 0)) {
			/*
			 * get_property() reports the length in bytes;
			 * convert to a cell count before indexing.
			 * NOTE(review): assumes standard OF byte-length
			 * semantics for property lengths — confirm.
			 */
			int cells = len / sizeof(unsigned int);

			while (cells--) {
				/*
				 * Was interrupt_server[len-1] with a
				 * post-decremented index, which skipped the
				 * last entry and read one element before the
				 * start of the buffer on the final pass.
				 */
				if (interrupt_server[cells] == hw_cpuid)
					return cpu_node;
			}
		} else {
			reg = (unsigned int *)get_property(cpu_node,
							   "reg", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}
/*
 * Fetch the "ibm,associativity" property for a device tree node.
 *
 * Returns a pointer to the raw cell array (cell[0] holds the number of
 * cells that follow), or NULL when the property is absent or empty.
 * The caller must hold a reference to @dev for as long as the returned
 * pointer is in use.
 */
static int *of_get_associativity(struct device_node *dev)
{
	int *result;
	int len = 0;	/* get_property() may leave len untouched on failure */

	result = (int *)get_property(dev, "ibm,associativity", &len);

	/*
	 * Test the pointer as well as the length: the original code only
	 * checked len, which is uninitialized when the property is missing,
	 * and could return a stale non-NULL value alongside a garbage len.
	 */
	if (result == NULL || len <= 0)
		return NULL;

	return result;
}
/*
 * Extract the NUMA domain id for @device at associativity level @depth.
 *
 * The "ibm,associativity" property is laid out as cell[0] = count of
 * following cells, so cell[depth] is only valid when cell[0] >= depth.
 * Falls back to domain 0 (with a warning) when the property is missing
 * or too short; callers treat domain 0 as the flat-topology default.
 */
static int of_node_numa_domain(struct device_node *device, int depth)
{
	int numa_domain;
	int *tmp;	/* was unsigned int *, mismatching the helper's int * return */

	tmp = of_get_associativity(device);
	if (tmp && (tmp[0] >= depth)) {
		numa_domain = tmp[depth];
	} else {
		printk(KERN_ERR "WARNING: no NUMA information for "
		       "%s\n", device->full_name);
		numa_domain = 0;
	}

	return numa_domain;
}
/*
* In theory, the "ibm,associativity" property may contain multiple
* associativity lists because a resource may be multiply connected
* into the machine. This resource then has different associativity
* characteristics relative to its multiple connections. We ignore
* this for now. We also assume that all cpu and memory sets have
* their distances represented at a common level. This won't be
true for hierarchical NUMA.
*
* In any case the ibm,associativity-reference-points should give
* the correct depth for a normal NUMA system.
*
* - Dave Hansen <haveblue@us.ibm.com>
*/
/*
 * Determine the associativity-list depth at which NUMA domains are
 * represented, from the "ibm,associativity-reference-points" property
 * of the /rtas node.  Returns the depth, or -1 when the node or
 * property cannot be found (callers fall back to a flat topology).
 */
static int find_min_common_depth(void)
{
	int depth;
	unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len = 0;	/* get_property() may not touch len on failure */

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root) {
		printk(KERN_ERR "WARNING: %s() could not find rtas root\n",
				__FUNCTION__);
		return -1;
	}

	/*
	 * this property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes. The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = (unsigned int *)get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	/*
	 * We read ref_points[1], so the property must hold at least two
	 * 32-bit cells (len is in bytes).  The old test (len >= 1) would
	 * accept a single-cell property and read past it, and it also
	 * evaluated len before checking that ref_points was non-NULL.
	 */
	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
		depth = ref_points[1];
	} else {
		printk(KERN_ERR "WARNING: could not find NUMA "
				"associativity reference point\n");
		depth = -1;
	}

	of_node_put(rtas_root);
	return depth;
}
/*
 * Read one address/size value from a property buffer, consuming
 * prom_n_size_cells(device) 32-bit cells and advancing *buf past them.
 * Cells are combined most-significant-first (earlier cell = upper bits).
 */
static unsigned long read_cell_ul(struct device_node *device, unsigned int **buf)
{
	unsigned long value = 0;
	int cells;

	/* bug on cells > 2 ?? */
	for (cells = prom_n_size_cells(device); cells > 0; cells--)
		value = (value << 32) | *(*buf)++;

	return value;
}
static int __init parse_numa_properties(void) static int __init parse_numa_properties(void)
{ {
struct device_node *cpu = NULL; struct device_node *cpu = NULL;
struct device_node *memory = NULL; struct device_node *memory = NULL;
int *cpu_associativity;
int *memory_associativity;
int depth; int depth;
int max_domain = 0; int max_domain = 0;
long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
long i; unsigned long i;
if (strstr(saved_command_line, "numa=off")) { if (strstr(saved_command_line, "numa=off")) {
printk(KERN_WARNING "NUMA disabled by user\n"); printk(KERN_WARNING "NUMA disabled by user\n");
...@@ -78,112 +198,78 @@ static int __init parse_numa_properties(void) ...@@ -78,112 +198,78 @@ static int __init parse_numa_properties(void)
for (i = 0; i < entries ; i++) for (i = 0; i < entries ; i++)
numa_memory_lookup_table[i] = ARRAY_INITIALISER; numa_memory_lookup_table[i] = ARRAY_INITIALISER;
cpu = of_find_node_by_type(NULL, "cpu"); depth = find_min_common_depth();
if (!cpu)
goto err;
memory = of_find_node_by_type(NULL, "memory");
if (!memory)
goto err;
cpu_associativity = (int *)get_property(cpu, "ibm,associativity", NULL);
if (!cpu_associativity)
goto err;
memory_associativity = (int *)get_property(memory, "ibm,associativity", printk(KERN_INFO "NUMA associativity depth for CPU/Memory: %d\n", depth);
NULL); if (depth < 0)
if (!memory_associativity) return depth;
goto err;
/* find common depth */ for_each_cpu(i) {
if (cpu_associativity[0] < memory_associativity[0]) int numa_domain;
depth = cpu_associativity[0];
else
depth = memory_associativity[0];
for (; cpu; cpu = of_find_node_by_type(cpu, "cpu")) {
int *tmp;
int cpu_nr, numa_domain;
tmp = (int *)get_property(cpu, "reg", NULL);
if (!tmp)
continue;
cpu_nr = *tmp;
tmp = (int *)get_property(cpu, "ibm,associativity",
NULL);
if (!tmp)
continue;
numa_domain = tmp[depth];
/* FIXME */ cpu = find_cpu_node(i);
if (numa_domain == 0xffff) {
dbg("cpu %d has no numa doman\n", cpu_nr); if (cpu) {
numa_domain = of_node_numa_domain(cpu, depth);
of_node_put(cpu);
if (numa_domain >= MAX_NUMNODES) {
/*
* POWER4 LPAR uses 0xffff as invalid node,
* dont warn in this case.
*/
if (numa_domain != 0xffff)
printk(KERN_ERR "WARNING: cpu %ld "
"maps to invalid NUMA node %d\n",
i, numa_domain);
numa_domain = 0;
}
} else {
printk(KERN_ERR "WARNING: no NUMA information for "
"cpu %ld\n", i);
numa_domain = 0; numa_domain = 0;
} }
if (numa_domain >= MAX_NUMNODES)
BUG();
node_set_online(numa_domain); node_set_online(numa_domain);
if (max_domain < numa_domain) if (max_domain < numa_domain)
max_domain = numa_domain; max_domain = numa_domain;
map_cpu_to_node(cpu_nr, numa_domain); map_cpu_to_node(i, numa_domain);
/* register the second thread on an SMT machine */
if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
map_cpu_to_node(cpu_nr ^ 0x1, numa_domain);
} }
for (; memory; memory = of_find_node_by_type(memory, "memory")) { memory = NULL;
unsigned int *tmp1, *tmp2; while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
unsigned long i; unsigned long start;
unsigned long start = 0; unsigned long size;
unsigned long size = 0;
int numa_domain; int numa_domain;
int ranges; int ranges;
unsigned int *memcell_buf;
unsigned int len;
tmp1 = (int *)get_property(memory, "reg", NULL); memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
if (!tmp1) if (!memcell_buf || len <= 0)
continue; continue;
ranges = memory->n_addrs; ranges = memory->n_addrs;
new_range: new_range:
/* these are order-sensitive, and modify the buffer pointer */
i = prom_n_size_cells(memory); start = read_cell_ul(memory, &memcell_buf);
while (i--) { size = read_cell_ul(memory, &memcell_buf);
start = (start << 32) | *tmp1;
tmp1++;
}
i = prom_n_size_cells(memory);
while (i--) {
size = (size << 32) | *tmp1;
tmp1++;
}
start = _ALIGN_DOWN(start, MEMORY_INCREMENT); start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
size = _ALIGN_UP(size, MEMORY_INCREMENT); size = _ALIGN_UP(size, MEMORY_INCREMENT);
if ((start + size) > MAX_MEMORY) numa_domain = of_node_numa_domain(memory, depth);
BUG();
tmp2 = (int *)get_property(memory, "ibm,associativity", if (numa_domain >= MAX_NUMNODES) {
NULL); if (numa_domain != 0xffff)
if (!tmp2) printk(KERN_ERR "WARNING: memory at %lx maps "
continue; "to invalid NUMA node %d\n", start,
numa_domain = tmp2[depth]; numa_domain);
/* FIXME */
if (numa_domain == 0xffff) {
dbg("memory has no numa doman\n");
numa_domain = 0; numa_domain = 0;
} }
if (numa_domain >= MAX_NUMNODES)
BUG();
node_set_online(numa_domain); node_set_online(numa_domain);
if (max_domain < numa_domain) if (max_domain < numa_domain)
...@@ -205,11 +291,13 @@ static int __init parse_numa_properties(void) ...@@ -205,11 +291,13 @@ static int __init parse_numa_properties(void)
start, size); start, size);
continue; continue;
} }
node_data[numa_domain].node_spanned_pages += size / PAGE_SIZE; node_data[numa_domain].node_spanned_pages +=
size / PAGE_SIZE;
} else { } else {
node_data[numa_domain].node_start_pfn = node_data[numa_domain].node_start_pfn =
start / PAGE_SIZE; start / PAGE_SIZE;
node_data[numa_domain].node_spanned_pages = size / PAGE_SIZE; node_data[numa_domain].node_spanned_pages =
size / PAGE_SIZE;
} }
for (i = start ; i < (start+size); i += MEMORY_INCREMENT) for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
...@@ -227,10 +315,6 @@ static int __init parse_numa_properties(void) ...@@ -227,10 +315,6 @@ static int __init parse_numa_properties(void)
numnodes = max_domain + 1; numnodes = max_domain + 1;
return 0; return 0;
err:
of_node_put(cpu);
of_node_put(memory);
return -1;
} }
static void __init setup_nonnuma(void) static void __init setup_nonnuma(void)
......
...@@ -23,7 +23,6 @@ extern char *numa_memory_lookup_table; ...@@ -23,7 +23,6 @@ extern char *numa_memory_lookup_table;
extern cpumask_t numa_cpumask_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[];
extern int nr_cpus_in_node[]; extern int nr_cpus_in_node[];
#define MAX_MEMORY (1UL << 41)
/* 16MB regions */ /* 16MB regions */
#define MEMORY_INCREMENT_SHIFT 24 #define MEMORY_INCREMENT_SHIFT 24
#define MEMORY_INCREMENT (1UL << MEMORY_INCREMENT_SHIFT) #define MEMORY_INCREMENT (1UL << MEMORY_INCREMENT_SHIFT)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment