Commit 2601ad70, authored by Andi Kleen, committed by Linus Torvalds

[PATCH] x86_64: Add SRAT NUMA discovery to x86-64.

Add SRAT NUMA discovery to x86-64.

Previously the NUMA nodes on Opteron systems were directly read from the
hardware registers of the northbridge.

This version also supports reading the topology from the more generic ACPI
SRAT table. This makes it possible to support NUMA on all kinds of systems and
in general gives more flexibility, but needs a bit of BIOS support.

This implementation was not done before because the SRAT specification had a
bad license from Microsoft that didn't allow it to be implemented on other
operating systems.  Now that ACPI 3.0 is released and SRAT is part of it, that
isn't a concern anymore.

One side effect of this is that holes not described in SRAT (e.g. usually the
640K DOS and 4GB PCI memory holes) will not be included in the nodes and won't
get mem_map entries allocated.  This may have side effects for some drivers.

Some support for handling dual core AMD CPUs is included.

SLIT parsing is done in a later patch.  Depends on earlier patches to split up
the ACPI boot parsing and to add better CMP support on x86-64.

Can be disabled with numa=noacpi
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent ac989232
...@@ -76,6 +76,8 @@ extern acpi_interrupt_flags acpi_sci_flags; ...@@ -76,6 +76,8 @@ extern acpi_interrupt_flags acpi_sci_flags;
int __initdata acpi_force = 0; int __initdata acpi_force = 0;
#endif #endif
int acpi_numa __initdata;
/* For PCI or other memory-mapped resources */ /* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0x10000000; unsigned long pci_mem_start = 0x10000000;
...@@ -498,6 +500,13 @@ void __init setup_arch(char **cmdline_p) ...@@ -498,6 +500,13 @@ void __init setup_arch(char **cmdline_p)
acpi_boot_table_init(); acpi_boot_table_init();
#endif #endif
#ifdef CONFIG_ACPI_NUMA
/*
* Parse SRAT to discover nodes.
*/
acpi_numa_init();
#endif
#ifdef CONFIG_DISCONTIGMEM #ifdef CONFIG_DISCONTIGMEM
numa_initmem_init(0, end_pfn); numa_initmem_init(0, end_pfn);
#else #else
...@@ -674,6 +683,7 @@ static int __init init_amd(struct cpuinfo_x86 *c) ...@@ -674,6 +683,7 @@ static int __init init_amd(struct cpuinfo_x86 *c)
{ {
int r; int r;
int level; int level;
int cpu;
/* Bit 31 in normal CPUID used for nonstandard 3DNow ID; /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
...@@ -705,13 +715,14 @@ static int __init init_amd(struct cpuinfo_x86 *c) ...@@ -705,13 +715,14 @@ static int __init init_amd(struct cpuinfo_x86 *c)
/* On a dual core setup the lower bits of apic id /* On a dual core setup the lower bits of apic id
distingush the cores. Fix up the CPU<->node mappings distingush the cores. Fix up the CPU<->node mappings
here based on that. here based on that.
Assumes number of cores is a power of two. */ Assumes number of cores is a power of two.
if (c->x86_num_cores > 1) { When using SRAT use mapping from SRAT. */
int cpu = c->x86_apicid; cpu = c->x86_apicid;
if (acpi_numa <= 0 && c->x86_num_cores > 1) {
cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1); cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1);
printk(KERN_INFO "CPU %d -> Node %d\n",
cpu, cpu_to_node[cpu]);
} }
printk(KERN_INFO "CPU %d(%d) -> Node %d\n",
cpu, c->x86_num_cores, cpu_to_node[cpu]);
#endif #endif
} }
......
...@@ -6,5 +6,6 @@ obj-y := init.o fault.o ioremap.o extable.o pageattr.o ...@@ -6,5 +6,6 @@ obj-y := init.o fault.o ioremap.o extable.o pageattr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_DISCONTIGMEM) += numa.o obj-$(CONFIG_DISCONTIGMEM) += numa.o
obj-$(CONFIG_K8_NUMA) += k8topology.o obj-$(CONFIG_K8_NUMA) += k8topology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
hugetlbpage-y = ../../i386/mm/hugetlbpage.o hugetlbpage-y = ../../i386/mm/hugetlbpage.o
...@@ -2,9 +2,7 @@ ...@@ -2,9 +2,7 @@
* AMD K8 NUMA support. * AMD K8 NUMA support.
* Discover the memory map and associated nodes. * Discover the memory map and associated nodes.
* *
* Doesn't use the ACPI SRAT table because it has a questionable license. * This version reads it directly from the K8 northbridge.
* Instead the northbridge registers are read directly.
* XXX in 2.5 we could use the generic SRAT code
* *
* Copyright 2002,2003 Andi Kleen, SuSE Labs. * Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/ */
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <asm/proto.h> #include <asm/proto.h>
#include <asm/dma.h> #include <asm/dma.h>
#include <asm/numa.h> #include <asm/numa.h>
#include <asm/acpi.h>
#ifndef Dprintk #ifndef Dprintk
#define Dprintk(x...) #define Dprintk(x...)
...@@ -27,10 +28,11 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; ...@@ -27,10 +28,11 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES];
int memnode_shift; int memnode_shift;
u8 memnodemap[NODEMAPSIZE]; u8 memnodemap[NODEMAPSIZE];
unsigned char cpu_to_node[NR_CPUS]; #define NUMA_NO_NODE 0xff
unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
cpumask_t node_to_cpumask[MAXNODE]; cpumask_t node_to_cpumask[MAXNODE];
static int numa_off __initdata; int numa_off __initdata;
unsigned long nodes_present; unsigned long nodes_present;
...@@ -153,6 +155,8 @@ void __init numa_init_array(void) ...@@ -153,6 +155,8 @@ void __init numa_init_array(void)
for (i = 0; i < MAXNODE; i++) { for (i = 0; i < MAXNODE; i++) {
if (node_online(i)) if (node_online(i))
continue; continue;
if (cpu_to_node[i] != NUMA_NO_NODE)
continue;
rr = next_node(rr, node_online_map); rr = next_node(rr, node_online_map);
if (rr == MAX_NUMNODES) if (rr == MAX_NUMNODES)
rr = first_node(node_online_map); rr = first_node(node_online_map);
...@@ -220,6 +224,12 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) ...@@ -220,6 +224,12 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
return; return;
#endif #endif
#ifdef CONFIG_ACPI_NUMA
if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
end_pfn << PAGE_SHIFT))
return;
#endif
#ifdef CONFIG_K8_NUMA #ifdef CONFIG_K8_NUMA
if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
return; return;
...@@ -237,7 +247,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) ...@@ -237,7 +247,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
for (i = 0; i < NR_CPUS; i++) for (i = 0; i < NR_CPUS; i++)
cpu_to_node[i] = 0; cpu_to_node[i] = 0;
node_to_cpumask[0] = cpumask_of_cpu(0); node_to_cpumask[0] = cpumask_of_cpu(0);
setup_node_bootmem(0, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
} }
__init void numa_add_cpu(int cpu) __init void numa_add_cpu(int cpu)
...@@ -276,6 +286,10 @@ __init int numa_setup(char *opt) ...@@ -276,6 +286,10 @@ __init int numa_setup(char *opt)
if (numa_fake >= MAX_NUMNODES) if (numa_fake >= MAX_NUMNODES)
numa_fake = MAX_NUMNODES; numa_fake = MAX_NUMNODES;
} }
#endif
#ifdef CONFIG_ACPI_NUMA
if (!strncmp(opt,"noacpi",6))
acpi_numa = -1;
#endif #endif
return 1; return 1;
} }
......
/*
* ACPI 3.0 based NUMA setup
* Copyright 2004 Andi Kleen, SuSE Labs.
*
* Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
*
* Called from acpi_numa_init while reading the SRAT and SLIT tables.
* Assumes all memory regions belonging to a single proximity domain
* are in one chunk. Holes between them will be included in the node.
*/
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <asm/proto.h>
#include <asm/numa.h>
static DECLARE_BITMAP(nodes_parsed, MAXNODE) __initdata;
static struct node nodes[MAXNODE] __initdata;
static __u8 pxm2node[256] __initdata = { [0 ... 255] = 0xff };
static __init int setup_node(int pxm)
{
if (pxm2node[pxm] == 0xff) {
if (numnodes > MAXNODE)
return -1;
pxm2node[pxm] = numnodes - 1;
numnodes++;
}
return pxm2node[pxm];
}
/*
 * Check whether the range start..end collides with a node range that has
 * already been recorded in nodes[].
 *
 * Entries whose start equals their end are treated as empty and skipped.
 *
 * Returns the index of the first conflicting node, or -1 when there is no
 * conflict.  The caller uses the return value to index nodes[] when it
 * reports the overlap, so the index itself (not a boolean) must be returned.
 */
static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;

	for (i = 0; i < numnodes; i++) {
		struct node *nd = &nodes[i];

		if (nd->start == nd->end)
			continue;
		/* Partial or full overlap of the two ranges. */
		if (nd->end > start && nd->start < end)
			return i;
		/* Exact duplicate of an existing range. */
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}
/*
 * Clamp the range of node i to the window start..end.
 *
 * If clamping would make the range inverted, the node is collapsed so that
 * its start equals its end; acpi_scan_nodes() skips such nodes.
 */
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
	struct node *n = nodes + i;

	/* Raise the lower bound, but never past the node's own end. */
	if (n->start < start)
		n->start = (n->end < start) ? n->end : start;

	/* Upper bound already inside the window: nothing more to do. */
	if (n->end <= end)
		return;

	/* A 4K-aligned limit is pulled back by one before it is stored. */
	if ((end & 0xfff) == 0)
		end--;
	n->end = end;

	if (n->end < n->start)
		n->start = n->end;
}
/*
 * Give up on the SRAT: flag ACPI NUMA as disabled (acpi_numa = -1) and log
 * that the table is not being used.  A negative acpi_numa makes
 * srat_disabled() true and makes acpi_scan_nodes() fail.
 */
static __init void bad_srat(void)
{
	acpi_numa = -1;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
}
/*
 * Returns non-zero when SRAT information must be ignored: either NUMA is
 * switched off entirely (numa_off) or acpi_numa has been set negative.
 */
static __init inline int srat_disabled(void)
{
	if (numa_off)
		return 1;
	return acpi_numa < 0;
}
/*
 * Callback for SLIT parsing.
 *
 * The table passed in is not examined; the body is intentionally empty.
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
/* ignored for now */
}
/*
 * Callback for Proximity Domain -> LAPIC mapping.
 *
 * For each enabled processor affinity entry, look up (or allocate) the node
 * for the entry's proximity domain and record it in cpu_to_node[], indexed
 * by the entry's APIC id.  On success acpi_numa is set to 1.  If no node can
 * be allocated or the APIC id does not fit in cpu_to_node[], the SRAT is
 * rejected via bad_srat().
 */
void __init
acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
{
	int pxm, node;

	if (srat_disabled())
		return;
	if (pa->flags.enabled == 0)
		return;

	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		goto reject;
	}

	/* cpu_to_node[] has NR_CPUS entries and is indexed by APIC id here. */
	if (pa->apic_id >= NR_CPUS) {
		printk(KERN_ERR "SRAT: lapic %u too large.\n",
		       pa->apic_id);
		goto reject;
	}

	cpu_to_node[pa->apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, pa->apic_id, node);
	return;

reject:
	bad_srat();
}
/*
 * Callback for parsing of the Proximity Domain <-> Memory Area mappings.
 *
 * For each enabled memory affinity entry, look up (or allocate) the node for
 * the entry's proximity domain and merge the entry's address range into that
 * node's range in nodes[].  The first range seen for a node is stored as is;
 * later ranges only widen the stored start/end, so any gap between chunks of
 * the same node ends up inside the node.  A range that conflicts with an
 * already recorded node causes the whole SRAT to be rejected.
 */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
	struct node *nd;
	unsigned long start, end;
	u64 len;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->flags.enabled == 0)
		return;

	/* hotplug bit is ignored for now */
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		goto reject;
	}

	/* Base and length arrive as two 32-bit halves each. */
	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
	len = ma->length_lo | ((u64)ma->length_hi << 32);
	end = start + len;

	/* A non-negative result is used as the index of the clashing node. */
	i = conflicting_nodes(start, end);
	if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n",
		       pxm, start, end, i, nodes[i].start, nodes[i].end);
		goto reject;
	}

	nd = &nodes[node];
	if (test_and_set_bit(node, &nodes_parsed)) {
		/* Node seen before: only grow its range. */
		if (nd->start > start)
			nd->start = start;
		if (end > nd->end)
			nd->end = end;
	} else {
		/* First range for this node. */
		nd->start = start;
		nd->end = end;
	}

	/* A 4K-aligned end is pulled back by one. */
	if ((nd->end & 0xfff) == 0)
		nd->end--;

	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);
	return;

reject:
	bad_srat();
}
/*
 * Architecture fixup hook: decrements numnodes by one.
 *
 * NOTE(review): presumably this undoes the one-ahead counting used by
 * setup_node(), which hands out numnodes - 1 as the node number - confirm
 * against the initial value of numnodes and the caller of this hook.
 */
void __init acpi_numa_arch_fixup(void)
{
	numnodes -= 1;
}
/*
 * Use the information discovered above to actually set up the nodes.
 *
 * start/end give the memory window the node ranges are clamped to.
 * Returns 0 on success, or -1 when SRAT data is not in use (acpi_numa <= 0)
 * or no hash shift could be computed for the node ranges.
 */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	if (acpi_numa <= 0)
		return -1;

	memnode_shift = compute_hash_shift(nodes);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		       "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	for (i = 0; i < MAXNODE; i++) {
		if (test_bit(i, &nodes_parsed)) {
			cutoff_node(i, start, end);
			/* Nodes left empty after clamping get no bootmem. */
			if (nodes[i].start != nodes[i].end)
				setup_node_bootmem(i, nodes[i].start,
						   nodes[i].end);
		}
	}

	numa_init_array();
	return 0;
}
...@@ -167,7 +167,7 @@ config ACPI_NUMA ...@@ -167,7 +167,7 @@ config ACPI_NUMA
bool "NUMA support" bool "NUMA support"
depends on ACPI_INTERPRETER depends on ACPI_INTERPRETER
depends on NUMA depends on NUMA
depends on IA64 depends on (IA64 || X86_64)
default y if IA64_GENERIC || IA64_SGI_SN2 default y if IA64_GENERIC || IA64_SGI_SN2
config ACPI_ASUS config ACPI_ASUS
......
...@@ -131,6 +131,10 @@ extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); ...@@ -131,6 +131,10 @@ extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
#define acpi_ioapic 0 #define acpi_ioapic 0
#endif /* !CONFIG_ACPI_BOOT */ #endif /* !CONFIG_ACPI_BOOT */
extern int acpi_numa;
extern int acpi_scan_nodes(unsigned long start, unsigned long end);
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
#ifdef CONFIG_ACPI_PCI #ifdef CONFIG_ACPI_PCI
static inline void acpi_noirq_set(void) { acpi_noirq = 1; } static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
static inline void acpi_disable_pci(void) static inline void acpi_disable_pci(void)
......
...@@ -19,5 +19,6 @@ extern int compute_hash_shift(struct node *nodes); ...@@ -19,5 +19,6 @@ extern int compute_hash_shift(struct node *nodes);
extern void numa_add_cpu(int cpu); extern void numa_add_cpu(int cpu);
extern void numa_init_array(void); extern void numa_init_array(void);
extern int numa_off;
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment