From 34dc9e74968ef4b582c3815b0124d88a68dfabef Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:40 -0800 Subject: [PATCH 1/6] x86, numa: Reduce minimum fake node size to 32M This patch changes the minimum fake node size from 64MB to 32MB so it is possible to test NUMA code at a greater scale on smaller machines (64 nodes on a 2G machine, 1024 nodes on 32G machine with CONFIG_NODES_SHIFT=10). Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/numa_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 823e070e7c26..5ae87285a502 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -38,7 +38,7 @@ extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) #endif /* CONFIG_NUMA_EMU */ #else From 4e76f4e67a106ed827ca721b4c8b622047cd2f6d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:47 -0800 Subject: [PATCH 2/6] x86, numa: Avoid compiling NUMA emulation functions without CONFIG_NUMA_EMU Both acpi_get_nodes() and amd_get_nodes() are only necessary when CONFIG_NUMA_EMU is enabled, so avoid compiling them when the option is disabled. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/acpi.h | 5 ++++- arch/x86/include/asm/amd_nb.h | 5 ++++- arch/x86/mm/amdtopology_64.c | 2 ++ arch/x86/mm/srat_64.c | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 55d106b5e31b..b326fa99db57 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -188,14 +188,17 @@ extern int acpi_numa; extern int acpi_get_nodes(struct bootnode *physnodes); extern int acpi_scan_nodes(unsigned long start, unsigned long end); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) + +#ifdef CONFIG_NUMA_EMU extern void acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes); +#endif #else static inline void acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) { } -#endif +#endif /* CONFIG_ACPI_NUMA */ #define acpi_unlazy_tlb(x) leave_mm(x) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 6aee50d655d1..9c16cde63f04 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -9,10 +9,13 @@ struct bootnode; extern int early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); -extern int amd_get_nodes(struct bootnode *nodes); extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); extern int amd_scan_nodes(void); +#ifdef CONFIG_NUMA_EMU +extern int amd_get_nodes(struct bootnode *nodes); +#endif + struct amd_northbridge { struct pci_dev *misc; }; diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 51fae9cfdecb..fe050af614e2 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -69,6 +69,7 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } +#ifdef CONFIG_NUMA_EMU int __init amd_get_nodes(struct bootnode *physnodes) { int i; @@ -81,6 +82,7 @@ int __init amd_get_nodes(struct bootnode *physnodes) } return ret; } +#endif /* CONFIG_NUMA_EMU */ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d8b060..8241bf0f6eb2 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -339,6 +339,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} +#ifdef CONFIG_NUMA_EMU int __init acpi_get_nodes(struct bootnode *physnodes) { int i; @@ -351,6 +352,7 @@ int __init acpi_get_nodes(struct bootnode *physnodes) } return ret; } +#endif /* CONFIG_NUMA_EMU */ /* Use the information discovered above to actually set up the nodes. */ int __init acpi_scan_nodes(unsigned long start, unsigned long end) From f51bf3073a145a5b3263fd882c52d6ec04b687da Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:51 -0800 Subject: [PATCH 3/6] x86, numa: Fake apicid and pxm mappings for NUMA emulation This patch adds the equivalent of acpi_fake_nodes() for AMD Northbridge platforms. The goal is to fake the apicid-to-node mappings for NUMA emulation so the physical topology of the machine is correctly maintained within the kernel. This change also fakes proximity domains for both ACPI and k8 code so the physical distance between emulated nodes is maintained via node_distance(). This exports the correct distances via /sys/devices/system/node/.../distance based on the underlying topology. A new helper function, fake_physnodes(), is introduced to correctly invoke the correct NUMA code to fake these two mappings based on the system type. If there is no underlying NUMA configuration, all cpus are mapped to node 0 for local distance. Since acpi_fake_nodes() is no longer called with CONFIG_ACPI_NUMA, it's prototype can be removed from the header file for such a configuration. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/acpi.h | 5 -- arch/x86/include/asm/amd_nb.h | 1 + arch/x86/mm/amdtopology_64.c | 91 +++++++++++++++++++++++++++++------ arch/x86/mm/numa_64.c | 20 +++++++- arch/x86/mm/srat_64.c | 2 - 5 files changed, 95 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index b326fa99db57..8288daf72dc9 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -193,11 +193,6 @@ extern int acpi_scan_nodes(unsigned long start, unsigned long end); extern void acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes); #endif -#else -static inline void acpi_fake_nodes(const struct bootnode *fake_nodes, - int num_nodes) -{ -} #endif /* CONFIG_ACPI_NUMA */ #define acpi_unlazy_tlb(x) leave_mm(x) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 9c16cde63f04..8f6192c1592c 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -13,6 +13,7 @@ extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); extern int amd_scan_nodes(void); #ifdef CONFIG_NUMA_EMU +extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes); extern int amd_get_nodes(struct bootnode *nodes); #endif diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index fe050af614e2..eb5cbb97b68d 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -27,6 +27,7 @@ #include static struct bootnode __initdata nodes[8]; +static unsigned char __initdata nodeids[8]; static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; static __init int find_northbridge(void) @@ -69,21 +70,6 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -#ifdef CONFIG_NUMA_EMU -int __init amd_get_nodes(struct bootnode *physnodes) -{ - int i; - int ret = 0; - - for_each_node_mask(i, nodes_parsed) { - physnodes[ret].start = nodes[i].start; - physnodes[ret].end = nodes[i].end; - ret++; - } - return ret; -} -#endif /* CONFIG_NUMA_EMU */ - int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long start = PFN_PHYS(start_pfn); @@ -116,7 +102,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); - nodeid = limit & 7; + nodeids[i] = nodeid = limit & 7; if ((base & 3) == 0) { if (i < numnodes) pr_info("Skipping disabled node %d\n", i); @@ -196,6 +182,79 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) return 0; } +#ifdef CONFIG_NUMA_EMU +static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + +int __init amd_get_nodes(struct bootnode *physnodes) +{ + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + +static int __init find_node_by_addr(unsigned long addr) +{ + int ret = NUMA_NO_NODE; + int i; + + for (i = 0; i < 8; i++) + if (addr >= nodes[i].start && addr < nodes[i].end) { + ret = i; + break; + } + return ret; +} + +/* + * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be + * setup to represent the physical topology but reflect the emulated + * environment. For each emulated node, the real node which it appears on is + * found and a fake pxm to nid mapping is created which mirrors the actual + * locality. node_distance() then represents the correct distances between + * emulated nodes by using the fake acpi mappings to pxms. + */ +void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) +{ + unsigned int bits; + unsigned int cores; + unsigned int apicid_base = 0; + int i; + + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + early_get_boot_cpu_id(); + if (boot_cpu_physical_apicid > 0) + apicid_base = boot_cpu_physical_apicid; + + for (i = 0; i < nr_nodes; i++) { + int index; + int nid; + int j; + + nid = find_node_by_addr(nodes[i].start); + if (nid == NUMA_NO_NODE) + continue; + + index = nodeids[nid] << bits; + if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) + for (j = apicid_base; j < cores + apicid_base; j++) + fake_apicid_to_node[index + j] = i; +#ifdef CONFIG_ACPI_NUMA + __acpi_map_pxm_to_node(nid, i); +#endif + } + memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); +} +#endif /* CONFIG_NUMA_EMU */ + int __init amd_scan_nodes(void) { unsigned int bits; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7762a517d69d..cc390f3a1bde 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -324,6 +324,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, return ret; } +static void __init fake_physnodes(int acpi, int amd, int nr_nodes) +{ + int i; + + BUG_ON(acpi && amd); +#ifdef CONFIG_ACPI_NUMA + if (acpi) + acpi_fake_nodes(nodes, nr_nodes); +#endif +#ifdef CONFIG_AMD_NUMA + if (amd) + amd_fake_nodes(nodes, nr_nodes); +#endif + if (!acpi && !amd) + for (i = 0; i < nr_cpu_ids; i++) + numa_set_node(i, 0); +} + /* * Setups up nid to range from addr to addr + size. If the end * boundary is greater than max_addr, then max_addr is used instead. @@ -595,7 +613,7 @@ static int __init numa_emulation(unsigned long start_pfn, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } - acpi_fake_nodes(nodes, num_nodes); + fake_physnodes(acpi, amd, num_nodes); numa_init_array(); return 0; } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 8241bf0f6eb2..c48b443706c5 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -497,8 +497,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) { int i, j; - printk(KERN_INFO "Faking PXM affinity for fake nodes on real " - "topology.\n"); for (i = 0; i < num_nodes; i++) { int nid, pxm; From c1c3443c9c5e9be92641029ed229a41563e44506 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:54 -0800 Subject: [PATCH 4/6] x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 101 +++++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index cc390f3a1bde..dd300c491f1f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -260,7 +260,7 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode physnodes[MAX_NUMNODES] __initdata; +static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, @@ -270,6 +270,7 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, int ret = 0; int i; + memset(physnodes, 0, sizeof(physnodes)); #ifdef CONFIG_ACPI_NUMA if (acpi) nr_nodes = acpi_get_nodes(physnodes); @@ -370,8 +371,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr * to max_addr. The return value is the number of nodes allocated. */ -static int __init split_nodes_interleave(u64 addr, u64 max_addr, - int nr_phys_nodes, int nr_nodes) +static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) { nodemask_t physnode_mask = NODE_MASK_NONE; u64 size; @@ -402,7 +402,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, return -1; } - for (i = 0; i < nr_phys_nodes; i++) + for (i = 0; i < MAX_NUMNODES; i++) if (physnodes[i].start != physnodes[i].end) node_set(i, physnode_mask); @@ -571,11 +571,9 @@ static int __init numa_emulation(unsigned long start_pfn, { u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; - int num_phys_nodes; int num_nodes; int i; - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size. Otherwise, if it is just a single number N, @@ -590,7 +588,7 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long n; n = simple_strtoul(cmdline, NULL, 0); - num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); + num_nodes = split_nodes_interleave(addr, max_addr, n); } if (num_nodes < 0) @@ -613,6 +611,7 @@ static int __init numa_emulation(unsigned long start_pfn, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } + setup_physnodes(addr, max_addr, acpi, amd); fake_physnodes(acpi, amd, num_nodes); numa_init_array(); return 0; @@ -628,8 +627,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU + setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, + acpi, amd); if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) return; + setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, + acpi, amd); nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif @@ -785,6 +788,7 @@ void __cpuinit numa_clear_node(int cpu) #ifndef CONFIG_DEBUG_PER_CPU_MAPS +#ifndef CONFIG_NUMA_EMU void __cpuinit numa_add_cpu(int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); @@ -794,6 +798,51 @@ void __cpuinit numa_remove_cpu(int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } +#else +void __cpuinit numa_add_cpu(int cpu) +{ + unsigned long addr; + u16 apicid; + int physnid; + int nid = NUMA_NO_NODE; + + apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + if (apicid != BAD_APICID) + nid = apicid_to_node[apicid]; + if (nid == NUMA_NO_NODE) + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + /* + * Use the starting address of the emulated node to find which physical + * node it is allocated on. + */ + addr = node_start_pfn(nid) << PAGE_SHIFT; + for (physnid = 0; physnid < MAX_NUMNODES; physnid++) + if (addr >= physnodes[physnid].start && + addr < physnodes[physnid].end) + break; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. + */ + for_each_online_node(nid) { + addr = node_start_pfn(nid) << PAGE_SHIFT; + if (addr >= physnodes[physnid].start && + addr < physnodes[physnid].end) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); + } +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#endif /* !CONFIG_NUMA_EMU */ #else /* CONFIG_DEBUG_PER_CPU_MAPS */ @@ -805,22 +854,32 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) int node = early_cpu_to_node(cpu); struct cpumask *mask; char buf[64]; + int i; - mask = node_to_cpumask_map[node]; - if (mask == NULL) { - printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); - dump_stack(); - return; + for_each_online_node(i) { + unsigned long addr; + + addr = node_start_pfn(i) << PAGE_SHIFT; + if (addr < physnodes[node].start || + addr >= physnodes[node].end) + continue; + mask = node_to_cpumask_map[node]; + if (mask == NULL) { + pr_err("node_to_cpumask_map[%i] NULL\n", i); + dump_stack(); + return; + } + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); } - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); } void __cpuinit numa_add_cpu(int cpu) From a387e95a49743cf9835c5299ca549232618d8249 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:56 -0800 Subject: [PATCH 5/6] x86, numa: Fix cpu to node mapping for sparse node ids NUMA boot code assumes that physical node ids start at 0, but the DIMMs that the apic id represents may not be reachable. If this is the case, node 0 is never online and cpus never end up getting appropriately assigned to a node. This causes the cpumask of all online nodes to be empty and machines crash with kernel code assuming online nodes have valid cpus. The fix is to appropriately map all the address ranges for physical nodes and ensure the cpu to node mapping function checks all possible nodes (up to MAX_NUMNODES) instead of simply checking nodes 0-N, where N is the number of physical nodes, for valid address ranges. This requires no longer "compressing" the address ranges of nodes in the physical node map from 0-N, but rather leave indices in physnodes[] to represent the actual node id of the physical node. Accordingly, the topology exported by both amd_get_nodes() and acpi_get_nodes() no longer must return the number of nodes to iterate through; all such iterations will now be to MAX_NUMNODES. This change also passes the end address of system RAM (which may be different from normal operation if mem= is specified on the command line) before the physnodes[] array is populated. ACPI parsed nodes are truncated to fit within the address range that respect the mem= boundaries and even some physical nodes may become unreachable in such cases. When NUMA emulation does succeed, any apicid to node mapping that exists for unreachable nodes are given default values so that proximity domains can still be assigned. This is important for node_distance() to function as desired. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/acpi.h | 3 ++- arch/x86/include/asm/amd_nb.h | 2 +- arch/x86/mm/amdtopology_64.c | 9 +++------ arch/x86/mm/numa_64.c | 18 +++--------------- arch/x86/mm/srat_64.c | 22 ++++++++++++++++------ 5 files changed, 25 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 8288daf72dc9..211ca3f7fd16 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -185,7 +185,8 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; -extern int acpi_get_nodes(struct bootnode *physnodes); +extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, + unsigned long end); extern int acpi_scan_nodes(unsigned long start, unsigned long end); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 8f6192c1592c..980f22567631 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -14,7 +14,7 @@ extern int amd_scan_nodes(void); #ifdef CONFIG_NUMA_EMU extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes); -extern int amd_get_nodes(struct bootnode *nodes); +extern void amd_get_nodes(struct bootnode *nodes); #endif struct amd_northbridge { diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index eb5cbb97b68d..0df2623d1039 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -187,17 +187,14 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -int __init amd_get_nodes(struct bootnode *physnodes) +void __init amd_get_nodes(struct bootnode *physnodes) { int i; - int ret = 0; for_each_node_mask(i, nodes_parsed) { - physnodes[ret].start = nodes[i].start; - physnodes[ret].end = nodes[i].end; - ret++; + physnodes[i].start = nodes[i].start; + physnodes[i].end = nodes[i].end; } - return ret; } static int __init find_node_by_addr(unsigned long addr) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index dd300c491f1f..3d73201ba347 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -266,25 +266,24 @@ static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, int acpi, int amd) { - int nr_nodes = 0; int ret = 0; int i; memset(physnodes, 0, sizeof(physnodes)); #ifdef CONFIG_ACPI_NUMA if (acpi) - nr_nodes = acpi_get_nodes(physnodes); + acpi_get_nodes(physnodes, start, end); #endif #ifdef CONFIG_AMD_NUMA if (amd) - nr_nodes = amd_get_nodes(physnodes); + amd_get_nodes(physnodes); #endif /* * Basic sanity checking on the physical node map: there may be errors * if the SRAT or AMD code incorrectly reported the topology or the mem= * kernel parameter is used. */ - for (i = 0; i < nr_nodes; i++) { + for (i = 0; i < MAX_NUMNODES; i++) { if (physnodes[i].start == physnodes[i].end) continue; if (physnodes[i].start > end) { @@ -299,17 +298,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, physnodes[i].start = start; if (physnodes[i].end > end) physnodes[i].end = end; - } - - /* - * Remove all nodes that have no memory or were truncated because of the - * limited address range. - */ - for (i = 0; i < nr_nodes; i++) { - if (physnodes[i].start == physnodes[i].end) - continue; - physnodes[ret].start = physnodes[i].start; - physnodes[ret].end = physnodes[i].end; ret++; } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index c48b443706c5..a756bcf3fa48 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -340,17 +340,16 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} #ifdef CONFIG_NUMA_EMU -int __init acpi_get_nodes(struct bootnode *physnodes) +void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, + unsigned long end) { int i; - int ret = 0; for_each_node_mask(i, nodes_parsed) { - physnodes[ret].start = nodes[i].start; - physnodes[ret].end = nodes[i].end; - ret++; + cutoff_node(i, start, end); + physnodes[i].start = nodes[i].start; + physnodes[i].end = nodes[i].end; } - return ret; } #endif /* CONFIG_NUMA_EMU */ @@ -516,6 +515,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) fake_apicid_to_node[j] == NUMA_NO_NODE) fake_apicid_to_node[j] = i; } + + /* + * If there are apicid-to-node mappings for physical nodes that do not + * have a corresponding emulated node, it should default to a guaranteed + * value. + */ + for (i = 0; i < MAX_LOCAL_APIC; i++) + if (apicid_to_node[i] != NUMA_NO_NODE && + fake_apicid_to_node[i] == NUMA_NO_NODE) + fake_apicid_to_node[i] = 0; + for (i = 0; i < num_nodes; i++) __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); From d906f0eb2f0e6d1a24c479f69a9c39e7e45c5ae8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 30 Dec 2010 10:54:16 -0800 Subject: [PATCH 6/6] x86, numa: Fix CONFIG_DEBUG_PER_CPU_MAPS without NUMA emulation "x86, numa: Fake node-to-cpumask for NUMA emulation" broke the build when CONFIG_DEBUG_PER_CPU_MAPS is set and CONFIG_NUMA_EMU is not. This is because it is possible to map a cpu to multiple nodes when NUMA emulation is used; the patch required a physical node address table to find those nodes that was only available when CONFIG_NUMA_EMU was enabled. This extracts the common debug functionality to its own function for CONFIG_DEBUG_PER_CPU_MAPS and uses it regardless of whether CONFIG_NUMA_EMU is set or not. NUMA emulation will now iterate over the set of possible nodes for each cpu and call the new debug function whereas only the cpu's node will be used without NUMA emulation enabled. Reported-by: Ingo Molnar Signed-off-by: David Rientjes Acked-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 56 +++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3d73201ba347..1e72102e80c9 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -833,15 +833,48 @@ void __cpuinit numa_remove_cpu(int cpu) #endif /* !CONFIG_NUMA_EMU */ #else /* CONFIG_DEBUG_PER_CPU_MAPS */ - -/* - * --------- debug versions of the numa functions --------- - */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) +static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) { int node = early_cpu_to_node(cpu); struct cpumask *mask; char buf[64]; + + mask = node_to_cpumask_map[node]; + if (!mask) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return NULL; + } + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); + return mask; +} + +/* + * --------- debug versions of the numa functions --------- + */ +#ifndef CONFIG_NUMA_EMU +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + struct cpumask *mask; + + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) + return; + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); +} +#else +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + struct cpumask *mask; int i; for_each_online_node(i) { @@ -851,24 +884,17 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) if (addr < physnodes[node].start || addr >= physnodes[node].end) continue; - mask = node_to_cpumask_map[node]; - if (mask == NULL) { - pr_err("node_to_cpumask_map[%i] NULL\n", i); - dump_stack(); + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) return; - } if (enable) cpumask_set_cpu(cpu, mask); else cpumask_clear_cpu(cpu, mask); - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); } } +#endif /* CONFIG_NUMA_EMU */ void __cpuinit numa_add_cpu(int cpu) {