[PATCH] ia64 cpuset + build_sched_domains() mangles structures

I've already sent this to the maintainers, and this is now being sent to a
larger community audience.  I have fixed a problem with the ia64 version of
build_sched_domains(), but a similar fix still needs to be made to the
generic build_sched_domains() in kernel/sched.c.

The "dynamic sched domains" functionality has recently been merged into
2.6.13-rcN.  It responds to the dynamic declaration of a cpu-exclusive
(a.k.a. "isolated") cpuset by rebuilding the CPU scheduler's sched domains
and sched groups, separating the CPUs in the cpu-exclusive cpuset from the
remaining non-isolated CPUs.  This allows the non-isolated CPUs to ignore
the isolated CPUs entirely when load-balancing.

Unfortunately, build_sched_domains() expects that a sched domain will
include all the CPUs of each node in the domain, i.e., that no node will
belong to both an isolated cpuset and a non-isolated cpuset.  Declaring a
cpuset that violates this assumption produces flawed data structures and
will oops the kernel.

To trigger the problem (on a NUMA system with more than one CPU per node):

	cd /dev/cpuset
	mkdir newcpuset
	cd newcpuset
	echo 0 >cpus
	echo 0 >mems
	echo 1 >cpu_exclusive

I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node).
A similar shortcoming exists in the generic build_sched_domains() (in
kernel/sched.c) for NUMA, and that needs to be fixed as well.

The fix involves dynamically allocating sched_group_nodes[] and
sched_group_allnodes[] for each invocation of build_sched_domains(), rather
than using global arrays for these structures.  Care must be taken to
remember the kmalloc() addresses so that arch_destroy_sched_domains() can
properly kfree() the new dynamic structures.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

commit f68f447e83
parent 38f1852759
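
The core of the fix is a bookkeeping pattern: each call to
build_sched_domains() kmalloc()s its group tables and records the pointers
in arrays indexed by the first CPU of the cpu_map, so that a later
arch_destroy_sched_domains() over the same map can find and kfree() exactly
those allocations.  Below is a minimal userspace sketch of that pattern;
the names, sizes, and calloc()/free() calls are illustrative stand-ins,
not the kernel API:

/*
 * Userspace sketch (not kernel code) of the per-first-cpu bookkeeping
 * introduced by this patch.  build() allocates a per-node table and
 * remembers the pointer keyed by the first CPU of the map; destroy()
 * looks the pointer up again and frees it.  NR_CPUS, MAX_NUMNODES and
 * the function names are illustrative assumptions.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS      8
#define MAX_NUMNODES 4

struct sched_group { struct sched_group *next; };

/* One slot per CPU; only the slot of a map's first CPU is used. */
static struct sched_group **group_nodes_bycpu[NR_CPUS];

static int build(int first_cpu)
{
        struct sched_group **nodes;

        nodes = calloc(MAX_NUMNODES, sizeof(*nodes));
        if (!nodes) {
                fprintf(stderr, "Can not alloc sched group node list\n");
                return -1;
        }
        group_nodes_bycpu[first_cpu] = nodes;   /* remember for teardown */
        return 0;
}

static void destroy(int first_cpu)
{
        free(group_nodes_bycpu[first_cpu]);     /* free what build() saved */
        group_nodes_bycpu[first_cpu] = NULL;
}

int main(void)
{
        if (build(0) == 0)      /* e.g. a map whose first CPU is 0 */
                destroy(0);
        return 0;
}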

@@ -120,10 +120,10 @@ static int cpu_to_phys_group(int cpu)
  * gets dynamically allocated.
  */
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group *sched_group_nodes[MAX_NUMNODES];
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
 
 static int cpu_to_allnodes_group(int cpu)
 {
@@ -138,6 +138,21 @@ static int cpu_to_allnodes_group(int cpu)
 void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	struct sched_group *sched_group_allnodes = NULL;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+				    GFP_ATOMIC);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return;
+	}
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
@@ -150,8 +165,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
-		if (num_online_cpus()
+		if (cpus_weight(*cpu_map)
 				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			if (!sched_group_allnodes) {
+				sched_group_allnodes
+					= kmalloc(sizeof(struct sched_group)
+							* MAX_NUMNODES,
+						  GFP_KERNEL);
+				if (!sched_group_allnodes) {
+					printk(KERN_WARNING
+					"Can not alloc allnodes sched group\n");
+					break;
+				}
+				sched_group_allnodes_bycpu[i]
+						= sched_group_allnodes;
+			}
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
 			sd->span = *cpu_map;
@@ -214,8 +242,9 @@ void build_sched_domains(const cpumask_t *cpu_map)
 	}
 
 #ifdef CONFIG_NUMA
-	init_sched_build_groups(sched_group_allnodes, *cpu_map,
-				&cpu_to_allnodes_group);
+	if (sched_group_allnodes)
+		init_sched_build_groups(sched_group_allnodes, *cpu_map,
+					&cpu_to_allnodes_group);
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		/* Set up node groups */
@@ -226,8 +255,10 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		int j;
 
 		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
 			continue;
+		}
 
 		domainspan = sched_domain_node_span(i);
 		cpus_and(domainspan, domainspan, *cpu_map);
@@ -372,25 +403,42 @@ void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
 #ifdef CONFIG_NUMA
 	int i;
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t nodemask = node_to_cpumask(i);
-		struct sched_group *oldsg, *sg = sched_group_nodes[i];
+	int cpu;
 
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
-			continue;
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
 
-		if (sg == NULL)
-			continue;
-		sg = sg->next;
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
+			continue;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
 next_sg:
-		oldsg = sg;
-		sg = sg->next;
-		kfree(oldsg);
-		if (oldsg != sched_group_nodes[i])
-			goto next_sg;
-		sched_group_nodes[i] = NULL;
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+			sched_group_nodes[i] = NULL;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
 	}
 #endif
 }
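
In arch_destroy_sched_domains(), each node's sched groups form a circular
singly-linked list, and the next_sg loop frees members until the walk
returns to the list head.  A small userspace sketch of that walk, under
the same hedges as above (illustrative types and names, free() standing
in for kfree()):

#include <stdlib.h>

struct sched_group { struct sched_group *next; };

/*
 * Free a circular singly-linked ring of groups, mirroring the patch's
 * next_sg loop: start at head->next and keep freeing until the element
 * just freed was the head itself, i.e. the walk has wrapped around.
 */
static void free_group_ring(struct sched_group *head)
{
        struct sched_group *sg, *oldsg;

        if (head == NULL)
                return;
        sg = head->next;
        do {
                oldsg = sg;
                sg = sg->next;
                free(oldsg);
        } while (oldsg != head);
}

int main(void)
{
        struct sched_group *a = malloc(sizeof(*a));
        struct sched_group *b = malloc(sizeof(*b));

        a->next = b;            /* two groups linked into a ring */
        b->next = a;
        free_group_ring(a);     /* frees b, then a, then stops */
        return 0;
}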