Commit c183e253 authored by Jesse Barnes, committed by Linus Torvalds

[PATCH] sched: limit cpuspan of node scheduler domains

  This patch limits the cpu span of each node's scheduler domain to prevent
  balancing across too many cpus.  The cpus included in a node's domain are
  determined by the SD_NODES_PER_DOMAIN define and the arch specific
  sched_domain_node_span routine if ARCH_HAS_SCHED_DOMAIN is defined.  If
  ARCH_HAS_SCHED_DOMAIN is not defined, behavior is unchanged: all possible
  cpus will be included in each node's scheduling domain.  Currently, only
  ia64 provides an arch specific sched_domain_node_span routine.
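  As a standalone illustration of that compile-time switch (an editor's
  sketch, not kernel code; node_span_source() is an invented stand-in for
  the real routines), building with -DARCH_HAS_SCHED_DOMAIN selects the
  arch path, while building without it selects the generic fallback:

      #include <stdio.h>

      #ifdef ARCH_HAS_SCHED_DOMAIN
      /* arch supplies sched_domain_node_span() (only ia64 at this point) */
      static const char *node_span_source(void)
      {
              return "arch sched_domain_node_span(): nearby nodes only";
      }
      #else
      /* generic fallback: each node's domain spans all possible cpus */
      static const char *node_span_source(void)
      {
              return "cpu_possible_map: all cpus";
      }
      #endif

      int main(void)
      {
              printf("node domain span comes from: %s\n", node_span_source());
              return 0;
      }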

From: Jesse Barnes <jbarnes@engr.sgi.com>

  This patch adds some more NUMA specific logic to the creation of scheduler
  domains.  Domains spanning all CPUs in a large system are too large to
  schedule across efficiently, leading to livelocks and inordinate amounts of
  time being spent in scheduler routines.  With this patch applied, the node
  scheduling domains for NUMA platforms will only contain a specified number
  of nearby CPUs, based on the value of SD_NODES_PER_DOMAIN.  It also allows
  arches to override SD_NODE_INIT, which sets the domain scheduling parameters
  for each node's domain.  This is especially necessary for large systems.
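  To make the span construction concrete, here is a small userspace
  simulation of the greedy nearest-node walk performed by the new
  find_next_best_node()/sched_domain_node_span() pair in the kernel/sched.c
  hunk below.  The 4-node distance table and the cpus-per-node layout are
  invented for illustration, and a 64-bit mask stands in for cpumask_t:

      #include <stdio.h>
      #include <stdint.h>
      #include <limits.h>

      #define NR_NODES         4
      #define CPUS_PER_NODE    4   /* invented: node n owns cpus 4n..4n+3 */
      #define NODES_PER_DOMAIN 2   /* the patch's kernel default is 4 */

      /* Made-up SLIT-style distance table (smaller = closer) */
      static const int dist[NR_NODES][NR_NODES] = {
              { 10, 20, 40, 60 },
              { 20, 10, 20, 40 },
              { 40, 20, 10, 20 },
              { 60, 40, 20, 10 },
      };

      /* Mirrors find_next_best_node(): closest node not yet used */
      static int next_best_node(int node, int *used)
      {
              int i, n, val, min_val = INT_MAX, best = 0;

              for (i = 0; i < NR_NODES; i++) {
                      n = (node + i) % NR_NODES;  /* start scan at @node */
                      if (used[n])
                              continue;
                      val = dist[node][n];        /* simple min search */
                      if (val < min_val) {
                              min_val = val;
                              best = n;
                      }
              }
              used[best] = 1;
              return best;
      }

      /* Mirrors sched_domain_node_span(): union the chosen nodes' cpus */
      static uint64_t node_span(int node, int size)
      {
              int used[NR_NODES] = { 0 };
              uint64_t span = 0;
              int i;

              for (i = 0; i < size; i++)
                      span |= (uint64_t)0xf
                              << (next_best_node(node, used) * CPUS_PER_NODE);
              return span;
      }

      int main(void)
      {
              printf("node 2 span: 0x%llx\n",
                     (unsigned long long)node_span(2, NODES_PER_DOMAIN));
              return 0;
      }

  With these distances the program prints 0xff00: node 2's domain covers
  only cpus 8-15 (nodes 2 and 3) rather than all 16 cpus, which is exactly
  the kind of containment the patch is after.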

  Possible future directions:

  o multilevel node hierarchy (e.g.  node domains could contain 4 nodes'
    worth of CPUs, supernode domains could contain 32 nodes' worth, etc.,
    each level with its own SD_NODE_INIT values)

  o more tweaking of SD_NODE_INIT values for good load balancing vs. 
    overhead tradeoffs

From: mita akinobu <amgta@yacht.ocn.ne.jp>

  Compile fix
Signed-off-by: Jesse Barnes <jbarnes@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 8a7a2318
@@ -716,3 +716,4 @@ init_smp_config(void)
                 printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n",
                        ia64_sal_strerror(sal_ret));
 }
@@ -334,6 +334,26 @@ struct task_struct;
 /* Prepare to copy thread state - unlazy all lazy status */
 #define prepare_to_copy(tsk) do { } while (0)
 
+#ifdef CONFIG_NUMA
+#define SD_NODE_INIT (struct sched_domain) {    \
+        .span               = CPU_MASK_NONE,    \
+        .parent             = NULL,             \
+        .groups             = NULL,             \
+        .min_interval       = 80,               \
+        .max_interval       = 320,              \
+        .busy_factor        = 320,              \
+        .imbalance_pct      = 125,              \
+        .cache_hot_time     = (10*1000000),     \
+        .cache_nice_tries   = 1,                \
+        .per_cpu_gain       = 100,              \
+        .flags              = SD_BALANCE_EXEC   \
+                              | SD_WAKE_BALANCE,\
+        .last_balance       = jiffies,          \
+        .balance_interval   = 10,               \
+        .nr_balance_failed  = 0,                \
+}
+#endif
+
 /*
  * This is the mechanism for creating a new kernel thread.
  *
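An editor's gloss on the values just added, with units taken from the
2.6-era struct sched_domain field definitions (this annotation is not part
of the patch):

      /*
       * .min_interval / .max_interval = 80 / 320: clamp on the load
       *     balance interval, in ms; node domains rebalance far less
       *     often than cpu-level domains.
       * .busy_factor = 320: the interval is scaled up by this factor
       *     while the cpu is busy, so busy nodes rebalance rarely.
       * .imbalance_pct = 125: balance only when one group's load exceeds
       *     another's by more than 25%.
       * .cache_hot_time = 10*1000000 ns = 10 ms: a task that ran within
       *     the last 10 ms is considered cache hot and is not migrated.
       * .flags = SD_BALANCE_EXEC | SD_WAKE_BALANCE: balance at exec()
       *     and at wakeup; note there is no SD_BALANCE_NEWIDLE at this
       *     level.
       */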
@@ -1783,10 +1783,8 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
         for_each_domain(busiest_cpu, sd)
                 if (cpu_isset(busiest->push_cpu, sd->span))
                         break;
-        if (!sd) {
-                WARN_ON(1);
+        if (!sd)
                 return;
-        }
 
         group = sd->groups;
         while (!cpu_isset(busiest_cpu, group->cpumask))
@@ -3656,9 +3654,73 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
         unlock_cpu_hotplug();
 }
 
 #ifdef ARCH_HAS_SCHED_DOMAIN
 extern void __init arch_init_sched_domains(void);
 #else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+        int i, n, val, min_val, best_node = 0;
+
+        min_val = INT_MAX;
+
+        for (i = 0; i < numnodes; i++) {
+                /* Start at @node */
+                n = (node + i) % numnodes;
+
+                /* Skip already used nodes */
+                if (test_bit(n, used_nodes))
+                        continue;
+
+                /* Simple min distance search */
+                val = node_distance(node, n);
+
+                if (val < min_val) {
+                        min_val = val;
+                        best_node = n;
+                }
+        }
+
+        set_bit(best_node, used_nodes);
+        return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+cpumask_t __init sched_domain_node_span(int node, int size)
+{
+        int i;
+        cpumask_t span;
+        DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+        cpus_clear(span);
+        bitmap_zero(used_nodes, MAX_NUMNODES);
+
+        for (i = 0; i < size; i++) {
+                int next_node = find_next_best_node(node, used_nodes);
+                cpumask_t nodemask;
+
+                nodemask = node_to_cpumask(next_node);
+                cpus_or(span, span, nodemask);
+        }
+
+        return span;
+}
+#endif /* CONFIG_NUMA */
+
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
@@ -3681,6 +3743,10 @@ __init static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
+
+/* Number of nearby nodes in a node's scheduling domain */
+#define SD_NODES_PER_DOMAIN 4
+
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group sched_group_nodes[MAX_NUMNODES];
 __init static int cpu_to_node_group(int cpu)
@@ -3748,7 +3814,8 @@ __init static void arch_init_sched_domains(void)
                 sd = &per_cpu(node_domains, i);
                 group = cpu_to_node_group(i);
                 *sd = SD_NODE_INIT;
-                sd->span = cpu_possible_map;
+                /* FIXME: should be multilevel, in arch code */
+                sd->span = sched_domain_node_span(i, SD_NODES_PER_DOMAIN);
                 sd->groups = &sched_group_nodes[group];
 #endif
@@ -3835,7 +3902,6 @@ __init static void arch_init_sched_domains(void)
                 cpu_attach_domain(sd, i);
         }
 }
-
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
 #define SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG