Commit c183e253 authored by Jesse Barnes, committed by Linus Torvalds

[PATCH] sched: limit cpuspan of node scheduler domains

  This patch limits the cpu span of each node's scheduler domain to prevent
  balancing across too many cpus.  The cpus included in a node's domain are
  determined by the SD_NODES_PER_DOMAIN define and the arch specific
  sched_domain_node_span routine if ARCH_HAS_SCHED_DOMAIN is defined.  If
  ARCH_HAS_SCHED_DOMAIN is not defined, behavior is unchanged: all possible
  cpus will be included in each node's scheduling domain.  Currently, only
  ia64 provides an arch specific sched_domain_node_span routine.
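  As a standalone illustration of that compile-time switch (an editor's
  sketch, not kernel code; node_span_source() is an invented stand-in for
  the real routines), building with -DARCH_HAS_SCHED_DOMAIN selects the
  arch path, while building without it selects the generic fallback:

      #include <stdio.h>

      #ifdef ARCH_HAS_SCHED_DOMAIN
      /* arch supplies sched_domain_node_span() (only ia64 at this point) */
      static const char *node_span_source(void)
      {
              return "arch sched_domain_node_span(): nearby nodes only";
      }
      #else
      /* generic fallback: each node's domain spans all possible cpus */
      static const char *node_span_source(void)
      {
              return "cpu_possible_map: all cpus";
      }
      #endif

      int main(void)
      {
              printf("node domain span comes from: %s\n", node_span_source());
              return 0;
      }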

From: Jesse Barnes <jbarnes@engr.sgi.com>

  This patch adds some more NUMA specific logic to the creation of scheduler
  domains.  Domains spanning all CPUs in a large system are too large to
  schedule across efficiently, leading to livelocks and inordinate amounts of
  time being spent in scheduler routines.  With this patch applied, the node
  scheduling domains for NUMA platforms will only contain a specified number
  of nearby CPUs, based on the value of SD_NODES_PER_DOMAIN.  It also allows
  arches to override SD_NODE_INIT, which sets the domain scheduling parameters
  for each node's domain.  This is especially necessary for large systems.
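  To make the span construction concrete, here is a small userspace
  simulation of the greedy nearest-node walk performed by the new
  find_next_best_node()/sched_domain_node_span() pair in the kernel/sched.c
  hunk below.  The 4-node distance table and the cpus-per-node layout are
  invented for illustration, and a 64-bit mask stands in for cpumask_t:

      #include <stdio.h>
      #include <stdint.h>
      #include <limits.h>

      #define NR_NODES         4
      #define CPUS_PER_NODE    4   /* invented: node n owns cpus 4n..4n+3 */
      #define NODES_PER_DOMAIN 2   /* the patch's kernel default is 4 */

      /* Made-up SLIT-style distance table (smaller = closer) */
      static const int dist[NR_NODES][NR_NODES] = {
              { 10, 20, 40, 60 },
              { 20, 10, 20, 40 },
              { 40, 20, 10, 20 },
              { 60, 40, 20, 10 },
      };

      /* Mirrors find_next_best_node(): closest node not yet used */
      static int next_best_node(int node, int *used)
      {
              int i, n, val, min_val = INT_MAX, best = 0;

              for (i = 0; i < NR_NODES; i++) {
                      n = (node + i) % NR_NODES;  /* start scan at @node */
                      if (used[n])
                              continue;
                      val = dist[node][n];        /* simple min search */
                      if (val < min_val) {
                              min_val = val;
                              best = n;
                      }
              }
              used[best] = 1;
              return best;
      }

      /* Mirrors sched_domain_node_span(): union the chosen nodes' cpus */
      static uint64_t node_span(int node, int size)
      {
              int used[NR_NODES] = { 0 };
              uint64_t span = 0;
              int i;

              for (i = 0; i < size; i++)
                      span |= (uint64_t)0xf
                              << (next_best_node(node, used) * CPUS_PER_NODE);
              return span;
      }

      int main(void)
      {
              printf("node 2 span: 0x%llx\n",
                     (unsigned long long)node_span(2, NODES_PER_DOMAIN));
              return 0;
      }

  With these distances the program prints 0xff00: node 2's domain covers
  only cpus 8-15 (nodes 2 and 3) rather than all 16 cpus, which is exactly
  the kind of containment the patch is after.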

  Possible future directions:

  o multilevel node hierarchy (e.g.  node domains could contain 4 nodes'
    worth of CPUs, supernode domains could contain 32 nodes' worth, etc.,
    each level with its own SD_NODE_INIT values)

  o more tweaking of SD_NODE_INIT values for good load balancing vs. 
    overhead tradeoffs

From: mita akinobu <amgta@yacht.ocn.ne.jp>

  Compile fix
Signed-off-by: Jesse Barnes <jbarnes@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 8a7a2318
@@ -716,3 +716,4 @@ init_smp_config(void)
                 printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n",
                        ia64_sal_strerror(sal_ret));
 }
@@ -334,6 +334,26 @@ struct task_struct;
 /* Prepare to copy thread state - unlazy all lazy status */
 #define prepare_to_copy(tsk) do { } while (0)
 
+#ifdef CONFIG_NUMA
+#define SD_NODE_INIT (struct sched_domain) {    \
+        .span               = CPU_MASK_NONE,    \
+        .parent             = NULL,             \
+        .groups             = NULL,             \
+        .min_interval       = 80,               \
+        .max_interval       = 320,              \
+        .busy_factor        = 320,              \
+        .imbalance_pct      = 125,              \
+        .cache_hot_time     = (10*1000000),     \
+        .cache_nice_tries   = 1,                \
+        .per_cpu_gain       = 100,              \
+        .flags              = SD_BALANCE_EXEC   \
+                              | SD_WAKE_BALANCE,\
+        .last_balance       = jiffies,          \
+        .balance_interval   = 10,               \
+        .nr_balance_failed  = 0,                \
+}
+#endif
+
 /*
  * This is the mechanism for creating a new kernel thread.
  *
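An editor's gloss on the values just added, with units taken from the
2.6-era struct sched_domain field definitions (this annotation is not part
of the patch):

      /*
       * .min_interval / .max_interval = 80 / 320: clamp on the load
       *     balance interval, in ms; node domains rebalance far less
       *     often than cpu-level domains.
       * .busy_factor = 320: the interval is scaled up by this factor
       *     while the cpu is busy, so busy nodes rebalance rarely.
       * .imbalance_pct = 125: balance only when one group's load exceeds
       *     another's by more than 25%.
       * .cache_hot_time = 10*1000000 ns = 10 ms: a task that ran within
       *     the last 10 ms is considered cache hot and is not migrated.
       * .flags = SD_BALANCE_EXEC | SD_WAKE_BALANCE: balance at exec()
       *     and at wakeup; note there is no SD_BALANCE_NEWIDLE at this
       *     level.
       */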
@@ -1783,10 +1783,8 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
         for_each_domain(busiest_cpu, sd)
                 if (cpu_isset(busiest->push_cpu, sd->span))
                         break;
-        if (!sd) {
-                WARN_ON(1);
+        if (!sd)
                 return;
-        }
 
         group = sd->groups;
         while (!cpu_isset(busiest_cpu, group->cpumask))
@@ -3656,9 +3654,73 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
         unlock_cpu_hotplug();
 }
 
 #ifdef ARCH_HAS_SCHED_DOMAIN
 extern void __init arch_init_sched_domains(void);
 #else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+        int i, n, val, min_val, best_node = 0;
+
+        min_val = INT_MAX;
+
+        for (i = 0; i < numnodes; i++) {
+                /* Start at @node */
+                n = (node + i) % numnodes;
+
+                /* Skip already used nodes */
+                if (test_bit(n, used_nodes))
+                        continue;
+
+                /* Simple min distance search */
+                val = node_distance(node, n);
+
+                if (val < min_val) {
+                        min_val = val;
+                        best_node = n;
+                }
+        }
+
+        set_bit(best_node, used_nodes);
+        return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+cpumask_t __init sched_domain_node_span(int node, int size)
+{
+        int i;
+        cpumask_t span;
+        DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+        cpus_clear(span);
+        bitmap_zero(used_nodes, MAX_NUMNODES);
+
+        for (i = 0; i < size; i++) {
+                int next_node = find_next_best_node(node, used_nodes);
+                cpumask_t nodemask;
+
+                nodemask = node_to_cpumask(next_node);
+                cpus_or(span, span, nodemask);
+        }
+
+        return span;
+}
+#endif /* CONFIG_NUMA */
+
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
@@ -3681,6 +3743,10 @@ __init static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
+
+/* Number of nearby nodes in a node's scheduling domain */
+#define SD_NODES_PER_DOMAIN 4
+
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group sched_group_nodes[MAX_NUMNODES];
 __init static int cpu_to_node_group(int cpu)
@@ -3748,7 +3814,8 @@ __init static void arch_init_sched_domains(void)
                 sd = &per_cpu(node_domains, i);
                 group = cpu_to_node_group(i);
                 *sd = SD_NODE_INIT;
-                sd->span = cpu_possible_map;
+                /* FIXME: should be multilevel, in arch code */
+                sd->span = sched_domain_node_span(i, SD_NODES_PER_DOMAIN);
                 sd->groups = &sched_group_nodes[group];
 #endif
@@ -3835,7 +3902,6 @@ __init static void arch_init_sched_domains(void)
                 cpu_attach_domain(sd, i);
         }
 }
-
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
 #define SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG