Commit dce840a0 authored by Peter Zijlstra, committed by Ingo Molnar

sched: Dynamically allocate sched_domain/sched_group data-structures

Instead of relying on static allocations for the sched_domain and
sched_group trees, dynamically allocate and RCU free them.

Allocating this dynamically also allows for some build_sched_groups()
simplification since we can now (like with other simplifications) rely
on the sched_domain tree instead of hard-coded knowledge.
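
For reference, the destruction side follows the usual call_rcu() deferral
pattern. A minimal sketch (simplified; the real code also has to drop the
sched_group references it holds):

    static void free_sched_domain(struct rcu_head *rcu)
    {
            struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

            /* Sketch only: the real callback also puts the group refs. */
            kfree(sd);
    }

    static void destroy_sched_domain(struct sched_domain *sd)
    {
            /*
             * Readers may still be walking the domain tree under
             * rcu_read_lock(); defer the kfree() past a grace period.
             */
            call_rcu(&sd->rcu, free_sched_domain);
    }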

One tricky thing to note is that detach_destroy_domains() needs to hold
rcu_read_lock() over the entire tear-down; doing it per-cpu is not
sufficient, since that can lead to partial sched_group existence (this
could possibly be solved by doing the tear-down backwards, but holding
the lock over the whole thing is much more robust).
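
Schematically, the tear-down then looks like the following (a sketch under
the above constraint, not necessarily the exact code in the patch):

    static void detach_destroy_domains(const struct cpumask *cpu_map)
    {
            int i;

            /* One read-side critical section around the whole detach. */
            rcu_read_lock();
            for_each_cpu(i, cpu_map)
                    cpu_attach_domain(NULL, &def_root_domain, i);
            rcu_read_unlock();
    }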

A consequence of the above is that we can no longer print the
sched_domain debug stuff from cpu_attach_domain(), since that might now
run with preemption disabled (due to classic RCU etc.) and
sched_domain_debug() does some GFP_KERNEL allocations.
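
For illustration only (this snippet is not from the patch), the problematic
shape is:

    /*
     * Anti-pattern: with classic RCU, rcu_read_lock() disables preemption,
     * while GFP_KERNEL allocations may sleep, so the allocation below can
     * trigger a "sleeping function called from invalid context" splat.
     * sched_domain_debug() does such GFP_KERNEL allocations, hence it must
     * not be called from this context anymore.
     */
    void *buf;

    rcu_read_lock();
    buf = kmalloc(PAGE_SIZE, GFP_KERNEL);   /* may sleep: not allowed here */
    rcu_read_unlock();
    kfree(buf);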

Another thing to note is that we now fully rely on normal RCU and not
RCU-sched. This is because, with the new and existing RCU flavours we
have grown over the years, BH does not necessarily hold off RCU-sched
grace periods (-rt is known to break this). This would in fact already
cause us grief, since we do sched_domain/sched_group iterations from
softirq context.
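
The reader side therefore wraps every walk of the domain tree in a plain
RCU read-side critical section, as the hunks below do; schematically:

    struct sched_domain *sd;

    rcu_read_lock();
    for_each_domain(cpu, sd) {
            /* inspect sd->flags, sched_domain_span(sd), sd->groups, ... */
    }
    rcu_read_unlock();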

This patch is somewhat larger than I would like it to be, but I didn't
find any means of shrinking/splitting this.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent a9c9a9b6
@@ -868,6 +868,7 @@ static inline int sd_power_saving_flags(void)
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
+	atomic_t ref;
 
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -973,6 +974,10 @@ struct sched_domain {
 #ifdef CONFIG_SCHED_DEBUG
 	char *name;
 #endif
+	union {
+		void *private;		/* used during construction */
+		struct rcu_head rcu;	/* used during destruction */
+	};
 
 	unsigned int span_weight;
 	/*
This diff is collapsed.
@@ -1622,6 +1622,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
+	rcu_read_lock();
 	for_each_domain(target, sd) {
 		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
 			break;
@@ -1641,6 +1642,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
 			break;
 	}
+	rcu_read_unlock();
 
 	return target;
 }
@@ -1673,6 +1675,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		new_cpu = prev_cpu;
 	}
 
+	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
@@ -1723,9 +1726,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-			return select_idle_sibling(p, cpu);
-		else
-			return select_idle_sibling(p, prev_cpu);
+			prev_cpu = cpu;
+
+		new_cpu = select_idle_sibling(p, prev_cpu);
+		goto unlock;
 	}
 
 	while (sd) {
@@ -1766,6 +1770,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		}
 		/* while loop will break here if sd == NULL */
 	}
+unlock:
+	rcu_read_unlock();
 
 	return new_cpu;
 }
@@ -3462,6 +3468,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	raw_spin_unlock(&this_rq->lock);
 
 	update_shares(this_cpu);
+	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3483,6 +3490,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 			break;
 		}
 	}
+	rcu_read_unlock();
 
 	raw_spin_lock(&this_rq->lock);
@@ -3531,6 +3539,7 @@ static int active_load_balance_cpu_stop(void *data)
 	double_lock_balance(busiest_rq, target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
+	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3546,6 +3555,7 @@ static int active_load_balance_cpu_stop(void *data)
 		else
 			schedstat_inc(sd, alb_failed);
 	}
+	rcu_read_unlock();
 	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
@@ -3672,6 +3682,7 @@ static int find_new_ilb(int cpu)
 {
 	struct sched_domain *sd;
 	struct sched_group *ilb_group;
+	int ilb = nr_cpu_ids;
 
 	/*
 	 * Have idle load balancer selection from semi-idle packages only
@@ -3687,20 +3698,25 @@ static int find_new_ilb(int cpu)
 	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
+	rcu_read_lock();
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
 		ilb_group = sd->groups;
 
 		do {
-			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.grp_idle_mask);
+			if (is_semi_idle_group(ilb_group)) {
+				ilb = cpumask_first(nohz.grp_idle_mask);
+				goto unlock;
+			}
 
 			ilb_group = ilb_group->next;
 		} while (ilb_group != sd->groups);
 	}
+unlock:
+	rcu_read_unlock();
 out_done:
-	return nr_cpu_ids;
+	return ilb;
 }
 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
@@ -3845,6 +3861,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	update_shares(cpu);
+	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
@@ -3890,6 +3907,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (!balance)
 			break;
 	}
+	rcu_read_unlock();
 
 	/*
 	 * next_balance will be updated only when there is a need.