Commit 66fef08f authored by Ingo Molnar's avatar Ingo Molnar

Merge branch 'sched/balancing' into sched/core

parents b6d98422 b7bb4c9b
...@@ -3189,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, ...@@ -3189,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
return 0; return 0;
} }
/********** Helpers for find_busiest_group ************************/
/**
* sd_lb_stats - Structure to store the statistics of a sched_domain
* during load balancing.
*/
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *this; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_pwr; /* Total power of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
/* Statistics of the local group (the group containing this_cpu) */
unsigned long this_load; /* avg_load of the local group */
/*
 * Holds the local group's summed weighted load when first filled in;
 * fix_small_imbalance() later divides it by this_nr_running to get an
 * actual per-task value.
 */
unsigned long this_load_per_task;
unsigned long this_nr_running; /* Nr tasks running in the local group */
/* Statistics of the busiest group */
unsigned long max_load; /* avg_load of the busiest group */
/*
 * Holds the busiest group's summed weighted load when first filled in;
 * find_busiest_group() later divides it by busiest_nr_running.
 */
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running; /* Nr tasks running in the busiest group */
int group_imb; /* group_imb flag copied from the busiest group's stats */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int power_savings_balance; /* Is powersave balance needed for this sd */
struct sched_group *group_min; /* Least loaded group in sd */
struct sched_group *group_leader; /* Group which relieves group_min */
unsigned long min_load_per_task; /* load_per_task in group_min */
unsigned long leader_nr_running; /* Nr running of group_leader */
unsigned long min_nr_running; /* Nr running of group_min */
#endif
};
/* /**
* find_busiest_group finds and returns the busiest CPU group within the * sg_lb_stats - stats of a sched_group required for load_balancing
* domain. It calculates and returns the amount of weighted load which */
* should be moved to restore balance via the imbalance parameter. struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
int group_imb; /* Is there an imbalance in the group ? */
};
/**
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
* @group: The group whose first cpu is to be returned.
*/ */
static struct sched_group * static inline unsigned int group_first_cpu(struct sched_group *group)
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
int *sd_idle, const struct cpumask *cpus, int *balance)
{ {
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; return cpumask_first(sched_group_cpus(group));
unsigned long max_load, avg_load, total_load, this_load, total_pwr; }
unsigned long max_pull;
unsigned long busiest_load_per_task, busiest_nr_running;
unsigned long this_load_per_task, this_nr_running;
int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int power_savings_balance = 1;
unsigned long leader_nr_running = 0, min_load_per_task = 0;
unsigned long min_nr_running = ULONG_MAX;
struct sched_group *group_min = NULL, *group_leader = NULL;
#endif
max_load = this_load = total_load = total_pwr = 0; /**
busiest_load_per_task = busiest_nr_running = 0; * get_sd_load_idx - Obtain the load index for a given sched domain.
this_load_per_task = this_nr_running = 0; * @sd: The sched_domain whose load_idx is to be obtained.
* @idle: The idle status of the CPU for which the sd load_idx is obtained.
*/
static inline int get_sd_load_idx(struct sched_domain *sd,
enum cpu_idle_type idle)
{
int load_idx;
if (idle == CPU_NOT_IDLE) switch (idle) {
case CPU_NOT_IDLE:
load_idx = sd->busy_idx; load_idx = sd->busy_idx;
else if (idle == CPU_NEWLY_IDLE) break;
case CPU_NEWLY_IDLE:
load_idx = sd->newidle_idx; load_idx = sd->newidle_idx;
else break;
default:
load_idx = sd->idle_idx; load_idx = sd->idle_idx;
break;
}
do { return load_idx;
unsigned long load, group_capacity, max_cpu_load, min_cpu_load; }
int local_group;
int i;
int __group_imb = 0;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long sum_nr_running, sum_weighted_load;
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
if (local_group) #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
balance_cpu = cpumask_first(sched_group_cpus(group)); /**
* init_sd_power_savings_stats - Initialize power savings statistics for
* the given sched_domain, during load balancing.
*
* @sd: Sched domain whose power-savings statistics are to be initialized.
* @sds: Variable containing the statistics for sd.
* @idle: Idle status of the CPU at which we're performing load-balancing.
*/
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
	struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
	/*
	 * Busy processors will not participate in power savings balance,
	 * and neither will domains that lack SD_POWERSAVINGS_BALANCE.
	 */
	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) {
		sds->power_savings_balance = 0;
		return;
	}

	/*
	 * Power-savings balance is possible: seed the running counts so
	 * that the first eligible group seen replaces them.
	 */
	sds->power_savings_balance = 1;
	sds->min_nr_running = ULONG_MAX;
	sds->leader_nr_running = 0;
}
/* Tally up the load of all CPUs in the group */ /**
sum_weighted_load = sum_nr_running = avg_load = 0; * update_sd_power_savings_stats - Update the power saving stats for a
sum_avg_load_per_task = avg_load_per_task = 0; * sched_domain while performing load balancing.
*
* @group: sched_group belonging to the sched_domain under consideration.
* @sds: Variable containing the statistics of the sched_domain
* @local_group: Does group contain the CPU for which we're performing
* load balancing ?
* @sgs: Variable containing the statistics of the group.
*/
static inline void update_sd_power_savings_stats(struct sched_group *group,
struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
max_cpu_load = 0; if (!sds->power_savings_balance)
min_cpu_load = ~0UL; return;
for_each_cpu_and(i, sched_group_cpus(group), cpus) { /*
struct rq *rq = cpu_rq(i); * If the local group is idle or completely loaded
* no need to do power savings balance at this domain
*/
if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
!sds->this_nr_running))
sds->power_savings_balance = 0;
if (*sd_idle && rq->nr_running) /*
*sd_idle = 0; * If a group is already running at full capacity or idle,
* don't include that group in power savings calculations
*/
if (!sds->power_savings_balance ||
sgs->sum_nr_running >= sgs->group_capacity ||
!sgs->sum_nr_running)
return;
/* Bias balancing toward cpus of our domain */ /*
if (local_group) { * Calculate the group which has the least non-idle load.
if (idle_cpu(i) && !first_idle_cpu) { * This is the group from where we need to pick up the load
first_idle_cpu = 1; * for saving power
balance_cpu = i; */
} if ((sgs->sum_nr_running < sds->min_nr_running) ||
(sgs->sum_nr_running == sds->min_nr_running &&
group_first_cpu(group) > group_first_cpu(sds->group_min))) {
sds->group_min = group;
sds->min_nr_running = sgs->sum_nr_running;
sds->min_load_per_task = sgs->sum_weighted_load /
sgs->sum_nr_running;
}
load = target_load(i, load_idx); /*
} else { * Calculate the group which is almost near its
load = source_load(i, load_idx); * capacity but still has some space to pick up some load
if (load > max_cpu_load) * from other group and save more power
max_cpu_load = load; */
if (min_cpu_load > load) if (sgs->sum_nr_running > sgs->group_capacity - 1)
min_cpu_load = load; return;
}
avg_load += load; if (sgs->sum_nr_running > sds->leader_nr_running ||
sum_nr_running += rq->nr_running; (sgs->sum_nr_running == sds->leader_nr_running &&
sum_weighted_load += weighted_cpuload(i); group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
sds->group_leader = group;
sds->leader_nr_running = sgs->sum_nr_running;
}
}
sum_avg_load_per_task += cpu_avg_load_per_task(i); /**
} * check_power_save_busiest_group - Check if we have potential to perform
* some power-savings balance. If yes, set the busiest group to be
* the least loaded group in the sched_domain, so that its CPUs can
* be put to idle.
*
* @sds: Variable containing the statistics of the sched_domain
* under consideration.
* @this_cpu: Cpu at which we're currently performing load-balancing.
* @imbalance: Variable to store the imbalance.
*
* Returns 1 if there is potential to perform power-savings balance.
* Else returns 0.
*/
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
int this_cpu, unsigned long *imbalance)
{
if (!sds->power_savings_balance)
return 0;
/* if (sds->this != sds->group_leader ||
* First idle cpu or the first cpu(busiest) in this sched group sds->group_leader == sds->group_min)
* is eligible for doing load balancing at this and above return 0;
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
if (idle != CPU_NEWLY_IDLE && local_group &&
balance_cpu != this_cpu && balance) {
*balance = 0;
goto ret;
}
total_load += avg_load; *imbalance = sds->min_load_per_task;
total_pwr += group->__cpu_power; sds->busiest = sds->group_min;
/* Adjust by relative CPU power of the group */ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
avg_load = sg_div_cpu_power(group, cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
avg_load * SCHED_LOAD_SCALE); group_first_cpu(sds->group_leader);
}
return 1;
/* }
* Consider the group unbalanced when the imbalance is larger #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
* than the average weight of two tasks. static inline void init_sd_power_savings_stats(struct sched_domain *sd,
* struct sd_lb_stats *sds, enum cpu_idle_type idle)
* APZ: with cgroup the avg task weight can vary wildly and {
* might not be a suitable number - should we keep a return;
* normalized nr_running number somewhere that negates }
* the hierarchy?
*/ static inline void update_sd_power_savings_stats(struct sched_group *group,
avg_load_per_task = sg_div_cpu_power(group, struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
sum_avg_load_per_task * SCHED_LOAD_SCALE); {
return;
}
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
int this_cpu, unsigned long *imbalance)
{
return 0;
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @group: sched_group whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @sd_idle: Idle status of the sched_domain containing group.
* @local_group: Does group contain this_cpu.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
enum cpu_idle_type idle, int load_idx, int *sd_idle,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
unsigned long load, max_cpu_load, min_cpu_load;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) if (local_group)
__group_imb = 1; balance_cpu = group_first_cpu(group);
group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; /* Tally up the load of all CPUs in the group */
sum_avg_load_per_task = avg_load_per_task = 0;
max_cpu_load = 0;
min_cpu_load = ~0UL;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
if (*sd_idle && rq->nr_running)
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
if (local_group) { if (local_group) {
this_load = avg_load; if (idle_cpu(i) && !first_idle_cpu) {
this = group; first_idle_cpu = 1;
this_nr_running = sum_nr_running; balance_cpu = i;
this_load_per_task = sum_weighted_load; }
} else if (avg_load > max_load &&
(sum_nr_running > group_capacity || __group_imb)) { load = target_load(i, load_idx);
max_load = avg_load; } else {
busiest = group; load = source_load(i, load_idx);
busiest_nr_running = sum_nr_running; if (load > max_cpu_load)
busiest_load_per_task = sum_weighted_load; max_cpu_load = load;
group_imb = __group_imb; if (min_cpu_load > load)
min_cpu_load = load;
} }
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) sgs->group_load += load;
/* sgs->sum_nr_running += rq->nr_running;
* Busy processors will not participate in power savings sgs->sum_weighted_load += weighted_cpuload(i);
* balance.
*/
if (idle == CPU_NOT_IDLE ||
!(sd->flags & SD_POWERSAVINGS_BALANCE))
goto group_next;
/* sum_avg_load_per_task += cpu_avg_load_per_task(i);
* If the local group is idle or completely loaded }
* no need to do power savings balance at this domain
*/
if (local_group && (this_nr_running >= group_capacity ||
!this_nr_running))
power_savings_balance = 0;
/* /*
* If a group is already running at full capacity or idle, * First idle cpu or the first cpu(busiest) in this sched group
* don't include that group in power savings calculations * is eligible for doing load balancing at this and above
*/ * domains. In the newly idle case, we will allow all the cpu's
if (!power_savings_balance || sum_nr_running >= group_capacity * to do the newly idle load balance.
|| !sum_nr_running) */
goto group_next; if (idle != CPU_NEWLY_IDLE && local_group &&
balance_cpu != this_cpu && balance) {
*balance = 0;
return;
}
/* /* Adjust by relative CPU power of the group */
* Calculate the group which has the least non-idle load. sgs->avg_load = sg_div_cpu_power(group,
* This is the group from where we need to pick up the load sgs->group_load * SCHED_LOAD_SCALE);
* for saving power
*/
if ((sum_nr_running < min_nr_running) ||
(sum_nr_running == min_nr_running &&
cpumask_first(sched_group_cpus(group)) >
cpumask_first(sched_group_cpus(group_min)))) {
group_min = group;
min_nr_running = sum_nr_running;
min_load_per_task = sum_weighted_load /
sum_nr_running;
}
/*
* Calculate the group which is almost near its /*
* capacity but still has some space to pick up some load * Consider the group unbalanced when the imbalance is larger
* from other group and save more power * than the average weight of two tasks.
*/ *
if (sum_nr_running <= group_capacity - 1) { * APZ: with cgroup the avg task weight can vary wildly and
if (sum_nr_running > leader_nr_running || * might not be a suitable number - should we keep a
(sum_nr_running == leader_nr_running && * normalized nr_running number somewhere that negates
cpumask_first(sched_group_cpus(group)) < * the hierarchy?
cpumask_first(sched_group_cpus(group_leader)))) { */
group_leader = group; avg_load_per_task = sg_div_cpu_power(group,
leader_nr_running = sum_nr_running; sum_avg_load_per_task * SCHED_LOAD_SCALE);
}
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
sgs->group_imb = 1;
sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
}
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
* @sd_idle: Idle status of the sched_domain containing group.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
enum cpu_idle_type idle, int *sd_idle,
const struct cpumask *cpus, int *balance,
struct sd_lb_stats *sds)
{
struct sched_group *group = sd->groups;
struct sg_lb_stats sgs;
int load_idx;
init_sd_power_savings_stats(sd, sds, idle);
load_idx = get_sd_load_idx(sd, idle);
do {
int local_group;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
memset(&sgs, 0, sizeof(sgs));
update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs);
if (local_group && balance && !(*balance))
return;
sds->total_load += sgs.group_load;
sds->total_pwr += group->__cpu_power;
if (local_group) {
sds->this_load = sgs.avg_load;
sds->this = group;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
} else if (sgs.avg_load > sds->max_load &&
(sgs.sum_nr_running > sgs.group_capacity ||
sgs.group_imb)) {
sds->max_load = sgs.avg_load;
sds->busiest = group;
sds->busiest_nr_running = sgs.sum_nr_running;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->group_imb = sgs.group_imb;
} }
group_next:
#endif update_sd_power_savings_stats(group, sds, local_group, &sgs);
group = group->next; group = group->next;
} while (group != sd->groups); } while (group != sd->groups);
if (!busiest || this_load >= max_load || busiest_nr_running == 0) }
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
if (this_load >= avg_load || /**
100*max_load <= sd->imbalance_pct*this_load) * fix_small_imbalance - Calculate the minor imbalance that exists
goto out_balanced; * amongst the groups of a sched_domain, during
* load balancing.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
* @this_cpu: The cpu at whose sched_domain we're performing load-balance.
* @imbalance: Variable to store the imbalance.
*/
static inline void fix_small_imbalance(struct sd_lb_stats *sds,
int this_cpu, unsigned long *imbalance)
{
unsigned long tmp, pwr_now = 0, pwr_move = 0;
unsigned int imbn = 2;
if (sds->this_nr_running) {
sds->this_load_per_task /= sds->this_nr_running;
if (sds->busiest_load_per_task >
sds->this_load_per_task)
imbn = 1;
} else
sds->this_load_per_task =
cpu_avg_load_per_task(this_cpu);
busiest_load_per_task /= busiest_nr_running; if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
if (group_imb) sds->busiest_load_per_task * imbn) {
busiest_load_per_task = min(busiest_load_per_task, avg_load); *imbalance = sds->busiest_load_per_task;
return;
}
/* /*
* We're trying to get all the cpus to the average_load, so we don't * OK, we don't have enough imbalance to justify moving tasks,
* want to push ourselves above the average load, nor do we wish to * however we may be able to increase total CPU power used by
* reduce the max loaded cpu below the average load, as either of these * moving them.
* actions would just result in more rebalancing later, and ping-pong
* tasks around. Thus we look for the minimum possible imbalance.
* Negative imbalances (*we* are more loaded than anyone else) will
* be counted as no imbalance for these purposes -- we can't fix that
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/ */
if (max_load <= busiest_load_per_task)
goto out_balanced;
pwr_now += sds->busiest->__cpu_power *
min(sds->busiest_load_per_task, sds->max_load);
pwr_now += sds->this->__cpu_power *
min(sds->this_load_per_task, sds->this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
tmp = sg_div_cpu_power(sds->busiest,
sds->busiest_load_per_task * SCHED_LOAD_SCALE);
if (sds->max_load > tmp)
pwr_move += sds->busiest->__cpu_power *
min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */
if (sds->max_load * sds->busiest->__cpu_power <
sds->busiest_load_per_task * SCHED_LOAD_SCALE)
tmp = sg_div_cpu_power(sds->this,
sds->max_load * sds->busiest->__cpu_power);
else
tmp = sg_div_cpu_power(sds->this,
sds->busiest_load_per_task * SCHED_LOAD_SCALE);
pwr_move += sds->this->__cpu_power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
*imbalance = sds->busiest_load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
* @this_cpu: Cpu for which currently load balance is being performed.
* @imbalance: The variable to store the imbalance.
*/
static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
unsigned long *imbalance)
{
unsigned long max_pull;
/* /*
* In the presence of smp nice balancing, certain scenarios can have * In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below * max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..) * its cpu_power, while calculating max_load..)
*/ */
if (max_load < avg_load) { if (sds->max_load < sds->avg_load) {
*imbalance = 0; *imbalance = 0;
goto small_imbalance; return fix_small_imbalance(sds, this_cpu, imbalance);
} }
/* Don't want to pull so many tasks that a group would go idle */ /* Don't want to pull so many tasks that a group would go idle */
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); max_pull = min(sds->max_load - sds->avg_load,
sds->max_load - sds->busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */ /* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * busiest->__cpu_power, *imbalance = min(max_pull * sds->busiest->__cpu_power,
(avg_load - this_load) * this->__cpu_power) (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
/ SCHED_LOAD_SCALE; / SCHED_LOAD_SCALE;
/* /*
...@@ -3437,78 +3670,110 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, ...@@ -3437,78 +3670,110 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* a think about bumping its value to force at least one task to be * a think about bumping its value to force at least one task to be
* moved * moved
*/ */
if (*imbalance < busiest_load_per_task) { if (*imbalance < sds->busiest_load_per_task)
unsigned long tmp, pwr_now, pwr_move; return fix_small_imbalance(sds, this_cpu, imbalance);
unsigned int imbn;
small_imbalance:
pwr_move = pwr_now = 0;
imbn = 2;
if (this_nr_running) {
this_load_per_task /= this_nr_running;
if (busiest_load_per_task > this_load_per_task)
imbn = 1;
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
if (max_load - this_load + busiest_load_per_task >= }
busiest_load_per_task * imbn) { /******* find_busiest_group() helpers end here *********************/
*imbalance = busiest_load_per_task;
return busiest;
}
/* /**
* OK, we don't have enough imbalance to justify moving tasks, * find_busiest_group - Returns the busiest group within the sched_domain
* however we may be able to increase total CPU power used by * if there is an imbalance. If there isn't an imbalance, and
* moving them. * the user has opted for power-savings, it returns a group whose
*/ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
* such a group exists.
*
* Also calculates the amount of weighted load which should be moved
* to restore balance.
*
* @sd: The sched_domain whose busiest group is to be returned.
* @this_cpu: The cpu for which load balancing is currently being performed.
* @imbalance: Variable which stores amount of weighted load which should
* be moved to restore balance/put a group to idle.
* @idle: The idle status of this_cpu.
* @sd_idle: The idleness of sd
* @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this level.
*
* Returns: - the busiest group if imbalance exists.
* - If no imbalance and user has opted for power-savings balance,
* return the least loaded group whose CPUs can be
* put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
int *sd_idle, const struct cpumask *cpus, int *balance)
{
struct sd_lb_stats sds;
pwr_now += busiest->__cpu_power * memset(&sds, 0, sizeof(sds));
min(busiest_load_per_task, max_load);
pwr_now += this->__cpu_power *
min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
tmp = sg_div_cpu_power(busiest,
busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
tmp = sg_div_cpu_power(this,
max_load * busiest->__cpu_power);
else
tmp = sg_div_cpu_power(this,
busiest_load_per_task * SCHED_LOAD_SCALE);
pwr_move += this->__cpu_power *
min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */ /*
if (pwr_move > pwr_now) * Compute the various statistics relevant for load balancing at
*imbalance = busiest_load_per_task; * this level.
} */
update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
balance, &sds);
/* Cases where imbalance does not exist from POV of this_cpu */
/* 1) this_cpu is not the appropriate cpu to perform load balancing
* at this level.
* 2) There is no busy sibling group to pull from.
* 3) This group is the busiest group.
* 4) This group is more busy than the avg busyness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
* 6) Any rebalance would lead to ping-pong
*/
if (balance && !(*balance))
goto ret;
return busiest; if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
out_balanced: if (sds.this_load >= sds.max_load)
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) goto out_balanced;
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
goto ret;
if (this == group_leader && group_leader != group_min) { sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
*imbalance = min_load_per_task;
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { if (sds.this_load >= sds.avg_load)
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = goto out_balanced;
cpumask_first(sched_group_cpus(group_leader));
} if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
return group_min; goto out_balanced;
}
#endif sds.busiest_load_per_task /= sds.busiest_nr_running;
if (sds.group_imb)
sds.busiest_load_per_task =
min(sds.busiest_load_per_task, sds.avg_load);
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load, as either of these
* actions would just result in more rebalancing later, and ping-pong
* tasks around. Thus we look for the minimum possible imbalance.
* Negative imbalances (*we* are more loaded than anyone else) will
* be counted as no imbalance for these purposes -- we can't fix that
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
if (sds.max_load <= sds.busiest_load_per_task)
goto out_balanced;
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
out_balanced:
/*
* There is no obvious imbalance. But check if we can do some balancing
* to save power.
*/
if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
return sds.busiest;
ret: ret:
*imbalance = 0; *imbalance = 0;
return NULL; return NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment