sched: simplify the group load balancer

While thinking about the previous patch - I realized that using per domain
aggregate load values in load_balance_fair() is wrong. We should use the
load value for that CPU.

By not needing per domain hierarchical load values we don't need to store
per domain aggregate shares, which greatly simplifies all the math.

It basically falls apart in two separate computations:
 - per domain update of the shares
 - per CPU update of the hierarchical load

Also get rid of the move_group_shares() stuff - just re-compute the shares
again after a successful load balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

sched: simplify the group load balancer
While thinking about the previous patch - I realized that using per domain
aggregate load values in load_balance_fair() is wrong. We should use the
load value for that CPU.

By not needing per domain hierarchical load values we don't need to store
per domain aggregate shares, which greatly simplifies all the math.

It basically falls apart in two separate computations:
 - per domain update of the shares
 - per CPU update of the hierarchical load

Also get rid of the move_group_shares() stuff - just re-compute the shares
again after a successful load balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
c8cba857 · Peter Zijlstra · Ingo Molnar · a25b5aca · c8cba857 · c8cba857
Commit c8cba857 authored Jun 27, 2008 by Peter Zijlstra Committed by Ingo Molnar Jun 27, 2008
Show whitespace changes
Inline Side-by-side

Showing with 72 additions and 229 deletions

kernel/sched.c kernel/sched.c +64 -222

kernel/sched_fair.c kernel/sched_fair.c +8 -7

No files found.
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
 	/*
-	 * We need space to build a sched_domain wide view of the full task
+	 * the part of load.weight contributed by tasks
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
 	 */
-	struct aggregate_struct {
+	unsigned long task_weight;
 	/*
-		 *   load = weight(cpus) * f(tg)
+	 *   h_load = weight * f(tg)
 	 *
 	 * Where f(tg) is the recursive weight fraction assigned to
 	 * this group.
 	 */
-		unsigned long load;
+	unsigned long h_load;
 	/*
-		 * part of the group weight distributed to this span.
+	 * this cpu's part of tg->shares
 	 */
 	unsigned long shares;
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
 #endif
 #endif
 };
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
 /*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
-static
+static void
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
-			 int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
@@ -1507,72 +1463,6 @@ void aggregate_walk_tree(aggregate_func down, aggregate_func up,
 	rcu_read_unlock();
 }
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-	aggregate(tg, cpu)->shares = shares;
-}
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-	if (!tg->parent) {
-		int i;
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-	aggregate(tg, cpu)->load = load;
-}
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * Re-compute the task group's per-cpu shares over the given domain.
- * task went to.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
 */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-		    int scpu, int dcpu)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
+	unsigned long rq_weight = 0;
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
+	unsigned long shares = 0;
-}
+	int i;
-/*
+	for_each_cpu_mask(i, sd->span) {
- * Because changing a group's shares changes the weight of the super-group
+		rq_weight += tg->cfs_rq[i]->load.weight;
- * we need to walk up the tree and change all shares until we hit the root.
+		shares += tg->cfs_rq[i]->shares;
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
 	}
-}
-static void
+	if ((!shares && rq_weight) || shares > tg->shares)
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
+		shares = tg->shares;
-{
-	int i;
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-	aggregate_group_shares(tg, cpu, sd);
-}
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
 }
 /*
- * Rebalance the cpu shares while walking back up the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parent's load.
 */
 static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-}
+	unsigned long load;
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-static void __init init_aggregate(void)
+	if (!tg->parent) {
-{
+		load = cpu_rq(cpu)->load.weight;
-	int i;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
-	for_each_possible_cpu(i)
+	tg->cfs_rq[cpu]->h_load = load;
-		spin_lock_init(&per_cpu(aggregate_lock, i));
 }
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #else
-static inline void init_aggregate(void)
+static inline void update_shares(struct sched_domain *sd)
-{
-}
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
 {
-	return 0;
 }
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
-{
-}
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 	cpus_setall(*cpus);
-	unlock_aggregate = get_aggregate(this_cpu, sd);
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
@@ -3663,8 +3506,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
+	if (ld_moved)
-		put_aggregate(this_cpu, sd);
+		update_shares(sd);
 	return ld_moved;
 }
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif

--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1421,17 +1421,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct task_group *tg;
 	rcu_read_lock();
+	update_h_load(busiest_cpu);
 	list_for_each_entry(tg, &task_groups, list) {
+		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		long rem_load, moved_load;
 		/*
 		 * empty group
 		 */
-		if (!tg->cfs_rq[busiest_cpu]->task_weight)
+		if (!busiest_cfs_rq->task_weight)
 			continue;
-		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
+		rem_load = rem_load_move * busiest_cfs_rq->load.weight;
-		rem_load /= aggregate(tg, this_cpu)->load + 1;
+		rem_load /= busiest_cfs_rq->h_load + 1;
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1440,10 +1443,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
-		move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);
+		moved_load *= busiest_cfs_rq->h_load;
+		moved_load /= busiest_cfs_rq->load.weight + 1;
-		moved_load *= aggregate(tg, this_cpu)->load;
-		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)