Commit cb29a5c1 authored by Mel Gorman's avatar Mel Gorman Committed by Peter Zijlstra

sched/numa: Apply imbalance limitations consistently

The imbalance limitations are applied inconsistently at fork time
and at runtime. At fork, a new task can remain local until there are
too many running tasks even if the degree of imbalance is larger than
NUMA_IMBALANCE_MIN which is different to runtime. Secondly, the imbalance
figure used during load balancing is different to the one used at NUMA
placement. Load balancing uses the number of tasks that must move to
restore imbalance where as NUMA balancing uses the total imbalance.

In combination, it is possible for a parallel workload that uses a small
number of CPUs without applying scheduler policies to have very variable
run-to-run performance.

[lkp@intel.com: Fix build breakage for arc-allyesconfig]
Signed-off-by: default avatarMel Gorman <mgorman@techsingularity.net>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: default avatarK Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20220520103519.1863-4-mgorman@techsingularity.net
parent 13ede331
...@@ -1055,6 +1055,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -1055,6 +1055,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods: * Scheduling class queueing methods:
*/ */
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2
static inline long
adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{
/*
* Allow a NUMA imbalance if busy CPUs is less than the maximum
* threshold. Above this threshold, individual tasks may be contending
* for both memory bandwidth and any shared HT resources. This is an
* approximation as the number of running tasks may not be related to
* the number of busy CPUs due to sched_setaffinity.
*/
if (dst_running > imb_numa_nr)
return imbalance;
/*
* Allow a small imbalance based on a simple pair of communicating
* tasks that remain local when the destination is lightly loaded.
*/
if (imbalance <= NUMA_IMBALANCE_MIN)
return 0;
return imbalance;
}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING
/* /*
* Approximate time to scan a full NUMA task in ms. The task scan period is * Approximate time to scan a full NUMA task in ms. The task scan period is
...@@ -1548,8 +1575,6 @@ struct task_numa_env { ...@@ -1548,8 +1575,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq);
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int imb_numa_nr);
static inline enum static inline enum
numa_type numa_classify(unsigned int imbalance_pct, numa_type numa_classify(unsigned int imbalance_pct,
...@@ -9067,16 +9092,6 @@ static bool update_pick_idlest(struct sched_group *idlest, ...@@ -9067,16 +9092,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
return true; return true;
} }
/*
* Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
* This is an approximation as the number of running tasks may not be
* related to the number of busy CPUs due to sched_setaffinity.
*/
static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
{
return running <= imb_numa_nr;
}
/* /*
* find_idlest_group() finds and returns the least busy CPU group within the * find_idlest_group() finds and returns the least busy CPU group within the
* domain. * domain.
...@@ -9193,6 +9208,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) ...@@ -9193,6 +9208,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break; break;
case group_has_spare: case group_has_spare:
#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) { if (sd->flags & SD_NUMA) {
#ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING
int idlest_cpu; int idlest_cpu;
...@@ -9206,7 +9222,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) ...@@ -9206,7 +9222,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest)); idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest; return idlest;
#endif #endif /* CONFIG_NUMA_BALANCING */
/* /*
* Otherwise, keep the task close to the wakeup source * Otherwise, keep the task close to the wakeup source
* and improve locality if the number of running tasks * and improve locality if the number of running tasks
...@@ -9214,9 +9230,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) ...@@ -9214,9 +9230,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
* allowed. If there is a real need of migration, * allowed. If there is a real need of migration,
* periodic load balance will take care of it. * periodic load balance will take care of it.
*/ */
if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
if (!adjust_numa_imbalance(imbalance,
local_sgs.sum_nr_running + 1,
sd->imb_numa_nr)) {
return NULL; return NULL;
}
} }
#endif /* CONFIG_NUMA */
/* /*
* Select group with highest number of idle CPUs. We could also * Select group with highest number of idle CPUs. We could also
...@@ -9303,24 +9324,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -9303,24 +9324,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
} }
} }
#define NUMA_IMBALANCE_MIN 2
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int imb_numa_nr)
{
if (!allow_numa_imbalance(dst_running, imb_numa_nr))
return imbalance;
/*
* Allow a small imbalance based on a simple pair of communicating
* tasks that remain local when the destination is lightly loaded.
*/
if (imbalance <= NUMA_IMBALANCE_MIN)
return 0;
return imbalance;
}
/** /**
* calculate_imbalance - Calculate the amount of imbalance present within the * calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance. * groups of a given sched_domain during load balance.
...@@ -9405,7 +9408,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ...@@ -9405,7 +9408,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/ */
env->migration_type = migrate_task; env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running); lsub_positive(&nr_diff, local->sum_nr_running);
env->imbalance = nr_diff >> 1; env->imbalance = nr_diff;
} else { } else {
/* /*
...@@ -9413,15 +9416,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ...@@ -9413,15 +9416,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* idle cpus. * idle cpus.
*/ */
env->migration_type = migrate_task; env->migration_type = migrate_task;
env->imbalance = max_t(long, 0, (local->idle_cpus - env->imbalance = max_t(long, 0,
busiest->idle_cpus) >> 1); (local->idle_cpus - busiest->idle_cpus));
} }
#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */ /* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) { if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance, env->imbalance = adjust_numa_imbalance(env->imbalance,
local->sum_nr_running + 1, env->sd->imb_numa_nr); local->sum_nr_running + 1,
env->sd->imb_numa_nr);
} }
#endif
/* Number of tasks to move to restore balance */
env->imbalance >>= 1;
return; return;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment