Commit 42f52e1c authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes are:

   - Migrate CPU-intense 'misfit' tasks on asymmetric capacity systems,
     to better utilize (much) faster 'big core' CPUs. (Morten Rasmussen,
     Valentin Schneider)

   - Topology handling improvements, in particular when CPU capacity
     changes and related load-balancing fixes/improvements (Morten
     Rasmussen)

   - ... plus misc other improvements, fixes and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (28 commits)
  sched/completions/Documentation: Add recommendation for dynamic and ONSTACK completions
  sched/completions/Documentation: Clean up the document some more
  sched/completions/Documentation: Fix a couple of punctuation nits
  cpu/SMT: State SMT is disabled even with nosmt and without "=force"
  sched/core: Fix comment regarding nr_iowait_cpu() and get_iowait_load()
  sched/fair: Remove setting task's se->runnable_weight during PELT update
  sched/fair: Disable LB_BIAS by default
  sched/pelt: Fix warning and clean up IRQ PELT config
  sched/topology: Make local variables static
  sched/debug: Use symbolic names for task state constants
  sched/numa: Remove unused numa_stats::nr_running field
  sched/numa: Remove unused code from update_numa_stats()
  sched/debug: Explicitly cast sched_feat() to bool
  sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains
  sched/fair: Don't move tasks to lower capacity CPUs unless necessary
  sched/fair: Set rq->rd->overload when misfit
  sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE()
  sched/core: Change root_domain->overload type to int
  sched/fair: Change 'prefer_sibling' type to bool
  sched/fair: Kick nohz balance if rq->misfit_task_load
  ...
parents 0d1b82cd 11e13696
This diff is collapsed.
......@@ -33,6 +33,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu);
/* Replace task scheduler's default cpu-invariant accounting */
#define arch_scale_cpu_capacity topology_get_cpu_scale
/* Enable topology flag updates */
#define arch_update_cpu_topology topology_update_cpu_topology
#else
static inline void init_cpu_topology(void) { }
......
......@@ -45,6 +45,9 @@ int pcibus_to_node(struct pci_bus *bus);
/* Replace task scheduler's default cpu-invariant accounting */
#define arch_scale_cpu_capacity topology_get_cpu_scale
/* Enable topology flag updates */
#define arch_update_cpu_topology topology_update_cpu_topology
#include <asm-generic/topology.h>
#endif /* _ASM_ARM_TOPOLOGY_H */
......@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sched/topology.h>
#include <linux/cpuset.h>
DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
......@@ -47,6 +48,9 @@ static ssize_t cpu_capacity_show(struct device *dev,
return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id));
}
static void update_topology_flags_workfn(struct work_struct *work);
static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);
static ssize_t cpu_capacity_store(struct device *dev,
struct device_attribute *attr,
const char *buf,
......@@ -72,6 +76,8 @@ static ssize_t cpu_capacity_store(struct device *dev,
topology_set_cpu_scale(i, new_capacity);
mutex_unlock(&cpu_scale_mutex);
schedule_work(&update_topology_flags_work);
return count;
}
......@@ -96,6 +102,25 @@ static int register_cpu_capacity_sysctl(void)
}
subsys_initcall(register_cpu_capacity_sysctl);
static int update_topology;
int topology_update_cpu_topology(void)
{
return update_topology;
}
/*
* Updating the sched_domains can't be done directly from cpufreq callbacks
* due to locking, so queue the work for later.
*/
static void update_topology_flags_workfn(struct work_struct *work)
{
update_topology = 1;
rebuild_sched_domains();
pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
update_topology = 0;
}
static u32 capacity_scale;
static u32 *raw_capacity;
......@@ -201,6 +226,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
if (cpumask_empty(cpus_to_visit)) {
topology_normalize_cpu_scale();
schedule_work(&update_topology_flags_work);
free_raw_capacity();
pr_debug("cpu_capacity: parsing done\n");
schedule_work(&parsing_done_work);
......
......@@ -9,6 +9,7 @@
#include <linux/percpu.h>
void topology_normalize_cpu_scale(void);
int topology_update_cpu_topology(void);
struct device_node;
bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);
......
......@@ -23,10 +23,10 @@
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */
#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
......
......@@ -159,9 +159,14 @@ TRACE_EVENT(sched_switch,
(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
__print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
{ 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" },
{ 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" },
{ 0x40, "P" }, { 0x80, "I" }) :
{ TASK_INTERRUPTIBLE, "S" },
{ TASK_UNINTERRUPTIBLE, "D" },
{ __TASK_STOPPED, "T" },
{ __TASK_TRACED, "t" },
{ EXIT_DEAD, "X" },
{ EXIT_ZOMBIE, "Z" },
{ TASK_PARKED, "P" },
{ TASK_DEAD, "I" }) :
"R",
__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
......
......@@ -415,6 +415,11 @@ config IRQ_TIME_ACCOUNTING
If in doubt, say N here.
config HAVE_SCHED_AVG_IRQ
def_bool y
depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
depends on SMP
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
depends on MULTIUSER
......
......@@ -383,6 +383,7 @@ void __init cpu_smt_disable(bool force)
pr_info("SMT: Force disabled\n");
cpu_smt_control = CPU_SMT_FORCE_DISABLED;
} else {
pr_info("SMT: disabled\n");
cpu_smt_control = CPU_SMT_DISABLED;
}
}
......
......@@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
* In theory, the compile should just see 0 here, and optimize out the call
* to sched_rt_avg_update. But I don't trust it...
*/
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
s64 steal = 0, irq_delta = 0;
#endif
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
......@@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#ifdef HAVE_SCHED_AVG_IRQ
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
......@@ -701,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
if (idle_policy(p->policy)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
p->se.runnable_weight = load->weight;
return;
}
......@@ -713,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
p->se.runnable_weight = load->weight;
}
}
......@@ -2915,10 +2916,10 @@ unsigned long nr_iowait(void)
}
/*
* Consumers of these two interfaces, like for example the cpufreq menu
* governor are using nonsensical data. Boosting frequency for a CPU that has
* IO-wait which might not even end up running the task when it does become
* runnable.
* Consumers of these two interfaces, like for example the cpuidle menu
* governor, are using nonsensical data. Preferring shallow idle state selection
* for a CPU that has IO-wait which might not even end up running the task when
* it does become runnable.
*/
unsigned long nr_iowait_cpu(int cpu)
......
This diff is collapsed.
......@@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
SCHED_FEAT(LB_BIAS, false)
/*
* Decrement CPU capacity based on time not spent running tasks
......
......@@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
if (entity_is_task(se))
se->runnable_weight = se->load.weight;
if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
......@@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (entity_is_task(se))
se->runnable_weight = se->load.weight;
if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
......@@ -358,7 +352,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
* irq:
*
......
......@@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
#else
static inline int
......
......@@ -717,8 +717,12 @@ struct root_domain {
cpumask_var_t span;
cpumask_var_t online;
/* Indicate more than one runnable task for any CPU */
bool overload;
/*
* Indicate pullable load on at least one CPU, e.g:
* - More than one runnable task
* - Running task is misfit
*/
int overload;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
......@@ -845,6 +849,8 @@ struct rq {
unsigned char idle_balance;
unsigned long misfit_task_load;
/* For active balancing */
int active_balance;
int push_cpu;
......@@ -858,8 +864,7 @@ struct rq {
struct sched_avg avg_rt;
struct sched_avg avg_dl;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
#define HAVE_SCHED_AVG_IRQ
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
u64 idle_stamp;
......@@ -1188,6 +1193,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
extern struct static_key_false sched_asym_cpucapacity;
struct sched_group_capacity {
atomic_t ref;
......@@ -1197,6 +1203,7 @@ struct sched_group_capacity {
*/
unsigned long capacity;
unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long max_capacity; /* Max per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
......@@ -1396,7 +1403,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
0;
#undef SCHED_FEAT
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
......@@ -1696,8 +1703,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
if (prev_nr < 2 && rq->nr_running >= 2) {
#ifdef CONFIG_SMP
if (!rq->rd->overload)
rq->rd->overload = true;
if (!READ_ONCE(rq->rd->overload))
WRITE_ONCE(rq->rd->overload, 1);
#endif
}
......@@ -2217,7 +2224,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
}
#endif
#ifdef HAVE_SCHED_AVG_IRQ
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return rq->avg_irq.util_avg;
......
......@@ -7,8 +7,8 @@
DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
cpumask_var_t sched_domains_tmpmask;
cpumask_var_t sched_domains_tmpmask2;
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
......@@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu)
{
......@@ -692,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
sg_span = sched_group_span(sg);
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}
static int
......@@ -851,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
return sg;
}
......@@ -1061,7 +1064,6 @@ static struct cpumask ***sched_domains_numa_masks;
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
* SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
......@@ -1073,13 +1075,12 @@ static struct cpumask ***sched_domains_numa_masks;
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
struct sched_domain *child, int cpu)
struct sched_domain *child, int dflags, int cpu)
{
struct sd_data *sdd = &tl->data;
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
......@@ -1100,6 +1101,9 @@ sd_init(struct sched_domain_topology_level *tl,
"wrong sd_flags in topology description\n"))
sd_flags &= ~TOPOLOGY_SD_FLAGS;
/* Apply detected topology flags */
sd_flags |= dflags;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
......@@ -1122,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl,
| 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
| 1*SD_PREFER_SIBLING
| 0*SD_NUMA
| sd_flags
,
......@@ -1148,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl,
if (sd->flags & SD_ASYM_CPUCAPACITY) {
struct sched_domain *t = sd;
/*
* Don't attempt to spread across CPUs of different capacities.
*/
if (sd->child)
sd->child->flags &= ~SD_PREFER_SIBLING;
for_each_lower_domain(t)
t->flags |= SD_BALANCE_WAKE;
}
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
......@@ -1169,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->busy_idx = 3;
sd->idle_idx = 2;
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
sd->flags &= ~(SD_BALANCE_EXEC |
......@@ -1178,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
} else {
sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
......@@ -1604,9 +1612,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
struct sched_domain *child, int dflags, int cpu)
{
struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
if (child) {
sd->level = child->level + 1;
......@@ -1632,6 +1640,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
return sd;
}
/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
*/
static struct sched_domain_topology_level
*asym_cpu_capacity_level(const struct cpumask *cpu_map)
{
int i, j, asym_level = 0;
bool asym = false;
struct sched_domain_topology_level *tl, *asym_tl = NULL;
unsigned long cap;
/* Is there any asymmetry? */
cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
for_each_cpu(i, cpu_map) {
if (arch_scale_cpu_capacity(NULL, i) != cap) {
asym = true;
break;
}
}
if (!asym)
return NULL;
/*
* Examine topology from all CPU's point of views to detect the lowest
* sched_domain_topology_level where a highest capacity CPU is visible
* to everyone.
*/
for_each_cpu(i, cpu_map) {
unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
int tl_id = 0;
for_each_sd_topology(tl) {
if (tl_id < asym_level)
goto next_level;
for_each_cpu_and(j, tl->mask(i), cpu_map) {
unsigned long capacity;
capacity = arch_scale_cpu_capacity(NULL, j);
if (capacity <= max_capacity)
continue;
max_capacity = capacity;
asym_level = tl_id;
asym_tl = tl;
}
next_level:
tl_id++;
}
}
return asym_tl;
}
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
......@@ -1644,18 +1711,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
tl_asym = asym_cpu_capacity_level(cpu_map);
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
sd = NULL;
for_each_sd_topology(tl) {
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
int dflags = 0;
if (tl == tl_asym) {
dflags |= SD_ASYM_CPUCAPACITY;
has_asym = true;
}
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP)
......@@ -1704,6 +1783,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
rcu_read_unlock();
if (has_asym)
static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
if (rq && sched_debug_enabled) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment