Commit 17bf423a authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Introduce "Energy Aware Scheduling" - by Quentin Perret.

     This is a coherent topology description of CPUs in cooperation with
     the PM subsystem, with the goal to schedule more energy-efficiently
     on asymmetric SMP platforms - such as waking up tasks to the more
     energy-efficient CPUs first, as long as the system isn't
     oversubscribed.

     For details of the design, see:

        https://lore.kernel.org/lkml/20180724122521.22109-1-quentin.perret@arm.com/

   - Misc cleanups and smaller enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  sched/fair: Select an energy-efficient CPU on task wake-up
  sched/fair: Introduce an energy estimation helper function
  sched/fair: Add over-utilization/tipping point indicator
  sched/fair: Clean-up update_sg_lb_stats parameters
  sched/toplogy: Introduce the 'sched_energy_present' static key
  sched/topology: Make Energy Aware Scheduling depend on schedutil
  sched/topology: Disable EAS on inappropriate platforms
  sched/topology: Add lowest CPU asymmetry sched_domain level pointer
  sched/topology: Reference the Energy Model of CPUs when available
  PM: Introduce an Energy Model management framework
  sched/cpufreq: Prepare schedutil for Energy Aware Scheduling
  sched/topology: Relocate arch_scale_cpu_capacity() to the internal header
  sched/core: Remove unnecessary unlikely() in push_*_task()
  sched/topology: Remove the ::smt_gain field from 'struct sched_domain'
  sched: Fix various typos in comments
  sched/core: Clean up the #ifdef block in add_nr_running()
  sched/fair: Make some variables static
  sched/core: Create task_has_idle_policy() helper
  sched/fair: Add lsub_positive() and use it consistently
  sched/fair: Mask UTIL_AVG_UNCHANGED usages
  ...
parents 116b081c 732cd75b
@@ -2277,6 +2277,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
ret = cpufreq_start_governor(policy);
if (!ret) {
pr_debug("cpufreq: governor change\n");
+sched_cpufreq_governor_change(policy, old_gov);
return 0;
}
cpufreq_exit_governor(policy);
...
@@ -950,6 +950,14 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
}
#endif
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+struct cpufreq_governor *old_gov);
+#else
+static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+struct cpufreq_governor *old_gov) { }
+#endif
extern void arch_freq_prepare_all(void);
extern unsigned int arch_freq_get_on_cpu(int cpu);
...
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ENERGY_MODEL_H
#define _LINUX_ENERGY_MODEL_H
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kobject.h>
#include <linux/rcupdate.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/topology.h>
#include <linux/types.h>
#ifdef CONFIG_ENERGY_MODEL
/**
* em_cap_state - Capacity state of a performance domain
* @frequency: The CPU frequency in KHz, for consistency with CPUFreq
* @power: The power consumed by 1 CPU at this level, in milli-watts
* @cost: The cost coefficient associated with this level, used during
* energy calculation. Equal to: power * max_frequency / frequency
*/
struct em_cap_state {
unsigned long frequency;
unsigned long power;
unsigned long cost;
};
/**
* em_perf_domain - Performance domain
* @table: List of capacity states, in ascending order
* @nr_cap_states: Number of capacity states
* @cpus: Cpumask covering the CPUs of the domain
*
* A "performance domain" represents a group of CPUs whose performance is
* scaled together. All CPUs of a performance domain must have the same
* micro-architecture. Performance domains often have a 1-to-1 mapping with
* CPUFreq policies.
*/
struct em_perf_domain {
struct em_cap_state *table;
int nr_cap_states;
unsigned long cpus[0];
};
#define EM_CPU_MAX_POWER 0xFFFF
struct em_data_callback {
/**
* active_power() - Provide power at the next capacity state of a CPU
* @power : Active power at the capacity state in mW (modified)
* @freq : Frequency at the capacity state in kHz (modified)
* @cpu : CPU for which we do this operation
*
* active_power() must find the lowest capacity state of 'cpu' above
* 'freq' and update 'power' and 'freq' to the matching active power
* and frequency.
*
* The power is the one of a single CPU in the domain, expressed in
* milli-watts. It is expected to fit in the [0, EM_CPU_MAX_POWER]
* range.
*
* Return 0 on success.
*/
int (*active_power)(unsigned long *power, unsigned long *freq, int cpu);
};
#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
struct em_perf_domain *em_cpu_get(int cpu);
int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
struct em_data_callback *cb);
/**
* em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
* @pd : performance domain for which energy has to be estimated
* @max_util : highest utilization among CPUs of the domain
* @sum_util : sum of the utilization of all CPUs in the domain
*
* Return: the sum of the energy consumed by the CPUs of the domain assuming
* a capacity state satisfying the max utilization of the domain.
*/
static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
unsigned long max_util, unsigned long sum_util)
{
unsigned long freq, scale_cpu;
struct em_cap_state *cs;
int i, cpu;
/*
* In order to predict the capacity state, map the utilization of the
* most utilized CPU of the performance domain to a requested frequency,
* like schedutil.
*/
cpu = cpumask_first(to_cpumask(pd->cpus));
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
cs = &pd->table[pd->nr_cap_states - 1];
freq = map_util_freq(max_util, cs->frequency, scale_cpu);
/*
* Find the lowest capacity state of the Energy Model above the
* requested frequency.
*/
for (i = 0; i < pd->nr_cap_states; i++) {
cs = &pd->table[i];
if (cs->frequency >= freq)
break;
}
/*
* The capacity of a CPU in the domain at that capacity state (cs)
* can be computed as:
*
* cs->freq * scale_cpu
* cs->cap = -------------------- (1)
* cpu_max_freq
*
* So, ignoring the costs of idle states (which are not available in
* the EM), the energy consumed by this CPU at that capacity state is
* estimated as:
*
* cs->power * cpu_util
* cpu_nrg = -------------------- (2)
* cs->cap
*
* since 'cpu_util / cs->cap' represents its percentage of busy time.
*
* NOTE: Although the result of this computation actually is in
* units of power, it can be manipulated as an energy value
* over a scheduling period, since it is assumed to be
* constant during that interval.
*
* By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
* of two terms:
*
* cs->power * cpu_max_freq cpu_util
* cpu_nrg = ------------------------ * --------- (3)
* cs->freq scale_cpu
*
* The first term is static, and is stored in the em_cap_state struct
* as 'cs->cost'.
*
* Since all CPUs of the domain have the same micro-architecture, they
* share the same 'cs->cost', and the same CPU capacity. Hence, the
* total energy of the domain (which is the simple sum of the energy of
* all of its CPUs) can be factorized as:
*
* cs->cost * \Sum cpu_util
* pd_nrg = ------------------------ (4)
* scale_cpu
*/
return cs->cost * sum_util / scale_cpu;
}
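/*
 * Worked example (editor's illustration with made-up numbers, not part of
 * this commit): take a capacity state with frequency = 1000000 kHz and
 * power = 200 mW on CPUs whose maximum frequency is 2000000 kHz, so
 * cost = 200 * 2000000 / 1000000 = 400. With scale_cpu = 1024, the capacity
 * at that state is 1000000 * 1024 / 2000000 = 512 (formula (1) above). If
 * the domain's utilizations sum to sum_util = 512 - one CPU fully busy at
 * that state, or two CPUs each half busy - formula (4) gives
 * 400 * 512 / 1024 = 200, matching the 200 mW of a single fully-busy CPU.
 */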
/**
* em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain
* @pd : performance domain for which this must be done
*
* Return: the number of capacity states in the performance domain table
*/
static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
{
return pd->nr_cap_states;
}
#else
struct em_perf_domain {};
struct em_data_callback {};
#define EM_DATA_CB(_active_power_cb) { }
static inline int em_register_perf_domain(cpumask_t *span,
unsigned int nr_states, struct em_data_callback *cb)
{
return -EINVAL;
}
static inline struct em_perf_domain *em_cpu_get(int cpu)
{
return NULL;
}
static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
unsigned long max_util, unsigned long sum_util)
{
return 0;
}
static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
{
return 0;
}
#endif
#endif
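/*
 * Illustrative sketch (editor's addition, not part of this commit): a minimal
 * consumer of the interface above, assuming a regular kernel context with
 * <linux/printk.h> and <linux/errno.h> available. The utilization numbers are
 * hypothetical placeholders for values a scheduler-side client would compute.
 */
static int em_example_report(int cpu)
{
	struct em_perf_domain *pd = em_cpu_get(cpu);
	unsigned long max_util = 300;	/* hypothetical: busiest CPU of the domain */
	unsigned long sum_util = 700;	/* hypothetical: sum over the domain */

	if (!pd)
		return -ENODEV;	/* no Energy Model registered for this CPU */

	pr_info("cpu%d: %d cap states, estimated energy %lu\n",
		cpu, em_pd_nr_cap_states(pd),
		em_pd_energy(pd, max_util, sum_util));

	return 0;
}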
@@ -176,7 +176,7 @@ struct task_group;
* TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
*
* However, with slightly different timing the wakeup TASK_RUNNING store can
-* also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not
+* also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
* a problem either because that will result in one extra go around the loop
* and our @cond test will save the day.
*
@@ -515,7 +515,7 @@ struct sched_dl_entity {
/*
* Actual scheduling parameters. Initialized with the values above,
-* they are continously updated during task execution. Note that
+* they are continuously updated during task execution. Note that
* the remaining runtime could be < 0 in case we are in overrun.
*/
s64 runtime; /* Remaining runtime for this instance */
...
@@ -20,6 +20,12 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
void (*func)(struct update_util_data *data, u64 time,
unsigned int flags));
void cpufreq_remove_update_util_hook(int cpu);
+static inline unsigned long map_util_freq(unsigned long util,
+unsigned long freq, unsigned long cap)
+{
+return (freq + (freq >> 2)) * util / cap;
+}
#endif /* CONFIG_CPU_FREQ */
#endif /* _LINUX_SCHED_CPUFREQ_H */
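/*
 * Worked example (editor's illustration, not part of this commit): with the
 * 25% headroom baked into map_util_freq(), a CPU at util = 512 out of
 * cap = 1024, with freq = 2000000 kHz as the invariant maximum, requests
 * (2000000 + 500000) * 512 / 1024 = 1250000 kHz, i.e. 25% above the purely
 * proportional 1000000 kHz, leaving room for the load to grow before the
 * next frequency update.
 */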
@@ -16,7 +16,7 @@ enum hk_flags {
};
#ifdef CONFIG_CPU_ISOLATION
-DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
+DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
extern int housekeeping_any_cpu(enum hk_flags flags);
extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
@@ -43,7 +43,7 @@ static inline void housekeeping_init(void) { }
static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
{
#ifdef CONFIG_CPU_ISOLATION
-if (static_branch_unlikely(&housekeeping_overriden))
+if (static_branch_unlikely(&housekeeping_overridden))
return housekeeping_test_cpu(cpu, flags);
#endif
return true;
...
@@ -153,7 +153,7 @@ static inline gfp_t current_gfp_context(gfp_t flags)
{
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
-* so always make sure it makes precendence
+* so always make sure it makes precedence
*/
if (unlikely(current->flags & PF_MEMALLOC_NOIO))
flags &= ~(__GFP_IO | __GFP_FS);
...
@@ -8,7 +8,7 @@
* Various counters maintained by the scheduler and fork(),
* exposed via /proc, sys.c or used by drivers via these APIs.
*
-* ( Note that all these values are aquired without locking,
+* ( Note that all these values are acquired without locking,
* so they can only be relied on in narrow circumstances. )
*/
...
@@ -89,7 +89,6 @@ struct sched_domain {
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
-unsigned int smt_gain;
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
@@ -202,6 +201,14 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
# define SD_INIT_NAME(type)
#endif
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+return SCHED_CAPACITY_SCALE;
+}
+#endif
#else /* CONFIG_SMP */
struct sched_domain_attr;
@@ -217,6 +224,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
return true;
}
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+{
+return SCHED_CAPACITY_SCALE;
+}
+#endif
#endif /* !CONFIG_SMP */
static inline int task_node(const struct task_struct *p)
...
@@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF
config CPU_PM
bool
+config ENERGY_MODEL
+bool "Energy Model for CPUs"
+depends on SMP
+depends on CPU_FREQ
+default n
+help
+Several subsystems (thermal and/or the task scheduler for example)
+can leverage information about the energy consumed by CPUs to make
+smarter decisions. This config option enables the framework from
+which subsystems can access the energy models.
+The exact usage of the energy model is subsystem-dependent.
+If in doubt, say N.
@@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
+obj-$(CONFIG_ENERGY_MODEL) += energy_model.o
// SPDX-License-Identifier: GPL-2.0
/*
* Energy Model of CPUs
*
* Copyright (c) 2018, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
*/
#define pr_fmt(fmt) "energy_model: " fmt
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>
/* Mapping of each CPU to the performance domain to which it belongs. */
static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
/*
* Mutex serializing the registrations of performance domains and letting
* callbacks defined by drivers sleep.
*/
static DEFINE_MUTEX(em_pd_mutex);
static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
struct em_data_callback *cb)
{
unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
unsigned long power, freq, prev_freq = 0;
int i, ret, cpu = cpumask_first(span);
struct em_cap_state *table;
struct em_perf_domain *pd;
u64 fmax;
if (!cb->active_power)
return NULL;
pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
if (!pd)
return NULL;
table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
if (!table)
goto free_pd;
/* Build the list of capacity states for this performance domain */
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
/*
* active_power() is a driver callback which ceils 'freq' to
* lowest capacity state of 'cpu' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
ret = cb->active_power(&power, &freq, cpu);
if (ret) {
pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
goto free_cs_table;
}
/*
* We expect the driver callback to increase the frequency for
* higher capacity states.
*/
if (freq <= prev_freq) {
pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
goto free_cs_table;
}
/*
* The power returned by active_state() is expected to be
* positive, in milli-watts and to fit into 16 bits.
*/
if (!power || power > EM_CPU_MAX_POWER) {
pr_err("pd%d: invalid power: %lu\n", cpu, power);
goto free_cs_table;
}
table[i].power = power;
table[i].frequency = prev_freq = freq;
/*
* The hertz/watts efficiency ratio should decrease as the
* frequency grows on sane platforms. But this isn't always
* true in practice so warn the user if a higher OPP is more
* power efficient than a lower one.
*/
opp_eff = freq / power;
if (opp_eff >= prev_opp_eff)
pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
cpu, i, i - 1);
prev_opp_eff = opp_eff;
}
/* Compute the cost of each capacity_state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = 0; i < nr_states; i++) {
table[i].cost = div64_u64(fmax * table[i].power,
table[i].frequency);
}
pd->table = table;
pd->nr_cap_states = nr_states;
cpumask_copy(to_cpumask(pd->cpus), span);
return pd;
free_cs_table:
kfree(table);
free_pd:
kfree(pd);
return NULL;
}
/**
* em_cpu_get() - Return the performance domain for a CPU
* @cpu : CPU to find the performance domain for
*
* Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
* exist.
*/
struct em_perf_domain *em_cpu_get(int cpu)
{
return READ_ONCE(per_cpu(em_data, cpu));
}
EXPORT_SYMBOL_GPL(em_cpu_get);
/**
* em_register_perf_domain() - Register the Energy Model of a performance domain
* @span : Mask of CPUs in the performance domain
* @nr_states : Number of capacity states to register
* @cb : Callback functions providing the data of the Energy Model
*
* Create Energy Model tables for a performance domain using the callbacks
* defined in cb.
*
* If multiple clients register the same performance domain, all but the first
* registration will be ignored.
*
* Return 0 on success
*/
int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
struct em_data_callback *cb)
{
unsigned long cap, prev_cap = 0;
struct em_perf_domain *pd;
int cpu, ret = 0;
if (!span || !nr_states || !cb)
return -EINVAL;
/*
* Use a mutex to serialize the registration of performance domains and
* let the driver-defined callback functions sleep.
*/
mutex_lock(&em_pd_mutex);
for_each_cpu(cpu, span) {
/* Make sure we don't register again an existing domain. */
if (READ_ONCE(per_cpu(em_data, cpu))) {
ret = -EEXIST;
goto unlock;
}
/*
* All CPUs of a domain must have the same micro-architecture
* since they all share the same table.
*/
cap = arch_scale_cpu_capacity(NULL, cpu);
if (prev_cap && prev_cap != cap) {
pr_err("CPUs of %*pbl must have the same capacity\n",
cpumask_pr_args(span));
ret = -EINVAL;
goto unlock;
}
prev_cap = cap;
}
/* Create the performance domain and add it to the Energy Model. */
pd = em_create_pd(span, nr_states, cb);
if (!pd) {
ret = -EINVAL;
goto unlock;
}
for_each_cpu(cpu, span) {
/*
* The per-cpu array can be read concurrently from em_cpu_get().
* The barrier enforces the ordering needed to make sure readers
* can only access well formed em_perf_domain structs.
*/
smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
}
pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
unlock:
mutex_unlock(&em_pd_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(em_register_perf_domain);
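/*
 * Illustrative sketch (editor's addition, not part of this commit): how a
 * hypothetical cpufreq driver could register an Energy Model for one policy,
 * following the active_power() contract documented in energy_model.h. The
 * three OPPs below are invented; frequencies are in kHz, power in mW.
 */
static int example_active_power(unsigned long *power, unsigned long *freq, int cpu)
{
	static const unsigned long freqs[] = {  500000, 1000000, 1500000 };
	static const unsigned long powers[] = {     80,     200,     450 };
	int i;

	/* Return the lowest OPP at or above the requested frequency. */
	for (i = 0; i < ARRAY_SIZE(freqs); i++) {
		if (freqs[i] >= *freq) {
			*freq = freqs[i];
			*power = powers[i];
			return 0;
		}
	}

	return -EINVAL;
}

static int example_register_em(cpumask_t *policy_cpus)
{
	struct em_data_callback em_cb = EM_DATA_CB(example_active_power);

	/* One capacity state per OPP in the table above. */
	return em_register_perf_domain(policy_cpus, 3, &em_cb);
}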
@@ -697,7 +697,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
/*
* SCHED_IDLE tasks get minimal weight:
*/
-if (idle_policy(p->policy)) {
+if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
p->se.runnable_weight = load->weight;
@@ -2857,7 +2857,7 @@ unsigned long nr_running(void)
* preemption, thus the result might have a time-of-check-to-time-of-use
* race. The caller is responsible to use it correctly, for example:
*
-* - from a non-preemptable section (of course)
+* - from a non-preemptible section (of course)
*
* - from a thread that is bound to a single CPU
*
@@ -4191,7 +4191,7 @@ static int __sched_setscheduler(struct task_struct *p,
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
-if (idle_policy(p->policy) && !idle_policy(policy)) {
+if (task_has_idle_policy(p) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
...
@@ -10,6 +10,7 @@
#include "sched.h"
+#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>
struct sugov_tunables {
@@ -164,7 +165,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
-freq = (freq + (freq >> 2)) * util / max;
+freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
@@ -194,15 +195,13 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
-static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
+unsigned long max, enum schedutil_type type)
{
-struct rq *rq = cpu_rq(sg_cpu->cpu);
-unsigned long util, irq, max;
+unsigned long dl_util, util, irq;
+struct rq *rq = cpu_rq(cpu);
-sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
-sg_cpu->bw_dl = cpu_bw_dl(rq);
-if (rt_rq_is_runnable(&rq->rt))
+if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
return max;
/*
@@ -220,21 +219,30 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*/
-util = cpu_util_cfs(rq);
+util = util_cfs;
util += cpu_util_rt(rq);
+dl_util = cpu_util_dl(rq);
/*
-* We do not make cpu_util_dl() a permanent part of this sum because we
-* want to use cpu_bw_dl() later on, but we need to check if the
-* CFS+RT+DL sum is saturated (ie. no idle time) such that we select
-* f_max when there is no idle time.
+* For frequency selection we do not make cpu_util_dl() a permanent part
+* of this sum because we want to use cpu_bw_dl() later on, but we need
+* to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+* that we select f_max when there is no idle time.
*
* NOTE: numerical errors or stop class might cause us to not quite hit
* saturation when we should -- something for later.
*/
-if ((util + cpu_util_dl(rq)) >= max)
+if (util + dl_util >= max)
return max;
+/*
+* OTOH, for energy computation we need the estimated running time, so
+* include util_dl and ignore dl_bw.
+*/
+if (type == ENERGY_UTIL)
+util += dl_util;
/*
* There is still idle time; further improve the number by using the
* irq metric. Because IRQ/steal time is hidden from the task clock we
@@ -257,7 +265,22 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
* bw_dl as requested freq. However, cpufreq is not yet ready for such
* an interface. So, we only do the latter for now.
*/
-return min(max, util + sg_cpu->bw_dl);
+if (type == FREQUENCY_UTIL)
+util += cpu_bw_dl(rq);
+return min(max, util);
+}
+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+{
+struct rq *rq = cpu_rq(sg_cpu->cpu);
+unsigned long util = cpu_util_cfs(rq);
+unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+sg_cpu->max = max;
+sg_cpu->bw_dl = cpu_bw_dl(rq);
+return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
}
/**
@@ -598,7 +621,7 @@ static struct kobj_type sugov_tunables_ktype = {
/********************** cpufreq governor interface *********************/
-static struct cpufreq_governor schedutil_gov;
+struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
@@ -857,7 +880,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
sg_policy->need_freq_update = true;
}
-static struct cpufreq_governor schedutil_gov = {
+struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
.dynamic_switching = true,
@@ -880,3 +903,36 @@ static int __init sugov_register(void)
return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);
#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
extern struct mutex sched_energy_mutex;
static void rebuild_sd_workfn(struct work_struct *work)
{
mutex_lock(&sched_energy_mutex);
sched_energy_update = true;
rebuild_sched_domains();
sched_energy_update = false;
mutex_unlock(&sched_energy_mutex);
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
/*
* EAS shouldn't be attempted without sugov, so rebuild the sched_domains
* on governor changes to make sure the scheduler knows about it.
*/
void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
struct cpufreq_governor *old_gov)
{
if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
/*
* When called from the cpufreq_register_driver() path, the
* cpu_hotplug_lock is already held, so use a work item to
* avoid nested locking in rebuild_sched_domains().
*/
schedule_work(&rebuild_sd_work);
}
}
#endif
@@ -525,7 +525,7 @@ void account_idle_ticks(unsigned long ticks)
/*
* Perform (stime * rtime) / total, but avoid multiplication overflow by
-* loosing precision when the numbers are big.
+* losing precision when the numbers are big.
*/
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
...
@@ -727,7 +727,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* refill the runtime and set the deadline a period in the future,
* because keeping the current (absolute) deadline of the task would
* result in breaking guarantees promised to other tasks (refer to
-* Documentation/scheduler/sched-deadline.txt for more informations).
+* Documentation/scheduler/sched-deadline.txt for more information).
*
* This function returns true if:
*
@@ -1695,6 +1695,14 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
+static inline void set_next_task(struct rq *rq, struct task_struct *p)
+{
+p->se.exec_start = rq_clock_task(rq);
+/* You can't push away the running task */
+dequeue_pushable_dl_task(rq, p);
+}
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
struct dl_rq *dl_rq)
{
@@ -1750,10 +1758,8 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
-p->se.exec_start = rq_clock_task(rq);
-/* Running task will never be pushed. */
-dequeue_pushable_dl_task(rq, p);
+set_next_task(rq, p);
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
@@ -1808,12 +1814,7 @@ static void task_fork_dl(struct task_struct *p)
static void set_curr_task_dl(struct rq *rq)
{
-struct task_struct *p = rq->curr;
-p->se.exec_start = rq_clock_task(rq);
-/* You can't push away the running task */
-dequeue_pushable_dl_task(rq, p);
+set_next_task(rq, rq->curr);
}
#ifdef CONFIG_SMP
@@ -2041,10 +2042,8 @@ static int push_dl_task(struct rq *rq)
return 0;
retry:
-if (unlikely(next_task == rq->curr)) {
-WARN_ON(1);
+if (WARN_ON(next_task == rq->curr))
return 0;
-}
/*
* If next_task preempts rq->curr, and rq->curr
...
@@ -974,7 +974,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
-if (p->policy == SCHED_DEADLINE) {
+if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
}
...
@@ -38,7 +38,7 @@
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
@@ -58,8 +58,8 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
@@ -81,8 +81,8 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
*
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -116,7 +116,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
*
* (default: ~20%)
*/
-unsigned int capacity_margin = 1280;
+static unsigned int capacity_margin = 1280;
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -703,9 +703,9 @@ void init_entity_runnable_average(struct sched_entity *se)
memset(sa, 0, sizeof(*sa));
/*
-* Tasks are intialized with full load to be seen as heavy tasks until
+* Tasks are initialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
-* Group entities are intialized with zero load to reflect the fact that
+* Group entities are initialized with zero load to reflect the fact that
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
@@ -2734,6 +2734,17 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
WRITE_ONCE(*ptr, res); \
} while (0)
/*
* Remove and clamp on negative, from a local variable.
*
* A variant of sub_positive(), which does not use explicit load-store
* and is thus optimized for local variable updates.
*/
#define lsub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
#ifdef CONFIG_SMP
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3604,7 +3615,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
-return max(ue.ewma, ue.enqueued);
+return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -3622,7 +3633,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
/* Update root cfs_rq's estimated utilization */
enqueued = cfs_rq->avg.util_est.enqueued;
-enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
}
@@ -3650,8 +3661,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
/* Update root cfs_rq's estimated utilization */
ue.enqueued = cfs_rq->avg.util_est.enqueued;
-ue.enqueued -= min_t(unsigned int, ue.enqueued,
-(_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
/*
@@ -3966,8 +3976,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
-* - Substract its load from the cfs_rq->runnable_avg.
-* - Substract its previous weight from cfs_rq->load.weight.
+* - Subtract its load from the cfs_rq->runnable_avg.
+* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
@@ -4640,7 +4650,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
-cfs_b->runtime -= min(runtime, cfs_b->runtime);
+lsub_positive(&cfs_b->runtime, runtime);
}
/*
@@ -4774,7 +4784,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
-cfs_b->runtime -= min(runtime, cfs_b->runtime);
+lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock(&cfs_b->lock);
}
@@ -5072,6 +5082,24 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);
static unsigned long capacity_of(int cpu);
static inline bool cpu_overutilized(int cpu)
{
return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
}
static inline void update_overutilized_status(struct rq *rq)
{
if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
}
#else
static inline void update_overutilized_status(struct rq *rq) { }
#endif
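/*
 * Worked example (editor's illustration, not part of this commit): with
 * capacity_margin = 1280, cpu_overutilized() fires when
 * util * 1280 > capacity * 1024, i.e. when utilization exceeds
 * 1024/1280 = 80% of the CPU's capacity. On a CPU with capacity_of() = 1024
 * that is util > 819; on a hypothetical little CPU with capacity_of() = 430
 * it is util > 344.
 */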
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5129,8 +5157,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
}
-if (!se)
+if (!se) {
add_nr_running(rq, 1);
/*
* Since new tasks are assigned an initial util_avg equal to
* half of the spare capacity of their CPU, tiny tasks have the
* ability to cross the overutilized threshold, which will
* result in the load balancer ruining all the task placement
* done by EAS. As a way to mitigate that effect, do not account
* for the first enqueue operation of new tasks during the
* overutilized flag detection.
*
* A better way of solving this problem would be to wait for
* the PELT signals of tasks to converge before taking them
* into account, but that is not straightforward to implement,
* and the following generally works well enough in practice.
*/
if (flags & ENQUEUE_WAKEUP)
update_overutilized_status(rq);
}
hrtick_update(rq);
}
@@ -6241,7 +6287,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
util = READ_ONCE(cfs_rq->avg.util_avg);
/* Discount task's util from CPU's util */
-util -= min_t(unsigned int, util, task_util(p));
+lsub_positive(&util, task_util(p));
/*
* Covered cases:
@@ -6290,10 +6336,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
* properly fix the execl regression and it helps in further
* reducing the chances for the above race.
*/
-if (unlikely(task_on_rq_queued(p) || current == p)) {
-estimated -= min_t(unsigned int, estimated,
-(_task_util_est(p) | UTIL_AVG_UNCHANGED));
-}
+if (unlikely(task_on_rq_queued(p) || current == p))
+lsub_positive(&estimated, _task_util_est(p));
util = max(util, estimated);
}
@@ -6332,6 +6377,213 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
return !task_fits_capacity(p, min_cap);
}
/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
/*
* If @p migrates from @cpu to another, remove its contribution. Or,
* if @p migrates from another CPU to @cpu, add its contribution. In
* the other cases, @cpu is not impacted by the migration, so the
* util_avg should already be correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
sub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
* cpu_util() after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
util = max(util, util_est);
}
return min(util, capacity_orig_of(cpu));
}
/*
* compute_energy(): Estimates the energy that would be consumed if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
* landscape of the * CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
*/
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
long util, max_util, sum_util, energy = 0;
int cpu;
for (; pd; pd = pd->next) {
max_util = sum_util = 0;
/*
* The capacity state of CPUs of the current rd can be driven by
* CPUs of another rd if they belong to the same performance
* domain. So, account for the utilization of these CPUs too
* by masking pd with cpu_online_mask instead of the rd span.
*
* If an entire performance domain is outside of the current rd,
* it will not appear in its pd list and will not be accounted
* by compute_energy().
*/
for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
util = cpu_util_next(cpu, p, dst_cpu);
util = schedutil_energy_util(cpu, util);
max_util = max(util, max_util);
sum_util += util;
}
energy += em_pd_energy(pd->em_pd, max_util, sum_util);
}
return energy;
}
/*
* find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
* waking task. find_energy_efficient_cpu() looks for the CPU with maximum
* spare capacity in each performance domain and uses it as a potential
* candidate to execute the task. Then, it uses the Energy Model to figure
* out which of the CPU candidates is the most energy-efficient.
*
* The rationale for this heuristic is as follows. In a performance domain,
* all the most energy efficient CPU candidates (according to the Energy
* Model) are those for which we'll request a low frequency. When there are
* several CPUs for which the frequency request will be the same, we don't
* have enough data to break the tie between them, because the Energy Model
* only includes active power costs. With this model, if we assume that
* frequency requests follow utilization (e.g. using schedutil), the CPU with
* the maximum spare capacity in a performance domain is guaranteed to be among
* the best candidates of the performance domain.
*
* In practice, it could be preferable from an energy standpoint to pack
* small tasks on a CPU in order to let other CPUs go in deeper idle states,
* but that could also hurt our chances to go cluster idle, and we have no
* ways to tell with the current Energy Model if this is actually a good
* idea or not. So, find_energy_efficient_cpu() basically favors
* cluster-packing, and spreading inside a cluster. That should at least be
* a good thing for latency, and this is consistent with the idea that most
* of the energy savings of EAS come from the asymmetry of the system, and
* not so much from breaking the tie between identical CPUs. That's also the
* reason why EAS is enabled in the topology code only for systems where
* SD_ASYM_CPUCAPACITY is set.
*
* NOTE: Forkees are not accepted in the energy-aware wake-up path because
* they don't have any useful utilization data yet and it's not possible to
* forecast their impact on energy consumption. Consequently, they will be
* placed by find_idlest_cpu() on the least loaded CPU, which might turn out
* to be energy-inefficient in some use-cases. The alternative would be to
* bias new tasks towards specific types of CPUs first, or to try to infer
* their util_avg from the parent task, but those heuristics could hurt
* other use-cases too. So, until someone finds a better way to solve this,
* let's keep things simple by re-using the existing slow path.
*/
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
int cpu, best_energy_cpu = prev_cpu;
struct perf_domain *head, *pd;
unsigned long cpu_cap, util;
struct sched_domain *sd;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
goto fail;
head = pd;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
*/
sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
goto fail;
sync_entity_load_avg(&p->se);
if (!task_util_est(p))
goto unlock;
for (; pd; pd = pd->next) {
unsigned long cur_energy, spare_cap, max_spare_cap = 0;
int max_spare_cap_cpu = -1;
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
/* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
if (cpu_cap * 1024 < util * capacity_margin)
continue;
/* Always use prev_cpu as a candidate. */
if (cpu == prev_cpu) {
prev_energy = compute_energy(p, prev_cpu, head);
best_energy = min(best_energy, prev_energy);
continue;
}
/*
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
spare_cap = cpu_cap - util;
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
}
}
/* Evaluate the energy impact of using this CPU. */
if (max_spare_cap_cpu >= 0) {
cur_energy = compute_energy(p, max_spare_cap_cpu, head);
if (cur_energy < best_energy) {
best_energy = cur_energy;
best_energy_cpu = max_spare_cap_cpu;
}
}
}
unlock:
rcu_read_unlock();
/*
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least 6% of the energy used by prev_cpu.
*/
if (prev_energy == ULONG_MAX)
return best_energy_cpu;
if ((prev_energy - best_energy) > (prev_energy >> 4))
return best_energy_cpu;
return prev_cpu;
fail:
rcu_read_unlock();
return -1;
}
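/*
 * Worked example (editor's illustration, not part of this commit): the
 * prev_energy >> 4 threshold above is 1/16th, i.e. 6.25%. If staying on
 * prev_cpu is estimated at prev_energy = 1600 units, a candidate CPU is
 * only picked when its estimate is below 1600 - 100 = 1500; a best_energy
 * of, say, 1550 keeps the task on prev_cpu rather than migrating for a
 * marginal saving.
 */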
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6355,8 +6607,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
-want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
-&& cpumask_test_cpu(cpu, &p->cpus_allowed);
+if (static_branch_unlikely(&sched_energy_present)) {
+new_cpu = find_energy_efficient_cpu(p, prev_cpu);
+if (new_cpu >= 0)
+return new_cpu;
+new_cpu = prev_cpu;
+}
+want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+cpumask_test_cpu(cpu, &p->cpus_allowed);
}
rcu_read_lock();
@@ -6520,7 +6780,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
-if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) {
@@ -6532,7 +6792,7 @@ static void set_last_buddy(struct sched_entity *se)
static void set_next_buddy(struct sched_entity *se)
{
-if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) {
@@ -6590,8 +6850,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/* Idle tasks are by definition preempted by non-idle tasks. */
-if (unlikely(curr->policy == SCHED_IDLE) &&
-likely(p->policy != SCHED_IDLE))
+if (unlikely(task_has_idle_policy(curr)) &&
+likely(!task_has_idle_policy(p)))
goto preempt;
/*
@@ -7012,7 +7272,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (p->sched_class != &fair_sched_class)
return 0;
-if (unlikely(p->policy == SCHED_IDLE))
+if (unlikely(task_has_idle_policy(p)))
return 0;
/*
@@ -7896,16 +8156,16 @@ static bool update_nohz_stats(struct rq *rq, bool force)
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
-* @load_idx: Load index of sched_domain of this_cpu for load calc.
-* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
-* @overload: Indicate pullable load (e.g. >1 runnable task).
+* @sg_status: Holds flag indicating the status of the sched_group
*/
static inline void update_sg_lb_stats(struct lb_env *env,
-struct sched_group *group, int load_idx,
-int local_group, struct sg_lb_stats *sgs,
-bool *overload)
+struct sched_group *group,
+struct sg_lb_stats *sgs,
+int *sg_status)
{
+int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+int load_idx = get_sd_load_idx(env->sd, env->idle);
unsigned long load;
int i, nr_running;
@@ -7929,7 +8189,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
if (nr_running > 1)
-*overload = true;
+*sg_status |= SG_OVERLOAD;
+if (cpu_overutilized(i))
+*sg_status |= SG_OVERUTILIZED;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -7945,7 +8208,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
-*overload = 1;
+*sg_status |= SG_OVERLOAD;
}
}
...@@ -8090,17 +8353,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -8090,17 +8353,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups; struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs; struct sg_lb_stats tmp_sgs;
int load_idx;
bool overload = false;
bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
env->flags |= LBF_NOHZ_STATS; env->flags |= LBF_NOHZ_STATS;
#endif #endif
load_idx = get_sd_load_idx(env->sd, env->idle);
do { do {
struct sg_lb_stats *sgs = &tmp_sgs; struct sg_lb_stats *sgs = &tmp_sgs;
int local_group; int local_group;
...@@ -8115,8 +8375,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -8115,8 +8375,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu); update_group_capacity(env->sd, env->dst_cpu);
} }
update_sg_lb_stats(env, sg, load_idx, local_group, sgs, update_sg_lb_stats(env, sg, sgs, &sg_status);
&overload);
if (local_group) if (local_group)
goto next_group; goto next_group;
...@@ -8165,9 +8424,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -8165,9 +8424,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
env->fbq_type = fbq_classify_group(&sds->busiest_stat); env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) { if (!env->sd->parent) {
struct root_domain *rd = env->dst_rq->rd;
/* update overload indicator if we are at root domain */ /* update overload indicator if we are at root domain */
if (READ_ONCE(env->dst_rq->rd->overload) != overload) WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
WRITE_ONCE(env->dst_rq->rd->overload, overload);
/* Update over-utilization (tipping point, U >= 0) indicator */
WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
} else if (sg_status & SG_OVERUTILIZED) {
WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
} }
} }
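The root-domain indicators above are published with WRITE_ONCE() and read elsewhere (for example by find_busiest_group() in the next hunk) with READ_ONCE(). Loosely speaking, those macros behave like relaxed accesses that the compiler may not tear or elide; the standalone sketch below uses C11 atomics as a userspace stand-in for that publish/consume pattern (rd_overutilized is a toy variable, not the kernel field):

#include <stdatomic.h>
#include <stdio.h>

#define SG_OVERUTILIZED 0x2

/* Toy root-domain flag; relaxed atomics stand in for WRITE_ONCE/READ_ONCE. */
static _Atomic int rd_overutilized;

static void publish_status(int sg_status)
{
        atomic_store_explicit(&rd_overutilized, sg_status & SG_OVERUTILIZED,
                              memory_order_relaxed);
}

static int root_overutilized(void)
{
        return atomic_load_explicit(&rd_overutilized,
                                    memory_order_relaxed) != 0;
}

int main(void)
{
        publish_status(SG_OVERUTILIZED);
        printf("overutilized=%d\n", root_overutilized());
        return 0;
}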
...@@ -8394,6 +8659,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) ...@@ -8394,6 +8659,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level. * this level.
*/ */
update_sd_lb_stats(env, &sds); update_sd_lb_stats(env, &sds);
if (static_branch_unlikely(&sched_energy_present)) {
struct root_domain *rd = env->dst_rq->rd;
if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
goto out_balanced;
}
local = &sds.local_stat; local = &sds.local_stat;
busiest = &sds.busiest_stat; busiest = &sds.busiest_stat;
...@@ -8910,13 +9183,22 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -8910,13 +9183,22 @@ static int load_balance(int this_cpu, struct rq *this_rq,
sd->nr_balance_failed = 0; sd->nr_balance_failed = 0;
out_one_pinned: out_one_pinned:
ld_moved = 0;
/*
* idle_balance() disregards balance intervals, so we could repeatedly
* reach this code, which would lead to balance_interval skyrocketting
* in a short amount of time. Skip the balance_interval increase logic
* to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
/* tune up the balancing interval */ /* tune up the balancing interval */
if (((env.flags & LBF_ALL_PINNED) && if ((env.flags & LBF_ALL_PINNED &&
sd->balance_interval < MAX_PINNED_INTERVAL) || sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval)) sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2; sd->balance_interval *= 2;
ld_moved = 0;
out: out:
return ld_moved; return ld_moved;
} }
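The out_one_pinned changes above move ld_moved = 0 ahead of the interval update and, for CPU_NEWLY_IDLE balancing, skip the back-off entirely so that repeated idle-balance attempts cannot inflate balance_interval. A small standalone sketch of that doubling-with-cap behaviour, using made-up interval values (grow_interval() is illustrative, not the kernel function):

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512         /* same name as the kernel constant */

/* Illustrative only; mirrors the back-off in the hunk above. */
static unsigned long grow_interval(unsigned long interval,
                                   unsigned long max_interval,
                                   int all_pinned, int newly_idle)
{
        /* Newly-idle balancing retries so often that doubling on every
         * pass would make the interval explode, so it is skipped. */
        if (newly_idle)
                return interval;

        if ((all_pinned && interval < MAX_PINNED_INTERVAL) ||
            interval < max_interval)
                interval *= 2;

        return interval;
}

int main(void)
{
        unsigned long interval = 8;
        int i;

        for (i = 0; i < 6; i++)
                interval = grow_interval(interval, 128, 0, 0);

        printf("interval=%lu\n", interval); /* 8 -> 16 -> 32 -> 64 -> 128, then held */
        return 0;
}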
...@@ -9281,7 +9563,7 @@ static void nohz_balancer_kick(struct rq *rq) ...@@ -9281,7 +9563,7 @@ static void nohz_balancer_kick(struct rq *rq)
} }
} }
sd = rcu_dereference(per_cpu(sd_asym, cpu)); sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
if (sd) { if (sd) {
for_each_cpu(i, sched_domain_span(sd)) { for_each_cpu(i, sched_domain_span(sd)) {
if (i == cpu || if (i == cpu ||
...@@ -9783,6 +10065,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) ...@@ -9783,6 +10065,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr); task_tick_numa(rq, curr);
update_misfit_status(curr, rq); update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
} }
/* /*
......
...@@ -8,14 +8,14 @@ ...@@ -8,14 +8,14 @@
*/ */
#include "sched.h" #include "sched.h"
DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overridden);
static cpumask_var_t housekeeping_mask; static cpumask_var_t housekeeping_mask;
static unsigned int housekeeping_flags; static unsigned int housekeeping_flags;
int housekeeping_any_cpu(enum hk_flags flags) int housekeeping_any_cpu(enum hk_flags flags)
{ {
if (static_branch_unlikely(&housekeeping_overriden)) if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping_flags & flags) if (housekeeping_flags & flags)
return cpumask_any_and(housekeeping_mask, cpu_online_mask); return cpumask_any_and(housekeeping_mask, cpu_online_mask);
return smp_processor_id(); return smp_processor_id();
...@@ -24,7 +24,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu); ...@@ -24,7 +24,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
const struct cpumask *housekeeping_cpumask(enum hk_flags flags) const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{ {
if (static_branch_unlikely(&housekeeping_overriden)) if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping_flags & flags) if (housekeeping_flags & flags)
return housekeeping_mask; return housekeeping_mask;
return cpu_possible_mask; return cpu_possible_mask;
...@@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask); ...@@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask);
void housekeeping_affine(struct task_struct *t, enum hk_flags flags) void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
{ {
if (static_branch_unlikely(&housekeeping_overriden)) if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping_flags & flags) if (housekeeping_flags & flags)
set_cpus_allowed_ptr(t, housekeeping_mask); set_cpus_allowed_ptr(t, housekeeping_mask);
} }
...@@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(housekeeping_affine); ...@@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(housekeeping_affine);
bool housekeeping_test_cpu(int cpu, enum hk_flags flags) bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
{ {
if (static_branch_unlikely(&housekeeping_overriden)) if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping_flags & flags) if (housekeeping_flags & flags)
return cpumask_test_cpu(cpu, housekeeping_mask); return cpumask_test_cpu(cpu, housekeeping_mask);
return true; return true;
...@@ -53,7 +53,7 @@ void __init housekeeping_init(void) ...@@ -53,7 +53,7 @@ void __init housekeeping_init(void)
if (!housekeeping_flags) if (!housekeeping_flags)
return; return;
static_branch_enable(&housekeeping_overriden); static_branch_enable(&housekeeping_overridden);
if (housekeeping_flags & HK_FLAG_TICK) if (housekeeping_flags & HK_FLAG_TICK)
sched_tick_offload_init(); sched_tick_offload_init();
......
...@@ -1498,6 +1498,14 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag ...@@ -1498,6 +1498,14 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif #endif
} }
static inline void set_next_task(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq) struct rt_rq *rt_rq)
{ {
...@@ -1518,7 +1526,6 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, ...@@ -1518,7 +1526,6 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
static struct task_struct *_pick_next_task_rt(struct rq *rq) static struct task_struct *_pick_next_task_rt(struct rq *rq)
{ {
struct sched_rt_entity *rt_se; struct sched_rt_entity *rt_se;
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt; struct rt_rq *rt_rq = &rq->rt;
do { do {
...@@ -1527,10 +1534,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) ...@@ -1527,10 +1534,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
rt_rq = group_rt_rq(rt_se); rt_rq = group_rt_rq(rt_se);
} while (rt_rq); } while (rt_rq);
p = rt_task_of(rt_se); return rt_task_of(rt_se);
p->se.exec_start = rq_clock_task(rq);
return p;
} }
static struct task_struct * static struct task_struct *
...@@ -1573,8 +1577,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) ...@@ -1573,8 +1577,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = _pick_next_task_rt(rq); p = _pick_next_task_rt(rq);
/* The running task is never eligible for pushing */ set_next_task(rq, p);
dequeue_pushable_task(rq, p);
rt_queue_push_tasks(rq); rt_queue_push_tasks(rq);
...@@ -1810,10 +1813,8 @@ static int push_rt_task(struct rq *rq) ...@@ -1810,10 +1813,8 @@ static int push_rt_task(struct rq *rq)
return 0; return 0;
retry: retry:
if (unlikely(next_task == rq->curr)) { if (WARN_ON(next_task == rq->curr))
WARN_ON(1);
return 0; return 0;
}
/* /*
* It's possible that the next_task slipped in of * It's possible that the next_task slipped in of
...@@ -2355,12 +2356,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) ...@@ -2355,12 +2356,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
static void set_curr_task_rt(struct rq *rq) static void set_curr_task_rt(struct rq *rq)
{ {
struct task_struct *p = rq->curr; set_next_task(rq, rq->curr);
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
} }
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
......
...@@ -45,6 +45,7 @@ ...@@ -45,6 +45,7 @@
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/delayacct.h> #include <linux/delayacct.h>
#include <linux/energy_model.h>
#include <linux/init_task.h> #include <linux/init_task.h>
#include <linux/kprobes.h> #include <linux/kprobes.h>
#include <linux/kthread.h> #include <linux/kthread.h>
...@@ -177,6 +178,11 @@ static inline bool valid_policy(int policy) ...@@ -177,6 +178,11 @@ static inline bool valid_policy(int policy)
rt_policy(policy) || dl_policy(policy); rt_policy(policy) || dl_policy(policy);
} }
static inline int task_has_idle_policy(struct task_struct *p)
{
return idle_policy(p->policy);
}
static inline int task_has_rt_policy(struct task_struct *p) static inline int task_has_rt_policy(struct task_struct *p)
{ {
return rt_policy(p->policy); return rt_policy(p->policy);
...@@ -632,7 +638,7 @@ struct dl_rq { ...@@ -632,7 +638,7 @@ struct dl_rq {
/* /*
* Deadline values of the currently executing and the * Deadline values of the currently executing and the
* earliest ready task on this rq. Caching these facilitates * earliest ready task on this rq. Caching these facilitates
* the decision wether or not a ready but not running task * the decision whether or not a ready but not running task
* should migrate somewhere else. * should migrate somewhere else.
*/ */
struct { struct {
...@@ -704,6 +710,16 @@ static inline bool sched_asym_prefer(int a, int b) ...@@ -704,6 +710,16 @@ static inline bool sched_asym_prefer(int a, int b)
return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
} }
struct perf_domain {
struct em_perf_domain *em_pd;
struct perf_domain *next;
struct rcu_head rcu;
};
/* Scheduling group status flags */
#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
/* /*
* We add the notion of a root-domain which will be used to define per-domain * We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by * variables. Each exclusive cpuset essentially defines an island domain by
...@@ -726,6 +742,9 @@ struct root_domain { ...@@ -726,6 +742,9 @@ struct root_domain {
*/ */
int overload; int overload;
/* Indicate one or more cpus over-utilized (tipping point) */
int overutilized;
/* /*
* The bit corresponding to a CPU gets set here if such CPU has more * The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks). * than one runnable -deadline task (as it is below for RT tasks).
...@@ -756,6 +775,12 @@ struct root_domain { ...@@ -756,6 +775,12 @@ struct root_domain {
struct cpupri cpupri; struct cpupri cpupri;
unsigned long max_cpu_capacity; unsigned long max_cpu_capacity;
/*
* NULL-terminated list of performance domains intersecting with the
* CPUs of the rd. Protected by RCU.
*/
struct perf_domain *pd;
}; };
extern struct root_domain def_root_domain; extern struct root_domain def_root_domain;
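The rd->pd member added above is a NULL-terminated, RCU-protected singly linked list of performance domains; it is built by pd_init(), find_pd() and build_perf_domains() further down in this diff. A toy userspace sketch of the same list shape, with a plain bitmask standing in for the kernel cpumask and no RCU (struct toy_pd and its helpers are illustrative only):

#include <stdio.h>
#include <stdlib.h>

/*
 * Toy stand-in for the rd->pd list: an unsigned long bitmask replaces
 * the kernel cpumask and there is no RCU protection.
 */
struct toy_pd {
        unsigned long span;             /* bit i set => CPU i belongs here */
        struct toy_pd *next;            /* NULL-terminated, like rd->pd    */
};

static struct toy_pd *toy_pd_add(struct toy_pd *head, unsigned long span)
{
        struct toy_pd *pd = calloc(1, sizeof(*pd));

        if (!pd)
                return head;
        pd->span = span;
        pd->next = head;                /* prepend, as build_perf_domains() does */
        return pd;
}

static struct toy_pd *toy_find_pd(struct toy_pd *pd, int cpu)
{
        while (pd) {
                if (pd->span & (1UL << cpu))
                        return pd;
                pd = pd->next;
        }
        return NULL;
}

static void toy_free_pd(struct toy_pd *pd)
{
        struct toy_pd *tmp;

        while (pd) {                    /* same walk as free_pd() below */
                tmp = pd->next;
                free(pd);
                pd = tmp;
        }
}

int main(void)
{
        struct toy_pd *head = NULL;

        head = toy_pd_add(head, 0x0fUL);        /* CPUs 0-3 */
        head = toy_pd_add(head, 0xf0UL);        /* CPUs 4-7 */

        printf("CPU5 lives in the pd spanning %#lx\n",
               toy_find_pd(head, 5)->span);
        toy_free_pd(head);
        return 0;
}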
...@@ -1285,7 +1310,8 @@ DECLARE_PER_CPU(int, sd_llc_size); ...@@ -1285,7 +1310,8 @@ DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_asym); DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity; extern struct static_key_false sched_asym_cpucapacity;
struct sched_group_capacity { struct sched_group_capacity {
...@@ -1429,7 +1455,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ...@@ -1429,7 +1455,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* /*
* After ->cpu is set up to a new value, task_rq_lock(p, ...) can be * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
* successfuly executed on another CPU. We must ensure that updates of * successfully executed on another CPU. We must ensure that updates of
* per-task data have been completed by this moment. * per-task data have been completed by this moment.
*/ */
smp_wmb(); smp_wmb();
...@@ -1794,12 +1820,12 @@ static inline void add_nr_running(struct rq *rq, unsigned count) ...@@ -1794,12 +1820,12 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
rq->nr_running = prev_nr + count; rq->nr_running = prev_nr + count;
if (prev_nr < 2 && rq->nr_running >= 2) {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (prev_nr < 2 && rq->nr_running >= 2) {
if (!READ_ONCE(rq->rd->overload)) if (!READ_ONCE(rq->rd->overload))
WRITE_ONCE(rq->rd->overload, 1); WRITE_ONCE(rq->rd->overload, 1);
#endif
} }
#endif
sched_update_tick_dependency(rq); sched_update_tick_dependency(rq);
} }
...@@ -1854,27 +1880,6 @@ unsigned long arch_scale_freq_capacity(int cpu) ...@@ -1854,27 +1880,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
} }
#endif #endif
#ifdef CONFIG_SMP
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
return sd->smt_gain / sd->span_weight;
return SCHED_CAPACITY_SCALE;
}
#endif
#else
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
#endif
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT #ifdef CONFIG_PREEMPT
...@@ -2207,6 +2212,31 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ...@@ -2207,6 +2212,31 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif #endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/**
* enum schedutil_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
* @ENERGY_UTIL: Utilization used during energy calculation
*
* The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
* need to be aggregated differently depending on the usage made of them. This
* enum is used within schedutil_freq_util() to differentiate the types of
* utilization expected by the callers, and adjust the aggregation accordingly.
*/
enum schedutil_type {
FREQUENCY_UTIL,
ENERGY_UTIL,
};
unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type);
static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
{
unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
}
static inline unsigned long cpu_bw_dl(struct rq *rq) static inline unsigned long cpu_bw_dl(struct rq *rq)
{ {
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
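The schedutil_type enum and helpers above expose a single entry point, schedutil_freq_util(), whose aggregation of the CFS/RT/DL/IRQ signals depends on whether the caller wants a frequency request or an energy estimate; schedutil_energy_util() merely fixes the type to ENERGY_UTIL and the ceiling to the CPU capacity, and the later #else branch falls back to returning the CFS utilization when schedutil is not built in. A toy standalone sketch of that wrapper shape; toy_freq_util() simply clamps, since the real aggregation is not part of this hunk:

#include <stdio.h>

enum util_type { FREQUENCY_UTIL, ENERGY_UTIL };

/*
 * Toy stand-ins only: the real schedutil_freq_util() aggregates the
 * utilization signals differently per type; that logic is not shown
 * here, so the toy just clamps to the CPU capacity.
 */
static unsigned long toy_freq_util(int cpu, unsigned long util_cfs,
                                   unsigned long max, enum util_type type)
{
        (void)cpu;
        (void)type;
        return util_cfs < max ? util_cfs : max;
}

/* Mirrors the shape of schedutil_energy_util(): fix the type and ceiling. */
static unsigned long toy_energy_util(int cpu, unsigned long cfs,
                                     unsigned long cpu_capacity)
{
        return toy_freq_util(cpu, cfs, cpu_capacity, ENERGY_UTIL);
}

int main(void)
{
        printf("energy util = %lu\n", toy_energy_util(0, 300, 1024));
        return 0;
}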
...@@ -2233,6 +2263,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq) ...@@ -2233,6 +2263,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
{ {
return READ_ONCE(rq->avg_rt.util_avg); return READ_ONCE(rq->avg_rt.util_avg);
} }
#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
{
return cfs;
}
#endif #endif
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
...@@ -2262,3 +2297,13 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned ...@@ -2262,3 +2297,13 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
return util; return util;
} }
#endif #endif
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
#else
#define perf_domain_span(pd) NULL
#endif
#ifdef CONFIG_SMP
extern struct static_key_false sched_energy_present;
#endif
...@@ -201,6 +201,199 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) ...@@ -201,6 +201,199 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1; return 1;
} }
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;
static void free_pd(struct perf_domain *pd)
{
struct perf_domain *tmp;
while (pd) {
tmp = pd->next;
kfree(pd);
pd = tmp;
}
}
static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
{
while (pd) {
if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
return pd;
pd = pd->next;
}
return NULL;
}
static struct perf_domain *pd_init(int cpu)
{
struct em_perf_domain *obj = em_cpu_get(cpu);
struct perf_domain *pd;
if (!obj) {
if (sched_debug())
pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
return NULL;
}
pd = kzalloc(sizeof(*pd), GFP_KERNEL);
if (!pd)
return NULL;
pd->em_pd = obj;
return pd;
}
static void perf_domain_debug(const struct cpumask *cpu_map,
struct perf_domain *pd)
{
if (!sched_debug() || !pd)
return;
printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
while (pd) {
printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
cpumask_first(perf_domain_span(pd)),
cpumask_pr_args(perf_domain_span(pd)),
em_pd_nr_cap_states(pd->em_pd));
pd = pd->next;
}
printk(KERN_CONT "\n");
}
static void destroy_perf_domain_rcu(struct rcu_head *rp)
{
struct perf_domain *pd;
pd = container_of(rp, struct perf_domain, rcu);
free_pd(pd);
}
static void sched_energy_set(bool has_eas)
{
if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
if (sched_debug())
pr_info("%s: stopping EAS\n", __func__);
static_branch_disable_cpuslocked(&sched_energy_present);
} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
if (sched_debug())
pr_info("%s: starting EAS\n", __func__);
static_branch_enable_cpuslocked(&sched_energy_present);
}
}
/*
* EAS can be used on a root domain if it meets all the following conditions:
* 1. an Energy Model (EM) is available;
* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
* 3. the EM complexity is low enough to keep scheduling overheads low;
* 4. schedutil is driving the frequency of all CPUs of the rd;
*
* The complexity of the Energy Model is defined as:
*
* C = nr_pd * (nr_cpus + nr_cs)
*
* with parameters defined as:
* - nr_pd: the number of performance domains
* - nr_cpus: the number of CPUs
* - nr_cs: the sum of the number of capacity states of all performance
* domains (for example, on a system with 2 performance domains,
* with 10 capacity states each, nr_cs = 2 * 10 = 20).
*
* It is generally not a good idea to use such a model in the wake-up path on
* very complex platforms because of the associated scheduling overheads. The
* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
* with per-CPU DVFS and less than 8 capacity states each, for example.
*/
#define EM_MAX_COMPLEXITY 2048
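The complexity bound is easy to check by hand. A tiny standalone program applying C = nr_pd * (nr_cpus + nr_cs) to two made-up platforms (the numbers below are illustrative, not taken from the comment above):

#include <stdio.h>

#define EM_MAX_COMPLEXITY 2048

/* C = nr_pd * (nr_cpus + nr_cs), as defined in the comment above. */
static int em_complexity(int nr_pd, int nr_cpus, int nr_cs)
{
        return nr_pd * (nr_cpus + nr_cs);
}

static void report(int nr_pd, int nr_cpus, int nr_cs)
{
        int c = em_complexity(nr_pd, nr_cpus, nr_cs);

        printf("C=%d -> EAS %s\n", c,
               c <= EM_MAX_COMPLEXITY ? "allowed" : "rejected");
}

int main(void)
{
        /* Hypothetical big.LITTLE system: 2 pds, 8 CPUs, 10 states each. */
        report(2, 8, 2 * 10);

        /* Per-CPU DVFS on 64 CPUs with 32 states each blows the budget. */
        report(64, 64, 64 * 32);
        return 0;
}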
extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
struct perf_domain *pd = NULL, *tmp;
int cpu = cpumask_first(cpu_map);
struct root_domain *rd = cpu_rq(cpu)->rd;
struct cpufreq_policy *policy;
struct cpufreq_governor *gov;
/* EAS is enabled for asymmetric CPU capacity topologies. */
if (!per_cpu(sd_asym_cpucapacity, cpu)) {
if (sched_debug()) {
pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
cpumask_pr_args(cpu_map));
}
goto free;
}
for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */
if (find_pd(pd, i))
continue;
/* Do not attempt EAS if schedutil is not being used. */
policy = cpufreq_cpu_get(i);
if (!policy)
goto free;
gov = policy->governor;
cpufreq_cpu_put(policy);
if (gov != &schedutil_gov) {
if (rd->pd)
pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
cpumask_pr_args(cpu_map));
goto free;
}
/* Create the new pd and add it to the local list. */
tmp = pd_init(i);
if (!tmp)
goto free;
tmp->next = pd;
pd = tmp;
/*
* Count performance domains and capacity states for the
* complexity check.
*/
nr_pd++;
nr_cs += em_pd_nr_cap_states(pd->em_pd);
}
/* Bail out if the Energy Model complexity is too high. */
if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
cpumask_pr_args(cpu_map));
goto free;
}
perf_domain_debug(cpu_map, pd);
/* Attach the new list of performance domains to the root domain. */
tmp = rd->pd;
rcu_assign_pointer(rd->pd, pd);
if (tmp)
call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
return !!pd;
free:
free_pd(pd);
tmp = rd->pd;
rcu_assign_pointer(rd->pd, NULL);
if (tmp)
call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
return false;
}
#else
static void free_pd(struct perf_domain *pd) { }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
static void free_rootdomain(struct rcu_head *rcu) static void free_rootdomain(struct rcu_head *rcu)
{ {
struct root_domain *rd = container_of(rcu, struct root_domain, rcu); struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
...@@ -211,6 +404,7 @@ static void free_rootdomain(struct rcu_head *rcu) ...@@ -211,6 +404,7 @@ static void free_rootdomain(struct rcu_head *rcu)
free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online); free_cpumask_var(rd->online);
free_cpumask_var(rd->span); free_cpumask_var(rd->span);
free_pd(rd->pd);
kfree(rd); kfree(rd);
} }
...@@ -397,7 +591,8 @@ DEFINE_PER_CPU(int, sd_llc_size); ...@@ -397,7 +591,8 @@ DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym); DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu) static void update_top_cache_domain(int cpu)
...@@ -423,7 +618,10 @@ static void update_top_cache_domain(int cpu) ...@@ -423,7 +618,10 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
sd = highest_flag_domain(cpu, SD_ASYM_PACKING); sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
} }
/* /*
...@@ -1133,7 +1331,6 @@ sd_init(struct sched_domain_topology_level *tl, ...@@ -1133,7 +1331,6 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies, .last_balance = jiffies,
.balance_interval = sd_weight, .balance_interval = sd_weight,
.smt_gain = 0,
.max_newidle_lb_cost = 0, .max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies, .next_decay_max_lb_cost = jiffies,
.child = child, .child = child,
...@@ -1164,7 +1361,6 @@ sd_init(struct sched_domain_topology_level *tl, ...@@ -1164,7 +1361,6 @@ sd_init(struct sched_domain_topology_level *tl,
if (sd->flags & SD_SHARE_CPUCAPACITY) { if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110; sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) { } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117; sd->imbalance_pct = 117;
...@@ -1934,6 +2130,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, ...@@ -1934,6 +2130,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new) struct sched_domain_attr *dattr_new)
{ {
bool __maybe_unused has_eas = false;
int i, j, n; int i, j, n;
int new_topology; int new_topology;
...@@ -1961,8 +2158,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ...@@ -1961,8 +2158,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
/* Destroy deleted domains: */ /* Destroy deleted domains: */
for (i = 0; i < ndoms_cur; i++) { for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) { for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) if (cpumask_equal(doms_cur[i], doms_new[j]) &&
&& dattrs_equal(dattr_cur, i, dattr_new, j)) dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1; goto match1;
} }
/* No match - a current sched domain not in new doms_new[] */ /* No match - a current sched domain not in new doms_new[] */
...@@ -1982,8 +2179,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ...@@ -1982,8 +2179,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
/* Build new domains: */ /* Build new domains: */
for (i = 0; i < ndoms_new; i++) { for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < n && !new_topology; j++) { for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j]) if (cpumask_equal(doms_new[i], doms_cur[j]) &&
&& dattrs_equal(dattr_new, i, dattr_cur, j)) dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2; goto match2;
} }
/* No match - add a new doms_new */ /* No match - add a new doms_new */
...@@ -1992,6 +2189,24 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ...@@ -1992,6 +2189,24 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
; ;
} }
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
/* Build perf. domains: */
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < n && !sched_energy_update; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j]) &&
cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
has_eas = true;
goto match3;
}
}
/* No match - add perf. domains for a new rd */
has_eas |= build_perf_domains(doms_new[i]);
match3:
;
}
sched_energy_set(has_eas);
#endif
/* Remember the new sched domains: */ /* Remember the new sched domains: */
if (doms_cur != &fallback_doms) if (doms_cur != &fallback_doms)
free_sched_domains(doms_cur, ndoms_cur); free_sched_domains(doms_cur, ndoms_cur);
......