Commit af8c5e2d authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Implement frequency/CPU invariance and OPP selection for
     SCHED_DEADLINE (Juri Lelli)

   - Tweak the task migration logic for better multi-tasking
     workload scalability (Mel Gorman)

   - Misc cleanups, fixes and improvements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Make bandwidth enforcement scale-invariant
  sched/cpufreq: Move arch_scale_{freq,cpu}_capacity() outside of #ifdef CONFIG_SMP
  sched/cpufreq: Remove arch_scale_freq_capacity()'s 'sd' parameter
  sched/cpufreq: Always consider all CPUs when deciding next freq
  sched/cpufreq: Split utilization signals
  sched/cpufreq: Change the worker kthread to SCHED_DEADLINE
  sched/deadline: Move CPU frequency selection triggering points
  sched/cpufreq: Use the DEADLINE utilization signal
  sched/deadline: Implement "runtime overrun signal" support
  sched/fair: Only immediately migrate tasks due to interrupts if prev and target CPUs share cache
  sched/fair: Correct obsolete comment about cpufreq_update_util()
  sched/fair: Remove impossible condition from find_idlest_group_cpu()
  sched/cpufreq: Don't pass flags to sugov_set_iowait_boost()
  sched/cpufreq: Initialize sg_cpu->flags to 0
  sched/fair: Consider RT/IRQ pressure in capacity_spare_wake()
  sched/fair: Use 'unsigned long' for utilization, consistently
  sched/core: Rework and clarify prepare_lock_switch()
  sched/fair: Remove unused 'curr' parameter from wakeup_gran
  sched/headers: Constify object_is_on_stack()
parents a1c75e17 07881166
...@@ -27,7 +27,7 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); ...@@ -27,7 +27,7 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
DECLARE_PER_CPU(unsigned long, freq_scale); DECLARE_PER_CPU(unsigned long, freq_scale);
static inline static inline
unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu) unsigned long topology_get_freq_scale(int cpu)
{ {
return per_cpu(freq_scale, cpu); return per_cpu(freq_scale, cpu);
} }
......
...@@ -472,11 +472,15 @@ struct sched_dl_entity { ...@@ -472,11 +472,15 @@ struct sched_dl_entity {
* has not been executed yet. This flag is useful to avoid race * has not been executed yet. This flag is useful to avoid race
* conditions between the inactive timer handler and the wakeup * conditions between the inactive timer handler and the wakeup
* code. * code.
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
*/ */
unsigned int dl_throttled : 1; unsigned int dl_throttled : 1;
unsigned int dl_boosted : 1; unsigned int dl_boosted : 1;
unsigned int dl_yielded : 1; unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1; unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
/* /*
* Bandwidth enforcement timer. Each -deadline task has its * Bandwidth enforcement timer. Each -deadline task has its
...@@ -1427,6 +1431,7 @@ extern int idle_cpu(int cpu); ...@@ -1427,6 +1431,7 @@ extern int idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu); extern struct task_struct *idle_task(int cpu);
/** /**
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
#define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_DL (1U << 1)
#define SCHED_CPUFREQ_IOWAIT (1U << 2) #define SCHED_CPUFREQ_IOWAIT (1U << 2)
#define SCHED_CPUFREQ_RT_DL (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)
#ifdef CONFIG_CPU_FREQ #ifdef CONFIG_CPU_FREQ
struct update_util_data { struct update_util_data {
void (*func)(struct update_util_data *data, u64 time, unsigned int flags); void (*func)(struct update_util_data *data, u64 time, unsigned int flags);
......
...@@ -78,7 +78,7 @@ static inline void put_task_stack(struct task_struct *tsk) {} ...@@ -78,7 +78,7 @@ static inline void put_task_stack(struct task_struct *tsk) {}
#define task_stack_end_corrupted(task) \ #define task_stack_end_corrupted(task) \
(*(end_of_stack(task)) != STACK_END_MAGIC) (*(end_of_stack(task)) != STACK_END_MAGIC)
static inline int object_is_on_stack(void *obj) static inline int object_is_on_stack(const void *obj)
{ {
void *stack = task_stack_page(current); void *stack = task_stack_page(current);
......
...@@ -6,6 +6,12 @@ ...@@ -6,6 +6,12 @@
#include <linux/sched/idle.h> #include <linux/sched/idle.h>
/*
* Increase resolution of cpu_capacity calculations
*/
#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
/* /*
* sched-domains (multiprocessor balancing) declarations: * sched-domains (multiprocessor balancing) declarations:
*/ */
...@@ -27,12 +33,6 @@ ...@@ -27,12 +33,6 @@
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
#define SD_NUMA 0x4000 /* cross-node balancing */ #define SD_NUMA 0x4000 /* cross-node balancing */
/*
* Increase resolution of cpu_capacity calculations
*/
#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void) static inline int cpu_smt_flags(void)
{ {
......
...@@ -49,5 +49,10 @@ ...@@ -49,5 +49,10 @@
*/ */
#define SCHED_FLAG_RESET_ON_FORK 0x01 #define SCHED_FLAG_RESET_ON_FORK 0x01
#define SCHED_FLAG_RECLAIM 0x02 #define SCHED_FLAG_RECLAIM 0x02
#define SCHED_FLAG_DL_OVERRUN 0x04
#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN)
#endif /* _UAPI_LINUX_SCHED_H */ #endif /* _UAPI_LINUX_SCHED_H */
...@@ -2046,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ...@@ -2046,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* If the owning (remote) CPU is still in the middle of schedule() with * If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until it's done referencing the task. * this task as prev, wait until it's done referencing the task.
* *
* Pairs with the smp_store_release() in finish_lock_switch(). * Pairs with the smp_store_release() in finish_task().
* *
* This ensures that tasks getting woken will be fully ordered against * This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order. * their previous state and preserve Program Order.
...@@ -2572,6 +2572,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, ...@@ -2572,6 +2572,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
#endif /* CONFIG_PREEMPT_NOTIFIERS */ #endif /* CONFIG_PREEMPT_NOTIFIERS */
/*
 * prepare_task - flag @next as occupying a CPU ahead of a context switch.
 * @next: the task about to be switched to.
 *
 * On CONFIG_SMP builds this sets next->on_cpu to 1; on other builds the
 * body is empty and the call is a no-op.
 */
static inline void prepare_task(struct task_struct *next)
{
#ifdef CONFIG_SMP
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
*/
next->on_cpu = 1;
#endif
}
/*
 * finish_task - release @prev once the switch away from it is complete.
 * @prev: the task that was just switched out.
 *
 * On CONFIG_SMP builds this clears prev->on_cpu with a release store
 * (smp_store_release()), so everything done before the call is ordered
 * ahead of the point where ->on_cpu reads as 0. On other builds the body
 * is empty.
 */
static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
* After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
* Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
}
/*
 * finish_lock_switch - fix up ownership of rq->lock and drop it.
 * @rq: the runqueue whose lock was carried across the context switch.
 *
 * With CONFIG_DEBUG_SPINLOCK the recorded lock owner is rewritten to
 * 'current'. The lock's dep_map is then re-annotated as acquired at this
 * call site via spin_acquire(), and finally the lock is released with
 * raw_spin_unlock_irq().
 */
static inline void finish_lock_switch(struct rq *rq)
{
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
#endif
/*
* If we are tracking spinlock dependencies then we have to
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
raw_spin_unlock_irq(&rq->lock);
}
/** /**
* prepare_task_switch - prepare to switch tasks * prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch * @rq: the runqueue preparing to switch
...@@ -2592,7 +2636,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, ...@@ -2592,7 +2636,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next); sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next); perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next); fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next); prepare_task(next);
prepare_arch_switch(next); prepare_arch_switch(next);
} }
...@@ -2647,7 +2691,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) ...@@ -2647,7 +2691,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* the scheduled task must drop that reference. * the scheduled task must drop that reference.
* *
* We must observe prev->state before clearing prev->on_cpu (in * We must observe prev->state before clearing prev->on_cpu (in
* finish_lock_switch), otherwise a concurrent wakeup can get prev * finish_task), otherwise a concurrent wakeup can get prev
* running on another CPU and we could race with its RUNNING -> DEAD * running on another CPU and we could race with its RUNNING -> DEAD
* transition, resulting in a double drop. * transition, resulting in a double drop.
*/ */
...@@ -2664,7 +2708,8 @@ static struct rq *finish_task_switch(struct task_struct *prev) ...@@ -2664,7 +2708,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* to use. * to use.
*/ */
smp_mb__after_unlock_lock(); smp_mb__after_unlock_lock();
finish_lock_switch(rq, prev); finish_task(prev);
finish_lock_switch(rq);
finish_arch_post_lock_switch(); finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current); fire_sched_in_preempt_notifiers(current);
...@@ -4041,8 +4086,7 @@ static int __sched_setscheduler(struct task_struct *p, ...@@ -4041,8 +4086,7 @@ static int __sched_setscheduler(struct task_struct *p,
return -EINVAL; return -EINVAL;
} }
if (attr->sched_flags & if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
return -EINVAL; return -EINVAL;
/* /*
...@@ -4109,6 +4153,9 @@ static int __sched_setscheduler(struct task_struct *p, ...@@ -4109,6 +4153,9 @@ static int __sched_setscheduler(struct task_struct *p,
} }
if (user) { if (user) {
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
retval = security_task_setscheduler(p); retval = security_task_setscheduler(p);
if (retval) if (retval)
return retval; return retval;
...@@ -4164,7 +4211,8 @@ static int __sched_setscheduler(struct task_struct *p, ...@@ -4164,7 +4211,8 @@ static int __sched_setscheduler(struct task_struct *p,
} }
#endif #endif
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (dl_bandwidth_enabled() && dl_policy(policy)) { if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span; cpumask_t *span = rq->rd->span;
/* /*
...@@ -4294,6 +4342,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ...@@ -4294,6 +4342,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
} }
EXPORT_SYMBOL_GPL(sched_setattr); EXPORT_SYMBOL_GPL(sched_setattr);
/**
 * sched_setattr_nocheck - apply scheduling attributes to a task via __sched_setscheduler().
 * @p: the task in question.
 * @attr: the scheduling attributes to apply.
 *
 * Forwards to __sched_setscheduler(p, attr, false, true).
 *
 * NOTE(review): the third argument is presumably the 'user' flag tested by
 * "if (user)" inside __sched_setscheduler(); if so, passing false skips the
 * user-only checks there (e.g. the SCHED_FLAG_SUGOV rejection and
 * security_task_setscheduler()). Confirm against the callee's signature.
 *
 * Return: whatever __sched_setscheduler() returns.
 */
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
/** /**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
* @p: the task in question. * @p: the task in question.
......
...@@ -60,7 +60,8 @@ struct sugov_cpu { ...@@ -60,7 +60,8 @@ struct sugov_cpu {
u64 last_update; u64 last_update;
/* The fields below are only needed when sharing a policy. */ /* The fields below are only needed when sharing a policy. */
unsigned long util; unsigned long util_cfs;
unsigned long util_dl;
unsigned long max; unsigned long max;
unsigned int flags; unsigned int flags;
...@@ -176,21 +177,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, ...@@ -176,21 +177,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq); return cpufreq_driver_resolve_freq(policy, freq);
} }
static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) static void sugov_get_util(struct sugov_cpu *sg_cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long cfs_max;
cfs_max = arch_scale_cpu_capacity(NULL, cpu); sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
sg_cpu->util_cfs = cpu_util_cfs(rq);
sg_cpu->util_dl = cpu_util_dl(rq);
}
*util = min(rq->cfs.avg.util_avg, cfs_max); static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
*max = cfs_max; {
/*
* Ideally we would like to set util_dl as min/guaranteed freq and
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
* ready for such an interface. So, we only do the latter for now.
*/
return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
} }
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
unsigned int flags)
{ {
if (flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
if (sg_cpu->iowait_boost_pending) if (sg_cpu->iowait_boost_pending)
return; return;
...@@ -264,7 +272,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ...@@ -264,7 +272,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int next_f; unsigned int next_f;
bool busy; bool busy;
sugov_set_iowait_boost(sg_cpu, time, flags); sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time; sg_cpu->last_update = time;
if (!sugov_should_update_freq(sg_policy, time)) if (!sugov_should_update_freq(sg_policy, time))
...@@ -272,10 +280,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ...@@ -272,10 +280,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
busy = sugov_cpu_is_busy(sg_cpu); busy = sugov_cpu_is_busy(sg_cpu);
if (flags & SCHED_CPUFREQ_RT_DL) { if (flags & SCHED_CPUFREQ_RT) {
next_f = policy->cpuinfo.max_freq; next_f = policy->cpuinfo.max_freq;
} else { } else {
sugov_get_util(&util, &max, sg_cpu->cpu); sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_aggregate_util(sg_cpu);
sugov_iowait_boost(sg_cpu, &util, &max); sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max); next_f = get_next_freq(sg_policy, util, max);
/* /*
...@@ -305,23 +315,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) ...@@ -305,23 +315,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
s64 delta_ns; s64 delta_ns;
/* /*
* If the CPU utilization was last updated before the previous * If the CFS CPU utilization was last updated before the
* frequency update and the time elapsed between the last update * previous frequency update and the time elapsed between the
* of the CPU utilization and the last frequency update is long * last update of the CPU utilization and the last frequency
* enough, don't take the CPU into account as it probably is * update is long enough, reset iowait_boost and util_cfs, as
* idle now (and clear iowait_boost for it). * they are now probably stale. However, still consider the
* CPU contribution if it has some DEADLINE utilization
* (util_dl).
*/ */
delta_ns = time - j_sg_cpu->last_update; delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) { if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost = 0;
j_sg_cpu->iowait_boost_pending = false; j_sg_cpu->iowait_boost_pending = false;
continue; j_sg_cpu->util_cfs = 0;
if (j_sg_cpu->util_dl == 0)
continue;
} }
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
return policy->cpuinfo.max_freq; return policy->cpuinfo.max_freq;
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max; j_max = j_sg_cpu->max;
j_util = sugov_aggregate_util(j_sg_cpu);
if (j_util * max > j_max * util) { if (j_util * max > j_max * util) {
util = j_util; util = j_util;
max = j_max; max = j_max;
...@@ -338,22 +352,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, ...@@ -338,22 +352,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
{ {
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long util, max;
unsigned int next_f; unsigned int next_f;
sugov_get_util(&util, &max, sg_cpu->cpu);
raw_spin_lock(&sg_policy->update_lock); raw_spin_lock(&sg_policy->update_lock);
sg_cpu->util = util; sugov_get_util(sg_cpu);
sg_cpu->max = max;
sg_cpu->flags = flags; sg_cpu->flags = flags;
sugov_set_iowait_boost(sg_cpu, time, flags); sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time; sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) { if (sugov_should_update_freq(sg_policy, time)) {
if (flags & SCHED_CPUFREQ_RT_DL) if (flags & SCHED_CPUFREQ_RT)
next_f = sg_policy->policy->cpuinfo.max_freq; next_f = sg_policy->policy->cpuinfo.max_freq;
else else
next_f = sugov_next_freq_shared(sg_cpu, time); next_f = sugov_next_freq_shared(sg_cpu, time);
...@@ -383,9 +393,9 @@ static void sugov_irq_work(struct irq_work *irq_work) ...@@ -383,9 +393,9 @@ static void sugov_irq_work(struct irq_work *irq_work)
sg_policy = container_of(irq_work, struct sugov_policy, irq_work); sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
/* /*
* For RT and deadline tasks, the schedutil governor shoots the * For RT tasks, the schedutil governor shoots the frequency to maximum.
* frequency to maximum. Special care must be taken to ensure that this * Special care must be taken to ensure that this kthread doesn't result
* kthread doesn't result in the same behavior. * in the same behavior.
* *
* This is (mostly) guaranteed by the work_in_progress flag. The flag is * This is (mostly) guaranteed by the work_in_progress flag. The flag is
* updated only at the end of the sugov_work() function and before that * updated only at the end of the sugov_work() function and before that
...@@ -470,7 +480,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) ...@@ -470,7 +480,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)
static int sugov_kthread_create(struct sugov_policy *sg_policy) static int sugov_kthread_create(struct sugov_policy *sg_policy)
{ {
struct task_struct *thread; struct task_struct *thread;
struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; struct sched_attr attr = {
.size = sizeof(struct sched_attr),
.sched_policy = SCHED_DEADLINE,
.sched_flags = SCHED_FLAG_SUGOV,
.sched_nice = 0,
.sched_priority = 0,
/*
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
.sched_runtime = 1000000,
.sched_deadline = 10000000,
.sched_period = 10000000,
};
struct cpufreq_policy *policy = sg_policy->policy; struct cpufreq_policy *policy = sg_policy->policy;
int ret; int ret;
...@@ -488,10 +511,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) ...@@ -488,10 +511,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
return PTR_ERR(thread); return PTR_ERR(thread);
} }
ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param); ret = sched_setattr_nocheck(thread, &attr);
if (ret) { if (ret) {
kthread_stop(thread); kthread_stop(thread);
pr_warn("%s: failed to set SCHED_FIFO\n", __func__); pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
return ret; return ret;
} }
...@@ -655,7 +678,7 @@ static int sugov_start(struct cpufreq_policy *policy) ...@@ -655,7 +678,7 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu)); memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu; sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy; sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->flags = 0;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
} }
......
...@@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i) ...@@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i)
#endif #endif
static inline static inline
void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{ {
u64 old = dl_rq->running_bw; u64 old = dl_rq->running_bw;
...@@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) ...@@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
dl_rq->running_bw += dl_bw; dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
} }
static inline static inline
void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{ {
u64 old = dl_rq->running_bw; u64 old = dl_rq->running_bw;
...@@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) ...@@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old) if (dl_rq->running_bw > old)
dl_rq->running_bw = 0; dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
} }
static inline static inline
void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{ {
u64 old = dl_rq->this_bw; u64 old = dl_rq->this_bw;
...@@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) ...@@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
} }
static inline static inline
void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{ {
u64 old = dl_rq->this_bw; u64 old = dl_rq->this_bw;
...@@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) ...@@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
} }
/*
 * Hand dl_se's bandwidth (dl_se->dl_bw) to __add_rq_bw() for @dl_rq,
 * unless dl_entity_is_special() reports the entity as special, in which
 * case nothing is done.
 */
static inline
void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
	/* Special entities are left out of the accounting. */
	if (dl_entity_is_special(dl_se))
		return;

	__add_rq_bw(dl_se->dl_bw, dl_rq);
}
/*
 * Hand dl_se's bandwidth (dl_se->dl_bw) to __sub_rq_bw() for @dl_rq,
 * unless dl_entity_is_special() reports the entity as special, in which
 * case nothing is done.
 */
static inline
void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
	/* Special entities are left out of the accounting. */
	if (dl_entity_is_special(dl_se))
		return;

	__sub_rq_bw(dl_se->dl_bw, dl_rq);
}
/*
 * Hand dl_se's bandwidth (dl_se->dl_bw) to __add_running_bw() for @dl_rq,
 * unless dl_entity_is_special() reports the entity as special, in which
 * case nothing is done.
 */
static inline
void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
	/* Special entities are left out of the accounting. */
	if (dl_entity_is_special(dl_se))
		return;

	__add_running_bw(dl_se->dl_bw, dl_rq);
}
/*
 * Hand dl_se's bandwidth (dl_se->dl_bw) to __sub_running_bw() for @dl_rq,
 * unless dl_entity_is_special() reports the entity as special, in which
 * case nothing is done.
 */
static inline
void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
	/* Special entities are left out of the accounting. */
	if (dl_entity_is_special(dl_se))
		return;

	__sub_running_bw(dl_se->dl_bw, dl_rq);
}
void dl_change_utilization(struct task_struct *p, u64 new_bw) void dl_change_utilization(struct task_struct *p, u64 new_bw)
{ {
struct rq *rq; struct rq *rq;
BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
if (task_on_rq_queued(p)) if (task_on_rq_queued(p))
return; return;
rq = task_rq(p); rq = task_rq(p);
if (p->dl.dl_non_contending) { if (p->dl.dl_non_contending) {
sub_running_bw(p->dl.dl_bw, &rq->dl); sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0; p->dl.dl_non_contending = 0;
/* /*
* If the timer handler is currently running and the * If the timer handler is currently running and the
...@@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) ...@@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p); put_task_struct(p);
} }
sub_rq_bw(p->dl.dl_bw, &rq->dl); __sub_rq_bw(p->dl.dl_bw, &rq->dl);
add_rq_bw(new_bw, &rq->dl); __add_rq_bw(new_bw, &rq->dl);
} }
/* /*
...@@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p) ...@@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p)
if (dl_se->dl_runtime == 0) if (dl_se->dl_runtime == 0)
return; return;
if (dl_entity_is_special(dl_se))
return;
WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending); WARN_ON(dl_se->dl_non_contending);
...@@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p) ...@@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p)
*/ */
if (zerolag_time < 0) { if (zerolag_time < 0) {
if (dl_task(p)) if (dl_task(p))
sub_running_bw(dl_se->dl_bw, dl_rq); sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) { if (!dl_task(p) || p->state == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (p->state == TASK_DEAD) if (p->state == TASK_DEAD)
sub_rq_bw(p->dl.dl_bw, &rq->dl); sub_rq_bw(&p->dl, &rq->dl);
raw_spin_lock(&dl_b->lock); raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p); __dl_clear_params(p);
...@@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) ...@@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
return; return;
if (flags & ENQUEUE_MIGRATED) if (flags & ENQUEUE_MIGRATED)
add_rq_bw(dl_se->dl_bw, dl_rq); add_rq_bw(dl_se, dl_rq);
if (dl_se->dl_non_contending) { if (dl_se->dl_non_contending) {
dl_se->dl_non_contending = 0; dl_se->dl_non_contending = 0;
...@@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) ...@@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* when the "inactive timer" fired). * when the "inactive timer" fired).
* So, add it back. * So, add it back.
*/ */
add_running_bw(dl_se->dl_bw, dl_rq); add_running_bw(dl_se, dl_rq);
} }
} }
...@@ -1114,7 +1151,8 @@ static void update_curr_dl(struct rq *rq) ...@@ -1114,7 +1151,8 @@ static void update_curr_dl(struct rq *rq)
{ {
struct task_struct *curr = rq->curr; struct task_struct *curr = rq->curr;
struct sched_dl_entity *dl_se = &curr->dl; struct sched_dl_entity *dl_se = &curr->dl;
u64 delta_exec; u64 delta_exec, scaled_delta_exec;
int cpu = cpu_of(rq);
if (!dl_task(curr) || !on_dl_rq(dl_se)) if (!dl_task(curr) || !on_dl_rq(dl_se))
return; return;
...@@ -1134,9 +1172,6 @@ static void update_curr_dl(struct rq *rq) ...@@ -1134,9 +1172,6 @@ static void update_curr_dl(struct rq *rq)
return; return;
} }
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
schedstat_set(curr->se.statistics.exec_max, schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec)); max(curr->se.statistics.exec_max, delta_exec));
...@@ -1148,13 +1183,39 @@ static void update_curr_dl(struct rq *rq) ...@@ -1148,13 +1183,39 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec); sched_rt_avg_update(rq, delta_exec);
if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) if (dl_entity_is_special(dl_se))
delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); return;
dl_se->runtime -= delta_exec;
/*
* For tasks that participate in GRUB, we implement GRUB-PA: the
* spare reclaimed bandwidth is used to clock down frequency.
*
* For the others, we still need to scale reservation parameters
* according to current frequency and CPU maximum capacity.
*/
if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
scaled_delta_exec = grub_reclaim(delta_exec,
rq,
&curr->dl);
} else {
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
scaled_delta_exec = cap_scale(delta_exec, scale_freq);
scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
}
dl_se->runtime -= scaled_delta_exec;
throttle: throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1; dl_se->dl_throttled = 1;
/* If requested, inform the user about runtime overruns. */
if (dl_runtime_exceeded(dl_se) &&
(dl_se->flags & SCHED_FLAG_DL_OVERRUN))
dl_se->dl_overrun = 1;
__dequeue_task_dl(rq, curr, 0); __dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
...@@ -1204,8 +1265,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) ...@@ -1204,8 +1265,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (p->state == TASK_DEAD && dl_se->dl_non_contending) { if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
dl_se->dl_non_contending = 0; dl_se->dl_non_contending = 0;
} }
...@@ -1222,7 +1283,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) ...@@ -1222,7 +1283,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
sched_clock_tick(); sched_clock_tick();
update_rq_clock(rq); update_rq_clock(rq);
sub_running_bw(dl_se->dl_bw, &rq->dl); sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0; dl_se->dl_non_contending = 0;
unlock: unlock:
task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf);
...@@ -1416,8 +1477,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) ...@@ -1416,8 +1477,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
dl_check_constrained_dl(&p->dl); dl_check_constrained_dl(&p->dl);
if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
add_rq_bw(p->dl.dl_bw, &rq->dl); add_rq_bw(&p->dl, &rq->dl);
add_running_bw(p->dl.dl_bw, &rq->dl); add_running_bw(&p->dl, &rq->dl);
} }
/* /*
...@@ -1457,8 +1518,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) ...@@ -1457,8 +1518,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
__dequeue_task_dl(rq, p, flags); __dequeue_task_dl(rq, p, flags);
if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
sub_running_bw(p->dl.dl_bw, &rq->dl); sub_running_bw(&p->dl, &rq->dl);
sub_rq_bw(p->dl.dl_bw, &rq->dl); sub_rq_bw(&p->dl, &rq->dl);
} }
/* /*
...@@ -1564,7 +1625,7 @@ static void migrate_task_rq_dl(struct task_struct *p) ...@@ -1564,7 +1625,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
*/ */
raw_spin_lock(&rq->lock); raw_spin_lock(&rq->lock);
if (p->dl.dl_non_contending) { if (p->dl.dl_non_contending) {
sub_running_bw(p->dl.dl_bw, &rq->dl); sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0; p->dl.dl_non_contending = 0;
/* /*
* If the timer handler is currently running and the * If the timer handler is currently running and the
...@@ -1576,7 +1637,7 @@ static void migrate_task_rq_dl(struct task_struct *p) ...@@ -1576,7 +1637,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p); put_task_struct(p);
} }
sub_rq_bw(p->dl.dl_bw, &rq->dl); sub_rq_bw(&p->dl, &rq->dl);
raw_spin_unlock(&rq->lock); raw_spin_unlock(&rq->lock);
} }
...@@ -2019,11 +2080,11 @@ static int push_dl_task(struct rq *rq) ...@@ -2019,11 +2080,11 @@ static int push_dl_task(struct rq *rq)
} }
deactivate_task(rq, next_task, 0); deactivate_task(rq, next_task, 0);
sub_running_bw(next_task->dl.dl_bw, &rq->dl); sub_running_bw(&next_task->dl, &rq->dl);
sub_rq_bw(next_task->dl.dl_bw, &rq->dl); sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu); set_task_cpu(next_task, later_rq->cpu);
add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); add_rq_bw(&next_task->dl, &later_rq->dl);
add_running_bw(next_task->dl.dl_bw, &later_rq->dl); add_running_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0); activate_task(later_rq, next_task, 0);
ret = 1; ret = 1;
...@@ -2111,11 +2172,11 @@ static void pull_dl_task(struct rq *this_rq) ...@@ -2111,11 +2172,11 @@ static void pull_dl_task(struct rq *this_rq)
resched = true; resched = true;
deactivate_task(src_rq, p, 0); deactivate_task(src_rq, p, 0);
sub_running_bw(p->dl.dl_bw, &src_rq->dl); sub_running_bw(&p->dl, &src_rq->dl);
sub_rq_bw(p->dl.dl_bw, &src_rq->dl); sub_rq_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu); set_task_cpu(p, this_cpu);
add_rq_bw(p->dl.dl_bw, &this_rq->dl); add_rq_bw(&p->dl, &this_rq->dl);
add_running_bw(p->dl.dl_bw, &this_rq->dl); add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0); activate_task(this_rq, p, 0);
dmin = p->dl.deadline; dmin = p->dl.deadline;
...@@ -2224,7 +2285,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) ...@@ -2224,7 +2285,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
task_non_contending(p); task_non_contending(p);
if (!task_on_rq_queued(p)) if (!task_on_rq_queued(p))
sub_rq_bw(p->dl.dl_bw, &rq->dl); sub_rq_bw(&p->dl, &rq->dl);
/* /*
* We cannot use inactive_task_timer() to invoke sub_running_bw() * We cannot use inactive_task_timer() to invoke sub_running_bw()
...@@ -2256,7 +2317,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) ...@@ -2256,7 +2317,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
/* If p is not queued we will update its parameters at next wakeup. */ /* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) { if (!task_on_rq_queued(p)) {
add_rq_bw(p->dl.dl_bw, &rq->dl); add_rq_bw(&p->dl, &rq->dl);
return; return;
} }
...@@ -2435,6 +2496,9 @@ int sched_dl_overflow(struct task_struct *p, int policy, ...@@ -2435,6 +2496,9 @@ int sched_dl_overflow(struct task_struct *p, int policy,
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1; int cpus, err = -1;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
/* !deadline task may carry old deadline bandwidth */ /* !deadline task may carry old deadline bandwidth */
if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
return 0; return 0;
...@@ -2521,6 +2585,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) ...@@ -2521,6 +2585,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/ */
bool __checkparam_dl(const struct sched_attr *attr) bool __checkparam_dl(const struct sched_attr *attr)
{ {
/* special dl tasks don't actually use any parameter */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return true;
/* deadline != 0 */ /* deadline != 0 */
if (attr->sched_deadline == 0) if (attr->sched_deadline == 0)
return false; return false;
...@@ -2566,6 +2634,7 @@ void __dl_clear_params(struct task_struct *p) ...@@ -2566,6 +2634,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_throttled = 0; dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0; dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0; dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
} }
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
......
...@@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) ...@@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
/* /*
* There are a few boundary cases this might miss but it should * There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be * get called often enough that that should (hopefully) not be
* a real problem -- added to that it only calls on the local * a real problem.
* CPU, so if we enqueue remotely we'll miss an update, but
* the next tick/schedule should update.
* *
* It will not get called when we go idle, because the idle * It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization * thread is a different class (!fair), nor will the utilization
...@@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) ...@@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3; return c1 + c2 + c3;
} }
/* Multiply @v by @s, then shift the product right by SCHED_CAPACITY_SHIFT. */
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/* /*
* Accumulate the three separate parts of the sum; d1 the remainder * Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3 * of the last (incomplete) period, d2 the span of full periods and d3
...@@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, ...@@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods; u64 periods;
scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_freq = arch_scale_freq_capacity(cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
delta += sa->period_contrib; delta += sa->period_contrib;
...@@ -5689,8 +5685,8 @@ static int wake_wide(struct task_struct *p) ...@@ -5689,8 +5685,8 @@ static int wake_wide(struct task_struct *p)
* soonest. For the purpose of speed we only consider the waking and previous * soonest. For the purpose of speed we only consider the waking and previous
* CPU. * CPU.
* *
* wake_affine_idle() - only considers 'now', it check if the waking CPU is (or * wake_affine_idle() - only considers 'now', it check if the waking CPU is
* will be) idle. * cache-affine and is (or will be) idle.
* *
* wake_affine_weight() - considers the weight to reflect the average * wake_affine_weight() - considers the weight to reflect the average
* scheduling latency of the CPUs. This seems to work * scheduling latency of the CPUs. This seems to work
...@@ -5701,7 +5697,13 @@ static bool ...@@ -5701,7 +5697,13 @@ static bool
wake_affine_idle(struct sched_domain *sd, struct task_struct *p, wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync) int this_cpu, int prev_cpu, int sync)
{ {
if (idle_cpu(this_cpu)) /*
* If this_cpu is idle, it implies the wakeup is from interrupt
* context. Only allow the move if cache is shared. Otherwise an
* interrupt intensive workload could force all tasks onto one
* node depending on the IO topology or IRQ affinity settings.
*/
if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return true; return true;
if (sync && cpu_rq(this_cpu)->nr_running == 1) if (sync && cpu_rq(this_cpu)->nr_running == 1)
...@@ -5765,12 +5767,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, ...@@ -5765,12 +5767,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return affine; return affine;
} }
static inline int task_util(struct task_struct *p); static inline unsigned long task_util(struct task_struct *p);
static int cpu_util_wake(int cpu, struct task_struct *p); static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
{ {
return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
} }
/* /*
...@@ -5950,7 +5952,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this ...@@ -5950,7 +5952,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
} }
} else if (shallowest_idle_cpu == -1) { } else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(cpu_rq(i)); load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) { if (load < min_load) {
min_load = load; min_load = load;
least_loaded_cpu = i; least_loaded_cpu = i;
} }
...@@ -6247,7 +6249,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) ...@@ -6247,7 +6249,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* capacity_orig) as it useful for predicting the capacity required after task * capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS). * migrations (scheduler-driven DVFS).
*/ */
static int cpu_util(int cpu) static unsigned long cpu_util(int cpu)
{ {
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu); unsigned long capacity = capacity_orig_of(cpu);
...@@ -6255,7 +6257,7 @@ static int cpu_util(int cpu) ...@@ -6255,7 +6257,7 @@ static int cpu_util(int cpu)
return (util >= capacity) ? capacity : util; return (util >= capacity) ? capacity : util;
} }
static inline int task_util(struct task_struct *p) static inline unsigned long task_util(struct task_struct *p)
{ {
return p->se.avg.util_avg; return p->se.avg.util_avg;
} }
...@@ -6264,7 +6266,7 @@ static inline int task_util(struct task_struct *p) ...@@ -6264,7 +6266,7 @@ static inline int task_util(struct task_struct *p)
* cpu_util_wake: Compute cpu utilization with any contributions from * cpu_util_wake: Compute cpu utilization with any contributions from
* the waking task p removed. * the waking task p removed.
*/ */
static int cpu_util_wake(int cpu, struct task_struct *p) static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
{ {
unsigned long util, capacity; unsigned long util, capacity;
...@@ -6449,8 +6451,7 @@ static void task_dead_fair(struct task_struct *p) ...@@ -6449,8 +6451,7 @@ static void task_dead_fair(struct task_struct *p)
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
static unsigned long static unsigned long wakeup_gran(struct sched_entity *se)
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{ {
unsigned long gran = sysctl_sched_wakeup_granularity; unsigned long gran = sysctl_sched_wakeup_granularity;
...@@ -6492,7 +6493,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) ...@@ -6492,7 +6493,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
if (vdiff <= 0) if (vdiff <= 0)
return -1; return -1;
gran = wakeup_gran(curr, se); gran = wakeup_gran(se);
if (vdiff > gran) if (vdiff > gran)
return 1; return 1;
......
...@@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p) ...@@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy); return dl_policy(p->policy);
} }
/* Multiply @v by @s, then shift the product right by SCHED_CAPACITY_SHIFT. */
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
* This is actually gross. :(
*
* It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
* tasks, but still be able to sleep. We need this on platforms that cannot
* atomically change clock frequency. Remove once fast switching is
* available on such platforms.
*
* SUGOV stands for SchedUtil GOVernor.
*
* The value is a bit mask tested against a deadline entity's flags word.
*/
#define SCHED_FLAG_SUGOV 0x10000000
/*
 * Tells whether @dl_se is a "special" deadline entity, i.e. one whose
 * flags have SCHED_FLAG_SUGOV set.
 *
 * The flag is only tested when CONFIG_CPU_FREQ_GOV_SCHEDUTIL is enabled;
 * otherwise this always returns false and @dl_se is never dereferenced.
 *
 * @dl_se is only read, so it is taken as a pointer to const; callers
 * holding a non-const pointer are unaffected.
 */
static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
	return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
#else
	return false;
#endif
}
/* /*
* Tells if entity @a should preempt entity @b. * Tells if entity @a should preempt entity @b.
*/ */
static inline bool static inline bool
dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
{ {
return dl_time_before(a->deadline, b->deadline); return dl_entity_is_special(a) ||
dl_time_before(a->deadline, b->deadline);
} }
/* /*
...@@ -1328,47 +1354,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) ...@@ -1328,47 +1354,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
# define finish_arch_post_lock_switch() do { } while (0) # define finish_arch_post_lock_switch() do { } while (0)
#endif #endif
/*
* On CONFIG_SMP builds, set @next->on_cpu to 1; on other builds this
* function does nothing. @rq is not used here.
*/
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
/*
* We can optimise this out completely for !SMP, because the
* SMP rebalancing from interrupt is the only thing that cares
* here.
*/
next->on_cpu = 1;
#endif
}
/*
* Final lock handling for a task switch, in this order:
*  1. (CONFIG_SMP) clear @prev->on_cpu with a release store;
*  2. (CONFIG_DEBUG_SPINLOCK) record current as owner of @rq->lock;
*  3. tell lockdep that @rq->lock is held by the current context;
*  4. drop @rq->lock via raw_spin_unlock_irq().
* The order of these steps matters; see the comments on each step.
*/
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
* After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
* Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
#endif
/*
* If we are tracking spinlock dependencies then we have to
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
/* Only now is the runqueue lock actually released. */
raw_spin_unlock_irq(&rq->lock);
}
/* /*
* wake flags * wake flags
*/ */
...@@ -1687,17 +1672,17 @@ static inline int hrtick_enabled(struct rq *rq) ...@@ -1687,17 +1672,17 @@ static inline int hrtick_enabled(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */ #endif /* CONFIG_SCHED_HRTICK */
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
#ifndef arch_scale_freq_capacity #ifndef arch_scale_freq_capacity
static __always_inline static __always_inline
unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) unsigned long arch_scale_freq_capacity(int cpu)
{ {
return SCHED_CAPACITY_SCALE; return SCHED_CAPACITY_SCALE;
} }
#endif #endif
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
#ifndef arch_scale_cpu_capacity #ifndef arch_scale_cpu_capacity
static __always_inline static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
...@@ -1711,10 +1696,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) ...@@ -1711,10 +1696,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{ {
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
sched_avg_update(rq); sched_avg_update(rq);
} }
#else #else
#ifndef arch_scale_cpu_capacity
/*
* Fallback for the !CONFIG_SMP branch, used only when no
* arch_scale_cpu_capacity macro is defined: every @cpu reports
* SCHED_CAPACITY_SCALE. @sd is ignored (declared void * and marked
* __always_unused).
*/
static __always_inline
unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { } static inline void sched_avg_update(struct rq *rq) { }
#endif #endif
...@@ -2096,14 +2088,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ...@@ -2096,14 +2088,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
* The way cpufreq is currently arranged requires it to evaluate the CPU * The way cpufreq is currently arranged requires it to evaluate the CPU
* performance state (frequency/voltage) on a regular basis to prevent it from * performance state (frequency/voltage) on a regular basis to prevent it from
* being stuck in a completely inadequate performance level for too long. * being stuck in a completely inadequate performance level for too long.
* That is not guaranteed to happen if the updates are only triggered from CFS, * That is not guaranteed to happen if the updates are only triggered from CFS
* though, because they may not be coming in if RT or deadline tasks are active * and DL, though, because they may not be coming in if only RT tasks are
* all the time (or there are RT and DL tasks only). * active all the time (or there are RT tasks only).
* *
* As a workaround for that issue, this function is called by the RT and DL * As a workaround for that issue, this function is called periodically by the
* sched classes to trigger extra cpufreq updates to prevent it from stalling, * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
* but that really is a band-aid. Going forward it should be replaced with * but that really is a band-aid. Going forward it should be replaced with
* solutions targeted more specifically at RT and DL tasks. * solutions targeted more specifically at RT tasks.
*/ */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{ {
...@@ -2125,3 +2117,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ...@@ -2125,3 +2117,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#else /* arch_scale_freq_capacity */ #else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false) #define arch_scale_freq_invariant() (false)
#endif #endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/*
* Returns @rq->dl.running_bw multiplied by SCHED_CAPACITY_SCALE and then
* shifted right by BW_SHIFT.
*
* NOTE(review): presumably running_bw is a fixed-point fraction with
* BW_SHIFT fractional bits, making the result a utilization in capacity
* units - confirm against the dl_rq definition.
*/
static inline unsigned long cpu_util_dl(struct rq *rq)
{
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}
/* Returns the util_avg value stored in @rq's cfs.avg, unmodified. */
static inline unsigned long cpu_util_cfs(struct rq *rq)
{
return rq->cfs.avg.util_avg;
}
#endif
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <linux/tick.h> #include <linux/tick.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/compat.h> #include <linux/compat.h>
#include <linux/sched/deadline.h>
#include "posix-timers.h" #include "posix-timers.h"
...@@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers, ...@@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers,
return 0; return 0;
} }
/*
 * Deliver a pending deadline-overrun notification for @tsk.
 *
 * When @tsk->dl.dl_overrun is set, the flag is cleared and SIGXCPU is
 * sent through __group_send_sig_info() with SEND_SIG_PRIV. When the flag
 * is clear, nothing happens.
 */
static inline void check_dl_overrun(struct task_struct *tsk)
{
	/* Nothing pending: leave the task alone. */
	if (!tsk->dl.dl_overrun)
		return;

	/* Consume the flag before signalling. */
	tsk->dl.dl_overrun = 0;
	__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
/* /*
* Check for any per-thread CPU timers that have fired and move them off * Check for any per-thread CPU timers that have fired and move them off
* the tsk->cpu_timers[N] list onto the firing list. Here we update the * the tsk->cpu_timers[N] list onto the firing list. Here we update the
...@@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk, ...@@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk,
u64 expires; u64 expires;
unsigned long soft; unsigned long soft;
if (dl_task(tsk))
check_dl_overrun(tsk);
/* /*
* If cputime_expires is zero, then there are no active * If cputime_expires is zero, then there are no active
* per thread CPU timers. * per thread CPU timers.
...@@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk, ...@@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk,
struct task_cputime cputime; struct task_cputime cputime;
unsigned long soft; unsigned long soft;
if (dl_task(tsk))
check_dl_overrun(tsk);
/* /*
* If cputimer is not running, then there are no active * If cputimer is not running, then there are no active
* process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
...@@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) ...@@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
return 1; return 1;
} }
if (dl_task(tsk) && tsk->dl.dl_overrun)
return 1;
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment