Commit 6ae71436 authored by Linus Torvalds

Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Borislav Petkov:
 "Mostly minor things this time; some highlights:

   - core-sched: Add 'Forced Idle' accounting; this allows tracking how
     much CPU time is 'lost' due to core scheduling constraints.

   - psi: Fix for MEM_FULL; a task running reclaim would be counted as a
     runnable task and prevent MEM_FULL from being reported.

   - cpuacct: Long-standing fixes for some cgroup accounting issues.

   - rt: The bandwidth timer could, under unusual circumstances, fail to
     be armed, leading to indefinite throttling."

[ Description above by Peter Zijlstra ]
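As a quick illustration of the 'Forced Idle' accounting highlighted above: with
CONFIG_SCHED_CORE and CONFIG_SCHEDSTATS, and schedstats turned on
(kernel.sched_schedstats=1), the per-task sum is exported in /proc/<pid>/sched as
core_forceidle_sum (see the kernel/sched/debug.c hunk in the diff below). The sketch
that follows is illustrative only; the helper name and the line-parsing details are
assumptions, not part of this merge, and the field appears to be printed in
milliseconds like the other schedstat times.

/*
 * Illustrative sketch: read a task's forced-idle time from /proc/<pid>/sched.
 * Assumes CONFIG_SCHED_CORE + CONFIG_SCHEDSTATS and kernel.sched_schedstats=1;
 * the field name matches the debug.c hunk below, everything else is made up
 * for the example.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int read_core_forceidle_sum(int pid, double *ms)
{
	char path[64], line[256];
	FILE *f;
	int ret = -1;

	snprintf(path, sizeof(path), "/proc/%d/sched", pid);
	f = fopen(path, "r");
	if (!f)
		return -1;

	while (fgets(line, sizeof(line), f)) {
		char *colon;

		if (!strstr(line, "core_forceidle_sum"))
			continue;
		colon = strchr(line, ':');
		if (colon) {
			*ms = strtod(colon + 1, NULL); /* appears to be in ms */
			ret = 0;
		}
		break;
	}
	fclose(f);
	return ret;
}

int main(int argc, char **argv)
{
	double ms;

	if (argc == 2 && read_core_forceidle_sum(atoi(argv[1]), &ms) == 0)
		printf("core_forceidle_sum: %.3f ms\n", ms);
	return 0;
}

The value accumulates only while a cookied task keeps an SMT sibling forced idle, so
sampling it twice and subtracting gives the forced-idle time over an interval.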

* tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs()
  sched/fair: Cleanup task_util and capacity type
  sched/rt: Try to restart rt period timer when rt runtime exceeded
  sched/fair: Document the slow path and fast path in select_task_rq_fair
  sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity
  sched/fair: Fix detection of per-CPU kthreads waking a task
  sched/cpuacct: Make user/system times in cpuacct.stat more precise
  sched/cpuacct: Fix user/system in shown cpuacct.usage*
  cpuacct: Convert BUG_ON() to WARN_ON_ONCE()
  cputime, cpuacct: Include guest time in user time in cpuacct.stat
  psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
  sched/core: Forced idle accounting
  psi: Add a missing SPDX license header
  psi: Remove repeated verbose comment
parents 01367e86 82762d2a
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_PSI_H
 #define _LINUX_PSI_H
......
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_PSI_TYPES_H
 #define _LINUX_PSI_TYPES_H
@@ -21,7 +22,17 @@ enum psi_task_count {
	 * don't have to special case any state tracking for it.
	 */
	NR_ONCPU,
-	NR_PSI_TASK_COUNTS = 4,
+	/*
+	 * For IO and CPU stalls the presence of running/oncpu tasks
+	 * in the domain means a partial rather than a full stall.
+	 * For memory it's not so simple because of page reclaimers:
+	 * they are running/oncpu while representing a stall. To tell
+	 * whether a domain has productivity left or not, we need to
+	 * distinguish between regular running (i.e. productive)
+	 * threads and memstall ones.
+	 */
+	NR_MEMSTALL_RUNNING,
+	NR_PSI_TASK_COUNTS = 5,
 };
 /* Task state bitmasks */
@@ -29,6 +40,7 @@ enum psi_task_count {
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
 #define TSK_ONCPU	(1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
 /* Resources that workloads could be stalled on */
 enum psi_res {
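A toy model of what the new NR_MEMSTALL_RUNNING counter buys (mirroring the
test_state() change in the kernel/sched/psi.c hunk further down): memory is now
considered FULL when tasks are in memstall and every runnable task is itself a
reclaimer. The function name and the counter values below are made up purely for
illustration.

/*
 * Toy model of the new PSI_MEM_FULL test: the domain is memory-FULL when
 * there are memstall tasks and every runnable task is a memstall reclaimer.
 * Counter values are invented for the example.
 */
#include <stdbool.h>
#include <stdio.h>

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_ONCPU,
       NR_MEMSTALL_RUNNING, NR_PSI_TASK_COUNTS };

static bool mem_full(const unsigned int *tasks)
{
	return tasks[NR_MEMSTALL] &&
	       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING];
}

int main(void)
{
	/* 2 tasks stalled on memory, 1 runnable task which is a reclaimer */
	unsigned int tasks[NR_PSI_TASK_COUNTS] = {
		[NR_MEMSTALL] = 2, [NR_RUNNING] = 1, [NR_MEMSTALL_RUNNING] = 1,
	};

	/* the old rule (nr_running == 0) would miss this case */
	printf("mem full: %s\n", mem_full(tasks) ? "yes" : "no");
	return 0;
}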
......
@@ -523,7 +523,11 @@ struct sched_statistics {
	u64			nr_wakeups_affine_attempts;
	u64			nr_wakeups_passive;
	u64			nr_wakeups_idle;
+#ifdef CONFIG_SCHED_CORE
+	u64			core_forceidle_sum;
 #endif
+#endif /* CONFIG_SCHEDSTATS */
 } ____cacheline_aligned;
 struct sched_entity {
......
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
		return false;
	/* flip prio, so high prio is leftmost */
-	if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+	if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
		return true;
	return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 }
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 {
	rq->core->core_task_seq++;
-	if (!sched_core_enqueued(p))
-		return;
+	if (sched_core_enqueued(p)) {
+		rb_erase(&p->core_node, &rq->core_tree);
+		RB_CLEAR_NODE(&p->core_node);
+	}
-	rb_erase(&p->core_node, &rq->core_tree);
-	RB_CLEAR_NODE(&p->core_node);
+	/*
+	 * Migrating the last task off the cpu, with the cpu in forced idle
+	 * state. Reschedule to create an accounting edge for forced idle,
+	 * and re-examine whether the core is still in forced idle state.
+	 */
+	if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+	    rq->core->core_forceidle_count && rq->curr == rq->idle)
+		resched_curr(rq);
 }
 /*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
		for_each_cpu(t, smt_mask)
			cpu_rq(t)->core_enabled = enabled;
+		cpu_rq(cpu)->core->core_forceidle_start = 0;
		sched_core_unlock(cpu, &flags);
		cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
 #else /* !CONFIG_SCHED_CORE */
 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 #endif /* CONFIG_SCHED_CORE */
@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
	if (sched_core_enabled(rq))
-		sched_core_dequeue(rq, p);
+		sched_core_dequeue(rq, p, flags);
	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock(rq);
@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
	if (sched_feat(LATENCY_WARN))
		resched_latency = cpu_resched_latency(rq);
	calc_global_load_tick(rq);
+	sched_core_tick(rq);
	rq_unlock(rq, &rf);
@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	struct task_struct *next, *p, *max = NULL;
	const struct cpumask *smt_mask;
	bool fi_before = false;
+	bool core_clock_updated = (rq == rq->core);
	unsigned long cookie;
	int i, cpu, occ = 0;
	struct rq *rq_i;
@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	/* reset state */
	rq->core->core_cookie = 0UL;
-	if (rq->core->core_forceidle) {
+	if (rq->core->core_forceidle_count) {
+		if (!core_clock_updated) {
+			update_rq_clock(rq->core);
+			core_clock_updated = true;
+		}
+		sched_core_account_forceidle(rq);
+		/* reset after accounting force idle */
+		rq->core->core_forceidle_start = 0;
+		rq->core->core_forceidle_count = 0;
+		rq->core->core_forceidle_occupation = 0;
		need_sync = true;
		fi_before = true;
-		rq->core->core_forceidle = false;
	}
	/*
@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	for_each_cpu_wrap(i, smt_mask, cpu) {
		rq_i = cpu_rq(i);
-		if (i != cpu)
+		/*
+		 * Current cpu always has its clock updated on entrance to
+		 * pick_next_task(). If the current cpu is not the core,
+		 * the core may also have been updated above.
+		 */
+		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
			update_rq_clock(rq_i);
		p = rq_i->core_pick = pick_task(rq_i);
@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
		if (p == rq_i->idle) {
			if (rq_i->nr_running) {
-				rq->core->core_forceidle = true;
+				rq->core->core_forceidle_count++;
				if (!fi_before)
					rq->core->core_forceidle_seq++;
			}
@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
		}
	}
+	if (schedstat_enabled() && rq->core->core_forceidle_count) {
+		if (cookie)
+			rq->core->core_forceidle_start = rq_clock(rq->core);
+		rq->core->core_forceidle_occupation = occ;
+	}
	rq->core->core_pick_seq = rq->core->core_task_seq;
	next = rq->core_pick;
	rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
		 *  1		0	1
		 *  1		1	0
		 */
-		if (!(fi_before && rq->core->core_forceidle))
-			task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+		if (!(fi_before && rq->core->core_forceidle_count))
+			task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
		rq_i->core_pick->core_occupation = occ;
@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
		goto unlock;
	/* copy the shared state to the new leader */
	core_rq->core_task_seq             = rq->core_task_seq;
	core_rq->core_pick_seq             = rq->core_pick_seq;
	core_rq->core_cookie               = rq->core_cookie;
-	core_rq->core_forceidle            = rq->core_forceidle;
+	core_rq->core_forceidle_count      = rq->core_forceidle_count;
	core_rq->core_forceidle_seq        = rq->core_forceidle_seq;
+	core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+	/*
+	 * Accounting edge for forced idle is handled in pick_next_task().
+	 * Don't need another one here, since the hotplug thread shouldn't
+	 * have a cookie.
+	 */
+	core_rq->core_forceidle_start = 0;
	/* install new leader */
	for_each_cpu(t, smt_mask) {
@@ -7126,7 +7166,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 unsigned long sched_cpu_util(int cpu, unsigned long max)
 {
-	return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+	return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
				  ENERGY_UTIL, NULL);
 }
 #endif /* CONFIG_SMP */
@@ -9409,7 +9449,9 @@ void __init sched_init(void)
		rq->core_pick = NULL;
		rq->core_enabled = 0;
		rq->core_tree = RB_ROOT;
-		rq->core_forceidle = false;
+		rq->core_forceidle_count = 0;
+		rq->core_forceidle_occupation = 0;
+		rq->core_forceidle_start = 0;
		rq->core_cookie = 0UL;
 #endif
......
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
	enqueued = sched_core_enqueued(p);
	if (enqueued)
-		sched_core_dequeue(rq, p);
+		sched_core_dequeue(rq, p, DEQUEUE_SAVE);
	old_cookie = p->core_cookie;
	p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
	 * If task is currently running, it may not be compatible anymore after
	 * the cookie change, so enter the scheduler on its CPU to schedule it
	 * away.
+	 *
+	 * Note that it is possible that as a result of this cookie change, the
+	 * core has now entered/left forced idle state. Defer accounting to the
+	 * next scheduling edge, rather than always forcing a reschedule here.
	 */
	if (task_running(rq, p))
		resched_curr(rq);
@@ -232,3 +236,63 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
	return err;
 }
+
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+	const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+	u64 delta, now = rq_clock(rq->core);
+	struct rq *rq_i;
+	struct task_struct *p;
+	int i;
+
+	lockdep_assert_rq_held(rq);
+
+	WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+	if (rq->core->core_forceidle_start == 0)
+		return;
+
+	delta = now - rq->core->core_forceidle_start;
+	if (unlikely((s64)delta <= 0))
+		return;
+
+	rq->core->core_forceidle_start = now;
+
+	if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+		/* can't be forced idle without a running task */
+	} else if (rq->core->core_forceidle_count > 1 ||
+		   rq->core->core_forceidle_occupation > 1) {
+		/*
+		 * For larger SMT configurations, we need to scale the charged
+		 * forced idle amount since there can be more than one forced
+		 * idle sibling and more than one running cookied task.
+		 */
+		delta *= rq->core->core_forceidle_count;
+		delta = div_u64(delta, rq->core->core_forceidle_occupation);
+	}
+
+	for_each_cpu(i, smt_mask) {
+		rq_i = cpu_rq(i);
+		p = rq_i->core_pick ?: rq_i->curr;
+
+		if (!p->core_cookie)
+			continue;
+
+		__schedstat_add(p->stats.core_forceidle_sum, delta);
+	}
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+	if (!rq->core->core_forceidle_count)
+		return;
+
+	if (rq != rq->core)
+		update_rq_clock(rq->core);
+
+	__sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
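The scaling step in __sched_core_account_forceidle() above is easiest to see with
numbers: with one forced-idle sibling and one running cookied task the delta is
charged as-is, while with several idled siblings and several cookied runners the
charge is spread proportionally. Below is a standalone sketch of that arithmetic
outside the kernel; the function name is invented and plain division stands in for
div_u64().

/*
 * Standalone illustration of the forced-idle charge scaling above.
 * E.g. with 3 forced-idle siblings and 2 running cookied tasks on a
 * 4-thread SMT core, each cookied runner is charged delta * 3 / 2.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_forceidle(uint64_t delta, unsigned int forceidle_count,
				unsigned int occupation)
{
	if (forceidle_count > 1 || occupation > 1) {
		delta *= forceidle_count;
		delta /= occupation;	/* div_u64() in the kernel version */
	}
	return delta;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)scale_forceidle(1000000, 1, 1)); /* 1000000 */
	printf("%llu\n", (unsigned long long)scale_forceidle(1000000, 3, 2)); /* 1500000 */
	return 0;
}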
@@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = {
	[CPUACCT_STAT_SYSTEM] = "system",
 };
-struct cpuacct_usage {
-	u64	usages[CPUACCT_STAT_NSTATS];
-};
 /* track CPU usage of a group of tasks and its child groups */
 struct cpuacct {
	struct cgroup_subsys_state	css;
	/* cpuusage holds pointer to a u64-type object on every CPU */
-	struct cpuacct_usage __percpu	*cpuusage;
+	u64 __percpu	*cpuusage;
	struct kernel_cpustat __percpu	*cpustat;
 };
@@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
	return css_ca(ca->css.parent);
 }
-static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
 static struct cpuacct root_cpuacct = {
	.cpustat	= &kernel_cpustat,
	.cpuusage	= &root_cpuacct_cpuusage,
@@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
	if (!ca)
		goto out;
-	ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+	ca->cpuusage = alloc_percpu(u64);
	if (!ca->cpuusage)
		goto out_free_ca;
@@ -99,14 +95,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
				 enum cpuacct_stat_index index)
 {
-	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
	u64 data;
	/*
	 * We allow index == CPUACCT_STAT_NSTATS here to read
	 * the sum of usages.
	 */
-	BUG_ON(index > CPUACCT_STAT_NSTATS);
+	if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
+		return 0;
 #ifndef CONFIG_64BIT
	/*
@@ -115,14 +113,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
	raw_spin_rq_lock_irq(cpu_rq(cpu));
 #endif
-	if (index == CPUACCT_STAT_NSTATS) {
-		int i = 0;
-		data = 0;
-		for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-			data += cpuusage->usages[i];
-	} else {
-		data = cpuusage->usages[index];
+	switch (index) {
+	case CPUACCT_STAT_USER:
+		data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+		break;
+	case CPUACCT_STAT_SYSTEM:
+		data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+			cpustat[CPUTIME_SOFTIRQ];
+		break;
+	case CPUACCT_STAT_NSTATS:
+		data = *cpuusage;
+		break;
	}
 #ifndef CONFIG_64BIT
@@ -132,10 +133,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
	return data;
 }
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
 {
-	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-	int i;
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+	/* Don't allow to reset global kernel_cpustat */
+	if (ca == &root_cpuacct)
+		return;
 #ifndef CONFIG_64BIT
	/*
@@ -143,9 +148,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
	 */
	raw_spin_rq_lock_irq(cpu_rq(cpu));
 #endif
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		cpuusage->usages[i] = val;
+	*cpuusage = 0;
+	cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+	cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+	cpustat[CPUTIME_SOFTIRQ] = 0;
 #ifndef CONFIG_64BIT
	raw_spin_rq_unlock_irq(cpu_rq(cpu));
@@ -196,7 +202,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
		return -EINVAL;
	for_each_possible_cpu(cpu)
-		cpuacct_cpuusage_write(ca, cpu, 0);
+		cpuacct_cpuusage_write(ca, cpu);
	return 0;
 }
@@ -243,25 +249,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
	seq_puts(m, "\n");
	for_each_possible_cpu(cpu) {
-		struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
		seq_printf(m, "%d", cpu);
-		for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
-#ifndef CONFIG_64BIT
-			/*
-			 * Take rq->lock to make 64-bit read safe on 32-bit
-			 * platforms.
-			 */
-			raw_spin_rq_lock_irq(cpu_rq(cpu));
-#endif
-			seq_printf(m, " %llu", cpuusage->usages[index]);
-#ifndef CONFIG_64BIT
-			raw_spin_rq_unlock_irq(cpu_rq(cpu));
-#endif
-		}
+		for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+			seq_printf(m, " %llu",
+				   cpuacct_cpuusage_read(ca, cpu, index));
		seq_puts(m, "\n");
	}
	return 0;
@@ -270,25 +261,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
 static int cpuacct_stats_show(struct seq_file *sf, void *v)
 {
	struct cpuacct *ca = css_ca(seq_css(sf));
-	s64 val[CPUACCT_STAT_NSTATS];
+	struct task_cputime cputime;
+	u64 val[CPUACCT_STAT_NSTATS];
	int cpu;
	int stat;
-	memset(val, 0, sizeof(val));
+	memset(&cputime, 0, sizeof(cputime));
	for_each_possible_cpu(cpu) {
		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+		cputime.utime += cpustat[CPUTIME_USER];
+		cputime.utime += cpustat[CPUTIME_NICE];
+		cputime.stime += cpustat[CPUTIME_SYSTEM];
+		cputime.stime += cpustat[CPUTIME_IRQ];
+		cputime.stime += cpustat[CPUTIME_SOFTIRQ];
+
+		cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
	}
+	cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
+		       &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);
+
	for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
-		seq_printf(sf, "%s %lld\n",
-			   cpuacct_stat_desc[stat],
-			   (long long)nsec_to_clock_t(val[stat]));
+		seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
			   nsec_to_clock_t(val[stat]));
	}
	return 0;
@@ -339,16 +335,11 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
	struct cpuacct *ca;
-	int index = CPUACCT_STAT_SYSTEM;
-	struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
-
-	if (regs && user_mode(regs))
-		index = CPUACCT_STAT_USER;
	rcu_read_lock();
	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
-		__this_cpu_add(ca->cpuusage->usages[index], cputime);
+		__this_cpu_add(*ca->cpuusage, cputime);
	rcu_read_unlock();
 }
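With the changes above, cpuacct.stat's user/system values are derived from the
cgroup's kernel_cpustat and smoothed through cputime_adjust(), and cpuacct.usage* is
reset together with those counters. A small userspace sketch of reading cpuacct.stat
follows; the cgroup v1 mount point and group name are assumptions for illustration
and may differ on a given system.

/*
 * Sketch: dump user/system from a cgroup v1 cpuacct.stat file.
 * Path and group name are assumed; values are in USER_HZ ticks
 * (see nsec_to_clock_t() in the hunk above).
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/cpuacct/mygroup/cpuacct.stat";
	char key[32];
	unsigned long long val;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fscanf(f, "%31s %llu", key, &val) == 2)
		printf("%s = %llu ticks\n", key, val);
	fclose(f);
	return 0;
}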
......
@@ -168,7 +168,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
	sg_cpu->max = max;
	sg_cpu->bw_dl = cpu_bw_dl(rq);
-	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
+	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
					  FREQUENCY_UTIL, NULL);
 }
......
@@ -148,10 +148,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)
	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
-		cpustat[CPUTIME_NICE] += cputime;
+		task_group_account_field(p, CPUTIME_NICE, cputime);
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
-		cpustat[CPUTIME_USER] += cputime;
+		task_group_account_field(p, CPUTIME_USER, cputime);
		cpustat[CPUTIME_GUEST] += cputime;
	}
 }
......
@@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
		__PN(avg_atom);
		__PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+		PN_SCHEDSTAT(core_forceidle_sum);
+#endif
	}
	__P(nr_switches);
......
@@ -1502,7 +1502,6 @@ struct task_numa_env {
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
-static unsigned long cpu_util(int cpu);
 static inline long adjust_numa_imbalance(int imbalance,
					int dst_running, int dst_weight);
@@ -1569,7 +1568,7 @@ static void update_numa_stats(struct task_numa_env *env,
		ns->load += cpu_load(rq);
		ns->runnable += cpu_runnable(rq);
-		ns->util += cpu_util(cpu);
+		ns->util += cpu_util_cfs(cpu);
		ns->nr_running += rq->cfs.h_nr_running;
		ns->compute_capacity += capacity_of(cpu);
@@ -3240,7 +3239,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
		 * As is, the util number is not freq-invariant (we'd have to
		 * implement arch_scale_freq_capacity() for that).
		 *
-		 * See cpu_util().
+		 * See cpu_util_cfs().
		 */
		cpufreq_update_util(rq, flags);
	}
@@ -4070,7 +4069,8 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
	trace_sched_util_est_se_tp(&p->se);
 }
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
+static inline int task_fits_capacity(struct task_struct *p,
+				     unsigned long capacity)
 {
	return fits_capacity(uclamp_task_util(p), capacity);
 }
@@ -5509,11 +5509,9 @@ static inline void hrtick_update(struct rq *rq)
 #endif
 #ifdef CONFIG_SMP
-static inline unsigned long cpu_util(int cpu);
-
 static inline bool cpu_overutilized(int cpu)
 {
-	return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
+	return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
 }
 static inline void update_overutilized_status(struct rq *rq)
@@ -6345,7 +6343,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
	return best_cpu;
 }
-static inline bool asym_fits_capacity(int task_util, int cpu)
+static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
 {
	if (static_branch_unlikely(&sched_asym_cpucapacity))
		return fits_capacity(task_util, capacity_of(cpu));
@@ -6398,8 +6396,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
	 * pattern is IO completions.
	 */
	if (is_per_cpu_kthread(current) &&
+	    in_task() &&
	    prev == smp_processor_id() &&
-	    this_rq()->nr_running <= 1) {
+	    this_rq()->nr_running <= 1 &&
+	    asym_fits_capacity(task_util, prev)) {
		return prev;
	}
@@ -6456,58 +6456,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
	return target;
 }
-/**
- * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
- * @cpu: the CPU to get the utilization of
- *
- * The unit of the return value must be the one of capacity so we can compare
- * the utilization with the capacity of the CPU that is available for CFS task
- * (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * The estimated utilization of a CPU is defined to be the maximum between its
- * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
- * currently RUNNABLE on that CPU.
- * This allows to properly represent the expected utilization of a CPU which
- * has just got a big task running since a long sleep period. At the same time
- * however it preserves the benefits of the "blocked utilization" in
- * describing the potential for other tasks waking up on the same CPU.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- *
- * Return: the (estimated) utilization for the specified CPU
- */
-static inline unsigned long cpu_util(int cpu)
-{
-	struct cfs_rq *cfs_rq;
-	unsigned int util;
-
-	cfs_rq = &cpu_rq(cpu)->cfs;
-	util = READ_ONCE(cfs_rq->avg.util_avg);
-
-	if (sched_feat(UTIL_EST))
-		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
-
-	return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
 /*
  * cpu_util_without: compute cpu utilization without any contributions from *p
  * @cpu: the CPU which utilization is requested
@@ -6528,7 +6476,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
	/* Task has no contribution or is new */
	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
-		return cpu_util(cpu);
+		return cpu_util_cfs(cpu);
	cfs_rq = &cpu_rq(cpu)->cfs;
	util = READ_ONCE(cfs_rq->avg.util_avg);
@@ -6592,7 +6540,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
	/*
	 * Utilization (estimated) can exceed the CPU capacity, thus let's
	 * clamp to the maximum CPU capacity to ensure consistency with
-	 * the cpu_util call.
+	 * cpu_util.
	 */
	return min_t(unsigned long, util, capacity_orig_of(cpu));
 }
@@ -6624,7 +6572,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
	 * During wake-up, the task isn't enqueued yet and doesn't
	 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
	 * so just add it (if needed) to "simulate" what will be
-	 * cpu_util() after the task has been enqueued.
+	 * cpu_util after the task has been enqueued.
	 */
	if (dst_cpu == cpu)
		util_est += _task_util_est(p);
@@ -6915,6 +6863,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
			break;
		}
+		/*
+		 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
+		 * usually do not have SD_BALANCE_WAKE set. That means wakeup
+		 * will usually go to the fast path.
+		 */
		if (tmp->flags & sd_flag)
			sd = tmp;
		else if (!want_affine)
@@ -8681,7 +8634,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
		struct rq *rq = cpu_rq(i);
		sgs->group_load += cpu_load(rq);
-		sgs->group_util += cpu_util(i);
+		sgs->group_util += cpu_util_cfs(i);
		sgs->group_runnable += cpu_runnable(rq);
		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -9699,7 +9652,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
			break;
		case migrate_util:
-			util = cpu_util(cpu_of(rq));
+			util = cpu_util_cfs(i);
			/*
			 * Don't try to pull utilization from a CPU with one
@@ -11068,7 +11021,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
	 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
	 * if we need to give up the CPU.
	 */
-	if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+	if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
	    __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
		resched_curr(rq);
 }
......
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Pressure stall information for CPU, memory and IO
  *
@@ -34,13 +35,19 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
  *	SOME = nr_delayed_tasks != 0
- *	FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *	FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
  *
  * The percentage of wallclock time spent in those compound stall
  * states gives pressure numbers between 0 and 100 for each resource,
@@ -81,13 +88,13 @@
  *
  *	threads = min(nr_nonidle_tasks, nr_cpus)
  *	   SOME = min(nr_delayed_tasks / threads, 1)
- *	   FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *	   FULL = (threads - min(nr_productive_tasks, threads)) / threads
  *
  * For the 257 number crunchers on 256 CPUs, this yields:
  *
  *	threads = min(257, 256)
  *	   SOME = min(1 / 256, 1) = 0.4%
- *	   FULL = (256 - min(257, 256)) / 256 = 0%
+ *	   FULL = (256 - min(256, 256)) / 256 = 0%
  *
  * For the 1 out of 4 memory-delayed tasks, this yields:
  *
@@ -112,7 +119,7 @@
  * For each runqueue, we track:
  *
  *	tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- *	tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ *	tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
  *	tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
  *
  * and then periodically aggregate:
@@ -233,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
	case PSI_MEM_SOME:
		return unlikely(tasks[NR_MEMSTALL]);
	case PSI_MEM_FULL:
-		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+		return unlikely(tasks[NR_MEMSTALL] &&
+				tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
	case PSI_CPU_SOME:
		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
	case PSI_CPU_FULL:
@@ -710,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
		if (groupc->tasks[t]) {
			groupc->tasks[t]--;
		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], clear, set);
+					groupc->tasks[3], groupc->tasks[4],
+					clear, set);
			psi_bug = 1;
		}
	}
@@ -833,7 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
	/*
	 * When switching between tasks that have an identical
	 * runtime state, the cgroup that contains both tasks
-	 * runtime state, the cgroup that contains both tasks
	 * we reach the first common ancestor. Iterate @next's
	 * ancestors only until we encounter @prev's ONCPU.
	 */
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		int clear = TSK_ONCPU, set = 0;
		/*
-		 * When we're going to sleep, psi_dequeue() lets us handle
-		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
-		 * with TSK_ONCPU and save walking common ancestors twice.
+		 * When we're going to sleep, psi_dequeue() lets us
+		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+		 * TSK_IOWAIT here, where we can combine it with
+		 * TSK_ONCPU and save walking common ancestors twice.
		 */
		if (sleep) {
			clear |= TSK_RUNNING;
+			if (prev->in_memstall)
+				clear |= TSK_MEMSTALL_RUNNING;
			if (prev->in_iowait)
				set |= TSK_IOWAIT;
		}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
	rq = this_rq_lock_irq(&rf);
	current->in_memstall = 1;
-	psi_task_change(current, 0, TSK_MEMSTALL);
+	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
	rq_unlock_irq(rq, &rf);
 }
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
	rq = this_rq_lock_irq(&rf);
	current->in_memstall = 0;
-	psi_task_change(current, TSK_MEMSTALL, 0);
+	psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
	rq_unlock_irq(rq, &rf);
 }
......
@@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
	rt_b->rt_period_timer.function = sched_rt_period_timer;
 }
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
-	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
-		return;
-
	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
@@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+		return;
+
+	do_start_rt_bandwidth(rt_b);
+}
+
 void init_rt_rq(struct rt_rq *rt_rq)
 {
	struct rt_prio_array *array;
@@ -1031,13 +1036,17 @@ static void update_curr_rt(struct rq *rq)
	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+		int exceeded;
		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
-			if (sched_rt_runtime_exceeded(rt_rq))
+			exceeded = sched_rt_runtime_exceeded(rt_rq);
+			if (exceeded)
				resched_curr(rq);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
+			if (exceeded)
+				do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
		}
	}
 }
@@ -2911,8 +2920,12 @@ static int sched_rt_global_validate(void)
 static void sched_rt_do_global(void)
 {
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
	def_rt_bandwidth.rt_runtime = global_rt_runtime();
	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 }
 int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
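The update_curr_rt() change above makes sure that whenever the runtime is exceeded
the period timer gets (re)armed via do_start_rt_bandwidth(), so a throttled RT task
is guaranteed a replenishment. The knobs involved are the usual sysctls; below is a
trivial sketch that just prints them (standard procfs paths; -1 in
sched_rt_runtime_us means RUNTIME_INF, i.e. no throttling).

/* Print the RT bandwidth knobs that sched_rt_do_global() above updates. */
#include <stdio.h>

static long read_long(const char *path)
{
	long v = -2;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &v) != 1)
			v = -2;
		fclose(f);
	}
	return v;
}

int main(void)
{
	printf("sched_rt_period_us  = %ld\n",
	       read_long("/proc/sys/kernel/sched_rt_period_us"));
	printf("sched_rt_runtime_us = %ld\n",
	       read_long("/proc/sys/kernel/sched_rt_runtime_us"));
	return 0;
}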
......
@@ -1111,8 +1111,10 @@ struct rq {
	unsigned int		core_task_seq;
	unsigned int		core_pick_seq;
	unsigned long		core_cookie;
-	unsigned char		core_forceidle;
+	unsigned int		core_forceidle_count;
	unsigned int		core_forceidle_seq;
+	unsigned int		core_forceidle_occupation;
+	u64			core_forceidle_start;
 #endif
 };
@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
 }
 extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
 extern void sched_core_get(void);
 extern void sched_core_put(void);
@@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
 #include "stats.h"
 #include "autogroup.h"
+
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+	if (schedstat_enabled())
+		__sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+	if (sched_core_enabled(rq) && schedstat_enabled())
+		__sched_core_tick(rq);
+}
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+
 #ifdef CONFIG_CGROUP_SCHED
 /*
@@ -2938,16 +2966,52 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
	return READ_ONCE(rq->avg_dl.util_avg);
 }
-static inline unsigned long cpu_util_cfs(struct rq *rq)
+/**
+ * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for.
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Estimated) utilization for the specified CPU.
+ */
+static inline unsigned long cpu_util_cfs(int cpu)
 {
-	unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+	struct cfs_rq *cfs_rq;
+	unsigned long util;
+
+	cfs_rq = &cpu_rq(cpu)->cfs;
+	util = READ_ONCE(cfs_rq->avg.util_avg);
	if (sched_feat(UTIL_EST)) {
		util = max_t(unsigned long, util,
-			     READ_ONCE(rq->cfs.avg.util_est.enqueued));
+			     READ_ONCE(cfs_rq->avg.util_est.enqueued));
	}
-	return util;
+	return min(util, capacity_orig_of(cpu));
 }
 static inline unsigned long cpu_util_rt(struct rq *rq)
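The cpu_util_cfs() kernel-doc above boils down to
util = min(max(util_avg, util_est.enqueued), capacity_orig). A tiny numeric
illustration follows; the model function is invented, the values are arbitrary, and
1024 simply plays the role of the max CPU capacity.

/*
 * Numeric illustration of the clamping described in the kernel-doc above:
 * util = min(max(util_avg, util_est), capacity_orig).
 */
#include <stdio.h>

static unsigned long cpu_util_cfs_model(unsigned long util_avg,
					unsigned long util_est_enqueued,
					unsigned long capacity_orig)
{
	unsigned long util = util_avg;

	if (util_est_enqueued > util)	/* UTIL_EST enabled */
		util = util_est_enqueued;
	return util < capacity_orig ? util : capacity_orig;
}

int main(void)
{
	printf("%lu\n", cpu_util_cfs_model(300, 450, 1024));  /* 450: estimate wins   */
	printf("%lu\n", cpu_util_cfs_model(1240, 0, 1024));   /* 1024: clamped to cap */
	return 0;
}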
......
@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
	if (static_branch_likely(&psi_disabled))
		return;
+	if (p->in_memstall)
+		set |= TSK_MEMSTALL_RUNNING;
+
	if (!wakeup || p->sched_psi_wake_requeue) {
		if (p->in_memstall)
			set |= TSK_MEMSTALL;
@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
		return;
	if (p->in_memstall)
-		clear |= TSK_MEMSTALL;
+		clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
	psi_task_change(p, clear, 0);
 }
......