Commit c006fac5 authored by Paul Turner, committed by Peter Zijlstra

sched: Warn on long periods of pending need_resched

The CPU scheduler sets the need_resched flag to signal that schedule()
should be called on a particular CPU. However, schedule() may not happen
immediately when the current task is executing in kernel mode (in a
non-preemptible state) for an extended period of time.

This patch adds a warning that fires if need_resched has been pending for
longer than the time specified by the resched_latency_warn_ms sysctl. If
it triggers, a cond_resched() is likely missing somewhere. Monitoring is
done via the tick, so accuracy is limited to jiffy scale; this also means
the warning will not trigger while the tick is disabled.

This feature (LATENCY_WARN) is disabled by default.
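
A purely illustrative sketch (not part of this patch) of the kind of
culprit the warning is meant to flag; struct item, expensive_work() and
process_many_items() are hypothetical names:

    #include <linux/sched.h>	/* cond_resched() */

    struct item { unsigned long payload; };	/* hypothetical payload type */
    void expensive_work(struct item *it);	/* hypothetical long-running helper */

    /*
     * A long kernel-mode loop with no preemption point: on a
     * non-preemptible kernel, need_resched can stay pending for the
     * whole walk and trip the LATENCY_WARN check. The typical fix is
     * a cond_resched() call inside the loop body.
     */
    static void process_many_items(struct item *items, unsigned long nr)
    {
    	unsigned long i;

    	for (i = 0; i < nr; i++)
    		expensive_work(&items[i]);
    }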
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210416212936.390566-1-joshdon@google.com
parent 3f5ad914
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,9 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
 #endif
 
 /*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,17 @@ const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
 #undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * LATENCY_WARN is enabled).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ */
+__read_mostly int sysctl_resched_latency_warn_ms = 100;
+__read_mostly int sysctl_resched_latency_warn_once = 1;
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -4527,6 +4537,55 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+static u64 cpu_resched_latency(struct rq *rq)
+{
+	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+	u64 resched_latency, now = rq_clock(rq);
+	static bool warned_once;
+
+	if (sysctl_resched_latency_warn_once && warned_once)
+		return 0;
+
+	if (!need_resched() || !latency_warn_ms)
+		return 0;
+
+	if (system_state == SYSTEM_BOOTING)
+		return 0;
+
+	if (!rq->last_seen_need_resched_ns) {
+		rq->last_seen_need_resched_ns = now;
+		rq->ticks_without_resched = 0;
+		return 0;
+	}
+
+	rq->ticks_without_resched++;
+	resched_latency = now - rq->last_seen_need_resched_ns;
+	if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+		return 0;
+
+	warned_once = true;
+
+	return resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+	long val;
+
+	if ((kstrtol(str, 0, &val))) {
+		pr_warn("Unable to set resched_latency_warn_ms\n");
+		return 1;
+	}
+
+	sysctl_resched_latency_warn_ms = val;
+	return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
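To put numbers on the default: the threshold compared against is
latency_warn_ms * NSEC_PER_MSEC = 100 * 1,000,000 ns = 100,000,000 ns
(100 ms); assuming HZ=250 (4 ms per tick), that corresponds to roughly
25 consecutive ticks with need_resched set before cpu_resched_latency()
returns non-zero. Since the check only runs from the tick, the reported
latency is jiffy-accurate at best. The __setup() handler additionally
accepts resched_latency_warn_ms=<val> on the kernel command line, so the
threshold can be changed before debugfs is available.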
@@ -4538,6 +4597,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
+	u64 resched_latency;
 
 	arch_scale_freq_tick();
 	sched_clock_tick();
@@ -4548,10 +4608,15 @@ void scheduler_tick(void)
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
+	if (sched_feat(LATENCY_WARN))
+		resched_latency = cpu_resched_latency(rq);
 	calc_global_load_tick(rq);
 
 	rq_unlock(rq, &rf);
 
+	if (sched_feat(LATENCY_WARN) && resched_latency)
+		resched_latency_warn(cpu, resched_latency);
+
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
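Note the ordering here: cpu_resched_latency() samples rq_clock() while
the runqueue lock is still held, but the actual resched_latency_warn()
call is made only after rq_unlock(), presumably so the WARN() printk
does not run with the rq lock held.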
@@ -5046,6 +5111,9 @@ static void __sched notrace __schedule(bool preempt)
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+	rq->last_seen_need_resched_ns = 0;
+#endif
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
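Resetting rq->last_seen_need_resched_ns on every pass through
__schedule() restarts the measurement: latency is only accumulated
across the window between need_resched first being observed by the tick
and the next schedule actually happening.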
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -309,6 +309,9 @@ static __init int sched_init_debug(void)
 	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
 	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
 
+	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
+
 #ifdef CONFIG_SMP
 	debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
 	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
@@ -1027,3 +1030,13 @@ void proc_sched_set_task(struct task_struct *p)
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 }
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+	static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+	WARN(__ratelimit(&latency_check_ratelimit),
+	     "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
+	     "without schedule\n",
+	     cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
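With warn_once disabled, the DEFINE_RATELIMIT_STATE(..., 60 * 60 * HZ, 1)
instance caps output at one warning per hour. Going by the format string
above, a triggered warning would look roughly like the following (CPU
number, latency and tick count are illustrative, not real output):

    sched: CPU 3 need_resched set for > 104000000 ns (26 ticks) without schedule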
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -91,5 +91,7 @@ SCHED_FEAT(WA_BIAS, true)
 SCHED_FEAT(UTIL_EST, true)
 SCHED_FEAT(UTIL_EST_FASTUP, true)
 
+SCHED_FEAT(LATENCY_WARN, false)
+
 SCHED_FEAT(ALT_PERIOD, true)
 SCHED_FEAT(BASE_SLICE, true)
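Because LATENCY_WARN defaults to false, it must be enabled explicitly.
Scheduler features are toggled by writing to the sched features file in
debugfs; assuming debugfs is mounted at /sys/kernel/debug and the kernel
has the relocated sched debug directory of this era, that means writing
LATENCY_WARN to /sys/kernel/debug/sched/features (and NO_LATENCY_WARN to
turn it back off).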
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -58,6 +58,7 @@
 #include <linux/prefetch.h>
 #include <linux/profile.h>
 #include <linux/psi.h>
+#include <linux/ratelimit.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
 #include <linux/stop_machine.h>
@@ -971,6 +972,11 @@ struct rq {
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_SCHED_DEBUG
+	u64			last_seen_need_resched_ns;
+	int			ticks_without_resched;
+#endif
+
 #ifdef CONFIG_MEMBARRIER
 	int membarrier_state;
 #endif
@@ -2371,6 +2377,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 
+extern void resched_latency_warn(int cpu, u64 latency);
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2378,6 +2386,8 @@ extern void
 print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
 	unsigned long tpf, unsigned long gsf, unsigned long gpf);
 #endif /* CONFIG_NUMA_BALANCING */
+#else
+static inline void resched_latency_warn(int cpu, u64 latency) {}
 #endif /* CONFIG_SCHED_DEBUG */
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);