Commit 46e0d28b authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main scheduler changes in this cycle were:

   - NUMA balancing improvements (Mel Gorman)

   - Further load tracking improvements (Patrick Bellasi)

   - Various NOHZ balancing cleanups and optimizations (Peter Zijlstra)

   - Improve blocked load handling, in particular we can now reduce and
     eventually stop periodic load updates on 'very idle' CPUs. (Vincent
     Guittot)

   - On isolated CPUs offload the final 1Hz scheduler tick as well, plus
     related cleanups and reorganization. (Frederic Weisbecker)

   - Core scheduler code cleanups (Ingo Molnar)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
  sched/core: Update preempt_notifier_key to modern API
  sched/cpufreq: Rate limits for SCHED_DEADLINE
  sched/fair: Update util_est only on util_avg updates
  sched/cpufreq/schedutil: Use util_est for OPP selection
  sched/fair: Use util_est in LB and WU paths
  sched/fair: Add util_est on top of PELT
  sched/core: Remove TASK_ALL
  sched/completions: Use bool in try_wait_for_completion()
  sched/fair: Update blocked load when newly idle
  sched/fair: Move idle_balance()
  sched/nohz: Merge CONFIG_NO_HZ_COMMON blocks
  sched/fair: Move rebalance_domains()
  sched/nohz: Optimize nohz_idle_balance()
  sched/fair: Reduce the periodic update duration
  sched/nohz: Stop NOHZ stats when decayed
  sched/cpufreq: Provide migration hint
  sched/nohz: Clean up nohz enter/exit
  sched/fair: Update blocked load from NEWIDLE
  sched/fair: Add NOHZ stats balancing
  sched/fair: Restructure nohz_balance_kick()
  ...
parents 86bbbeba b7203428
......@@ -1766,6 +1766,17 @@
nohz
Disable the tick when a single task runs.
A residual 1Hz tick is offloaded to workqueues, which you
need to affine to housekeeping through the global
workqueue's affinity configured via the
/sys/devices/virtual/workqueue/cpumask sysfs file, or
by using the 'domain' flag described below.
NOTE: by default the global workqueue runs on all CPUs,
so to protect individual CPUs the 'cpumask' file has to
be configured manually after bootup.
domain
Isolate from the general SMP balancing and scheduling
algorithms. Note that performing domain isolation this way
......
......@@ -93,7 +93,6 @@ struct task_group;
/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
/* get_task_state(): */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
......@@ -275,6 +274,34 @@ struct load_weight {
u32 inv_weight;
};
/**
* struct util_est - Estimation utilization of FAIR tasks
* @enqueued: instantaneous estimated utilization of a task/cpu
* @ewma: the Exponential Weighted Moving Average (EWMA)
* utilization of a task
*
* Support data structure to track an Exponential Weighted Moving Average
* (EWMA) of a FAIR task's utilization. New samples are added to the moving
* average each time a task completes an activation. Sample's weight is chosen
* so that the EWMA will be relatively insensitive to transient changes to the
* task's workload.
*
* The enqueued attribute has a slightly different meaning for tasks and cpus:
* - task: the task's util_avg at last task dequeue time
* - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
* Thus, the util_est.enqueued of a task represents the contribution on the
* estimated utilization of the CPU where that task is currently enqueued.
*
* Only for tasks we track a moving average of the past instantaneous
* estimated utilization. This allows to absorb sporadic drops in utilization
* of an otherwise almost periodic task.
*/
struct util_est {
unsigned int enqueued;
unsigned int ewma;
#define UTIL_EST_WEIGHT_SHIFT 2
};
/*
* The load_avg/util_avg accumulates an infinite geometric series
* (see __update_load_avg() in kernel/sched/fair.c).
......@@ -336,6 +363,7 @@ struct sched_avg {
unsigned long load_avg;
unsigned long runnable_load_avg;
unsigned long util_avg;
struct util_est util_est;
};
struct sched_statistics {
......
......@@ -8,9 +8,8 @@
* Interface between cpufreq drivers and the scheduler:
*/
#define SCHED_CPUFREQ_RT (1U << 0)
#define SCHED_CPUFREQ_DL (1U << 1)
#define SCHED_CPUFREQ_IOWAIT (1U << 2)
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
#define SCHED_CPUFREQ_MIGRATION (1U << 1)
#ifdef CONFIG_CPU_FREQ
struct update_util_data {
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_DEADLINE_H
#define _LINUX_SCHED_DEADLINE_H
#include <linux/sched.h>
/*
* SCHED_DEADLINE tasks has negative priorities, reflecting
......@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}
#endif /* _LINUX_SCHED_DEADLINE_H */
......@@ -12,6 +12,7 @@ enum hk_flags {
HK_FLAG_SCHED = (1 << 3),
HK_FLAG_TICK = (1 << 4),
HK_FLAG_DOMAIN = (1 << 5),
HK_FLAG_WQ = (1 << 6),
};
#ifdef CONFIG_CPU_ISOLATION
......
......@@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { }
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu);
extern void set_cpu_sd_state_idle(void);
extern int get_nohz_timer_target(void);
#else
static inline void nohz_balance_enter_idle(int cpu) { }
static inline void set_cpu_sd_state_idle(void) { }
#endif
#ifdef CONFIG_NO_HZ_COMMON
......@@ -37,8 +35,4 @@ extern void wake_up_nohz_cpu(int cpu);
static inline void wake_up_nohz_cpu(int cpu) { }
#endif
#ifdef CONFIG_NO_HZ_FULL
extern u64 scheduler_tick_max_deferment(void);
#endif
#endif /* _LINUX_SCHED_NOHZ_H */
......@@ -113,7 +113,8 @@ enum tick_dep_bits {
#ifdef CONFIG_NO_HZ_COMMON
extern bool tick_nohz_enabled;
extern int tick_nohz_tick_stopped(void);
extern bool tick_nohz_tick_stopped(void);
extern bool tick_nohz_tick_stopped_cpu(int cpu);
extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void);
......@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */
#define tick_nohz_enabled (0)
static inline int tick_nohz_tick_stopped(void) { return 0; }
static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
static inline void tick_nohz_idle_enter(void) { }
static inline void tick_nohz_idle_exit(void) { }
......
......@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o idle.o
obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
#include <linux/security.h>
#include <linux/export.h>
/*
* Auto-group scheduling implementation:
*/
#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
......@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
autogroup_kref_put(prev);
}
/* Allocates GFP_KERNEL, cannot be called under any spinlock */
/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
void sched_autogroup_create_attach(struct task_struct *p)
{
struct autogroup *ag = autogroup_create();
autogroup_move_group(p, ag);
/* drop extra reference added by autogroup_create() */
/* Drop extra reference added by autogroup_create(): */
autogroup_kref_put(ag);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);
/* Cannot be called under siglock. Currently has no users */
/* Cannot be called under siglock. Currently has no users: */
void sched_autogroup_detach(struct task_struct *p)
{
autogroup_move_group(p, &autogroup_default);
......@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
return 1;
}
__setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS
......@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
if (nice < 0 && !can_nice(current, nice))
return -EPERM;
/* this is a heavy operation taking global locks.. */
/* This is a heavy operation, taking global locks.. */
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
return -EAGAIN;
......@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_SCHED_AUTOGROUP
#include <linux/kref.h>
#include <linux/rwsem.h>
#include <linux/sched/autogroup.h>
struct autogroup {
/*
* reference doesn't mean how many thread attach to this
* autogroup now. It just stands for the number of task
* could use this autogroup.
* Reference doesn't mean how many threads attach to this
* autogroup now. It just stands for the number of tasks
* which could use this autogroup.
*/
struct kref kref;
struct task_group *tg;
......@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
return tg;
}
#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
return 0;
}
#endif
#endif /* CONFIG_SCHED_AUTOGROUP */
/*
* sched_clock for unstable cpu clocks
* sched_clock() for unstable CPU clocks
*
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
*
......@@ -11,7 +11,7 @@
* Guillaume Chazarain <guichaz@gmail.com>
*
*
* What:
* What this file implements:
*
* cpu_clock(i) provides a fast (execution time) high resolution
* clock with bounded drift between CPUs. The value of cpu_clock(i)
......@@ -26,11 +26,11 @@
* at 0 on boot (but people really shouldn't rely on that).
*
* cpu_clock(i) -- can be used from any context, including NMI.
* local_clock() -- is cpu_clock() on the current cpu.
* local_clock() -- is cpu_clock() on the current CPU.
*
* sched_clock_cpu(i)
*
* How:
* How it is implemented:
*
* The implementation either uses sched_clock() when
* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
......@@ -52,19 +52,7 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/nmi.h>
#include <linux/sched/clock.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
#include <linux/init.h>
#include "sched.h"
/*
* Scheduler clock - returns current time in nanosec units.
......@@ -302,21 +290,21 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
* cmpxchg64 below only protects one readout.
*
* We must reread via sched_clock_local() in the retry case on
* 32bit as an NMI could use sched_clock_local() via the
* 32-bit kernels as an NMI could use sched_clock_local() via the
* tracer and hit between the readout of
* the low32bit and the high 32bit portion.
* the low 32-bit and the high 32-bit portion.
*/
this_clock = sched_clock_local(my_scd);
/*
* We must enforce atomic readout on 32bit, otherwise the
* update on the remote cpu can hit inbetween the readout of
* the low32bit and the high 32bit portion.
* We must enforce atomic readout on 32-bit, otherwise the
* update on the remote CPU can hit inbetween the readout of
* the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
/*
* On 64bit the read of [my]scd->clock is atomic versus the
* update, so we can avoid the above 32bit dance.
* On 64-bit kernels the read of [my]scd->clock is atomic versus the
* update, so we can avoid the above 32-bit dance.
*/
sched_clock_local(my_scd);
again:
......
......@@ -11,10 +11,7 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/completion.h>
#include "sched.h"
/**
* complete: - signals a single thread waiting on this completion
......@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
bool try_wait_for_completion(struct completion *x)
{
unsigned long flags;
int ret = 1;
bool ret = true;
/*
* Since x->done will need to be locked only
......@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
* return early in the blocking case.
*/
if (!READ_ONCE(x->done))
return 0;
return false;
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
ret = false;
else if (x->done != UINT_MAX)
x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags);
......
......@@ -5,37 +5,11 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <uapi/linux/sched/types.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/hotplug.h>
#include <linux/wait_bit.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/sched/isolation.h>
#include "sched.h"
#include <asm/switch_to.h>
#include <asm/tlb.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
......@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq
* RELEASE (rq->lock)
*
* If we observe the old cpu in task_rq_lock, the acquire of
* If we observe the old CPU in task_rq_lock, the acquire of
* the old rq->lock will fully serialize against the stores.
*
* If we observe the new CPU in task_rq_lock, the acquire will
......@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
}
#endif /* CONFIG_SMP */
static void init_rq_hrtick(struct rq *rq)
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
rq->hrtick_csd_pending = 0;
......@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_rq_hrtick(struct rq *rq)
static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */
......@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
return false;
if (idle_cpu(cpu) && !need_resched())
......@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void)
* We can't run Idle Load Balance on this CPU for this time so we
* cancel it and clear NOHZ_BALANCE_KICK
*/
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
return false;
}
......@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
*
* - cpu_active must be a subset of cpu_online
*
* - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
* - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
* see __set_cpus_allowed_ptr(). At this point the newly online
* CPU isn't yet part of the sched domains, and balancing will not
* see it.
......@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
void preempt_notifier_inc(void)
{
static_key_slow_inc(&preempt_notifier_key);
static_branch_inc(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_inc);
void preempt_notifier_dec(void)
{
static_key_slow_dec(&preempt_notifier_key);
static_branch_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_dec);
......@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec);
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
if (!static_key_false(&preempt_notifier_key))
if (!static_branch_unlikely(&preempt_notifier_key))
WARN(1, "registering preempt_notifier while notifiers disabled\n");
hlist_add_head(&notifier->link, &current->preempt_notifiers);
......@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
if (static_key_false(&preempt_notifier_key))
if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_in_preempt_notifiers(curr);
}
......@@ -2555,7 +2529,7 @@ static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
if (static_key_false(&preempt_notifier_key))
if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_out_preempt_notifiers(curr, next);
}
......@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
raw_spin_unlock_irq(&rq->lock);
}
/*
* NOP if the arch has not defined these:
*/
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
......@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
* 64-bit doesn't need locks to atomically read a 64bit value.
* 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have a optimization chance when the task's delta_exec is 0.
* Reading ->on_cpu is racy, but this is ok.
*
......@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
rq_last_tick_reset(rq);
}
#ifdef CONFIG_NO_HZ_FULL
/**
* scheduler_tick_max_deferment
*
* Keep at least one tick per second when a single
* active task is running because the scheduler doesn't
* yet completely support full dynticks environment.
*
* This makes sure that uptime, CFS vruntime, load
* balancing, etc... continue to move forward, even
* with a very low granularity.
*
* Return: Maximum deferment in nanoseconds.
*/
u64 scheduler_tick_max_deferment(void)
struct tick_work {
int cpu;
struct delayed_work work;
};
static struct tick_work __percpu *tick_work_cpu;
static void sched_tick_remote(struct work_struct *work)
{
struct rq *rq = this_rq();
unsigned long next, now = READ_ONCE(jiffies);
struct delayed_work *dwork = to_delayed_work(work);
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
next = rq->last_sched_tick + HZ;
/*
* Handle the tick only if it appears the remote CPU is running in full
* dynticks mode. The check is racy by nature, but missing a tick or
* having one too much is no big deal because the scheduler tick updates
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
struct task_struct *curr;
u64 delta;
if (time_before_eq(next, now))
return 0;
rq_lock_irq(rq, &rf);
update_rq_clock(rq);
curr = rq->curr;
delta = rq_clock_task(rq) - curr->se.exec_start;
return jiffies_to_nsecs(next - now);
/*
* Make sure the next tick runs within a reasonable
* amount of time.
*/
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
curr->sched_class->task_tick(rq, curr, 0);
rq_unlock_irq(rq, &rf);
}
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
* to keep scheduler internal stats reasonably up to date.
*/
queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
twork->cpu = cpu;
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
queue_delayed_work(system_unbound_wq, &twork->work, HZ);
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
cancel_delayed_work_sync(&twork->work);
}
#endif /* CONFIG_HOTPLUG_CPU */
int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
return 0;
}
#else /* !CONFIG_NO_HZ_FULL */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
......@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu)
{
set_cpu_rq_start_time(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
}
......@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu)
/* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending();
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
......@@ -5809,7 +5861,7 @@ int sched_cpu_dying(unsigned int cpu)
calc_load_migrate(rq);
update_max_interval();
nohz_balance_exit_idle(cpu);
nohz_balance_exit_idle(rq);
hrtick_clear(rq);
return 0;
}
......@@ -6022,13 +6074,11 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_load_update_tick = jiffies;
rq->nohz_flags = 0;
#endif
#ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = 0;
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
#endif
#endif /* CONFIG_SMP */
init_rq_hrtick(rq);
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
}
......@@ -7027,3 +7077,5 @@ const u32 sched_prio_to_wmult[40] = {
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
#undef CREATE_TRACE_POINTS
// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <linux/kernel_stat.h>
#include <linux/err.h>
#include "sched.h"
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
#include "sched.h"
/* Time spent by the tasks of the cpu accounting group executing in ... */
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
CPUACCT_STAT_USER, /* ... user mode */
CPUACCT_STAT_SYSTEM, /* ... kernel mode */
......@@ -35,12 +24,12 @@ struct cpuacct_usage {
u64 usages[CPUACCT_STAT_NSTATS];
};
/* track cpu usage of a group of tasks and its child groups */
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
struct cpuacct_usage __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every CPU */
struct cpuacct_usage __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
......@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
return css ? container_of(css, struct cpuacct, css) : NULL;
}
/* return cpu accounting group to which this task belongs */
/* Return CPU accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
return css_ca(task_css(tsk, cpuacct_cgrp_id));
......@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
.cpuusage = &root_cpuacct_cpuusage,
};
/* create a new cpu accounting group */
/* Create a new CPU accounting group */
static struct cgroup_subsys_state *
cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
{
......@@ -96,7 +85,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
}
/* destroy an existing cpu accounting group */
/* Destroy an existing CPU accounting group */
static void cpuacct_css_free(struct cgroup_subsys_state *css)
{
struct cpuacct *ca = css_ca(css);
......@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
#endif
}
/* return total cpu usage (in nanoseconds) of a group */
/* Return total CPU usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css,
enum cpuacct_stat_index index)
{
......
......@@ -10,11 +10,7 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include "cpudeadline.h"
#include "sched.h"
static inline int parent(int i)
{
......@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
return;
/* adapted from lib/prio_heap.c */
while(1) {
while (1) {
u64 largest_dl;
l = left_child(idx);
r = right_child(idx);
largest = idx;
......@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
......@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
}
/*
* cpudl_clear - remove a cpu from the cpudl max-heap
* cpudl_clear - remove a CPU from the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target cpu
* @cpu: the target CPU
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
......@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
/*
* cpudl_set - update the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target cpu
* @dl: the new earliest deadline for this cpu
* @cpu: the target CPU
* @dl: the new earliest deadline for this CPU
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
......@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
old_idx = cp->elements[cpu].idx;
if (old_idx == IDX_INVALID) {
int new_idx = cp->size++;
cp->elements[new_idx].dl = dl;
cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = new_idx;
......@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
/*
* cpudl_set_freecpu - Set the cpudl.free_cpus
* @cp: the cpudl max-heap context
* @cpu: rd attached cpu
* @cpu: rd attached CPU
*/
void cpudl_set_freecpu(struct cpudl *cp, int cpu)
{
......@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
/*
* cpudl_clear_freecpu - Clear the cpudl.free_cpus
* @cp: the cpudl max-heap context
* @cpu: rd attached cpu
* @cpu: rd attached CPU
*/
void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
{
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUDL_H
#define _LINUX_CPUDL_H
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#define IDX_INVALID -1
#define IDX_INVALID -1
struct cpudl_item {
u64 dl;
int cpu;
int idx;
u64 dl;
int cpu;
int idx;
};
struct cpudl {
raw_spinlock_t lock;
int size;
cpumask_var_t free_cpus;
struct cpudl_item *elements;
raw_spinlock_t lock;
int size;
cpumask_var_t free_cpus;
struct cpudl_item *elements;
};
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
int cpudl_init(struct cpudl *cp);
int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */
......@@ -8,7 +8,6 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
......
......@@ -11,61 +11,56 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpufreq.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>
#include "sched.h"
#include <trace/events/power.h>
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
};
struct sugov_policy {
struct cpufreq_policy *policy;
struct sugov_tunables *tunables;
struct list_head tunables_hook;
raw_spinlock_t update_lock; /* For shared policies */
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
struct kthread_work work;
struct mutex work_lock;
struct kthread_worker worker;
struct task_struct *thread;
bool work_in_progress;
bool need_freq_update;
struct cpufreq_policy *policy;
struct sugov_tunables *tunables;
struct list_head tunables_hook;
raw_spinlock_t update_lock; /* For shared policies */
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used: */
struct irq_work irq_work;
struct kthread_work work;
struct mutex work_lock;
struct kthread_worker worker;
struct task_struct *thread;
bool work_in_progress;
bool need_freq_update;
};
struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
unsigned int cpu;
struct update_util_data update_util;
struct sugov_policy *sg_policy;
unsigned int cpu;
bool iowait_boost_pending;
unsigned int iowait_boost;
unsigned int iowait_boost_max;
bool iowait_boost_pending;
unsigned int iowait_boost;
unsigned int iowait_boost_max;
u64 last_update;
/* The fields below are only needed when sharing a policy. */
unsigned long util_cfs;
unsigned long util_dl;
unsigned long max;
unsigned int flags;
/* The fields below are only needed when sharing a policy: */
unsigned long util_cfs;
unsigned long util_dl;
unsigned long max;
/* The field below is for single-CPU policies only. */
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
unsigned long saved_idle_calls;
unsigned long saved_idle_calls;
#endif
};
......@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
/*
* Since cpufreq_update_util() is called with rq->lock held for
* the @target_cpu, our per-cpu data is fully serialized.
* the @target_cpu, our per-CPU data is fully serialized.
*
* However, drivers cannot in general deal with cross-cpu
* However, drivers cannot in general deal with cross-CPU
* requests, so while get_next_freq() will work, our
* sugov_update_commit() call may not for the fast switching platforms.
*
......@@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
}
delta_ns = time - sg_policy->last_freq_update_time;
return delta_ns >= sg_policy->freq_update_delay_ns;
}
......@@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util;
if (rq->rt.rt_nr_running) {
util = sg_cpu->max;
} else {
util = sg_cpu->util_dl;
if (rq->cfs.h_nr_running)
util += sg_cpu->util_cfs;
}
/*
* Ideally we would like to set util_dl as min/guaranteed freq and
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
* ready for such an interface. So, we only do the latter for now.
*/
return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
return min(util, sg_cpu->max);
}
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
{
if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
if (flags & SCHED_CPUFREQ_IOWAIT) {
if (sg_cpu->iowait_boost_pending)
return;
......@@ -260,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */
/*
* Make sugov_should_update_freq() ignore the rate limit when DL
* has increased the utilization.
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
sg_policy->need_freq_update = true;
}
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max;
unsigned int next_f;
bool busy;
sugov_set_iowait_boost(sg_cpu, time);
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (!sugov_should_update_freq(sg_policy, time))
return;
busy = sugov_cpu_is_busy(sg_cpu);
if (flags & SCHED_CPUFREQ_RT) {
next_f = policy->cpuinfo.max_freq;
} else {
sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_aggregate_util(sg_cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
if (busy && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_aggregate_util(sg_cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
if (busy && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
/* Reset cached freq as next_freq has changed */
sg_policy->cached_raw_freq = 0;
}
/* Reset cached freq as next_freq has changed */
sg_policy->cached_raw_freq = 0;
}
sugov_update_commit(sg_policy, time, next_f);
}
......@@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
unsigned long j_util, j_max;
s64 delta_ns;
sugov_get_util(j_sg_cpu);
/*
* If the CFS CPU utilization was last updated before the
* previous frequency update and the time elapsed between the
......@@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
j_sg_cpu->iowait_boost_pending = false;
j_sg_cpu->util_cfs = 0;
if (j_sg_cpu->util_dl == 0)
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
return policy->cpuinfo.max_freq;
j_max = j_sg_cpu->max;
j_util = sugov_aggregate_util(j_sg_cpu);
sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
}
sugov_iowait_boost(j_sg_cpu, &util, &max);
}
return get_next_freq(sg_policy, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
unsigned int flags)
static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
......@@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
raw_spin_lock(&sg_policy->update_lock);
sugov_get_util(sg_cpu);
sg_cpu->flags = flags;
sugov_set_iowait_boost(sg_cpu, time);
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
if (flags & SCHED_CPUFREQ_RT)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
next_f = sugov_next_freq_shared(sg_cpu, time);
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
sugov_update_commit(sg_policy, time, next_f);
}
......@@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
return sprintf(buf, "%u\n", tunables->rate_limit_us);
}
static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
size_t count)
static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy;
......@@ -479,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
struct task_struct *thread;
struct sched_attr attr = {
.size = sizeof(struct sched_attr),
.sched_policy = SCHED_DEADLINE,
.sched_flags = SCHED_FLAG_SUGOV,
.sched_nice = 0,
.sched_priority = 0,
.size = sizeof(struct sched_attr),
.sched_policy = SCHED_DEADLINE,
.sched_flags = SCHED_FLAG_SUGOV,
.sched_nice = 0,
.sched_priority = 0,
/*
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
......@@ -663,21 +669,20 @@ static int sugov_start(struct cpufreq_policy *policy)
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = 0;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
}
for_each_cpu(cpu, policy->cpus) {
......@@ -721,14 +726,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
}
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
.dynamic_switching = true,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
.stop = sugov_stop,
.limits = sugov_limits,
.name = "schedutil",
.owner = THIS_MODULE,
.dynamic_switching = true,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
.stop = sugov_stop,
.limits = sugov_limits,
};
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
......
......@@ -14,7 +14,7 @@
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
* a 2 dimensional bitmap (the first for priority class, the second for cpus
* a 2 dimensional bitmap (the first for priority class, the second for CPUs
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
......@@ -26,12 +26,7 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/slab.h>
#include "cpupri.h"
#include "sched.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
static int convert_prio(int prio)
......@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
}
/**
* cpupri_set - update the cpu priority setting
* cpupri_set - update the CPU priority setting
* @cp: The cpupri context
* @cpu: The target cpu
* @cpu: The target CPU
* @newpri: The priority (INVALID-RT99) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
......@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
return;
/*
* If the cpu was currently mapped to a different value, we
* If the CPU was currently mapped to a different value, we
* need to map it to the new value then remove the old value.
* Note, we must add the new value first, otherwise we risk the
* cpu being missed by the priority loop in cpupri_find.
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUPRI_H
#define _LINUX_CPUPRI_H
#include <linux/sched.h>
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
#define CPUPRI_INVALID -1
#define CPUPRI_IDLE 0
#define CPUPRI_NORMAL 1
#define CPUPRI_INVALID -1
#define CPUPRI_IDLE 0
#define CPUPRI_NORMAL 1
/* values 2-101 are RT priorities 0-99 */
struct cpupri_vec {
atomic_t count;
cpumask_var_t mask;
atomic_t count;
cpumask_var_t mask;
};
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
int *cpu_to_pri;
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
int *cpu_to_pri;
};
#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp,
struct task_struct *p, struct cpumask *lowest_mask);
int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
#endif
#endif /* _LINUX_CPUPRI_H */
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include <linux/sched/cputime.h>
/*
* Simple CPU accounting cgroup controller
*/
#include "sched.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
......@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
}
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in user space since the last update
* Account user CPU time to a process.
* @p: the process that the CPU time gets accounted to
* @cputime: the CPU time spent in user space since the last update
*/
void account_user_time(struct task_struct *p, u64 cputime)
{
......@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
}
/*
* Account guest cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in virtual machine since the last update
* Account guest CPU time to a process.
* @p: the process that the CPU time gets accounted to
* @cputime: the CPU time spent in virtual machine since the last update
*/
void account_guest_time(struct task_struct *p, u64 cputime)
{
......@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
}
/*
* Account system cpu time to a process and desired cpustat field
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in kernel space since the last update
* Account system CPU time to a process and desired cpustat field
* @p: the process that the CPU time gets accounted to
* @cputime: the CPU time spent in kernel space since the last update
* @index: pointer to cpustat field that has to be updated
*/
void account_system_index_time(struct task_struct *p,
......@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
}
/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
* Account system CPU time to a process.
* @p: the process that the CPU time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
* @cputime: the cpu time spent in kernel space since the last update
* @cputime: the CPU time spent in kernel space since the last update
*/
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
......@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
/*
* Account for involuntary wait time.
* @cputime: the cpu time spent in involuntary wait
* @cputime: the CPU time spent in involuntary wait
*/
void account_steal_time(u64 cputime)
{
......@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
/*
* Account for idle time.
* @cputime: the cpu time spent in idle wait
* @cputime: the CPU time spent in idle wait
*/
void account_idle_time(u64 cputime)
{
......@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* Account a tick to a process and cpustat
* @p: the process that the cpu time gets accounted to
* @p: the process that the CPU time gets accounted to
* @user_tick: is the tick from userspace
* @rq: the pointer to rq
*
......@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int nr_ticks) {}
struct rq *rq, int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
......@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
#endif
# endif
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
......@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
*ut = cputime.utime;
*st = cputime.stime;
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
* Account a single tick of CPU time.
* @p: the process that the CPU time gets accounted to
* @user_tick: indicates if the tick is a user or a system tick
*/
void account_process_tick(struct task_struct *p, int user_tick)
......
......@@ -17,9 +17,6 @@
*/
#include "sched.h"
#include <linux/slab.h>
#include <uapi/linux/sched/types.h>
struct dl_bandwidth def_dl_bandwidth;
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
......@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
static inline
......@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
static inline
......@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
static void push_dl_tasks(struct rq *);
static void pull_dl_task(struct rq *);
static inline void queue_push_tasks(struct rq *rq)
static inline void deadline_queue_push_tasks(struct rq *rq)
{
if (!has_pushable_dl_tasks(rq))
return;
......@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
}
static inline void queue_pull_task(struct rq *rq)
static inline void deadline_queue_pull_task(struct rq *rq)
{
queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
}
......@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
/*
* If we cannot preempt any rq, fall back to pick any
* online cpu.
* online CPU:
*/
cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
if (cpu >= nr_cpu_ids) {
/*
* Fail to find any suitable cpu.
* Failed to find any suitable CPU.
* The task will never come back!
*/
BUG_ON(dl_bandwidth_enabled());
......@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
{
}
static inline void queue_push_tasks(struct rq *rq)
static inline void deadline_queue_push_tasks(struct rq *rq)
{
}
static inline void queue_pull_task(struct rq *rq)
static inline void deadline_queue_pull_task(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
int flags);
static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
/*
* We are being explicitly informed that a new instance is starting,
......@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
queue_push_tasks(rq);
deadline_queue_push_tasks(rq);
return p;
}
......@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
enqueue_pushable_dl_task(rq, p);
}
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
......@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
/*
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
* first, then we can look for a suitable CPU.
*/
if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1;
......@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
* Now we check how well this matches with task's
* affinity and system topology.
*
* The last cpu where the task run is our first
* The last CPU where the task run is our first
* guess, since it is most likely cache-hot there.
*/
if (cpumask_test_cpu(cpu, later_mask))
......@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
best_cpu = cpumask_first_and(later_mask,
sched_domain_span(sd));
/*
* Last chance: if a cpu being in both later_mask
* Last chance: if a CPU being in both later_mask
* and current sd span is valid, that becomes our
* choice. Of course, the latest possible cpu is
* choice. Of course, the latest possible CPU is
* already under consideration through later_mask.
*/
if (best_cpu < nr_cpu_ids) {
......@@ -2067,7 +2071,7 @@ static int push_dl_task(struct rq *rq)
if (task == next_task) {
/*
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
* again, some other CPU will pull it when ready.
*/
goto out;
}
......@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
* from an overloaded CPU, if any.
*/
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return;
queue_pull_task(rq);
deadline_queue_pull_task(rq);
}
/*
......@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
deadline_queue_push_tasks(rq);
#endif
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
......@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
* or lowering its prio, so...
*/
if (!rq->dl.overloaded)
queue_pull_task(rq);
deadline_queue_pull_task(rq);
/*
* If we now have a earlier deadline task than p,
......@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p)
{
struct sched_dl_entity *dl_se = &p->dl;
dl_se->dl_runtime = 0;
dl_se->dl_deadline = 0;
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
dl_se->dl_runtime = 0;
dl_se->dl_deadline = 0;
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
......@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
#ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{
unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
cs_cpus_allowed);
unsigned int dest_cpu;
struct dl_bw *dl_b;
bool overflow;
int cpus, ret;
unsigned long flags;
dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(dest_cpu);
overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
if (overflow)
if (overflow) {
ret = -EBUSY;
else {
} else {
/*
* We reserve space for this task in the destination
* root_domain, as we can't fail after this point.
......@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
return ret;
}
......@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
ret = 0;
raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
rcu_read_unlock_sched();
return ret;
}
......@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu)
overflow = __dl_overflow(dl_b, cpus, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
return overflow;
}
#endif
......
/*
* kernel/sched/debug.c
*
* Print the CFS rbtree
* Print the CFS rbtree and other debugging details
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*
......@@ -9,16 +9,6 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/proc_fs.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
#include <linux/mempolicy.h>
#include <linux/debugfs.h>
#include "sched.h"
static DEFINE_SPINLOCK(sched_debug_lock);
......@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
if (table == NULL)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost",
&sd->max_newidle_lb_cost,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
return table;
......@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
return table;
}
static cpumask_var_t sd_sysctl_cpus;
static struct ctl_table_header *sd_sysctl_header;
static cpumask_var_t sd_sysctl_cpus;
static struct ctl_table_header *sd_sysctl_header;
void register_sched_domain_sysctl(void)
{
......@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
{
struct sched_entity *se = tg->se[cpu];
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define P_SCHEDSTAT(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN_SCHEDSTAT(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
if (!se)
return;
......@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
if (schedstat_enabled()) {
PN_SCHEDSTAT(se->statistics.wait_start);
PN_SCHEDSTAT(se->statistics.sleep_start);
......@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN_SCHEDSTAT(se->statistics.wait_sum);
P_SCHEDSTAT(se->statistics.wait_count);
}
P(se->load.weight);
P(se->runnable_weight);
#ifdef CONFIG_SMP
......@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
return group_path;
cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
return group_path;
}
#endif
......@@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
cfs_rq->avg.util_est.enqueued);
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
......@@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void)
/*
* This itererator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
* In a hotplugged system some cpus, including cpu 0, may be missing so we have
* to use cpumask_* to iterate over the cpus.
* This means 2 is CPU 0.
* In a hotplugged system some CPUs, including CPU 0, may be missing so we have
* to use cpumask_* to iterate over the CPUs.
*/
static void *sched_debug_start(struct seq_file *file, loff_t *offset)
{
......@@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2);
return NULL;
}
......@@ -840,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
}
static const struct seq_operations sched_debug_sops = {
.start = sched_debug_start,
.next = sched_debug_next,
.stop = sched_debug_stop,
.show = sched_debug_show,
.start = sched_debug_start,
.next = sched_debug_next,
.stop = sched_debug_stop,
.show = sched_debug_show,
};
static int sched_debug_release(struct inode *inode, struct file *file)
......@@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);
#define __P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#ifdef CONFIG_NUMA_BALANCING
......@@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
P(se.avg.util_est.enqueued);
#endif
P(policy);
P(prio);
......
......@@ -20,25 +20,10 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
#include <linux/sched/mm.h>
#include <linux/sched/topology.h>
#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
#include <linux/sched/isolation.h>
#include "sched.h"
#include <trace/events/sched.h>
#include "sched.h"
/*
* Targeted preemption latency for CPU-bound tasks:
*
......@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered cpu has higher priority.
* For asym packing, by default the lower numbered CPU has higher priority.
*/
int __weak arch_asym_cpu_priority(int cpu)
{
......@@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
attach_entity_load_avg(cfs_rq, se, 0);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
......@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p)
}
/*
* The averaged statistics, shared & private, memory & cpu,
* The averaged statistics, shared & private, memory & CPU,
* occupy the first half of the array. The second half of the
* array is for current counters, which are averaged into the
* first set by task_numa_placement.
......@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env,
* be incurred if the tasks were swapped.
*/
if (cur) {
/* Skip this swap candidate if cannot move to the source cpu */
/* Skip this swap candidate if cannot move to the source CPU: */
if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
goto unlock;
......@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env,
goto balance;
}
/* Balance doesn't matter much if we're running a task per cpu */
/* Balance doesn't matter much if we're running a task per CPU: */
if (imp > env->best_imp && src_rq->nr_running == 1 &&
dst_rq->nr_running == 1)
goto assign;
......@@ -1676,7 +1661,7 @@ static void task_numa_compare(struct task_numa_env *env,
*/
if (!cur) {
/*
* select_idle_siblings() uses an per-cpu cpumask that
* select_idle_siblings() uses an per-CPU cpumask that
* can be used from IRQ context.
*/
local_irq_disable();
......@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p)
static void numa_migrate_preferred(struct task_struct *p)
{
unsigned long interval = HZ;
unsigned long numa_migrate_retry;
/* This task has no NUMA fault statistics yet */
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
......@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p)
/* Periodically retry migrating the task to the preferred node */
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
p->numa_migrate_retry = jiffies + interval;
numa_migrate_retry = jiffies + interval;
/*
* Check that the new retry threshold is after the current one. If
* the retry is in the future, it implies that wake_affine has
* temporarily asked NUMA balancing to backoff from placement.
*/
if (numa_migrate_retry > p->numa_migrate_retry)
return;
/* Safe to try placing the task on the preferred node */
p->numa_migrate_retry = numa_migrate_retry;
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
......@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
#ifdef CONFIG_SMP
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
......@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
return clamp_t(long, runnable, MIN_SHARES, shares);
}
# endif /* CONFIG_SMP */
#endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
......@@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
if (&rq->cfs == cfs_rq) {
if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
......@@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
cpufreq_update_util(rq, 0);
cpufreq_update_util(rq, flags);
}
}
......@@ -3245,6 +3242,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
sa->util_avg = sa->util_sum / divider;
}
/*
* When a task is dequeued, its estimated utilization should not be update if
* its util_avg has not been updated at least once.
* This flag is used to synchronize util_avg updates with util_est updates.
* We map this information into the LSB bit of the utilization saved at
* dequeue time (i.e. util_est.dequeued).
*/
#define UTIL_AVG_UNCHANGED 0x1
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Avoid store if the flag has been already set */
enqueued = avg->util_est.enqueued;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
/*
* sched_entity:
*
......@@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
cfs_se_util_change(&se->avg);
return 1;
}
......@@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
}
/*
* Called within set_task_rq() right before setting a task's cpu. The
* Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
* including the state of rq->lock, should be made.
*/
......@@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/*
* runnable_sum can't be lower than running_sum
* As running sum is scale with cpu capacity wehreas the runnable sum
* As running sum is scale with CPU capacity wehreas the runnable sum
* is not we rescale running_sum 1st
*/
running_sum = se->avg.util_sum /
......@@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
#endif
if (decayed)
cfs_rq_util_change(cfs_rq);
cfs_rq_util_change(cfs_rq, 0);
return decayed;
}
......@@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
......@@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
cfs_rq_util_change(cfs_rq, flags);
}
/**
......@@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
cfs_rq_util_change(cfs_rq, 0);
}
/*
......@@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
attach_entity_load_avg(cfs_rq, se);
/*
* DO_ATTACH means we're here from enqueue_entity().
* !last_update_time means we've passed through
* migrate_task_rq_fair() indicating we migrated.
*
* IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
} else if (decayed && (flags & UPDATE_TG))
......@@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
}
static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
return max(ue.ewma, ue.enqueued);
}
static inline unsigned long task_util_est(struct task_struct *p)
{
return max(task_util(p), _task_util_est(p));
}
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Update root cfs_rq's estimated utilization */
enqueued = cfs_rq->avg.util_est.enqueued;
enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
}
/*
* Check if a (signed) value is within a specified (unsigned) margin,
* based on the observation that:
*
* abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
*
* NOTE: this only works when value + maring < INT_MAX.
*/
static inline bool within_margin(int value, int margin)
{
return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}
static void
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
if (!sched_feat(UTIL_EST))
return;
/*
* Update root cfs_rq's estimated utilization
*
* If *p is the last task then the root cfs_rq's estimated utilization
* of a CPU is 0 by definition.
*/
ue.enqueued = 0;
if (cfs_rq->nr_running) {
ue.enqueued = cfs_rq->avg.util_est.enqueued;
ue.enqueued -= min_t(unsigned int, ue.enqueued,
(_task_util_est(p) | UTIL_AVG_UNCHANGED));
}
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
*/
if (!task_sleep)
return;
/*
* If the PELT values haven't changed since enqueue time,
* skip the util_est update.
*/
ue = p->se.avg.util_est;
if (ue.enqueued & UTIL_AVG_UNCHANGED)
return;
/*
* Skip update of task's estimated utilization when its EWMA is
* already ~1% close to its last activation value.
*/
ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
last_ewma_diff = ue.enqueued - ue.ewma;
if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
return;
/*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
* of the task size. This is done by storing the current PELT value
* as ue.enqueued and by using this value to update the Exponential
* Weighted Moving Average (EWMA):
*
* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
* = w * ( last_ewma_diff ) + ewma(t-1)
* = w * (last_ewma_diff + ewma(t-1) / w)
*
* Where 'w' is the weight of new samples, which is configured to be
* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
*/
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
WRITE_ONCE(p->se.avg.util_est, ue);
}
#else /* CONFIG_SMP */
static inline int
......@@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
cfs_rq_util_change(cfs_rq);
cfs_rq_util_change(cfs_rq, 0);
}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
......@@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
return 0;
}
static inline void
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
bool task_sleep) {}
#endif /* CONFIG_SMP */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
......@@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se)
add_nr_running(rq, task_delta);
/* determine whether we need to wake up potentially idle cpu */
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
}
......@@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
}
/*
* Both these cpu hotplug callbacks race against unregister_fair_sched_group()
* Both these CPU hotplug callbacks race against unregister_fair_sched_group()
*
* The race is harmless, since modifying bandwidth settings of unhooked group
* bits doesn't do much.
......@@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
*/
cfs_rq->runtime_remaining = 1;
/*
* Offline rq is schedulable till cpu is completely disabled
* Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
......@@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
add_nr_running(rq, 1);
util_est_enqueue(&rq->cfs, p);
hrtick_update(rq);
}
......@@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
......@@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
*
* load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
*
* If a cpu misses updates for n ticks (as it was idle) and update gets
* called on the n+1-th tick when cpu may be busy, then we have:
* If a CPU misses updates for n ticks (as it was idle) and update gets
* called on the n+1-th tick when CPU may be busy, then we have:
*
* load_n = (1 - 1/2^i)^n * load_0
* load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
......@@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
}
return load;
}
static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
#endif /* CONFIG_NO_HZ_COMMON */
/**
......@@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
#ifdef CONFIG_NO_HZ_COMMON
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
* CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
*
* Therefore we need to avoid the delta approach from the regular tick when
......@@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq)
}
/*
* Return a low guess at the load of a migration-source cpu weighted
* Return a low guess at the load of a migration-source CPU weighted
* according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
......@@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type)
}
/*
* Return a high guess at the load of a migration-target cpu weighted
* Return a high guess at the load of a migration-target CPU weighted
* according to the scheduling class and "nice" value.
*/
static unsigned long target_load(int cpu, int type)
......@@ -5724,7 +5887,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
unsigned long task_load;
this_eff_load = target_load(this_cpu, sd->wake_idx);
prev_eff_load = source_load(prev_cpu, sd->wake_idx);
if (sync) {
unsigned long current_load = task_h_load(current);
......@@ -5742,18 +5904,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
prev_eff_load = source_load(prev_cpu, sd->wake_idx);
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
/*
* If sync, adjust the weight of prev_eff_load such that if
* prev_eff == this_eff that select_idle_sibling() will consider
* stacking the wakee on top of the waker if no other CPU is
* idle.
*/
if (sync)
prev_eff_load += 1;
return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}
#ifdef CONFIG_NUMA_BALANCING
static void
update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
{
unsigned long interval;
if (!static_branch_likely(&sched_numa_balancing))
return;
/* If balancing has no preference then continue gathering data */
if (p->numa_preferred_nid == -1)
return;
/*
* If the wakeup is not affecting locality then it is neutral from
* the perspective of NUMA balacing so continue gathering data.
*/
if (cpu_to_node(prev_cpu) == cpu_to_node(target))
return;
/*
* Temporarily prevent NUMA balancing trying to place waker/wakee after
* wakee has been moved by wake_affine. This will potentially allow
* related tasks to converge and update their data placement. The
* 4 * numa_scan_period is to allow the two-pass filter to migrate
* hot data to the wakers node.
*/
interval = max(sysctl_numa_balancing_scan_delay,
p->numa_scan_period << 2);
p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
interval = max(sysctl_numa_balancing_scan_delay,
current->numa_scan_period << 2);
current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
}
#else
static void
update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
{
}
#endif
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
int this_cpu, int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
int target = nr_cpumask_bits;
if (sched_feat(WA_IDLE))
......@@ -5766,12 +5979,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (target == nr_cpumask_bits)
return prev_cpu;
update_wa_numa_placement(p, prev_cpu, target);
schedstat_inc(sd->ttwu_move_affine);
schedstat_inc(p->se.statistics.nr_wakeups_affine);
return target;
}
static inline unsigned long task_util(struct task_struct *p);
static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
......@@ -5826,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
max_spare_cap = 0;
for_each_cpu(i, sched_group_span(group)) {
/* Bias balancing toward cpus of our domain */
/* Bias balancing toward CPUs of our domain */
if (local_group)
load = source_load(i, load_idx);
else
......@@ -5856,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (min_runnable_load > (runnable_load + imbalance)) {
/*
* The runnable load is significantly smaller
* so we can pick this new cpu
* so we can pick this new CPU:
*/
min_runnable_load = runnable_load;
min_avg_load = avg_load;
......@@ -5865,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
(100*min_avg_load > imbalance_scale*avg_load)) {
/*
* The runnable loads are close so take the
* blocked load into account through avg_load.
* blocked load into account through avg_load:
*/
min_avg_load = avg_load;
idlest = group;
......@@ -5903,6 +6116,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (!idlest)
return NULL;
/*
* When comparing groups across NUMA domains, it's possible for the
* local domain to be very lightly loaded relative to the remote
* domains but "imbalance" skews the comparison making remote CPUs
* look much more favourable. When considering cross-domain, add
* imbalance to the runnable load on the remote node and consider
* staying local.
*/
if ((sd->flags & SD_NUMA) &&
min_runnable_load + imbalance >= this_runnable_load)
return NULL;
if (min_runnable_load > (this_runnable_load + imbalance))
return NULL;
......@@ -5914,7 +6139,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/*
* find_idlest_group_cpu - find the idlest cpu among the cpus in group.
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
......@@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
new_cpu = find_idlest_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
/* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
/* Now try balancing at a lower domain level of 'new_cpu': */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
......@@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
return new_cpu;
......@@ -6203,12 +6427,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
/*
* If the previous cpu is cache affine and idle, don't be stupid.
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
return prev;
/* Check a recently used CPU as a potential idle candidate */
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
......@@ -6217,7 +6441,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake.
* candidate for the next wake:
*/
p->recent_used_cpu = prev;
return recent_used_cpu;
......@@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
/*
* cpu_util returns the amount of capacity of a CPU that is used by CFS
* tasks. The unit of the return value must be the one of capacity so we can
* compare the utilization with the capacity of the CPU that is available for
* CFS task (ie cpu_capacity).
/**
* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
* @cpu: the CPU to get the utilization of
*
* The unit of the return value must be the one of capacity so we can compare
* the utilization with the capacity of the CPU that is available for CFS task
* (ie cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
......@@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
* The estimated utilization of a CPU is defined to be the maximum between its
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
* currently RUNNABLE on that CPU.
* This allows to properly represent the expected utilization of a CPU which
* has just got a big task running since a long sleep period. At the same time
* however it preserves the benefits of the "blocked utilization" in
* describing the potential for other tasks waking up on the same CPU.
*
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
......@@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* available capacity. We allow utilization to overshoot capacity_curr (but not
* capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*
* Return: the (estimated) utilization for the specified CPU
*/
static unsigned long cpu_util(int cpu)
static inline unsigned long cpu_util(int cpu)
{
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
struct cfs_rq *cfs_rq;
unsigned int util;
return (util >= capacity) ? capacity : util;
}
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
static inline unsigned long task_util(struct task_struct *p)
{
return p->se.avg.util_avg;
if (sched_feat(UTIL_EST))
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
* cpu_util_wake: Compute cpu utilization with any contributions from
* cpu_util_wake: Compute CPU utilization with any contributions from
* the waking task p removed.
*/
static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
{
unsigned long util, capacity;
struct cfs_rq *cfs_rq;
unsigned int util;
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util(cpu);
capacity = capacity_orig_of(cpu);
util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
/* Discount task's blocked util from CPU's util */
util -= min_t(unsigned int, util, task_util(p));
/*
* Covered cases:
*
* a) if *p is the only task sleeping on this CPU, then:
* cpu_util (== task_util) > util_est (== 0)
* and thus we return:
* cpu_util_wake = (cpu_util - task_util) = 0
*
* b) if other tasks are SLEEPING on this CPU, which is now exiting
* IDLE, then:
* cpu_util >= task_util
* cpu_util > util_est (== 0)
* and thus we discount *p's blocked utilization to return:
* cpu_util_wake = (cpu_util - task_util) >= 0
*
* c) if other tasks are RUNNABLE on that CPU and
* util_est > cpu_util
* then we use util_est since it returns a more restrictive
* estimation of the spare capacity on that CPU, by just
* considering the expected utilization of tasks already
* runnable on that CPU.
*
* Cases a) and b) are covered by the above code, while case c) is
* covered by the following code when estimated utilization is
* enabled.
*/
if (sched_feat(UTIL_EST))
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
return (util >= capacity) ? capacity : util;
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
* the cpu_util call.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
......@@ -6328,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
* Balances load by selecting the idlest cpu in the idlest group, or under
* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
* Balances load by selecting the idlest CPU in the idlest group, or under
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target cpu number.
* Returns the target CPU number.
*
* preempt must be disabled.
*/
......@@ -6342,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
......@@ -6356,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
break;
/*
* If both cpu and prev_cpu are part of this domain,
* If both 'cpu' and 'prev_cpu' are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
......@@ -6376,7 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (cpu == prev_cpu)
goto pick_cpu;
new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
}
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
......@@ -6407,9 +6682,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
static void detach_entity_cfs_rq(struct sched_entity *se);
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p)
{
......@@ -6738,7 +7013,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
p = task_of(se);
done: __maybe_unused
done: __maybe_unused;
#ifdef CONFIG_SMP
/*
* Move the next running task to the front of
......@@ -6843,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* BASICS
*
* The purpose of load-balancing is to achieve the same basic fairness the
* per-cpu scheduler provides, namely provide a proportional amount of compute
* per-CPU scheduler provides, namely provide a proportional amount of compute
* time to each task. This is expressed in the following equation:
*
* W_i,n/P_i == W_j,n/P_j for all i,j (1)
*
* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
* Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
* W_i,0 is defined as:
*
* W_i,0 = \Sum_j w_i,j (2)
*
* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
* Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
* is derived from the nice value as per sched_prio_to_weight[].
*
* The weight average is an exponential decay average of the instantaneous
......@@ -6861,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
* C_i is the compute capacity of cpu i, typically it is the
* C_i is the compute capacity of CPU i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
......@@ -6882,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* SCHED DOMAINS
*
* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
* for all i,j solution, we create a tree of cpus that follows the hardware
* for all i,j solution, we create a tree of CPUs that follows the hardware
* topology where each level pairs two lower groups (or better). This results
* in O(log n) layers. Furthermore we reduce the number of cpus going up the
* in O(log n) layers. Furthermore we reduce the number of CPUs going up the
* tree to only the first of the previous level and we decrease the frequency
* of load-balance at each level inv. proportional to the number of cpus in
* of load-balance at each level inv. proportional to the number of CPUs in
* the groups.
*
* This yields:
......@@ -6895,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* \Sum { --- * --- * 2^i } = O(n) (5)
* i = 0 2^i 2^i
* `- size of each group
* | | `- number of cpus doing load-balance
* | | `- number of CPUs doing load-balance
* | `- freq
* `- sum over all levels
*
......@@ -6903,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* this makes (5) the runtime complexity of the balancer.
*
* An important property here is that each CPU is still (indirectly) connected
* to every other cpu in at most O(log n) steps:
* to every other CPU in at most O(log n) steps:
*
* The adjacency matrix of the resulting graph is given by:
*
......@@ -6915,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* A^(log_2 n)_i,j != 0 for all i,j (7)
*
* Showing there's indeed a path between every cpu in at most O(log n) steps.
* Showing there's indeed a path between every CPU in at most O(log n) steps.
* The task movement gives a factor of O(m), giving a convergence complexity
* of:
*
......@@ -6925,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* WORK CONSERVING
*
* In order to avoid CPUs going idle while there's still work to do, new idle
* balancing is more aggressive and has the newly idle cpu iterate up the domain
* balancing is more aggressive and has the newly idle CPU iterate up the domain
* tree itself instead of relying on other CPUs to bring it work.
*
* This adds some complexity to both (5) and (8) but it reduces the total idle
......@@ -6946,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
*
* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
*
* The big problem is S_k, its a global sum needed to compute a local (W_i)
* property.
......@@ -6963,6 +7238,8 @@ enum fbq_type { regular, remote, all };
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_NOHZ_STATS 0x10
#define LBF_NOHZ_AGAIN 0x20
struct lb_env {
struct sched_domain *sd;
......@@ -7110,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env->flags |= LBF_SOME_PINNED;
/*
* Remember if this task can be migrated to any other cpu in
* Remember if this task can be migrated to any other CPU in
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
......@@ -7120,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
return 0;
/* Prevent to re-select dst_cpu via env's cpus */
/* Prevent to re-select dst_cpu via env's CPUs: */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
env->flags |= LBF_DST_PINNED;
......@@ -7347,6 +7624,17 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
return true;
if (cfs_rq->avg.util_avg)
return true;
return false;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
......@@ -7371,6 +7659,7 @@ static void update_blocked_averages(int cpu)
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
bool done = true;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
......@@ -7400,7 +7689,17 @@ static void update_blocked_averages(int cpu)
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
if (done)
rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
}
......@@ -7460,6 +7759,11 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
if (!cfs_rq_has_blocked(cfs_rq))
rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
}
......@@ -7694,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
* Group imbalance indicates (and tries to solve) the problem where balancing
* groups is inadequate due to ->cpus_allowed constraints.
*
* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
* Something like:
*
* { 0 1 2 3 } { 4 5 6 7 }
......@@ -7703,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
* cpu 3 and leave one of the cpus in the second group unused.
* cpu 3 and leave one of the CPUs in the second group unused.
*
* The current solution to this issue is detecting the skew in the first group
* by noticing the lower domain failed to reach balance and had difficulty
......@@ -7794,6 +8098,28 @@ group_type group_classify(struct sched_group *group,
return group_other;
}
static bool update_nohz_stats(struct rq *rq, bool force)
{
#ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
if (!rq->has_blocked_load)
return false;
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
return true;
update_blocked_averages(cpu);
return rq->has_blocked_load;
#else
return false;
#endif
}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
......@@ -7816,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
/* Bias balancing toward cpus of our domain */
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
/* Bias balancing toward CPUs of our domain: */
if (local_group)
load = target_load(i, load_idx);
else
......@@ -7902,7 +8231,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/* No ASYM_PACKING if target cpu is already busy */
/* No ASYM_PACKING if target CPU is already busy */
if (env->idle == CPU_NOT_IDLE)
return true;
/*
......@@ -7915,7 +8244,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (!sds->busiest)
return true;
/* Prefer to move from lowest priority cpu's work */
/* Prefer to move from lowest priority CPU's work */
if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
sg->asym_prefer_cpu))
return true;
......@@ -7971,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
env->flags |= LBF_NOHZ_STATS;
#endif
load_idx = get_sd_load_idx(env->sd, env->idle);
do {
......@@ -8024,6 +8358,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sg = sg->next;
} while (sg != env->sd->groups);
#ifdef CONFIG_NO_HZ_COMMON
if ((env->flags & LBF_NOHZ_AGAIN) &&
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
WRITE_ONCE(nohz.next_blocked,
jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
}
#endif
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
......@@ -8168,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure cpu-load equilibrium, look at wider averages. XXX
* to ensure CPU-load equilibrium, look at wider averages. XXX
*/
busiest->load_per_task =
min(busiest->load_per_task, sds->avg_load);
......@@ -8187,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
/*
* If there aren't any idle cpus, avoid creating some.
* If there aren't any idle CPUs, avoid creating some.
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
......@@ -8201,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
/*
* We're trying to get all the cpus to the average_load, so we don't
* We're trying to get all the CPUs to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load. At the same time,
* reduce the max loaded CPU below the average load. At the same time,
* we also don't want to reduce the group load below the group
* capacity. Thus we look for the minimum possible imbalance.
*/
......@@ -8297,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (env->idle == CPU_IDLE) {
/*
* This cpu is idle. If the busiest group is not overloaded
* This CPU is idle. If the busiest group is not overloaded
* and there is no imbalance between this and busiest group
* wrt idle cpus, it is balanced. The imbalance becomes
* wrt idle CPUs, it is balanced. The imbalance becomes
* significant if the diff is greater than 1 otherwise we
* might end up to just move the imbalance on another group
*/
......@@ -8327,7 +8670,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
}
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
* find_busiest_queue - find the busiest runqueue among the CPUs in the group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
......@@ -8371,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
/*
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
* which is not scaled with the CPU capacity.
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
......@@ -8379,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
/*
* For the load comparisons with the other cpu's, consider
* the weighted_cpuload() scaled with the cpu capacity, so
* that the load can be moved away from the cpu that is
* For the load comparisons with the other CPU's, consider
* the weighted_cpuload() scaled with the CPU capacity, so
* that the load can be moved away from the CPU that is
* potentially running at a lower capacity.
*
* Thus we're looking for max(wl_i / capacity_i), crosswise
......@@ -8452,13 +8795,13 @@ static int should_we_balance(struct lb_env *env)
return 0;
/*
* In the newly idle case, we will allow all the cpu's
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
*/
if (env->idle == CPU_NEWLY_IDLE)
return 1;
/* Try to find first idle cpu */
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
if (!idle_cpu(cpu))
continue;
......@@ -8471,7 +8814,7 @@ static int should_we_balance(struct lb_env *env)
balance_cpu = group_balance_cpu(sg);
/*
* First idle cpu or the first cpu(busiest) in this sched group
* First idle CPU or the first CPU(busiest) in this sched group
* is eligible for doing load balancing at this and above domains.
*/
return balance_cpu == env->dst_cpu;
......@@ -8580,7 +8923,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of cpus in our
* iterate on same src_cpu is dependent on number of CPUs in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
......@@ -8597,7 +8940,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/* Prevent to re-select dst_cpu via env's cpus */
/* Prevent to re-select dst_cpu via env's CPUs */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
......@@ -8659,9 +9002,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest cpu can't be
* moved to this_cpu
/*
* Don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest CPU can't be
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
......@@ -8773,167 +9117,53 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
}
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
* active_load_balance_cpu_stop is run by the CPU stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
*/
static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
static int active_load_balance_cpu_stop(void *data)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
struct rq *busiest_rq = data;
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
struct task_struct *p = NULL;
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
* Between queueing the stop-work and running it is a hole in which
* CPUs can become inactive. We should not move tasks from or to
* inactive CPUs.
*/
this_rq->idle_stamp = rq_clock(this_rq);
if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
goto out_unlock;
/*
* Do not pull tasks towards !active CPUs...
*/
if (!cpu_active(this_cpu))
return 0;
/* Make sure the requested CPU hasn't gone down in the meantime: */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
goto out_unlock;
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
* This condition is "impossible", if it occurs
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-CPU setup.
*/
rq_unpin_lock(this_rq, rf);
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
goto out;
}
raw_spin_unlock(&this_rq->lock);
BUG_ON(busiest_rq == target_rq);
update_blocked_averages(this_cpu);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(this_cpu, sd) {
int continue_balancing = 1;
u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;
}
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
domain_cost = sched_clock_cpu(this_cpu) - t0;
if (domain_cost > sd->max_newidle_lb_cost)
sd->max_newidle_lb_cost = domain_cost;
curr_cost += domain_cost;
}
update_next_balance(sd, &next_balance);
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
if (pulled_task || this_rq->nr_running > 0)
break;
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
out:
/* Move the next balance forward */
if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
if (pulled_task)
this_rq->idle_stamp = 0;
rq_repin_lock(this_rq, rf);
return pulled_task;
}
/*
* active_load_balance_cpu_stop is run by cpu stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
*/
static int active_load_balance_cpu_stop(void *data)
{
struct rq *busiest_rq = data;
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
struct task_struct *p = NULL;
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
/*
* Between queueing the stop-work and running it is a hole in which
* CPUs can become inactive. We should not move tasks from or to
* inactive CPUs.
*/
if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
goto out_unlock;
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
goto out_unlock;
/*
* This condition is "impossible", if it occurs
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-cpu setup.
*/
BUG_ON(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
if (likely(sd)) {
struct lb_env env = {
......@@ -8977,141 +9207,6 @@ static int active_load_balance_cpu_stop(void *data)
return 0;
}
static inline int on_null_domain(struct rq *rq)
{
return unlikely(!rcu_dereference_sched(rq->sd));
}
#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
static inline int find_new_ilb(void)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
return nr_cpu_ids;
}
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
static void nohz_balancer_kick(void)
{
int ilb_cpu;
nohz.next_balance++;
ilb_cpu = find_new_ilb();
if (ilb_cpu >= nr_cpu_ids)
return;
if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
return;
/*
* Use smp_send_reschedule() instead of resched_cpu().
* This way we generate a sched IPI on the target cpu which
* is idle. And the softirq performing nohz idle load balance
* will be run before returning from the IPI.
*/
smp_send_reschedule(ilb_cpu);
return;
}
void nohz_balance_exit_idle(unsigned int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
/*
* Completely isolated CPUs don't ever set, so we must test.
*/
if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
atomic_dec(&nohz.nr_cpus);
}
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
static inline void set_cpu_sd_state_busy(void)
{
struct sched_domain *sd;
int cpu = smp_processor_id();
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
void set_cpu_sd_state_idle(void)
{
struct sched_domain *sd;
int cpu = smp_processor_id();
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
/*
* This routine will record that the cpu is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
void nohz_balance_enter_idle(int cpu)
{
/*
* If this cpu is going down, then nothing needs to be done.
*/
if (!cpu_active(cpu))
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
/*
* If we're a completely isolated CPU, we don't play.
*/
if (on_null_domain(cpu_rq(cpu)))
return;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
#endif
static DEFINE_SPINLOCK(balancing);
/*
......@@ -9141,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
int need_serialize, need_decay = 0;
u64 max_cost = 0;
update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
/*
......@@ -9232,68 +9325,56 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
}
}
static inline int on_null_domain(struct rq *rq)
{
return unlikely(!rcu_dereference_sched(rq->sd));
}
#ifdef CONFIG_NO_HZ_COMMON
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
* idle load balancing details
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
static inline int find_new_ilb(void)
{
int this_cpu = this_rq->cpu;
struct rq *rq;
int balance_cpu;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int ilb = cpumask_first(nohz.idle_cpus_mask);
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
goto end;
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
return nr_cpu_ids;
}
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
if (need_resched())
break;
rq = cpu_rq(balance_cpu);
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
/*
* If time for next balance is due,
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;
nohz.next_balance++;
rq_lock_irq(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
rq_unlock_irq(rq, &rf);
ilb_cpu = find_new_ilb();
rebalance_domains(rq, CPU_IDLE);
}
if (ilb_cpu >= nr_cpu_ids)
return;
if (time_after(next_balance, rq->next_balance)) {
next_balance = rq->next_balance;
update_next_balance = 1;
}
}
flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
if (flags & NOHZ_KICK_MASK)
return;
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
* updated.
* Use smp_send_reschedule() instead of resched_cpu().
* This way we generate a sched IPI on the target CPU which
* is idle. And the softirq performing nohz idle load balance
* will be run before returning from the IPI.
*/
if (likely(update_next_balance))
nohz.next_balance = next_balance;
end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
smp_send_reschedule(ilb_cpu);
}
/*
......@@ -9307,36 +9388,41 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
static inline bool nohz_kick_needed(struct rq *rq)
static void nohz_balancer_kick(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd;
int nr_busy, i, cpu = rq->cpu;
bool kick = false;
unsigned int flags = 0;
if (unlikely(rq->idle_balance))
return false;
return;
/*
* We may be recently in ticked or tickless idle mode. At the first
* busy tick after returning from idle, we will update the busy stats.
*/
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
/*
* We may be recently in ticked or tickless idle mode. At the first
* busy tick after returning from idle, we will update the busy stats.
*/
nohz_balance_exit_idle(rq);
/*
* None are in tickless mode and hence no need for NOHZ idle load
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
return false;
return;
if (READ_ONCE(nohz.has_blocked) &&
time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK;
if (time_before(now, nohz.next_balance))
return false;
goto out;
if (rq->nr_running >= 2)
return true;
if (rq->nr_running >= 2) {
flags = NOHZ_KICK_MASK;
goto out;
}
rcu_read_lock();
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
......@@ -9347,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
kick = true;
flags = NOHZ_KICK_MASK;
goto unlock;
}
......@@ -9357,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
if (sd) {
if ((rq->cfs.h_nr_running >= 1) &&
check_cpu_capacity(rq, sd)) {
kick = true;
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
......@@ -9370,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq)
continue;
if (sched_asym_prefer(i, cpu)) {
kick = true;
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
}
unlock:
rcu_read_unlock();
return kick;
out:
if (flags)
kick_ilb(flags);
}
static void set_cpu_sd_state_busy(int cpu)
{
struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
void nohz_balance_exit_idle(struct rq *rq)
{
SCHED_WARN_ON(rq != this_rq());
if (likely(!rq->nohz_tick_stopped))
return;
rq->nohz_tick_stopped = 0;
cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
atomic_dec(&nohz.nr_cpus);
set_cpu_sd_state_busy(rq->cpu);
}
static void set_cpu_sd_state_idle(int cpu)
{
struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
/*
* This routine will record that the CPU is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
void nohz_balance_enter_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
SCHED_WARN_ON(cpu != smp_processor_id());
/* If this CPU is going down, then nothing needs to be done: */
if (!cpu_active(cpu))
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;
/*
* Can be set safely without rq->lock held
* If a clear happens, it will have evaluated last additions because
* rq->lock is held during the check and the clear
*/
rq->has_blocked_load = 1;
/*
* The tick is still stopped but load could have been added in the
* meantime. We set the nohz.has_blocked flag to trig a check of the
* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
* of nohz.has_blocked can only happen after checking the new load
*/
if (rq->nohz_tick_stopped)
goto out;
/* If we're a completely isolated CPU, we don't play: */
if (on_null_domain(rq))
return;
rq->nohz_tick_stopped = 1;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
* store.
*/
smp_mb__after_atomic();
set_cpu_sd_state_idle(cpu);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
* enable the periodic update of the load of idle cpus
*/
WRITE_ONCE(nohz.has_blocked, 1);
}
/*
* Internal function that runs load balance for all idle cpus. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
* The function returns false if the loop has stopped before running
* through all idle CPUs.
*/
static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
unsigned long next_balance = now + 60*HZ;
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
int ret = false;
struct rq *rq;
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
* set the has_blocked flag and trig another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
*/
WRITE_ONCE(nohz.has_blocked, 0);
/*
* Ensures that if we miss the CPU, we must see the has_blocked
* store from nohz_balance_enter_idle().
*/
smp_mb();
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
/*
* If this CPU gets work to do, stop the load balancing
* work being done for other CPUs. Next load
* balancing owner will pick it up.
*/
if (need_resched()) {
has_blocked_load = true;
goto abort;
}
rq = cpu_rq(balance_cpu);
has_blocked_load |= update_nohz_stats(rq, true);
/*
* If time for next balance is due,
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
next_balance = rq->next_balance;
update_next_balance = 1;
}
}
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
has_blocked_load |= this_rq->has_blocked_load;
}
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
/* The full idle balance loop has been done */
ret = true;
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance))
nohz.next_balance = next_balance;
return ret;
}
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
int this_cpu = this_rq->cpu;
unsigned int flags;
if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
return false;
if (idle != CPU_IDLE) {
atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
return false;
}
/*
* barrier, pairs with nohz_balance_enter_idle(), ensures ...
*/
flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
if (!(flags & NOHZ_KICK_MASK))
return false;
_nohz_idle_balance(this_rq, flags, idle);
return true;
}
static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
/*
* This CPU doesn't want to be disturbed by scheduler
* housekeeping
*/
if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
return;
/* Will wake up very soon. No time for doing anything else*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
/* Don't need to update blocked load of idle CPUs*/
if (!READ_ONCE(nohz.has_blocked) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
raw_spin_unlock(&this_rq->lock);
/*
* This CPU is going to be idle and blocked load of idle CPUs
* need to be updated. Run the ilb locally as it is a good
* candidate for ilb instead of waking up another idle CPU.
* Kick an normal ilb if we failed to do the update.
*/
if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
kick_ilb(NOHZ_STATS_KICK);
raw_spin_lock(&this_rq->lock);
}
#else /* !CONFIG_NO_HZ_COMMON */
static inline void nohz_balancer_kick(struct rq *rq) { }
static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
return false;
}
static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
/*
* Do not pull tasks towards !active CPUs...
*/
if (!cpu_active(this_cpu))
return 0;
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
rq_unpin_lock(this_rq, rf);
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
nohz_newidle_balance(this_rq);
goto out;
}
raw_spin_unlock(&this_rq->lock);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
int continue_balancing = 1;
u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;
}
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
domain_cost = sched_clock_cpu(this_cpu) - t0;
if (domain_cost > sd->max_newidle_lb_cost)
sd->max_newidle_lb_cost = domain_cost;
curr_cost += domain_cost;
}
update_next_balance(sd, &next_balance);
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
if (pulled_task || this_rq->nr_running > 0)
break;
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
out:
/* Move the next balance forward */
if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
if (pulled_task)
this_rq->idle_stamp = 0;
rq_repin_lock(this_rq, rf);
return pulled_task;
}
#else
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
#endif
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
......@@ -9394,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
CPU_IDLE : CPU_NOT_IDLE;
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* If this CPU has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle CPUs whose ticks are
* stopped. Do nohz_idle_balance *before* rebalance_domains to
* give the idle cpus a chance to load balance. Else we may
* give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
*/
nohz_idle_balance(this_rq, idle);
if (nohz_idle_balance(this_rq, idle))
return;
/* normal load balance */
update_blocked_averages(this_rq->cpu);
rebalance_domains(this_rq, idle);
}
......@@ -9416,10 +9909,8 @@ void trigger_load_balance(struct rq *rq)
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
if (nohz_kick_needed(rq))
nohz_balancer_kick();
#endif
nohz_balancer_kick(rq);
}
static void rq_online_fair(struct rq *rq)
......@@ -9440,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */
/*
* scheduler tick hitting a task of our scheduling class:
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
......@@ -9591,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
attach_entity_load_avg(cfs_rq, se, 0);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
}
......@@ -9993,6 +10489,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
......
......@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
SCHED_FEAT(WA_IDLE, true)
SCHED_FEAT(WA_WEIGHT, true)
SCHED_FEAT(WA_BIAS, true)
/*
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
/*
* Generic entry point for the idle threads
* Generic entry points for the idle threads and
* implementation of the idle task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
#include <linux/cpuhotplug.h>
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
#include <linux/suspend.h>
#include <linux/livepatch.h>
#include <asm/tlb.h>
#include "sched.h"
#include <trace/events/power.h>
#include "sched.h"
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
......@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
static int __init cpu_idle_poll_setup(char *__unused)
{
cpu_idle_force_poll = 1;
return 1;
}
__setup("nohlt", cpu_idle_poll_setup);
......@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused)
{
cpu_idle_force_poll = 0;
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
......@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
stop_critical_timings();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
return 1;
}
......@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state)
{
/*
* This #ifdef needs to die, but it's too late in the cycle to
* make this generic (arm and sh have never invoked the canary
* init for the non boot cpus!). Will be fixed in 3.11
* make this generic (ARM and SH have never invoked the canary
* init for the non boot CPUs!). Will be fixed in 3.11
*/
#ifdef CONFIG_X86
/*
......@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
while (1)
do_idle();
}
/*
* idle-task scheduling class.
*/
#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_lock_irq(&rq->lock);
}
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
static void set_curr_task_idle(struct rq *rq)
{
}
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
.check_preempt_curr = check_preempt_curr_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};
// SPDX-License-Identifier: GPL-2.0
#include "sched.h"
/*
* idle-task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE tasks which are
* handled in sched/fair.c)
*/
#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_lock_irq(&rq->lock);
}
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
rq_last_tick_reset(rq);
}
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
static void set_curr_task_idle(struct rq *rq)
{
}
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
.check_preempt_curr = check_preempt_curr_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};
......@@ -3,15 +3,10 @@
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
*
* Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
#include <linux/sched/isolation.h>
#include <linux/tick.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/static_key.h>
#include <linux/ctype.h>
#include "sched.h"
DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
EXPORT_SYMBOL_GPL(housekeeping_overriden);
......@@ -60,6 +55,9 @@ void __init housekeeping_init(void)
static_branch_enable(&housekeeping_overriden);
if (housekeeping_flags & HK_FLAG_TICK)
sched_tick_offload_init();
/* We need at least one CPU to handle housekeeping work */
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
......@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
return housekeeping_setup(str, flags);
}
......
......@@ -6,10 +6,6 @@
* figure. Its a silly number but people think its important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
#include <linux/export.h>
#include <linux/sched/loadavg.h>
#include "sched.h"
/*
......@@ -32,29 +28,29 @@
* Due to a number of reasons the above turns in the mess below:
*
* - for_each_possible_cpu() is prohibitively expensive on machines with
* serious number of cpus, therefore we need to take a distributed approach
* serious number of CPUs, therefore we need to take a distributed approach
* to calculating nr_active.
*
* \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
* = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
*
* So assuming nr_active := 0 when we start out -- true per definition, we
* can simply take per-cpu deltas and fold those into a global accumulate
* can simply take per-CPU deltas and fold those into a global accumulate
* to obtain the same result. See calc_load_fold_active().
*
* Furthermore, in order to avoid synchronizing all per-cpu delta folding
* Furthermore, in order to avoid synchronizing all per-CPU delta folding
* across the machine, we assume 10 ticks is sufficient time for every
* cpu to have completed this task.
* CPU to have completed this task.
*
* This places an upper-bound on the IRQ-off latency of the machine. Then
* again, being late doesn't loose the delta, just wrecks the sample.
*
* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
* this would add another cross-cpu cacheline miss and atomic operation
* to the wakeup path. Instead we increment on whatever cpu the task ran
* when it went into uninterruptible state and decrement on whatever cpu
* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
* this would add another cross-CPU cacheline miss and atomic operation
* to the wakeup path. Instead we increment on whatever CPU the task ran
* when it went into uninterruptible state and decrement on whatever CPU
* did the wakeup. This means that only the sum of nr_uninterruptible over
* all cpus yields the correct result.
* all CPUs yields the correct result.
*
* This covers the NO_HZ=n code, for extra head-aches, see the comment below.
*/
......@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* Handle NO_HZ for the global load-average.
*
* Since the above described distributed algorithm to compute the global
* load-average relies on per-cpu sampling from the tick, it is affected by
* load-average relies on per-CPU sampling from the tick, it is affected by
* NO_HZ.
*
* The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
* entering NO_HZ state such that we can include this as an 'extra' cpu delta
* entering NO_HZ state such that we can include this as an 'extra' CPU delta
* when we read the global state.
*
* Obviously reality has to ruin such a delightfully simple scheme:
......@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* busy state.
*
* This is solved by pushing the window forward, and thus skipping the
* sample, for this cpu (effectively using the NO_HZ-delta for this cpu which
* sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
* was in effect at the time the window opened). This also solves the issue
* of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ
* of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
* intervals.
*
* When making the ILB scale, we should try to pull this in as well.
......@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
}
/*
* NO_HZ can leave us missing all per-cpu ticks calling
* NO_HZ can leave us missing all per-CPU ticks calling
* calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
* calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
* in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
......@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
return;
/*
* Fold the 'old' NO_HZ-delta to include all NO_HZ cpus.
* Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/
delta = calc_load_nohz_fold();
if (delta)
......
......@@ -13,32 +13,25 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/syscalls.h>
#include <linux/membarrier.h>
#include <linux/tick.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>
#include "sched.h" /* for cpu_rq(). */
#include "sched.h"
/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
*/
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
static void ipi_mb(void *info)
......@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
*/
if (cpu == raw_smp_processor_id())
continue;
rcu_read_lock();
p = task_rcu_dereference(&cpu_rq(cpu)->curr);
if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
......@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
return 0;
}
......@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
}
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
&mm->membarrier_state);
return 0;
}
......@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
synchronize_sched();
}
atomic_or(state, &mm->membarrier_state);
return 0;
}
......
......@@ -3,12 +3,8 @@
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
#include "sched.h"
#include <linux/slab.h>
#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
......@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);
static inline void queue_push_tasks(struct rq *rq)
static inline void rt_queue_push_tasks(struct rq *rq)
{
if (!has_pushable_tasks(rq))
return;
......@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
}
static inline void queue_pull_task(struct rq *rq)
static inline void rt_queue_pull_task(struct rq *rq)
{
queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}
......@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
{
}
static inline void queue_push_tasks(struct rq *rq)
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
......@@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
......@@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
sub_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 0;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);
}
static void
......@@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
add_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 1;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);
}
#if defined CONFIG_SMP
......@@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
return;
/*
* There appears to be other cpus that can accept
* current and none to run 'p', so lets reschedule
* to try and push current away:
* There appear to be other CPUs that can accept
* the current task but none can run 'p', so lets reschedule
* to try and push the current task away:
*/
requeue_task_rt(rq, p, 1);
resched_curr(rq);
......@@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
queue_push_tasks(rq);
rt_queue_push_tasks(rq);
return p;
}
......@@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
}
/*
* Return the highest pushable rq's task, which is suitable to be executed
* on the cpu, NULL otherwise
* on the CPU, NULL otherwise
*/
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
......@@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task)
return -1; /* No targets found */
/*
* At this point we have built a mask of cpus representing the
* At this point we have built a mask of CPUs representing the
* lowest priority tasks in the system. Now we want to elect
* the best one based on our affinity and topology.
*
* We prioritize the last cpu that the task executed on since
* We prioritize the last CPU that the task executed on since
* it is most likely cache-hot in that location.
*/
if (cpumask_test_cpu(cpu, lowest_mask))
......@@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task)
/*
* Otherwise, we consult the sched_domains span maps to figure
* out which cpu is logically closest to our hot cache data.
* out which CPU is logically closest to our hot cache data.
*/
if (!cpumask_test_cpu(this_cpu, lowest_mask))
this_cpu = -1; /* Skip this_cpu opt if not among lowest */
......@@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task)
cpu = cpumask_any(lowest_mask);
if (cpu < nr_cpu_ids)
return cpu;
return -1;
}
......@@ -1827,7 +1828,7 @@ static int push_rt_task(struct rq *rq)
* The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue
* to push it to. Do not retry in this case, since
* other cpus will pull from us when ready.
* other CPUs will pull from us when ready.
*/
goto out;
}
......@@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd)
* rt_next_cpu() will simply return the first CPU found in
* the rto_mask.
*
* If rto_next_cpu() is called with rto_cpu is a valid cpu, it
* If rto_next_cpu() is called with rto_cpu is a valid CPU, it
* will return the next CPU found in the rto_mask.
*
* If there are no more CPUs left in the rto_mask, then a check is made
......@@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq)
raw_spin_lock(&rq->rd->rto_lock);
/*
* The rto_cpu is updated under the lock, if it has a valid cpu
* The rto_cpu is updated under the lock, if it has a valid CPU
* then the IPI is still running and will continue due to the
* update to loop_next, and nothing needs to be done here.
* Otherwise it is finishing up and an ipi needs to be sent.
......@@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq)
/*
* There's a chance that p is higher in priority
* than what's currently running on its cpu.
* than what's currently running on its CPU.
* This is just that p is wakeing up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
......@@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
queue_pull_task(rq);
rt_queue_pull_task(rq);
}
void __init init_sched_rt_class(void)
......@@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
......@@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* may need to pull tasks to this runqueue.
*/
if (oldprio < p->prio)
queue_pull_task(rq);
rt_queue_pull_task(rq);
/*
* If there's a higher priority task waiting to run
......@@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
struct sched_rt_entity *rt_se = &p->rt;
......@@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
msecs_to_jiffies(sysctl_sched_rr_timeslice);
}
mutex_unlock(&mutex);
return ret;
}
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Scheduler internal types and methods:
*/
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/sched/clock.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/idle.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
#include <linux/sched/nohz.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/prio.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/sched/init.h>
#include <linux/sched/topology.h>
#include <linux/sched/user.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/xacct.h>
#include <uapi/linux/sched/types.h>
#include <linux/u64_stats_sync.h>
#include <linux/kernel_stat.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/cpuset.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/kprobes.h>
#include <linux/kthread.h>
#include <linux/membarrier.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/nmi.h>
#include <linux/proc_fs.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stackprotector.h>
#include <linux/stop_machine.h>
#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/suspend.h>
#include <linux/swait.h>
#include <linux/syscalls.h>
#include <linux/task_work.h>
#include <linux/tsacct_kern.h>
#include <asm/tlb.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
# include <asm/paravirt.h>
#endif
#include "cpupri.h"
......@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
* and does not change the user-interface for setting shares/weights.
*
* We increase resolution only if we have enough bits to allow this increased
* resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
* pretty high and the returns do not justify the increased costs.
* resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
* are pretty high and the returns do not justify the increased costs.
*
* Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
* increase coverage and consistency always enable it on 64bit platforms.
* Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
* increase coverage and consistency always enable it on 64-bit platforms.
*/
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
......@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
* 10 -> just above 1us
* 9 -> just above 0.5us
*/
#define DL_SCALE (10)
#define DL_SCALE 10
/*
* These are the 'tuning knobs' of the scheduler:
* Single value that denotes runtime == period, ie unlimited time.
*/
/*
* single value that denotes runtime == period, ie unlimited time.
*/
#define RUNTIME_INF ((u64)~0ULL)
#define RUNTIME_INF ((u64)~0ULL)
static inline int idle_policy(int policy)
{
......@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p);
* control.
*/
struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
u64 dl_period;
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
u64 dl_period;
};
static inline int dl_bandwidth_enabled(void)
......@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void)
}
struct dl_bw {
raw_spinlock_t lock;
u64 bw, total_bw;
raw_spinlock_t lock;
u64 bw;
u64 total_bw;
};
static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
......@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
void dl_change_utilization(struct task_struct *p, u64 new_bw);
extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
extern void init_dl_bw(struct dl_bw *dl_b);
extern int sched_dl_global_validate(void);
extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
extern int sched_dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr);
extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_task_can_attach(struct task_struct *p,
const struct cpumask *cs_cpus_allowed);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial);
extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern bool dl_cpu_busy(unsigned int cpu);
#ifdef CONFIG_CGROUP_SCHED
......@@ -300,32 +328,36 @@ extern struct list_head task_groups;
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
raw_spinlock_t lock;
ktime_t period;
u64 quota, runtime;
s64 hierarchical_quota;
u64 runtime_expires;
int idle, period_active;
struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;
/* statistics */
int nr_periods, nr_throttled;
u64 throttled_time;
raw_spinlock_t lock;
ktime_t period;
u64 quota;
u64 runtime;
s64 hierarchical_quota;
u64 runtime_expires;
int idle;
int period_active;
struct hrtimer period_timer;
struct hrtimer slack_timer;
struct list_head throttled_cfs_rq;
/* Statistics: */
int nr_periods;
int nr_throttled;
u64 throttled_time;
#endif
};
/* task group related information */
/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
/* schedulable entities of this group on each CPU */
struct sched_entity **se;
/* runqueue "owned" by this group on each CPU */
struct cfs_rq **cfs_rq;
unsigned long shares;
#ifdef CONFIG_SMP
/*
......@@ -333,29 +365,29 @@ struct task_group {
* it in its own cacheline separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
struct rt_bandwidth rt_bandwidth;
struct rt_bandwidth rt_bandwidth;
#endif
struct rcu_head rcu;
struct list_head list;
struct rcu_head rcu;
struct list_head list;
struct task_group *parent;
struct list_head siblings;
struct list_head children;
struct task_group *parent;
struct list_head siblings;
struct list_head children;
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
struct autogroup *autogroup;
#endif
struct cfs_bandwidth cfs_bandwidth;
struct cfs_bandwidth cfs_bandwidth;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
......@@ -369,8 +401,8 @@ struct task_group {
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
#define MIN_SHARES (1UL << 1)
#define MAX_SHARES (1UL << 18)
#define MIN_SHARES (1UL << 1)
#define MAX_SHARES (1UL << 18)
#endif
typedef int (*tg_visitor)(struct task_group *, void *);
......@@ -443,35 +475,39 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running, h_nr_running;
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running;
unsigned int h_nr_running;
u64 exec_clock;
u64 min_vruntime;
u64 exec_clock;
u64 min_vruntime;
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
u64 min_vruntime_copy;
#endif
struct rb_root_cached tasks_timeline;
struct rb_root_cached tasks_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr, *next, *last, *skip;
struct sched_entity *curr;
struct sched_entity *next;
struct sched_entity *last;
struct sched_entity *skip;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
unsigned int nr_spread_over;
#endif
#ifdef CONFIG_SMP
/*
* CFS load tracking
*/
struct sched_avg avg;
struct sched_avg avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
u64 load_last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
......@@ -482,9 +518,9 @@ struct cfs_rq {
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
/*
* h_load = weight * f(tg)
......@@ -492,36 +528,38 @@ struct cfs_rq {
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
* This list is used during load balance.
*/
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock;
u64 throttled_clock_task;
u64 throttled_clock_task_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
......@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void)
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
unsigned int rt_nr_running;
unsigned int rr_nr_running;
struct rt_prio_array active;
unsigned int rt_nr_running;
unsigned int rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
int next; /* next highest */
int next; /* next highest */
#endif
} highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
unsigned long rt_nr_migratory;
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
int rt_queued;
int rt_queued;
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
unsigned long rt_nr_boosted;
struct rq *rq;
struct task_group *tg;
struct rq *rq;
struct task_group *tg;
#endif
};
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
struct rb_root_cached root;
struct rb_root_cached root;
unsigned long dl_nr_running;
unsigned long dl_nr_running;
#ifdef CONFIG_SMP
/*
......@@ -586,28 +624,28 @@ struct dl_rq {
* should migrate somewhere else.
*/
struct {
u64 curr;
u64 next;
u64 curr;
u64 next;
} earliest_dl;
unsigned long dl_nr_migratory;
int overloaded;
unsigned long dl_nr_migratory;
int overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
struct rb_root_cached pushable_dl_tasks_root;
struct rb_root_cached pushable_dl_tasks_root;
#else
struct dl_bw dl_bw;
struct dl_bw dl_bw;
#endif
/*
* "Active utilization" for this runqueue: increased when a
* task wakes up (becomes TASK_RUNNING) and decreased when a
* task blocks
*/
u64 running_bw;
u64 running_bw;
/*
* Utilization of the tasks "assigned" to this runqueue (including
......@@ -618,14 +656,14 @@ struct dl_rq {
* This is needed to compute the "inactive utilization" for the
* runqueue (inactive utilization = this_bw - running_bw).
*/
u64 this_bw;
u64 extra_bw;
u64 this_bw;
u64 extra_bw;
/*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
u64 bw_ratio;
u64 bw_ratio;
};
#ifdef CONFIG_SMP
......@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b)
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
* fully partitioning the member cpus from any other cpuset. Whenever a new
* fully partitioning the member CPUs from any other cpuset. Whenever a new
* exclusive cpuset is created, we also create and attach a new root-domain
* object.
*
*/
struct root_domain {
atomic_t refcount;
atomic_t rto_count;
struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
atomic_t refcount;
atomic_t rto_count;
struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
/* Indicate more than one runnable task for any CPU */
bool overload;
bool overload;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
*/
cpumask_var_t dlo_mask;
atomic_t dlo_count;
struct dl_bw dl_bw;
struct cpudl cpudl;
cpumask_var_t dlo_mask;
atomic_t dlo_count;
struct dl_bw dl_bw;
struct cpudl cpudl;
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
*/
struct irq_work rto_push_work;
raw_spinlock_t rto_lock;
struct irq_work rto_push_work;
raw_spinlock_t rto_lock;
/* These are only updated and read within rto_lock */
int rto_loop;
int rto_cpu;
int rto_loop;
int rto_cpu;
/* These atomics are updated outside of a lock */
atomic_t rto_loop_next;
atomic_t rto_loop_start;
atomic_t rto_loop_next;
atomic_t rto_loop_start;
#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
cpumask_var_t rto_mask;
struct cpupri cpupri;
cpumask_var_t rto_mask;
struct cpupri cpupri;
unsigned long max_cpu_capacity;
unsigned long max_cpu_capacity;
};
extern struct root_domain def_root_domain;
......@@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work);
*/
struct rq {
/* runqueue lock: */
raw_spinlock_t lock;
raw_spinlock_t lock;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
unsigned int nr_running;
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
unsigned long last_load_update_tick;
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
#endif /* CONFIG_SMP */
unsigned long nohz_flags;
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
/* capture load from *all* tasks on this CPU: */
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
struct list_head *tmp_alone_branch;
/* list of leaf cfs_rq on this CPU: */
struct list_head leaf_cfs_rq_list;
struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
......@@ -751,94 +790,98 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
unsigned long nr_uninterruptible;
unsigned long nr_uninterruptible;
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
struct task_struct *curr;
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
unsigned int clock_update_flags;
u64 clock;
u64 clock_task;
unsigned int clock_update_flags;
u64 clock;
u64 clock_task;
atomic_t nr_iowait;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain *sd;
struct root_domain *rd;
struct sched_domain *sd;
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
struct callback_head *balance_callback;
struct callback_head *balance_callback;
unsigned char idle_balance;
unsigned char idle_balance;
/* For active balancing */
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
int online;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
/* CPU of this runqueue: */
int cpu;
int online;
struct list_head cfs_tasks;
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
u64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
u64 prev_steal_time_rq;
u64 prev_steal_time_rq;
#endif
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
unsigned long calc_load_update;
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
call_single_data_t hrtick_csd;
int hrtick_csd_pending;
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_count;
unsigned int sched_goidle;
unsigned int sched_count;
unsigned int sched_goidle;
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
#ifdef CONFIG_SMP
struct llist_head wake_list;
struct llist_head wake_list;
#endif
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
struct cpuidle_state *idle_state;
#endif
};
......@@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
* one position though, because the next rq_unpin_lock() will shift it
* back.
*/
#define RQCF_REQ_SKIP 0x01
#define RQCF_ACT_SKIP 0x02
#define RQCF_UPDATED 0x04
#define RQCF_REQ_SKIP 0x01
#define RQCF_ACT_SKIP 0x02
#define RQCF_UPDATED 0x04
static inline void assert_clock_updated(struct rq *rq)
{
......@@ -1059,12 +1102,12 @@ extern void sched_ttwu_pending(void);
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The cpu whose highest level of sched domain is to
* @cpu: The CPU whose highest level of sched domain is to
* be returned.
* @flag: The flag to check for the highest sched_domain
* for the given cpu.
* for the given CPU.
*
* Returns the highest sched_domain of a cpu which contains the given flag.
* Returns the highest sched_domain of a CPU which contains the given flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
......@@ -1099,30 +1142,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_capacity {
atomic_t ref;
atomic_t ref;
/*
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
unsigned long capacity;
unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
unsigned long capacity;
unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
#ifdef CONFIG_SCHED_DEBUG
int id;
int id;
#endif
unsigned long cpumask[0]; /* balance mask */
unsigned long cpumask[0]; /* Balance mask */
};
struct sched_group {
struct sched_group *next; /* Must be a circular list */
atomic_t ref;
struct sched_group *next; /* Must be a circular list */
atomic_t ref;
unsigned int group_weight;
unsigned int group_weight;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* cpu of highest priority in group */
int asym_prefer_cpu; /* CPU of highest priority in group */
/*
* The CPUs this group covers.
......@@ -1131,7 +1174,7 @@ struct sched_group {
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
unsigned long cpumask[0];
unsigned long cpumask[0];
};
static inline struct cpumask *sched_group_span(struct sched_group *sg)
......@@ -1148,8 +1191,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
}
/**
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
* @group: The group whose first cpu is to be returned.
* group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
* @group: The group whose first CPU is to be returned.
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
......@@ -1349,19 +1392,12 @@ static inline int task_on_rq_migrating(struct task_struct *p)
return p->on_rq == TASK_ON_RQ_MIGRATING;
}
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
/*
* wake flags
*/
#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* child wakeup after fork */
#define WF_MIGRATED 0x4 /* internal use, task got migrated */
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */
#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
......@@ -1372,11 +1408,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
* slice expiry etc.
*/
#define WEIGHT_IDLEPRIO 3
#define WMULT_IDLEPRIO 1431655765
#define WEIGHT_IDLEPRIO 3
#define WMULT_IDLEPRIO 1431655765
extern const int sched_prio_to_weight[40];
extern const u32 sched_prio_to_wmult[40];
extern const int sched_prio_to_weight[40];
extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
......@@ -1398,9 +1434,9 @@ extern const u32 sched_prio_to_wmult[40];
*/
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
......@@ -1422,10 +1458,10 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
* It is the responsibility of the pick_next_task() method that will
......@@ -1435,16 +1471,16 @@ struct sched_class {
* May return RETRY_TASK when it finds a higher prio class has runnable
* tasks.
*/
struct task_struct * (*pick_next_task) (struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
......@@ -1453,31 +1489,31 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
#endif
void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
void (*set_curr_task)(struct rq *rq);
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serliazed by
* rq->lock. They are however serialized by p->pi_lock.
*/
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
int oldprio);
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
void (*update_curr) (struct rq *rq);
void (*update_curr)(struct rq *rq);
#define TASK_SET_GROUP 0
#define TASK_MOVE_GROUP 1
#define TASK_SET_GROUP 0
#define TASK_MOVE_GROUP 1
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group) (struct task_struct *p, int type);
void (*task_change_group)(struct task_struct *p, int type);
#endif
};
......@@ -1526,6 +1562,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
SCHED_WARN_ON(!rcu_read_lock_held());
return rq->idle_state;
}
#else
......@@ -1564,9 +1601,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
......@@ -1574,6 +1611,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
extern int __init sched_tick_offload_init(void);
/*
* Tick may be needed by tasks in the runqueue depending on their policy and
......@@ -1598,6 +1636,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
#else
static inline int sched_tick_offload_init(void) { return 0; }
static inline void sched_update_tick_dependency(struct rq *rq) { }
#endif
......@@ -1624,13 +1663,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
static inline void rq_last_tick_reset(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = jiffies;
#endif
}
extern void update_rq_clock(struct rq *rq);
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
......@@ -1821,8 +1853,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
/*
* Unfair double_lock_balance: Optimizes throughput at the expense of
* latency by eliminating extra atomic operations when the locks are
* already in proper order on entry. This favors lower cpu-ids and will
* grant the double lock to lower cpus over higher ids under contention,
* already in proper order on entry. This favors lower CPU-ids and will
* grant the double lock to lower CPUs over higher ids under contention,
* regardless of entry order into the function.
*/
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
......@@ -1854,7 +1886,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
if (unlikely(!irqs_disabled())) {
/* printk() doesn't work good under rq->lock */
/* printk() doesn't work well under rq->lock */
raw_spin_unlock(&this_rq->lock);
BUG_ON(1);
}
......@@ -2005,16 +2037,19 @@ extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {
NOHZ_TICK_STOPPED,
NOHZ_BALANCE_KICK,
};
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
extern void nohz_balance_exit_idle(unsigned int cpu);
extern void nohz_balance_exit_idle(struct rq *rq);
#else
static inline void nohz_balance_exit_idle(unsigned int cpu) { }
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif
......@@ -2113,15 +2148,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
#ifndef arch_scale_freq_invariant
#define arch_scale_freq_invariant() (true)
#endif
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
# endif
#else
# define arch_scale_freq_invariant() false
#endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
static inline unsigned long cpu_util_dl(struct rq *rq)
{
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
......@@ -2129,7 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
static inline unsigned long cpu_util_cfs(struct rq *rq)
{
return rq->cfs.avg.util_avg;
}
unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
if (sched_feat(UTIL_EST)) {
util = max_t(unsigned long, util,
READ_ONCE(rq->cfs.avg.util_est.enqueued));
}
return util;
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
/*
* /proc/schedstat implementation
*/
#include "sched.h"
/*
* bump this up when changing the output format or the meaning of an existing
* Current schedstat API version.
*
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
#define SCHEDSTAT_VERSION 15
......@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
* This itererator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
* In a hotplugged system some cpus, including cpu 0, may be missing so we have
* to use cpumask_* to iterate over the cpus.
* In a hotplugged system some CPUs, including cpu 0, may be missing so we have
* to use cpumask_* to iterate over the CPUs.
*/
static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
......@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2);
return NULL;
}
static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
{
(*offset)++;
return schedstat_start(file, offset);
}
......@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = {
static int __init proc_schedstat_init(void)
{
proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
return 0;
}
subsys_initcall(proc_schedstat_init);
......@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
#define __schedstat_inc(var) do { var++; } while (0)
#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
#define __schedstat_add(var, amt) do { var += (amt); } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
#define __schedstat_set(var, val) do { var = (val); } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
#define schedstat_enabled() 0
#define __schedstat_inc(var) do { } while (0)
#define schedstat_inc(var) do { } while (0)
#define __schedstat_add(var, amt) do { } while (0)
#define schedstat_add(var, amt) do { } while (0)
#define __schedstat_set(var, val) do { } while (0)
#define schedstat_set(var, val) do { } while (0)
#define schedstat_val(var) 0
#define schedstat_val_or_zero(var) 0
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
#define __schedstat_set(var, val) do { var = (val); } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
#else /* !CONFIG_SCHEDSTATS: */
static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
# define schedstat_enabled() 0
# define __schedstat_inc(var) do { } while (0)
# define schedstat_inc(var) do { } while (0)
# define __schedstat_add(var, amt) do { } while (0)
# define schedstat_add(var, amt) do { } while (0)
# define __schedstat_set(var, val) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
# define schedstat_val(var) 0
# define schedstat_val_or_zero(var) 0
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SCHED_INFO
......@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
/*
* We are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a cpu, we call this routine
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
* task was queued to the time that it finally hit a CPU, we call this routine
* from dequeue_task() to account for possible rq->clock skew across CPUs. The
* delta taken on each CPU would annul the skew.
*/
static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
......@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
}
/*
* Called when a task finally hits the cpu. We can now calculate how
* Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
......@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
if (unlikely(sched_info_on()))
if (unlikely(sched_info_on())) {
if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(rq);
}
}
/*
......@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
unsigned long long delta = rq_clock(rq) -
t->sched_info.last_arrival;
unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
rq_sched_info_depart(rq, delta);
......@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
__sched_info_switch(struct rq *rq,
struct task_struct *prev, struct task_struct *next)
__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
/*
* prev now departs the cpu. It's not interesting to record
* prev now departs the CPU. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
......@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
if (next != rq->idle)
sched_info_arrive(rq, next);
}
static inline void
sched_info_switch(struct rq *rq,
struct task_struct *prev, struct task_struct *next)
sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
if (unlikely(sched_info_on()))
__sched_info_switch(rq, prev, next);
}
#else
#define sched_info_queued(rq, t) do { } while (0)
#define sched_info_reset_dequeued(t) do { } while (0)
#define sched_info_dequeued(rq, t) do { } while (0)
#define sched_info_depart(rq, t) do { } while (0)
#define sched_info_arrive(rq, next) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0)
#else /* !CONFIG_SCHED_INFO: */
# define sched_info_queued(rq, t) do { } while (0)
# define sched_info_reset_dequeued(t) do { } while (0)
# define sched_info_dequeued(rq, t) do { } while (0)
# define sched_info_depart(rq, t) do { } while (0)
# define sched_info_arrive(rq, next) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
// SPDX-License-Identifier: GPL-2.0
#include "sched.h"
/*
* stop-task scheduling class.
*
......@@ -9,6 +7,7 @@
*
* See kernel/stop_machine.c
*/
#include "sched.h"
#ifdef CONFIG_SMP
static int
......@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
cgroup_account_cputime(curr, delta_exec);
}
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/sched/signal.h>
#include <linux/swait.h>
/*
* <linux/swait.h> (simple wait queues ) implementation:
*/
#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
......
......@@ -2,10 +2,6 @@
/*
* Scheduler topology setup/handling methods
*/
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/sched/isolation.h>
#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
......@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
if (sd->parent)
printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
" has parent");
printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
return -1;
}
......@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
"CPU%d\n", cpu);
printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
}
if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain"
" CPU%d\n", cpu);
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
printk(KERN_DEBUG "%*s groups:", level + 1, "");
......@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (sd->parent &&
!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
printk(KERN_ERR "ERROR: parent span is not a superset "
"of domain->span\n");
printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
return 0;
}
......@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
* are not.
*
* This leads to a few particularly weird cases where the sched_domain's are
* not of the same number for each cpu. Consider:
* not of the same number for each CPU. Consider:
*
* NUMA-2 0-3 0-3
* groups: {0-2},{1-3} {1-3},{0-2}
......@@ -780,7 +772,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* ^ ^ ^ ^
* `-' `-'
*
* The sched_domains are per-cpu and have a two way link (parent & child) and
* The sched_domains are per-CPU and have a two way link (parent & child) and
* denote the ever growing mask of CPUs belonging to that level of topology.
*
* Each sched_domain has a circular (double) linked list of sched_group's, each
......@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
d->rd = alloc_rootdomain();
if (!d->rd)
return sa_sd;
return sa_rootdomain;
}
......@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
}
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;
int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
static int sched_domains_numa_levels;
static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
#endif
/*
......@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level;
* SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
SD_ASYM_CPUCAPACITY | \
SD_NUMA | \
SD_ASYM_PACKING | \
SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN)
static struct sched_domain *
......@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
/* Fixup, ensure @sd has at least @child cpus. */
/* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
sched_domain_span(child));
......@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
return ret;
}
......@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
return 1;
tmp = SD_ATTR_INIT;
return !memcmp(cur ? (cur + idx_cur) : &tmp,
new ? (new + idx_new) : &tmp,
sizeof(struct sched_domain_attr));
......@@ -1929,4 +1926,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
mutex_unlock(&sched_domains_mutex);
}
......@@ -3,14 +3,7 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
#include <linux/init.h>
#include <linux/export.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
......@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
break;
}
}
return nr_exclusive;
}
......@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
spin_unlock(&wq->lock);
schedule();
spin_lock(&wq->lock);
return 0;
}
EXPORT_SYMBOL(do_wait_intr);
......@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
spin_unlock_irq(&wq->lock);
schedule();
spin_lock_irq(&wq->lock);
return 0;
}
EXPORT_SYMBOL(do_wait_intr_irq);
......@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
if (ret)
list_del_init(&wq_entry->entry);
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
......
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
#include <linux/wait_bit.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/hash.h>
#include "sched.h"
#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
......@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
wait_bit->key.bit_nr != key->bit_nr ||
test_bit(key->bit_nr, key->flags))
return 0;
else
return autoremove_wake_function(wq_entry, mode, sync, key);
return autoremove_wake_function(wq_entry, mode, sync, key);
}
EXPORT_SYMBOL(wake_bit_function);
......@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
return ret;
}
EXPORT_SYMBOL(__wait_on_bit);
......@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
DEFINE_WAIT_BIT(wq_entry, word, bit);
wq_entry.key.timeout = jiffies + timeout;
return __wait_on_bit(wq_head, &wq_entry, action, mode);
}
EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
......@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
{
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
if (waitqueue_active(wq_head))
__wake_up(wq_head, TASK_NORMAL, 1, &key);
}
......@@ -157,6 +158,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
{
if (BITS_PER_LONG == 64) {
unsigned long q = (unsigned long)p;
return bit_waitqueue((void *)(q & ~1), q & 1);
}
return bit_waitqueue(p, 0);
......@@ -173,6 +175,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
wait_bit->key.bit_nr != key->bit_nr ||
atomic_read(val) != 0)
return 0;
return autoremove_wake_function(wq_entry, mode, sync, key);
}
......@@ -196,6 +199,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
ret = (*action)(val, mode);
} while (!ret && atomic_read(val) != 0);
finish_wait(wq_head, &wbq_entry->wq_entry);
return ret;
}
......@@ -226,6 +230,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
schedule();
if (signal_pending_state(mode, current))
return -EINTR;
return 0;
}
EXPORT_SYMBOL(atomic_t_wait);
......@@ -250,6 +255,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode)
schedule();
if (signal_pending_state(mode, current))
return -EINTR;
return 0;
}
EXPORT_SYMBOL(bit_wait);
......@@ -259,6 +265,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
io_schedule();
if (signal_pending_state(mode, current))
return -EINTR;
return 0;
}
EXPORT_SYMBOL(bit_wait_io);
......@@ -266,11 +273,13 @@ EXPORT_SYMBOL(bit_wait_io);
__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
{
unsigned long now = READ_ONCE(jiffies);
if (time_after_eq(now, word->timeout))
return -EAGAIN;
schedule_timeout(word->timeout - now);
if (signal_pending_state(mode, current))
return -EINTR;
return 0;
}
EXPORT_SYMBOL_GPL(bit_wait_timeout);
......@@ -278,11 +287,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
{
unsigned long now = READ_ONCE(jiffies);
if (time_after_eq(now, word->timeout))
return -EAGAIN;
io_schedule_timeout(word->timeout - now);
if (signal_pending_state(mode, current))
return -EINTR;
return 0;
}
EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
......
......@@ -463,11 +463,18 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);
int tick_nohz_tick_stopped(void)
bool tick_nohz_tick_stopped(void)
{
return __this_cpu_read(tick_cpu_sched.tick_stopped);
}
bool tick_nohz_tick_stopped_cpu(int cpu)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
return ts->tick_stopped;
}
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
*
......@@ -723,12 +730,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
delta = KTIME_MAX;
}
#ifdef CONFIG_NO_HZ_FULL
/* Limit the tick delta to the maximum scheduler deferment */
if (!ts->inidle)
delta = min(delta, scheduler_tick_max_deferment());
#endif
/* Calculate the next expiry time */
if (delta < (KTIME_MAX - basemono))
expires = basemono + delta;
......@@ -935,13 +936,6 @@ void tick_nohz_idle_enter(void)
struct tick_sched *ts;
lockdep_assert_irqs_enabled();
/*
* Update the idle state in the scheduler domain hierarchy
* when tick_nohz_stop_sched_tick() is called from the idle loop.
* State will be updated to busy during the first busy tick after
* exiting idle.
*/
set_cpu_sd_state_idle();
local_irq_disable();
......
......@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void)
int __init workqueue_init_early(void)
{
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu;
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment