Commit 46e0d28b authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main scheduler changes in this cycle were:

   - NUMA balancing improvements (Mel Gorman)

   - Further load tracking improvements (Patrick Bellasi)

   - Various NOHZ balancing cleanups and optimizations (Peter Zijlstra)

   - Improve blocked load handling, in particular we can now reduce and
     eventually stop periodic load updates on 'very idle' CPUs. (Vincent
     Guittot)

   - On isolated CPUs offload the final 1Hz scheduler tick as well, plus
     related cleanups and reorganization. (Frederic Weisbecker)

   - Core scheduler code cleanups (Ingo Molnar)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
  sched/core: Update preempt_notifier_key to modern API
  sched/cpufreq: Rate limits for SCHED_DEADLINE
  sched/fair: Update util_est only on util_avg updates
  sched/cpufreq/schedutil: Use util_est for OPP selection
  sched/fair: Use util_est in LB and WU paths
  sched/fair: Add util_est on top of PELT
  sched/core: Remove TASK_ALL
  sched/completions: Use bool in try_wait_for_completion()
  sched/fair: Update blocked load when newly idle
  sched/fair: Move idle_balance()
  sched/nohz: Merge CONFIG_NO_HZ_COMMON blocks
  sched/fair: Move rebalance_domains()
  sched/nohz: Optimize nohz_idle_balance()
  sched/fair: Reduce the periodic update duration
  sched/nohz: Stop NOHZ stats when decayed
  sched/cpufreq: Provide migration hint
  sched/nohz: Clean up nohz enter/exit
  sched/fair: Update blocked load from NEWIDLE
  sched/fair: Add NOHZ stats balancing
  sched/fair: Restructure nohz_balance_kick()
  ...
parents 86bbbeba b7203428
...@@ -1766,6 +1766,17 @@ ...@@ -1766,6 +1766,17 @@
nohz nohz
Disable the tick when a single task runs. Disable the tick when a single task runs.
A residual 1Hz tick is offloaded to workqueues, which you
need to affine to housekeeping through the global
workqueue's affinity configured via the
/sys/devices/virtual/workqueue/cpumask sysfs file, or
by using the 'domain' flag described below.
NOTE: by default the global workqueue runs on all CPUs,
so to protect individual CPUs the 'cpumask' file has to
be configured manually after bootup.
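			For example (an illustrative setup, not part of the
			original text): on an 8-CPU machine booted with
			"isolcpus=nohz,domain,1-7", writing the housekeeping
			mask afterwards with
			    echo 1 > /sys/devices/virtual/workqueue/cpumask
			confines unbound workqueue items, including the
			offloaded residual 1Hz tick work, to CPU 0. The CPU
			count and mask value here are assumptions chosen for
			the example.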
domain domain
Isolate from the general SMP balancing and scheduling Isolate from the general SMP balancing and scheduling
algorithms. Note that performing domain isolation this way algorithms. Note that performing domain isolation this way
......
...@@ -93,7 +93,6 @@ struct task_group; ...@@ -93,7 +93,6 @@ struct task_group;
/* Convenience macros for the sake of wake_up(): */ /* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
/* get_task_state(): */ /* get_task_state(): */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
...@@ -275,6 +274,34 @@ struct load_weight { ...@@ -275,6 +274,34 @@ struct load_weight {
u32 inv_weight; u32 inv_weight;
}; };
/**
 * struct util_est - Estimated utilization of FAIR tasks
 * @enqueued: instantaneous estimated utilization of a task/CPU
 * @ewma:     the Exponential Weighted Moving Average (EWMA)
 *            utilization of a task
 *
 * Support data structure to track an Exponential Weighted Moving Average
 * (EWMA) of a FAIR task's utilization. New samples are added to the moving
 * average each time a task completes an activation. The sample weight is
 * chosen so that the EWMA is relatively insensitive to transient changes in
 * the task's workload.
 *
 * The enqueued attribute has a slightly different meaning for tasks and CPUs:
 * - task:   the task's util_avg at last task dequeue time
 * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
 * Thus, the util_est.enqueued of a task represents the contribution to the
 * estimated utilization of the CPU where that task is currently enqueued.
 *
 * Only for tasks do we track a moving average of the past instantaneous
 * estimated utilization. This allows the average to absorb sporadic drops in
 * the utilization of an otherwise almost-periodic task.
*/
struct util_est {
unsigned int enqueued;
unsigned int ewma;
#define UTIL_EST_WEIGHT_SHIFT 2
};
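As an aside (not part of the patch): with UTIL_EST_WEIGHT_SHIFT == 2, each new sample is blended in with weight 1/4, i.e. ewma += (enqueued - ewma) / 4 on every update. A minimal user-space sketch of that weighting, using a hypothetical helper name:

	#include <stdio.h>

	#define UTIL_EST_WEIGHT_SHIFT	2	/* new-sample weight = 1/2^2 = 1/4 */

	/* Hypothetical helper: one EWMA step, ewma += (sample - ewma) / 4 */
	static unsigned int util_est_ewma_step(unsigned int ewma, unsigned int sample)
	{
		int diff = (int)sample - (int)ewma;

		return (unsigned int)((int)ewma + diff / (1 << UTIL_EST_WEIGHT_SHIFT));
	}

	int main(void)
	{
		unsigned int ewma = 400;

		/* A single low sample (100) only pulls the average down to 325. */
		ewma = util_est_ewma_step(ewma, 100);
		printf("ewma after one low sample: %u\n", ewma);
		return 0;
	}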
/* /*
* The load_avg/util_avg accumulates an infinite geometric series * The load_avg/util_avg accumulates an infinite geometric series
* (see __update_load_avg() in kernel/sched/fair.c). * (see __update_load_avg() in kernel/sched/fair.c).
...@@ -336,6 +363,7 @@ struct sched_avg { ...@@ -336,6 +363,7 @@ struct sched_avg {
unsigned long load_avg; unsigned long load_avg;
unsigned long runnable_load_avg; unsigned long runnable_load_avg;
unsigned long util_avg; unsigned long util_avg;
struct util_est util_est;
}; };
struct sched_statistics { struct sched_statistics {
......
...@@ -8,9 +8,8 @@ ...@@ -8,9 +8,8 @@
* Interface between cpufreq drivers and the scheduler: * Interface between cpufreq drivers and the scheduler:
*/ */
#define SCHED_CPUFREQ_RT (1U << 0) #define SCHED_CPUFREQ_IOWAIT (1U << 0)
#define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_MIGRATION (1U << 1)
#define SCHED_CPUFREQ_IOWAIT (1U << 2)
#ifdef CONFIG_CPU_FREQ #ifdef CONFIG_CPU_FREQ
struct update_util_data { struct update_util_data {
......
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_DEADLINE_H
#define _LINUX_SCHED_DEADLINE_H
#include <linux/sched.h>
/* /*
* SCHED_DEADLINE tasks has negative priorities, reflecting * SCHED_DEADLINE tasks has negative priorities, reflecting
...@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b) ...@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b)
{ {
return (s64)(a - b) < 0; return (s64)(a - b) < 0;
} }
#endif /* _LINUX_SCHED_DEADLINE_H */
...@@ -12,6 +12,7 @@ enum hk_flags { ...@@ -12,6 +12,7 @@ enum hk_flags {
HK_FLAG_SCHED = (1 << 3), HK_FLAG_SCHED = (1 << 3),
HK_FLAG_TICK = (1 << 4), HK_FLAG_TICK = (1 << 4),
HK_FLAG_DOMAIN = (1 << 5), HK_FLAG_DOMAIN = (1 << 5),
HK_FLAG_WQ = (1 << 6),
}; };
#ifdef CONFIG_CPU_ISOLATION #ifdef CONFIG_CPU_ISOLATION
......
...@@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { } ...@@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { }
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu); extern void nohz_balance_enter_idle(int cpu);
extern void set_cpu_sd_state_idle(void);
extern int get_nohz_timer_target(void); extern int get_nohz_timer_target(void);
#else #else
static inline void nohz_balance_enter_idle(int cpu) { } static inline void nohz_balance_enter_idle(int cpu) { }
static inline void set_cpu_sd_state_idle(void) { }
#endif #endif
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
...@@ -37,8 +35,4 @@ extern void wake_up_nohz_cpu(int cpu); ...@@ -37,8 +35,4 @@ extern void wake_up_nohz_cpu(int cpu);
static inline void wake_up_nohz_cpu(int cpu) { } static inline void wake_up_nohz_cpu(int cpu) { }
#endif #endif
#ifdef CONFIG_NO_HZ_FULL
extern u64 scheduler_tick_max_deferment(void);
#endif
#endif /* _LINUX_SCHED_NOHZ_H */ #endif /* _LINUX_SCHED_NOHZ_H */
...@@ -113,7 +113,8 @@ enum tick_dep_bits { ...@@ -113,7 +113,8 @@ enum tick_dep_bits {
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
extern bool tick_nohz_enabled; extern bool tick_nohz_enabled;
extern int tick_nohz_tick_stopped(void); extern bool tick_nohz_tick_stopped(void);
extern bool tick_nohz_tick_stopped_cpu(int cpu);
extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void); extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void); extern void tick_nohz_irq_exit(void);
...@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); ...@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */ #else /* !CONFIG_NO_HZ_COMMON */
#define tick_nohz_enabled (0) #define tick_nohz_enabled (0)
static inline int tick_nohz_tick_stopped(void) { return 0; } static inline int tick_nohz_tick_stopped(void) { return 0; }
static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_enter(void) { }
static inline void tick_nohz_idle_exit(void) { } static inline void tick_nohz_idle_exit(void) { }
......
...@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer ...@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif endif
obj-y += core.o loadavg.o clock.o cputime.o obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o idle.o obj-y += wait.o wait_bit.o swait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHEDSTATS) += stats.o
......
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
#include <linux/security.h>
#include <linux/export.h>
/*
 * Auto-group scheduling implementation:
 */
#include "sched.h" #include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
...@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) ...@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
autogroup_kref_put(prev); autogroup_kref_put(prev);
} }
/* Allocates GFP_KERNEL, cannot be called under any spinlock */ /* Allocates GFP_KERNEL, cannot be called under any spinlock: */
void sched_autogroup_create_attach(struct task_struct *p) void sched_autogroup_create_attach(struct task_struct *p)
{ {
struct autogroup *ag = autogroup_create(); struct autogroup *ag = autogroup_create();
autogroup_move_group(p, ag); autogroup_move_group(p, ag);
/* drop extra reference added by autogroup_create() */
/* Drop extra reference added by autogroup_create(): */
autogroup_kref_put(ag); autogroup_kref_put(ag);
} }
EXPORT_SYMBOL(sched_autogroup_create_attach); EXPORT_SYMBOL(sched_autogroup_create_attach);
/* Cannot be called under siglock. Currently has no users */ /* Cannot be called under siglock. Currently has no users: */
void sched_autogroup_detach(struct task_struct *p) void sched_autogroup_detach(struct task_struct *p)
{ {
autogroup_move_group(p, &autogroup_default); autogroup_move_group(p, &autogroup_default);
...@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str) ...@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
return 1; return 1;
} }
__setup("noautogroup", setup_autogroup); __setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
...@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) ...@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
if (nice < 0 && !can_nice(current, nice)) if (nice < 0 && !can_nice(current, nice))
return -EPERM; return -EPERM;
/* this is a heavy operation taking global locks.. */ /* This is a heavy operation, taking global locks.. */
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
return -EAGAIN; return -EAGAIN;
...@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) ...@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
} }
#endif /* CONFIG_SCHED_DEBUG */ #endif
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_SCHED_AUTOGROUP #ifdef CONFIG_SCHED_AUTOGROUP
#include <linux/kref.h>
#include <linux/rwsem.h>
#include <linux/sched/autogroup.h>
struct autogroup { struct autogroup {
/* /*
* reference doesn't mean how many thread attach to this * Reference doesn't mean how many threads attach to this
* autogroup now. It just stands for the number of task * autogroup now. It just stands for the number of tasks
* could use this autogroup. * which could use this autogroup.
*/ */
struct kref kref; struct kref kref;
struct task_group *tg; struct task_group *tg;
...@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) ...@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
return tg; return tg;
} }
#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{ {
return 0; return 0;
} }
#endif
#endif /* CONFIG_SCHED_AUTOGROUP */ #endif /* CONFIG_SCHED_AUTOGROUP */
/* /*
* sched_clock for unstable cpu clocks * sched_clock() for unstable CPU clocks
* *
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
* *
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
* Guillaume Chazarain <guichaz@gmail.com> * Guillaume Chazarain <guichaz@gmail.com>
* *
* *
* What: * What this file implements:
* *
* cpu_clock(i) provides a fast (execution time) high resolution * cpu_clock(i) provides a fast (execution time) high resolution
* clock with bounded drift between CPUs. The value of cpu_clock(i) * clock with bounded drift between CPUs. The value of cpu_clock(i)
...@@ -26,11 +26,11 @@ ...@@ -26,11 +26,11 @@
* at 0 on boot (but people really shouldn't rely on that). * at 0 on boot (but people really shouldn't rely on that).
* *
* cpu_clock(i) -- can be used from any context, including NMI. * cpu_clock(i) -- can be used from any context, including NMI.
* local_clock() -- is cpu_clock() on the current cpu. * local_clock() -- is cpu_clock() on the current CPU.
* *
* sched_clock_cpu(i) * sched_clock_cpu(i)
* *
* How: * How it is implemented:
* *
* The implementation either uses sched_clock() when * The implementation either uses sched_clock() when
* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
...@@ -52,19 +52,7 @@ ...@@ -52,19 +52,7 @@
* that is otherwise invisible (TSC gets stopped). * that is otherwise invisible (TSC gets stopped).
* *
*/ */
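As a usage illustration (not part of this patch), local_clock() gives a cheap nanosecond timestamp on the current CPU and can bracket a code section; the helper below is hypothetical:

	#include <linux/types.h>
	#include <linux/sched/clock.h>
	#include <linux/printk.h>

	/* Hypothetical helper: time one invocation of fn() using local_clock(). */
	static void time_section(void (*fn)(void))
	{
		u64 t0 = local_clock();

		fn();
		pr_info("section took %llu ns\n", local_clock() - t0);
	}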
#include <linux/spinlock.h> #include "sched.h"
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/nmi.h>
#include <linux/sched/clock.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
#include <linux/init.h>
/* /*
* Scheduler clock - returns current time in nanosec units. * Scheduler clock - returns current time in nanosec units.
...@@ -302,21 +290,21 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) ...@@ -302,21 +290,21 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
* cmpxchg64 below only protects one readout. * cmpxchg64 below only protects one readout.
* *
* We must reread via sched_clock_local() in the retry case on * We must reread via sched_clock_local() in the retry case on
* 32bit as an NMI could use sched_clock_local() via the * 32-bit kernels as an NMI could use sched_clock_local() via the
* tracer and hit between the readout of * tracer and hit between the readout of
* the low32bit and the high 32bit portion. * the low 32-bit and the high 32-bit portion.
*/ */
this_clock = sched_clock_local(my_scd); this_clock = sched_clock_local(my_scd);
/* /*
* We must enforce atomic readout on 32bit, otherwise the * We must enforce atomic readout on 32-bit, otherwise the
* update on the remote cpu can hit inbetween the readout of * update on the remote CPU can hit inbetween the readout of
* the low32bit and the high 32bit portion. * the low 32-bit and the high 32-bit portion.
*/ */
remote_clock = cmpxchg64(&scd->clock, 0, 0); remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else #else
/* /*
* On 64bit the read of [my]scd->clock is atomic versus the * On 64-bit kernels the read of [my]scd->clock is atomic versus the
* update, so we can avoid the above 32bit dance. * update, so we can avoid the above 32-bit dance.
*/ */
sched_clock_local(my_scd); sched_clock_local(my_scd);
again: again:
......
...@@ -11,10 +11,7 @@ ...@@ -11,10 +11,7 @@
* typically be used for exclusion which gives rise to priority inversion. * typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point. * Waiting for completion is a typically sync point, but not an exclusion point.
*/ */
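As a usage illustration (not part of this patch), a completion used purely as a sync point between a submitter and a worker; the structure and function names below are hypothetical:

	#include <linux/kernel.h>
	#include <linux/completion.h>
	#include <linux/workqueue.h>

	struct demo_req {
		struct work_struct	work;
		struct completion	done;
	};

	static void demo_worker(struct work_struct *work)
	{
		struct demo_req *req = container_of(work, struct demo_req, work);

		/* ... produce the result ... */
		complete(&req->done);			/* wake the waiter */
	}

	static void demo_submit_and_wait(struct demo_req *req)
	{
		init_completion(&req->done);
		INIT_WORK(&req->work, demo_worker);
		schedule_work(&req->work);
		wait_for_completion(&req->done);	/* sync point, not exclusion */
	}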
#include "sched.h"
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/completion.h>
/** /**
* complete: - signals a single thread waiting on this completion * complete: - signals a single thread waiting on this completion
...@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); ...@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
bool try_wait_for_completion(struct completion *x) bool try_wait_for_completion(struct completion *x)
{ {
unsigned long flags; unsigned long flags;
int ret = 1; bool ret = true;
/* /*
* Since x->done will need to be locked only * Since x->done will need to be locked only
...@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x) ...@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
* return early in the blocking case. * return early in the blocking case.
*/ */
if (!READ_ONCE(x->done)) if (!READ_ONCE(x->done))
return 0; return false;
spin_lock_irqsave(&x->wait.lock, flags); spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done) if (!x->done)
ret = 0; ret = false;
else if (x->done != UINT_MAX) else if (x->done != UINT_MAX)
x->done--; x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags); spin_unlock_irqrestore(&x->wait.lock, flags);
......
...@@ -5,37 +5,11 @@ ...@@ -5,37 +5,11 @@
* *
* Copyright (C) 1991-2002 Linus Torvalds * Copyright (C) 1991-2002 Linus Torvalds
*/ */
#include <linux/sched.h> #include "sched.h"
#include <linux/sched/clock.h>
#include <uapi/linux/sched/types.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/hotplug.h>
#include <linux/wait_bit.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/sched/isolation.h>
#include <asm/switch_to.h> #include <asm/switch_to.h>
#include <asm/tlb.h> #include <asm/tlb.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
#include "sched.h"
#include "../workqueue_internal.h" #include "../workqueue_internal.h"
#include "../smpboot.h" #include "../smpboot.h"
...@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ...@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq * [L] ->on_rq
* RELEASE (rq->lock) * RELEASE (rq->lock)
* *
* If we observe the old cpu in task_rq_lock, the acquire of * If we observe the old CPU in task_rq_lock, the acquire of
* the old rq->lock will fully serialize against the stores. * the old rq->lock will fully serialize against the stores.
* *
* If we observe the new CPU in task_rq_lock, the acquire will * If we observe the new CPU in task_rq_lock, the acquire will
...@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay) ...@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
static void init_rq_hrtick(struct rq *rq) static void hrtick_rq_init(struct rq *rq)
{ {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
rq->hrtick_csd_pending = 0; rq->hrtick_csd_pending = 0;
...@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq) ...@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
{ {
} }
static inline void init_rq_hrtick(struct rq *rq) static inline void hrtick_rq_init(struct rq *rq)
{ {
} }
#endif /* CONFIG_SCHED_HRTICK */ #endif /* CONFIG_SCHED_HRTICK */
...@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void) ...@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void)
{ {
int cpu = smp_processor_id(); int cpu = smp_processor_id();
if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
return false; return false;
if (idle_cpu(cpu) && !need_resched()) if (idle_cpu(cpu) && !need_resched())
...@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void) ...@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void)
* We can't run Idle Load Balance on this CPU for this time so we * We can't run Idle Load Balance on this CPU for this time so we
* cancel it and clear NOHZ_BALANCE_KICK * cancel it and clear NOHZ_BALANCE_KICK
*/ */
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
return false; return false;
} }
...@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process); ...@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
* *
* - cpu_active must be a subset of cpu_online * - cpu_active must be a subset of cpu_online
* *
* - on cpu-up we allow per-cpu kthreads on the online && !active cpu, * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
* see __set_cpus_allowed_ptr(). At this point the newly online * see __set_cpus_allowed_ptr(). At this point the newly online
* CPU isn't yet part of the sched domains, and balancing will not * CPU isn't yet part of the sched domains, and balancing will not
* see it. * see it.
...@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p) ...@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS
static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
void preempt_notifier_inc(void) void preempt_notifier_inc(void)
{ {
static_key_slow_inc(&preempt_notifier_key); static_branch_inc(&preempt_notifier_key);
} }
EXPORT_SYMBOL_GPL(preempt_notifier_inc); EXPORT_SYMBOL_GPL(preempt_notifier_inc);
void preempt_notifier_dec(void) void preempt_notifier_dec(void)
{ {
static_key_slow_dec(&preempt_notifier_key); static_branch_dec(&preempt_notifier_key);
} }
EXPORT_SYMBOL_GPL(preempt_notifier_dec); EXPORT_SYMBOL_GPL(preempt_notifier_dec);
...@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec); ...@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec);
*/ */
void preempt_notifier_register(struct preempt_notifier *notifier) void preempt_notifier_register(struct preempt_notifier *notifier)
{ {
if (!static_key_false(&preempt_notifier_key)) if (!static_branch_unlikely(&preempt_notifier_key))
WARN(1, "registering preempt_notifier while notifiers disabled\n"); WARN(1, "registering preempt_notifier while notifiers disabled\n");
hlist_add_head(&notifier->link, &current->preempt_notifiers); hlist_add_head(&notifier->link, &current->preempt_notifiers);
...@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ...@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{ {
if (static_key_false(&preempt_notifier_key)) if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_in_preempt_notifiers(curr); __fire_sched_in_preempt_notifiers(curr);
} }
...@@ -2555,7 +2529,7 @@ static __always_inline void ...@@ -2555,7 +2529,7 @@ static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr, fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next) struct task_struct *next)
{ {
if (static_key_false(&preempt_notifier_key)) if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_out_preempt_notifiers(curr, next); __fire_sched_out_preempt_notifiers(curr, next);
} }
...@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) ...@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
raw_spin_unlock_irq(&rq->lock); raw_spin_unlock_irq(&rq->lock);
} }
/*
* NOP if the arch has not defined these:
*/
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
/** /**
* prepare_task_switch - prepare to switch tasks * prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch * @rq: the runqueue preparing to switch
...@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) ...@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/* /*
* 64-bit doesn't need locks to atomically read a 64bit value. * 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have a optimization chance when the task's delta_exec is 0. * So we have a optimization chance when the task's delta_exec is 0.
* Reading ->on_cpu is racy, but this is ok. * Reading ->on_cpu is racy, but this is ok.
* *
...@@ -3096,35 +3082,99 @@ void scheduler_tick(void) ...@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
rq->idle_balance = idle_cpu(cpu); rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq); trigger_load_balance(rq);
#endif #endif
rq_last_tick_reset(rq);
} }
#ifdef CONFIG_NO_HZ_FULL #ifdef CONFIG_NO_HZ_FULL
/**
 * scheduler_tick_max_deferment
 *
 * Keep at least one tick per second when a single
 * active task is running because the scheduler doesn't
 * yet completely support full dynticks environment.
 *
 * This makes sure that uptime, CFS vruntime, load
 * balancing, etc... continue to move forward, even
 * with a very low granularity.
 *
 * Return: Maximum deferment in nanoseconds.
 */
u64 scheduler_tick_max_deferment(void)
{
	struct rq *rq = this_rq();
	unsigned long next, now = READ_ONCE(jiffies);

	next = rq->last_sched_tick + HZ;

	if (time_before_eq(next, now))
		return 0;

	return jiffies_to_nsecs(next - now);
}

struct tick_work {
	int			cpu;
	struct delayed_work	work;
};

static struct tick_work __percpu *tick_work_cpu;

static void sched_tick_remote(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct tick_work *twork = container_of(dwork, struct tick_work, work);
	int cpu = twork->cpu;
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	/*
	 * Handle the tick only if it appears the remote CPU is running in full
	 * dynticks mode. The check is racy by nature, but missing a tick or
	 * having one too much is no big deal because the scheduler tick updates
	 * statistics and checks timeslices in a time-independent way, regardless
	 * of when exactly it is running.
	 */
	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
		struct task_struct *curr;
		u64 delta;

		rq_lock_irq(rq, &rf);
		update_rq_clock(rq);
		curr = rq->curr;
		delta = rq_clock_task(rq) - curr->se.exec_start;

		/*
		 * Make sure the next tick runs within a reasonable
		 * amount of time.
		 */
		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
		curr->sched_class->task_tick(rq, curr, 0);
		rq_unlock_irq(rq, &rf);
	}

	/*
	 * Run the remote tick once per second (1Hz). This arbitrary
	 * frequency is large enough to avoid overload but short enough
	 * to keep scheduler internal stats reasonably up to date.
	 */
	queue_delayed_work(system_unbound_wq, dwork, HZ);
}

static void sched_tick_start(int cpu)
{
	struct tick_work *twork;

	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
		return;

	WARN_ON_ONCE(!tick_work_cpu);
	twork = per_cpu_ptr(tick_work_cpu, cpu);
	twork->cpu = cpu;
	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
cancel_delayed_work_sync(&twork->work);
}
#endif /* CONFIG_HOTPLUG_CPU */
int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
return 0;
}
#else /* !CONFIG_NO_HZ_FULL */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif #endif
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
...@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu) ...@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu)
{ {
set_cpu_rq_start_time(cpu); set_cpu_rq_start_time(cpu);
sched_rq_cpu_starting(cpu); sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0; return 0;
} }
...@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu) ...@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu)
/* Handle pending wakeups and then migrate everything off */ /* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending(); sched_ttwu_pending();
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf);
if (rq->rd) { if (rq->rd) {
...@@ -5809,7 +5861,7 @@ int sched_cpu_dying(unsigned int cpu) ...@@ -5809,7 +5861,7 @@ int sched_cpu_dying(unsigned int cpu)
calc_load_migrate(rq); calc_load_migrate(rq);
update_max_interval(); update_max_interval();
nohz_balance_exit_idle(cpu); nohz_balance_exit_idle(rq);
hrtick_clear(rq); hrtick_clear(rq);
return 0; return 0;
} }
...@@ -6022,13 +6074,11 @@ void __init sched_init(void) ...@@ -6022,13 +6074,11 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain); rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
rq->last_load_update_tick = jiffies; rq->last_load_update_tick = jiffies;
rq->nohz_flags = 0; rq->last_blocked_load_update_tick = jiffies;
#endif atomic_set(&rq->nohz_flags, 0);
#ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = 0;
#endif #endif
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
init_rq_hrtick(rq); hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0); atomic_set(&rq->nr_iowait, 0);
} }
...@@ -7027,3 +7077,5 @@ const u32 sched_prio_to_wmult[40] = { ...@@ -7027,3 +7077,5 @@ const u32 sched_prio_to_wmult[40] = {
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
}; };
#undef CREATE_TRACE_POINTS
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <linux/kernel_stat.h>
#include <linux/err.h>
#include "sched.h"
/* /*
* CPU accounting code for task groups. * CPU accounting code for task groups.
* *
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com). * (balbir@in.ibm.com).
*/ */
#include "sched.h"
/* Time spent by the tasks of the cpu accounting group executing in ... */ /* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index { enum cpuacct_stat_index {
CPUACCT_STAT_USER, /* ... user mode */ CPUACCT_STAT_USER, /* ... user mode */
CPUACCT_STAT_SYSTEM, /* ... kernel mode */ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
...@@ -35,10 +24,10 @@ struct cpuacct_usage { ...@@ -35,10 +24,10 @@ struct cpuacct_usage {
u64 usages[CPUACCT_STAT_NSTATS]; u64 usages[CPUACCT_STAT_NSTATS];
}; };
/* track cpu usage of a group of tasks and its child groups */ /* track CPU usage of a group of tasks and its child groups */
struct cpuacct { struct cpuacct {
struct cgroup_subsys_state css; struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */ /* cpuusage holds pointer to a u64-type object on every CPU */
struct cpuacct_usage __percpu *cpuusage; struct cpuacct_usage __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat; struct kernel_cpustat __percpu *cpustat;
}; };
...@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) ...@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
return css ? container_of(css, struct cpuacct, css) : NULL; return css ? container_of(css, struct cpuacct, css) : NULL;
} }
/* return cpu accounting group to which this task belongs */ /* Return CPU accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk) static inline struct cpuacct *task_ca(struct task_struct *tsk)
{ {
return css_ca(task_css(tsk, cpuacct_cgrp_id)); return css_ca(task_css(tsk, cpuacct_cgrp_id));
...@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = { ...@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
.cpuusage = &root_cpuacct_cpuusage, .cpuusage = &root_cpuacct_cpuusage,
}; };
/* create a new cpu accounting group */ /* Create a new CPU accounting group */
static struct cgroup_subsys_state * static struct cgroup_subsys_state *
cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
{ {
...@@ -96,7 +85,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) ...@@ -96,7 +85,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
/* destroy an existing cpu accounting group */ /* Destroy an existing CPU accounting group */
static void cpuacct_css_free(struct cgroup_subsys_state *css) static void cpuacct_css_free(struct cgroup_subsys_state *css)
{ {
struct cpuacct *ca = css_ca(css); struct cpuacct *ca = css_ca(css);
...@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) ...@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
#endif #endif
} }
/* return total cpu usage (in nanoseconds) of a group */ /* Return total CPU usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 __cpuusage_read(struct cgroup_subsys_state *css,
enum cpuacct_stat_index index) enum cpuacct_stat_index index)
{ {
......
...@@ -10,11 +10,7 @@ ...@@ -10,11 +10,7 @@
* as published by the Free Software Foundation; version 2 * as published by the Free Software Foundation; version 2
* of the License. * of the License.
*/ */
#include "sched.h"
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include "cpudeadline.h"
static inline int parent(int i) static inline int parent(int i)
{ {
...@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx) ...@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
return; return;
/* adapted from lib/prio_heap.c */ /* adapted from lib/prio_heap.c */
while(1) { while (1) {
u64 largest_dl; u64 largest_dl;
l = left_child(idx); l = left_child(idx);
r = right_child(idx); r = right_child(idx);
largest = idx; largest = idx;
...@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, ...@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
return 1; return 1;
} else { } else {
int best_cpu = cpudl_maximum(cp); int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
...@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, ...@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
} }
/* /*
* cpudl_clear - remove a cpu from the cpudl max-heap * cpudl_clear - remove a CPU from the cpudl max-heap
* @cp: the cpudl max-heap context * @cp: the cpudl max-heap context
* @cpu: the target cpu * @cpu: the target CPU
* *
* Notes: assumes cpu_rq(cpu)->lock is locked * Notes: assumes cpu_rq(cpu)->lock is locked
* *
...@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu) ...@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
/* /*
* cpudl_set - update the cpudl max-heap * cpudl_set - update the cpudl max-heap
* @cp: the cpudl max-heap context * @cp: the cpudl max-heap context
* @cpu: the target cpu * @cpu: the target CPU
* @dl: the new earliest deadline for this cpu * @dl: the new earliest deadline for this CPU
* *
* Notes: assumes cpu_rq(cpu)->lock is locked * Notes: assumes cpu_rq(cpu)->lock is locked
* *
...@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) ...@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
old_idx = cp->elements[cpu].idx; old_idx = cp->elements[cpu].idx;
if (old_idx == IDX_INVALID) { if (old_idx == IDX_INVALID) {
int new_idx = cp->size++; int new_idx = cp->size++;
cp->elements[new_idx].dl = dl; cp->elements[new_idx].dl = dl;
cp->elements[new_idx].cpu = cpu; cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = new_idx; cp->elements[cpu].idx = new_idx;
...@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) ...@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
/* /*
* cpudl_set_freecpu - Set the cpudl.free_cpus * cpudl_set_freecpu - Set the cpudl.free_cpus
* @cp: the cpudl max-heap context * @cp: the cpudl max-heap context
* @cpu: rd attached cpu * @cpu: rd attached CPU
*/ */
void cpudl_set_freecpu(struct cpudl *cp, int cpu) void cpudl_set_freecpu(struct cpudl *cp, int cpu)
{ {
...@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu) ...@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
/* /*
* cpudl_clear_freecpu - Clear the cpudl.free_cpus * cpudl_clear_freecpu - Clear the cpudl.free_cpus
* @cp: the cpudl max-heap context * @cp: the cpudl max-heap context
* @cpu: rd attached cpu * @cpu: rd attached CPU
*/ */
void cpudl_clear_freecpu(struct cpudl *cp, int cpu) void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
{ {
......
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUDL_H
#define _LINUX_CPUDL_H
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#define IDX_INVALID -1 #define IDX_INVALID -1
...@@ -20,10 +15,8 @@ struct cpudl { ...@@ -20,10 +15,8 @@ struct cpudl {
struct cpudl_item *elements; struct cpudl_item *elements;
}; };
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p, int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu); void cpudl_clear(struct cpudl *cp, int cpu);
int cpudl_init(struct cpudl *cp); int cpudl_init(struct cpudl *cp);
...@@ -31,5 +24,3 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu); ...@@ -31,5 +24,3 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp); void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include "sched.h" #include "sched.h"
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
......
...@@ -11,14 +11,10 @@ ...@@ -11,14 +11,10 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpufreq.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>
#include "sched.h" #include "sched.h"
#include <trace/events/power.h>
struct sugov_tunables { struct sugov_tunables {
struct gov_attr_set attr_set; struct gov_attr_set attr_set;
unsigned int rate_limit_us; unsigned int rate_limit_us;
...@@ -36,7 +32,7 @@ struct sugov_policy { ...@@ -36,7 +32,7 @@ struct sugov_policy {
unsigned int next_freq; unsigned int next_freq;
unsigned int cached_raw_freq; unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used. */ /* The next fields are only needed if fast switch cannot be used: */
struct irq_work irq_work; struct irq_work irq_work;
struct kthread_work work; struct kthread_work work;
struct mutex work_lock; struct mutex work_lock;
...@@ -57,13 +53,12 @@ struct sugov_cpu { ...@@ -57,13 +53,12 @@ struct sugov_cpu {
unsigned int iowait_boost_max; unsigned int iowait_boost_max;
u64 last_update; u64 last_update;
/* The fields below are only needed when sharing a policy. */ /* The fields below are only needed when sharing a policy: */
unsigned long util_cfs; unsigned long util_cfs;
unsigned long util_dl; unsigned long util_dl;
unsigned long max; unsigned long max;
unsigned int flags;
/* The field below is for single-CPU policies only. */ /* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
unsigned long saved_idle_calls; unsigned long saved_idle_calls;
#endif #endif
...@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) ...@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
/* /*
* Since cpufreq_update_util() is called with rq->lock held for * Since cpufreq_update_util() is called with rq->lock held for
* the @target_cpu, our per-cpu data is fully serialized. * the @target_cpu, our per-CPU data is fully serialized.
* *
* However, drivers cannot in general deal with cross-cpu * However, drivers cannot in general deal with cross-CPU
* requests, so while get_next_freq() will work, our * requests, so while get_next_freq() will work, our
* sugov_update_commit() call may not for the fast switching platforms. * sugov_update_commit() call may not for the fast switching platforms.
* *
...@@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) ...@@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
} }
delta_ns = time - sg_policy->last_freq_update_time; delta_ns = time - sg_policy->last_freq_update_time;
return delta_ns >= sg_policy->freq_update_delay_ns; return delta_ns >= sg_policy->freq_update_delay_ns;
} }
...@@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) ...@@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{ {
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util;
if (rq->rt.rt_nr_running) {
util = sg_cpu->max;
} else {
util = sg_cpu->util_dl;
if (rq->cfs.h_nr_running)
util += sg_cpu->util_cfs;
}
/* /*
* Ideally we would like to set util_dl as min/guaranteed freq and * Ideally we would like to set util_dl as min/guaranteed freq and
* util_cfs + util_dl as requested freq. However, cpufreq is not yet * util_cfs + util_dl as requested freq. However, cpufreq is not yet
* ready for such an interface. So, we only do the latter for now. * ready for such an interface. So, we only do the latter for now.
*/ */
return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); return min(util, sg_cpu->max);
} }
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
{ {
if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { if (flags & SCHED_CPUFREQ_IOWAIT) {
if (sg_cpu->iowait_boost_pending) if (sg_cpu->iowait_boost_pending)
return; return;
...@@ -260,27 +267,35 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) ...@@ -260,27 +267,35 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */ #endif /* CONFIG_NO_HZ_COMMON */
/*
* Make sugov_should_update_freq() ignore the rate limit when DL
* has increased the utilization.
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
sg_policy->need_freq_update = true;
}
static void sugov_update_single(struct update_util_data *hook, u64 time, static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags) unsigned int flags)
{ {
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max; unsigned long util, max;
unsigned int next_f; unsigned int next_f;
bool busy; bool busy;
sugov_set_iowait_boost(sg_cpu, time); sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time; sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (!sugov_should_update_freq(sg_policy, time)) if (!sugov_should_update_freq(sg_policy, time))
return; return;
busy = sugov_cpu_is_busy(sg_cpu); busy = sugov_cpu_is_busy(sg_cpu);
if (flags & SCHED_CPUFREQ_RT) {
next_f = policy->cpuinfo.max_freq;
} else {
sugov_get_util(sg_cpu); sugov_get_util(sg_cpu);
max = sg_cpu->max; max = sg_cpu->max;
util = sugov_aggregate_util(sg_cpu); util = sugov_aggregate_util(sg_cpu);
...@@ -296,7 +311,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ...@@ -296,7 +311,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
/* Reset cached freq as next_freq has changed */ /* Reset cached freq as next_freq has changed */
sg_policy->cached_raw_freq = 0; sg_policy->cached_raw_freq = 0;
} }
}
sugov_update_commit(sg_policy, time, next_f); sugov_update_commit(sg_policy, time, next_f);
} }
...@@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) ...@@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
unsigned long j_util, j_max; unsigned long j_util, j_max;
s64 delta_ns; s64 delta_ns;
sugov_get_util(j_sg_cpu);
/* /*
* If the CFS CPU utilization was last updated before the * If the CFS CPU utilization was last updated before the
* previous frequency update and the time elapsed between the * previous frequency update and the time elapsed between the
...@@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) ...@@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
if (delta_ns > TICK_NSEC) { if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost = 0;
j_sg_cpu->iowait_boost_pending = false; j_sg_cpu->iowait_boost_pending = false;
j_sg_cpu->util_cfs = 0;
if (j_sg_cpu->util_dl == 0)
continue;
} }
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
return policy->cpuinfo.max_freq;
j_max = j_sg_cpu->max; j_max = j_sg_cpu->max;
j_util = sugov_aggregate_util(j_sg_cpu); j_util = sugov_aggregate_util(j_sg_cpu);
sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
if (j_util * max > j_max * util) { if (j_util * max > j_max * util) {
util = j_util; util = j_util;
max = j_max; max = j_max;
} }
sugov_iowait_boost(j_sg_cpu, &util, &max);
} }
return get_next_freq(sg_policy, util, max); return get_next_freq(sg_policy, util, max);
} }
static void sugov_update_shared(struct update_util_data *hook, u64 time, static void
unsigned int flags) sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{ {
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct sugov_policy *sg_policy = sg_cpu->sg_policy;
...@@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, ...@@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
raw_spin_lock(&sg_policy->update_lock); raw_spin_lock(&sg_policy->update_lock);
sugov_get_util(sg_cpu); sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->flags = flags;
sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time; sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (sugov_should_update_freq(sg_policy, time)) { if (sugov_should_update_freq(sg_policy, time)) {
if (flags & SCHED_CPUFREQ_RT)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
next_f = sugov_next_freq_shared(sg_cpu, time); next_f = sugov_next_freq_shared(sg_cpu, time);
sugov_update_commit(sg_policy, time, next_f); sugov_update_commit(sg_policy, time, next_f);
} }
...@@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) ...@@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
return sprintf(buf, "%u\n", tunables->rate_limit_us); return sprintf(buf, "%u\n", tunables->rate_limit_us);
} }
static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, static ssize_t
size_t count) rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{ {
struct sugov_tunables *tunables = to_sugov_tunables(attr_set); struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy; struct sugov_policy *sg_policy;
...@@ -676,7 +682,6 @@ static int sugov_start(struct cpufreq_policy *policy) ...@@ -676,7 +682,6 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu)); memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu; sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy; sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = 0;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
} }
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
* *
* going from the lowest priority to the highest. CPUs in the INVALID state * going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with * are not eligible for routing. The system maintains this state with
* a 2 dimensional bitmap (the first for priority class, the second for cpus * a 2 dimensional bitmap (the first for priority class, the second for CPUs
* in that class). Therefore a typical application without affinity * in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a * searches). For tasks with affinity restrictions, the algorithm has a
...@@ -26,12 +26,7 @@ ...@@ -26,12 +26,7 @@
* as published by the Free Software Foundation; version 2 * as published by the Free Software Foundation; version 2
* of the License. * of the License.
*/ */
#include "sched.h"
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/slab.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */ /* Convert between a 140 based task->prio, and our 102 based cpupri */
static int convert_prio(int prio) static int convert_prio(int prio)
...@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, ...@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
} }
/** /**
* cpupri_set - update the cpu priority setting * cpupri_set - update the CPU priority setting
* @cp: The cpupri context * @cp: The cpupri context
* @cpu: The target cpu * @cpu: The target CPU
* @newpri: The priority (INVALID-RT99) to assign to this CPU * @newpri: The priority (INVALID-RT99) to assign to this CPU
* *
* Note: Assumes cpu_rq(cpu)->lock is locked * Note: Assumes cpu_rq(cpu)->lock is locked
...@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) ...@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
return; return;
/* /*
* If the cpu was currently mapped to a different value, we * If the CPU was currently mapped to a different value, we
* need to map it to the new value then remove the old value. * need to map it to the new value then remove the old value.
* Note, we must add the new value first, otherwise we risk the * Note, we must add the new value first, otherwise we risk the
* cpu being missed by the priority loop in cpupri_find. * cpu being missed by the priority loop in cpupri_find.
......
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUPRI_H
#define _LINUX_CPUPRI_H
#include <linux/sched.h>
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
...@@ -22,11 +18,8 @@ struct cpupri { ...@@ -22,11 +18,8 @@ struct cpupri {
}; };
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp, int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri); void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp); int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp);
#endif #endif
#endif /* _LINUX_CPUPRI_H */
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include <linux/sched/cputime.h>
/*
 * Simple CPU accounting cgroup controller
 */
#include "sched.h" #include "sched.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING
...@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index, ...@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
} }
/* /*
* Account user cpu time to a process. * Account user CPU time to a process.
* @p: the process that the cpu time gets accounted to * @p: the process that the CPU time gets accounted to
* @cputime: the cpu time spent in user space since the last update * @cputime: the CPU time spent in user space since the last update
*/ */
void account_user_time(struct task_struct *p, u64 cputime) void account_user_time(struct task_struct *p, u64 cputime)
{ {
...@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime) ...@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
} }
/* /*
* Account guest cpu time to a process. * Account guest CPU time to a process.
* @p: the process that the cpu time gets accounted to * @p: the process that the CPU time gets accounted to
* @cputime: the cpu time spent in virtual machine since the last update * @cputime: the CPU time spent in virtual machine since the last update
*/ */
void account_guest_time(struct task_struct *p, u64 cputime) void account_guest_time(struct task_struct *p, u64 cputime)
{ {
...@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime) ...@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
} }
/* /*
* Account system cpu time to a process and desired cpustat field * Account system CPU time to a process and desired cpustat field
* @p: the process that the cpu time gets accounted to * @p: the process that the CPU time gets accounted to
* @cputime: the cpu time spent in kernel space since the last update * @cputime: the CPU time spent in kernel space since the last update
* @index: pointer to cpustat field that has to be updated * @index: pointer to cpustat field that has to be updated
*/ */
void account_system_index_time(struct task_struct *p, void account_system_index_time(struct task_struct *p,
...@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p, ...@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
} }
/* /*
* Account system cpu time to a process. * Account system CPU time to a process.
* @p: the process that the cpu time gets accounted to * @p: the process that the CPU time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count() * @hardirq_offset: the offset to subtract from hardirq_count()
* @cputime: the cpu time spent in kernel space since the last update * @cputime: the CPU time spent in kernel space since the last update
*/ */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{ {
...@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) ...@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
/* /*
* Account for involuntary wait time. * Account for involuntary wait time.
* @cputime: the cpu time spent in involuntary wait * @cputime: the CPU time spent in involuntary wait
*/ */
void account_steal_time(u64 cputime) void account_steal_time(u64 cputime)
{ {
...@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime) ...@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
/* /*
* Account for idle time. * Account for idle time.
* @cputime: the cpu time spent in idle wait * @cputime: the CPU time spent in idle wait
*/ */
void account_idle_time(u64 cputime) void account_idle_time(u64 cputime)
{ {
...@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) ...@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
#ifdef CONFIG_IRQ_TIME_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING
/* /*
* Account a tick to a process and cpustat * Account a tick to a process and cpustat
* @p: the process that the cpu time gets accounted to * @p: the process that the CPU time gets accounted to
* @user_tick: is the tick from userspace * @user_tick: is the tick from userspace
* @rq: the pointer to rq * @rq: the pointer to rq
* *
...@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks) ...@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
irqtime_account_process_tick(current, 0, rq, ticks); irqtime_account_process_tick(current, 0, rq, ticks);
} }
#else /* CONFIG_IRQ_TIME_ACCOUNTING */ #else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {} static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int nr_ticks) {} struct rq *rq, int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/* /*
* Use precise platform statistics if available: * Use precise platform statistics if available:
*/ */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_VIRT_CPU_ACCOUNTING
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev) void vtime_common_task_switch(struct task_struct *prev)
{ {
if (is_idle_task(prev)) if (is_idle_task(prev))
...@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev) ...@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
vtime_flush(prev); vtime_flush(prev);
arch_vtime_task_switch(prev); arch_vtime_task_switch(prev);
} }
#endif # endif
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
...@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) ...@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
*ut = cputime.utime; *ut = cputime.utime;
*st = cputime.stime; *st = cputime.stime;
} }
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
/* /*
* Account a single tick of cpu time. * Account a single tick of CPU time.
* @p: the process that the cpu time gets accounted to * @p: the process that the CPU time gets accounted to
* @user_tick: indicates if the tick is a user or a system tick * @user_tick: indicates if the tick is a user or a system tick
*/ */
void account_process_tick(struct task_struct *p, int user_tick) void account_process_tick(struct task_struct *p, int user_tick)
......
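The account_*_time() helpers above all follow the same pattern: fold a nanosecond delta into one bucket of a per-CPU statistics structure. A rough userspace sketch of that bucketed accounting, with field names invented for the example rather than taken from kernel_cpustat:

#include <stdio.h>
#include <stdint.h>

/* Toy cpustat: nanosecond counters per accounting class (illustrative only). */
enum toy_cpustat_idx { TOY_USER, TOY_SYSTEM, TOY_GUEST, TOY_STEAL, TOY_IDLE, TOY_NR };

struct toy_cpustat {
	uint64_t field[TOY_NR];
};

/* Fold @cputime nanoseconds into one bucket, in the spirit of account_*_time(). */
static void toy_account(struct toy_cpustat *st, enum toy_cpustat_idx idx, uint64_t cputime)
{
	st->field[idx] += cputime;
}

int main(void)
{
	struct toy_cpustat st = { {0} };

	toy_account(&st, TOY_USER, 3000000);	/* 3 ms spent in user space   */
	toy_account(&st, TOY_SYSTEM, 1000000);	/* 1 ms spent in kernel space */

	printf("user=%llu ns system=%llu ns\n",
	       (unsigned long long)st.field[TOY_USER],
	       (unsigned long long)st.field[TOY_SYSTEM]);
	return 0;
}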
...@@ -17,9 +17,6 @@ ...@@ -17,9 +17,6 @@
*/ */
#include "sched.h" #include "sched.h"
#include <linux/slab.h>
#include <uapi/linux/sched/types.h>
struct dl_bandwidth def_dl_bandwidth; struct dl_bandwidth def_dl_bandwidth;
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
...@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) ...@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
} }
static inline static inline
...@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) ...@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
if (dl_rq->running_bw > old) if (dl_rq->running_bw > old)
dl_rq->running_bw = 0; dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
} }
static inline static inline
...@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head); ...@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
static void push_dl_tasks(struct rq *); static void push_dl_tasks(struct rq *);
static void pull_dl_task(struct rq *); static void pull_dl_task(struct rq *);
static inline void queue_push_tasks(struct rq *rq) static inline void deadline_queue_push_tasks(struct rq *rq)
{ {
if (!has_pushable_dl_tasks(rq)) if (!has_pushable_dl_tasks(rq))
return; return;
...@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq) ...@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
} }
static inline void queue_pull_task(struct rq *rq) static inline void deadline_queue_pull_task(struct rq *rq)
{ {
queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
} }
...@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p ...@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
/* /*
* If we cannot preempt any rq, fall back to pick any * If we cannot preempt any rq, fall back to pick any
* online cpu. * online CPU:
*/ */
cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
if (cpu >= nr_cpu_ids) { if (cpu >= nr_cpu_ids) {
/* /*
* Fail to find any suitable cpu. * Failed to find any suitable CPU.
* The task will never come back! * The task will never come back!
*/ */
BUG_ON(dl_bandwidth_enabled()); BUG_ON(dl_bandwidth_enabled());
...@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq) ...@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
{ {
} }
static inline void queue_push_tasks(struct rq *rq) static inline void deadline_queue_push_tasks(struct rq *rq)
{ {
} }
static inline void queue_pull_task(struct rq *rq) static inline void deadline_queue_pull_task(struct rq *rq)
{ {
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
int flags);
/* /*
* We are being explicitly informed that a new instance is starting, * We are being explicitly informed that a new instance is starting,
...@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) ...@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (hrtick_enabled(rq)) if (hrtick_enabled(rq))
start_hrtick_dl(rq, p); start_hrtick_dl(rq, p);
queue_push_tasks(rq); deadline_queue_push_tasks(rq);
return p; return p;
} }
...@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) ...@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
enqueue_pushable_dl_task(rq, p); enqueue_pushable_dl_task(rq, p);
} }
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{ {
update_curr_dl(rq); update_curr_dl(rq);
...@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task) ...@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
/* /*
* We have to consider system topology and task affinity * We have to consider system topology and task affinity
* first, then we can look for a suitable cpu. * first, then we can look for a suitable CPU.
*/ */
if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1; return -1;
...@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task) ...@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
* Now we check how well this matches with task's * Now we check how well this matches with task's
* affinity and system topology. * affinity and system topology.
* *
* The last cpu where the task ran is our first * The last CPU where the task ran is our first
* guess, since it is most likely cache-hot there. * guess, since it is most likely cache-hot there.
*/ */
if (cpumask_test_cpu(cpu, later_mask)) if (cpumask_test_cpu(cpu, later_mask))
...@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task) ...@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
best_cpu = cpumask_first_and(later_mask, best_cpu = cpumask_first_and(later_mask,
sched_domain_span(sd)); sched_domain_span(sd));
/* /*
* Last chance: if a cpu being in both later_mask * Last chance: if a CPU being in both later_mask
* and current sd span is valid, that becomes our * and current sd span is valid, that becomes our
* choice. Of course, the latest possible cpu is * choice. Of course, the latest possible CPU is
* already under consideration through later_mask. * already under consideration through later_mask.
*/ */
if (best_cpu < nr_cpu_ids) { if (best_cpu < nr_cpu_ids) {
...@@ -2067,7 +2071,7 @@ static int push_dl_task(struct rq *rq) ...@@ -2067,7 +2071,7 @@ static int push_dl_task(struct rq *rq)
if (task == next_task) { if (task == next_task) {
/* /*
* The task is still there. We don't try * The task is still there. We don't try
* again, some other cpu will pull it when ready. * again, some other CPU will pull it when ready.
*/ */
goto out; goto out;
} }
...@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) ...@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
/* /*
* Since this might be the only -deadline task on the rq, * Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one * this is the right place to try to pull some other one
* from an overloaded cpu, if any. * from an overloaded CPU, if any.
*/ */
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return; return;
queue_pull_task(rq); deadline_queue_pull_task(rq);
} }
/* /*
...@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) ...@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (rq->curr != p) { if (rq->curr != p) {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
queue_push_tasks(rq); deadline_queue_push_tasks(rq);
#endif #endif
if (dl_task(rq->curr)) if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0); check_preempt_curr_dl(rq, p, 0);
...@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, ...@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
* or lowering its prio, so... * or lowering its prio, so...
*/ */
if (!rq->dl.overloaded) if (!rq->dl.overloaded)
queue_pull_task(rq); deadline_queue_pull_task(rq);
/* /*
* If we now have an earlier deadline task than p, * If we now have an earlier deadline task than p,
...@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) ...@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{ {
unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, unsigned int dest_cpu;
cs_cpus_allowed);
struct dl_bw *dl_b; struct dl_bw *dl_b;
bool overflow; bool overflow;
int cpus, ret; int cpus, ret;
unsigned long flags; unsigned long flags;
dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
rcu_read_lock_sched(); rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu); dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags); raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(dest_cpu); cpus = dl_bw_cpus(dest_cpu);
overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
if (overflow) if (overflow) {
ret = -EBUSY; ret = -EBUSY;
else { } else {
/* /*
* We reserve space for this task in the destination * We reserve space for this task in the destination
* root_domain, as we can't fail after this point. * root_domain, as we can't fail after this point.
...@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo ...@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
} }
raw_spin_unlock_irqrestore(&dl_b->lock, flags); raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched(); rcu_read_unlock_sched();
return ret; return ret;
} }
...@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, ...@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
ret = 0; ret = 0;
raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
rcu_read_unlock_sched(); rcu_read_unlock_sched();
return ret; return ret;
} }
...@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu) ...@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu)
overflow = __dl_overflow(dl_b, cpus, 0, 0); overflow = __dl_overflow(dl_b, cpus, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags); raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched(); rcu_read_unlock_sched();
return overflow; return overflow;
} }
#endif #endif
......
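dl_task_can_attach() above is a bandwidth admission test: under the dl_bw lock it checks whether the task's bandwidth would overflow the destination root domain and reserves it only if not. A hedged sketch of that check, with an assumed fixed-point scale and deliberately simplified accounting (the kernel's __dl_overflow() also accounts for the task's old bandwidth):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define BW_SHIFT 20			/* assumed fixed-point scale: 1.0 == 1 << 20 */
#define BW_UNIT  (1u << BW_SHIFT)

/* Toy per-domain deadline bandwidth accounting. */
struct toy_dl_bw {
	uint64_t bw;		/* allowed bandwidth per CPU (fixed point) */
	uint64_t total_bw;	/* bandwidth already reserved in the domain */
};

/* Would admitting a task with @new_bw overflow a domain spanning @cpus CPUs? */
static bool toy_dl_overflow(const struct toy_dl_bw *dl_b, int cpus, uint64_t new_bw)
{
	return dl_b->total_bw + new_bw > (uint64_t)cpus * dl_b->bw;
}

/* Admit the task only if the capacity test passes, then reserve its share. */
static int toy_dl_task_can_attach(struct toy_dl_bw *dl_b, int cpus, uint64_t new_bw)
{
	if (toy_dl_overflow(dl_b, cpus, new_bw))
		return -1;		/* the kernel returns -EBUSY here */
	dl_b->total_bw += new_bw;
	return 0;
}

int main(void)
{
	/* 95% of each CPU available to deadline tasks, nothing reserved yet. */
	struct toy_dl_bw dl_b = { .bw = (95 * BW_UNIT) / 100, .total_bw = 0 };

	/* A task using 30% of one CPU, admitted onto a 2-CPU domain. */
	printf("attach: %d\n", toy_dl_task_can_attach(&dl_b, 2, (30 * BW_UNIT) / 100));
	return 0;
}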
/* /*
* kernel/sched/debug.c * kernel/sched/debug.c
* *
* Print the CFS rbtree * Print the CFS rbtree and other debugging details
* *
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
* *
...@@ -9,16 +9,6 @@ ...@@ -9,16 +9,6 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <linux/proc_fs.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
#include <linux/mempolicy.h>
#include <linux/debugfs.h>
#include "sched.h" #include "sched.h"
static DEFINE_SPINLOCK(sched_debug_lock); static DEFINE_SPINLOCK(sched_debug_lock);
...@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) ...@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
if (table == NULL) if (table == NULL)
return NULL; return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval, set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval, set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[2], "busy_idx", &sd->busy_idx, set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[3], "idle_idx", &sd->idle_idx, set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx, set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost",
&sd->max_newidle_lb_cost,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */ /* &table[13] is terminator */
return table; return table;
...@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group ...@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
{ {
struct sched_entity *se = tg->se[cpu]; struct sched_entity *se = tg->se[cpu];
#define P(F) \ #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
#define P_SCHEDSTAT(F) \ #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) #define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN_SCHEDSTAT(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
if (!se) if (!se)
return; return;
...@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group ...@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start); PN(se->exec_start);
PN(se->vruntime); PN(se->vruntime);
PN(se->sum_exec_runtime); PN(se->sum_exec_runtime);
if (schedstat_enabled()) { if (schedstat_enabled()) {
PN_SCHEDSTAT(se->statistics.wait_start); PN_SCHEDSTAT(se->statistics.wait_start);
PN_SCHEDSTAT(se->statistics.sleep_start); PN_SCHEDSTAT(se->statistics.sleep_start);
...@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group ...@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN_SCHEDSTAT(se->statistics.wait_sum); PN_SCHEDSTAT(se->statistics.wait_sum);
P_SCHEDSTAT(se->statistics.wait_count); P_SCHEDSTAT(se->statistics.wait_count);
} }
P(se->load.weight); P(se->load.weight);
P(se->runnable_weight); P(se->runnable_weight);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
...@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg) ...@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
return group_path; return group_path;
cgroup_path(tg->css.cgroup, group_path, PATH_MAX); cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
return group_path; return group_path;
} }
#endif #endif
...@@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) ...@@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.runnable_load_avg); cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg", SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg); cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
cfs_rq->avg.util_est.enqueued);
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg); cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
...@@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void) ...@@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void)
/* /*
* This iterator needs some explanation. * It returns 1 for the header position.
* It returns 1 for the header position. * It returns 1 for the header position.
* This means 2 is cpu 0. * This means 2 is CPU 0.
* In a hotplugged system some cpus, including cpu 0, may be missing so we have * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
* to use cpumask_* to iterate over the cpus. * to use cpumask_* to iterate over the CPUs.
*/ */
static void *sched_debug_start(struct seq_file *file, loff_t *offset) static void *sched_debug_start(struct seq_file *file, loff_t *offset)
{ {
...@@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) ...@@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
if (n < nr_cpu_ids) if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2); return (void *)(unsigned long)(n + 2);
return NULL; return NULL;
} }
...@@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void) ...@@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs); __initcall(init_sched_debug_procfs);
#define __P(F) \ #define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define P(F) \ #define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) #define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#define __PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING
...@@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ...@@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.runnable_load_avg); P(se.avg.runnable_load_avg);
P(se.avg.util_avg); P(se.avg.util_avg);
P(se.avg.last_update_time); P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
P(se.avg.util_est.enqueued);
#endif #endif
P(policy); P(policy);
P(prio); P(prio);
......
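The compacted P()/PN() helpers above rely on preprocessor stringification to print a field's name next to its value. A minimal standalone sketch of the same macro technique, using an invented structure purely for illustration (the time formatting here is seconds.microseconds for readability, not the kernel's SPLIT_NS()):

#include <stdio.h>

/* Print "<field name>: <value>" using # stringification. */
#define P(F)	printf("  .%-30s: %lld\n", #F, (long long)(F))
#define PN(F)	printf("  .%-30s: %lld.%06lld\n", #F,			\
		       (long long)(F) / 1000000000LL,			\
		       ((long long)(F) % 1000000000LL) / 1000LL)

/* Invented structure standing in for a sched_entity. */
struct toy_se {
	long long exec_start;		/* ns */
	long long sum_exec_runtime;	/* ns */
	long load_weight;
};

int main(void)
{
	struct toy_se se = { 123456789012LL, 987654321LL, 1024 };

	PN(se.exec_start);
	PN(se.sum_exec_runtime);
	P(se.load_weight);
	return 0;
}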
...@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) ...@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
SCHED_FEAT(WA_IDLE, true) SCHED_FEAT(WA_IDLE, true)
SCHED_FEAT(WA_WEIGHT, true) SCHED_FEAT(WA_WEIGHT, true)
SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(WA_BIAS, true)
/*
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
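UTIL_EST enables the estimated-utilization signal built on top of PELT: besides the instantaneous value, an exponentially weighted moving average of the task's utilization is maintained. The sketch below shows that style of EWMA update; the 1/4 weight and the helper name are assumptions for the example, not the kernel's exact code:

#include <stdio.h>

#define TOY_EWMA_SHIFT 2	/* assumed weight: a new sample contributes 1/4 */

/*
 * One EWMA step: ewma += (sample - ewma) / 2^shift.
 * Illustrates the idea behind util_est.ewma, not the kernel implementation.
 */
static unsigned long toy_util_est_update(unsigned long ewma, unsigned long sample)
{
	long delta = (long)sample - (long)ewma;

	return (unsigned long)((long)ewma + delta / (1 << TOY_EWMA_SHIFT));
}

int main(void)
{
	unsigned long ewma = 0;
	unsigned long samples[] = { 400, 420, 100, 410, 430 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		ewma = toy_util_est_update(ewma, samples[i]);
		printf("sample=%lu ewma=%lu\n", samples[i], ewma);
	}
	return 0;
}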
/* /*
* Generic entry point for the idle threads * Generic entry points for the idle threads and
* implementation of the idle task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/ */
#include <linux/sched.h> #include "sched.h"
#include <linux/sched/idle.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
#include <linux/cpuhotplug.h>
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
#include <linux/suspend.h>
#include <linux/livepatch.h>
#include <asm/tlb.h>
#include <trace/events/power.h> #include <trace/events/power.h>
#include "sched.h"
/* Linker adds these: start and end of __cpuidle functions */ /* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[]; extern char __cpuidle_text_start[], __cpuidle_text_end[];
...@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable) ...@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
static int __init cpu_idle_poll_setup(char *__unused) static int __init cpu_idle_poll_setup(char *__unused)
{ {
cpu_idle_force_poll = 1; cpu_idle_force_poll = 1;
return 1; return 1;
} }
__setup("nohlt", cpu_idle_poll_setup); __setup("nohlt", cpu_idle_poll_setup);
...@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup); ...@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused) static int __init cpu_idle_nopoll_setup(char *__unused)
{ {
cpu_idle_force_poll = 0; cpu_idle_force_poll = 0;
return 1; return 1;
} }
__setup("hlt", cpu_idle_nopoll_setup); __setup("hlt", cpu_idle_nopoll_setup);
...@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void) ...@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
trace_cpu_idle_rcuidle(0, smp_processor_id()); trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable(); local_irq_enable();
stop_critical_timings(); stop_critical_timings();
while (!tif_need_resched() && while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired())) (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax(); cpu_relax();
start_critical_timings(); start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit(); rcu_idle_exit();
return 1; return 1;
} }
...@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state) ...@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state)
{ {
/* /*
* This #ifdef needs to die, but it's too late in the cycle to * This #ifdef needs to die, but it's too late in the cycle to
* make this generic (arm and sh have never invoked the canary * make this generic (ARM and SH have never invoked the canary
* init for the non boot cpus!). Will be fixed in 3.11 * init for the non boot CPUs!). Will be fixed in 3.11
*/ */
#ifdef CONFIG_X86 #ifdef CONFIG_X86
/* /*
...@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state) ...@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
while (1) while (1)
do_idle(); do_idle();
} }
/*
* idle-task scheduling class.
*/
#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks are never migrated */
}
#endif
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_lock_irq(&rq->lock);
}
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
static void set_curr_task_idle(struct rq *rq)
{
}
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
.check_preempt_curr = check_preempt_curr_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};
// SPDX-License-Identifier: GPL-2.0
#include "sched.h"
/*
* idle-task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE tasks which are
* handled in sched/fair.c)
*/
#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks are never migrated */
}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_lock_irq(&rq->lock);
}
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
rq_last_tick_reset(rq);
}
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
static void set_curr_task_idle(struct rq *rq)
{
}
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
.check_preempt_curr = check_preempt_curr_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};
...@@ -3,15 +3,10 @@ ...@@ -3,15 +3,10 @@
* any CPU: unbound workqueues, timers, kthreads and any offloadable work. * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
* *
* Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
* *
*/ */
#include "sched.h"
#include <linux/sched/isolation.h>
#include <linux/tick.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/static_key.h>
#include <linux/ctype.h>
DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
EXPORT_SYMBOL_GPL(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden);
...@@ -60,6 +55,9 @@ void __init housekeeping_init(void) ...@@ -60,6 +55,9 @@ void __init housekeeping_init(void)
static_branch_enable(&housekeeping_overriden); static_branch_enable(&housekeeping_overriden);
if (housekeeping_flags & HK_FLAG_TICK)
sched_tick_offload_init();
/* We need at least one CPU to handle housekeeping work */ /* We need at least one CPU to handle housekeeping work */
WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
} }
...@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str) ...@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
{ {
unsigned int flags; unsigned int flags;
flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
return housekeeping_setup(str, flags); return housekeeping_setup(str, flags);
} }
......
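Housekeeping behaviour above is composed from OR-ed HK_FLAG_* bits, and nohz_full= now pulls in the workqueue flag as well so the offloaded 1Hz tick work stays on housekeeping CPUs. A toy sketch of that flag-composition pattern, with invented flag names standing in for the kernel's:

#include <stdio.h>
#include <stdbool.h>

/* Invented stand-ins for the HK_FLAG_* bits (illustrative only). */
#define TOY_HK_TICK   (1u << 0)
#define TOY_HK_WQ     (1u << 1)
#define TOY_HK_TIMER  (1u << 2)
#define TOY_HK_RCU    (1u << 3)
#define TOY_HK_MISC   (1u << 4)

static unsigned int toy_housekeeping_flags;

/* nohz_full= style setup: one boot option enables a whole bundle of offloads. */
static void toy_nohz_full_setup(void)
{
	toy_housekeeping_flags = TOY_HK_TICK | TOY_HK_WQ |
				 TOY_HK_TIMER | TOY_HK_RCU | TOY_HK_MISC;
}

static bool toy_housekeeping_enabled(unsigned int flag)
{
	return (toy_housekeeping_flags & flag) != 0;
}

int main(void)
{
	toy_nohz_full_setup();

	if (toy_housekeeping_enabled(TOY_HK_TICK))
		printf("1Hz tick work would be offloaded to housekeeping CPUs\n");
	return 0;
}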
...@@ -6,10 +6,6 @@ ...@@ -6,10 +6,6 @@
* figure. It's a silly number but people think it's important. We go through * great pains to make it work on big machines and tickless kernels.
* great pains to make it work on big machines and tickless kernels. * great pains to make it work on big machines and tickless kernels.
*/ */
#include <linux/export.h>
#include <linux/sched/loadavg.h>
#include "sched.h" #include "sched.h"
/* /*
...@@ -32,29 +28,29 @@ ...@@ -32,29 +28,29 @@
* Due to a number of reasons the above turns into the mess below: * Due to a number of reasons the above turns into the mess below:
* *
* - for_each_possible_cpu() is prohibitively expensive on machines with * - for_each_possible_cpu() is prohibitively expensive on machines with
* serious number of cpus, therefore we need to take a distributed approach * serious number of CPUs, therefore we need to take a distributed approach
* to calculating nr_active. * to calculating nr_active.
* *
* \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
* = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
* *
* So assuming nr_active := 0 when we start out -- true per definition, we * So assuming nr_active := 0 when we start out -- true per definition, we
* can simply take per-cpu deltas and fold those into a global accumulate * can simply take per-CPU deltas and fold those into a global accumulate
* to obtain the same result. See calc_load_fold_active(). * to obtain the same result. See calc_load_fold_active().
* *
* Furthermore, in order to avoid synchronizing all per-cpu delta folding * Furthermore, in order to avoid synchronizing all per-CPU delta folding
* across the machine, we assume 10 ticks is sufficient time for every * across the machine, we assume 10 ticks is sufficient time for every
* cpu to have completed this task. * CPU to have completed this task.
* *
* This places an upper-bound on the IRQ-off latency of the machine. Then * This places an upper-bound on the IRQ-off latency of the machine. Then
* again, being late doesn't lose the delta, just wrecks the sample. * again, being late doesn't lose the delta, just wrecks the sample.
* *
* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
* this would add another cross-cpu cacheline miss and atomic operation * this would add another cross-CPU cacheline miss and atomic operation
* to the wakeup path. Instead we increment on whatever cpu the task ran * to the wakeup path. Instead we increment on whatever CPU the task ran
* when it went into uninterruptible state and decrement on whatever cpu * when it went into uninterruptible state and decrement on whatever CPU
* did the wakeup. This means that only the sum of nr_uninterruptible over * did the wakeup. This means that only the sum of nr_uninterruptible over
* all cpus yields the correct result. * all CPUs yields the correct result.
* *
* This covers the NO_HZ=n code, for extra head-aches, see the comment below. * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
*/ */
...@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) ...@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* Handle NO_HZ for the global load-average. * Handle NO_HZ for the global load-average.
* *
* Since the above described distributed algorithm to compute the global * Since the above described distributed algorithm to compute the global
* load-average relies on per-cpu sampling from the tick, it is affected by * load-average relies on per-CPU sampling from the tick, it is affected by
* NO_HZ. * NO_HZ.
* *
* The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
* entering NO_HZ state such that we can include this as an 'extra' cpu delta * entering NO_HZ state such that we can include this as an 'extra' CPU delta
* when we read the global state. * when we read the global state.
* *
* Obviously reality has to ruin such a delightfully simple scheme: * Obviously reality has to ruin such a delightfully simple scheme:
...@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) ...@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* busy state. * busy state.
* *
* This is solved by pushing the window forward, and thus skipping the * This is solved by pushing the window forward, and thus skipping the
* sample, for this cpu (effectively using the NO_HZ-delta for this cpu which * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
* was in effect at the time the window opened). This also solves the issue * was in effect at the time the window opened). This also solves the issue
* of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
* intervals. * intervals.
* *
* When making the ILB scale, we should try to pull this in as well. * When making the ILB scale, we should try to pull this in as well.
...@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp, ...@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
} }
/* /*
* NO_HZ can leave us missing all per-cpu ticks calling * NO_HZ can leave us missing all per-CPU ticks calling
* calc_load_fold_active(), but since a NO_HZ CPU folds its delta into * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
* calc_load_nohz per calc_load_nohz_start(), all we need to do is fold * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
* in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
...@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks) ...@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
return; return;
/* /*
* Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/ */
delta = calc_load_nohz_fold(); delta = calc_load_nohz_fold();
if (delta) if (delta)
......
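The comments above describe the distributed nr_active scheme: each CPU folds only the delta of its own active count since the last fold, and the global figure is the running sum of those deltas. A short sketch of calc_load_fold_active()-style folding, simplified and with invented names:

#include <stdio.h>

#define TOY_NR_CPUS 4

/* Per-CPU state: current counts plus the value folded at the last sample. */
struct toy_rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;	/* snapshot already folded into the global count */
};

static long toy_calc_load_tasks;	/* global accumulator */

/* Fold one CPU's delta into the global count, like calc_load_fold_active(). */
static void toy_fold_active(struct toy_rq *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = nr_active - rq->calc_load_active;

	rq->calc_load_active = nr_active;
	toy_calc_load_tasks += delta;
}

int main(void)
{
	struct toy_rq rqs[TOY_NR_CPUS] = {
		{ 2, 0, 0 }, { 1, 1, 0 }, { 0, 0, 0 }, { 3, 0, 0 },
	};
	int cpu;

	for (cpu = 0; cpu < TOY_NR_CPUS; cpu++)
		toy_fold_active(&rqs[cpu]);

	printf("global nr_active = %ld\n", toy_calc_load_tasks);
	return 0;
}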
...@@ -13,14 +13,7 @@ ...@@ -13,14 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details. * GNU General Public License for more details.
*/ */
#include "sched.h"
#include <linux/syscalls.h>
#include <linux/membarrier.h>
#include <linux/tick.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>
#include "sched.h" /* for cpu_rq(). */
/* /*
* Bitmask made from a "or" of all commands within enum membarrier_cmd, * Bitmask made from a "or" of all commands within enum membarrier_cmd,
...@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void) ...@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
*/ */
if (cpu == raw_smp_processor_id()) if (cpu == raw_smp_processor_id())
continue; continue;
rcu_read_lock(); rcu_read_lock();
p = task_rcu_dereference(&cpu_rq(cpu)->curr); p = task_rcu_dereference(&cpu_rq(cpu)->curr);
if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
...@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags) ...@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
* rq->curr modification in scheduler. * rq->curr modification in scheduler.
*/ */
smp_mb(); /* exit from system call is not a mb */ smp_mb(); /* exit from system call is not a mb */
return 0; return 0;
} }
...@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void) ...@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
} }
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
&mm->membarrier_state); &mm->membarrier_state);
return 0; return 0;
} }
...@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags) ...@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
synchronize_sched(); synchronize_sched();
} }
atomic_or(state, &mm->membarrier_state); atomic_or(state, &mm->membarrier_state);
return 0; return 0;
} }
......
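membarrier_global_expedited() above walks the online CPUs, skips the caller, and only targets CPUs whose current task has registered for the command, as recorded in a per-mm state bitmask. A stripped-down, single-threaded sketch of that filtering loop (no IPIs or real barriers, names invented for the example):

#include <stdio.h>

#define TOY_NR_CPUS 4
#define TOY_MEMBARRIER_GLOBAL_EXPEDITED_READY (1u << 0)	/* invented bit */

/* Per-CPU "current task" state for the sketch. */
struct toy_task {
	unsigned int membarrier_state;
};

static struct toy_task toy_curr[TOY_NR_CPUS] = {
	{ TOY_MEMBARRIER_GLOBAL_EXPEDITED_READY },
	{ 0 },
	{ TOY_MEMBARRIER_GLOBAL_EXPEDITED_READY },
	{ 0 },
};

/* Decide which CPUs would receive the expedited-barrier IPI. */
static void toy_global_expedited(int this_cpu)
{
	int cpu;

	for (cpu = 0; cpu < TOY_NR_CPUS; cpu++) {
		if (cpu == this_cpu)
			continue;	/* the caller already ordered itself */
		if (!(toy_curr[cpu].membarrier_state &
		      TOY_MEMBARRIER_GLOBAL_EXPEDITED_READY))
			continue;	/* task never registered, skip it */
		printf("would IPI cpu %d\n", cpu);
	}
}

int main(void)
{
	toy_global_expedited(0);
	return 0;
}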
...@@ -3,12 +3,8 @@ ...@@ -3,12 +3,8 @@
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies) * policies)
*/ */
#include "sched.h" #include "sched.h"
#include <linux/slab.h>
#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE; int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
...@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head); ...@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
static void push_rt_tasks(struct rq *); static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *); static void pull_rt_task(struct rq *);
static inline void queue_push_tasks(struct rq *rq) static inline void rt_queue_push_tasks(struct rq *rq)
{ {
if (!has_pushable_tasks(rq)) if (!has_pushable_tasks(rq))
return; return;
...@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq) ...@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
} }
static inline void queue_pull_task(struct rq *rq) static inline void rt_queue_pull_task(struct rq *rq)
{ {
queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
} }
...@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq) ...@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
{ {
} }
static inline void queue_push_tasks(struct rq *rq) static inline void rt_queue_push_tasks(struct rq *rq)
{ {
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
...@@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq) ...@@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0)) if (unlikely((s64)delta_exec <= 0))
return; return;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
schedstat_set(curr->se.statistics.exec_max, schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec)); max(curr->se.statistics.exec_max, delta_exec));
...@@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) ...@@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
sub_nr_running(rq, rt_rq->rt_nr_running); sub_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 0; rt_rq->rt_queued = 0;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);
} }
static void static void
...@@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) ...@@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
add_nr_running(rq, rt_rq->rt_nr_running); add_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 1; rt_rq->rt_queued = 1;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);
} }
#if defined CONFIG_SMP #if defined CONFIG_SMP
...@@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) ...@@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
return; return;
/* /*
* There appears to be other cpus that can accept * There appear to be other CPUs that can accept
* current and none to run 'p', so let's reschedule * the current task but none can run 'p', so let's reschedule
* to try and push current away: * to try and push the current task away:
*/ */
requeue_task_rt(rq, p, 1); requeue_task_rt(rq, p, 1);
resched_curr(rq); resched_curr(rq);
...@@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) ...@@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* The running task is never eligible for pushing */ /* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p); dequeue_pushable_task(rq, p);
queue_push_tasks(rq); rt_queue_push_tasks(rq);
return p; return p;
} }
...@@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) ...@@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
if (!task_running(rq, p) && if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_allowed)) cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1; return 1;
return 0; return 0;
} }
/* /*
* Return the highest pushable rq's task, which is suitable to be executed * Return the highest pushable rq's task, which is suitable to be executed
* on the cpu, NULL otherwise * on the CPU, NULL otherwise
*/ */
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{ {
...@@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task) ...@@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task)
return -1; /* No targets found */ return -1; /* No targets found */
/* /*
* At this point we have built a mask of cpus representing the * At this point we have built a mask of CPUs representing the
* lowest priority tasks in the system. Now we want to elect * lowest priority tasks in the system. Now we want to elect
* the best one based on our affinity and topology. * the best one based on our affinity and topology.
* *
* We prioritize the last cpu that the task executed on since * We prioritize the last CPU that the task executed on since
* it is most likely cache-hot in that location. * it is most likely cache-hot in that location.
*/ */
if (cpumask_test_cpu(cpu, lowest_mask)) if (cpumask_test_cpu(cpu, lowest_mask))
...@@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task) ...@@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task)
/* /*
* Otherwise, we consult the sched_domains span maps to figure * Otherwise, we consult the sched_domains span maps to figure
* out which cpu is logically closest to our hot cache data. * out which CPU is logically closest to our hot cache data.
*/ */
if (!cpumask_test_cpu(this_cpu, lowest_mask)) if (!cpumask_test_cpu(this_cpu, lowest_mask))
this_cpu = -1; /* Skip this_cpu opt if not among lowest */ this_cpu = -1; /* Skip this_cpu opt if not among lowest */
...@@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task) ...@@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task)
cpu = cpumask_any(lowest_mask); cpu = cpumask_any(lowest_mask);
if (cpu < nr_cpu_ids) if (cpu < nr_cpu_ids)
return cpu; return cpu;
return -1; return -1;
} }
...@@ -1827,7 +1828,7 @@ static int push_rt_task(struct rq *rq) ...@@ -1827,7 +1828,7 @@ static int push_rt_task(struct rq *rq)
* The task hasn't migrated, and is still the next * The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue * eligible task, but we failed to find a run-queue
* to push it to. Do not retry in this case, since * to push it to. Do not retry in this case, since
* other cpus will pull from us when ready. * other CPUs will pull from us when ready.
*/ */
goto out; goto out;
} }
...@@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd) ...@@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd)
* rt_next_cpu() will simply return the first CPU found in * rt_next_cpu() will simply return the first CPU found in
* the rto_mask. * the rto_mask.
* *
* If rto_next_cpu() is called with rto_cpu is a valid cpu, it * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
* will return the next CPU found in the rto_mask. * will return the next CPU found in the rto_mask.
* *
* If there are no more CPUs left in the rto_mask, then a check is made * If there are no more CPUs left in the rto_mask, then a check is made
...@@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq) ...@@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq)
raw_spin_lock(&rq->rd->rto_lock); raw_spin_lock(&rq->rd->rto_lock);
/* /*
* The rto_cpu is updated under the lock, if it has a valid cpu * The rto_cpu is updated under the lock, if it has a valid CPU
* then the IPI is still running and will continue due to the * then the IPI is still running and will continue due to the
* update to loop_next, and nothing needs to be done here. * update to loop_next, and nothing needs to be done here.
* Otherwise it is finishing up and an ipi needs to be sent. * Otherwise it is finishing up and an ipi needs to be sent.
...@@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq) ...@@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq)
/* /*
* There's a chance that p is higher in priority * There's a chance that p is higher in priority
* than what's currently running on its cpu. * than what's currently running on its CPU.
* This is just that p is waking up and hasn't * had a chance to schedule. We only pull
* had a chance to schedule. We only pull * had a chance to schedule. We only pull
* p if it is lower in priority than the * p if it is lower in priority than the
...@@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) ...@@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return; return;
queue_pull_task(rq); rt_queue_pull_task(rq);
} }
void __init init_sched_rt_class(void) void __init init_sched_rt_class(void)
...@@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) ...@@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && rq->curr != p) { if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq); rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
resched_curr(rq); resched_curr(rq);
...@@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) ...@@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* may need to pull tasks to this runqueue. * may need to pull tasks to this runqueue.
*/ */
if (oldprio < p->prio) if (oldprio < p->prio)
queue_pull_task(rq); rt_queue_pull_task(rq);
/* /*
* If there's a higher priority task waiting to run * If there's a higher priority task waiting to run
...@@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p) ...@@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static inline void watchdog(struct rq *rq, struct task_struct *p) { } static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif #endif
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{ {
struct sched_rt_entity *rt_se = &p->rt; struct sched_rt_entity *rt_se = &p->rt;
...@@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write, ...@@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
msecs_to_jiffies(sysctl_sched_rr_timeslice); msecs_to_jiffies(sysctl_sched_rr_timeslice);
} }
mutex_unlock(&mutex); mutex_unlock(&mutex);
return ret; return ret;
} }
......
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
/*
#include <linux/slab.h> * /proc/schedstat implementation
#include <linux/fs.h> */
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include "sched.h" #include "sched.h"
/* /*
* bump this up when changing the output format or the meaning of an existing * Current schedstat API version.
*
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort) * format, so that tools can adapt (or abort)
*/ */
#define SCHEDSTAT_VERSION 15 #define SCHEDSTAT_VERSION 15
...@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v) ...@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
* This iterator needs some explanation. * This iterator needs some explanation.
* It returns 1 for the header position. * It returns 1 for the header position.
* This means 2 is cpu 0. * This means 2 is cpu 0.
* In a hotplugged system some cpus, including cpu 0, may be missing so we have * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
* to use cpumask_* to iterate over the cpus. * to use cpumask_* to iterate over the CPUs.
*/ */
static void *schedstat_start(struct seq_file *file, loff_t *offset) static void *schedstat_start(struct seq_file *file, loff_t *offset)
{ {
...@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset) ...@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
if (n < nr_cpu_ids) if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2); return (void *)(unsigned long)(n + 2);
return NULL; return NULL;
} }
static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
{ {
(*offset)++; (*offset)++;
return schedstat_start(file, offset); return schedstat_start(file, offset);
} }
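To make the position mapping described above concrete, the sketch below paraphrases the start routine end to end (illustrative; assumes <linux/seq_file.h> and <linux/cpumask.h> and iteration over cpu_online_mask):

static void *schedstat_start_sketch(struct seq_file *file, loff_t *offset)
{
	unsigned long n = *offset;

	if (n == 0)
		return (void *) 1;			/* position 1: the header line */

	n--;						/* remaining positions index CPUs */
	if (n > 0)
		n = cpumask_next(n - 1, cpu_online_mask);
	else
		n = cpumask_first(cpu_online_mask);

	*offset = n + 1;

	if (n < nr_cpu_ids)
		return (void *)(unsigned long)(n + 2);	/* position n + 2 == CPU n */

	return NULL;					/* past the last online CPU */
}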
...@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = { ...@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = {
static int __init proc_schedstat_init(void) static int __init proc_schedstat_init(void)
{ {
proc_create("schedstat", 0, NULL, &proc_schedstat_operations); proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
return 0; return 0;
} }
subsys_initcall(proc_schedstat_init); subsys_initcall(proc_schedstat_init);
...@@ -40,25 +40,19 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) ...@@ -40,25 +40,19 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
#define schedstat_val(var) (var) #define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
#else /* !CONFIG_SCHEDSTATS */ #else /* !CONFIG_SCHEDSTATS: */
static inline void static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
rq_sched_info_arrive(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
{} static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
static inline void # define schedstat_enabled() 0
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) # define __schedstat_inc(var) do { } while (0)
{} # define schedstat_inc(var) do { } while (0)
static inline void # define __schedstat_add(var, amt) do { } while (0)
rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_add(var, amt) do { } while (0)
{} # define __schedstat_set(var, val) do { } while (0)
#define schedstat_enabled() 0 # define schedstat_set(var, val) do { } while (0)
#define __schedstat_inc(var) do { } while (0) # define schedstat_val(var) 0
#define schedstat_inc(var) do { } while (0) # define schedstat_val_or_zero(var) 0
#define __schedstat_add(var, amt) do { } while (0)
#define schedstat_add(var, amt) do { } while (0)
#define __schedstat_set(var, val) do { } while (0)
#define schedstat_set(var, val) do { } while (0)
#define schedstat_val(var) 0
#define schedstat_val_or_zero(var) 0
#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SCHEDSTATS */
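These stubs exist so that call sites never need an #ifdef CONFIG_SCHEDSTATS of their own. A sketch of a typical call site, using rq->sched_goidle as the example counter (illustrative, not taken from this hunk):

static inline void account_goidle(struct rq *rq)
{
	/*
	 * With CONFIG_SCHEDSTATS=y, schedstat_inc() bumps the counter behind
	 * the schedstat_enabled() static key; with it disabled, the stub above
	 * turns the whole statement into nothing at compile time.
	 * __schedstat_inc() is for callers that have already tested
	 * schedstat_enabled() themselves.
	 */
	schedstat_inc(rq->sched_goidle);
}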
#ifdef CONFIG_SCHED_INFO #ifdef CONFIG_SCHED_INFO
...@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) ...@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
/* /*
* We are interested in knowing how long it was from the *first* time a * We are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a cpu; we call this routine * task was queued to the time that it finally hit a CPU; we call this routine
* from dequeue_task() to account for possible rq->clock skew across cpus. The * from dequeue_task() to account for possible rq->clock skew across CPUs. The
* delta taken on each cpu would annul the skew. * delta taken on each CPU would annul the skew.
*/ */
static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{ {
...@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) ...@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
} }
/* /*
* Called when a task finally hits the cpu. We can now calculate how * Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we * long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is. * can keep stats on how long its timeslice is.
*/ */
...@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) ...@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
*/ */
static inline void sched_info_queued(struct rq *rq, struct task_struct *t) static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{ {
if (unlikely(sched_info_on())) if (unlikely(sched_info_on())) {
if (!t->sched_info.last_queued) if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(rq); t->sched_info.last_queued = rq_clock(rq);
}
} }
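Taken together, last_queued and last_arrival drive the two deltas accounted by these helpers: queue-to-CPU wait time and on-CPU run time. A small sketch of the wait-time side, using the same fields and rq clock (illustrative only):

static inline u64 sched_info_wait_sketch(struct rq *rq, struct task_struct *t)
{
	/* Time from the first enqueue until now, on the rq clock used above. */
	if (!t->sched_info.last_queued)
		return 0;

	return rq_clock(rq) - t->sched_info.last_queued;
}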
/* /*
...@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) ...@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
*/ */
static inline void sched_info_depart(struct rq *rq, struct task_struct *t) static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{ {
unsigned long long delta = rq_clock(rq) - unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
t->sched_info.last_arrival;
rq_sched_info_depart(rq, delta); rq_sched_info_depart(rq, delta);
...@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) ...@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next. * the idle task.) We are only called when prev != next.
*/ */
static inline void static inline void
__sched_info_switch(struct rq *rq, __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
struct task_struct *prev, struct task_struct *next)
{ {
/* /*
* prev now departs the cpu. It's not interesting to record * prev now departs the CPU. It's not interesting to record
* stats about how efficient we were at scheduling the idle * stats about how efficient we were at scheduling the idle
* process, however. * process, however.
*/ */
...@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq, ...@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
if (next != rq->idle) if (next != rq->idle)
sched_info_arrive(rq, next); sched_info_arrive(rq, next);
} }
static inline void static inline void
sched_info_switch(struct rq *rq, sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
struct task_struct *prev, struct task_struct *next)
{ {
if (unlikely(sched_info_on())) if (unlikely(sched_info_on()))
__sched_info_switch(rq, prev, next); __sched_info_switch(rq, prev, next);
} }
#else
#define sched_info_queued(rq, t) do { } while (0) #else /* !CONFIG_SCHED_INFO: */
#define sched_info_reset_dequeued(t) do { } while (0) # define sched_info_queued(rq, t) do { } while (0)
#define sched_info_dequeued(rq, t) do { } while (0) # define sched_info_reset_dequeued(t) do { } while (0)
#define sched_info_depart(rq, t) do { } while (0) # define sched_info_dequeued(rq, t) do { } while (0)
#define sched_info_arrive(rq, next) do { } while (0) # define sched_info_depart(rq, t) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0) # define sched_info_arrive(rq, next) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */ #endif /* CONFIG_SCHED_INFO */
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include "sched.h"
/* /*
* stop-task scheduling class. * stop-task scheduling class.
* *
...@@ -9,6 +7,7 @@ ...@@ -9,6 +7,7 @@
* *
* See kernel/stop_machine.c * See kernel/stop_machine.c
*/ */
#include "sched.h"
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static int static int
...@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) ...@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
cgroup_account_cputime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec);
} }
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{ {
} }
......
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <linux/sched/signal.h> /*
#include <linux/swait.h> * <linux/swait.h> (simple wait queues) implementation:
*/
#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name, void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key) struct lock_class_key *key)
......
...@@ -2,10 +2,6 @@ ...@@ -2,10 +2,6 @@
/* /*
* Scheduler topology setup/handling methods * Scheduler topology setup/handling methods
*/ */
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/sched/isolation.h>
#include "sched.h" #include "sched.h"
DEFINE_MUTEX(sched_domains_mutex); DEFINE_MUTEX(sched_domains_mutex);
...@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ...@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) { if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n"); printk("does not load-balance\n");
if (sd->parent) if (sd->parent)
printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
" has parent");
return -1; return -1;
} }
...@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ...@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_pr_args(sched_domain_span(sd)), sd->name); cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain " printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
"CPU%d\n", cpu);
} }
if (!cpumask_test_cpu(cpu, sched_group_span(group))) { if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain" printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
" CPU%d\n", cpu);
} }
printk(KERN_DEBUG "%*s groups:", level + 1, ""); printk(KERN_DEBUG "%*s groups:", level + 1, "");
...@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ...@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (sd->parent && if (sd->parent &&
!cpumask_subset(groupmask, sched_domain_span(sd->parent))) !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
printk(KERN_ERR "ERROR: parent span is not a superset " printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
"of domain->span\n");
return 0; return 0;
} }
...@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg) ...@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
* are not. * are not.
* *
* This leads to a few particularly weird cases where the sched_domain's are * This leads to a few particularly weird cases where the sched_domain's are
* not of the same number for each cpu. Consider: * not of the same number for each CPU. Consider:
* *
* NUMA-2 0-3 0-3 * NUMA-2 0-3 0-3
* groups: {0-2},{1-3} {1-3},{0-2} * groups: {0-2},{1-3} {1-3},{0-2}
...@@ -780,7 +772,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) ...@@ -780,7 +772,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* ^ ^ ^ ^ * ^ ^ ^ ^
* `-' `-' * `-' `-'
* *
* The sched_domains are per-cpu and have a two way link (parent & child) and * The sched_domains are per-CPU and have a two way link (parent & child) and
* denote the ever growing mask of CPUs belonging to that level of topology. * denote the ever growing mask of CPUs belonging to that level of topology.
* *
* Each sched_domain has a circular (double) linked list of sched_group's, each * Each sched_domain has a circular (double) linked list of sched_group's, each
...@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) ...@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
d->rd = alloc_rootdomain(); d->rd = alloc_rootdomain();
if (!d->rd) if (!d->rd)
return sa_sd; return sa_sd;
return sa_rootdomain; return sa_rootdomain;
} }
...@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd) ...@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
} }
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
enum numa_topology_type sched_numa_topology_type; enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;
static int sched_domains_numa_levels;
static int sched_domains_curr_level;
int sched_max_numa_distance; int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks; static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
#endif #endif
/* /*
...@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve ...@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
pr_err(" the %s domain not a subset of the %s domain\n", pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name); child->name, sd->name);
#endif #endif
/* Fixup, ensure @sd has at least @child cpus. */ /* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd), cpumask_or(sched_domain_span(sd),
sched_domain_span(sd), sched_domain_span(sd),
sched_domain_span(child)); sched_domain_span(child));
...@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att ...@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
ret = 0; ret = 0;
error: error:
__free_domain_allocs(&d, alloc_state, cpu_map); __free_domain_allocs(&d, alloc_state, cpu_map);
return ret; return ret;
} }
...@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, ...@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
return 1; return 1;
tmp = SD_ATTR_INIT; tmp = SD_ATTR_INIT;
return !memcmp(cur ? (cur + idx_cur) : &tmp, return !memcmp(cur ? (cur + idx_cur) : &tmp,
new ? (new + idx_new) : &tmp, new ? (new + idx_new) : &tmp,
sizeof(struct sched_domain_attr)); sizeof(struct sched_domain_attr));
...@@ -1929,4 +1926,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ...@@ -1929,4 +1926,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
mutex_unlock(&sched_domains_mutex); mutex_unlock(&sched_domains_mutex);
} }
...@@ -3,14 +3,7 @@ ...@@ -3,14 +3,7 @@
* *
* (C) 2004 Nadia Yvette Chambers, Oracle * (C) 2004 Nadia Yvette Chambers, Oracle
*/ */
#include <linux/init.h> #include "sched.h"
#include <linux/export.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
#include <linux/kthread.h>
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{ {
...@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, ...@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
break; break;
} }
} }
return nr_exclusive; return nr_exclusive;
} }
...@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) ...@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
spin_unlock(&wq->lock); spin_unlock(&wq->lock);
schedule(); schedule();
spin_lock(&wq->lock); spin_lock(&wq->lock);
return 0; return 0;
} }
EXPORT_SYMBOL(do_wait_intr); EXPORT_SYMBOL(do_wait_intr);
...@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) ...@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
spin_unlock_irq(&wq->lock); spin_unlock_irq(&wq->lock);
schedule(); schedule();
spin_lock_irq(&wq->lock); spin_lock_irq(&wq->lock);
return 0; return 0;
} }
EXPORT_SYMBOL(do_wait_intr_irq); EXPORT_SYMBOL(do_wait_intr_irq);
...@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i ...@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
if (ret) if (ret)
list_del_init(&wq_entry->entry); list_del_init(&wq_entry->entry);
return ret; return ret;
} }
EXPORT_SYMBOL(autoremove_wake_function); EXPORT_SYMBOL(autoremove_wake_function);
......
/* /*
* The implementation of the wait_bit*() and related waiting APIs: * The implementation of the wait_bit*() and related waiting APIs:
*/ */
#include <linux/wait_bit.h> #include "sched.h"
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/hash.h>
#define WAIT_TABLE_BITS 8 #define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
...@@ -29,7 +26,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync ...@@ -29,7 +26,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
wait_bit->key.bit_nr != key->bit_nr || wait_bit->key.bit_nr != key->bit_nr ||
test_bit(key->bit_nr, key->flags)) test_bit(key->bit_nr, key->flags))
return 0; return 0;
else
return autoremove_wake_function(wq_entry, mode, sync, key); return autoremove_wake_function(wq_entry, mode, sync, key);
} }
EXPORT_SYMBOL(wake_bit_function); EXPORT_SYMBOL(wake_bit_function);
...@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ ...@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode); ret = (*action)(&wbq_entry->key, mode);
} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry); finish_wait(wq_head, &wbq_entry->wq_entry);
return ret; return ret;
} }
EXPORT_SYMBOL(__wait_on_bit); EXPORT_SYMBOL(__wait_on_bit);
...@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout( ...@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
DEFINE_WAIT_BIT(wq_entry, word, bit); DEFINE_WAIT_BIT(wq_entry, word, bit);
wq_entry.key.timeout = jiffies + timeout; wq_entry.key.timeout = jiffies + timeout;
return __wait_on_bit(wq_head, &wq_entry, action, mode); return __wait_on_bit(wq_head, &wq_entry, action, mode);
} }
EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
...@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); ...@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
{ {
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
if (waitqueue_active(wq_head)) if (waitqueue_active(wq_head))
__wake_up(wq_head, TASK_NORMAL, 1, &key); __wake_up(wq_head, TASK_NORMAL, 1, &key);
} }
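For context on how __wake_up_bit() gets used: the usual waiter/waker pairing looks like the sketch below (EXAMPLE_BIT and the flags word are assumed names, not from this patch):

#define EXAMPLE_BIT	0			/* assumed bit number */

static void example_waiter(unsigned long *flags)
{
	/* Sleeps until EXAMPLE_BIT is clear and the waker has issued a wakeup. */
	wait_on_bit(flags, EXAMPLE_BIT, TASK_UNINTERRUPTIBLE);
}

static void example_waker(unsigned long *flags)
{
	clear_bit(EXAMPLE_BIT, flags);
	smp_mb__after_atomic();			/* order the clear before the waitqueue check */
	wake_up_bit(flags, EXAMPLE_BIT);
}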
...@@ -157,6 +158,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) ...@@ -157,6 +158,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
{ {
if (BITS_PER_LONG == 64) { if (BITS_PER_LONG == 64) {
unsigned long q = (unsigned long)p; unsigned long q = (unsigned long)p;
return bit_waitqueue((void *)(q & ~1), q & 1); return bit_waitqueue((void *)(q & ~1), q & 1);
} }
return bit_waitqueue(p, 0); return bit_waitqueue(p, 0);
...@@ -173,6 +175,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo ...@@ -173,6 +175,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
wait_bit->key.bit_nr != key->bit_nr || wait_bit->key.bit_nr != key->bit_nr ||
atomic_read(val) != 0) atomic_read(val) != 0)
return 0; return 0;
return autoremove_wake_function(wq_entry, mode, sync, key); return autoremove_wake_function(wq_entry, mode, sync, key);
} }
...@@ -196,6 +199,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en ...@@ -196,6 +199,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
ret = (*action)(val, mode); ret = (*action)(val, mode);
} while (!ret && atomic_read(val) != 0); } while (!ret && atomic_read(val) != 0);
finish_wait(wq_head, &wbq_entry->wq_entry); finish_wait(wq_head, &wbq_entry->wq_entry);
return ret; return ret;
} }
...@@ -226,6 +230,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode) ...@@ -226,6 +230,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
schedule(); schedule();
if (signal_pending_state(mode, current)) if (signal_pending_state(mode, current))
return -EINTR; return -EINTR;
return 0; return 0;
} }
EXPORT_SYMBOL(atomic_t_wait); EXPORT_SYMBOL(atomic_t_wait);
...@@ -250,6 +255,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode) ...@@ -250,6 +255,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode)
schedule(); schedule();
if (signal_pending_state(mode, current)) if (signal_pending_state(mode, current))
return -EINTR; return -EINTR;
return 0; return 0;
} }
EXPORT_SYMBOL(bit_wait); EXPORT_SYMBOL(bit_wait);
...@@ -259,6 +265,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode) ...@@ -259,6 +265,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
io_schedule(); io_schedule();
if (signal_pending_state(mode, current)) if (signal_pending_state(mode, current))
return -EINTR; return -EINTR;
return 0; return 0;
} }
EXPORT_SYMBOL(bit_wait_io); EXPORT_SYMBOL(bit_wait_io);
...@@ -266,11 +273,13 @@ EXPORT_SYMBOL(bit_wait_io); ...@@ -266,11 +273,13 @@ EXPORT_SYMBOL(bit_wait_io);
__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) __sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
{ {
unsigned long now = READ_ONCE(jiffies); unsigned long now = READ_ONCE(jiffies);
if (time_after_eq(now, word->timeout)) if (time_after_eq(now, word->timeout))
return -EAGAIN; return -EAGAIN;
schedule_timeout(word->timeout - now); schedule_timeout(word->timeout - now);
if (signal_pending_state(mode, current)) if (signal_pending_state(mode, current))
return -EINTR; return -EINTR;
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(bit_wait_timeout); EXPORT_SYMBOL_GPL(bit_wait_timeout);
...@@ -278,11 +287,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); ...@@ -278,11 +287,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
{ {
unsigned long now = READ_ONCE(jiffies); unsigned long now = READ_ONCE(jiffies);
if (time_after_eq(now, word->timeout)) if (time_after_eq(now, word->timeout))
return -EAGAIN; return -EAGAIN;
io_schedule_timeout(word->timeout - now); io_schedule_timeout(word->timeout - now);
if (signal_pending_state(mode, current)) if (signal_pending_state(mode, current))
return -EINTR; return -EINTR;
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(bit_wait_io_timeout); EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
......
...@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void) ...@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void)
int __init workqueue_init_early(void) int __init workqueue_init_early(void)
{ {
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu; int i, cpu;
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
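The practical effect of adding HK_FLAG_WQ here is that wq_unbound_cpumask defaults to the CPUs keeping both the DOMAIN and WQ housekeeping duties, so unbound work items avoid isolated CPUs unless that mask is widened later. A sketch of a caller relying on that default (all identifiers below are assumed, not from this patch):

static struct workqueue_struct *example_wq;
static struct work_struct example_work;

static void example_work_fn(struct work_struct *work)
{
	/* Executes on a CPU from wq_unbound_cpumask, i.e. a housekeeping CPU. */
}

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example_wq", WQ_UNBOUND, 0);
	if (!example_wq)
		return -ENOMEM;

	INIT_WORK(&example_work, example_work_fn);
	queue_work(example_wq, &example_work);
	return 0;
}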
......