Commit 16b3d0cf authored by Linus Torvalds

Merge tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Clean up SCHED_DEBUG: move the decades-old mess of sysctl, procfs and
   debugfs interfaces to a unified debugfs interface.

 - Signals: Allow caching one sigqueue object per task, to improve
   performance & latencies.

 - Improve newidle_balance() irq-off latencies on systems with a large
   number of CPU cgroups.

 - Improve energy-aware scheduling

 - Improve the PELT metrics for certain workloads

 - Reintroduce select_idle_smt() to improve load-balancing locality -
   but without the previous regressions

 - Add 'scheduler latency debugging': warn after long periods of pending
   need_resched. This is an opt-in feature that requires the enabling of
   the LATENCY_WARN scheduler feature, or the use of the
   resched_latency_warn_ms=xx boot parameter.

 - CPU hotplug fixes for hotplug rollback and for the 'fail' interface. Fix
   remaining balance_push() vs. hotplug holes/races

 - PSI fixes, plus allow /proc/pressure/ files to be written by
   CAP_SYS_RESOURCE tasks as well

 - Fix/improve various load-balancing corner cases vs. capacity margins

 - Fix sched topology on systems with NUMA diameter of 3 or above

 - Fix PF_KTHREAD vs to_kthread() race

 - Minor rseq optimizations

 - Misc cleanups, optimizations, fixes and smaller updates

* tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
  cpumask/hotplug: Fix cpu_dying() state tracking
  kthread: Fix PF_KTHREAD vs to_kthread() race
  sched/debug: Fix cgroup_path[] serialization
  sched,psi: Handle potential task count underflow bugs more gracefully
  sched: Warn on long periods of pending need_resched
  sched/fair: Move update_nohz_stats() to the CONFIG_NO_HZ_COMMON block to simplify the code & fix an unused function warning
  sched/debug: Rename the sched_debug parameter to sched_verbose
  sched,fair: Alternative sched_slice()
  sched: Move /proc/sched_debug to debugfs
  sched,debug: Convert sysctl sched_domains to debugfs
  debugfs: Implement debugfs_create_str()
  sched,preempt: Move preempt_dynamic to debug.c
  sched: Move SCHED_DEBUG sysctl to debugfs
  sched: Don't make LATENCYTOP select SCHED_DEBUG
  sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG
  sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG
  sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
  cpumask: Introduce DYING mask
  cpumask: Make cpu_{online,possible,present,active}() inline
  rseq: Optimise rseq_get_rseq_cs() and clear_rseq_cs()
  ...
parents 42dec9a9 2ea46c6f
......@@ -4754,7 +4754,7 @@
sbni= [NET] Granch SBNI12 leased line adapter
sched_debug [KNL] Enables verbose scheduler debug messages.
sched_verbose [KNL] Enables verbose scheduler debug messages.
schedstats= [KNL,X86] Enable or disable scheduled statistics.
Allowed values are enable and disable. This feature
......
......@@ -74,8 +74,8 @@ for a given topology level by creating a sched_domain_topology_level array and
calling set_sched_topology() with this array as the parameter.
The sched-domains debugging infrastructure can be enabled by enabling
CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
knob. This enables an error checking parse of the sched domains which should
catch most possible errors (described above). It also prints out the domain
structure in a visual format.
CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
forgot to tweak your cmdline, you can also flip the
/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
the sched domains which should catch most possible errors (described above). It
also prints out the domain structure in a visual format.
......@@ -18,6 +18,7 @@
#include <linux/usb.h>
#include <linux/wait.h>
#include <linux/sched/task.h>
#include <linux/kcov.h>
#include <uapi/linux/usbip.h>
#undef pr_fmt
......
......@@ -864,6 +864,97 @@ struct dentry *debugfs_create_bool(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
struct dentry *dentry = F_DENTRY(file);
char *str, *copy = NULL;
int copy_len, len;
ssize_t ret;
ret = debugfs_file_get(dentry);
if (unlikely(ret))
return ret;
str = *(char **)file->private_data;
len = strlen(str) + 1;
copy = kmalloc(len, GFP_KERNEL);
if (!copy) {
debugfs_file_put(dentry);
return -ENOMEM;
}
copy_len = strscpy(copy, str, len);
debugfs_file_put(dentry);
if (copy_len < 0) {
kfree(copy);
return copy_len;
}
copy[copy_len] = '\n';
ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len);
kfree(copy);
return ret;
}
static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
/* This is really only for read-only strings */
return -EINVAL;
}
static const struct file_operations fops_str = {
.read = debugfs_read_file_str,
.write = debugfs_write_file_str,
.open = simple_open,
.llseek = default_llseek,
};
static const struct file_operations fops_str_ro = {
.read = debugfs_read_file_str,
.open = simple_open,
.llseek = default_llseek,
};
static const struct file_operations fops_str_wo = {
.write = debugfs_write_file_str,
.open = simple_open,
.llseek = default_llseek,
};
/**
* debugfs_create_str - create a debugfs file that is used to read and write a string value
* @name: a pointer to a string containing the name of the file to create.
* @mode: the permission that the file should have
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is %NULL, then the
* file will be created in the root of the debugfs filesystem.
* @value: a pointer to the variable that the file should read to and write
* from.
*
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
*
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
* you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
* If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
* be returned.
*/
void debugfs_create_str(const char *name, umode_t mode,
struct dentry *parent, char **value)
{
debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str,
&fops_str_ro, &fops_str_wo);
}
static ssize_t read_file_blob(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
......
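As a hedged usage sketch (not part of this merge), a module could publish a string through the new debugfs_create_str() helper roughly as follows; the "mydrv" identifiers are made up for this illustration:

#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/module.h>

/* Illustrative only: all "mydrv" names below are hypothetical. */
static char *mydrv_version = "1.2.3";
static struct dentry *mydrv_dir;

static int __init mydrv_debugfs_init(void)
{
        /* Directory under /sys/kernel/debug/ */
        mydrv_dir = debugfs_create_dir("mydrv", NULL);

        /* Mode 0444: readable; writes return -EINVAL (read-only strings for now) */
        debugfs_create_str("version", 0444, mydrv_dir, &mydrv_version);
        return 0;
}

static void __exit mydrv_debugfs_exit(void)
{
        debugfs_remove(mydrv_dir);
}

module_init(mydrv_debugfs_init);
module_exit(mydrv_debugfs_exit);
MODULE_LICENSE("GPL");

Reading /sys/kernel/debug/mydrv/version would then return the string followed by a newline, matching debugfs_read_file_str() above.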
......@@ -91,44 +91,15 @@ extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
extern struct cpumask __cpu_dying_mask;
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask)
#define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask)
#define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask)
extern atomic_t __num_online_cpus;
#if NR_CPUS > 1
/**
* num_online_cpus() - Read the number of online CPUs
*
* Despite the fact that __num_online_cpus is of type atomic_t, this
* interface gives only a momentary snapshot and is not protected against
* concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
* region.
*/
static inline unsigned int num_online_cpus(void)
{
return atomic_read(&__num_online_cpus);
}
#define num_possible_cpus() cpumask_weight(cpu_possible_mask)
#define num_present_cpus() cpumask_weight(cpu_present_mask)
#define num_active_cpus() cpumask_weight(cpu_active_mask)
#define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask)
#define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask)
#define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask)
#define cpu_active(cpu) cpumask_test_cpu((cpu), cpu_active_mask)
#else
#define num_online_cpus() 1U
#define num_possible_cpus() 1U
#define num_present_cpus() 1U
#define num_active_cpus() 1U
#define cpu_online(cpu) ((cpu) == 0)
#define cpu_possible(cpu) ((cpu) == 0)
#define cpu_present(cpu) ((cpu) == 0)
#define cpu_active(cpu) ((cpu) == 0)
#endif
extern cpumask_t cpus_booted_once_mask;
static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
......@@ -857,6 +828,14 @@ set_cpu_active(unsigned int cpu, bool active)
cpumask_clear_cpu(cpu, &__cpu_active_mask);
}
static inline void
set_cpu_dying(unsigned int cpu, bool dying)
{
if (dying)
cpumask_set_cpu(cpu, &__cpu_dying_mask);
else
cpumask_clear_cpu(cpu, &__cpu_dying_mask);
}
/**
* to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
......@@ -894,6 +873,82 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
return to_cpumask(p);
}
#if NR_CPUS > 1
/**
* num_online_cpus() - Read the number of online CPUs
*
* Despite the fact that __num_online_cpus is of type atomic_t, this
* interface gives only a momentary snapshot and is not protected against
* concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
* region.
*/
static inline unsigned int num_online_cpus(void)
{
return atomic_read(&__num_online_cpus);
}
#define num_possible_cpus() cpumask_weight(cpu_possible_mask)
#define num_present_cpus() cpumask_weight(cpu_present_mask)
#define num_active_cpus() cpumask_weight(cpu_active_mask)
static inline bool cpu_online(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_online_mask);
}
static inline bool cpu_possible(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_possible_mask);
}
static inline bool cpu_present(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_present_mask);
}
static inline bool cpu_active(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_active_mask);
}
static inline bool cpu_dying(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_dying_mask);
}
#else
#define num_online_cpus() 1U
#define num_possible_cpus() 1U
#define num_present_cpus() 1U
#define num_active_cpus() 1U
static inline bool cpu_online(unsigned int cpu)
{
return cpu == 0;
}
static inline bool cpu_possible(unsigned int cpu)
{
return cpu == 0;
}
static inline bool cpu_present(unsigned int cpu)
{
return cpu == 0;
}
static inline bool cpu_active(unsigned int cpu)
{
return cpu == 0;
}
static inline bool cpu_dying(unsigned int cpu)
{
return false;
}
#endif /* NR_CPUS > 1 */
#define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
#if NR_CPUS <= BITS_PER_LONG
......
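A hedged illustration (not part of this merge) of how a caller might combine the CPU hotplug read lock with the new cpu_dying() test to skip CPUs that are on their way down; the function name is invented for this sketch:

#include <linux/cpu.h>
#include <linux/cpumask.h>

/* Illustrative sketch only: count online CPUs that are not being unplugged. */
static unsigned int count_non_dying_cpus(void)
{
        unsigned int cpu, nr = 0;

        cpus_read_lock();               /* stable view vs. concurrent hotplug */
        for_each_online_cpu(cpu) {
                if (cpu_dying(cpu))     /* going down, treat as unavailable */
                        continue;
                nr++;
        }
        cpus_read_unlock();

        return nr;
}

This mirrors, in spirit, how balance_push() uses cpu_dying() in the scheduler: a CPU can still be online while the hotplug machinery has already marked it as dying.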
......@@ -128,6 +128,8 @@ void debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value);
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
struct dentry *parent, bool *value);
void debugfs_create_str(const char *name, umode_t mode,
struct dentry *parent, char **value);
struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
......@@ -156,6 +158,9 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos);
ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos);
#else
#include <linux/err.h>
......@@ -297,6 +302,11 @@ static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
return ERR_PTR(-ENODEV);
}
static inline void debugfs_create_str(const char *name, umode_t mode,
struct dentry *parent,
char **value)
{ }
static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_blob_wrapper *blob)
......@@ -348,6 +358,13 @@ static inline ssize_t debugfs_write_file_bool(struct file *file,
return -ENODEV;
}
static inline ssize_t debugfs_read_file_str(struct file *file,
char __user *user_buf,
size_t count, loff_t *ppos)
{
return -ENODEV;
}
#endif
/**
......
......@@ -2,6 +2,7 @@
#ifndef _LINUX_KCOV_H
#define _LINUX_KCOV_H
#include <linux/sched.h>
#include <uapi/linux/kcov.h>
struct task_struct;
......
......@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
void psi_memstall_tick(struct task_struct *task, int cpu);
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);
......
......@@ -50,9 +50,10 @@ enum psi_states {
PSI_MEM_SOME,
PSI_MEM_FULL,
PSI_CPU_SOME,
PSI_CPU_FULL,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
NR_PSI_STATES = 6,
NR_PSI_STATES = 7,
};
enum psi_aggregators {
......
......@@ -14,7 +14,6 @@
#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
......@@ -985,6 +984,7 @@ struct task_struct {
/* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct __rcu *sighand;
struct sigqueue *sigqueue_cache;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
......@@ -1101,7 +1101,7 @@ struct task_struct {
#ifdef CONFIG_CPUSETS
/* Protected by ->alloc_lock: */
nodemask_t mems_allowed;
/* Seqence number to catch updates: */
/* Sequence number to catch updates: */
seqcount_spinlock_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
......
......@@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
enum { sysctl_hung_task_timeout_secs = 0 };
#endif
extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
......@@ -37,7 +38,7 @@ enum sched_tunable_scaling {
SCHED_TUNABLESCALING_LINEAR,
SCHED_TUNABLESCALING_END,
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
extern unsigned int sysctl_sched_tunable_scaling;
extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
......@@ -48,8 +49,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
int sched_proc_update_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos);
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
#endif
/*
......
......@@ -266,6 +266,7 @@ static inline void init_sigpending(struct sigpending *sig)
}
extern void flush_sigqueue(struct sigpending *queue);
extern void exit_task_sigqueue_cache(struct task_struct *tsk);
/* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
static inline int valid_signal(unsigned long sig)
......
......@@ -102,6 +102,16 @@ struct ptrace_syscall_info {
};
};
#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f
struct ptrace_rseq_configuration {
__u64 rseq_abi_pointer;
__u32 rseq_abi_size;
__u32 signature;
__u32 flags;
__u32 pad;
};
/*
* These values are stored in task->ptrace_message
* by tracehook_report_syscall_* to describe the current syscall-stop.
......
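As a hedged tracer-side sketch (not part of this merge), a debugger could query a stopped tracee's rseq registration with the new request; the helper name and the local struct mirror below are assumptions made to keep the example self-contained (the authoritative layout is the uapi struct above):

#include <stdint.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_GET_RSEQ_CONFIGURATION
# define PTRACE_GET_RSEQ_CONFIGURATION 0x420f
#endif

/* Local mirror of struct ptrace_rseq_configuration for this sketch. */
struct rseq_config_sketch {
        uint64_t rseq_abi_pointer;
        uint32_t rseq_abi_size;
        uint32_t signature;
        uint32_t flags;
        uint32_t pad;
};

/* Assumes @pid is already ptrace-attached and stopped; error handling trimmed. */
static void dump_rseq_config(pid_t pid)
{
        struct rseq_config_sketch conf;
        long ret;

        /* addr carries the buffer size, data the destination buffer. */
        ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid,
                     (void *)sizeof(conf), &conf);
        if (ret < 0) {
                perror("PTRACE_GET_RSEQ_CONFIGURATION");
                return;
        }
        printf("rseq abi @ %#llx, size %u, signature %#x\n",
               (unsigned long long)conf.rseq_abi_pointer,
               conf.rseq_abi_size, conf.signature);
}

On success the kernel returns sizeof(struct ptrace_rseq_configuration); rseq_abi_pointer is 0 if the thread has no rseq area registered.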
......@@ -63,6 +63,7 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
......@@ -135,6 +136,11 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
return cpuhp_hp_states + state;
}
static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
{
return bringup ? !step->startup.single : !step->teardown.single;
}
/**
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
......@@ -157,26 +163,24 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
if (st->fail == state) {
st->fail = CPUHP_INVALID;
if (!(bringup ? step->startup.single : step->teardown.single))
return 0;
return -EAGAIN;
}
if (cpuhp_step_empty(bringup, step)) {
WARN_ON_ONCE(1);
return 0;
}
if (!step->multi_instance) {
WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
if (!cb)
return 0;
trace_cpuhp_enter(cpu, st->target, state, cb);
ret = cb(cpu);
trace_cpuhp_exit(cpu, st->state, state, ret);
return ret;
}
cbm = bringup ? step->startup.multi : step->teardown.multi;
if (!cbm)
return 0;
/* Single invocation for instance add/remove */
if (node) {
......@@ -461,13 +465,16 @@ static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
bool bringup = st->state < target;
st->rollback = false;
st->last = NULL;
st->target = target;
st->single = false;
st->bringup = st->state < target;
st->bringup = bringup;
if (cpu_dying(st->cpu) != !bringup)
set_cpu_dying(st->cpu, !bringup);
return prev_state;
}
......@@ -475,6 +482,17 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
static inline void
cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
{
bool bringup = !st->bringup;
st->target = prev_state;
/*
* Already rolling back. No need invert the bringup value or to change
* the current state.
*/
if (st->rollback)
return;
st->rollback = true;
/*
......@@ -488,8 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
st->state++;
}
st->target = prev_state;
st->bringup = !st->bringup;
st->bringup = bringup;
if (cpu_dying(st->cpu) != !bringup)
set_cpu_dying(st->cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
......@@ -591,10 +610,53 @@ static int finish_cpu(unsigned int cpu)
* Hotplug state machine related functions
*/
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
/*
* Get the next state to run. Empty ones will be skipped. Returns true if a
* state must be run.
*
* st->state will be modified ahead of time, to match state_to_run, as if it
* has already ran.
*/
static bool cpuhp_next_state(bool bringup,
enum cpuhp_state *state_to_run,
struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
do {
if (bringup) {
if (st->state >= target)
return false;
*state_to_run = ++st->state;
} else {
if (st->state <= target)
return false;
*state_to_run = st->state--;
}
if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
break;
} while (true);
return true;
}
static int cpuhp_invoke_callback_range(bool bringup,
unsigned int cpu,
struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
for (st->state--; st->state > st->target; st->state--)
cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
enum cpuhp_state state;
int err = 0;
while (cpuhp_next_state(bringup, &state, st, target)) {
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
if (err)
break;
}
return err;
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
......@@ -617,16 +679,12 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state prev_state = st->state;
int ret = 0;
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
if (can_rollback_cpu(st)) {
st->target = prev_state;
undo_cpu_up(cpu, st);
}
break;
}
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
if (ret) {
cpuhp_reset_state(st, prev_state);
if (can_rollback_cpu(st))
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
prev_state));
}
return ret;
}
......@@ -640,6 +698,7 @@ static void cpuhp_create(unsigned int cpu)
init_completion(&st->done_up);
init_completion(&st->done_down);
st->cpu = cpu;
}
static int cpuhp_should_run(unsigned int cpu)
......@@ -690,17 +749,9 @@ static void cpuhp_thread_fun(unsigned int cpu)
state = st->cb_state;
st->should_run = false;
} else {
if (bringup) {
st->state++;
state = st->state;
st->should_run = (st->state < st->target);
WARN_ON_ONCE(st->state > st->target);
} else {
state = st->state;
st->state--;
st->should_run = (st->state > st->target);
WARN_ON_ONCE(st->state < st->target);
}
st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
if (!st->should_run)
goto end;
}
WARN_ON_ONCE(!cpuhp_is_ap_state(state));
......@@ -728,6 +779,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
st->should_run = false;
}
end:
cpuhp_lock_release(bringup);
lockdep_release_cpus_lock();
......@@ -881,19 +933,18 @@ static int take_cpu_down(void *_param)
return err;
/*
* We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
* do this step again.
* Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
* down, that the current state is CPUHP_TEARDOWN_CPU - 1.
*/
WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
st->state--;
WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
/* Invoke the former CPU_DYING callbacks */
for (; st->state > target; st->state--) {
ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
/*
* DYING must not fail!
*/
WARN_ON_ONCE(ret);
}
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
/*
* DYING must not fail!
*/
WARN_ON_ONCE(ret);
/* Give up timekeeping duties */
tick_handover_do_timer();
......@@ -975,27 +1026,22 @@ void cpuhp_report_idle_dead(void)
cpuhp_complete_idle_dead, st, 0);
}
static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
{
for (st->state++; st->state < st->target; st->state++)
cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
}
static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
for (; st->state > target; st->state--) {
ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
if (ret) {
st->target = prev_state;
if (st->state < prev_state)
undo_cpu_down(cpu, st);
break;
}
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
if (ret) {
cpuhp_reset_state(st, prev_state);
if (st->state < prev_state)
WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
prev_state));
}
return ret;
}
......@@ -1045,9 +1091,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* to do the further cleanups.
*/
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
cpuhp_reset_state(st, prev_state);
__cpuhp_kick_ap(st);
if (ret && st->state < prev_state) {
if (st->state == CPUHP_TEARDOWN_CPU) {
cpuhp_reset_state(st, prev_state);
__cpuhp_kick_ap(st);
} else {
WARN(1, "DEAD callback error for CPU%d", cpu);
}
}
out:
......@@ -1164,14 +1214,12 @@ void notify_cpu_starting(unsigned int cpu)
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
/*
* STARTING must not fail!
*/
WARN_ON_ONCE(ret);
}
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
/*
* STARTING must not fail!
*/
WARN_ON_ONCE(ret);
}
/*
......@@ -1777,8 +1825,7 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
* If there's nothing to do, we done.
* Relies on the union for multi_instance.
*/
if ((bringup && !sp->startup.single) ||
(!bringup && !sp->teardown.single))
if (cpuhp_step_empty(bringup, sp))
return 0;
/*
* The non AP bound callbacks can fail on bringup. On teardown
......@@ -2207,6 +2254,11 @@ static ssize_t write_cpuhp_fail(struct device *dev,
if (ret)
return ret;
if (fail == CPUHP_INVALID) {
st->fail = fail;
return count;
}
if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
return -EINVAL;
......@@ -2216,6 +2268,15 @@ static ssize_t write_cpuhp_fail(struct device *dev,
if (cpuhp_is_atomic_state(fail))
return -EINVAL;
/*
* DEAD callbacks cannot fail...
* ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
* triggering STARTING callbacks, a failure in this state would
* hinder rollback.
*/
if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
return -EINVAL;
/*
* Cannot fail anything that doesn't have callbacks.
*/
......@@ -2460,6 +2521,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);
struct cpumask __cpu_dying_mask __read_mostly;
EXPORT_SYMBOL(__cpu_dying_mask);
atomic_t __num_online_cpus __read_mostly;
EXPORT_SYMBOL(__num_online_cpus);
......
......@@ -162,6 +162,7 @@ static void __exit_signal(struct task_struct *tsk)
flush_sigqueue(&sig->shared_pending);
tty_kref_put(tty);
}
exit_task_sigqueue_cache(tsk);
}
static void delayed_put_task_struct(struct rcu_head *rhp)
......
......@@ -2009,6 +2009,7 @@ static __latent_entropy struct task_struct *copy_process(
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
p->sigqueue_cache = NULL;
p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
......
......@@ -84,6 +84,25 @@ static inline struct kthread *to_kthread(struct task_struct *k)
return (__force void *)k->set_child_tid;
}
/*
* Variant of to_kthread() that doesn't assume @p is a kthread.
*
* Per construction; when:
*
* (p->flags & PF_KTHREAD) && p->set_child_tid
*
* the task is both a kthread and struct kthread is persistent. However
* PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
* begin_new_exec()).
*/
static inline struct kthread *__to_kthread(struct task_struct *p)
{
void *kthread = (__force void *)p->set_child_tid;
if (kthread && !(p->flags & PF_KTHREAD))
kthread = NULL;
return kthread;
}
void free_kthread_struct(struct task_struct *k)
{
struct kthread *kthread;
......@@ -168,8 +187,9 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
*/
void *kthread_func(struct task_struct *task)
{
if (task->flags & PF_KTHREAD)
return to_kthread(task)->threadfn;
struct kthread *kthread = __to_kthread(task);
if (kthread)
return kthread->threadfn;
return NULL;
}
EXPORT_SYMBOL_GPL(kthread_func);
......@@ -199,10 +219,11 @@ EXPORT_SYMBOL_GPL(kthread_data);
*/
void *kthread_probe_data(struct task_struct *task)
{
struct kthread *kthread = to_kthread(task);
struct kthread *kthread = __to_kthread(task);
void *data = NULL;
copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
if (kthread)
copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
}
......@@ -514,9 +535,9 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}
bool kthread_is_per_cpu(struct task_struct *k)
bool kthread_is_per_cpu(struct task_struct *p)
{
struct kthread *kthread = to_kthread(k);
struct kthread *kthread = __to_kthread(p);
if (!kthread)
return false;
......
......@@ -31,6 +31,7 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/minmax.h>
#include <asm/syscall.h> /* for syscall_get_* */
......@@ -779,6 +780,24 @@ static int ptrace_peek_siginfo(struct task_struct *child,
return ret;
}
#ifdef CONFIG_RSEQ
static long ptrace_get_rseq_configuration(struct task_struct *task,
unsigned long size, void __user *data)
{
struct ptrace_rseq_configuration conf = {
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
.rseq_abi_size = sizeof(*task->rseq),
.signature = task->rseq_sig,
.flags = 0,
};
size = min_t(unsigned long, size, sizeof(conf));
if (copy_to_user(data, &conf, size))
return -EFAULT;
return sizeof(conf);
}
#endif
#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
#else
......@@ -1222,6 +1241,12 @@ int ptrace_request(struct task_struct *child, long request,
ret = seccomp_get_metadata(child, addr, datavp);
break;
#ifdef CONFIG_RSEQ
case PTRACE_GET_RSEQ_CONFIGURATION:
ret = ptrace_get_rseq_configuration(child, addr, datavp);
break;
#endif
default:
break;
}
......
......@@ -84,13 +84,20 @@
static int rseq_update_cpu_id(struct task_struct *t)
{
u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
if (put_user(cpu_id, &t->rseq->cpu_id_start))
return -EFAULT;
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
if (!user_write_access_begin(rseq, sizeof(*rseq)))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
user_write_access_end();
trace_rseq_update(t);
return 0;
efault_end:
user_write_access_end();
efault:
return -EFAULT;
}
static int rseq_reset_rseq_cpu_id(struct task_struct *t)
......@@ -120,8 +127,13 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
u32 sig;
int ret;
#ifdef CONFIG_64BIT
if (get_user(ptr, &t->rseq->rseq_cs.ptr64))
return -EFAULT;
#else
if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
return -EFAULT;
#endif
if (!ptr) {
memset(rseq_cs, 0, sizeof(*rseq_cs));
return 0;
......@@ -204,9 +216,13 @@ static int clear_rseq_cs(struct task_struct *t)
*
* Set rseq_cs to NULL.
*/
#ifdef CONFIG_64BIT
return put_user(0UL, &t->rseq->rseq_cs.ptr64);
#else
if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
return -EFAULT;
return 0;
#endif
}
/*
......@@ -266,8 +282,6 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq))))
goto error;
ret = rseq_ip_fixup(regs);
if (unlikely(ret < 0))
goto error;
......@@ -294,8 +308,7 @@ void rseq_syscall(struct pt_regs *regs)
if (!t->rseq)
return;
if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
force_sig(SIGSEGV);
}
......
......@@ -41,7 +41,7 @@
* Otherwise it tries to create a semi stable clock from a mixture of other
* clocks, including:
*
* - GTOD (clock monotomic)
* - GTOD (clock monotonic)
* - sched_clock()
* - explicit idle events
*
......
......@@ -104,7 +104,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
/*
* We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of suages.
* the sum of usages.
*/
BUG_ON(index > CPUACCT_STAT_NSTATS);
......
......@@ -466,7 +466,7 @@ static void sugov_work(struct kthread_work *work)
/*
* Hold sg_policy->update_lock shortly to handle the case where:
* incase sg_policy->next_freq is read here, and then updated by
* in case sg_policy->next_freq is read here, and then updated by
* sugov_deferred_update() just before work_in_progress is set to false
* here, we may miss queueing the new update.
*
......
......@@ -77,7 +77,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
* When looking at the vector, we need to read the counter,
* do a memory barrier, then read the mask.
*
* Note: This is still all racey, but we can deal with it.
* Note: This is still all racy, but we can deal with it.
* Ideally, we only want to look at masks that are set.
*
* If a mask is not set, then the only thing wrong is that we
......@@ -186,7 +186,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
* The cost of this trade-off is not entirely clear and will probably
* be good for some workloads and bad for others.
*
* The main idea here is that if some CPUs were overcommitted, we try
* The main idea here is that if some CPUs were over-committed, we try
* to spread which is what the scheduler traditionally did. Sys admins
* must do proper RT planning to avoid overloading the system if they
* really care.
......
......@@ -563,7 +563,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
/*
* If either stime or utime are 0, assume all runtime is userspace.
* Once a task gets some ticks, the monotonicy code at 'update:'
* Once a task gets some ticks, the monotonicity code at 'update:'
* will ensure things converge to the observed ratio.
*/
if (stime == 0) {
......
......@@ -245,7 +245,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
* timer cannot be cancelled, inactive_task_timer()
* timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
......@@ -267,7 +267,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* fires.
*
* If the task wakes up again before the inactive timer fires,
* the timer is cancelled, whereas if the task wakes up after the
* the timer is canceled, whereas if the task wakes up after the
* inactive timer fired (and running_bw has been decreased) the
* task's utilization has to be added to running_bw again.
* A flag in the deadline scheduling entity (dl_non_contending)
......@@ -385,7 +385,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
dl_se->dl_non_contending = 0;
/*
* If the timer handler is currently running and the
* timer cannot be cancelled, inactive_task_timer()
* timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
......@@ -1206,7 +1206,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
* Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
* multiplied by 2^BW_SHIFT, the result has to be shifted right by
* BW_SHIFT.
* Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
* Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
* dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
* Since delta is a 64 bit variable, to have an overflow its value
* should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
......@@ -1737,7 +1737,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
* timer cannot be cancelled, inactive_task_timer()
* timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
......@@ -2745,7 +2745,7 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
/*
* Default limits for DL period; on the top end we guard against small util
* tasks still getting rediculous long effective runtimes, on the bottom end we
* tasks still getting ridiculously long effective runtimes, on the bottom end we
* guard against timer DoS.
*/
unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
......
......@@ -27,7 +27,7 @@ SCHED_FEAT(NEXT_BUDDY, false)
SCHED_FEAT(LAST_BUDDY, true)
/*
* Consider buddies to be cache hot, decreases the likelyness of a
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)
......@@ -90,3 +90,8 @@ SCHED_FEAT(WA_BIAS, true)
*/
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
SCHED_FEAT(ALT_PERIOD, true)
SCHED_FEAT(BASE_SLICE, true)
......@@ -163,7 +163,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*
* NOTE: no locks or semaphores should be used here
*
* On archs that support TIF_POLLING_NRFLAG, is called with polling
* On architectures that support TIF_POLLING_NRFLAG, is called with polling
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
......@@ -199,7 +199,7 @@ static void cpuidle_idle_call(void)
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in interrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
* the cpuidle governor and go straight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
......@@ -261,6 +261,12 @@ static void cpuidle_idle_call(void)
static void do_idle(void)
{
int cpu = smp_processor_id();
/*
* Check if we need to update blocked load
*/
nohz_run_idle_balance(cpu);
/*
* If the arch has a polling bit, we maintain an invariant:
*
......
......@@ -189,7 +189,7 @@ calc_load_n(unsigned long load, unsigned long exp,
* w:0 1 1 0 0 1 1 0 0
*
* This ensures we'll fold the old NO_HZ contribution in this window while
* accumlating the new one.
* accumulating the new one.
*
* - When we wake up from NO_HZ during the window, we push up our
* contribution, since we effectively move our sample point to a known
......
......@@ -133,7 +133,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
* runnable = running = 0;
*
* clause from ___update_load_sum(); this results in
* the below usage of @contrib to dissapear entirely,
* the below usage of @contrib to disappear entirely,
* so no point in calculating it.
*/
contrib = __accumulate_pelt_segments(periods,
......
......@@ -130,7 +130,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
* Reflecting stolen time makes sense only if the idle
* phase would be present at max capacity. As soon as the
* utilization of a rq has reached the maximum value, it is
* considered as an always runnig rq without idle time to
* considered as an always running rq without idle time to
* steal. This potential idle time is considered as lost in
* this case. We keep track of this lost idle time compare to
* rq's clock_task.
......
......@@ -700,7 +700,7 @@ static void do_balance_runtime(struct rt_rq *rt_rq)
/*
* Either all rqs have inf runtime and there's nothing to steal
* or __disable_runtime() below sets a specific rq to inf to
* indicate its been disabled and disalow stealing.
* indicate its been disabled and disallow stealing.
*/
if (iter->rt_runtime == RUNTIME_INF)
goto next;
......@@ -1998,7 +1998,7 @@ static void push_rt_tasks(struct rq *rq)
*
* Each root domain has its own irq work function that can iterate over
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
* tassk must be checked if there's one or many CPUs that are lowering
* task must be checked if there's one or many CPUs that are lowering
* their priority, there's a single irq work iterator that will try to
* push off RT tasks that are waiting to run.
*
......@@ -2216,7 +2216,7 @@ static void pull_rt_task(struct rq *this_rq)
/*
* There's a chance that p is higher in priority
* than what's currently running on its CPU.
* This is just that p is wakeing up and hasn't
* This is just that p is waking up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
* current task on the run queue
......
......@@ -36,6 +36,7 @@
#include <uapi/linux/sched/types.h>
#include <linux/binfmts.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
......@@ -57,6 +58,7 @@
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stop_machine.h>
......@@ -204,6 +206,13 @@ static inline void update_avg(u64 *avg, u64 sample)
*avg += diff / 8;
}
/*
* Shifting a value by an exponent greater *or equal* to the size of said value
* is UB; cap at size-1.
*/
#define shr_bound(val, shift) \
(val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
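/*
 * Illustrative example (not part of this diff): with a u32 and a runtime
 * shift of 40, a plain "val >> shift" is undefined behaviour, while
 * shr_bound() clamps the shift to BITS_PER_TYPE(u32) - 1 == 31:
 *
 *      u32 val = 1024;
 *      u32 ok  = shr_bound(val, 40);   // shifts by 31, yields 0
 */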
/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
......@@ -963,6 +972,11 @@ struct rq {
atomic_t nr_iowait;
#ifdef CONFIG_SCHED_DEBUG
u64 last_seen_need_resched_ns;
int ticks_without_resched;
#endif
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
#endif
......@@ -975,7 +989,6 @@ struct rq {
unsigned long cpu_capacity_orig;
struct callback_head *balance_callback;
unsigned char balance_push;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
......@@ -1147,7 +1160,7 @@ static inline u64 __rq_clock_broken(struct rq *rq)
*
* if (rq-clock_update_flags >= RQCF_UPDATED)
*
* to check if %RQCF_UPADTED is set. It'll never be shifted more than
* to check if %RQCF_UPDATED is set. It'll never be shifted more than
* one position though, because the next rq_unpin_lock() will shift it
* back.
*/
......@@ -1206,7 +1219,7 @@ static inline void rq_clock_skip_update(struct rq *rq)
/*
* See rt task throttling, which is the only time a skip
* request is cancelled.
* request is canceled.
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
......@@ -1545,22 +1558,20 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void);
#ifdef CONFIG_SCHED_DEBUG
void update_sched_domain_debugfs(void);
void dirty_sched_domain_sysctl(int cpu);
void unregister_sched_domain_sysctl(void);
#else
static inline void register_sched_domain_sysctl(void)
static inline void update_sched_domain_debugfs(void)
{
}
static inline void dirty_sched_domain_sysctl(int cpu)
{
}
static inline void unregister_sched_domain_sysctl(void)
{
}
#endif
extern int sched_update_scaling(void);
extern void flush_smp_call_function_from_idle(void);
#else /* !CONFIG_SMP: */
......@@ -1853,7 +1864,7 @@ struct sched_class {
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serliazed by
* cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
*/
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
......@@ -2358,7 +2369,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
extern bool sched_debug_enabled;
extern bool sched_debug_verbose;
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
......@@ -2366,6 +2377,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu);
extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
......@@ -2373,6 +2386,8 @@ extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf);
#endif /* CONFIG_NUMA_BALANCING */
#else
static inline void resched_latency_warn(int cpu, u64 latency) {}
#endif /* CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
......@@ -2385,9 +2400,11 @@ extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
......@@ -2398,6 +2415,11 @@ extern void nohz_balance_exit_idle(struct rq *rq);
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_run_idle_balance(int cpu);
#else
static inline void nohz_run_idle_balance(int cpu) { }
#endif
#ifdef CONFIG_SMP
static inline
......@@ -2437,7 +2459,7 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
/*
* Returns the irqtime minus the softirq time computed by ksoftirqd.
* Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
* Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
* and never move forward.
*/
static inline u64 irq_time_read(int cpu)
......@@ -2718,5 +2740,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
}
#endif
void swake_up_all_locked(struct swait_queue_head *q);
void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
extern void swake_up_all_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
#ifdef CONFIG_PREEMPT_DYNAMIC
extern int preempt_dynamic_mode;
extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
......@@ -74,7 +74,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
}
/*
* This itererator needs some explanation.
* This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
* In a hotplugged system some CPUs, including cpu 0, may be missing so we have
......
......@@ -84,28 +84,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
int clear = TSK_RUNNING, set = 0;
int clear = TSK_RUNNING;
if (static_branch_likely(&psi_disabled))
return;
if (!sleep) {
if (p->in_memstall)
clear |= TSK_MEMSTALL;
} else {
/*
* When a task sleeps, schedule() dequeues it before
* switching to the next one. Merge the clearing of
* TSK_RUNNING and TSK_ONCPU to save an unnecessary
* psi_task_change() call in psi_sched_switch().
*/
clear |= TSK_ONCPU;
/*
* A voluntary sleep is a dequeue followed by a task switch. To
* avoid walking all ancestors twice, psi_task_switch() handles
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
*/
if (sleep)
return;
if (p->in_iowait)
set |= TSK_IOWAIT;
}
if (p->in_memstall)
clear |= TSK_MEMSTALL;
psi_task_change(p, clear, set);
psi_task_change(p, clear, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
......@@ -144,14 +140,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
psi_task_switch(prev, next, sleep);
}
static inline void psi_task_tick(struct rq *rq)
{
if (static_branch_likely(&psi_disabled))
return;
if (unlikely(rq->curr->in_memstall))
psi_memstall_tick(rq->curr, cpu_of(rq));
}
#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
......@@ -159,7 +147,6 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
static inline void psi_task_tick(struct rq *rq) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
......
......@@ -14,15 +14,15 @@ static cpumask_var_t sched_domains_tmpmask2;
static int __init sched_debug_setup(char *str)
{
sched_debug_enabled = true;
sched_debug_verbose = true;
return 0;
}
early_param("sched_debug", sched_debug_setup);
early_param("sched_verbose", sched_debug_setup);
static inline bool sched_debug(void)
{
return sched_debug_enabled;
return sched_debug_verbose;
}
#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
......@@ -131,7 +131,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
if (!sched_debug_enabled)
if (!sched_debug_verbose)
return;
if (!sd) {
......@@ -152,7 +152,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_debug_enabled 0
# define sched_debug_verbose 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
......@@ -723,35 +723,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
for (tmp = sd; tmp; tmp = tmp->parent)
numa_distance += !!(tmp->flags & SD_NUMA);
/*
* FIXME: Diameter >=3 is misrepresented.
*
* Smallest diameter=3 topology is:
*
* node 0 1 2 3
* 0: 10 20 30 40
* 1: 20 10 20 30
* 2: 30 20 10 20
* 3: 40 30 20 10
*
* 0 --- 1 --- 2 --- 3
*
* NUMA-3 0-3 N/A N/A 0-3
* groups: {0-2},{1-3} {1-3},{0-2}
*
* NUMA-2 0-2 0-3 0-3 1-3
* groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
*
* NUMA-1 0-1 0-2 1-3 2-3
* groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
*
* NUMA-0 0 1 2 3
*
* The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
* group span isn't a subset of the domain span.
*/
WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
......@@ -963,7 +934,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
int cpu;
build_balance_mask(sd, sg, mask);
cpu = cpumask_first_and(sched_group_span(sg), mask);
cpu = cpumask_first(mask);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
if (atomic_inc_return(&sg->sgc->ref) == 1)
......@@ -982,6 +953,31 @@ static void init_overlap_sched_group(struct sched_domain *sd,
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}
static struct sched_domain *
find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
{
/*
* The proper descendant would be the one whose child won't span out
* of sd
*/
while (sibling->child &&
!cpumask_subset(sched_domain_span(sibling->child),
sched_domain_span(sd)))
sibling = sibling->child;
/*
* As we are referencing sgc across different topology level, we need
* to go down to skip those sched_domains which don't contribute to
* scheduling because they will be degenerated in cpu_attach_domain
*/
while (sibling->child &&
cpumask_equal(sched_domain_span(sibling->child),
sched_domain_span(sibling)))
sibling = sibling->child;
return sibling;
}
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
......@@ -1015,6 +1011,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
/*
* Usually we build sched_group by sibling's child sched_domain
* But for machines whose NUMA diameter are 3 or above, we move
* to build sched_group by sibling's proper descendant's child
* domain because sibling's child sched_domain will span out of
* the sched_domain being built as below.
*
* Smallest diameter=3 topology is:
*
* node 0 1 2 3
* 0: 10 20 30 40
* 1: 20 10 20 30
* 2: 30 20 10 20
* 3: 40 30 20 10
*
* 0 --- 1 --- 2 --- 3
*
* NUMA-3 0-3 N/A N/A 0-3
* groups: {0-2},{1-3} {1-3},{0-2}
*
* NUMA-2 0-2 0-3 0-3 1-3
* groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
*
* NUMA-1 0-1 0-2 1-3 2-3
* groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
*
* NUMA-0 0 1 2 3
*
* The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
* group span isn't a subset of the domain span.
*/
if (sibling->child &&
!cpumask_subset(sched_domain_span(sibling->child), span))
sibling = find_descended_sibling(sd, sibling);
sg = build_group_from_child_sched_domain(sibling, cpu);
if (!sg)
goto fail;
......@@ -1022,7 +1053,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);
init_overlap_sched_group(sd, sg);
init_overlap_sched_group(sibling, sg);
if (!first)
first = sg;
......@@ -2110,7 +2141,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (has_asym)
static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
if (rq && sched_debug_enabled) {
if (rq && sched_debug_verbose) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
......@@ -2128,7 +2159,7 @@ static cpumask_var_t *doms_cur;
/* Number of sched domains in 'doms_cur': */
static int ndoms_cur;
/* Attribues of custom domains in 'doms_cur' */
/* Attributes of custom domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
/*
......@@ -2192,7 +2223,6 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = &fallback_doms;
cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
return err;
}
......@@ -2267,9 +2297,6 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
lockdep_assert_held(&sched_domains_mutex);
/* Always unregister in case we don't destroy any domains: */
unregister_sched_domain_sysctl();
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
......@@ -2358,7 +2385,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
dattr_cur = dattr_new;
ndoms_cur = ndoms_new;
register_sched_domain_sysctl();
update_sched_domain_debugfs();
}
/*
......
......@@ -408,7 +408,8 @@ void task_join_group_stop(struct task_struct *task)
* appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
struct user_struct *user;
......@@ -430,7 +431,16 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
rcu_read_unlock();
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, flags);
/*
* Preallocation does not hold sighand::siglock so it can't
* use the cache. The lockless caching requires that only
* one consumer and only one producer run at a time.
*/
q = READ_ONCE(t->sigqueue_cache);
if (!q || sigqueue_flags)
q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
else
WRITE_ONCE(t->sigqueue_cache, NULL);
} else {
print_dropped_signal(sig);
}
......@@ -440,20 +450,51 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
free_uid(user);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
q->flags = sigqueue_flags;
q->user = user;
}
return q;
}
void exit_task_sigqueue_cache(struct task_struct *tsk)
{
/* Race free because @tsk is mopped up */
struct sigqueue *q = tsk->sigqueue_cache;
if (q) {
tsk->sigqueue_cache = NULL;
/*
* Hand it back to the cache as the task might
* be self reaping which would leak the object.
*/
kmem_cache_free(sigqueue_cachep, q);
}
}
static void sigqueue_cache_or_free(struct sigqueue *q)
{
/*
* Cache one sigqueue per task. This pairs with the consumer side
* in __sigqueue_alloc() and needs READ/WRITE_ONCE() to prevent the
* compiler from store tearing and to tell KCSAN that the data race
* is intentional when run without holding current->sighand->siglock,
* which is fine as current obviously cannot run __sigqueue_free()
* concurrently.
*/
if (!READ_ONCE(current->sigqueue_cache))
WRITE_ONCE(current->sigqueue_cache, q);
else
kmem_cache_free(sigqueue_cachep, q);
}
static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
if (atomic_dec_and_test(&q->user->sigpending))
free_uid(q->user);
kmem_cache_free(sigqueue_cachep, q);
sigqueue_cache_or_free(q);
}
void flush_sigqueue(struct sigpending *queue)
......@@ -1111,7 +1152,8 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
else
override_rlimit = 0;
q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
......@@ -1806,12 +1848,7 @@ EXPORT_SYMBOL(kill_pid);
*/
struct sigqueue *sigqueue_alloc(void)
{
struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
if (q)
q->flags |= SIGQUEUE_PREALLOC;
return q;
return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
}
void sigqueue_free(struct sigqueue *q)
......
......@@ -409,6 +409,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
work->fn = fn;
work->arg = arg;
work->done = done;
work->caller = _RET_IP_;
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
......
......@@ -184,17 +184,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
int sysctl_legacy_va_layout;
#endif
#ifdef CONFIG_SCHED_DEBUG
static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
#ifdef CONFIG_SMP
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
#endif /* CONFIG_SMP */
#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
static int max_extfrag_threshold = 1000;
......@@ -1659,58 +1648,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
#ifdef CONFIG_SCHED_DEBUG
{
.procname = "sched_min_granularity_ns",
.data = &sysctl_sched_min_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
},
{
.procname = "sched_latency_ns",
.data = &sysctl_sched_latency,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
},
{
.procname = "sched_wakeup_granularity_ns",
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
.maxlen = sizeof(enum sched_tunable_scaling),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_sched_tunable_scaling,
.extra2 = &max_sched_tunable_scaling,
},
{
.procname = "sched_migration_cost_ns",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_nr_migrate",
.data = &sysctl_sched_nr_migrate,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
......@@ -1722,37 +1659,7 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing_scan_delay_ms",
.data = &sysctl_numa_balancing_scan_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_period_min_ms",
.data = &sysctl_numa_balancing_scan_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_period_max_ms",
.data = &sysctl_numa_balancing_scan_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_size_mb",
.data = &sysctl_numa_balancing_scan_size,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
{
.procname = "numa_balancing",
.data = NULL, /* filled in by handler */
......@@ -1763,7 +1670,6 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
......
......@@ -1710,7 +1710,6 @@ config LATENCYTOP
select KALLSYMS_ALL
select STACKTRACE
select SCHEDSTATS
select SCHED_DEBUG
help
Enable this option if you want to use the LatencyTOP tool
to find out which userspace is blocking on what kernel operations.
......
......@@ -60,6 +60,7 @@
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <net/protocol.h>
#include <net/dst.h>
......
......@@ -15,6 +15,7 @@
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/kcov.h>
#include <net/mac80211.h>
#include <net/ieee80211_radiotap.h>
#include "ieee80211_i.h"
......
......@@ -17,6 +17,7 @@
#include <linux/etherdevice.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/kcov.h>
#include <linux/bitops.h>
#include <net/mac80211.h>
#include <net/ieee80211_radiotap.h>
......