Commit af79ad2b authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "The main changes are:

   - irqtime accounting cleanups and enhancements. (Frederic Weisbecker)

   - schedstat debugging enhancements, make it more broadly runtime
     available. (Josh Poimboeuf)

   - More work on asymmetric topology/capacity scheduling. (Morten
     Rasmussen)

   - sched/wait fixes and cleanups. (Oleg Nesterov)

   - PELT (per entity load tracking) improvements. (Peter Zijlstra)

   - Rewrite and enhance select_idle_siblings(). (Peter Zijlstra)

   - sched/numa enhancements/fixes (Rik van Riel)

   - sched/cputime scalability improvements (Stanislaw Gruszka)

   - Load calculation arithmetics fixes. (Dietmar Eggemann)

   - sched/deadline enhancements (Tommaso Cucinotta)

   - Fix utilization accounting when switching to the SCHED_NORMAL
     policy. (Vincent Guittot)

   - ... plus misc cleanups and enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  sched/irqtime: Consolidate irqtime flushing code
  sched/irqtime: Consolidate accounting synchronization with u64_stats API
  u64_stats: Introduce IRQs disabled helpers
  sched/irqtime: Remove needless IRQs disablement on kcpustat update
  sched/irqtime: No need for preempt-safe accessors
  sched/fair: Fix min_vruntime tracking
  sched/debug: Add SCHED_WARN_ON()
  sched/core: Fix set_user_nice()
  sched/fair: Introduce set_curr_task() helper
  sched/core, ia64: Rename set_curr_task()
  sched/core: Fix incorrect utilization accounting when switching to fair class
  sched/core: Optimize SCHED_SMT
  sched/core: Rewrite and improve select_idle_siblings()
  sched/core: Replace sd_busy/nr_busy_cpus with sched_domain_shared
  sched/core: Introduce 'struct sched_domain_shared'
  sched/core: Restructure destroy_sched_domain()
  sched/core: Remove unused @cpu argument from destroy_sched_domain*()
  sched/wait: Introduce init_wait_entry()
  sched/wait: Avoid abort_exclusive_wait() in __wait_on_bit_lock()
  sched/wait: Avoid abort_exclusive_wait() in ___wait_event()
  ...
parents e606d81d 447976ef
......@@ -16,6 +16,7 @@ CONTENTS
4.1 System-wide settings
4.2 Task interface
4.3 Default behavior
4.4 Behavior of sched_yield()
5. Tasks CPU affinity
5.1 SCHED_DEADLINE and cpusets HOWTO
6. Future plans
......@@ -426,6 +427,23 @@ CONTENTS
Finally, notice that in order not to jeopardize the admission control a
-deadline task cannot fork.
4.4 Behavior of sched_yield()
-----------------------------
When a SCHED_DEADLINE task calls sched_yield(), it gives up its
remaining runtime and is immediately throttled, until the next
period, when its runtime will be replenished (a special flag
dl_yielded is set and used to handle correctly throttling and runtime
replenishment after a call to sched_yield()).
This behavior of sched_yield() allows the task to wake-up exactly at
the beginning of the next period. Also, this may be useful in the
future with bandwidth reclaiming mechanisms, where sched_yield() will
make the leftoever runtime available for reclamation by other
SCHED_DEADLINE tasks.
5. Tasks CPU affinity
=====================
......
......@@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
int cpu = smp_processor_id();
previous_current = curr_task(cpu);
set_curr_task(cpu, current);
ia64_set_curr_task(cpu, current);
if ((p = strchr(current->comm, ' ')))
*p = '\0';
......@@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
cpumask_clear_cpu(i, &mca_cpu); /* wake next cpu */
while (monarch_cpu != -1)
cpu_relax(); /* spin until last cpu leaves */
set_curr_task(cpu, previous_current);
ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu]
= IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
return;
}
}
}
set_curr_task(cpu, previous_current);
ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
monarch_cpu = -1; /* This frees the slaves and previous monarchs */
}
......@@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);
mprintk("Slave on cpu %d returning to normal service.\n", cpu);
set_curr_task(cpu, previous_current);
ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
atomic_dec(&slaves);
return;
......@@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu);
atomic_dec(&monarchs);
set_curr_task(cpu, previous_current);
ia64_set_curr_task(cpu, previous_current);
monarch_cpu = -1;
return;
}
......
......@@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
static struct sched_domain_topology_level numa_inside_package_topology[] = {
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
......@@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
#endif
{ NULL, },
};
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
/*
* set_sched_topology() sets the topology internal to a CPU. The
* NUMA topologies are layered on top of it to build the full
* system topology.
*
* If NUMA nodes are observed to occur within a CPU package, this
* function should be called. It forces the sched domain code to
* only use the SMT level for the CPU portion of the topology.
* This essentially falls back to relying on NUMA information
* from the SRAT table to describe the entire system topology
* (except for hyperthreads).
* Set if a package/die has multiple NUMA nodes inside.
* AMD Magny-Cours and Intel Cluster-on-Die have this.
*/
static void primarily_use_numa_for_topology(void)
{
set_sched_topology(numa_inside_package_topology);
}
static bool x86_has_numa_in_package;
void set_cpu_sibling_map(int cpu)
{
......@@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
c->booted_cores = cpu_data(i).booted_cores;
}
if (match_die(c, o) && !topology_same_node(c, o))
primarily_use_numa_for_topology();
x86_has_numa_in_package = true;
}
threads = cpumask_weight(topology_sibling_cpumask(cpu));
......@@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
/*
* Set 'default' x86 topology, this matches default_topology() in that
* it has NUMA nodes as a topology level. See also
* native_smp_cpus_done().
*
* Must be done before set_cpus_sibling_map() is ran.
*/
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
switch (smp_sanity_check(max_cpus)) {
......@@ -1370,6 +1381,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done\n");
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
nmi_selftest();
impress_friends();
setup_ioapic_dest();
......
......@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(int state);
__printf(1, 2)
void panic(const char *fmt, ...)
__noreturn __cold;
void panic(const char *fmt, ...) __noreturn __cold;
void nmi_panic(struct pt_regs *regs, const char *msg);
extern void oops_enter(void);
extern void oops_exit(void);
void print_oops_end_marker(void);
extern int oops_may_print(void);
void do_exit(long error_code)
__noreturn;
void complete_and_exit(struct completion *, long)
__noreturn;
void do_exit(long error_code) __noreturn;
void complete_and_exit(struct completion *, long) __noreturn;
/* Internal, do not use. */
int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
......
......@@ -448,6 +448,8 @@ static inline void io_schedule(void)
io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
}
void __noreturn do_task_dead(void);
struct nsproxy;
struct user_namespace;
......@@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head);
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */
#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
......@@ -1064,6 +1067,12 @@ extern int sched_domain_level_max;
struct sched_group;
struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
};
struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
......@@ -1094,6 +1103,8 @@ struct sched_domain {
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;
u64 avg_scan_cost; /* select_idle_sibling */
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
......@@ -1132,6 +1143,7 @@ struct sched_domain {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
};
struct sched_domain_shared *shared;
unsigned int span_weight;
/*
......@@ -1165,6 +1177,7 @@ typedef int (*sched_domain_flags_f)(void);
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_domain_shared **__percpu sds;
struct sched_group **__percpu sg;
struct sched_group_capacity **__percpu sgc;
};
......@@ -2568,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p)
return p->pid == 0;
}
extern struct task_struct *curr_task(int cpu);
extern void set_curr_task(int cpu, struct task_struct *p);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);
void yield(void);
......@@ -3206,7 +3219,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
* cond_resched_lock() will drop the spinlock before scheduling,
* cond_resched_softirq() will enable bhs before scheduling.
*/
#ifndef CONFIG_PREEMPT
extern int _cond_resched(void);
#else
static inline int _cond_resched(void) { return 0; }
#endif
#define cond_resched() ({ \
___might_sleep(__FILE__, __LINE__, 0); \
......@@ -3236,6 +3253,15 @@ static inline void cond_resched_rcu(void)
#endif
}
static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
{
#ifdef CONFIG_DEBUG_PREEMPT
return p->preempt_disable_ip;
#else
return 0;
#endif
}
/*
* Does a critical section need to be broken due to another
* task waiting?: (technically does not depend on CONFIG_PREEMPT,
......
......@@ -103,31 +103,42 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
#endif
}
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_begin(&syncp->seq);
#else
#if BITS_PER_LONG==32
preempt_disable();
#endif
return 0;
#endif
}
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
preempt_disable();
#endif
return __u64_stats_fetch_begin(syncp);
}
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_retry(&syncp->seq, start);
#else
#if BITS_PER_LONG==32
preempt_enable();
#endif
return false;
#endif
}
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
preempt_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
}
/*
* In case irq handlers can update u64 counters, readers can use following helpers
* - SMP 32bit arches use seqcount protection, irq safe.
......@@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
*/
static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_begin(&syncp->seq);
#else
#if BITS_PER_LONG==32
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
local_irq_disable();
#endif
return 0;
#endif
return __u64_stats_fetch_begin(syncp);
}
static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
unsigned int start)
unsigned int start)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_retry(&syncp->seq, start);
#else
#if BITS_PER_LONG==32
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
local_irq_enable();
#endif
return false;
#endif
return __u64_stats_fetch_retry(syncp, start);
}
#endif /* _LINUX_U64_STATS_SYNC_H */
......@@ -248,6 +248,8 @@ wait_queue_head_t *bit_waitqueue(void *, int);
(!__builtin_constant_p(state) || \
state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \
extern void init_wait_entry(wait_queue_t *__wait, int flags);
/*
* The below macro ___wait_event() has an explicit shadow of the __ret
* variable when used from the wait_event_*() macros.
......@@ -266,12 +268,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
wait_queue_t __wait; \
long __ret = ret; /* explicit shadow */ \
\
INIT_LIST_HEAD(&__wait.task_list); \
if (exclusive) \
__wait.flags = WQ_FLAG_EXCLUSIVE; \
else \
__wait.flags = 0; \
\
init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
for (;;) { \
long __int = prepare_to_wait_event(&wq, &__wait, state);\
\
......@@ -280,12 +277,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
\
if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \
if (exclusive) { \
abort_exclusive_wait(&wq, &__wait, \
state, NULL); \
goto __out; \
} \
break; \
goto __out; \
} \
\
cmd; \
......@@ -989,7 +981,6 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
......
......@@ -725,7 +725,7 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
void do_exit(long code)
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
......@@ -882,29 +882,7 @@ void do_exit(long code)
exit_rcu();
TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
/*
* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
* when the following two conditions become true.
* - There is race condition of mmap_sem (It is acquired by
* exit_mm()), and
* - SMI occurs before setting TASK_RUNINNG.
* (or hypervisor of virtual machine switches to other guest)
* As a result, we may become TASK_RUNNING after becoming TASK_DEAD
*
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock);
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
schedule();
BUG();
/* Avoid "noreturn function does return". */
for (;;)
cpu_relax(); /* For when BUG is null */
do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);
......
This diff is collapsed.
......@@ -31,56 +31,81 @@ static inline int right_child(int i)
return (i << 1) + 2;
}
static void cpudl_exchange(struct cpudl *cp, int a, int b)
static void cpudl_heapify_down(struct cpudl *cp, int idx)
{
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
int l, r, largest;
swap(cp->elements[a].cpu, cp->elements[b].cpu);
swap(cp->elements[a].dl , cp->elements[b].dl );
int orig_cpu = cp->elements[idx].cpu;
u64 orig_dl = cp->elements[idx].dl;
swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
}
static void cpudl_heapify(struct cpudl *cp, int idx)
{
int l, r, largest;
if (left_child(idx) >= cp->size)
return;
/* adapted from lib/prio_heap.c */
while(1) {
u64 largest_dl;
l = left_child(idx);
r = right_child(idx);
largest = idx;
largest_dl = orig_dl;
if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
cp->elements[l].dl))
if ((l < cp->size) && dl_time_before(orig_dl,
cp->elements[l].dl)) {
largest = l;
if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
cp->elements[r].dl))
largest_dl = cp->elements[l].dl;
}
if ((r < cp->size) && dl_time_before(largest_dl,
cp->elements[r].dl))
largest = r;
if (largest == idx)
break;
/* Push idx down the heap one level and bump one up */
cpudl_exchange(cp, largest, idx);
/* pull largest child onto idx */
cp->elements[idx].cpu = cp->elements[largest].cpu;
cp->elements[idx].dl = cp->elements[largest].dl;
cp->elements[cp->elements[idx].cpu].idx = idx;
idx = largest;
}
/* actual push down of saved original values orig_* */
cp->elements[idx].cpu = orig_cpu;
cp->elements[idx].dl = orig_dl;
cp->elements[cp->elements[idx].cpu].idx = idx;
}
static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
static void cpudl_heapify_up(struct cpudl *cp, int idx)
{
WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
int p;
if (dl_time_before(new_dl, cp->elements[idx].dl)) {
cp->elements[idx].dl = new_dl;
cpudl_heapify(cp, idx);
} else {
cp->elements[idx].dl = new_dl;
while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
cp->elements[idx].dl)) {
cpudl_exchange(cp, idx, parent(idx));
idx = parent(idx);
}
}
int orig_cpu = cp->elements[idx].cpu;
u64 orig_dl = cp->elements[idx].dl;
if (idx == 0)
return;
do {
p = parent(idx);
if (dl_time_before(orig_dl, cp->elements[p].dl))
break;
/* pull parent onto idx */
cp->elements[idx].cpu = cp->elements[p].cpu;
cp->elements[idx].dl = cp->elements[p].dl;
cp->elements[cp->elements[idx].cpu].idx = idx;
idx = p;
} while (idx != 0);
/* actual push up of saved original values orig_* */
cp->elements[idx].cpu = orig_cpu;
cp->elements[idx].dl = orig_dl;
cp->elements[cp->elements[idx].cpu].idx = idx;
}
static void cpudl_heapify(struct cpudl *cp, int idx)
{
if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
cp->elements[idx].dl))
cpudl_heapify_up(cp, idx);
else
cpudl_heapify_down(cp, idx);
}
static inline int cpudl_maximum(struct cpudl *cp)
......@@ -120,16 +145,15 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
}
/*
* cpudl_set - update the cpudl max-heap
* cpudl_clear - remove a cpu from the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target cpu
* @dl: the new earliest deadline for this cpu
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
void cpudl_clear(struct cpudl *cp, int cpu)
{
int old_idx, new_cpu;
unsigned long flags;
......@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
old_idx = cp->elements[cpu].idx;
if (!is_valid) {
/* remove item */
if (old_idx == IDX_INVALID) {
/*
* Nothing to remove if old_idx was invalid.
* This could happen if a rq_offline_dl is
* called for a CPU without -dl tasks running.
*/
goto out;
}
if (old_idx == IDX_INVALID) {
/*
* Nothing to remove if old_idx was invalid.
* This could happen if a rq_offline_dl is
* called for a CPU without -dl tasks running.
*/
} else {
new_cpu = cp->elements[cp->size - 1].cpu;
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID;
while (old_idx > 0 && dl_time_before(
cp->elements[parent(old_idx)].dl,
cp->elements[old_idx].dl)) {
cpudl_exchange(cp, old_idx, parent(old_idx));
old_idx = parent(old_idx);
}
cpumask_set_cpu(cpu, cp->free_cpus);
cpudl_heapify(cp, old_idx);
cpudl_heapify(cp, old_idx);
goto out;
cpumask_set_cpu(cpu, cp->free_cpus);
}
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
/*
* cpudl_set - update the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target cpu
* @dl: the new earliest deadline for this cpu
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
{
int old_idx;
unsigned long flags;
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
old_idx = cp->elements[cpu].idx;
if (old_idx == IDX_INVALID) {
cp->size++;
cp->elements[cp->size - 1].dl = dl;
cp->elements[cp->size - 1].cpu = cpu;
cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
int new_idx = cp->size++;
cp->elements[new_idx].dl = dl;
cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = new_idx;
cpudl_heapify_up(cp, new_idx);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
cpudl_change_key(cp, old_idx, dl);
cp->elements[old_idx].dl = dl;
cpudl_heapify(cp, old_idx);
}
out:
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
......
......@@ -23,7 +23,8 @@ struct cpudl {
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
......
......@@ -23,10 +23,8 @@
* task when irq is in progress while we read rq->clock. That is a worthy
* compromise in place of having locks on each irq in account_system_time.
*/
DEFINE_PER_CPU(u64, cpu_hardirq_time);
DEFINE_PER_CPU(u64, cpu_softirq_time);
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime;
void enable_sched_clock_irqtime(void)
......@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
#ifndef CONFIG_64BIT
DEFINE_PER_CPU(seqcount_t, irq_time_seq);
#endif /* CONFIG_64BIT */
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
s64 delta;
int cpu;
......@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
return;
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
irq_time_write_begin();
u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
......@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr)
* that do not consume any time, but still wants to run.
*/
if (hardirq_count())
__this_cpu_add(cpu_hardirq_time, delta);
irqtime->hardirq_time += delta;
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
irqtime->softirq_time += delta;
irq_time_write_end();
u64_stats_update_end(&irqtime->sync);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
static cputime_t irqtime_account_hi_update(cputime_t maxtime)
static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
cputime_t irq_cputime;
local_irq_save(flags);
irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
cpustat[CPUTIME_IRQ];
irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
irq_cputime = min(irq_cputime, maxtime);
cpustat[CPUTIME_IRQ] += irq_cputime;
local_irq_restore(flags);
cpustat[idx] += irq_cputime;
return irq_cputime;
}
static cputime_t irqtime_account_si_update(cputime_t maxtime)
static cputime_t irqtime_account_hi_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
cputime_t softirq_cputime;
return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
CPUTIME_IRQ, maxtime);
}
local_irq_save(flags);
softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
cpustat[CPUTIME_SOFTIRQ];
softirq_cputime = min(softirq_cputime, maxtime);
cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
local_irq_restore(flags);
return softirq_cputime;
static cputime_t irqtime_account_si_update(cputime_t maxtime)
{
return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
CPUTIME_SOFTIRQ, maxtime);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
......@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max)
{
cputime_t accounted;
/* Shall be converted to a lockdep-enabled lightweight check */
WARN_ON_ONCE(!irqs_disabled());
accounted = steal_account_process_time(max);
if (accounted < max)
......@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max)
return accounted;
}
#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
return t->se.sum_exec_runtime;
}
#else
static u64 read_sum_exec_runtime(struct task_struct *t)
{
u64 ns;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(t, &rf);
ns = t->se.sum_exec_runtime;
task_rq_unlock(rq, t, &rf);
return ns;
}
#endif
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
......@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
unsigned int seq, nextseq;
unsigned long flags;
/*
* Update current task runtime to account pending time since last
* scheduler action or thread_group_cputime() call. This thread group
* might have other running tasks on different CPUs, but updating
* their runtime can affect syscall performance, so we skip account
* those pending times and rely only on values updated on tick or
* other scheduler action.
*/
if (same_thread_group(current, tsk))
(void) task_sched_runtime(current);
rcu_read_lock();
/* Attempt a lockless read on the first round. */
nextseq = 0;
......@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
times->sum_exec_runtime += read_sum_exec_runtime(t);
}
/* If lockless access failed, take the lock. */
nextseq = 1;
......
......@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
bool fallback = false;
later_rq = find_lock_later_rq(p, rq);
if (!later_rq) {
int cpu;
......@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online cpu.
*/
fallback = true;
cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
if (cpu >= nr_cpu_ids) {
/*
......@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
double_lock_balance(rq, later_rq);
}
/*
* By now the task is replenished and enqueued; migrate it.
*/
deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu);
activate_task(later_rq, p, 0);
if (!fallback)
resched_curr(later_rq);
double_unlock_balance(later_rq, rq);
return later_rq;
......@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
* one, and to (try to!) reconcile itself with its own scheduling
* parameters.
*/
static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
WARN_ON(dl_se->dl_boosted);
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
......@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
* future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.).
*/
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
dl_se->runtime = dl_se->dl_runtime;
}
/*
......@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock;
}
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
#ifdef CONFIG_SMP
/*
* Perform balancing operations here; after the replenishments. We
* cannot drop rq->lock before this, otherwise the assertion in
* start_dl_timer() about not missing updates is not true.
*
* If we find that the rq the task was on is no longer available, we
* need to select a new rq.
*
* XXX figure out if select_task_rq_dl() deals with offline cpus.
*/
if (unlikely(!rq->online)) {
/*
* If the runqueue is no longer available, migrate the
* task elsewhere. This necessarily changes rq.
*/
lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
rf.cookie = lockdep_pin_lock(&rq->lock);
/*
* Now that the task has been migrated to the new RQ and we
* have that locked, proceed as normal and enqueue the task
* there.
*/
}
#endif
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
#ifdef CONFIG_SMP
/*
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
......@@ -797,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
}
}
......@@ -812,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
cpudl_clear(&rq->rd->cpudl, rq->cpu);
} else {
struct rb_node *leftmost = dl_rq->rb_leftmost;
struct sched_dl_entity *entry;
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
}
......@@ -1670,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
}
/* Assumes rq->lock is held */
......@@ -1679,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_clear_overload(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
cpudl_clear(&rq->rd->cpudl, rq->cpu);
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
......@@ -1722,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p))
return;
/*
* If p is boosted we already updated its params in
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
* p's deadline being now already after rq_clock(rq).
*/
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
setup_new_dl_entity(&p->dl, &p->dl);
setup_new_dl_entity(&p->dl);
if (task_on_rq_queued(p) && rq->curr != p) {
if (rq->curr != p) {
#ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
......
......@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define P_SCHEDSTAT(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN_SCHEDSTAT(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
if (!se)
return;
......@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
PN(se->statistics.wait_start);
PN(se->statistics.sleep_start);
PN(se->statistics.block_start);
PN(se->statistics.sleep_max);
PN(se->statistics.block_max);
PN(se->statistics.exec_max);
PN(se->statistics.slice_max);
PN(se->statistics.wait_max);
PN(se->statistics.wait_sum);
P(se->statistics.wait_count);
PN_SCHEDSTAT(se->statistics.wait_start);
PN_SCHEDSTAT(se->statistics.sleep_start);
PN_SCHEDSTAT(se->statistics.block_start);
PN_SCHEDSTAT(se->statistics.sleep_max);
PN_SCHEDSTAT(se->statistics.block_max);
PN_SCHEDSTAT(se->statistics.exec_max);
PN_SCHEDSTAT(se->statistics.slice_max);
PN_SCHEDSTAT(se->statistics.wait_max);
PN_SCHEDSTAT(se->statistics.wait_sum);
P_SCHEDSTAT(se->statistics.wait_count);
}
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
#endif
#undef PN_SCHEDSTAT
#undef PN
#undef P_SCHEDSTAT
#undef P
}
#endif
......@@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
p->prio);
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
......@@ -626,9 +631,7 @@ do { \
#undef P64
#endif
#ifdef CONFIG_SCHEDSTATS
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
if (schedstat_enabled()) {
P(yld_count);
P(sched_count);
......@@ -636,9 +639,8 @@ do { \
P(ttwu_count);
P(ttwu_local);
}
#undef P
#endif
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
......@@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define P_SCHEDSTAT(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
#define __PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#define PN_SCHEDSTAT(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
PN(se.exec_start);
PN(se.vruntime);
......@@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.nr_migrations);
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
PN(se.statistics.sleep_max);
PN(se.statistics.block_max);
PN(se.statistics.exec_max);
PN(se.statistics.slice_max);
PN(se.statistics.wait_max);
PN(se.statistics.wait_sum);
P(se.statistics.wait_count);
PN(se.statistics.iowait_sum);
P(se.statistics.iowait_count);
P(se.statistics.nr_migrations_cold);
P(se.statistics.nr_failed_migrations_affine);
P(se.statistics.nr_failed_migrations_running);
P(se.statistics.nr_failed_migrations_hot);
P(se.statistics.nr_forced_migrations);
P(se.statistics.nr_wakeups);
P(se.statistics.nr_wakeups_sync);
P(se.statistics.nr_wakeups_migrate);
P(se.statistics.nr_wakeups_local);
P(se.statistics.nr_wakeups_remote);
P(se.statistics.nr_wakeups_affine);
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
PN_SCHEDSTAT(se.statistics.wait_start);
PN_SCHEDSTAT(se.statistics.sleep_start);
PN_SCHEDSTAT(se.statistics.block_start);
PN_SCHEDSTAT(se.statistics.sleep_max);
PN_SCHEDSTAT(se.statistics.block_max);
PN_SCHEDSTAT(se.statistics.exec_max);
PN_SCHEDSTAT(se.statistics.slice_max);
PN_SCHEDSTAT(se.statistics.wait_max);
PN_SCHEDSTAT(se.statistics.wait_sum);
P_SCHEDSTAT(se.statistics.wait_count);
PN_SCHEDSTAT(se.statistics.iowait_sum);
P_SCHEDSTAT(se.statistics.iowait_count);
P_SCHEDSTAT(se.statistics.nr_migrations_cold);
P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
P_SCHEDSTAT(se.statistics.nr_forced_migrations);
P_SCHEDSTAT(se.statistics.nr_wakeups);
P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
P_SCHEDSTAT(se.statistics.nr_wakeups_local);
P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
......@@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
__PN(avg_atom);
__PN(avg_per_cpu);
}
#endif
__P(nr_switches);
SEQ_printf(m, "%-45s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw);
......@@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#endif
P(policy);
P(prio);
#undef PN_SCHEDSTAT
#undef PN
#undef __PN
#undef P_SCHEDSTAT
#undef P
#undef __P
......
This diff is collapsed.
......@@ -27,8 +27,8 @@ static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
put_prev_task(rq, prev);
schedstat_inc(rq, sched_goidle);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
......
......@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
......@@ -15,6 +16,12 @@
#include "cpudeadline.h"
#include "cpuacct.h"
#ifdef CONFIG_SCHED_DEBUG
#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
#define SCHED_WARN_ON(x) ((void)(x))
#endif
struct rq;
struct cpuidle_state;
......@@ -565,6 +572,8 @@ struct root_domain {
*/
cpumask_var_t rto_mask;
struct cpupri cpupri;
unsigned long max_cpu_capacity;
};
extern struct root_domain def_root_domain;
......@@ -597,7 +606,6 @@ struct rq {
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
#endif /* CONFIG_SMP */
u64 nohz_stamp;
unsigned long nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
......@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq)
#endif
}
#ifdef CONFIG_SCHED_SMT
extern struct static_key_false sched_smt_present;
extern void __update_idle_core(struct rq *rq);
static inline void update_idle_core(struct rq *rq)
{
if (static_branch_unlikely(&sched_smt_present))
__update_idle_core(rq);
}
#else
static inline void update_idle_core(struct rq *rq) { }
#endif
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
......@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_capacity {
......@@ -870,10 +895,6 @@ struct sched_group_capacity {
unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
unsigned long cpumask[0]; /* iteration mask */
};
......@@ -1260,6 +1281,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
prev->sched_class->put_prev_task(rq, prev);
}
static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
{
curr->sched_class->set_curr_task(rq);
}
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
......@@ -1290,7 +1316,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
WARN_ON(!rcu_read_lock_held());
SCHED_WARN_ON(!rcu_read_lock_held());
return rq->idle_state;
}
#else
......@@ -1710,52 +1736,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
u64 hardirq_time;
u64 softirq_time;
u64 irq_start_time;
struct u64_stats_sync sync;
};
DECLARE_PER_CPU(u64, cpu_hardirq_time);
DECLARE_PER_CPU(u64, cpu_softirq_time);
#ifndef CONFIG_64BIT
DECLARE_PER_CPU(seqcount_t, irq_time_seq);
static inline void irq_time_write_begin(void)
{
__this_cpu_inc(irq_time_seq.sequence);
smp_wmb();
}
static inline void irq_time_write_end(void)
{
smp_wmb();
__this_cpu_inc(irq_time_seq.sequence);
}
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
static inline u64 irq_time_read(int cpu)
{
u64 irq_time;
unsigned seq;
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
unsigned int seq;
u64 total;
do {
seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
irq_time = per_cpu(cpu_softirq_time, cpu) +
per_cpu(cpu_hardirq_time, cpu);
} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
return irq_time;
}
#else /* CONFIG_64BIT */
static inline void irq_time_write_begin(void)
{
}
seq = __u64_stats_fetch_begin(&irqtime->sync);
total = irqtime->softirq_time + irqtime->hardirq_time;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
static inline void irq_time_write_end(void)
{
}
static inline u64 irq_time_read(int cpu)
{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
return total;
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
......
......@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0)
#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
......@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_enabled() 0
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
# define schedstat_val(rq, field) 0
#endif
#define schedstat_enabled() 0
#define schedstat_inc(var) do { } while (0)
#define schedstat_add(var, amt) do { } while (0)
#define schedstat_set(var, val) do { } while (0)
#define schedstat_val(var) 0
#define schedstat_val_or_zero(var) 0
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
......
......@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
void init_wait_entry(wait_queue_t *wait, int flags)
{
unsigned long flags;
if (signal_pending_state(state, current))
return -ERESTARTSYS;
wait->flags = flags;
wait->private = current;
wait->func = autoremove_wake_function;
INIT_LIST_HEAD(&wait->task_list);
}
EXPORT_SYMBOL(init_wait_entry);
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
long ret = 0;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list)) {
if (wait->flags & WQ_FLAG_EXCLUSIVE)
__add_wait_queue_tail(q, wait);
else
__add_wait_queue(q, wait);
if (unlikely(signal_pending_state(state, current))) {
/*
* Exclusive waiter must not fail if it was selected by wakeup,
* it should "consume" the condition we were waiting for.
*
* The caller will recheck the condition and return success if
* we were already woken up, we can not miss the event because
* wakeup locks/unlocks the same q->lock.
*
* But we need to ensure that set-condition + wakeup after that
* can't see us, it should wake up another exclusive waiter if
* we fail.
*/
list_del_init(&wait->task_list);
ret = -ERESTARTSYS;
} else {
if (list_empty(&wait->task_list)) {
if (wait->flags & WQ_FLAG_EXCLUSIVE)
__add_wait_queue_tail(q, wait);
else
__add_wait_queue(q, wait);
}
set_current_state(state);
}
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
return 0;
return ret;
}
EXPORT_SYMBOL(prepare_to_wait_event);
......@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(finish_wait);
/**
* abort_exclusive_wait - abort exclusive waiting in a queue
* @q: waitqueue waited on
* @wait: wait descriptor
* @mode: runstate of the waiter to be woken
* @key: key to identify a wait bit queue or %NULL
*
* Sets current thread back to running state and removes
* the wait descriptor from the given waitqueue if still
* queued.
*
* Wakes up the next waiter if the caller is concurrently
* woken up through the queue.
*
* This prevents waiter starvation where an exclusive waiter
* aborts and is woken up concurrently and no one wakes up
* the next waiter.
*/
void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
unsigned int mode, void *key)
{
unsigned long flags;
__set_current_state(TASK_RUNNING);
spin_lock_irqsave(&q->lock, flags);
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
__wake_up_locked_key(q, mode, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
......@@ -425,20 +413,29 @@ int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
wait_bit_action_f *action, unsigned mode)
{
do {
int ret;
int ret = 0;
for (;;) {
prepare_to_wait_exclusive(wq, &q->wait, mode);
if (!test_bit(q->key.bit_nr, q->key.flags))
continue;
ret = action(&q->key, mode);
if (!ret)
continue;
abort_exclusive_wait(wq, &q->wait, mode, &q->key);
return ret;
} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
finish_wait(wq, &q->wait);
return 0;
if (test_bit(q->key.bit_nr, q->key.flags)) {
ret = action(&q->key, mode);
/*
* See the comment in prepare_to_wait_event().
* finish_wait() does not necessarily takes wq->lock,
* but test_and_set_bit() implies mb() which pairs with
* smp_mb__after_atomic() before wake_up_page().
*/
if (ret)
finish_wait(wq, &q->wait);
}
if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
if (!ret)
finish_wait(wq, &q->wait);
return 0;
} else if (ret) {
return ret;
}
}
}
EXPORT_SYMBOL(__wait_on_bit_lock);
......
......@@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_park()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->park && td->status == HP_THREAD_ACTIVE) {
BUG_ON(td->cpu != smp_processor_id());
ht->park(td->cpu);
td->status = HP_THREAD_PARKED;
}
preempt_enable();
kthread_parkme();
/* We might have been woken for stop */
continue;
......
......@@ -121,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work))
return -ENOENT;
/*
* In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup
* cycle by doing a preemption:
*/
cond_resched();
wait_for_completion(&done.completion);
return done.ret;
}
......
......@@ -1165,7 +1165,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
}
/* See similar comment in do_numa_page for explanation */
if (!(vma->vm_flags & VM_WRITE))
if (!pmd_write(pmd))
flags |= TNF_NO_GROUP;
/*
......
......@@ -3395,7 +3395,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
if (!(vma->vm_flags & VM_WRITE))
if (!pte_write(pte))
flags |= TNF_NO_GROUP;
/*
......
......@@ -175,6 +175,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
"__stack_chk_fail",
"panic",
"do_exit",
"do_task_dead",
"__module_put_and_exit",
"complete_and_exit",
"kvm_spurious_fault",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment