Commit af79ad2b authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "The main changes are:

   - irqtime accounting cleanups and enhancements. (Frederic Weisbecker)

   - schedstat debugging enhancements, making it more broadly available
     at runtime. (Josh Poimboeuf)

   - More work on asymmetric topology/capacity scheduling. (Morten
     Rasmussen)

   - sched/wait fixes and cleanups. (Oleg Nesterov)

   - PELT (per entity load tracking) improvements. (Peter Zijlstra)

   - Rewrite and enhance select_idle_siblings(). (Peter Zijlstra)

   - sched/numa enhancements/fixes. (Rik van Riel)

   - sched/cputime scalability improvements. (Stanislaw Gruszka)

   - Load calculation arithmetic fixes. (Dietmar Eggemann)

   - sched/deadline enhancements. (Tommaso Cucinotta)

   - Fix utilization accounting when switching to the SCHED_NORMAL
     policy. (Vincent Guittot)

   - ... plus misc cleanups and enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  sched/irqtime: Consolidate irqtime flushing code
  sched/irqtime: Consolidate accounting synchronization with u64_stats API
  u64_stats: Introduce IRQs disabled helpers
  sched/irqtime: Remove needless IRQs disablement on kcpustat update
  sched/irqtime: No need for preempt-safe accessors
  sched/fair: Fix min_vruntime tracking
  sched/debug: Add SCHED_WARN_ON()
  sched/core: Fix set_user_nice()
  sched/fair: Introduce set_curr_task() helper
  sched/core, ia64: Rename set_curr_task()
  sched/core: Fix incorrect utilization accounting when switching to fair class
  sched/core: Optimize SCHED_SMT
  sched/core: Rewrite and improve select_idle_siblings()
  sched/core: Replace sd_busy/nr_busy_cpus with sched_domain_shared
  sched/core: Introduce 'struct sched_domain_shared'
  sched/core: Restructure destroy_sched_domain()
  sched/core: Remove unused @cpu argument from destroy_sched_domain*()
  sched/wait: Introduce init_wait_entry()
  sched/wait: Avoid abort_exclusive_wait() in __wait_on_bit_lock()
  sched/wait: Avoid abort_exclusive_wait() in ___wait_event()
  ...
parents e606d81d 447976ef
...@@ -16,6 +16,7 @@ CONTENTS ...@@ -16,6 +16,7 @@ CONTENTS
4.1 System-wide settings 4.1 System-wide settings
4.2 Task interface 4.2 Task interface
4.3 Default behavior 4.3 Default behavior
4.4 Behavior of sched_yield()
5. Tasks CPU affinity 5. Tasks CPU affinity
5.1 SCHED_DEADLINE and cpusets HOWTO 5.1 SCHED_DEADLINE and cpusets HOWTO
6. Future plans 6. Future plans
...@@ -426,6 +427,23 @@ CONTENTS ...@@ -426,6 +427,23 @@ CONTENTS
Finally, notice that in order not to jeopardize the admission control a Finally, notice that in order not to jeopardize the admission control a
-deadline task cannot fork. -deadline task cannot fork.
4.4 Behavior of sched_yield()
-----------------------------
When a SCHED_DEADLINE task calls sched_yield(), it gives up its
remaining runtime and is immediately throttled until the next
period, when its runtime will be replenished (a special flag,
dl_yielded, is set and used to correctly handle throttling and
runtime replenishment after a call to sched_yield()).

This behavior of sched_yield() allows the task to wake up exactly at
the beginning of the next period. It may also be useful in the future
with bandwidth reclaiming mechanisms, where sched_yield() will make
the leftover runtime available for reclamation by other
SCHED_DEADLINE tasks.
5. Tasks CPU affinity 5. Tasks CPU affinity
===================== =====================
......
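Editor's note: the yield-until-next-period behaviour documented in the hunk above is easiest to see from user space. The sketch below is not part of the patch; it sets up a periodic SCHED_DEADLINE task and calls sched_yield() once each job finishes early. The sched_attr layout and the raw sched_setattr() wrapper follow the example already present in sched-deadline.txt, __NR_sched_setattr is assumed to come from the installed kernel headers, and do_periodic_work() is a placeholder.

/*
 * Minimal user-space sketch (not from this patch): a periodic
 * SCHED_DEADLINE task that yields its leftover runtime each period.
 * Error handling is omitted; __NR_sched_setattr is assumed to be
 * provided by the kernel headers (v3.14+).
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6
#endif

struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        /* SCHED_DEADLINE parameters, in nanoseconds */
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

static int sched_setattr(pid_t pid, const struct sched_attr *attr,
                         unsigned int flags)
{
        return syscall(__NR_sched_setattr, pid, attr, flags);
}

static void do_periodic_work(void)
{
        /* placeholder for the real job body */
}

int main(void)
{
        struct sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  =  2 * 1000 * 1000,     /*  2 ms per period */
                .sched_deadline = 10 * 1000 * 1000,     /* 10 ms */
                .sched_period   = 10 * 1000 * 1000,     /* 10 ms */
        };

        if (sched_setattr(0, &attr, 0))
                return 1;

        for (;;) {
                do_periodic_work();
                /*
                 * Done early: give up the leftover runtime.  The task is
                 * throttled and wakes up at the start of the next period
                 * with a replenished runtime (the dl_yielded handling
                 * described in the documentation above).
                 */
                sched_yield();
        }
}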
...@@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, ...@@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
int cpu = smp_processor_id(); int cpu = smp_processor_id();
previous_current = curr_task(cpu); previous_current = curr_task(cpu);
set_curr_task(cpu, current); ia64_set_curr_task(cpu, current);
if ((p = strchr(current->comm, ' '))) if ((p = strchr(current->comm, ' ')))
*p = '\0'; *p = '\0';
...@@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, ...@@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
cpumask_clear_cpu(i, &mca_cpu); /* wake next cpu */ cpumask_clear_cpu(i, &mca_cpu); /* wake next cpu */
while (monarch_cpu != -1) while (monarch_cpu != -1)
cpu_relax(); /* spin until last cpu leaves */ cpu_relax(); /* spin until last cpu leaves */
set_curr_task(cpu, previous_current); ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] ia64_mc_info.imi_rendez_checkin[cpu]
= IA64_MCA_RENDEZ_CHECKIN_NOTDONE; = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
return; return;
} }
} }
} }
set_curr_task(cpu, previous_current); ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
monarch_cpu = -1; /* This frees the slaves and previous monarchs */ monarch_cpu = -1; /* This frees the slaves and previous monarchs */
} }
...@@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, ...@@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1); NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);
mprintk("Slave on cpu %d returning to normal service.\n", cpu); mprintk("Slave on cpu %d returning to normal service.\n", cpu);
set_curr_task(cpu, previous_current); ia64_set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
atomic_dec(&slaves); atomic_dec(&slaves);
return; return;
...@@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, ...@@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu);
atomic_dec(&monarchs); atomic_dec(&monarchs);
set_curr_task(cpu, previous_current); ia64_set_curr_task(cpu, previous_current);
monarch_cpu = -1; monarch_cpu = -1;
return; return;
} }
......
...@@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) ...@@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false; return false;
} }
static struct sched_domain_topology_level numa_inside_package_topology[] = { static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif #endif
...@@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = { ...@@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
#endif #endif
{ NULL, }, { NULL, },
}; };
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
/* /*
* set_sched_topology() sets the topology internal to a CPU. The * Set if a package/die has multiple NUMA nodes inside.
* NUMA topologies are layered on top of it to build the full * AMD Magny-Cours and Intel Cluster-on-Die have this.
* system topology.
*
* If NUMA nodes are observed to occur within a CPU package, this
* function should be called. It forces the sched domain code to
* only use the SMT level for the CPU portion of the topology.
* This essentially falls back to relying on NUMA information
* from the SRAT table to describe the entire system topology
* (except for hyperthreads).
*/ */
static void primarily_use_numa_for_topology(void) static bool x86_has_numa_in_package;
{
set_sched_topology(numa_inside_package_topology);
}
void set_cpu_sibling_map(int cpu) void set_cpu_sibling_map(int cpu)
{ {
...@@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu) ...@@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
c->booted_cores = cpu_data(i).booted_cores; c->booted_cores = cpu_data(i).booted_cores;
} }
if (match_die(c, o) && !topology_same_node(c, o)) if (match_die(c, o) && !topology_same_node(c, o))
primarily_use_numa_for_topology(); x86_has_numa_in_package = true;
} }
threads = cpumask_weight(topology_sibling_cpumask(cpu)); threads = cpumask_weight(topology_sibling_cpumask(cpu));
...@@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) ...@@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
} }
/*
* Set 'default' x86 topology, this matches default_topology() in that
* it has NUMA nodes as a topology level. See also
* native_smp_cpus_done().
*
* Must be done before set_cpus_sibling_map() is ran.
*/
set_sched_topology(x86_topology);
set_cpu_sibling_map(0); set_cpu_sibling_map(0);
switch (smp_sanity_check(max_cpus)) { switch (smp_sanity_check(max_cpus)) {
...@@ -1370,6 +1381,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus) ...@@ -1370,6 +1381,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{ {
pr_debug("Boot done\n"); pr_debug("Boot done\n");
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
nmi_selftest(); nmi_selftest();
impress_friends(); impress_friends();
setup_ioapic_dest(); setup_ioapic_dest();
......
...@@ -259,17 +259,14 @@ static inline void might_fault(void) { } ...@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
extern struct atomic_notifier_head panic_notifier_list; extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(int state); extern long (*panic_blink)(int state);
__printf(1, 2) __printf(1, 2)
void panic(const char *fmt, ...) void panic(const char *fmt, ...) __noreturn __cold;
__noreturn __cold;
void nmi_panic(struct pt_regs *regs, const char *msg); void nmi_panic(struct pt_regs *regs, const char *msg);
extern void oops_enter(void); extern void oops_enter(void);
extern void oops_exit(void); extern void oops_exit(void);
void print_oops_end_marker(void); void print_oops_end_marker(void);
extern int oops_may_print(void); extern int oops_may_print(void);
void do_exit(long error_code) void do_exit(long error_code) __noreturn;
__noreturn; void complete_and_exit(struct completion *, long) __noreturn;
void complete_and_exit(struct completion *, long)
__noreturn;
/* Internal, do not use. */ /* Internal, do not use. */
int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
......
...@@ -448,6 +448,8 @@ static inline void io_schedule(void) ...@@ -448,6 +448,8 @@ static inline void io_schedule(void)
io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
} }
void __noreturn do_task_dead(void);
struct nsproxy; struct nsproxy;
struct user_namespace; struct user_namespace;
...@@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head); ...@@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head);
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ #define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
...@@ -1064,6 +1067,12 @@ extern int sched_domain_level_max; ...@@ -1064,6 +1067,12 @@ extern int sched_domain_level_max;
struct sched_group; struct sched_group;
struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
};
struct sched_domain { struct sched_domain {
/* These fields must be setup */ /* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */ struct sched_domain *parent; /* top domain must be null terminated */
...@@ -1094,6 +1103,8 @@ struct sched_domain { ...@@ -1094,6 +1103,8 @@ struct sched_domain {
u64 max_newidle_lb_cost; u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost; unsigned long next_decay_max_lb_cost;
u64 avg_scan_cost; /* select_idle_sibling */
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */ /* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_count[CPU_MAX_IDLE_TYPES];
...@@ -1132,6 +1143,7 @@ struct sched_domain { ...@@ -1132,6 +1143,7 @@ struct sched_domain {
void *private; /* used during construction */ void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */ struct rcu_head rcu; /* used during destruction */
}; };
struct sched_domain_shared *shared;
unsigned int span_weight; unsigned int span_weight;
/* /*
...@@ -1165,6 +1177,7 @@ typedef int (*sched_domain_flags_f)(void); ...@@ -1165,6 +1177,7 @@ typedef int (*sched_domain_flags_f)(void);
struct sd_data { struct sd_data {
struct sched_domain **__percpu sd; struct sched_domain **__percpu sd;
struct sched_domain_shared **__percpu sds;
struct sched_group **__percpu sg; struct sched_group **__percpu sg;
struct sched_group_capacity **__percpu sgc; struct sched_group_capacity **__percpu sgc;
}; };
...@@ -2568,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p) ...@@ -2568,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p)
return p->pid == 0; return p->pid == 0;
} }
extern struct task_struct *curr_task(int cpu); extern struct task_struct *curr_task(int cpu);
extern void set_curr_task(int cpu, struct task_struct *p); extern void ia64_set_curr_task(int cpu, struct task_struct *p);
void yield(void); void yield(void);
...@@ -3206,7 +3219,11 @@ static inline int signal_pending_state(long state, struct task_struct *p) ...@@ -3206,7 +3219,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
* cond_resched_lock() will drop the spinlock before scheduling, * cond_resched_lock() will drop the spinlock before scheduling,
* cond_resched_softirq() will enable bhs before scheduling. * cond_resched_softirq() will enable bhs before scheduling.
*/ */
#ifndef CONFIG_PREEMPT
extern int _cond_resched(void); extern int _cond_resched(void);
#else
static inline int _cond_resched(void) { return 0; }
#endif
#define cond_resched() ({ \ #define cond_resched() ({ \
___might_sleep(__FILE__, __LINE__, 0); \ ___might_sleep(__FILE__, __LINE__, 0); \
...@@ -3236,6 +3253,15 @@ static inline void cond_resched_rcu(void) ...@@ -3236,6 +3253,15 @@ static inline void cond_resched_rcu(void)
#endif #endif
} }
static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
{
#ifdef CONFIG_DEBUG_PREEMPT
return p->preempt_disable_ip;
#else
return 0;
#endif
}
/* /*
* Does a critical section need to be broken due to another * Does a critical section need to be broken due to another
* task waiting?: (technically does not depend on CONFIG_PREEMPT, * task waiting?: (technically does not depend on CONFIG_PREEMPT,
......
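Editor's note: the struct sched_domain_shared introduced above is consumed by the select_idle_siblings() rewrite in kernel/sched/fair.c, which is not part of the hunks shown here. The fragment below is only an illustration of the intended access pattern, assuming the sd_llc_shared per-CPU pointer added later in this series; the helper name is made up for the sketch.

/*
 * Illustrative only -- the real consumers live in kernel/sched/fair.c.
 * One sched_domain_shared instance is shared by every CPU in an LLC,
 * so a single READ_ONCE() answers "does this cache domain have idle
 * cores?" without walking all siblings.  Caller is expected to hold
 * rcu_read_lock().
 */
static inline bool llc_has_idle_cores(int cpu, bool def)
{
        struct sched_domain_shared *sds;

        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
        if (sds)
                return READ_ONCE(sds->has_idle_cores);

        return def;
}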
...@@ -103,31 +103,42 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp) ...@@ -103,31 +103,42 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
#endif #endif
} }
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{ {
#if BITS_PER_LONG==32 && defined(CONFIG_SMP) #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_begin(&syncp->seq); return read_seqcount_begin(&syncp->seq);
#else #else
#if BITS_PER_LONG==32
preempt_disable();
#endif
return 0; return 0;
#endif #endif
} }
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
preempt_disable();
#endif
return __u64_stats_fetch_begin(syncp);
}
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start) unsigned int start)
{ {
#if BITS_PER_LONG==32 && defined(CONFIG_SMP) #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
return read_seqcount_retry(&syncp->seq, start); return read_seqcount_retry(&syncp->seq, start);
#else #else
#if BITS_PER_LONG==32
preempt_enable();
#endif
return false; return false;
#endif #endif
} }
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
preempt_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
}
/* /*
* In case irq handlers can update u64 counters, readers can use following helpers * In case irq handlers can update u64 counters, readers can use following helpers
* - SMP 32bit arches use seqcount protection, irq safe. * - SMP 32bit arches use seqcount protection, irq safe.
...@@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, ...@@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
*/ */
static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
{ {
#if BITS_PER_LONG==32 && defined(CONFIG_SMP) #if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
return read_seqcount_begin(&syncp->seq);
#else
#if BITS_PER_LONG==32
local_irq_disable(); local_irq_disable();
#endif #endif
return 0; return __u64_stats_fetch_begin(syncp);
#endif
} }
static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
unsigned int start) unsigned int start)
{ {
#if BITS_PER_LONG==32 && defined(CONFIG_SMP) #if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
return read_seqcount_retry(&syncp->seq, start);
#else
#if BITS_PER_LONG==32
local_irq_enable(); local_irq_enable();
#endif #endif
return false; return __u64_stats_fetch_retry(syncp, start);
#endif
} }
#endif /* _LINUX_U64_STATS_SYNC_H */ #endif /* _LINUX_U64_STATS_SYNC_H */
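Editor's note: the consolidated __u64_stats_fetch_begin()/__u64_stats_fetch_retry() helpers above do not change the reader-side pattern, which for reference looks like the sketch below. The stats structure and field names are illustrative, not taken from this patch.

/*
 * Illustrative reader for a 64-bit statistics block protected by
 * u64_stats_sync: retry the snapshot until no writer ran in between.
 * On 64-bit kernels the begin/retry helpers compile away entirely.
 */
#include <linux/u64_stats_sync.h>

struct pkt_stats {
        u64                     packets;
        u64                     bytes;
        struct u64_stats_sync   syncp;
};

static void pkt_stats_read(const struct pkt_stats *s, u64 *packets, u64 *bytes)
{
        unsigned int start;

        do {
                start = u64_stats_fetch_begin(&s->syncp);
                *packets = s->packets;
                *bytes = s->bytes;
        } while (u64_stats_fetch_retry(&s->syncp, start));
}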
...@@ -248,6 +248,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); ...@@ -248,6 +248,8 @@ wait_queue_head_t *bit_waitqueue(void *, int);
(!__builtin_constant_p(state) || \ (!__builtin_constant_p(state) || \
state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \
extern void init_wait_entry(wait_queue_t *__wait, int flags);
/* /*
* The below macro ___wait_event() has an explicit shadow of the __ret * The below macro ___wait_event() has an explicit shadow of the __ret
* variable when used from the wait_event_*() macros. * variable when used from the wait_event_*() macros.
...@@ -266,12 +268,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); ...@@ -266,12 +268,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
wait_queue_t __wait; \ wait_queue_t __wait; \
long __ret = ret; /* explicit shadow */ \ long __ret = ret; /* explicit shadow */ \
\ \
INIT_LIST_HEAD(&__wait.task_list); \ init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
if (exclusive) \
__wait.flags = WQ_FLAG_EXCLUSIVE; \
else \
__wait.flags = 0; \
\
for (;;) { \ for (;;) { \
long __int = prepare_to_wait_event(&wq, &__wait, state);\ long __int = prepare_to_wait_event(&wq, &__wait, state);\
\ \
...@@ -280,12 +277,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); ...@@ -280,12 +277,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
\ \
if (___wait_is_interruptible(state) && __int) { \ if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \ __ret = __int; \
if (exclusive) { \ goto __out; \
abort_exclusive_wait(&wq, &__wait, \
state, NULL); \
goto __out; \
} \
break; \
} \ } \
\ \
cmd; \ cmd; \
...@@ -989,7 +981,6 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); ...@@ -989,7 +981,6 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
......
...@@ -725,7 +725,7 @@ static void check_stack_usage(void) ...@@ -725,7 +725,7 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {} static inline void check_stack_usage(void) {}
#endif #endif
void do_exit(long code) void __noreturn do_exit(long code)
{ {
struct task_struct *tsk = current; struct task_struct *tsk = current;
int group_dead; int group_dead;
...@@ -882,29 +882,7 @@ void do_exit(long code) ...@@ -882,29 +882,7 @@ void do_exit(long code)
exit_rcu(); exit_rcu();
TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
/* do_task_dead();
* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
* when the following two conditions become true.
* - There is race condition of mmap_sem (It is acquired by
* exit_mm()), and
* - SMI occurs before setting TASK_RUNINNG.
* (or hypervisor of virtual machine switches to other guest)
* As a result, we may become TASK_RUNNING after becoming TASK_DEAD
*
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock);
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
schedule();
BUG();
/* Avoid "noreturn function does return". */
for (;;)
cpu_relax(); /* For when BUG is null */
} }
EXPORT_SYMBOL_GPL(do_exit); EXPORT_SYMBOL_GPL(do_exit);
......
...@@ -1070,8 +1070,12 @@ static int migration_cpu_stop(void *data) ...@@ -1070,8 +1070,12 @@ static int migration_cpu_stop(void *data)
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock. * we're holding p->pi_lock.
*/ */
if (task_rq(p) == rq && task_on_rq_queued(p)) if (task_rq(p) == rq) {
rq = __migrate_task(rq, p, arg->dest_cpu); if (task_on_rq_queued(p))
rq = __migrate_task(rq, p, arg->dest_cpu);
else
p->wake_cpu = arg->dest_cpu;
}
raw_spin_unlock(&rq->lock); raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock); raw_spin_unlock(&p->pi_lock);
...@@ -1112,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ...@@ -1112,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask); p->sched_class->set_cpus_allowed(p, new_mask);
if (running)
p->sched_class->set_curr_task(rq);
if (queued) if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE); enqueue_task(rq, p, ENQUEUE_RESTORE);
if (running)
set_curr_task(rq, p);
} }
/* /*
...@@ -1272,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) ...@@ -1272,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
/* /*
* Task isn't running anymore; make it appear like we migrated * Task isn't running anymore; make it appear like we migrated
* it before it went to sleep. This means on wakeup we make the * it before it went to sleep. This means on wakeup we make the
* previous cpu our targer instead of where it really is. * previous cpu our target instead of where it really is.
*/ */
p->wake_cpu = cpu; p->wake_cpu = cpu;
} }
...@@ -1636,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, ...@@ -1636,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
static void static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{ {
#ifdef CONFIG_SCHEDSTATS struct rq *rq;
struct rq *rq = this_rq();
#ifdef CONFIG_SMP if (!schedstat_enabled())
int this_cpu = smp_processor_id(); return;
rq = this_rq();
if (cpu == this_cpu) { #ifdef CONFIG_SMP
schedstat_inc(rq, ttwu_local); if (cpu == rq->cpu) {
schedstat_inc(p, se.statistics.nr_wakeups_local); schedstat_inc(rq->ttwu_local);
schedstat_inc(p->se.statistics.nr_wakeups_local);
} else { } else {
struct sched_domain *sd; struct sched_domain *sd;
schedstat_inc(p, se.statistics.nr_wakeups_remote); schedstat_inc(p->se.statistics.nr_wakeups_remote);
rcu_read_lock(); rcu_read_lock();
for_each_domain(this_cpu, sd) { for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
schedstat_inc(sd, ttwu_wake_remote); schedstat_inc(sd->ttwu_wake_remote);
break; break;
} }
} }
...@@ -1660,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ...@@ -1660,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
} }
if (wake_flags & WF_MIGRATED) if (wake_flags & WF_MIGRATED)
schedstat_inc(p, se.statistics.nr_wakeups_migrate); schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
schedstat_inc(rq, ttwu_count); schedstat_inc(rq->ttwu_count);
schedstat_inc(p, se.statistics.nr_wakeups); schedstat_inc(p->se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC) if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync); schedstat_inc(p->se.statistics.nr_wakeups_sync);
#endif /* CONFIG_SCHEDSTATS */
} }
static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
...@@ -2091,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ...@@ -2091,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu, wake_flags); ttwu_queue(p, cpu, wake_flags);
stat: stat:
if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags);
ttwu_stat(p, cpu, wake_flags);
out: out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
...@@ -2102,6 +2104,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ...@@ -2102,6 +2104,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/** /**
* try_to_wake_up_local - try to wake up a local task with rq lock held * try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened * @p: the thread to be awakened
* @cookie: context's cookie for pinning
* *
* Put @p on the run-queue if it's not already there. The caller must * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not * ensure that this_rq() is locked, @p is bound to this_rq() and not
...@@ -2140,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie ...@@ -2140,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0, cookie); ttwu_do_wakeup(rq, p, 0, cookie);
if (schedstat_enabled()) ttwu_stat(p, smp_processor_id(), 0);
ttwu_stat(p, smp_processor_id(), 0);
out: out:
raw_spin_unlock(&p->pi_lock); raw_spin_unlock(&p->pi_lock);
} }
...@@ -3199,6 +3201,9 @@ static inline void preempt_latency_stop(int val) { } ...@@ -3199,6 +3201,9 @@ static inline void preempt_latency_stop(int val) { }
*/ */
static noinline void __schedule_bug(struct task_struct *prev) static noinline void __schedule_bug(struct task_struct *prev)
{ {
/* Save this before calling printk(), since that will clobber it */
unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
if (oops_in_progress) if (oops_in_progress)
return; return;
...@@ -3209,13 +3214,12 @@ static noinline void __schedule_bug(struct task_struct *prev) ...@@ -3209,13 +3214,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules(); print_modules();
if (irqs_disabled()) if (irqs_disabled())
print_irqtrace_events(prev); print_irqtrace_events(prev);
#ifdef CONFIG_DEBUG_PREEMPT if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
if (in_atomic_preempt_off()) { && in_atomic_preempt_off()) {
pr_err("Preemption disabled at:"); pr_err("Preemption disabled at:");
print_ip_sym(current->preempt_disable_ip); print_ip_sym(preempt_disable_ip);
pr_cont("\n"); pr_cont("\n");
} }
#endif
if (panic_on_warn) if (panic_on_warn)
panic("scheduling while atomic\n"); panic("scheduling while atomic\n");
...@@ -3241,7 +3245,7 @@ static inline void schedule_debug(struct task_struct *prev) ...@@ -3241,7 +3245,7 @@ static inline void schedule_debug(struct task_struct *prev)
profile_hit(SCHED_PROFILING, __builtin_return_address(0)); profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq(), sched_count); schedstat_inc(this_rq()->sched_count);
} }
/* /*
...@@ -3334,17 +3338,6 @@ static void __sched notrace __schedule(bool preempt) ...@@ -3334,17 +3338,6 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu); rq = cpu_rq(cpu);
prev = rq->curr; prev = rq->curr;
/*
* do_exit() calls schedule() with preemption disabled as an exception;
* however we must fix that up, otherwise the next task will see an
* inconsistent (higher) preempt count.
*
* It also avoids the below schedule_debug() test from complaining
* about this.
*/
if (unlikely(prev->state == TASK_DEAD))
preempt_enable_no_resched_notrace();
schedule_debug(prev); schedule_debug(prev);
if (sched_feat(HRTICK)) if (sched_feat(HRTICK))
...@@ -3412,6 +3405,33 @@ static void __sched notrace __schedule(bool preempt) ...@@ -3412,6 +3405,33 @@ static void __sched notrace __schedule(bool preempt)
} }
STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
void __noreturn do_task_dead(void)
{
/*
* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
* when the following two conditions become true.
* - There is race condition of mmap_sem (It is acquired by
* exit_mm()), and
* - SMI occurs before setting TASK_RUNINNG.
* (or hypervisor of virtual machine switches to other guest)
* As a result, we may become TASK_RUNNING after becoming TASK_DEAD
*
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
smp_mb();
raw_spin_unlock_wait(&current->pi_lock);
/* causes final put_task_struct in finish_task_switch(). */
__set_current_state(TASK_DEAD);
current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
__schedule(false);
BUG();
/* Avoid "noreturn function does return". */
for (;;)
cpu_relax(); /* For when BUG is null */
}
static inline void sched_submit_work(struct task_struct *tsk) static inline void sched_submit_work(struct task_struct *tsk)
{ {
if (!tsk->state || tsk_is_pi_blocked(tsk)) if (!tsk->state || tsk_is_pi_blocked(tsk))
...@@ -3694,10 +3714,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) ...@@ -3694,10 +3714,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio; p->prio = prio;
if (running)
p->sched_class->set_curr_task(rq);
if (queued) if (queued)
enqueue_task(rq, p, queue_flag); enqueue_task(rq, p, queue_flag);
if (running)
set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio);
out_unlock: out_unlock:
...@@ -3711,7 +3731,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) ...@@ -3711,7 +3731,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice) void set_user_nice(struct task_struct *p, long nice)
{ {
int old_prio, delta, queued; bool queued, running;
int old_prio, delta;
struct rq_flags rf; struct rq_flags rf;
struct rq *rq; struct rq *rq;
...@@ -3733,8 +3754,11 @@ void set_user_nice(struct task_struct *p, long nice) ...@@ -3733,8 +3754,11 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock; goto out_unlock;
} }
queued = task_on_rq_queued(p); queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued) if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE); dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice); p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p); set_load_weight(p);
...@@ -3751,6 +3775,8 @@ void set_user_nice(struct task_struct *p, long nice) ...@@ -3751,6 +3775,8 @@ void set_user_nice(struct task_struct *p, long nice)
if (delta < 0 || (delta > 0 && task_running(rq, p))) if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_curr(rq); resched_curr(rq);
} }
if (running)
set_curr_task(rq, p);
out_unlock: out_unlock:
task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf);
} }
...@@ -4250,8 +4276,6 @@ static int __sched_setscheduler(struct task_struct *p, ...@@ -4250,8 +4276,6 @@ static int __sched_setscheduler(struct task_struct *p,
prev_class = p->sched_class; prev_class = p->sched_class;
__setscheduler(rq, p, attr, pi); __setscheduler(rq, p, attr, pi);
if (running)
p->sched_class->set_curr_task(rq);
if (queued) { if (queued) {
/* /*
* We enqueue to tail when the priority of a task is * We enqueue to tail when the priority of a task is
...@@ -4262,6 +4286,8 @@ static int __sched_setscheduler(struct task_struct *p, ...@@ -4262,6 +4286,8 @@ static int __sched_setscheduler(struct task_struct *p,
enqueue_task(rq, p, queue_flags); enqueue_task(rq, p, queue_flags);
} }
if (running)
set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio);
preempt_disable(); /* avoid rq from going away on us */ preempt_disable(); /* avoid rq from going away on us */
...@@ -4853,7 +4879,7 @@ SYSCALL_DEFINE0(sched_yield) ...@@ -4853,7 +4879,7 @@ SYSCALL_DEFINE0(sched_yield)
{ {
struct rq *rq = this_rq_lock(); struct rq *rq = this_rq_lock();
schedstat_inc(rq, yld_count); schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq); current->sched_class->yield_task(rq);
/* /*
...@@ -4870,6 +4896,7 @@ SYSCALL_DEFINE0(sched_yield) ...@@ -4870,6 +4896,7 @@ SYSCALL_DEFINE0(sched_yield)
return 0; return 0;
} }
#ifndef CONFIG_PREEMPT
int __sched _cond_resched(void) int __sched _cond_resched(void)
{ {
if (should_resched(0)) { if (should_resched(0)) {
...@@ -4879,6 +4906,7 @@ int __sched _cond_resched(void) ...@@ -4879,6 +4906,7 @@ int __sched _cond_resched(void)
return 0; return 0;
} }
EXPORT_SYMBOL(_cond_resched); EXPORT_SYMBOL(_cond_resched);
#endif
/* /*
* __cond_resched_lock() - if a reschedule is pending, drop the given lock, * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
...@@ -5004,7 +5032,7 @@ int __sched yield_to(struct task_struct *p, bool preempt) ...@@ -5004,7 +5032,7 @@ int __sched yield_to(struct task_struct *p, bool preempt)
yielded = curr->sched_class->yield_to_task(rq, p, preempt); yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) { if (yielded) {
schedstat_inc(rq, yld_count); schedstat_inc(rq->yld_count);
/* /*
* Make p's CPU reschedule; pick_next_entity takes care of * Make p's CPU reschedule; pick_next_entity takes care of
* fairness. * fairness.
...@@ -5424,10 +5452,10 @@ void sched_setnuma(struct task_struct *p, int nid) ...@@ -5424,10 +5452,10 @@ void sched_setnuma(struct task_struct *p, int nid)
p->numa_preferred_nid = nid; p->numa_preferred_nid = nid;
if (running)
p->sched_class->set_curr_task(rq);
if (queued) if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE); enqueue_task(rq, p, ENQUEUE_RESTORE);
if (running)
set_curr_task(rq, p);
task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf);
} }
#endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA_BALANCING */
...@@ -5724,6 +5752,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) ...@@ -5724,6 +5752,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
} }
} }
#else /* !CONFIG_SCHED_DEBUG */ #else /* !CONFIG_SCHED_DEBUG */
# define sched_debug_enabled 0
# define sched_domain_debug(sd, cpu) do { } while (0) # define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void) static inline bool sched_debug(void)
{ {
...@@ -5742,6 +5772,7 @@ static int sd_degenerate(struct sched_domain *sd) ...@@ -5742,6 +5772,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_FORK | SD_BALANCE_FORK |
SD_BALANCE_EXEC | SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY | SD_SHARE_CPUCAPACITY |
SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES | SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) { SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next) if (sd->groups != sd->groups->next)
...@@ -5772,6 +5803,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) ...@@ -5772,6 +5803,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE | SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK | SD_BALANCE_FORK |
SD_BALANCE_EXEC | SD_BALANCE_EXEC |
SD_ASYM_CPUCAPACITY |
SD_SHARE_CPUCAPACITY | SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES | SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING | SD_PREFER_SIBLING |
...@@ -5916,10 +5948,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) ...@@ -5916,10 +5948,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
} while (sg != first); } while (sg != first);
} }
static void free_sched_domain(struct rcu_head *rcu) static void destroy_sched_domain(struct sched_domain *sd)
{ {
struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
/* /*
* If its an overlapping domain it has private groups, iterate and * If its an overlapping domain it has private groups, iterate and
* nuke them all. * nuke them all.
...@@ -5930,18 +5960,26 @@ static void free_sched_domain(struct rcu_head *rcu) ...@@ -5930,18 +5960,26 @@ static void free_sched_domain(struct rcu_head *rcu)
kfree(sd->groups->sgc); kfree(sd->groups->sgc);
kfree(sd->groups); kfree(sd->groups);
} }
if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
kfree(sd->shared);
kfree(sd); kfree(sd);
} }
static void destroy_sched_domain(struct sched_domain *sd, int cpu) static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{ {
call_rcu(&sd->rcu, free_sched_domain); struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
while (sd) {
struct sched_domain *parent = sd->parent;
destroy_sched_domain(sd);
sd = parent;
}
} }
static void destroy_sched_domains(struct sched_domain *sd, int cpu) static void destroy_sched_domains(struct sched_domain *sd)
{ {
for (; sd; sd = sd->parent) if (sd)
destroy_sched_domain(sd, cpu); call_rcu(&sd->rcu, destroy_sched_domains_rcu);
} }
/* /*
...@@ -5956,14 +5994,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) ...@@ -5956,14 +5994,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_busy);
DEFINE_PER_CPU(struct sched_domain *, sd_asym); DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu) static void update_top_cache_domain(int cpu)
{ {
struct sched_domain_shared *sds = NULL;
struct sched_domain *sd; struct sched_domain *sd;
struct sched_domain *busy_sd = NULL;
int id = cpu; int id = cpu;
int size = 1; int size = 1;
...@@ -5971,13 +6009,13 @@ static void update_top_cache_domain(int cpu) ...@@ -5971,13 +6009,13 @@ static void update_top_cache_domain(int cpu)
if (sd) { if (sd) {
id = cpumask_first(sched_domain_span(sd)); id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd));
busy_sd = sd->parent; /* sd_busy */ sds = sd->shared;
} }
rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id; per_cpu(sd_llc_id, cpu) = id;
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_NUMA); sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
...@@ -6013,7 +6051,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ...@@ -6013,7 +6051,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
*/ */
if (parent->flags & SD_PREFER_SIBLING) if (parent->flags & SD_PREFER_SIBLING)
tmp->flags |= SD_PREFER_SIBLING; tmp->flags |= SD_PREFER_SIBLING;
destroy_sched_domain(parent, cpu); destroy_sched_domain(parent);
} else } else
tmp = tmp->parent; tmp = tmp->parent;
} }
...@@ -6021,7 +6059,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ...@@ -6021,7 +6059,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd && sd_degenerate(sd)) { if (sd && sd_degenerate(sd)) {
tmp = sd; tmp = sd;
sd = sd->parent; sd = sd->parent;
destroy_sched_domain(tmp, cpu); destroy_sched_domain(tmp);
if (sd) if (sd)
sd->child = NULL; sd->child = NULL;
} }
...@@ -6031,7 +6069,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ...@@ -6031,7 +6069,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rq_attach_root(rq, rd); rq_attach_root(rq, rd);
tmp = rq->sd; tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd); rcu_assign_pointer(rq->sd, sd);
destroy_sched_domains(tmp, cpu); destroy_sched_domains(tmp);
update_top_cache_domain(cpu); update_top_cache_domain(cpu);
} }
...@@ -6274,7 +6312,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) ...@@ -6274,7 +6312,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
return; return;
update_group_capacity(sd, cpu); update_group_capacity(sd, cpu);
atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
} }
/* /*
...@@ -6362,6 +6399,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) ...@@ -6362,6 +6399,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
*per_cpu_ptr(sdd->sd, cpu) = NULL; *per_cpu_ptr(sdd->sd, cpu) = NULL;
if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
*per_cpu_ptr(sdd->sds, cpu) = NULL;
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL; *per_cpu_ptr(sdd->sg, cpu) = NULL;
...@@ -6381,26 +6421,37 @@ static int sched_domains_curr_level; ...@@ -6381,26 +6421,37 @@ static int sched_domains_curr_level;
/* /*
* SD_flags allowed in topology descriptions. * SD_flags allowed in topology descriptions.
* *
* SD_SHARE_CPUCAPACITY - describes SMT topologies * These flags are purely descriptive of the topology and do not prescribe
* SD_SHARE_PKG_RESOURCES - describes shared caches * behaviour. Behaviour is artificial and mapped in the below sd_init()
* SD_NUMA - describes NUMA topologies * function:
* SD_SHARE_POWERDOMAIN - describes shared power domain *
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
* SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
* *
* Odd one out: * SD_ASYM_PACKING - describes SMT quirks
* SD_ASYM_PACKING - describes SMT quirks
*/ */
#define TOPOLOGY_SD_FLAGS \ #define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \ (SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \ SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \ SD_NUMA | \
SD_ASYM_PACKING | \ SD_ASYM_PACKING | \
SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN) SD_SHARE_POWERDOMAIN)
static struct sched_domain * static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl, int cpu) sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
struct sched_domain *child, int cpu)
{ {
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); struct sd_data *sdd = &tl->data;
int sd_weight, sd_flags = 0; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
/* /*
...@@ -6449,15 +6500,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) ...@@ -6449,15 +6500,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
.smt_gain = 0, .smt_gain = 0,
.max_newidle_lb_cost = 0, .max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies, .next_decay_max_lb_cost = jiffies,
.child = child,
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
.name = tl->name, .name = tl->name,
#endif #endif
}; };
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
/* /*
* Convert topological properties into behaviour. * Convert topological properties into behaviour.
*/ */
if (sd->flags & SD_ASYM_CPUCAPACITY) {
struct sched_domain *t = sd;
for_each_lower_domain(t)
t->flags |= SD_BALANCE_WAKE;
}
if (sd->flags & SD_SHARE_CPUCAPACITY) { if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->flags |= SD_PREFER_SIBLING; sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110; sd->imbalance_pct = 110;
...@@ -6489,7 +6551,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) ...@@ -6489,7 +6551,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
sd->idle_idx = 1; sd->idle_idx = 1;
} }
sd->private = &tl->data; /*
* For all levels sharing cache; connect a sched_domain_shared
* instance.
*/
if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
}
sd->private = sdd;
return sd; return sd;
} }
...@@ -6516,6 +6588,9 @@ static struct sched_domain_topology_level *sched_domain_topology = ...@@ -6516,6 +6588,9 @@ static struct sched_domain_topology_level *sched_domain_topology =
void set_sched_topology(struct sched_domain_topology_level *tl) void set_sched_topology(struct sched_domain_topology_level *tl)
{ {
if (WARN_ON_ONCE(sched_smp_initialized))
return;
sched_domain_topology = tl; sched_domain_topology = tl;
} }
...@@ -6796,6 +6871,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) ...@@ -6796,6 +6871,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sd) if (!sdd->sd)
return -ENOMEM; return -ENOMEM;
sdd->sds = alloc_percpu(struct sched_domain_shared *);
if (!sdd->sds)
return -ENOMEM;
sdd->sg = alloc_percpu(struct sched_group *); sdd->sg = alloc_percpu(struct sched_group *);
if (!sdd->sg) if (!sdd->sg)
return -ENOMEM; return -ENOMEM;
...@@ -6806,6 +6885,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) ...@@ -6806,6 +6885,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
for_each_cpu(j, cpu_map) { for_each_cpu(j, cpu_map) {
struct sched_domain *sd; struct sched_domain *sd;
struct sched_domain_shared *sds;
struct sched_group *sg; struct sched_group *sg;
struct sched_group_capacity *sgc; struct sched_group_capacity *sgc;
...@@ -6816,6 +6896,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) ...@@ -6816,6 +6896,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sd, j) = sd; *per_cpu_ptr(sdd->sd, j) = sd;
sds = kzalloc_node(sizeof(struct sched_domain_shared),
GFP_KERNEL, cpu_to_node(j));
if (!sds)
return -ENOMEM;
*per_cpu_ptr(sdd->sds, j) = sds;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j)); GFP_KERNEL, cpu_to_node(j));
if (!sg) if (!sg)
...@@ -6855,6 +6942,8 @@ static void __sdt_free(const struct cpumask *cpu_map) ...@@ -6855,6 +6942,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
kfree(*per_cpu_ptr(sdd->sd, j)); kfree(*per_cpu_ptr(sdd->sd, j));
} }
if (sdd->sds)
kfree(*per_cpu_ptr(sdd->sds, j));
if (sdd->sg) if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j)); kfree(*per_cpu_ptr(sdd->sg, j));
if (sdd->sgc) if (sdd->sgc)
...@@ -6862,6 +6951,8 @@ static void __sdt_free(const struct cpumask *cpu_map) ...@@ -6862,6 +6951,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
} }
free_percpu(sdd->sd); free_percpu(sdd->sd);
sdd->sd = NULL; sdd->sd = NULL;
free_percpu(sdd->sds);
sdd->sds = NULL;
free_percpu(sdd->sg); free_percpu(sdd->sg);
sdd->sg = NULL; sdd->sg = NULL;
free_percpu(sdd->sgc); free_percpu(sdd->sgc);
...@@ -6873,16 +6964,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, ...@@ -6873,16 +6964,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr, const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu) struct sched_domain *child, int cpu)
{ {
struct sched_domain *sd = sd_init(tl, cpu); struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
if (!sd)
return child;
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) { if (child) {
sd->level = child->level + 1; sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level); sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd; child->parent = sd;
sd->child = child;
if (!cpumask_subset(sched_domain_span(child), if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) { sched_domain_span(sd))) {
...@@ -6913,6 +7000,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, ...@@ -6913,6 +7000,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state; enum s_alloc alloc_state;
struct sched_domain *sd; struct sched_domain *sd;
struct s_data d; struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM; int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map); alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
...@@ -6963,11 +7051,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, ...@@ -6963,11 +7051,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */ /* Attach the domains */
rcu_read_lock(); rcu_read_lock();
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i); sd = *per_cpu_ptr(d.sd, i);
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
cpu_attach_domain(sd, d.rd, i); cpu_attach_domain(sd, d.rd, i);
} }
rcu_read_unlock(); rcu_read_unlock();
if (rq && sched_debug_enabled) {
pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
ret = 0; ret = 0;
error: error:
__free_domain_allocs(&d, alloc_state, cpu_map); __free_domain_allocs(&d, alloc_state, cpu_map);
...@@ -7326,6 +7425,22 @@ int sched_cpu_dying(unsigned int cpu) ...@@ -7326,6 +7425,22 @@ int sched_cpu_dying(unsigned int cpu)
} }
#endif #endif
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
static void sched_init_smt(void)
{
/*
* We've enumerated all CPUs and will assume that if any CPU
* has SMT siblings, CPU0 will too.
*/
if (cpumask_weight(cpu_smt_mask(0)) > 1)
static_branch_enable(&sched_smt_present);
}
#else
static inline void sched_init_smt(void) { }
#endif
void __init sched_init_smp(void) void __init sched_init_smp(void)
{ {
cpumask_var_t non_isolated_cpus; cpumask_var_t non_isolated_cpus;
...@@ -7355,6 +7470,9 @@ void __init sched_init_smp(void) ...@@ -7355,6 +7470,9 @@ void __init sched_init_smp(void)
init_sched_rt_class(); init_sched_rt_class();
init_sched_dl_class(); init_sched_dl_class();
sched_init_smt();
sched_smp_initialized = true; sched_smp_initialized = true;
} }
...@@ -7392,6 +7510,7 @@ static struct kmem_cache *task_group_cache __read_mostly; ...@@ -7392,6 +7510,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
#endif #endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void) void __init sched_init(void)
{ {
...@@ -7428,6 +7547,8 @@ void __init sched_init(void) ...@@ -7428,6 +7547,8 @@ void __init sched_init(void)
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i)); cpumask_size(), GFP_KERNEL, cpu_to_node(i));
per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
} }
#endif /* CONFIG_CPUMASK_OFFSTACK */ #endif /* CONFIG_CPUMASK_OFFSTACK */
...@@ -7530,21 +7651,12 @@ void __init sched_init(void) ...@@ -7530,21 +7651,12 @@ void __init sched_init(void)
set_load_weight(&init_task); set_load_weight(&init_task);
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
/* /*
* The boot idle thread does lazy MMU switching as well: * The boot idle thread does lazy MMU switching as well:
*/ */
atomic_inc(&init_mm.mm_count); atomic_inc(&init_mm.mm_count);
enter_lazy_tlb(&init_mm, current); enter_lazy_tlb(&init_mm, current);
/*
* During early bootup we pretend to be a normal task:
*/
current->sched_class = &fair_sched_class;
/* /*
* Make us the idle thread. Technically, schedule() should not be * Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be, * called from this thread, however somewhere below it might be,
...@@ -7599,6 +7711,7 @@ EXPORT_SYMBOL(__might_sleep); ...@@ -7599,6 +7711,7 @@ EXPORT_SYMBOL(__might_sleep);
void ___might_sleep(const char *file, int line, int preempt_offset) void ___might_sleep(const char *file, int line, int preempt_offset)
{ {
static unsigned long prev_jiffy; /* ratelimiting */ static unsigned long prev_jiffy; /* ratelimiting */
unsigned long preempt_disable_ip;
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
...@@ -7609,6 +7722,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) ...@@ -7609,6 +7722,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
return; return;
prev_jiffy = jiffies; prev_jiffy = jiffies;
/* Save this before calling printk(), since that will clobber it */
preempt_disable_ip = get_preempt_disable_ip(current);
printk(KERN_ERR printk(KERN_ERR
"BUG: sleeping function called from invalid context at %s:%d\n", "BUG: sleeping function called from invalid context at %s:%d\n",
file, line); file, line);
...@@ -7623,14 +7739,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) ...@@ -7623,14 +7739,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current); debug_show_held_locks(current);
if (irqs_disabled()) if (irqs_disabled())
print_irqtrace_events(current); print_irqtrace_events(current);
#ifdef CONFIG_DEBUG_PREEMPT if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
if (!preempt_count_equals(preempt_offset)) { && !preempt_count_equals(preempt_offset)) {
pr_err("Preemption disabled at:"); pr_err("Preemption disabled at:");
print_ip_sym(current->preempt_disable_ip); print_ip_sym(preempt_disable_ip);
pr_cont("\n"); pr_cont("\n");
} }
#endif
dump_stack(); dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
} }
EXPORT_SYMBOL(___might_sleep); EXPORT_SYMBOL(___might_sleep);
#endif #endif
...@@ -7651,12 +7767,10 @@ void normalize_rt_tasks(void) ...@@ -7651,12 +7767,10 @@ void normalize_rt_tasks(void)
if (p->flags & PF_KTHREAD) if (p->flags & PF_KTHREAD)
continue; continue;
p->se.exec_start = 0; p->se.exec_start = 0;
#ifdef CONFIG_SCHEDSTATS schedstat_set(p->se.statistics.wait_start, 0);
p->se.statistics.wait_start = 0; schedstat_set(p->se.statistics.sleep_start, 0);
p->se.statistics.sleep_start = 0; schedstat_set(p->se.statistics.block_start, 0);
p->se.statistics.block_start = 0;
#endif
if (!dl_task(p) && !rt_task(p)) { if (!dl_task(p) && !rt_task(p)) {
/* /*
...@@ -7717,7 +7831,7 @@ struct task_struct *curr_task(int cpu) ...@@ -7717,7 +7831,7 @@ struct task_struct *curr_task(int cpu)
* *
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
*/ */
void set_curr_task(int cpu, struct task_struct *p) void ia64_set_curr_task(int cpu, struct task_struct *p)
{ {
cpu_curr(cpu) = p; cpu_curr(cpu) = p;
} }
...@@ -7848,10 +7962,10 @@ void sched_move_task(struct task_struct *tsk) ...@@ -7848,10 +7962,10 @@ void sched_move_task(struct task_struct *tsk)
sched_change_group(tsk, TASK_MOVE_GROUP); sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued) if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
if (unlikely(running))
set_curr_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf); task_rq_unlock(rq, tsk, &rf);
} }
......
...@@ -31,56 +31,81 @@ static inline int right_child(int i) ...@@ -31,56 +31,81 @@ static inline int right_child(int i)
return (i << 1) + 2; return (i << 1) + 2;
} }
static void cpudl_exchange(struct cpudl *cp, int a, int b) static void cpudl_heapify_down(struct cpudl *cp, int idx)
{ {
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; int l, r, largest;
swap(cp->elements[a].cpu, cp->elements[b].cpu); int orig_cpu = cp->elements[idx].cpu;
swap(cp->elements[a].dl , cp->elements[b].dl ); u64 orig_dl = cp->elements[idx].dl;
swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); if (left_child(idx) >= cp->size)
} return;
static void cpudl_heapify(struct cpudl *cp, int idx)
{
int l, r, largest;
/* adapted from lib/prio_heap.c */ /* adapted from lib/prio_heap.c */
while(1) { while(1) {
u64 largest_dl;
l = left_child(idx); l = left_child(idx);
r = right_child(idx); r = right_child(idx);
largest = idx; largest = idx;
largest_dl = orig_dl;
if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, if ((l < cp->size) && dl_time_before(orig_dl,
cp->elements[l].dl)) cp->elements[l].dl)) {
largest = l; largest = l;
if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, largest_dl = cp->elements[l].dl;
cp->elements[r].dl)) }
if ((r < cp->size) && dl_time_before(largest_dl,
cp->elements[r].dl))
largest = r; largest = r;
if (largest == idx) if (largest == idx)
break; break;
/* Push idx down the heap one level and bump one up */ /* pull largest child onto idx */
cpudl_exchange(cp, largest, idx); cp->elements[idx].cpu = cp->elements[largest].cpu;
cp->elements[idx].dl = cp->elements[largest].dl;
cp->elements[cp->elements[idx].cpu].idx = idx;
idx = largest; idx = largest;
} }
/* actual push down of saved original values orig_* */
cp->elements[idx].cpu = orig_cpu;
cp->elements[idx].dl = orig_dl;
cp->elements[cp->elements[idx].cpu].idx = idx;
} }
static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) static void cpudl_heapify_up(struct cpudl *cp, int idx)
{ {
WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); int p;
if (dl_time_before(new_dl, cp->elements[idx].dl)) { int orig_cpu = cp->elements[idx].cpu;
cp->elements[idx].dl = new_dl; u64 orig_dl = cp->elements[idx].dl;
cpudl_heapify(cp, idx);
} else { if (idx == 0)
cp->elements[idx].dl = new_dl; return;
while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
cp->elements[idx].dl)) { do {
cpudl_exchange(cp, idx, parent(idx)); p = parent(idx);
idx = parent(idx); if (dl_time_before(orig_dl, cp->elements[p].dl))
} break;
} /* pull parent onto idx */
cp->elements[idx].cpu = cp->elements[p].cpu;
cp->elements[idx].dl = cp->elements[p].dl;
cp->elements[cp->elements[idx].cpu].idx = idx;
idx = p;
} while (idx != 0);
/* actual push up of saved original values orig_* */
cp->elements[idx].cpu = orig_cpu;
cp->elements[idx].dl = orig_dl;
cp->elements[cp->elements[idx].cpu].idx = idx;
}
static void cpudl_heapify(struct cpudl *cp, int idx)
{
if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
cp->elements[idx].dl))
cpudl_heapify_up(cp, idx);
else
cpudl_heapify_down(cp, idx);
} }
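The rewritten cpudl_heapify_down()/cpudl_heapify_up() drop the per-level cpudl_exchange() swaps: the element being sifted is saved once, larger children (or smaller parents, going up) are pulled into the hole, and the saved element is written back with a single store at the end. A minimal standalone sketch of the same sift-down-with-a-hole technique on a plain max-heap of u64 keys (array layout and names are invented for illustration):

        #include <stdio.h>
        #include <stdint.h>

        #define HEAP_SIZE 7

        /* Sift the element at idx down a max-heap, pulling larger children up
         * into the hole instead of swapping at every level, as
         * cpudl_heapify_down() does for the cpu/deadline pairs above. */
        static void heapify_down(uint64_t *heap, int size, int idx)
        {
                uint64_t orig = heap[idx];

                while (2 * idx + 1 < size) {
                        int l = 2 * idx + 1;
                        int r = l + 1;
                        int largest = idx;
                        uint64_t largest_key = orig;

                        if (heap[l] > largest_key) {
                                largest = l;
                                largest_key = heap[l];
                        }
                        if (r < size && heap[r] > largest_key)
                                largest = r;
                        if (largest == idx)
                                break;

                        heap[idx] = heap[largest];      /* pull child into the hole */
                        idx = largest;
                }
                heap[idx] = orig;                       /* single write-back at the end */
        }

        int main(void)
        {
                uint64_t heap[HEAP_SIZE] = { 3, 90, 80, 40, 50, 60, 70 };
                int i;

                heapify_down(heap, HEAP_SIZE, 0);       /* 3 violates the heap property */
                for (i = 0; i < HEAP_SIZE; i++)
                        printf("%llu ", (unsigned long long)heap[i]);
                printf("\n");
                return 0;
        }

Pulling children into the hole needs roughly half the stores of a swap per level plus one final write-back, which is the shape the cpudl helpers above follow while also maintaining the cpu-to-index map.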
static inline int cpudl_maximum(struct cpudl *cp) static inline int cpudl_maximum(struct cpudl *cp)
...@@ -120,16 +145,15 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, ...@@ -120,16 +145,15 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
} }
/* /*
* cpudl_set - update the cpudl max-heap * cpudl_clear - remove a cpu from the cpudl max-heap
* @cp: the cpudl max-heap context * @cp: the cpudl max-heap context
* @cpu: the target cpu * @cpu: the target cpu
* @dl: the new earliest deadline for this cpu
* *
* Notes: assumes cpu_rq(cpu)->lock is locked * Notes: assumes cpu_rq(cpu)->lock is locked
* *
* Returns: (void) * Returns: (void)
*/ */
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) void cpudl_clear(struct cpudl *cp, int cpu)
{ {
int old_idx, new_cpu; int old_idx, new_cpu;
unsigned long flags; unsigned long flags;
...@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) ...@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu)); WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags); raw_spin_lock_irqsave(&cp->lock, flags);
old_idx = cp->elements[cpu].idx; old_idx = cp->elements[cpu].idx;
if (!is_valid) { if (old_idx == IDX_INVALID) {
/* remove item */ /*
if (old_idx == IDX_INVALID) { * Nothing to remove if old_idx was invalid.
/* * This could happen if a rq_offline_dl is
* Nothing to remove if old_idx was invalid. * called for a CPU without -dl tasks running.
* This could happen if a rq_offline_dl is */
* called for a CPU without -dl tasks running. } else {
*/
goto out;
}
new_cpu = cp->elements[cp->size - 1].cpu; new_cpu = cp->elements[cp->size - 1].cpu;
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu; cp->elements[old_idx].cpu = new_cpu;
cp->size--; cp->size--;
cp->elements[new_cpu].idx = old_idx; cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID; cp->elements[cpu].idx = IDX_INVALID;
while (old_idx > 0 && dl_time_before( cpudl_heapify(cp, old_idx);
cp->elements[parent(old_idx)].dl,
cp->elements[old_idx].dl)) {
cpudl_exchange(cp, old_idx, parent(old_idx));
old_idx = parent(old_idx);
}
cpumask_set_cpu(cpu, cp->free_cpus);
cpudl_heapify(cp, old_idx);
goto out; cpumask_set_cpu(cpu, cp->free_cpus);
} }
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
/*
* cpudl_set - update the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target cpu
* @dl: the new earliest deadline for this cpu
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
{
int old_idx;
unsigned long flags;
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
old_idx = cp->elements[cpu].idx;
if (old_idx == IDX_INVALID) { if (old_idx == IDX_INVALID) {
cp->size++; int new_idx = cp->size++;
cp->elements[cp->size - 1].dl = dl; cp->elements[new_idx].dl = dl;
cp->elements[cp->size - 1].cpu = cpu; cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = cp->size - 1; cp->elements[cpu].idx = new_idx;
cpudl_change_key(cp, cp->size - 1, dl); cpudl_heapify_up(cp, new_idx);
cpumask_clear_cpu(cpu, cp->free_cpus); cpumask_clear_cpu(cpu, cp->free_cpus);
} else { } else {
cpudl_change_key(cp, old_idx, dl); cp->elements[old_idx].dl = dl;
cpudl_heapify(cp, old_idx);
} }
out:
raw_spin_unlock_irqrestore(&cp->lock, flags); raw_spin_unlock_irqrestore(&cp->lock, flags);
} }
......
...@@ -23,7 +23,8 @@ struct cpudl { ...@@ -23,7 +23,8 @@ struct cpudl {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p, int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask); struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
int cpudl_init(struct cpudl *cp); int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
......
...@@ -23,10 +23,8 @@ ...@@ -23,10 +23,8 @@
* task when irq is in progress while we read rq->clock. That is a worthy * task when irq is in progress while we read rq->clock. That is a worthy
* compromise in place of having locks on each irq in account_system_time. * compromise in place of having locks on each irq in account_system_time.
*/ */
DEFINE_PER_CPU(u64, cpu_hardirq_time); DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
DEFINE_PER_CPU(u64, cpu_softirq_time);
static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime; static int sched_clock_irqtime;
void enable_sched_clock_irqtime(void) void enable_sched_clock_irqtime(void)
...@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void) ...@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0; sched_clock_irqtime = 0;
} }
#ifndef CONFIG_64BIT
DEFINE_PER_CPU(seqcount_t, irq_time_seq);
#endif /* CONFIG_64BIT */
/* /*
* Called before incrementing preempt_count on {soft,}irq_enter * Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit. * and before decrementing preempt_count on {soft,}irq_exit.
*/ */
void irqtime_account_irq(struct task_struct *curr) void irqtime_account_irq(struct task_struct *curr)
{ {
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
s64 delta; s64 delta;
int cpu; int cpu;
...@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) ...@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
return; return;
cpu = smp_processor_id(); cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
__this_cpu_add(irq_start_time, delta); irqtime->irq_start_time += delta;
irq_time_write_begin(); u64_stats_update_begin(&irqtime->sync);
/* /*
* We do not account for softirq time from ksoftirqd here. * We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread * We want to continue accounting softirq time to ksoftirqd thread
...@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr) ...@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr)
* that do not consume any time, but still wants to run. * that do not consume any time, but still wants to run.
*/ */
if (hardirq_count()) if (hardirq_count())
__this_cpu_add(cpu_hardirq_time, delta); irqtime->hardirq_time += delta;
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta); irqtime->softirq_time += delta;
irq_time_write_end(); u64_stats_update_end(&irqtime->sync);
} }
EXPORT_SYMBOL_GPL(irqtime_account_irq); EXPORT_SYMBOL_GPL(irqtime_account_irq);
static cputime_t irqtime_account_hi_update(cputime_t maxtime) static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
{ {
u64 *cpustat = kcpustat_this_cpu->cpustat; u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
cputime_t irq_cputime; cputime_t irq_cputime;
local_irq_save(flags); irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
cpustat[CPUTIME_IRQ];
irq_cputime = min(irq_cputime, maxtime); irq_cputime = min(irq_cputime, maxtime);
cpustat[CPUTIME_IRQ] += irq_cputime; cpustat[idx] += irq_cputime;
local_irq_restore(flags);
return irq_cputime; return irq_cputime;
} }
static cputime_t irqtime_account_si_update(cputime_t maxtime) static cputime_t irqtime_account_hi_update(cputime_t maxtime)
{ {
u64 *cpustat = kcpustat_this_cpu->cpustat; return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
unsigned long flags; CPUTIME_IRQ, maxtime);
cputime_t softirq_cputime; }
local_irq_save(flags); static cputime_t irqtime_account_si_update(cputime_t maxtime)
softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - {
cpustat[CPUTIME_SOFTIRQ]; return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
softirq_cputime = min(softirq_cputime, maxtime); CPUTIME_SOFTIRQ, maxtime);
cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
local_irq_restore(flags);
return softirq_cputime;
} }
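Folding the separate per-CPU counters into a single struct irqtime guarded by the u64_stats API means 32-bit readers retry around a sequence count instead of disabling IRQs for every read. A toy, single-threaded sketch of that sequence protocol (struct and function names invented; the real u64_stats code also needs memory barriers, which are omitted here):

        #include <stdio.h>
        #include <stdint.h>

        /* Toy version of the seqcount protocol the u64_stats API builds on:
         * the writer bumps 'seq' around updates, readers retry while 'seq'
         * is odd or changed under them. Illustrative only. */
        struct toy_irqtime {
                uint64_t hardirq_time;
                uint64_t softirq_time;
                unsigned int seq;
        };

        static void writer_add_hardirq(struct toy_irqtime *t, uint64_t delta)
        {
                t->seq++;                       /* begin: seq becomes odd */
                t->hardirq_time += delta;
                t->seq++;                       /* end: seq becomes even again */
        }

        static uint64_t reader_hardirq(const struct toy_irqtime *t)
        {
                unsigned int start;
                uint64_t val;

                do {
                        do {
                                start = t->seq;
                        } while (start & 1);    /* writer in progress: wait */
                        val = t->hardirq_time;
                } while (t->seq != start);      /* a write slipped in: retry */

                return val;
        }

        int main(void)
        {
                struct toy_irqtime t = { 0, 0, 0 };

                writer_add_hardirq(&t, 1000);
                writer_add_hardirq(&t, 500);
                printf("hardirq_time = %llu\n",
                       (unsigned long long)reader_hardirq(&t));
                return 0;
        }

On 64-bit kernels the u64_stats sequence counter essentially compiles away, since an aligned 64-bit load cannot be torn.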
#else /* CONFIG_IRQ_TIME_ACCOUNTING */ #else /* CONFIG_IRQ_TIME_ACCOUNTING */
...@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max) ...@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max)
{ {
cputime_t accounted; cputime_t accounted;
/* Shall be converted to a lockdep-enabled lightweight check */
WARN_ON_ONCE(!irqs_disabled());
accounted = steal_account_process_time(max); accounted = steal_account_process_time(max);
if (accounted < max) if (accounted < max)
...@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max) ...@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max)
return accounted; return accounted;
} }
#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
return t->se.sum_exec_runtime;
}
#else
static u64 read_sum_exec_runtime(struct task_struct *t)
{
u64 ns;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(t, &rf);
ns = t->se.sum_exec_runtime;
task_rq_unlock(rq, t, &rf);
return ns;
}
#endif
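read_sum_exec_runtime() encodes a common split: a 64-bit kernel can load a u64 in one instruction, so an approximate lockless read is fine, while 32-bit must serialize against the writer to avoid a torn value. A userspace sketch of the same compile-time choice, with a pthread mutex standing in for the runqueue lock (all names are invented; link with -pthread):

        #include <stdio.h>
        #include <stdint.h>
        #include <pthread.h>

        /* Stand-in for the runqueue-protected counter. */
        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static uint64_t sum_exec_runtime;

        static uint64_t read_runtime(void)
        {
        #if UINTPTR_MAX == 0xffffffffffffffffULL
                /* 64-bit: a single aligned load cannot be torn, so an
                 * approximate lockless read is acceptable here. */
                return sum_exec_runtime;
        #else
                /* 32-bit: serialize against writers to avoid a torn read. */
                uint64_t ns;

                pthread_mutex_lock(&lock);
                ns = sum_exec_runtime;
                pthread_mutex_unlock(&lock);
                return ns;
        #endif
        }

        int main(void)
        {
                pthread_mutex_lock(&lock);
                sum_exec_runtime = 123456789ULL;
                pthread_mutex_unlock(&lock);

                printf("%llu\n", (unsigned long long)read_runtime());
                return 0;
        }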
/* /*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group. * tasks (sum on group iteration) belonging to @tsk's group.
...@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) ...@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
unsigned int seq, nextseq; unsigned int seq, nextseq;
unsigned long flags; unsigned long flags;
/*
* Update current task runtime to account pending time since last
* scheduler action or thread_group_cputime() call. This thread group
* might have other running tasks on different CPUs, but updating
* their runtime can affect syscall performance, so we skip accounting
* those pending times and rely only on values updated on tick or
* other scheduler action.
*/
if (same_thread_group(current, tsk))
(void) task_sched_runtime(current);
rcu_read_lock(); rcu_read_lock();
/* Attempt a lockless read on the first round. */ /* Attempt a lockless read on the first round. */
nextseq = 0; nextseq = 0;
...@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) ...@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
task_cputime(t, &utime, &stime); task_cputime(t, &utime, &stime);
times->utime += utime; times->utime += utime;
times->stime += stime; times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t); times->sum_exec_runtime += read_sum_exec_runtime(t);
} }
/* If lockless access failed, take the lock. */ /* If lockless access failed, take the lock. */
nextseq = 1; nextseq = 1;
......
...@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); ...@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{ {
struct rq *later_rq = NULL; struct rq *later_rq = NULL;
bool fallback = false;
later_rq = find_lock_later_rq(p, rq); later_rq = find_lock_later_rq(p, rq);
if (!later_rq) { if (!later_rq) {
int cpu; int cpu;
...@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p ...@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any * If we cannot preempt any rq, fall back to pick any
* online cpu. * online cpu.
*/ */
fallback = true;
cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
if (cpu >= nr_cpu_ids) { if (cpu >= nr_cpu_ids) {
/* /*
...@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p ...@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
double_lock_balance(rq, later_rq); double_lock_balance(rq, later_rq);
} }
/*
* By now the task is replenished and enqueued; migrate it.
*/
deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu); set_task_cpu(p, later_rq->cpu);
activate_task(later_rq, p, 0);
if (!fallback)
resched_curr(later_rq);
double_unlock_balance(later_rq, rq); double_unlock_balance(later_rq, rq);
return later_rq; return later_rq;
...@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, ...@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
* one, and to (try to!) reconcile itself with its own scheduling * one, and to (try to!) reconcile itself with its own scheduling
* parameters. * parameters.
*/ */
static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct sched_dl_entity *pi_se)
{ {
struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq); struct rq *rq = rq_of_dl_rq(dl_rq);
WARN_ON(dl_se->dl_boosted);
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/* /*
...@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, ...@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
* future; in fact, we must consider execution overheads (time * future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.). * spent on hardirq context, etc.).
*/ */
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime; dl_se->runtime = dl_se->dl_runtime;
} }
/* /*
...@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) ...@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock; goto unlock;
} }
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/*
* Perform balancing operations here; after the replenishments. We
* cannot drop rq->lock before this, otherwise the assertion in
* start_dl_timer() about not missing updates is not true.
*
* If we find that the rq the task was on is no longer available, we
* need to select a new rq.
*
* XXX figure out if select_task_rq_dl() deals with offline cpus.
*/
if (unlikely(!rq->online)) { if (unlikely(!rq->online)) {
/*
* If the runqueue is no longer available, migrate the
* task elsewhere. This necessarily changes rq.
*/
lockdep_unpin_lock(&rq->lock, rf.cookie); lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p); rq = dl_task_offline_migration(rq, p);
rf.cookie = lockdep_pin_lock(&rq->lock); rf.cookie = lockdep_pin_lock(&rq->lock);
/*
* Now that the task has been migrated to the new RQ and we
* have that locked, proceed as normal and enqueue the task
* there.
*/
} }
#endif
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
#ifdef CONFIG_SMP
/* /*
* Queueing this task back might have overloaded rq, check if we need * Queueing this task back might have overloaded rq, check if we need
* to kick someone away. * to kick someone away.
...@@ -797,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) ...@@ -797,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (dl_rq->earliest_dl.curr == 0 || if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) { dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
dl_rq->earliest_dl.curr = deadline; dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
} }
} }
...@@ -812,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) ...@@ -812,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) { if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0; dl_rq->earliest_dl.next = 0;
cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); cpudl_clear(&rq->rd->cpudl, rq->cpu);
} else { } else {
struct rb_node *leftmost = dl_rq->rb_leftmost; struct rb_node *leftmost = dl_rq->rb_leftmost;
struct sched_dl_entity *entry; struct sched_dl_entity *entry;
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline; dl_rq->earliest_dl.curr = entry->deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
} }
} }
...@@ -1670,7 +1660,7 @@ static void rq_online_dl(struct rq *rq) ...@@ -1670,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0) if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
} }
/* Assumes rq->lock is held */ /* Assumes rq->lock is held */
...@@ -1679,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq) ...@@ -1679,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded) if (rq->dl.overloaded)
dl_clear_overload(rq); dl_clear_overload(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); cpudl_clear(&rq->rd->cpudl, rq->cpu);
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
} }
...@@ -1722,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) ...@@ -1722,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/ */
static void switched_to_dl(struct rq *rq, struct task_struct *p) static void switched_to_dl(struct rq *rq, struct task_struct *p)
{ {
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p))
return;
/*
* If p is boosted we already updated its params in
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
* p's deadline being now already after rq_clock(rq).
*/
if (dl_time_before(p->dl.deadline, rq_clock(rq))) if (dl_time_before(p->dl.deadline, rq_clock(rq)))
setup_new_dl_entity(&p->dl, &p->dl); setup_new_dl_entity(&p->dl);
if (task_on_rq_queued(p) && rq->curr != p) { if (rq->curr != p) {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
queue_push_tasks(rq); queue_push_tasks(rq);
......
...@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group ...@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define P(F) \ #define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define P_SCHEDSTAT(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
#define PN(F) \ #define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN_SCHEDSTAT(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
if (!se) if (!se)
return; return;
...@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group ...@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start); PN(se->exec_start);
PN(se->vruntime); PN(se->vruntime);
PN(se->sum_exec_runtime); PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) { if (schedstat_enabled()) {
PN(se->statistics.wait_start); PN_SCHEDSTAT(se->statistics.wait_start);
PN(se->statistics.sleep_start); PN_SCHEDSTAT(se->statistics.sleep_start);
PN(se->statistics.block_start); PN_SCHEDSTAT(se->statistics.block_start);
PN(se->statistics.sleep_max); PN_SCHEDSTAT(se->statistics.sleep_max);
PN(se->statistics.block_max); PN_SCHEDSTAT(se->statistics.block_max);
PN(se->statistics.exec_max); PN_SCHEDSTAT(se->statistics.exec_max);
PN(se->statistics.slice_max); PN_SCHEDSTAT(se->statistics.slice_max);
PN(se->statistics.wait_max); PN_SCHEDSTAT(se->statistics.wait_max);
PN(se->statistics.wait_sum); PN_SCHEDSTAT(se->statistics.wait_sum);
P(se->statistics.wait_count); P_SCHEDSTAT(se->statistics.wait_count);
} }
#endif
P(se->load.weight); P(se->load.weight);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
P(se->avg.load_avg); P(se->avg.load_avg);
P(se->avg.util_avg); P(se->avg.util_avg);
#endif #endif
#undef PN_SCHEDSTAT
#undef PN #undef PN
#undef P_SCHEDSTAT
#undef P #undef P
} }
#endif #endif
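These hunks replace open-coded #ifdef CONFIG_SCHEDSTATS blocks with P_SCHEDSTAT()/PN_SCHEDSTAT() wrappers built on schedstat_val(), which reads the field when schedstats are built in and falls back to a constant otherwise, so the printing code reads the same either way. A toy standalone version of that accessor-macro pattern (macro and field names invented, toggled by a local TOY_STATS define):

        #include <stdio.h>

        /* Flip this to 0 to see the stats accesses compile away,
         * in the spirit of schedstat_val()/schedstat_inc(). */
        #define TOY_STATS 1

        struct entity {
        #if TOY_STATS
                struct {
                        unsigned long wait_count;
                } statistics;
        #endif
                long weight;
        };

        #if TOY_STATS
        # define toy_stat_inc(var)      do { (var)++; } while (0)
        # define toy_stat_val(var)      (var)
        #else
        # define toy_stat_inc(var)      do { } while (0)    /* argument never expanded */
        # define toy_stat_val(var)      0
        #endif

        int main(void)
        {
                struct entity se = { .weight = 1024 };

                toy_stat_inc(se.statistics.wait_count);
                printf("wait_count: %lu\n",
                       (unsigned long)toy_stat_val(se.statistics.wait_count));
                printf("weight: %ld\n", se.weight);
                return 0;
        }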
...@@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) ...@@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
p->prio); p->prio);
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
#ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
...@@ -626,9 +631,7 @@ do { \ ...@@ -626,9 +631,7 @@ do { \
#undef P64 #undef P64
#endif #endif
#ifdef CONFIG_SCHEDSTATS #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
if (schedstat_enabled()) { if (schedstat_enabled()) {
P(yld_count); P(yld_count);
P(sched_count); P(sched_count);
...@@ -636,9 +639,8 @@ do { \ ...@@ -636,9 +639,8 @@ do { \
P(ttwu_count); P(ttwu_count);
P(ttwu_local); P(ttwu_local);
} }
#undef P #undef P
#endif
spin_lock_irqsave(&sched_debug_lock, flags); spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu); print_cfs_stats(m, cpu);
print_rt_stats(m, cpu); print_rt_stats(m, cpu);
...@@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \ #define P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define P_SCHEDSTAT(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
#define __PN(F) \ #define __PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \ #define PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#define PN_SCHEDSTAT(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
PN(se.exec_start); PN(se.exec_start);
PN(se.vruntime); PN(se.vruntime);
...@@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.nr_migrations); P(se.nr_migrations);
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) { if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu; u64 avg_atom, avg_per_cpu;
PN(se.statistics.sum_sleep_runtime); PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start); PN_SCHEDSTAT(se.statistics.wait_start);
PN(se.statistics.sleep_start); PN_SCHEDSTAT(se.statistics.sleep_start);
PN(se.statistics.block_start); PN_SCHEDSTAT(se.statistics.block_start);
PN(se.statistics.sleep_max); PN_SCHEDSTAT(se.statistics.sleep_max);
PN(se.statistics.block_max); PN_SCHEDSTAT(se.statistics.block_max);
PN(se.statistics.exec_max); PN_SCHEDSTAT(se.statistics.exec_max);
PN(se.statistics.slice_max); PN_SCHEDSTAT(se.statistics.slice_max);
PN(se.statistics.wait_max); PN_SCHEDSTAT(se.statistics.wait_max);
PN(se.statistics.wait_sum); PN_SCHEDSTAT(se.statistics.wait_sum);
P(se.statistics.wait_count); P_SCHEDSTAT(se.statistics.wait_count);
PN(se.statistics.iowait_sum); PN_SCHEDSTAT(se.statistics.iowait_sum);
P(se.statistics.iowait_count); P_SCHEDSTAT(se.statistics.iowait_count);
P(se.statistics.nr_migrations_cold); P_SCHEDSTAT(se.statistics.nr_migrations_cold);
P(se.statistics.nr_failed_migrations_affine); P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
P(se.statistics.nr_failed_migrations_running); P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
P(se.statistics.nr_failed_migrations_hot); P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
P(se.statistics.nr_forced_migrations); P_SCHEDSTAT(se.statistics.nr_forced_migrations);
P(se.statistics.nr_wakeups); P_SCHEDSTAT(se.statistics.nr_wakeups);
P(se.statistics.nr_wakeups_sync); P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
P(se.statistics.nr_wakeups_migrate); P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
P(se.statistics.nr_wakeups_local); P_SCHEDSTAT(se.statistics.nr_wakeups_local);
P(se.statistics.nr_wakeups_remote); P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
P(se.statistics.nr_wakeups_affine); P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
P(se.statistics.nr_wakeups_affine_attempts); P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive); P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle); P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime; avg_atom = p->se.sum_exec_runtime;
if (nr_switches) if (nr_switches)
...@@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
__PN(avg_atom); __PN(avg_atom);
__PN(avg_per_cpu); __PN(avg_per_cpu);
} }
#endif
__P(nr_switches); __P(nr_switches);
SEQ_printf(m, "%-45s:%21Ld\n", SEQ_printf(m, "%-45s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw); "nr_voluntary_switches", (long long)p->nvcsw);
...@@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#endif #endif
P(policy); P(policy);
P(prio); P(prio);
#undef PN_SCHEDSTAT
#undef PN #undef PN
#undef __PN #undef __PN
#undef P_SCHEDSTAT
#undef P #undef P
#undef __P #undef __P
......
...@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; ...@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif #endif
/*
* The margin used when comparing utilization with CPU capacity:
* util * margin < capacity * 1024
*/
unsigned int capacity_margin = 1280; /* ~20% */
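With capacity_margin = 1280 (1280/1024 = 1.25), the intent is that utilization fits a CPU only when capacity exceeds it by about 25%, i.e. util stays below roughly 80% of capacity. A small sketch of that headroom check (the helper name is illustrative, not taken from this patch):

        #include <stdio.h>

        #define CAPACITY_MARGIN 1280    /* 1280/1024 = 1.25, i.e. ~20% headroom */

        /* Illustrative helper: 'util' fits a CPU of capacity 'capacity' only
         * when util * margin < capacity * 1024, leaving ~20% spare capacity. */
        static int fits_capacity(unsigned long util, unsigned long capacity)
        {
                return util * CAPACITY_MARGIN < capacity * 1024;
        }

        int main(void)
        {
                printf("util 800 on cap 1024: %d\n", fits_capacity(800, 1024)); /* 1 */
                printf("util 819 on cap 1024: %d\n", fits_capacity(819, 1024)); /* 1 */
                printf("util 820 on cap 1024: %d\n", fits_capacity(820, 1024)); /* 0 */
                return 0;
        }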
static inline void update_load_add(struct load_weight *lw, unsigned long inc) static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{ {
lw->weight += inc; lw->weight += inc;
...@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) ...@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
static inline struct task_struct *task_of(struct sched_entity *se) static inline struct task_struct *task_of(struct sched_entity *se)
{ {
#ifdef CONFIG_SCHED_DEBUG SCHED_WARN_ON(!entity_is_task(se));
WARN_ON_ONCE(!entity_is_task(se));
#endif
return container_of(se, struct task_struct, se); return container_of(se, struct task_struct, se);
} }
...@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a, ...@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq) static void update_min_vruntime(struct cfs_rq *cfs_rq)
{ {
struct sched_entity *curr = cfs_rq->curr;
u64 vruntime = cfs_rq->min_vruntime; u64 vruntime = cfs_rq->min_vruntime;
if (cfs_rq->curr) if (curr) {
vruntime = cfs_rq->curr->vruntime; if (curr->on_rq)
vruntime = curr->vruntime;
else
curr = NULL;
}
if (cfs_rq->rb_leftmost) { if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity, struct sched_entity,
run_node); run_node);
if (!cfs_rq->curr) if (!curr)
vruntime = se->vruntime; vruntime = se->vruntime;
else else
vruntime = min_vruntime(vruntime, se->vruntime); vruntime = min_vruntime(vruntime, se->vruntime);
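With this fix, min_vruntime only follows curr while it is actually on the runqueue; the remainder of the function (not shown in this hunk) then clamps cfs_rq->min_vruntime so it never moves backwards. Because vruntime is an unsigned 64-bit counter that may wrap, "earlier" and "later" are decided with signed-difference comparisons. A standalone sketch of that wrap-safe comparison, in the style of the kernel's min_vruntime()/max_vruntime() helpers:

        #include <stdio.h>
        #include <stdint.h>

        /* Wrap-safe "earlier/later" comparisons for u64 virtual runtimes:
         * the signed difference tolerates wrap-around of the counter. */
        static uint64_t max_vruntime(uint64_t max, uint64_t vruntime)
        {
                if ((int64_t)(vruntime - max) > 0)
                        max = vruntime;
                return max;
        }

        static uint64_t min_vruntime(uint64_t min, uint64_t vruntime)
        {
                if ((int64_t)(vruntime - min) < 0)
                        min = vruntime;
                return min;
        }

        int main(void)
        {
                uint64_t near_wrap = UINT64_MAX - 5;    /* just before the wrap */
                uint64_t wrapped   = 10;                /* just after the wrap */

                /* A plain comparison would call 10 the smaller time; the
                 * wrap-safe version correctly treats it as the later one. */
                printf("max: %llu\n", (unsigned long long)max_vruntime(near_wrap, wrapped));
                printf("min: %llu\n", (unsigned long long)min_vruntime(near_wrap, wrapped));
                return 0;
        }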
...@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
} }
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int cpu); static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p); static unsigned long task_h_load(struct task_struct *p);
/* /*
...@@ -726,7 +736,6 @@ void post_init_entity_util_avg(struct sched_entity *se) ...@@ -726,7 +736,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct sched_avg *sa = &se->avg; struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
u64 now = cfs_rq_clock_task(cfs_rq); u64 now = cfs_rq_clock_task(cfs_rq);
int tg_update;
if (cap > 0) { if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) { if (cfs_rq->avg.util_avg != 0) {
...@@ -759,10 +768,9 @@ void post_init_entity_util_avg(struct sched_entity *se) ...@@ -759,10 +768,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
} }
} }
tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se); attach_entity_load_avg(cfs_rq, se);
if (tg_update) update_tg_load_avg(cfs_rq, false);
update_tg_load_avg(cfs_rq, false);
} }
#else /* !CONFIG_SMP */ #else /* !CONFIG_SMP */
...@@ -799,7 +807,7 @@ static void update_curr(struct cfs_rq *cfs_rq) ...@@ -799,7 +807,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
max(delta_exec, curr->statistics.exec_max)); max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec; curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec); schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr); curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq); update_min_vruntime(cfs_rq);
...@@ -820,26 +828,34 @@ static void update_curr_fair(struct rq *rq) ...@@ -820,26 +828,34 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se)); update_curr(cfs_rq_of(&rq->curr->se));
} }
#ifdef CONFIG_SCHEDSTATS
static inline void static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
u64 wait_start = rq_clock(rq_of(cfs_rq)); u64 wait_start, prev_wait_start;
if (!schedstat_enabled())
return;
wait_start = rq_clock(rq_of(cfs_rq));
prev_wait_start = schedstat_val(se->statistics.wait_start);
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
likely(wait_start > se->statistics.wait_start)) likely(wait_start > prev_wait_start))
wait_start -= se->statistics.wait_start; wait_start -= prev_wait_start;
se->statistics.wait_start = wait_start; schedstat_set(se->statistics.wait_start, wait_start);
} }
static void static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
struct task_struct *p; struct task_struct *p;
u64 delta; u64 delta;
delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; if (!schedstat_enabled())
return;
delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
if (entity_is_task(se)) { if (entity_is_task(se)) {
p = task_of(se); p = task_of(se);
...@@ -849,35 +865,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -849,35 +865,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time stamp can be adjusted to accumulate wait time * time stamp can be adjusted to accumulate wait time
* prior to migration. * prior to migration.
*/ */
se->statistics.wait_start = delta; schedstat_set(se->statistics.wait_start, delta);
return; return;
} }
trace_sched_stat_wait(p, delta); trace_sched_stat_wait(p, delta);
} }
se->statistics.wait_max = max(se->statistics.wait_max, delta); schedstat_set(se->statistics.wait_max,
se->statistics.wait_count++; max(schedstat_val(se->statistics.wait_max), delta));
se->statistics.wait_sum += delta; schedstat_inc(se->statistics.wait_count);
se->statistics.wait_start = 0; schedstat_add(se->statistics.wait_sum, delta);
schedstat_set(se->statistics.wait_start, 0);
}
static inline void
update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *tsk = NULL;
u64 sleep_start, block_start;
if (!schedstat_enabled())
return;
sleep_start = schedstat_val(se->statistics.sleep_start);
block_start = schedstat_val(se->statistics.block_start);
if (entity_is_task(se))
tsk = task_of(se);
if (sleep_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
schedstat_set(se->statistics.sleep_max, delta);
schedstat_set(se->statistics.sleep_start, 0);
schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
if (block_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.block_max)))
schedstat_set(se->statistics.block_max, delta);
schedstat_set(se->statistics.block_start, 0);
schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
if (tsk->in_iowait) {
schedstat_add(se->statistics.iowait_sum, delta);
schedstat_inc(se->statistics.iowait_count);
trace_sched_stat_iowait(tsk, delta);
}
trace_sched_stat_blocked(tsk, delta);
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
}
} }
/* /*
* Task is being enqueued - update stats: * Task is being enqueued - update stats:
*/ */
static inline void static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{ {
if (!schedstat_enabled())
return;
/* /*
* Are we enqueueing a waiting task? (for current tasks * Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP) * a dequeue/enqueue event is a NOP)
*/ */
if (se != cfs_rq->curr) if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se); update_stats_wait_start(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
update_stats_enqueue_sleeper(cfs_rq, se);
} }
static inline void static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{ {
if (!schedstat_enabled())
return;
/* /*
* Mark the end of the wait period if dequeueing a * Mark the end of the wait period if dequeueing a
* waiting task: * waiting task:
...@@ -885,40 +980,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ...@@ -885,40 +980,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr) if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se); update_stats_wait_end(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) { if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
if (entity_is_task(se)) { struct task_struct *tsk = task_of(se);
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE) if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); schedstat_set(se->statistics.sleep_start,
if (tsk->state & TASK_UNINTERRUPTIBLE) rq_clock(rq_of(cfs_rq)));
se->statistics.block_start = rq_clock(rq_of(cfs_rq)); if (tsk->state & TASK_UNINTERRUPTIBLE)
} schedstat_set(se->statistics.block_start,
rq_clock(rq_of(cfs_rq)));
} }
}
#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
} }
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
}
#endif
/* /*
* We are picking a new current task - update its stats: * We are picking a new current task - update its stats:
*/ */
...@@ -1513,8 +1586,16 @@ static void task_numa_compare(struct task_numa_env *env, ...@@ -1513,8 +1586,16 @@ static void task_numa_compare(struct task_numa_env *env,
* One idle CPU per node is evaluated for a task numa move. * One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one. * Call select_idle_sibling to maybe find a better one.
*/ */
if (!cur) if (!cur) {
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); /*
* select_idle_siblings() uses an per-cpu cpumask that
* can be used from IRQ context.
*/
local_irq_disable();
env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
env->dst_cpu);
local_irq_enable();
}
assign: assign:
task_numa_assign(env, cur, imp); task_numa_assign(env, cur, imp);
...@@ -2292,7 +2373,7 @@ void task_numa_work(struct callback_head *work) ...@@ -2292,7 +2373,7 @@ void task_numa_work(struct callback_head *work)
unsigned long nr_pte_updates = 0; unsigned long nr_pte_updates = 0;
long pages, virtpages; long pages, virtpages;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
work->next = work; /* protect against double add */ work->next = work; /* protect against double add */
/* /*
...@@ -2803,9 +2884,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, ...@@ -2803,9 +2884,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
} }
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
/* /**
* Updating tg's load_avg is necessary before update_cfs_share (which is done) * update_tg_load_avg - update the tg's load avg
* and effective_load (which is not done because it is too costly). * @cfs_rq: the cfs_rq whose avg changed
* @force: update regardless of how small the difference
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
* considerations.
*
* In order to avoid having to look at the other cfs_rq's, we use a
* differential update where we store the last value we propagated. This in
* turn allows skipping updates if the differential is 'small'.
*
* Updating tg's load_avg is necessary before update_cfs_share() (which is
* done) and effective_load() (which is not done because it is too costly).
*/ */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{ {
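The new kerneldoc describes tg->load_avg as a differential sum: each cfs_rq remembers the contribution it last pushed into the shared value and only touches it again when the local average has drifted far enough. A toy standalone model of that propagation pattern (all struct, field and threshold names below are invented):

        #include <stdio.h>

        /* Toy model of differential propagation into a shared sum: each
         * contributor stores what it last added ('contrib') and only updates
         * the shared total when its local value drifted far enough, mirroring
         * the "skip updates if the differential is small" idea above. */
        struct shared_sum {
                long total;
        };

        struct contributor {
                long load_avg;          /* local, frequently updated value */
                long contrib;           /* what we last added into 'total' */
        };

        static void maybe_propagate(struct shared_sum *s, struct contributor *c,
                                    int force)
        {
                long delta = c->load_avg - c->contrib;

                /* Skip tiny changes to avoid hammering the shared cacheline. */
                if (force || delta > c->contrib / 64 || delta < -c->contrib / 64) {
                        s->total += delta;
                        c->contrib = c->load_avg;
                }
        }

        int main(void)
        {
                struct shared_sum s = { 0 };
                struct contributor c = { 0, 0 };

                c.load_avg = 1000;
                maybe_propagate(&s, &c, 0);     /* big change: propagated */
                printf("total=%ld contrib=%ld\n", s.total, c.contrib);

                c.load_avg = 1005;
                maybe_propagate(&s, &c, 0);     /* tiny change: skipped */
                printf("total=%ld contrib=%ld\n", s.total, c.contrib);

                maybe_propagate(&s, &c, 1);     /* forced: propagated anyway */
                printf("total=%ld contrib=%ld\n", s.total, c.contrib);
                return 0;
        }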
...@@ -2925,10 +3018,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) ...@@ -2925,10 +3018,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* *
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
* *
* Returns true if the load decayed or we removed utilization. It is expected * Returns true if the load decayed or we removed load.
* that one calls update_tg_load_avg() on this condition, but after you've *
* modified the cfs_rq avg (attach/detach), such that we propagate the new * Since both these conditions indicate a changed cfs_rq->avg.load we should
* avg up. * call update_tg_load_avg() when this function returns true.
*/ */
static inline int static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
...@@ -3174,68 +3267,6 @@ static inline int idle_balance(struct rq *rq) ...@@ -3174,68 +3267,6 @@ static inline int idle_balance(struct rq *rq)
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
struct task_struct *tsk = NULL;
if (entity_is_task(se))
tsk = task_of(se);
if (se->statistics.sleep_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > se->statistics.sleep_max))
se->statistics.sleep_max = delta;
se->statistics.sleep_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
if (se->statistics.block_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > se->statistics.block_max))
se->statistics.block_max = delta;
se->statistics.block_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
se->statistics.iowait_sum += delta;
se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
trace_sched_stat_blocked(tsk, delta);
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
}
#endif
}
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
...@@ -3245,7 +3276,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -3245,7 +3276,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
d = -d; d = -d;
if (d > 3*sysctl_sched_latency) if (d > 3*sysctl_sched_latency)
schedstat_inc(cfs_rq, nr_spread_over); schedstat_inc(cfs_rq->nr_spread_over);
#endif #endif
} }
...@@ -3362,17 +3393,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ...@@ -3362,17 +3393,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
account_entity_enqueue(cfs_rq, se); account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq); update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) { if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0); place_entity(cfs_rq, se, 0);
if (schedstat_enabled())
enqueue_sleeper(cfs_rq, se);
}
check_schedstat_required(); check_schedstat_required();
if (schedstat_enabled()) { update_stats_enqueue(cfs_rq, se, flags);
update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se);
check_spread(cfs_rq, se);
}
if (!curr) if (!curr)
__enqueue_entity(cfs_rq, se); __enqueue_entity(cfs_rq, se);
se->on_rq = 1; se->on_rq = 1;
...@@ -3439,8 +3465,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ...@@ -3439,8 +3465,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq); update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se); dequeue_entity_load_avg(cfs_rq, se);
if (schedstat_enabled()) update_stats_dequeue(cfs_rq, se, flags);
update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se); clear_buddies(cfs_rq, se);
...@@ -3450,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ...@@ -3450,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
account_entity_dequeue(cfs_rq, se); account_entity_dequeue(cfs_rq, se);
/* /*
* Normalize the entity after updating the min_vruntime because the * Normalize after update_curr(); which will also have moved
* update can refer to the ->curr item and we need to reflect this * min_vruntime if @se is the one holding it back. But before doing
* movement in our normalized position. * update_min_vruntime() again, which will discount @se's position and
* can move min_vruntime forward still more.
*/ */
if (!(flags & DEQUEUE_SLEEP)) if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime; se->vruntime -= cfs_rq->min_vruntime;
...@@ -3460,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ...@@ -3460,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */ /* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq); return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq); update_cfs_shares(cfs_rq);
/*
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
* further than we started -- ie. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
} }
/* /*
...@@ -3514,25 +3548,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -3514,25 +3548,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the * a CPU. So account for the time it spent waiting on the
* runqueue. * runqueue.
*/ */
if (schedstat_enabled()) update_stats_wait_end(cfs_rq, se);
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se); __dequeue_entity(cfs_rq, se);
update_load_avg(se, 1); update_load_avg(se, 1);
} }
update_stats_curr_start(cfs_rq, se); update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se; cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
/* /*
* Track our maximum slice length, if the CPU's load is at * Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. don't track it * least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around): * when there are only lesser-weight tasks around):
*/ */
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max, schedstat_set(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime); max((u64)schedstat_val(se->statistics.slice_max),
se->sum_exec_runtime - se->prev_sum_exec_runtime));
} }
#endif
se->prev_sum_exec_runtime = se->sum_exec_runtime; se->prev_sum_exec_runtime = se->sum_exec_runtime;
} }
...@@ -3611,13 +3645,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) ...@@ -3611,13 +3645,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */ /* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq); check_cfs_rq_runtime(cfs_rq);
if (schedstat_enabled()) { check_spread(cfs_rq, prev);
check_spread(cfs_rq, prev);
if (prev->on_rq)
update_stats_wait_start(cfs_rq, prev);
}
if (prev->on_rq) { if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */ /* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev); __enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */ /* in !on_rq case, update occurred at dequeue */
...@@ -4447,9 +4478,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) ...@@ -4447,9 +4478,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
struct sched_entity *se = &p->se; struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se); struct cfs_rq *cfs_rq = cfs_rq_of(se);
WARN_ON(task_rq(p) != rq); SCHED_WARN_ON(task_rq(p) != rq);
if (cfs_rq->nr_running > 1) { if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se); u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran; s64 delta = slice - ran;
...@@ -4604,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -4604,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
} }
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* Working cpumask for: load_balance, load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
/* /*
* per rq 'load' array crap; XXX kill this. * per rq 'load' array crap; XXX kill this.
...@@ -5005,9 +5041,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) ...@@ -5005,9 +5041,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
* wl = S * s'_i; see (2) * wl = S * s'_i; see (2)
*/ */
if (W > 0 && w < W) if (W > 0 && w < W)
wl = (w * (long)tg->shares) / W; wl = (w * (long)scale_load_down(tg->shares)) / W;
else else
wl = tg->shares; wl = scale_load_down(tg->shares);
/* /*
* Per the above, wl is the new se->load.weight value; since * Per the above, wl is the new se->load.weight value; since
...@@ -5090,18 +5126,18 @@ static int wake_wide(struct task_struct *p) ...@@ -5090,18 +5126,18 @@ static int wake_wide(struct task_struct *p)
return 1; return 1;
} }
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{ {
s64 this_load, load; s64 this_load, load;
s64 this_eff_load, prev_eff_load; s64 this_eff_load, prev_eff_load;
int idx, this_cpu, prev_cpu; int idx, this_cpu;
struct task_group *tg; struct task_group *tg;
unsigned long weight; unsigned long weight;
int balanced; int balanced;
idx = sd->wake_idx; idx = sd->wake_idx;
this_cpu = smp_processor_id(); this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx); load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx); this_load = target_load(this_cpu, idx);
...@@ -5145,13 +5181,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) ...@@ -5145,13 +5181,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
balanced = this_eff_load <= prev_eff_load; balanced = this_eff_load <= prev_eff_load;
schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (!balanced) if (!balanced)
return 0; return 0;
schedstat_inc(sd, ttwu_move_affine); schedstat_inc(sd->ttwu_move_affine);
schedstat_inc(p, se.statistics.nr_wakeups_affine); schedstat_inc(p->se.statistics.nr_wakeups_affine);
return 1; return 1;
} }
...@@ -5227,6 +5263,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) ...@@ -5227,6 +5263,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
int shallowest_idle_cpu = -1; int shallowest_idle_cpu = -1;
int i; int i;
/* Check if we have any choice: */
if (group->group_weight == 1)
return cpumask_first(sched_group_cpus(group));
/* Traverse only the allowed CPUs */ /* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
if (idle_cpu(i)) { if (idle_cpu(i)) {
...@@ -5264,64 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) ...@@ -5264,64 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
} }
/* /*
* Try and locate an idle CPU in the sched_domain. * Implement a for_each_cpu() variant that starts the scan at a given cpu
* (@start), and wraps around.
*
* This is used to scan for idle CPUs; such that not all CPUs looking for an
* idle CPU find the same CPU. The down-side is that tasks tend to cycle
* through the LLC domain.
*
* Especially tbench is found sensitive to this.
*/
static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
{
int next;
again:
next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
if (*wrapped) {
if (next >= start)
return nr_cpumask_bits;
} else {
if (next >= nr_cpumask_bits) {
*wrapped = 1;
n = -1;
goto again;
}
}
return next;
}
#define for_each_cpu_wrap(cpu, mask, start, wrap) \
for ((wrap) = 0, (cpu) = (start)-1; \
(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
(cpu) < nr_cpumask_bits; )
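
The comment above explains why the scan starts at a caller-specific CPU and wraps around. As a rough illustration only (a hypothetical userspace model using a plain array instead of struct cpumask and the kernel cpumask helpers), the wrap-around search amounts to:

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 8

/* Model: idle[i] says whether CPU i is currently idle. */
static bool idle[NR_CPUS] = { false, false, true, false, true, false, false, false };

/*
 * Scan all CPUs starting at 'start' and wrapping around, returning the
 * first idle one, or -1. Different callers pass different 'start' values
 * so they do not all converge on the same idle CPU.
 */
static int find_idle_wrap(int start)
{
    for (int off = 0; off < NR_CPUS; off++) {
        int cpu = (start + off) % NR_CPUS;
        if (idle[cpu])
            return cpu;
    }
    return -1;
}

int main(void)
{
    printf("scan from 0 -> CPU %d\n", find_idle_wrap(0)); /* finds CPU 2 */
    printf("scan from 3 -> CPU %d\n", find_idle_wrap(3)); /* finds CPU 4 */
    return 0;
}
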
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
static inline bool test_idle_cores(int cpu, bool def)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
return def;
}
/*
* Scans the local SMT mask to see if the entire core is idle, and records this
* information in sd_llc_shared->has_idle_cores.
*
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
*/ */
static int select_idle_sibling(struct task_struct *p, int target) void __update_idle_core(struct rq *rq)
{
int core = cpu_of(rq);
int cpu;
rcu_read_lock();
if (test_idle_cores(core, true))
goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (cpu == core)
continue;
if (!idle_cpu(cpu))
goto unlock;
}
set_idle_cores(core, 1);
unlock:
rcu_read_unlock();
}
/*
* Scan the entire LLC domain for idle cores; this dynamically switches off if
* there are no idle cores left in the system; tracked through
* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int core, cpu, wrap;
if (!static_branch_likely(&sched_smt_present))
return -1;
if (!test_idle_cores(target, false))
return -1;
cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
for_each_cpu_wrap(core, cpus, target, wrap) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
if (!idle_cpu(cpu))
idle = false;
}
if (idle)
return core;
}
/*
* Failed to find an idle core; stop looking for one.
*/
set_idle_cores(target, 0);
return -1;
}
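
Taken together, __update_idle_core() sets the per-LLC has_idle_cores hint cheaply when a whole core drains, and select_idle_core() clears it again when a full scan comes up empty, so the expensive scan is skipped while it cannot succeed. A minimal userspace sketch of that hint protocol, with made-up names and without SMT masks, RCU or atomics:

#include <stdbool.h>
#include <stdio.h>

#define NR_CORES 4
#define THREADS_PER_CORE 2

/* busy[core][thread] models whether each SMT sibling is running something. */
static bool busy[NR_CORES][THREADS_PER_CORE];

/* The per-LLC hint: "there might be a fully idle core". */
static bool has_idle_cores = true;

static bool core_is_idle(int core)
{
    for (int t = 0; t < THREADS_PER_CORE; t++)
        if (busy[core][t])
            return false;
    return true;
}

/* Called when a CPU goes idle: cheap check of the local core only. */
static void update_idle_core(int core)
{
    if (!has_idle_cores && core_is_idle(core))
        has_idle_cores = true;
}

/* Called at wakeup: full scan, but only while the hint says it can pay off. */
static int select_idle_core(void)
{
    if (!has_idle_cores)
        return -1;

    for (int core = 0; core < NR_CORES; core++)
        if (core_is_idle(core))
            return core;

    has_idle_cores = false;   /* remember that the scan failed */
    return -1;
}

int main(void)
{
    for (int c = 0; c < NR_CORES; c++)
        for (int t = 0; t < THREADS_PER_CORE; t++)
            busy[c][t] = true;

    printf("idle core: %d\n", select_idle_core());   /* -1, hint cleared */
    busy[2][0] = busy[2][1] = false;                 /* core 2 drains */
    update_idle_core(2);
    printf("idle core: %d\n", select_idle_core());   /* 2 */
    return 0;
}
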
/*
* Scan the local SMT mask for idle CPUs.
*/
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
if (idle_cpu(cpu))
return cpu;
}
return -1;
}
#else /* CONFIG_SCHED_SMT */
static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
#endif /* CONFIG_SCHED_SMT */
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
u64 avg_idle = this_rq()->avg_idle;
u64 avg_cost = this_sd->avg_scan_cost;
u64 time, cost;
s64 delta;
int cpu, wrap;
/*
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
*/
if ((avg_idle / 512) < avg_cost)
return -1;
time = local_clock();
for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
if (idle_cpu(cpu))
break;
}
time = local_clock() - time;
cost = this_sd->avg_scan_cost;
delta = (s64)(time - cost) / 8;
this_sd->avg_scan_cost += delta;
return cpu;
}
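
The avg_scan_cost bookkeeping above is a 1/8-weight exponential moving average, and the early bail-out only scans when the rq's average idle time is at least 512 times that cost. A small standalone model of just that arithmetic, with hypothetical numbers and no kernel types:

#include <stdio.h>
#include <stdint.h>

/*
 * Model of the scan-cost tracking in select_idle_cpu(): each scan time is
 * folded into a running average with weight 1/8, and a scan is only
 * attempted when avg_idle / 512 >= avg_scan_cost.
 */
static uint64_t avg_scan_cost;

static void account_scan(uint64_t time_ns)
{
    int64_t delta = (int64_t)(time_ns - avg_scan_cost) / 8;

    avg_scan_cost += delta;
}

static int worth_scanning(uint64_t avg_idle_ns)
{
    return (avg_idle_ns / 512) >= avg_scan_cost;
}

int main(void)
{
    /* Feed a few hypothetical scan times (in ns) into the average. */
    uint64_t samples[] = { 4000, 3000, 5000, 4500 };

    for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        account_scan(samples[i]);

    printf("avg_scan_cost = %llu ns\n", (unsigned long long)avg_scan_cost);
    printf("avg_idle = 1 ms   -> scan? %d\n", worth_scanning(1000000));
    printf("avg_idle = 100 us -> scan? %d\n", worth_scanning(100000));
    return 0;
}
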
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{ {
struct sched_domain *sd; struct sched_domain *sd;
struct sched_group *sg; int i;
int i = task_cpu(p);
if (idle_cpu(target)) if (idle_cpu(target))
return target; return target;
/* /*
* If the prevous cpu is cache affine and idle, don't be stupid. * If the previous cpu is cache affine and idle, don't be stupid.
*/ */
if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
return i; return prev;
/*
* Otherwise, iterate the domains and find an eligible idle cpu.
*
* A completely idle sched group at higher domains is more
* desirable than an idle group at a lower level, because lower
* domains have smaller groups and usually share hardware
* resources which causes tasks to contend on them, e.g. x86
* hyperthread siblings in the lowest domain (SMT) can contend
* on the shared cpu pipeline.
*
* However, while we prefer idle groups at higher domains
* finding an idle cpu at the lowest domain is still better than
* returning 'target', which we've already established, isn't
* idle.
*/
sd = rcu_dereference(per_cpu(sd_llc, target)); sd = rcu_dereference(per_cpu(sd_llc, target));
for_each_lower_domain(sd) { if (!sd)
sg = sd->groups; return target;
do {
if (!cpumask_intersects(sched_group_cpus(sg), i = select_idle_core(p, sd, target);
tsk_cpus_allowed(p))) if ((unsigned)i < nr_cpumask_bits)
goto next; return i;
/* Ensure the entire group is idle */ i = select_idle_cpu(p, sd, target);
for_each_cpu(i, sched_group_cpus(sg)) { if ((unsigned)i < nr_cpumask_bits)
if (i == target || !idle_cpu(i)) return i;
goto next;
} i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
/*
* It doesn't matter which cpu we pick, the
* whole group is idle.
*/
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
goto done;
next:
sg = sg->next;
} while (sg != sd->groups);
}
done:
return target; return target;
} }
...@@ -5359,6 +5572,32 @@ static int cpu_util(int cpu) ...@@ -5359,6 +5572,32 @@ static int cpu_util(int cpu)
return (util >= capacity) ? capacity : util; return (util >= capacity) ? capacity : util;
} }
static inline int task_util(struct task_struct *p)
{
return p->se.avg.util_avg;
}
/*
* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
*
* In that case WAKE_AFFINE doesn't make sense and we'll let
* BALANCE_WAKE sort things out.
*/
static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
{
long min_cap, max_cap;
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
/* Minimum capacity is close to max, no need to abort wake_affine */
if (max_cap - min_cap < max_cap >> 3)
return 0;
return min_cap * 1024 < task_util(p) * capacity_margin;
}
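
wake_cap() keeps wake_affine enabled when the two capacities are within roughly 12.5% of each other, and otherwise disables it for tasks whose utilization does not fit the smaller CPU with margin. A toy calculation under the assumption that capacity_margin is 1280 (about 20% headroom) and made-up big.LITTLE-style capacities of 430 and 1024; none of these values are taken from the patch itself:

#include <stdio.h>

#define CAPACITY_MARGIN 1280   /* assumed value, ~20% headroom */

static int wake_cap_model(long task_util, long cap_prev, long cap_this, long max_cap)
{
    long min_cap = cap_prev < cap_this ? cap_prev : cap_this;

    /* Capacities are roughly symmetric: keep wake_affine enabled. */
    if (max_cap - min_cap < max_cap >> 3)
        return 0;

    /* Otherwise disable wake_affine if the task does not fit in min_cap. */
    return min_cap * 1024 < task_util * CAPACITY_MARGIN;
}

int main(void)
{
    /* 430*1024 = 440320 vs 300*1280 = 384000: fits, wake_affine stays on. */
    printf("util 300: disable wake_affine? %d\n",
           wake_cap_model(300, 430, 1024, 1024));

    /* 440320 vs 500*1280 = 640000: does not fit, wake_affine is disabled. */
    printf("util 500: disable wake_affine? %d\n",
           wake_cap_model(500, 430, 1024, 1024));
    return 0;
}
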
/* /*
* select_task_rq_fair: Select target runqueue for the waking task in domains * select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
...@@ -5382,7 +5621,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f ...@@ -5382,7 +5621,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) { if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p); record_wakee(p);
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
&& cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
} }
rcu_read_lock(); rcu_read_lock();
...@@ -5408,13 +5648,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f ...@@ -5408,13 +5648,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (affine_sd) { if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */ sd = NULL; /* Prefer wake_affine over balance flags */
if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu; new_cpu = cpu;
} }
if (!sd) { if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, new_cpu); new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else while (sd) { } else while (sd) {
struct sched_group *group; struct sched_group *group;
...@@ -5938,7 +6178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp ...@@ -5938,7 +6178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* *
* The adjacency matrix of the resulting graph is given by: * The adjacency matrix of the resulting graph is given by:
* *
* log_2 n * log_2 n
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
* k = 0 * k = 0
* *
...@@ -5984,7 +6224,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp ...@@ -5984,7 +6224,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* *
* [XXX write more on how we solve this.. _after_ merging pjt's patches that * [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.] * rewrite all of this once again.]
*/ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10; static unsigned long __read_mostly max_load_balance_interval = HZ/10;
...@@ -6132,7 +6372,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) ...@@ -6132,7 +6372,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
int cpu; int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_affine); schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED; env->flags |= LBF_SOME_PINNED;
...@@ -6163,7 +6403,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) ...@@ -6163,7 +6403,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env->flags &= ~LBF_ALL_PINNED; env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) { if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running); schedstat_inc(p->se.statistics.nr_failed_migrations_running);
return 0; return 0;
} }
...@@ -6180,13 +6420,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) ...@@ -6180,13 +6420,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (tsk_cache_hot <= 0 || if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) { env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) { if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(env->sd->lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations); schedstat_inc(p->se.statistics.nr_forced_migrations);
} }
return 1; return 1;
} }
schedstat_inc(p, se.statistics.nr_failed_migrations_hot); schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
return 0; return 0;
} }
...@@ -6226,7 +6466,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) ...@@ -6226,7 +6466,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
* so we can safely collect stats here rather than * so we can safely collect stats here rather than
* inside detach_tasks(). * inside detach_tasks().
*/ */
schedstat_inc(env->sd, lb_gained[env->idle]); schedstat_inc(env->sd->lb_gained[env->idle]);
return p; return p;
} }
return NULL; return NULL;
...@@ -6318,7 +6558,7 @@ static int detach_tasks(struct lb_env *env) ...@@ -6318,7 +6558,7 @@ static int detach_tasks(struct lb_env *env)
* so we can safely collect detach_one_task() stats here rather * so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task(). * than inside detach_one_task().
*/ */
schedstat_add(env->sd, lb_gained[env->idle], detached); schedstat_add(env->sd->lb_gained[env->idle], detached);
return detached; return detached;
} }
...@@ -6646,7 +6886,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) ...@@ -6646,7 +6886,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
/* /*
* !SD_OVERLAP domains can assume that child groups * !SD_OVERLAP domains can assume that child groups
* span the current group. * span the current group.
*/ */
group = child->groups; group = child->groups;
do { do {
...@@ -7146,7 +7386,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ...@@ -7146,7 +7386,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
if (load_above_capacity > busiest->group_capacity) { if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity; load_above_capacity -= busiest->group_capacity;
load_above_capacity *= NICE_0_LOAD; load_above_capacity *= scale_load_down(NICE_0_LOAD);
load_above_capacity /= busiest->group_capacity; load_above_capacity /= busiest->group_capacity;
} else } else
load_above_capacity = ~0UL; load_above_capacity = ~0UL;
...@@ -7353,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, ...@@ -7353,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/ */
#define MAX_PINNED_INTERVAL 512 #define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static int need_active_balance(struct lb_env *env) static int need_active_balance(struct lb_env *env)
{ {
struct sched_domain *sd = env->sd; struct sched_domain *sd = env->sd;
...@@ -7459,7 +7696,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -7459,7 +7696,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
cpumask_copy(cpus, cpu_active_mask); cpumask_copy(cpus, cpu_active_mask);
schedstat_inc(sd, lb_count[idle]); schedstat_inc(sd->lb_count[idle]);
redo: redo:
if (!should_we_balance(&env)) { if (!should_we_balance(&env)) {
...@@ -7469,19 +7706,19 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -7469,19 +7706,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
group = find_busiest_group(&env); group = find_busiest_group(&env);
if (!group) { if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]); schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced; goto out_balanced;
} }
busiest = find_busiest_queue(&env, group); busiest = find_busiest_queue(&env, group);
if (!busiest) { if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]); schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced; goto out_balanced;
} }
BUG_ON(busiest == env.dst_rq); BUG_ON(busiest == env.dst_rq);
schedstat_add(sd, lb_imbalance[idle], env.imbalance); schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu; env.src_cpu = busiest->cpu;
env.src_rq = busiest; env.src_rq = busiest;
...@@ -7588,7 +7825,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -7588,7 +7825,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
} }
if (!ld_moved) { if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]); schedstat_inc(sd->lb_failed[idle]);
/* /*
* Increment the failure counter only on periodic balance. * Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very * We do not want newidle balance, which can be very
...@@ -7671,7 +7908,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -7671,7 +7908,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* we can't migrate them. Let the imbalance flag set so parent level * we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them. * can try to migrate them.
*/ */
schedstat_inc(sd, lb_balanced[idle]); schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0; sd->nr_balance_failed = 0;
...@@ -7703,11 +7940,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) ...@@ -7703,11 +7940,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
} }
static inline void static inline void
update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{ {
unsigned long interval, next; unsigned long interval, next;
interval = get_sd_balance_interval(sd, cpu_busy); /* used by idle balance, so cpu_busy = 0 */
interval = get_sd_balance_interval(sd, 0);
next = sd->last_balance + interval; next = sd->last_balance + interval;
if (time_after(*next_balance, next)) if (time_after(*next_balance, next))
...@@ -7737,7 +7975,7 @@ static int idle_balance(struct rq *this_rq) ...@@ -7737,7 +7975,7 @@ static int idle_balance(struct rq *this_rq)
rcu_read_lock(); rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd); sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd) if (sd)
update_next_balance(sd, 0, &next_balance); update_next_balance(sd, &next_balance);
rcu_read_unlock(); rcu_read_unlock();
goto out; goto out;
...@@ -7755,7 +7993,7 @@ static int idle_balance(struct rq *this_rq) ...@@ -7755,7 +7993,7 @@ static int idle_balance(struct rq *this_rq)
continue; continue;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, 0, &next_balance); update_next_balance(sd, &next_balance);
break; break;
} }
...@@ -7773,7 +8011,7 @@ static int idle_balance(struct rq *this_rq) ...@@ -7773,7 +8011,7 @@ static int idle_balance(struct rq *this_rq)
curr_cost += domain_cost; curr_cost += domain_cost;
} }
update_next_balance(sd, 0, &next_balance); update_next_balance(sd, &next_balance);
/* /*
* Stop searching for tasks to pull if there are * Stop searching for tasks to pull if there are
...@@ -7863,15 +8101,15 @@ static int active_load_balance_cpu_stop(void *data) ...@@ -7863,15 +8101,15 @@ static int active_load_balance_cpu_stop(void *data)
.idle = CPU_IDLE, .idle = CPU_IDLE,
}; };
schedstat_inc(sd, alb_count); schedstat_inc(sd->alb_count);
p = detach_one_task(&env); p = detach_one_task(&env);
if (p) { if (p) {
schedstat_inc(sd, alb_pushed); schedstat_inc(sd->alb_pushed);
/* Active balancing done, reset the failure counter. */ /* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0; sd->nr_balance_failed = 0;
} else { } else {
schedstat_inc(sd, alb_failed); schedstat_inc(sd->alb_failed);
} }
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -7963,13 +8201,13 @@ static inline void set_cpu_sd_state_busy(void) ...@@ -7963,13 +8201,13 @@ static inline void set_cpu_sd_state_busy(void)
int cpu = smp_processor_id(); int cpu = smp_processor_id();
rcu_read_lock(); rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu)); sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle) if (!sd || !sd->nohz_idle)
goto unlock; goto unlock;
sd->nohz_idle = 0; sd->nohz_idle = 0;
atomic_inc(&sd->groups->sgc->nr_busy_cpus); atomic_inc(&sd->shared->nr_busy_cpus);
unlock: unlock:
rcu_read_unlock(); rcu_read_unlock();
} }
...@@ -7980,13 +8218,13 @@ void set_cpu_sd_state_idle(void) ...@@ -7980,13 +8218,13 @@ void set_cpu_sd_state_idle(void)
int cpu = smp_processor_id(); int cpu = smp_processor_id();
rcu_read_lock(); rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu)); sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle) if (!sd || sd->nohz_idle)
goto unlock; goto unlock;
sd->nohz_idle = 1; sd->nohz_idle = 1;
atomic_dec(&sd->groups->sgc->nr_busy_cpus); atomic_dec(&sd->shared->nr_busy_cpus);
unlock: unlock:
rcu_read_unlock(); rcu_read_unlock();
} }
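
set_cpu_sd_state_busy()/set_cpu_sd_state_idle() now keep nr_busy_cpus in the shared LLC state, using the per-domain nohz_idle flag so each busy/idle transition is counted exactly once; nohz_kick_needed() below then kicks when more than one CPU in the LLC is busy. A simplified single-file model of that bookkeeping, with invented variable names and without the atomics and RCU the real code uses:

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

static bool nohz_idle[NR_CPUS];
static int nr_busy_cpus = NR_CPUS;   /* all CPUs start out busy */

static void set_cpu_busy(int cpu)
{
    if (!nohz_idle[cpu])
        return;                      /* already counted as busy */
    nohz_idle[cpu] = false;
    nr_busy_cpus++;
}

static void set_cpu_idle(int cpu)
{
    if (nohz_idle[cpu])
        return;                      /* already counted as idle */
    nohz_idle[cpu] = true;
    nr_busy_cpus--;
}

int main(void)
{
    set_cpu_idle(1);
    set_cpu_idle(1);                 /* second call is a no-op */
    set_cpu_idle(3);
    printf("busy cpus: %d\n", nr_busy_cpus);

    /* A nohz kick is considered when more than one CPU in the LLC is busy. */
    printf("kick? %s\n", nr_busy_cpus > 1 ? "yes" : "no");

    set_cpu_busy(1);
    printf("busy cpus after wakeup: %d\n", nr_busy_cpus);
    return 0;
}
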
...@@ -8213,8 +8451,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) ...@@ -8213,8 +8451,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
static inline bool nohz_kick_needed(struct rq *rq) static inline bool nohz_kick_needed(struct rq *rq)
{ {
unsigned long now = jiffies; unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd; struct sched_domain *sd;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu; int nr_busy, cpu = rq->cpu;
bool kick = false; bool kick = false;
...@@ -8242,11 +8480,13 @@ static inline bool nohz_kick_needed(struct rq *rq) ...@@ -8242,11 +8480,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
return true; return true;
rcu_read_lock(); rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu)); sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sd) { if (sds) {
sgc = sd->groups->sgc; /*
nr_busy = atomic_read(&sgc->nr_busy_cpus); * XXX: write a coherent comment on why we do this.
* See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) { if (nr_busy > 1) {
kick = true; kick = true;
goto unlock; goto unlock;
...@@ -8440,7 +8680,6 @@ static void detach_task_cfs_rq(struct task_struct *p) ...@@ -8440,7 +8680,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se; struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se); struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq); u64 now = cfs_rq_clock_task(cfs_rq);
int tg_update;
if (!vruntime_normalized(p)) { if (!vruntime_normalized(p)) {
/* /*
...@@ -8452,10 +8691,9 @@ static void detach_task_cfs_rq(struct task_struct *p) ...@@ -8452,10 +8691,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
} }
/* Catch up with the cfs_rq and remove our load when we leave */ /* Catch up with the cfs_rq and remove our load when we leave */
tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); update_cfs_rq_load_avg(now, cfs_rq, false);
detach_entity_load_avg(cfs_rq, se); detach_entity_load_avg(cfs_rq, se);
if (tg_update) update_tg_load_avg(cfs_rq, false);
update_tg_load_avg(cfs_rq, false);
} }
static void attach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p)
...@@ -8463,7 +8701,6 @@ static void attach_task_cfs_rq(struct task_struct *p) ...@@ -8463,7 +8701,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se; struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se); struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq); u64 now = cfs_rq_clock_task(cfs_rq);
int tg_update;
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
/* /*
...@@ -8474,10 +8711,9 @@ static void attach_task_cfs_rq(struct task_struct *p) ...@@ -8474,10 +8711,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
#endif #endif
/* Synchronize task with its cfs_rq */ /* Synchronize task with its cfs_rq */
tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se); attach_entity_load_avg(cfs_rq, se);
if (tg_update) update_tg_load_avg(cfs_rq, false);
update_tg_load_avg(cfs_rq, false);
if (!vruntime_normalized(p)) if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime; se->vruntime += cfs_rq->min_vruntime;
......
...@@ -27,8 +27,8 @@ static struct task_struct * ...@@ -27,8 +27,8 @@ static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{ {
put_prev_task(rq, prev); put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq, sched_goidle); schedstat_inc(rq->sched_goidle);
return rq->idle; return rq->idle;
} }
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/sched/sysctl.h> #include <linux/sched/sysctl.h>
#include <linux/sched/rt.h> #include <linux/sched/rt.h>
#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h> #include <linux/sched/deadline.h>
#include <linux/binfmts.h> #include <linux/binfmts.h>
#include <linux/mutex.h> #include <linux/mutex.h>
...@@ -15,6 +16,12 @@ ...@@ -15,6 +16,12 @@
#include "cpudeadline.h" #include "cpudeadline.h"
#include "cpuacct.h" #include "cpuacct.h"
#ifdef CONFIG_SCHED_DEBUG
#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
#define SCHED_WARN_ON(x) ((void)(x))
#endif
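
SCHED_WARN_ON() wraps WARN_ONCE() under CONFIG_SCHED_DEBUG and degrades to a plain (void) cast otherwise, so the condition expression is still evaluated in both configurations. A userspace approximation of the same pattern (the real WARN_ONCE() additionally limits itself to a single report and dumps a backtrace):

#include <stdio.h>

#define DEBUG 0

#if DEBUG
#define MY_WARN_ON(x) ((x) ? fprintf(stderr, "warning: %s\n", #x) : 0)
#else
#define MY_WARN_ON(x) ((void)(x))   /* still evaluates x, emits nothing */
#endif

static int calls;

static int check(void)
{
    calls++;            /* side effect must happen in both configurations */
    return 0;
}

int main(void)
{
    MY_WARN_ON(check() != 0);
    printf("check() was called %d time(s)\n", calls);
    return 0;
}
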
struct rq; struct rq;
struct cpuidle_state; struct cpuidle_state;
...@@ -565,6 +572,8 @@ struct root_domain { ...@@ -565,6 +572,8 @@ struct root_domain {
*/ */
cpumask_var_t rto_mask; cpumask_var_t rto_mask;
struct cpupri cpupri; struct cpupri cpupri;
unsigned long max_cpu_capacity;
}; };
extern struct root_domain def_root_domain; extern struct root_domain def_root_domain;
...@@ -597,7 +606,6 @@ struct rq { ...@@ -597,7 +606,6 @@ struct rq {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
unsigned long last_load_update_tick; unsigned long last_load_update_tick;
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
u64 nohz_stamp;
unsigned long nohz_flags; unsigned long nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */ #endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL #ifdef CONFIG_NO_HZ_FULL
...@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq) ...@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq)
#endif #endif
} }
#ifdef CONFIG_SCHED_SMT
extern struct static_key_false sched_smt_present;
extern void __update_idle_core(struct rq *rq);
static inline void update_idle_core(struct rq *rq)
{
if (static_branch_unlikely(&sched_smt_present))
__update_idle_core(rq);
}
#else
static inline void update_idle_core(struct rq *rq) { }
#endif
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
...@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ...@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym); DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_capacity { struct sched_group_capacity {
...@@ -870,10 +895,6 @@ struct sched_group_capacity { ...@@ -870,10 +895,6 @@ struct sched_group_capacity {
unsigned int capacity; unsigned int capacity;
unsigned long next_update; unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */ int imbalance; /* XXX unrelated to capacity but shared group state */
/*
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
unsigned long cpumask[0]; /* iteration mask */ unsigned long cpumask[0]; /* iteration mask */
}; };
...@@ -1260,6 +1281,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) ...@@ -1260,6 +1281,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
prev->sched_class->put_prev_task(rq, prev); prev->sched_class->put_prev_task(rq, prev);
} }
static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
{
curr->sched_class->set_curr_task(rq);
}
#define sched_class_highest (&stop_sched_class) #define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \ #define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next) for (class = sched_class_highest; class; class = class->next)
...@@ -1290,7 +1316,7 @@ static inline void idle_set_state(struct rq *rq, ...@@ -1290,7 +1316,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq) static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{ {
WARN_ON(!rcu_read_lock_held()); SCHED_WARN_ON(!rcu_read_lock_held());
return rq->idle_state; return rq->idle_state;
} }
#else #else
...@@ -1710,52 +1736,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } ...@@ -1710,52 +1736,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#endif #endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
u64 hardirq_time;
u64 softirq_time;
u64 irq_start_time;
struct u64_stats_sync sync;
};
DECLARE_PER_CPU(u64, cpu_hardirq_time); DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
DECLARE_PER_CPU(u64, cpu_softirq_time);
#ifndef CONFIG_64BIT
DECLARE_PER_CPU(seqcount_t, irq_time_seq);
static inline void irq_time_write_begin(void)
{
__this_cpu_inc(irq_time_seq.sequence);
smp_wmb();
}
static inline void irq_time_write_end(void)
{
smp_wmb();
__this_cpu_inc(irq_time_seq.sequence);
}
static inline u64 irq_time_read(int cpu) static inline u64 irq_time_read(int cpu)
{ {
u64 irq_time; struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
unsigned seq; unsigned int seq;
u64 total;
do { do {
seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); seq = __u64_stats_fetch_begin(&irqtime->sync);
irq_time = per_cpu(cpu_softirq_time, cpu) + total = irqtime->softirq_time + irqtime->hardirq_time;
per_cpu(cpu_hardirq_time, cpu); } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
return irq_time;
}
#else /* CONFIG_64BIT */
static inline void irq_time_write_begin(void)
{
}
static inline void irq_time_write_end(void) return total;
{
}
static inline u64 irq_time_read(int cpu)
{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
} }
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
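
The new struct irqtime groups the hardirq/softirq counters behind a u64_stats_sync, whose read side retries while a writer is in progress (this only matters on 32-bit, where a u64 cannot be read atomically). A very rough single-threaded model of that sequence-counter idea, leaving out the memory barriers and per-CPU handling the real helpers provide:

#include <stdint.h>
#include <stdio.h>

struct irqtime_model {
    uint64_t hardirq_time;
    uint64_t softirq_time;
    unsigned int seq;       /* odd while a writer is in the middle of an update */
};

static void write_begin(struct irqtime_model *t) { t->seq++; }
static void write_end(struct irqtime_model *t)   { t->seq++; }

static uint64_t read_total(const struct irqtime_model *t)
{
    unsigned int seq;
    uint64_t total;

    do {
        seq = t->seq;
        total = t->hardirq_time + t->softirq_time;
    } while ((seq & 1) || seq != t->seq);   /* retry if a write was in flight */

    return total;
}

int main(void)
{
    struct irqtime_model t = { 0, 0, 0 };

    write_begin(&t);
    t.hardirq_time += 1000;
    t.softirq_time += 250;
    write_end(&t);

    printf("total irq time: %llu\n", (unsigned long long)read_total(&t));
    return 0;
}
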
#ifdef CONFIG_CPU_FREQ #ifdef CONFIG_CPU_FREQ
......
...@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) ...@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq) if (rq)
rq->rq_sched_info.run_delay += delta; rq->rq_sched_info.run_delay += delta;
} }
# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) #define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) #define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
#else /* !CONFIG_SCHEDSTATS */ #else /* !CONFIG_SCHEDSTATS */
static inline void static inline void
...@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) ...@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta) rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{} {}
# define schedstat_enabled() 0 #define schedstat_enabled() 0
# define schedstat_inc(rq, field) do { } while (0) #define schedstat_inc(var) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0) #define schedstat_add(var, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0) #define schedstat_set(var, val) do { } while (0)
# define schedstat_val(rq, field) 0 #define schedstat_val(var) 0
#endif #define schedstat_val_or_zero(var) 0
#endif /* CONFIG_SCHEDSTATS */
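
With this change the schedstat macros take the counter expression itself, e.g. schedstat_inc(sd->lb_count[idle]), instead of an (object, field) pair, which is what all the call-site updates earlier in this diff are about. A toy, self-contained version of the new convention that drops the schedstat_enabled() static-branch check for brevity:

#include <stdio.h>

#define SCHEDSTATS_ENABLED 1

#if SCHEDSTATS_ENABLED
#define schedstat_inc(var)       do { (var)++; } while (0)
#define schedstat_add(var, amt)  do { (var) += (amt); } while (0)
#else
#define schedstat_inc(var)       do { } while (0)
#define schedstat_add(var, amt)  do { } while (0)
#endif

struct sd_stats {
    unsigned long lb_count;
    unsigned long lb_imbalance;
};

int main(void)
{
    struct sd_stats sd = { 0, 0 };

    /* New style: pass the counter lvalue, not (object, field). */
    schedstat_inc(sd.lb_count);
    schedstat_add(sd.lb_imbalance, 42);

    printf("lb_count=%lu lb_imbalance=%lu\n", sd.lb_count, sd.lb_imbalance);
    return 0;
}
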
#ifdef CONFIG_SCHED_INFO #ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t) static inline void sched_info_reset_dequeued(struct task_struct *t)
......
...@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) ...@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
} }
EXPORT_SYMBOL(prepare_to_wait_exclusive); EXPORT_SYMBOL(prepare_to_wait_exclusive);
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) void init_wait_entry(wait_queue_t *wait, int flags)
{ {
unsigned long flags; wait->flags = flags;
if (signal_pending_state(state, current))
return -ERESTARTSYS;
wait->private = current; wait->private = current;
wait->func = autoremove_wake_function; wait->func = autoremove_wake_function;
INIT_LIST_HEAD(&wait->task_list);
}
EXPORT_SYMBOL(init_wait_entry);
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
long ret = 0;
spin_lock_irqsave(&q->lock, flags); spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list)) { if (unlikely(signal_pending_state(state, current))) {
if (wait->flags & WQ_FLAG_EXCLUSIVE) /*
__add_wait_queue_tail(q, wait); * Exclusive waiter must not fail if it was selected by wakeup,
else * it should "consume" the condition we were waiting for.
__add_wait_queue(q, wait); *
* The caller will recheck the condition and return success if
* we were already woken up, we can not miss the event because
* wakeup locks/unlocks the same q->lock.
*
* But we need to ensure that set-condition + wakeup after that
* can't see us, it should wake up another exclusive waiter if
* we fail.
*/
list_del_init(&wait->task_list);
ret = -ERESTARTSYS;
} else {
if (list_empty(&wait->task_list)) {
if (wait->flags & WQ_FLAG_EXCLUSIVE)
__add_wait_queue_tail(q, wait);
else
__add_wait_queue(q, wait);
}
set_current_state(state);
} }
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags); spin_unlock_irqrestore(&q->lock, flags);
return 0; return ret;
} }
EXPORT_SYMBOL(prepare_to_wait_event); EXPORT_SYMBOL(prepare_to_wait_event);
...@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) ...@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
} }
EXPORT_SYMBOL(finish_wait); EXPORT_SYMBOL(finish_wait);
/**
* abort_exclusive_wait - abort exclusive waiting in a queue
* @q: waitqueue waited on
* @wait: wait descriptor
* @mode: runstate of the waiter to be woken
* @key: key to identify a wait bit queue or %NULL
*
* Sets current thread back to running state and removes
* the wait descriptor from the given waitqueue if still
* queued.
*
* Wakes up the next waiter if the caller is concurrently
* woken up through the queue.
*
* This prevents waiter starvation where an exclusive waiter
* aborts and is woken up concurrently and no one wakes up
* the next waiter.
*/
void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
unsigned int mode, void *key)
{
unsigned long flags;
__set_current_state(TASK_RUNNING);
spin_lock_irqsave(&q->lock, flags);
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
__wake_up_locked_key(q, mode, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{ {
int ret = default_wake_function(wait, mode, sync, key); int ret = default_wake_function(wait, mode, sync, key);
...@@ -425,20 +413,29 @@ int __sched ...@@ -425,20 +413,29 @@ int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
wait_bit_action_f *action, unsigned mode) wait_bit_action_f *action, unsigned mode)
{ {
do { int ret = 0;
int ret;
for (;;) {
prepare_to_wait_exclusive(wq, &q->wait, mode); prepare_to_wait_exclusive(wq, &q->wait, mode);
if (!test_bit(q->key.bit_nr, q->key.flags)) if (test_bit(q->key.bit_nr, q->key.flags)) {
continue; ret = action(&q->key, mode);
ret = action(&q->key, mode); /*
if (!ret) * See the comment in prepare_to_wait_event().
continue; * finish_wait() does not necessarily takes wq->lock,
abort_exclusive_wait(wq, &q->wait, mode, &q->key); * but test_and_set_bit() implies mb() which pairs with
return ret; * smp_mb__after_atomic() before wake_up_page().
} while (test_and_set_bit(q->key.bit_nr, q->key.flags)); */
finish_wait(wq, &q->wait); if (ret)
return 0; finish_wait(wq, &q->wait);
}
if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
if (!ret)
finish_wait(wq, &q->wait);
return 0;
} else if (ret) {
return ret;
}
}
} }
EXPORT_SYMBOL(__wait_on_bit_lock); EXPORT_SYMBOL(__wait_on_bit_lock);
......
...@@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) ...@@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_park()) { if (kthread_should_park()) {
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->park && td->status == HP_THREAD_ACTIVE) { if (ht->park && td->status == HP_THREAD_ACTIVE) {
BUG_ON(td->cpu != smp_processor_id()); BUG_ON(td->cpu != smp_processor_id());
ht->park(td->cpu); ht->park(td->cpu);
td->status = HP_THREAD_PARKED; td->status = HP_THREAD_PARKED;
} }
preempt_enable();
kthread_parkme(); kthread_parkme();
/* We might have been woken for stop */ /* We might have been woken for stop */
continue; continue;
......
...@@ -121,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) ...@@ -121,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
cpu_stop_init_done(&done, 1); cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work)) if (!cpu_stop_queue_work(cpu, &work))
return -ENOENT; return -ENOENT;
/*
* In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
* cycle by doing a preemption:
*/
cond_resched();
wait_for_completion(&done.completion); wait_for_completion(&done.completion);
return done.ret; return done.ret;
} }
......
...@@ -1165,7 +1165,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) ...@@ -1165,7 +1165,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
} }
/* See similar comment in do_numa_page for explanation */ /* See similar comment in do_numa_page for explanation */
if (!(vma->vm_flags & VM_WRITE)) if (!pmd_write(pmd))
flags |= TNF_NO_GROUP; flags |= TNF_NO_GROUP;
/* /*
......
...@@ -3395,7 +3395,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) ...@@ -3395,7 +3395,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
* pte_dirty has unpredictable behaviour between PTE scan updates, * pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour. * background writeback, dirty balancing and application behaviour.
*/ */
if (!(vma->vm_flags & VM_WRITE)) if (!pte_write(pte))
flags |= TNF_NO_GROUP; flags |= TNF_NO_GROUP;
/* /*
......
...@@ -175,6 +175,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func, ...@@ -175,6 +175,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
"__stack_chk_fail", "__stack_chk_fail",
"panic", "panic",
"do_exit", "do_exit",
"do_task_dead",
"__module_put_and_exit", "__module_put_and_exit",
"complete_and_exit", "complete_and_exit",
"kvm_spurious_fault", "kvm_spurious_fault",
......