Commit 53528695 authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "The main changes in this cycle were:

   - sched/fair load tracking fixes and cleanups (Byungchul Park)

   - Make load tracking frequency scale invariant (Dietmar Eggemann)

   - sched/deadline updates (Juri Lelli)

   - stop machine fixes, cleanups and enhancements for bugs triggered by
     CPU hotplug stress testing (Oleg Nesterov)

   - scheduler preemption code rework: remove PREEMPT_ACTIVE and related
     cleanups (Peter Zijlstra)

   - Rework the sched_info::run_delay code to fix races (Peter Zijlstra)

   - Optimize per entity utilization tracking (Peter Zijlstra)

   - ... misc other fixes, cleanups and smaller updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
  sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
  sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop()
  sched: Start stopper early
  stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark()
  stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark()
  stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled
  stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works()
  stop_machine: Ensure that a queued callback will be called before cpu_stop_park()
  sched/x86: Fix typo in __switch_to() comments
  sched/core: Remove a parameter in the migrate_task_rq() function
  sched/core: Drop unlikely behind BUG_ON()
  sched/core: Fix task and run queue sched_info::run_delay inconsistencies
  sched/numa: Fix task_tick_fair() from disabling numa_balancing
  sched/core: Add preempt_count invariant check
  sched/core: More notrace annotations
  sched/core: Kill PREEMPT_ACTIVE
  sched/core, sched/x86: Kill thread_info::saved_preempt_count
  sched/core: Simplify preempt_count tests
  sched/core: Robustify preemption leak checks
  sched/core: Stop setting PREEMPT_ACTIVE
  ...
parents b831ef2c e73e85f0
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc)
 /*
  * must be macros to avoid header recursion hell
  */
-#define init_task_preempt_count(p) do { \
-        task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
-} while (0)
+#define init_task_preempt_count(p) do { } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
-        task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
         per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
 } while (0)
...
@@ -57,7 +57,6 @@ struct thread_info {
         __u32 flags;            /* low level flags */
         __u32 status;           /* thread synchronous flags */
         __u32 cpu;              /* current CPU */
-        int saved_preempt_count;
         mm_segment_t addr_limit;
         void __user *sysenter_return;
         unsigned int sig_on_uaccess_error:1;
@@ -69,7 +68,6 @@ struct thread_info {
         .task = &tsk,                           \
         .flags = 0,                             \
         .cpu = 0,                               \
-        .saved_preempt_count = INIT_PREEMPT_COUNT,      \
         .addr_limit = KERNEL_DS,                \
 }
...
@@ -279,14 +279,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
                 set_iopl_mask(next->iopl);
 
-        /*
-         * If it were not for PREEMPT_ACTIVE we could guarantee that the
-         * preempt_count of all tasks was equal here and this would not be
-         * needed.
-         */
-        task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-        this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
         /*
          * Now maybe handle debug registers and/or IO bitmaps
          */
...
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         /*
          * Switch FS and GS.
          *
-         * These are even more complicated than FS and GS: they have
+         * These are even more complicated than DS and ES: they have
          * 64-bit bases are that controlled by arch_prctl. Those bases
          * only differ from the values in the GDT or LDT if the selector
          * is 0.
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
          */
         this_cpu_write(current_task, next_p);
 
-        /*
-         * If it were not for PREEMPT_ACTIVE we could guarantee that the
-         * preempt_count of all tasks was equal here and this would not be
-         * needed.
-         */
-        task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-        this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
         /* Reload esp0 and ss1. This changes current_thread_info(). */
         load_sp0(tss, next);
...
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc)
  * must be macros to avoid header recursion hell
  */
 #define init_task_preempt_count(p) do { \
-        task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
+        task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
 } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
...
@@ -26,7 +26,6 @@
  * SOFTIRQ_MASK: 0x0000ff00
  * HARDIRQ_MASK: 0x000f0000
  *     NMI_MASK: 0x00100000
- * PREEMPT_ACTIVE: 0x00200000
  * PREEMPT_NEED_RESCHED: 0x80000000
  */
 #define PREEMPT_BITS 8
@@ -53,10 +52,6 @@
 #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
 
-#define PREEMPT_ACTIVE_BITS 1
-#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-
 /* We use the MSB mostly because its available */
 #define PREEMPT_NEED_RESCHED 0x80000000
@@ -126,8 +121,7 @@
  * Check whether we were atomic before we did preempt_disable():
  * (used by the scheduler)
  */
-#define in_atomic_preempt_off() \
-                ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
+#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
 
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
@@ -146,18 +140,6 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
-#define preempt_active_enter() \
-do { \
-        preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-        barrier(); \
-} while (0)
-
-#define preempt_active_exit() \
-do { \
-        barrier(); \
-        preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-} while (0)
-
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
...
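With PREEMPT_ACTIVE gone, the preempt_count word in the hunk above is back to just the PREEMPT/SOFTIRQ/HARDIRQ/NMI fields plus the PREEMPT_NEED_RESCHED MSB. As a rough, stand-alone sketch (the *_BITS/*_SHIFT constants are copied here for illustration, not taken from the kernel headers), the mask arithmetic works out to the values quoted in the comment:

/* Sketch only: constants mirror the values visible in the hunk above,
 * redefined locally so this compiles outside the kernel tree. */
#include <assert.h>
#include <stdio.h>

#define PREEMPT_BITS    8
#define SOFTIRQ_BITS    8
#define HARDIRQ_BITS    4
#define NMI_BITS        1

#define PREEMPT_SHIFT   0
#define SOFTIRQ_SHIFT   (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT   (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT       (HARDIRQ_SHIFT + HARDIRQ_BITS)

#define __IRQ_MASK(x)   ((1UL << (x)) - 1)

int main(void)
{
        /* These reproduce the masks listed in the comment above. */
        assert((__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) == 0x0000ff00UL);
        assert((__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) == 0x000f0000UL);
        assert((__IRQ_MASK(NMI_BITS) << NMI_SHIFT) == 0x00100000UL);

        /* The retired PREEMPT_ACTIVE flag sat directly above NMI_MASK. */
        printf("former PREEMPT_ACTIVE bit: %#lx\n",
               __IRQ_MASK(1) << (NMI_SHIFT + NMI_BITS));        /* 0x00200000 */
        return 0;
}

Running it confirms the masks in the comment and shows that the removed PREEMPT_ACTIVE flag occupied bit 21 (0x00200000), exactly the value dropped from the layout comment.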
@@ -599,20 +599,26 @@ struct task_cputime_atomic {
         .sum_exec_runtime = ATOMIC64_INIT(0), \
 }
 
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
-#else
-#define PREEMPT_DISABLED PREEMPT_ENABLED
-#endif
+#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 
 /*
- * Disable preemption until the scheduler is running.
- * Reset by start_kernel()->sched_init()->init_idle().
+ * Disable preemption until the scheduler is running -- use an unconditional
+ * value so that it also works on !PREEMPT_COUNT kernels.
  *
- * We include PREEMPT_ACTIVE to avoid cond_resched() from working
- * before the scheduler is active -- see should_resched().
+ * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+ */
+#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
+
+/*
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
+ * which states that during context switches:
+ *
+ *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
  */
-#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE)
+#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 
 /**
  * struct thread_group_cputimer - thread group interval timer counts
@@ -1142,8 +1148,6 @@ struct sched_domain_topology_level {
 #endif
 };
 
-extern struct sched_domain_topology_level *sched_domain_topology;
-
 extern void set_sched_topology(struct sched_domain_topology_level *tl);
 
 extern void wake_up_if_idle(int cpu);
@@ -1192,10 +1196,10 @@ struct load_weight {
 
 /*
  * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors the amount of time that a sched_entity is
- *    runnable on a rq into its weight. For cfs_rq, it is the aggregated
- *    such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency scaling into the amount of time
+ * 1) load_avg factors frequency scaling into the amount of time that a
+ *    sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+ *    aggregated such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency and cpu scaling into the amount of time
  *    that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
  *    For cfs_rq, it is the aggregated such times of all runnable and
  *    blocked sched_entities.
...
@@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p)
         return dl_prio(p->prio);
 }
 
+static inline bool dl_time_before(u64 a, u64 b)
+{
+        return (s64)(a - b) < 0;
+}
+
 #endif /* _SCHED_DEADLINE_H */
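The dl_time_before() helper added above (now shared through this header instead of duplicated in cpudeadline.c and kernel/sched/sched.h, as the later hunks show) compares deadlines through a signed difference, so the ordering stays correct even across u64 wraparound. A minimal user-space sketch of the idiom, not kernel code:

/* Stand-alone illustration of the signed-difference comparison used by
 * dl_time_before(); the typedefs mimic the kernel's u64/s64. */
#include <assert.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t s64;

static inline int dl_time_before(u64 a, u64 b)
{
        return (s64)(a - b) < 0;
}

int main(void)
{
        u64 near_wrap = UINT64_MAX - 10;        /* deadline just before wraparound */
        u64 wrapped = 5;                        /* deadline just after wraparound */

        /* A naive '<' would claim the wrapped value comes first ... */
        assert(wrapped < near_wrap);
        /* ... while the signed difference keeps the intended ordering. */
        assert(dl_time_before(near_wrap, wrapped));
        assert(!dl_time_before(wrapped, near_wrap));
        return 0;
}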
@@ -24,9 +24,6 @@ struct smpboot_thread_data;
  *                      parked (cpu offline)
  * @unpark:             Optional unpark function, called when the thread is
  *                      unparked (cpu online)
- * @pre_unpark:         Optional unpark function, called before the thread is
- *                      unparked (cpu online). This is not guaranteed to be
- *                      called on the target cpu of the thread. Careful!
  * @cpumask:            Internal state. To update which threads are unparked,
  *                      call smpboot_update_cpumask_percpu_thread().
  * @selfparking:        Thread is not parked by the park function.
@@ -42,7 +39,6 @@ struct smp_hotplug_thread {
         void (*cleanup)(unsigned int cpu, bool online);
         void (*park)(unsigned int cpu);
         void (*unpark)(unsigned int cpu);
-        void (*pre_unpark)(unsigned int cpu);
         cpumask_var_t cpumask;
         bool selfparking;
         const char *thread_comm;
...
@@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
                          struct cpu_stop_work *work_buf);
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+void stop_machine_park(int cpu);
+void stop_machine_unpark(int cpu);
 
 #else   /* CONFIG_SMP */
...
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
              TP_ARGS(p));
 
 #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
 {
-        long state = p->state;
-
-#ifdef CONFIG_PREEMPT
 #ifdef CONFIG_SCHED_DEBUG
         BUG_ON(p != current);
 #endif /* CONFIG_SCHED_DEBUG */
+
         /*
-         * For all intents and purposes a preempted task is a running task.
+         * Preemption ignores task state, therefore preempted tasks are always
+         * RUNNING (we will not have dequeued if state != RUNNING).
          */
-        if (preempt_count() & PREEMPT_ACTIVE)
-                state = TASK_RUNNING | TASK_STATE_MAX;
-#endif /* CONFIG_PREEMPT */
-
-        return state;
+        return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
 }
 #endif /* CREATE_TRACE_POINTS */
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
  */
 TRACE_EVENT(sched_switch,
 
-        TP_PROTO(struct task_struct *prev,
+        TP_PROTO(bool preempt,
+                 struct task_struct *prev,
                  struct task_struct *next),
 
-        TP_ARGS(prev, next),
+        TP_ARGS(preempt, prev, next),
 
         TP_STRUCT__entry(
                 __array( char, prev_comm, TASK_COMM_LEN )
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch,
                 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                 __entry->prev_pid = prev->pid;
                 __entry->prev_prio = prev->prio;
-                __entry->prev_state = __trace_sched_switch_state(prev);
+                __entry->prev_state = __trace_sched_switch_state(preempt, prev);
                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                 __entry->next_pid = next->pid;
                 __entry->next_prio = next->prio;
...
@@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
 {
         struct task_struct *g, *p;
 
-        read_lock_irq(&tasklist_lock);
-        do_each_thread(g, p) {
+        read_lock(&tasklist_lock);
+        for_each_process_thread(g, p) {
                 if (!p->on_rq)
                         continue;
                 /*
@@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
                 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
                         p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
-        } while_each_thread(g, p);
-        read_unlock_irq(&tasklist_lock);
+        }
+        read_unlock(&tasklist_lock);
 }
 
 struct take_cpu_down_param {
@@ -331,7 +331,7 @@ static int take_cpu_down(void *_param)
         /* Give up timekeeping duties */
         tick_handover_do_timer();
         /* Park the stopper thread */
-        kthread_park(current);
+        stop_machine_park((long)param->hcpu);
         return 0;
 }
...
@@ -706,10 +706,12 @@ void do_exit(long code)
         smp_mb();
         raw_spin_unlock_wait(&tsk->pi_lock);
 
-        if (unlikely(in_atomic()))
+        if (unlikely(in_atomic())) {
                 pr_info("note: %s[%d] exited with preempt_count %d\n",
                         current->comm, task_pid_nr(current),
                         preempt_count());
+                preempt_count_set(PREEMPT_ENABLED);
+        }
 
         /* sync mm's RSS info before statistics gathering */
         if (tsk->mm)
...
@@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
          * then right waiter has a dl_prio() too.
          */
         if (dl_prio(left->prio))
-                return (left->task->dl.deadline < right->task->dl.deadline);
+                return dl_time_before(left->task->dl.deadline,
+                                      right->task->dl.deadline);
 
         return 0;
 }
...
This diff is collapsed.
@@ -31,11 +31,6 @@ static inline int right_child(int i)
         return (i << 1) + 2;
 }
 
-static inline int dl_time_before(u64 a, u64 b)
-{
-        return (s64)(a - b) < 0;
-}
-
 static void cpudl_exchange(struct cpudl *cp, int a, int b)
 {
         int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
...
@@ -2,6 +2,7 @@
 #define _LINUX_CPUDL_H
 
 #include <linux/sched.h>
+#include <linux/sched/deadline.h>
 
 #define IDX_INVALID     -1
...
This diff is collapsed.
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
  */
 SCHED_FEAT(WAKEUP_PREEMPTION, true)
 
-/*
- * Use arch dependent cpu capacity functions
- */
-SCHED_FEAT(ARCH_CAPACITY, true)
-
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
-
-/*
- * Apply the automatic NUMA scheduling policy. Enabled automatically
- * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
- */
-#ifdef CONFIG_NUMA_BALANCING
-
-/*
- * NUMA will favor moving tasks towards nodes where a higher number of
- * hinting faults are recorded during active load balancing. It will
- * resist moving tasks towards nodes where a lower number of hinting
- * faults have been recorded.
- */
-SCHED_FEAT(NUMA, true)
-#endif
+SCHED_FEAT(ATTACH_AGE_LOAD, true)
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 /*
  * We ran out of runtime, see if we can borrow some from our neighbours.
  */
-static int do_balance_runtime(struct rt_rq *rt_rq)
+static void do_balance_runtime(struct rt_rq *rt_rq)
 {
         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
         struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
-        int i, weight, more = 0;
+        int i, weight;
         u64 rt_period;
 
         weight = cpumask_weight(rd->span);
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
                         diff = rt_period - rt_rq->rt_runtime;
                         iter->rt_runtime -= diff;
                         rt_rq->rt_runtime += diff;
-                        more = 1;
                         if (rt_rq->rt_runtime == rt_period) {
                                 raw_spin_unlock(&iter->rt_runtime_lock);
                                 break;
@@ -683,8 +682,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
                 raw_spin_unlock(&iter->rt_runtime_lock);
         }
         raw_spin_unlock(&rt_b->rt_runtime_lock);
-
-        return more;
 }
 
 /*
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq)
         }
 }
 
-static int balance_runtime(struct rt_rq *rt_rq)
+static void balance_runtime(struct rt_rq *rt_rq)
 {
-        int more = 0;
-
         if (!sched_feat(RT_RUNTIME_SHARE))
-                return more;
+                return;
 
         if (rt_rq->rt_time > rt_rq->rt_runtime) {
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
-                more = do_balance_runtime(rt_rq);
+                do_balance_runtime(rt_rq);
                 raw_spin_lock(&rt_rq->rt_runtime_lock);
         }
-
-        return more;
 }
 #else /* !CONFIG_SMP */
-static inline int balance_runtime(struct rt_rq *rt_rq)
-{
-        return 0;
-}
+static inline void balance_runtime(struct rt_rq *rt_rq) {}
 #endif /* CONFIG_SMP */
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
...
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
  */
 #define RUNTIME_INF ((u64)~0ULL)
 
+static inline int idle_policy(int policy)
+{
+        return policy == SCHED_IDLE;
+}
 static inline int fair_policy(int policy)
 {
         return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
 {
         return policy == SCHED_DEADLINE;
 }
+static inline bool valid_policy(int policy)
+{
+        return idle_policy(policy) || fair_policy(policy) ||
+                rt_policy(policy) || dl_policy(policy);
+}
 
 static inline int task_has_rt_policy(struct task_struct *p)
 {
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p)
         return dl_policy(p->policy);
 }
 
-static inline bool dl_time_before(u64 a, u64 b)
-{
-        return (s64)(a - b) < 0;
-}
-
 /*
  * Tells if entity @a should preempt entity @b.
  */
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
-#ifdef CONFIG_NUMA_BALANCING
-#define sched_feat_numa(x) sched_feat(x)
-#ifdef CONFIG_SCHED_DEBUG
-#define numabalancing_enabled sched_feat_numa(NUMA)
-#else
-extern bool numabalancing_enabled;
-#endif /* CONFIG_SCHED_DEBUG */
-#else
-#define sched_feat_numa(x) (0)
-#define numabalancing_enabled (0)
-#endif /* CONFIG_NUMA_BALANCING */
+extern struct static_key_false sched_numa_balancing;
 
 static inline u64 global_rt_period(void)
 {
@@ -1157,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#define ENQUEUE_WAKEUP          1
-#define ENQUEUE_HEAD            2
+#define ENQUEUE_WAKEUP          0x01
+#define ENQUEUE_HEAD            0x02
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING          4       /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING          0x04    /* sched_class::task_waking was called */
 #else
-#define ENQUEUE_WAKING          0
+#define ENQUEUE_WAKING          0x00
 #endif
-#define ENQUEUE_REPLENISH       8
+#define ENQUEUE_REPLENISH       0x08
+#define ENQUEUE_RESTORE         0x10
 
-#define DEQUEUE_SLEEP           1
+#define DEQUEUE_SLEEP           0x01
+#define DEQUEUE_SAVE            0x02
 
 #define RETRY_TASK              ((void *)-1UL)
 
@@ -1194,7 +1190,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
         int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-        void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+        void (*migrate_task_rq)(struct task_struct *p);
 
         void (*task_waking) (struct task_struct *task);
         void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1227,7 +1223,7 @@ struct sched_class {
         void (*update_curr) (struct rq *rq);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-        void (*task_move_group) (struct task_struct *p, int on_rq);
+        void (*task_move_group) (struct task_struct *p);
 #endif
 };
 
@@ -1405,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+        if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+                return sd->smt_gain / sd->span_weight;
+
+        return SCHED_CAPACITY_SCALE;
+}
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
         rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
...
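The new default arch_scale_cpu_capacity() above reports full capacity unless the domain shares CPU capacity (SMT), in which case smt_gain is split across the siblings. A rough stand-alone sketch of that arithmetic follows; SCHED_CAPACITY_SCALE (1024) matches the kernel constant, while the smt_gain value of 1178 is only an assumed example of the scheduler's usual SMT default:

/* Sketch of the default capacity calculation, outside the kernel.
 * smt_gain = 1178 (~15% above scale) is an assumption used purely
 * for illustration. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

static unsigned long scale_cpu_capacity(unsigned long smt_gain,
                                        unsigned int span_weight,
                                        int shares_capacity)
{
        if (shares_capacity && span_weight > 1)
                return smt_gain / span_weight;

        return SCHED_CAPACITY_SCALE;
}

int main(void)
{
        /* Two hardware threads sharing a core: each gets ~589, not 1024. */
        printf("SMT-2 sibling capacity: %lu\n", scale_cpu_capacity(1178, 2, 1));
        /* A full core (or no SMT sharing) reports the full scale. */
        printf("full core capacity:     %lu\n", scale_cpu_capacity(1178, 1, 0));
        return 0;
}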
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 {
         struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
 
-        if (ht->pre_unpark)
-                ht->pre_unpark(cpu);
-        kthread_unpark(tsk);
+        if (!ht->selfparking)
+                kthread_unpark(tsk);
 }
 
 void smpboot_unpark_threads(unsigned int cpu)
...
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
         }
 }
 
+static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
+                                  struct cpu_stop_work *work)
+{
+        list_add_tail(&work->list, &stopper->works);
+        wake_up_process(stopper->thread);
+}
+
 /* queue @work to @stopper. if offline, @work is completed immediately */
 static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
         unsigned long flags;
 
         spin_lock_irqsave(&stopper->lock, flags);
-        if (stopper->enabled) {
-                list_add_tail(&work->list, &stopper->works);
-                wake_up_process(stopper->thread);
-        } else
+        if (stopper->enabled)
+                __cpu_stop_queue_work(stopper, work);
+        else
                 cpu_stop_signal_done(work->done, false);
         spin_unlock_irqrestore(&stopper->lock, flags);
 }
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data)
         return err;
 }
 
+static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
+                                    int cpu2, struct cpu_stop_work *work2)
+{
+        struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+        struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+        int err;
+
+        lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+        spin_lock_irq(&stopper1->lock);
+        spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+
+        err = -ENOENT;
+        if (!stopper1->enabled || !stopper2->enabled)
+                goto unlock;
+
+        err = 0;
+        __cpu_stop_queue_work(stopper1, work1);
+        __cpu_stop_queue_work(stopper2, work2);
+unlock:
+        spin_unlock(&stopper2->lock);
+        spin_unlock_irq(&stopper1->lock);
+        lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
+        return err;
+}
+
 /**
  * stop_two_cpus - stops two cpus
  * @cpu1: the cpu to stop
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
         cpu_stop_init_done(&done, 2);
         set_state(&msdata, MULTI_STOP_PREPARE);
 
-        /*
-         * If we observe both CPUs active we know _cpu_down() cannot yet have
-         * queued its stop_machine works and therefore ours will get executed
-         * first. Or its not either one of our CPUs that's getting unplugged,
-         * in which case we don't care.
-         *
-         * This relies on the stopper workqueues to be FIFO.
-         */
-        if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
+        if (cpu1 > cpu2)
+                swap(cpu1, cpu2);
+        if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
                 preempt_enable();
                 return -ENOENT;
         }
 
-        lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
-        cpu_stop_queue_work(cpu1, &work1);
-        cpu_stop_queue_work(cpu2, &work2);
-        lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
-
         preempt_enable();
 
         wait_for_completion(&done.completion);
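Since cpu_stop_queue_two_works() now takes both per-CPU stopper locks, stop_two_cpus() first orders the pair with swap(), so concurrent callers working on the same two CPUs always acquire the locks in the same order, the usual way to rule out an ABBA deadlock. A simplified, non-kernel illustration of that ordering idiom, with pthread mutexes standing in for stopper->lock:

/* Simplified illustration of the "lock the lower-numbered CPU first"
 * ordering used above; pthread mutexes stand in for the per-CPU
 * stopper spinlocks. */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

static pthread_mutex_t stopper_lock[NR_CPUS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static void queue_two(int cpu1, int cpu2)
{
        int tmp;

        /* Always acquire the lower-numbered lock first, like the
         * swap(cpu1, cpu2) in stop_two_cpus(): two callers working on
         * the same pair can then never hold the locks in opposite order. */
        if (cpu1 > cpu2) {
                tmp = cpu1; cpu1 = cpu2; cpu2 = tmp;
        }

        pthread_mutex_lock(&stopper_lock[cpu1]);
        pthread_mutex_lock(&stopper_lock[cpu2]);

        printf("queued work on CPUs %d and %d\n", cpu1, cpu2);

        pthread_mutex_unlock(&stopper_lock[cpu2]);
        pthread_mutex_unlock(&stopper_lock[cpu1]);
}

int main(void)
{
        queue_two(3, 1);        /* locks 1 then 3 */
        queue_two(1, 3);        /* also locks 1 then 3 */
        return 0;
}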
@@ -452,6 +469,18 @@ static void cpu_stopper_thread(unsigned int cpu)
         }
 }
 
+void stop_machine_park(int cpu)
+{
+        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+        /*
+         * Lockless. cpu_stopper_thread() will take stopper->lock and flush
+         * the pending works before it parks, until then it is fine to queue
+         * the new works.
+         */
+        stopper->enabled = false;
+        kthread_park(stopper->thread);
+}
+
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
 static void cpu_stop_create(unsigned int cpu)
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu)
 static void cpu_stop_park(unsigned int cpu)
 {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-        struct cpu_stop_work *work, *tmp;
-        unsigned long flags;
 
-        /* drain remaining works */
-        spin_lock_irqsave(&stopper->lock, flags);
-        list_for_each_entry_safe(work, tmp, &stopper->works, list) {
-                list_del_init(&work->list);
-                cpu_stop_signal_done(work->done, false);
-        }
-        stopper->enabled = false;
-        spin_unlock_irqrestore(&stopper->lock, flags);
+        WARN_ON(!list_empty(&stopper->works));
 }
 
-static void cpu_stop_unpark(unsigned int cpu)
+void stop_machine_unpark(int cpu)
 {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
 
-        spin_lock_irq(&stopper->lock);
         stopper->enabled = true;
-        spin_unlock_irq(&stopper->lock);
+        kthread_unpark(stopper->thread);
 }
 
 static struct smp_hotplug_thread cpu_stop_threads = {
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
         .thread_fn = cpu_stopper_thread,
         .thread_comm = "migration/%u",
         .create = cpu_stop_create,
-        .setup = cpu_stop_unpark,
         .park = cpu_stop_park,
-        .pre_unpark = cpu_stop_unpark,
         .selfparking = true,
 };
 
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void)
         }
 
         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
+        stop_machine_unpark(raw_smp_processor_id());
         stop_machine_initialized = true;
         return 0;
 }
...
@@ -5697,7 +5697,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 }
 
 static void
-ftrace_graph_probe_sched_switch(void *ignore,
+ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
                         struct task_struct *prev, struct task_struct *next)
 {
         unsigned long long timestamp;
...
@@ -16,7 +16,8 @@ static int sched_ref;
 static DEFINE_MUTEX(sched_register_mutex);
 
 static void
-probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
+probe_sched_switch(void *ignore, bool preempt,
+                   struct task_struct *prev, struct task_struct *next)
 {
         if (unlikely(!sched_ref))
                 return;
...
@@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 }
 
 static void notrace
-probe_wakeup_sched_switch(void *ignore,
+probe_wakeup_sched_switch(void *ignore, bool preempt,
                           struct task_struct *prev, struct task_struct *next)
 {
         struct trace_array_cpu *data;
...