Commit e00d4135 authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Make nohz housekeeping processing more permissive and less
     intrusive to isolated CPUs

   - Decouple CPU-bound workqueue accounting from the scheduler and move
     it into the workqueue code.

   - Optimize topology building

   - Better handle quota and period overflows

   - Add more RCU annotations

   - Comment updates, misc cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (25 commits)
  nohz_full: Allow the boot CPU to be nohz_full
  sched/isolation: Require a present CPU in housekeeping mask
  kernel/cpu: Allow non-zero CPU to be primary for suspend / kexec freeze
  power/suspend: Add function to disable secondaries for suspend
  sched/core: Allow the remote scheduler tick to be started on CPU0
  sched/nohz: Run NOHZ idle load balancer on HK_FLAG_MISC CPUs
  sched/debug: Fix spelling mistake "logaritmic" -> "logarithmic"
  sched/topology: Update init_sched_domains() comment
  cgroup/cpuset: Update stale generate_sched_domains() comments
  sched/core: Check quota and period overflow at usec to nsec conversion
  sched/core: Handle overflow in cpu_shares_write_u64
  sched/rt: Check integer overflow at usec to nsec conversion
  sched/core: Fix typo in comment
  sched/core: Make some functions static
  sched/core: Unify p->on_rq updates
  sched/core: Remove ttwu_activate()
  sched/core, workqueues: Distangle worker accounting from rq lock
  sched/fair: Remove unneeded prototype of capacity_of()
  sched/topology: Skip duplicate group rewrites in build_sched_groups()
  sched/topology: Fix build_sched_groups() comment
  ...
parents 90489a72 08ae95f4
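
As background for the "Better handle quota and period overflows" item above, the sched/core and sched/rt hunks below all add the same guard before multiplying a user-supplied microsecond value by NSEC_PER_USEC. The following is a minimal standalone sketch of that pattern (illustrative only; the helper name and the userspace-style includes are assumptions, not kernel code):

#include <stdint.h>
#include <errno.h>

#define NSEC_PER_USEC 1000ULL

/* Reject any usec value whose conversion to nsec would wrap a 64-bit
 * integer, mirroring the "(u64)x > U64_MAX / NSEC_PER_USEC" checks
 * added in the diff below. */
static int usecs_to_nsecs_checked(uint64_t usecs, uint64_t *nsecs)
{
	if (usecs > UINT64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	*nsecs = usecs * NSEC_PER_USEC;
	return 0;
}
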
@@ -313,6 +313,10 @@ config ARCH_SUSPEND_POSSIBLE
		   (PPC_85xx && !PPC_E500MC) || PPC_86xx || PPC_PSERIES \
		   || 44x || 40x

+config ARCH_SUSPEND_NONZERO_CPU
+	def_bool y
+	depends on PPC_POWERNV || PPC_PSERIES
+
config PPC_DCR_NATIVE
	bool
...
@@ -137,9 +137,26 @@ static inline int disable_nonboot_cpus(void)
	return freeze_secondary_cpus(0);
}
extern void enable_nonboot_cpus(void);
+
+static inline int suspend_disable_secondary_cpus(void)
+{
+	int cpu = 0;
+
+	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU))
+		cpu = -1;
+
+	return freeze_secondary_cpus(cpu);
+}
+static inline void suspend_enable_secondary_cpus(void)
+{
+	return enable_nonboot_cpus();
+}
+
#else /* !CONFIG_PM_SLEEP_SMP */
static inline int disable_nonboot_cpus(void) { return 0; }
static inline void enable_nonboot_cpus(void) {}
+static inline int suspend_disable_secondary_cpus(void) { return 0; }
+static inline void suspend_enable_secondary_cpus(void) { }
#endif /* !CONFIG_PM_SLEEP_SMP */

void cpu_startup_entry(enum cpuhp_state state);
...
@@ -18,7 +18,7 @@
 * awoken.
 */
struct rcuwait {
-	struct task_struct *task;
+	struct task_struct __rcu *task;
};

#define __RCUWAIT_INITIALIZER(name) \
...
@@ -76,8 +76,8 @@ struct sched_domain_shared {
struct sched_domain {
	/* These fields must be setup */
-	struct sched_domain *parent;	/* top domain must be null terminated */
-	struct sched_domain *child;	/* bottom domain must be null terminated */
+	struct sched_domain __rcu *parent;	/* top domain must be null terminated */
+	struct sched_domain __rcu *child;	/* bottom domain must be null terminated */
	struct sched_group *groups;	/* the balancing groups of the domain */
	unsigned long min_interval;	/* Minimum balance interval ms */
	unsigned long max_interval;	/* Maximum balance interval ms */
...
@@ -740,11 +740,10 @@ static inline int nr_cpusets(void)
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
- * q  - a linked-list queue of cpuset pointers, used to implement a
- *	top-down scan of all cpusets. This scan loads a pointer
- *	to each cpuset marked is_sched_load_balance into the
- *	array 'csa'. For our purposes, rebuilding the schedulers
- *	sched domains, we can ignore !is_sched_load_balance cpusets.
+ * cp - cpuset pointer, used (together with pos_css) to perform a
+ *	top-down scan of all cpusets. For our purposes, rebuilding
+ *	the schedulers sched domains, we can ignore !is_sched_load_
+ *	balance cpusets.
 * csa - (for CpuSet Array) Array of pointers to all the cpusets
 *	 that need to be load balanced, for convenient iterative
 *	 access by the subsequent code that finds the best partition,
@@ -775,7 +774,7 @@ static inline int nr_cpusets(void)
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
-	struct cpuset *cp;	/* scans q */
+	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
...
@@ -9,6 +9,7 @@
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
@@ -1199,8 +1200,15 @@ int freeze_secondary_cpus(int primary)
	int cpu, error = 0;

	cpu_maps_update_begin();
-	if (!cpu_online(primary))
-		primary = cpumask_first(cpu_online_mask);
+	if (primary == -1) {
+		primary = cpumask_first(cpu_online_mask);
+		if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
+			primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+	} else {
+		if (!cpu_online(primary))
+			primary = cpumask_first(cpu_online_mask);
+	}
	/*
	 * We take down all of the non-boot CPUs in one shot to avoid races
	 * with the userspace trying to use the CPU hotplug at the same time
...
@@ -1150,7 +1150,7 @@ int kernel_kexec(void)
		error = dpm_suspend_end(PMSG_FREEZE);
		if (error)
			goto Resume_devices;
-		error = disable_nonboot_cpus();
+		error = suspend_disable_secondary_cpus();
		if (error)
			goto Enable_cpus;
		local_irq_disable();
@@ -1183,7 +1183,7 @@ int kernel_kexec(void)
 Enable_irqs:
		local_irq_enable();
 Enable_cpus:
-		enable_nonboot_cpus();
+		suspend_enable_secondary_cpus();
		dpm_resume_start(PMSG_RESTORE);
 Resume_devices:
		dpm_resume_end(PMSG_RESTORE);
...
@@ -114,6 +114,15 @@ config PM_SLEEP_SMP
	depends on PM_SLEEP
	select HOTPLUG_CPU

+config PM_SLEEP_SMP_NONZERO_CPU
+	def_bool y
+	depends on PM_SLEEP_SMP
+	depends on ARCH_SUSPEND_NONZERO_CPU
+	---help---
+	If an arch can suspend (for suspend, hibernate, kexec, etc) on a
+	non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
+	will allow nohz_full mask to include CPU0.
+
config PM_AUTOSLEEP
	bool "Opportunistic sleep"
	depends on PM_SLEEP
...
@@ -281,7 +281,7 @@ static int create_image(int platform_mode)
	if (error || hibernation_test(TEST_PLATFORM))
		goto Platform_finish;

-	error = disable_nonboot_cpus();
+	error = suspend_disable_secondary_cpus();
	if (error || hibernation_test(TEST_CPUS))
		goto Enable_cpus;
@@ -323,7 +323,7 @@ static int create_image(int platform_mode)
	local_irq_enable();

 Enable_cpus:
-	enable_nonboot_cpus();
+	suspend_enable_secondary_cpus();

 Platform_finish:
	platform_finish(platform_mode);
@@ -417,7 +417,7 @@ int hibernation_snapshot(int platform_mode)
int __weak hibernate_resume_nonboot_cpu_disable(void)
{
-	return disable_nonboot_cpus();
+	return suspend_disable_secondary_cpus();
}

/**
@@ -486,7 +486,7 @@ static int resume_target_kernel(bool platform_mode)
	local_irq_enable();

 Enable_cpus:
-	enable_nonboot_cpus();
+	suspend_enable_secondary_cpus();

 Cleanup:
	platform_restore_cleanup(platform_mode);
@@ -564,7 +564,7 @@ int hibernation_platform_enter(void)
	if (error)
		goto Platform_finish;

-	error = disable_nonboot_cpus();
+	error = suspend_disable_secondary_cpus();
	if (error)
		goto Enable_cpus;
@@ -586,7 +586,7 @@ int hibernation_platform_enter(void)
	local_irq_enable();

 Enable_cpus:
-	enable_nonboot_cpus();
+	suspend_enable_secondary_cpus();

 Platform_finish:
	hibernation_ops->finish();
...
@@ -428,7 +428,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
	if (suspend_test(TEST_PLATFORM))
		goto Platform_wake;

-	error = disable_nonboot_cpus();
+	error = suspend_disable_secondary_cpus();
	if (error || suspend_test(TEST_CPUS))
		goto Enable_cpus;
@@ -458,7 +458,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
	BUG_ON(irqs_disabled());

 Enable_cpus:
-	enable_nonboot_cpus();
+	suspend_enable_secondary_cpus();

 Platform_wake:
	platform_resume_noirq(state);
...
@@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
+
+	p->on_rq = TASK_ON_RQ_QUEUED;
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
+	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;
@@ -920,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
}

/*
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
 * __set_cpus_allowed_ptr() and select_fallback_rq().
 */
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
@@ -1236,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
		rq_pin_lock(src_rq, &srf);
		rq_pin_lock(dst_rq, &drf);

-		p->on_rq = TASK_ON_RQ_MIGRATING;
		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
-		p->on_rq = TASK_ON_RQ_QUEUED;
		check_preempt_curr(dst_rq, p, 0);

		rq_unpin_lock(dst_rq, &drf);
@@ -1680,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
		__schedstat_inc(p->se.statistics.nr_wakeups_sync);
}

-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
-{
-	activate_task(rq, p, en_flags);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-
-	/* If a worker is waking up, notify the workqueue: */
-	if (p->flags & PF_WQ_WORKER)
-		wq_worker_waking_up(p, cpu_of(rq));
-}
-
/*
 * Mark the task runnable and perform wakeup-preemption.
 */
@@ -1741,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
		en_flags |= ENQUEUE_MIGRATED;
#endif

-	ttwu_activate(rq, p, en_flags);
+	activate_task(rq, p, en_flags);
	ttwu_do_wakeup(rq, p, wake_flags, rf);
}

@@ -2105,56 +2097,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
	return success;
}

-/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- * @rf: request-queue flags for pinning
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
-{
-	struct rq *rq = task_rq(p);
-
-	if (WARN_ON_ONCE(rq != this_rq()) ||
-	    WARN_ON_ONCE(p == current))
-		return;
-
-	lockdep_assert_held(&rq->lock);
-
-	if (!raw_spin_trylock(&p->pi_lock)) {
-		/*
-		 * This is OK, because current is on_cpu, which avoids it being
-		 * picked for load-balance and preemption/IRQs are still
-		 * disabled avoiding further scheduler activity on it and we've
-		 * not yet picked a replacement task.
-		 */
-		rq_unlock(rq, rf);
-		raw_spin_lock(&p->pi_lock);
-		rq_relock(rq, rf);
-	}
-
-	if (!(p->state & TASK_NORMAL))
-		goto out;
-
-	trace_sched_waking(p);
-
-	if (!task_on_rq_queued(p)) {
-		if (p->in_iowait) {
-			delayacct_blkio_end(p);
-			atomic_dec(&rq->nr_iowait);
-		}
-		ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
-	}
-
-	ttwu_do_wakeup(rq, p, 0, rf);
-	ttwu_stat(p, smp_processor_id(), 0);
-out:
-	raw_spin_unlock(&p->pi_lock);
-}
-
/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
@@ -2466,7 +2408,6 @@ void wake_up_new_task(struct task_struct *p)
	post_init_entity_util_avg(p);

	activate_task(rq, p, ENQUEUE_NOCLOCK);
-	p->on_rq = TASK_ON_RQ_QUEUED;
	trace_sched_wakeup_new(p);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -3465,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt)
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-			prev->on_rq = 0;

			if (prev->in_iowait) {
				atomic_inc(&rq->nr_iowait);
				delayacct_blkio_start();
			}
-
-			/*
-			 * If a worker went to sleep, notify and ask workqueue
-			 * whether it wants to wake up a task to maintain
-			 * concurrency.
-			 */
-			if (prev->flags & PF_WQ_WORKER) {
-				struct task_struct *to_wakeup;
-
-				to_wakeup = wq_worker_sleeping(prev);
-				if (to_wakeup)
-					try_to_wake_up_local(to_wakeup, &rf);
-			}
		}
		switch_count = &prev->nvcsw;
	}
@@ -3543,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
+
+	/*
+	 * If a worker went to sleep, notify and ask workqueue whether
+	 * it wants to wake up a task to maintain concurrency.
+	 * As this function is called inside the schedule() context,
+	 * we disable preemption to avoid it calling schedule() again
+	 * in the possible wakeup of a kworker.
+	 */
+	if (tsk->flags & PF_WQ_WORKER) {
+		preempt_disable();
+		wq_worker_sleeping(tsk);
+		preempt_enable_no_resched();
+	}
+
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
@@ -3551,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
		blk_schedule_flush_plug(tsk);
}

+static void sched_update_worker(struct task_struct *tsk)
+{
+	if (tsk->flags & PF_WQ_WORKER)
+		wq_worker_running(tsk);
+}
+
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;
@@ -3561,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void)
		__schedule(false);
		sched_preempt_enable_no_resched();
	} while (need_resched());
+	sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -5917,7 +5865,7 @@ void __init sched_init_smp(void)
static int __init migration_init(void)
{
-	sched_rq_cpu_starting(smp_processor_id());
+	sched_cpu_starting(smp_processor_id());
	return 0;
}
early_initcall(migration_init);
@@ -6558,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cftype, u64 shareval)
{
+	if (shareval > scale_load_down(ULONG_MAX))
+		shareval = MAX_SHARES;
	return sched_group_set_shares(css_tg(css), scale_load(shareval));
}

@@ -6573,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
static DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -6653,20 +6603,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
	return ret;
}

-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
	u64 quota, period;

	period = ktime_to_ns(tg->cfs_bandwidth.period);
	if (cfs_quota_us < 0)
		quota = RUNTIME_INF;
-	else
+	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+	else
+		return -EINVAL;

	return tg_set_cfs_bandwidth(tg, period, quota);
}

-long tg_get_cfs_quota(struct task_group *tg)
+static long tg_get_cfs_quota(struct task_group *tg)
{
	u64 quota_us;

@@ -6679,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg)
	return quota_us;
}

-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
	u64 quota, period;

+	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
+		return -EINVAL;
+
	period = (u64)cfs_period_us * NSEC_PER_USEC;
	quota = tg->cfs_bandwidth.quota;

	return tg_set_cfs_bandwidth(tg, period, quota);
}

-long tg_get_cfs_period(struct task_group *tg)
+static long tg_get_cfs_period(struct task_group *tg)
{
	u64 cfs_period_us;
...
@@ -7,7 +7,7 @@
 */
#include "sched.h"

-DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);

/**
 * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
...
@@ -702,7 +702,7 @@ do { \
static const char *sched_tunable_scaling_names[] = {
	"none",
-	"logaritmic",
+	"logarithmic",
	"linear"
};

...
@@ -2597,7 +2597,7 @@ void task_numa_work(struct callback_head *work)
/*
 * Drive the periodic memory faults..
 */
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;
@@ -3571,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 * Synchronize entity load avg of dequeued entity without locking
 * the previous rq.
 */
-void sync_entity_load_avg(struct sched_entity *se)
+static void sync_entity_load_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 last_update_time;
@@ -3584,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se)
 * Task first catches up with cfs_rq, and then subtract
 * itself from the cfs_rq (task must be off the queue now).
 */
-void remove_entity_load_avg(struct sched_entity *se)
+static void remove_entity_load_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	unsigned long flags;
@@ -5145,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);
-static unsigned long capacity_of(int cpu);

static inline bool cpu_overutilized(int cpu)
{
@@ -7521,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
	lockdep_assert_held(&env->src_rq->lock);

-	p->on_rq = TASK_ON_RQ_MIGRATING;
	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
	set_task_cpu(p, env->dst_cpu);
}
@@ -7657,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p)
	BUG_ON(task_rq(p) != rq);
	activate_task(rq, p, ENQUEUE_NOCLOCK);
-	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);
}
@@ -9551,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq)
 * - When one of the busy CPUs notice that there may be an idle rebalancing
 *   needed, they will kick the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
+ * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ *   anywhere yet.
 */
static inline int find_new_ilb(void)
{
-	int ilb = cpumask_first(nohz.idle_cpus_mask);
+	int ilb;

-	if (ilb < nr_cpu_ids && idle_cpu(ilb))
-		return ilb;
+	for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+			      housekeeping_cpumask(HK_FLAG_MISC)) {
+		if (idle_cpu(ilb))
+			return ilb;
+	}

	return nr_cpu_ids;
}

/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
- * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
- * CPU (if there is one).
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
+ * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
 */
static void kick_ilb(unsigned int flags)
{
...
@@ -65,6 +65,7 @@ void __init housekeeping_init(void)
static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
	cpumask_var_t non_housekeeping_mask;
+	cpumask_var_t tmp;
	int err;

	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
@@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
		return 0;
	}

+	alloc_bootmem_cpumask_var(&tmp);
	if (!housekeeping_flags) {
		alloc_bootmem_cpumask_var(&housekeeping_mask);
		cpumask_andnot(housekeeping_mask,
			       cpu_possible_mask, non_housekeeping_mask);
-		if (cpumask_empty(housekeeping_mask))
+
+		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+		if (cpumask_empty(tmp)) {
+			pr_warn("Housekeeping: must include one present CPU, "
+				"using boot CPU:%d\n", smp_processor_id());
			__cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+			__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+		}
	} else {
-		cpumask_var_t tmp;
-
-		alloc_bootmem_cpumask_var(&tmp);
+		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+		if (cpumask_empty(tmp))
+			__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
		cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
		if (!cpumask_equal(tmp, housekeeping_mask)) {
			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
@@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
			free_bootmem_cpumask_var(non_housekeeping_mask);
			return 0;
		}
-		free_bootmem_cpumask_var(tmp);
	}
+	free_bootmem_cpumask_var(tmp);

	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
...
@@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
	if (rt_runtime_us < 0)
		rt_runtime = RUNTIME_INF;
+	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
+		return -EINVAL;

	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
@@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
	u64 rt_runtime, rt_period;

+	if (rt_period_us > U64_MAX / NSEC_PER_USEC)
+		return -EINVAL;
+
	rt_period = rt_period_us * NSEC_PER_USEC;
	rt_runtime = tg->rt_bandwidth.rt_runtime;
...
@@ -780,7 +780,7 @@ struct root_domain {
	 * NULL-terminated list of performance domains intersecting with the
	 * CPUs of the rd. Protected by RCU.
	 */
-	struct perf_domain *pd;
+	struct perf_domain __rcu *pd;
};

extern struct root_domain def_root_domain;
@@ -870,7 +870,7 @@ struct rq {
#ifdef CONFIG_SMP
	struct root_domain		*rd;
-	struct sched_domain		*sd;
+	struct sched_domain __rcu	*sd;

	unsigned long			cpu_capacity;
	unsigned long			cpu_capacity_orig;
@@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
	return sd;
}

-DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;

struct sched_group_capacity {
@@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu)
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

#ifdef CONFIG_CPU_FREQ
-DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);

/**
 * cpufreq_update_util - Take a note about CPU utilization changes.
...
@@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
 * the cpumask of the domain), this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
 */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

static void update_top_cache_domain(int cpu)
@@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;
	struct sched_group *sg;
+	bool already_visited;

	if (child)
		cpu = cpumask_first(sched_domain_span(child));
@@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
	sg = *per_cpu_ptr(sdd->sg, cpu);
	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);

-	/* For claim_allocations: */
-	atomic_inc(&sg->ref);
-	atomic_inc(&sg->sgc->ref);
+	/* Increase refcounts for claim_allocations: */
+	already_visited = atomic_inc_return(&sg->ref) > 1;
+	/* sgc visits should follow a similar trend as sg */
+	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+	/* If we have already visited that group, it's already initialized. */
+	if (already_visited)
+		return sg;

	if (child) {
		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
/*
 * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
 *
 * Assumes the sched_domain tree is fully constructed
 */
@@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
}

/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
 */
int sched_init_domains(const struct cpumask *cpu_map)
{
...
@@ -46,6 +46,14 @@ ktime_t tick_period;
 * procedure also covers cpu hotplug.
 */
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
+ * tick_do_timer_cpu and it should be taken over by an eligible secondary
+ * when one comes online.
+ */
+static int tick_do_timer_boot_cpu __read_mostly = -1;
+#endif

/*
 * Debugging: see timer_list.c
@@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
	}
}

+#ifdef CONFIG_NO_HZ_FULL
+static void giveup_do_timer(void *info)
+{
+	int cpu = *(unsigned int *)info;
+
+	WARN_ON(tick_do_timer_cpu != smp_processor_id());
+
+	tick_do_timer_cpu = cpu;
+}
+
+static void tick_take_do_timer_from_boot(void)
+{
+	int cpu = smp_processor_id();
+	int from = tick_do_timer_boot_cpu;
+
+	if (from >= 0 && from != cpu)
+		smp_call_function_single(from, giveup_do_timer, &cpu, 1);
+}
+#endif
+
/*
 * Setup the tick device
 */
@@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,
		 * this cpu:
		 */
		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
-			if (!tick_nohz_full_cpu(cpu))
-				tick_do_timer_cpu = cpu;
-			else
-				tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+			tick_do_timer_cpu = cpu;
			tick_next_period = ktime_get();
			tick_period = NSEC_PER_SEC / HZ;
+#ifdef CONFIG_NO_HZ_FULL
+			/*
+			 * The boot CPU may be nohz_full, in which case set
+			 * tick_do_timer_boot_cpu so the first housekeeping
+			 * secondary that comes up will take do_timer from
+			 * us.
+			 */
+			if (tick_nohz_full_cpu(cpu))
+				tick_do_timer_boot_cpu = cpu;
+
+		} else if (tick_do_timer_boot_cpu != -1 &&
+						!tick_nohz_full_cpu(cpu)) {
+			tick_take_do_timer_from_boot();
+			tick_do_timer_boot_cpu = -1;
+
+			WARN_ON(tick_do_timer_cpu != cpu);
+#endif
		}

		/*
...
@@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
	 * into a long sleep. If two CPUs happen to assign themselves to
	 * this duty, then the jiffies update is still serialized by
	 * jiffies_lock.
+	 *
+	 * If nohz_full is enabled, this should not happen because the
+	 * tick_do_timer_cpu never relinquishes.
	 */
-	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
-	    && !tick_nohz_full_cpu(cpu))
+	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
+#ifdef CONFIG_NO_HZ_FULL
+		WARN_ON(tick_nohz_full_running);
+#endif
		tick_do_timer_cpu = cpu;
+	}
#endif

	/* Check, if the jiffies need an update */
@@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
static int tick_nohz_cpu_down(unsigned int cpu)
{
	/*
-	 * The boot CPU handles housekeeping duty (unbound timers,
-	 * workqueues, timekeeping, ...) on behalf of full dynticks
+	 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
	 * CPUs. It must remain online when nohz full is enabled.
	 */
	if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
@@ -423,13 +429,16 @@ void __init tick_nohz_init(void)
		return;
	}

-	cpu = smp_processor_id();
-
-	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
-		pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
-			cpu);
-		cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
+	    !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
+		cpu = smp_processor_id();
+
+		if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+			pr_warn("NO_HZ: Clearing %d from nohz_full range "
+				"for timekeeping\n", cpu);
+			cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+		}
	}

	for_each_cpu(cpu, tick_nohz_full_mask)
		context_tracking_cpu_set(cpu);
@@ -904,8 +913,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
		/*
		 * Boot safety: make sure the timekeeping duty has been
		 * assigned before entering dyntick-idle mode,
+		 * tick_do_timer_cpu is TICK_DO_TIMER_BOOT
		 */
-		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+		if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
+			return false;
+
+		/* Should not happen for nohz-full */
+		if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
			return false;
	}
...
@@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool)
}

/**
- * wq_worker_waking_up - a worker is waking up
+ * wq_worker_running - a worker is running again
 * @task: task waking up
- * @cpu: CPU @task is waking up to
 *
- * This function is called during try_to_wake_up() when a worker is
- * being awoken.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
+ * This function is called when a worker returns from schedule()
 */
-void wq_worker_waking_up(struct task_struct *task, int cpu)
+void wq_worker_running(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);

-	if (!(worker->flags & WORKER_NOT_RUNNING)) {
-		WARN_ON_ONCE(worker->pool->cpu != cpu);
+	if (!worker->sleeping)
+		return;
+	if (!(worker->flags & WORKER_NOT_RUNNING))
		atomic_inc(&worker->pool->nr_running);
-	}
+	worker->sleeping = 0;
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
- * This function is called during schedule() when a busy worker is
- * going to sleep. Worker on the same cpu can be woken up by
- * returning pointer to its task.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
- *
- * Return:
- * Worker task on @cpu to wake up, %NULL if none.
+ * This function is called from schedule() when a busy worker is
+ * going to sleep.
 */
-struct task_struct *wq_worker_sleeping(struct task_struct *task)
+void wq_worker_sleeping(struct task_struct *task)
{
-	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+	struct worker *next, *worker = kthread_data(task);
	struct worker_pool *pool;

	/*
@@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
	 * checking NOT_RUNNING.
	 */
	if (worker->flags & WORKER_NOT_RUNNING)
-		return NULL;
+		return;

	pool = worker->pool;

-	/* this can only happen on the local cpu */
-	if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
-		return NULL;
+	if (WARN_ON_ONCE(worker->sleeping))
+		return;
+
+	worker->sleeping = 1;
+	spin_lock_irq(&pool->lock);

	/*
	 * The counterpart of the following dec_and_test, implied mb,
@@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
	 * lock is safe.
	 */
	if (atomic_dec_and_test(&pool->nr_running) &&
-	    !list_empty(&pool->worklist))
-		to_wakeup = first_idle_worker(pool);
-	return to_wakeup ? to_wakeup->task : NULL;
+	    !list_empty(&pool->worklist)) {
+		next = first_idle_worker(pool);
+		if (next)
+			wake_up_process(next->task);
+	}
+	spin_unlock_irq(&pool->lock);
}

/**
@@ -4929,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool)
		 *
		 * WRITE_ONCE() is necessary because @worker->flags may be
		 * tested without holding any lock in
-		 * wq_worker_waking_up(). Without it, NOT_RUNNING test may
+		 * wq_worker_running(). Without it, NOT_RUNNING test may
		 * fail incorrectly leading to premature concurrency
		 * management operations.
		 */
...
@@ -44,6 +44,7 @@ struct worker {
	unsigned long		last_active;	/* L: last active timestamp */
	unsigned int		flags;		/* X: flags */
	int			id;		/* I: worker id */
+	int			sleeping;	/* None */

	/*
	 * Opaque string set with work_set_desc(). Printed out with task
@@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void)
 * Scheduler hooks for concurrency managed workqueue. Only to be used from
 * sched/ and workqueue.c.
 */
-void wq_worker_waking_up(struct task_struct *task, int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task);
+void wq_worker_running(struct task_struct *task);
+void wq_worker_sleeping(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */