Commit 24c56ee0 authored by Linus Torvalds

Merge tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

 - Correct the marking of kthreads which are supposed to run on one
   specific CPU versus those which merely happen to be affine to a single
   CPU, mark per-CPU workqueue threads as such and make sure that the
   marking survives CPU hotplug. Fix CPU hotplug issues with such
   kthreads (a minimal sketch of the distinction follows the commit log
   below).

 - A fix to not push away tasks on CPUs coming online.

 - Have the workqueue CPU hotplug code use cpu_possible_mask when
   breaking affinity on CPU offlining, so that pending workers can also
   finish on CPUs which are onlined later.

 - Dump tasks which haven't vacated a CPU which is currently being
   unplugged.

 - Register a special scale-invariance callback which gets called on
   resume from RAM to re-read APERF/MPERF, and thus make the schedutil
   scaling governor more precise.

* tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Relax the set_cpus_allowed_ptr() semantics
  sched: Fix CPU hotplug / tighten is_per_cpu_kthread()
  sched: Prepare to use balance_push in ttwu()
  workqueue: Restrict affinity change to rescuer
  workqueue: Tag bound workers with KTHREAD_IS_PER_CPU
  kthread: Extract KTHREAD_IS_PER_CPU
  sched: Don't run cpu-online with balance_push() enabled
  workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity
  sched/core: Print out straggler tasks in sched_cpu_dying()
  x86: PM: Register syscore_ops for scale invariance
parents 025929f4 741ba80f
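
The first bullet above hinges on a distinction that is easy to miss: a kthread that is strictly per-CPU (created for one particular CPU and expected to stay there across hotplug, marked KTHREAD_IS_PER_CPU) versus a task whose affinity mask merely happens to contain a single CPU. Below is a minimal sketch of the two cases; it is not part of this commit, the demo_* names are invented, and CPU 1 is assumed to exist.

#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/err.h>

static int demo_threadfn(void *data)
{
        /* Sleep until someone calls kthread_stop() on us. */
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}

static void demo_two_single_cpu_tasks(void)
{
        struct task_struct *strict, *affine;

        /*
         * Strictly per-CPU: created for CPU 1 and, with this series,
         * explicitly marked KTHREAD_IS_PER_CPU by the caller (as
         * kernel/smpboot.c now does). Parking/unparking re-binds it
         * across CPU hotplug and balance_push() leaves it alone.
         */
        strict = kthread_create_on_cpu(demo_threadfn, NULL, 1, "demo/%u");
        if (!IS_ERR(strict)) {
                kthread_set_per_cpu(strict, 1);
                wake_up_process(strict);
        }

        /*
         * Merely affine to one CPU: an ordinary kthread whose cpumask
         * happens to contain only CPU 1. When CPU 1 goes down, the
         * scheduler is free to break this affinity and push it away.
         */
        affine = kthread_run(demo_threadfn, NULL, "demo_affine");
        if (!IS_ERR(affine))
                set_cpus_allowed_ptr(affine, cpumask_of(1));
}

Only the first thread is treated as per-CPU by is_cpu_allowed() and balance_push(); the second one gets pushed off CPU 1 when that CPU is unplugged.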
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
 #include <linux/numa.h>
 #include <linux/pgtable.h>
 #include <linux/overflow.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -2083,6 +2084,23 @@ static void init_counter_refs(void)
 	this_cpu_write(arch_prev_mperf, mperf);
 }
 
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+	.resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
+{
+	/* Bail out if registered already. */
+	if (freq_invariance_syscore_ops.node.prev)
+		return;
+
+	register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
+
 static void init_freq_invariance(bool secondary, bool cppc_ready)
 {
 	bool ret = false;
@@ -2109,6 +2127,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready)
 	if (ret) {
 		init_counter_refs();
 		static_branch_enable(&arch_scale_freq_key);
+		register_freq_invariance_syscore_ops();
 		pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
 	} else {
 		pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
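
For context on the hunks above: syscore_ops callbacks run late in suspend and early in resume, on one CPU with interrupts disabled, which is what makes a .resume hook a convenient place to re-read counters such as APERF/MPERF that are lost across suspend-to-RAM. A self-contained sketch of the same pattern follows; the demo_syscore_* names are invented and not part of this commit.

#include <linux/syscore_ops.h>
#include <linux/init.h>

static int demo_syscore_suspend(void)
{
        /* Save any state that firmware or hardware will lose. */
        return 0;               /* a non-zero return aborts the suspend */
}

static void demo_syscore_resume(void)
{
        /* Re-read or re-program hardware state lost across suspend. */
}

static struct syscore_ops demo_syscore_ops = {
        .suspend = demo_syscore_suspend,
        .resume  = demo_syscore_resume,
};

static int __init demo_syscore_init(void)
{
        register_syscore_ops(&demo_syscore_ops);
        return 0;
}
core_initcall(demo_syscore_init);

register_syscore_ops() only links the ops into a list; the scheduler's instance above never needs unregistering because it lives for the lifetime of the kernel.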
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,6 +33,9 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  unsigned int cpu,
 					  const char *namefmt);
 
+void kthread_set_per_cpu(struct task_struct *k, int cpu);
+bool kthread_is_per_cpu(struct task_struct *k);
+
 /**
  * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -493,11 +493,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 		return p;
 	kthread_bind(p, cpu);
 	/* CPU hotplug need to bind once again when unparking the thread. */
-	set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
 	to_kthread(p)->cpu = cpu;
 	return p;
 }
 
+void kthread_set_per_cpu(struct task_struct *k, int cpu)
+{
+	struct kthread *kthread = to_kthread(k);
+	if (!kthread)
+		return;
+
+	WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
+
+	if (cpu < 0) {
+		clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+		return;
+	}
+
+	kthread->cpu = cpu;
+	set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
+
+bool kthread_is_per_cpu(struct task_struct *k)
+{
+	struct kthread *kthread = to_kthread(k);
+	if (!kthread)
+		return false;
+
+	return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
+
 /**
  * kthread_unpark - unpark a thread created by kthread_create().
  * @k:		thread created by kthread_create().
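
A rough usage sketch of the two helpers added above, mirroring what the workqueue changes later in this diff do. The demo_* wrappers and the assumption that the thread is freshly created and not yet running are mine, not part of the commit.

#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>

/*
 * Bind a freshly created, not-yet-running kthread to @cpu and mark it
 * as a strict per-CPU thread so the scheduler will not push it away
 * while that CPU is going down.
 */
static void demo_bind_worker(struct task_struct *tsk, int cpu)
{
        kthread_bind(tsk, cpu);         /* also sets PF_NO_SETAFFINITY */
        kthread_set_per_cpu(tsk, cpu);  /* mark KTHREAD_IS_PER_CPU */
}

/*
 * When @cpu goes offline, drop the per-CPU marking and let the thread
 * run anywhere, including on CPUs that are onlined only later.
 */
static void demo_unbind_worker(struct task_struct *tsk)
{
        kthread_set_per_cpu(tsk, -1);
        WARN_ON_ONCE(set_cpus_allowed_ptr(tsk, cpu_possible_mask) < 0);
}

/* The PF_KTHREAD && kthread_is_per_cpu() test that balance_push() now uses. */
static bool demo_is_strict_per_cpu(struct task_struct *tsk)
{
        return (tsk->flags & PF_KTHREAD) && kthread_is_per_cpu(tsk);
}

Note that kthread_set_per_cpu(tsk, -1) only clears the marking; widening the affinity mask is still the caller's job, which is why unbind_workers() pairs it with set_cpus_allowed_ptr(cpu_possible_mask) further down.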
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1796,13 +1796,28 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
  */
 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 {
+	/* When not in the task's cpumask, no point in looking further. */
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
+	/* migrate_disabled() must be allowed to finish. */
+	if (is_migration_disabled(p))
 		return cpu_online(cpu);
 
-	return cpu_active(cpu);
+	/* Non kernel threads are not allowed during either online or offline. */
+	if (!(p->flags & PF_KTHREAD))
+		return cpu_active(cpu);
+
+	/* KTHREAD_IS_PER_CPU is always allowed. */
+	if (kthread_is_per_cpu(p))
+		return cpu_online(cpu);
+
+	/* Regular kernel threads don't get to stay during offline. */
+	if (cpu_rq(cpu)->balance_push)
+		return false;
+
+	/* But are allowed during online. */
+	return cpu_online(cpu);
 }
 
 /*
@@ -2327,7 +2342,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 
 	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
 		/*
-		 * Kernel threads are allowed on online && !active CPUs.
+		 * Kernel threads are allowed on online && !active CPUs,
+		 * however, during cpu-hot-unplug, even these might get pushed
+		 * away if not KTHREAD_IS_PER_CPU.
 		 *
 		 * Specifically, migration_disabled() tasks must not fail the
 		 * cpumask_any_and_distribute() pick below, esp. so on
@@ -2371,16 +2388,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 
 	__do_set_cpus_allowed(p, new_mask, flags);
 
-	if (p->flags & PF_KTHREAD) {
-		/*
-		 * For kernel threads that do indeed end up on online &&
-		 * !active we want to ensure they are strict per-CPU threads.
-		 */
-		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
-			!cpumask_intersects(new_mask, cpu_active_mask) &&
-			p->nr_cpus_allowed != 1);
-	}
-
 	return affine_move_task(rq, p, &rf, dest_cpu, flags);
 
 out:
@@ -3121,6 +3128,13 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 
 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 {
+	/*
+	 * Do not complicate things with the async wake_list while the CPU is
+	 * in hotplug state.
+	 */
+	if (!cpu_active(cpu))
+		return false;
+
 	/*
 	 * If the CPU does not share cache, then queue the task on the
 	 * remote rqs wakelist to avoid accessing remote data.
@@ -7276,8 +7290,14 @@ static void balance_push(struct rq *rq)
 	/*
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
+	 *
+	 * XXX: the idle task does not match kthread_is_per_cpu() due to
+	 * histerical raisins.
 	 */
-	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+	if (rq->idle == push_task ||
+	    ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
+	    is_migration_disabled(push_task)) {
+
 		/*
 		 * If this is the idle task on the outgoing CPU try to wake
 		 * up the hotplug control thread which might wait for the
@@ -7309,7 +7329,7 @@ static void balance_push(struct rq *rq)
 	/*
 	 * At this point need_resched() is true and we'll take the loop in
 	 * schedule(). The next pick is obviously going to be the stop task
-	 * which is_per_cpu_kthread() and will push this task away.
+	 * which kthread_is_per_cpu() and will push this task away.
 	 */
 	raw_spin_lock(&rq->lock);
 }
@@ -7320,10 +7340,13 @@ static void balance_push_set(int cpu, bool on)
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
-	if (on)
+	rq->balance_push = on;
+	if (on) {
+		WARN_ON_ONCE(rq->balance_callback);
 		rq->balance_callback = &balance_push_callback;
-	else
+	} else if (rq->balance_callback == &balance_push_callback) {
 		rq->balance_callback = NULL;
+	}
 	rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7441,6 +7464,10 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
+	/*
+	 * Make sure that when the hotplug state machine does a roll-back
+	 * we clear balance_push. Ideally that would happen earlier...
+	 */
 	balance_push_set(cpu, false);
 
 #ifdef CONFIG_SCHED_SMT
@@ -7483,17 +7510,27 @@ int sched_cpu_deactivate(unsigned int cpu)
 	int ret;
 
 	set_cpu_active(cpu, false);
+
+	/*
+	 * From this point forward, this CPU will refuse to run any task that
+	 * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+	 * push those tasks away until this gets cleared, see
+	 * sched_cpu_dying().
+	 */
+	balance_push_set(cpu, true);
+
 	/*
-	 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
-	 * users of this state to go away such that all new such users will
-	 * observe it.
+	 * We've cleared cpu_active_mask / set balance_push, wait for all
+	 * preempt-disabled and RCU users of this state to go away such that
+	 * all new such users will observe it.
+	 *
+	 * Specifically, we rely on ttwu to no longer target this CPU, see
+	 * ttwu_queue_cond() and is_cpu_allowed().
 	 *
 	 * Do sync before park smpboot threads to take care the rcu boost case.
 	 */
 	synchronize_rcu();
 
-	balance_push_set(cpu, true);
-
 	rq_lock_irqsave(rq, &rf);
 	if (rq->rd) {
 		update_rq_clock(rq);
@@ -7574,6 +7611,25 @@ static void calc_load_migrate(struct rq *rq)
 	atomic_long_add(delta, &calc_load_tasks);
 }
 
+static void dump_rq_tasks(struct rq *rq, const char *loglvl)
+{
+	struct task_struct *g, *p;
+	int cpu = cpu_of(rq);
+
+	lockdep_assert_held(&rq->lock);
+
+	printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
+	for_each_process_thread(g, p) {
+		if (task_cpu(p) != cpu)
+			continue;
+
+		if (!task_on_rq_queued(p))
+			continue;
+
+		printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
+	}
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7583,9 +7639,18 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
+	if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+		WARN(true, "Dying CPU not properly vacated!");
+		dump_rq_tasks(rq, KERN_WARNING);
+	}
 	rq_unlock_irqrestore(rq, &rf);
 
+	/*
+	 * Now that the CPU is offline, make sure we're welcome
+	 * to new tasks once we come back up.
+	 */
+	balance_push_set(cpu, false);
+
 	calc_load_migrate(rq);
 	update_max_interval();
 	nohz_balance_exit_idle(rq);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -975,6 +975,7 @@ struct rq {
 	unsigned long		cpu_capacity_orig;
 
 	struct callback_head	*balance_callback;
+	unsigned char		balance_push;
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -188,6 +188,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 		kfree(td);
 		return PTR_ERR(tsk);
 	}
+	kthread_set_per_cpu(tsk, cpu);
 	/*
 	 * Park the thread so that it could start right on the CPU
 	 * when it is available.
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1848,12 +1848,6 @@ static void worker_attach_to_pool(struct worker *worker,
 {
 	mutex_lock(&wq_pool_attach_mutex);
 
-	/*
-	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-	 * online CPUs. It'll be re-applied when any of the CPUs come up.
-	 */
-	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
-
 	/*
 	 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
 	 * stable across this function. See the comments above the flag
@@ -1861,6 +1855,11 @@ static void worker_attach_to_pool(struct worker *worker,
 	 */
 	if (pool->flags & POOL_DISASSOCIATED)
 		worker->flags |= WORKER_UNBOUND;
+	else
+		kthread_set_per_cpu(worker->task, pool->cpu);
+
+	if (worker->rescue_wq)
+		set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
 
 	list_add_tail(&worker->node, &pool->workers);
 	worker->pool = pool;
@@ -1883,6 +1882,7 @@ static void worker_detach_from_pool(struct worker *worker)
 
 	mutex_lock(&wq_pool_attach_mutex);
 
+	kthread_set_per_cpu(worker->task, -1);
 	list_del(&worker->node);
 	worker->pool = NULL;
 
@@ -4919,8 +4919,10 @@ static void unbind_workers(int cpu)
 
 		raw_spin_unlock_irq(&pool->lock);
 
-		for_each_pool_worker(worker, pool)
-			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+		for_each_pool_worker(worker, pool) {
+			kthread_set_per_cpu(worker->task, -1);
+			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+		}
 
 		mutex_unlock(&wq_pool_attach_mutex);
 
@@ -4972,9 +4974,11 @@ static void rebind_workers(struct worker_pool *pool)
 	 * of all workers first and then clear UNBOUND.  As we're called
 	 * from CPU_ONLINE, the following shouldn't fail.
 	 */
-	for_each_pool_worker(worker, pool)
+	for_each_pool_worker(worker, pool) {
+		kthread_set_per_cpu(worker->task, pool->cpu);
 		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
 						  pool->attrs->cpumask) < 0);
+	}
 
 	raw_spin_lock_irq(&pool->lock);