Commit c94d89fa authored by Peter Zijlstra

Merge branch 'sched/core'

parents 7c60610d 234b8ab6
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <linux/cpumask.h> #include <linux/cpumask.h>
#include <linux/nodemask.h> #include <linux/nodemask.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/jump_label.h> #include <linux/jump_label.h>
#ifdef CONFIG_CPUSETS #ifdef CONFIG_CPUSETS
...@@ -58,7 +59,7 @@ extern void cpuset_wait_for_hotplug(void); ...@@ -58,7 +59,7 @@ extern void cpuset_wait_for_hotplug(void);
extern void cpuset_read_lock(void); extern void cpuset_read_lock(void);
extern void cpuset_read_unlock(void); extern void cpuset_read_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed) #define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void); void cpuset_init_current_mems_allowed(void);
...@@ -184,11 +185,12 @@ static inline void cpuset_read_unlock(void) { } ...@@ -184,11 +185,12 @@ static inline void cpuset_read_unlock(void) { }
static inline void cpuset_cpus_allowed(struct task_struct *p, static inline void cpuset_cpus_allowed(struct task_struct *p,
struct cpumask *mask) struct cpumask *mask)
{ {
cpumask_copy(mask, cpu_possible_mask); cpumask_copy(mask, task_cpu_possible_mask(p));
} }
static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
{ {
return false;
} }
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
......
...@@ -14,4 +14,18 @@ ...@@ -14,4 +14,18 @@
static inline void leave_mm(int cpu) { } static inline void leave_mm(int cpu) { }
#endif #endif
/*
* CPUs that are capable of running user task @p. Must contain at least one
* active CPU. It is assumed that the kernel can run on all CPUs, so calling
* this for a kernel thread is pointless.
*
* By default, we assume a sane, homogeneous system.
*/
#ifndef task_cpu_possible_mask
# define task_cpu_possible_mask(p) cpu_possible_mask
# define task_cpu_possible(cpu, p) true
#else
# define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
#endif
#endif #endif
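For context, a minimal sketch of how an asymmetric architecture could override the default above from its own mmu_context.h; the mask and the 32-bit test below are hypothetical names, not part of this series:

/* Hypothetical arch override: 32-bit tasks can only run on a subset of CPUs. */
extern cpumask_t arch_32bit_el0_mask;                   /* assumed arch-provided mask */
extern bool arch_task_is_32bit(struct task_struct *p);  /* assumed arch-provided test */

#define task_cpu_possible_mask(p) \
	(arch_task_is_32bit(p) ? &arch_32bit_el0_mask : cpu_possible_mask)

/* task_cpu_possible(cpu, p) is then picked up from the generic #else branch above. */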
...@@ -748,6 +748,7 @@ struct task_struct { ...@@ -748,6 +748,7 @@ struct task_struct {
unsigned int policy; unsigned int policy;
int nr_cpus_allowed; int nr_cpus_allowed;
const cpumask_t *cpus_ptr; const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
cpumask_t cpus_mask; cpumask_t cpus_mask;
void *migration_pending; void *migration_pending;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
...@@ -1705,6 +1706,11 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_ ...@@ -1705,6 +1706,11 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
extern void release_user_cpus_ptr(struct task_struct *p);
extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
#else #else
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{ {
...@@ -1715,6 +1721,21 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma ...@@ -1715,6 +1721,21 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
return -EINVAL; return -EINVAL;
return 0; return 0;
} }
static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
{
if (src->user_cpus_ptr)
return -EINVAL;
return 0;
}
static inline void release_user_cpus_ptr(struct task_struct *p)
{
WARN_ON(p->user_cpus_ptr);
}
static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
return 0;
}
#endif #endif
extern int yield_to(struct task_struct *p, bool preempt); extern int yield_to(struct task_struct *p, bool preempt);
......
...@@ -28,30 +28,12 @@ enum { sysctl_hung_task_timeout_secs = 0 }; ...@@ -28,30 +28,12 @@ enum { sysctl_hung_task_timeout_secs = 0 };
extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
enum sched_tunable_scaling { enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_NONE,
SCHED_TUNABLESCALING_LOG, SCHED_TUNABLESCALING_LOG,
SCHED_TUNABLESCALING_LINEAR, SCHED_TUNABLESCALING_LINEAR,
SCHED_TUNABLESCALING_END, SCHED_TUNABLESCALING_END,
}; };
extern unsigned int sysctl_sched_tunable_scaling;
extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
#ifdef CONFIG_SCHED_DEBUG
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
#endif
/* /*
* control realtime throttling: * control realtime throttling:
......
...@@ -56,7 +56,7 @@ struct task_struct; ...@@ -56,7 +56,7 @@ struct task_struct;
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
.head = { &(name).head, &(name).head } } .head = LIST_HEAD_INIT(name.head) }
#define DECLARE_WAIT_QUEUE_HEAD(name) \ #define DECLARE_WAIT_QUEUE_HEAD(name) \
struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
......
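The initializer change above is behaviour-neutral (LIST_HEAD_INIT produces the same two-pointer initialisation); for reference, a minimal sketch of how the macro is normally consumed, with made-up names:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);	/* expands to __WAIT_QUEUE_HEAD_INITIALIZER */
static bool my_event;

static void my_waiter(void)
{
	/* Sleep until my_event is observed true. */
	wait_event(my_wq, my_event);
}

static void my_waker(void)
{
	my_event = true;
	wake_up(&my_wq);
}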
...@@ -80,6 +80,7 @@ struct task_struct init_task ...@@ -80,6 +80,7 @@ struct task_struct init_task
.normal_prio = MAX_PRIO - 20, .normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL, .policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask, .cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL, .cpus_mask = CPU_MASK_ALL,
.nr_cpus_allowed= NR_CPUS, .nr_cpus_allowed= NR_CPUS,
.mm = NULL, .mm = NULL,
......
...@@ -372,18 +372,29 @@ static inline bool is_in_v2_mode(void) ...@@ -372,18 +372,29 @@ static inline bool is_in_v2_mode(void)
} }
/* /*
* Return in pmask the portion of a cpusets's cpus_allowed that * Return in pmask the portion of a task's cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy * are online and are capable of running the task. If none are found,
* until we find one that does have some online cpus. * walk up the cpuset hierarchy until we find one that does have some
* appropriate cpus.
* *
* One way or another, we guarantee to return some non-empty subset * One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask. * of cpu_online_mask.
* *
* Call with callback_lock or cpuset_mutex held. * Call with callback_lock or cpuset_mutex held.
*/ */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) static void guarantee_online_cpus(struct task_struct *tsk,
struct cpumask *pmask)
{ {
while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
struct cpuset *cs;
if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
cpumask_copy(pmask, cpu_online_mask);
rcu_read_lock();
cs = task_cs(tsk);
while (!cpumask_intersects(cs->effective_cpus, pmask)) {
cs = parent_cs(cs); cs = parent_cs(cs);
if (unlikely(!cs)) { if (unlikely(!cs)) {
/* /*
...@@ -393,11 +404,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) ...@@ -393,11 +404,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* cpuset's effective_cpus is on its way to be * cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask. * identical to cpu_online_mask.
*/ */
cpumask_copy(pmask, cpu_online_mask); goto out_unlock;
return;
} }
} }
cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); cpumask_and(pmask, pmask, cs->effective_cpus);
out_unlock:
rcu_read_unlock();
} }
/* /*
...@@ -2199,15 +2212,13 @@ static void cpuset_attach(struct cgroup_taskset *tset) ...@@ -2199,15 +2212,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
percpu_down_write(&cpuset_rwsem); percpu_down_write(&cpuset_rwsem);
/* prepare for attach */
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
guarantee_online_cpus(cs, cpus_attach);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to); guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) { cgroup_taskset_for_each(task, css, tset) {
if (cs != &top_cpuset)
guarantee_online_cpus(task, cpus_attach);
else
cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
/* /*
* can_attach beforehand should guarantee that this doesn't * can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here * fail. TODO: have a better way to handle failure here
...@@ -3302,9 +3313,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) ...@@ -3302,9 +3313,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&callback_lock, flags); spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock(); guarantee_online_cpus(tsk, pmask);
guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags); spin_unlock_irqrestore(&callback_lock, flags);
} }
...@@ -3318,13 +3327,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) ...@@ -3318,13 +3327,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* which will not contain a sane cpumask during cases such as cpu hotplugging. * which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if * This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled. * _every_ other avenue has been traveled.
*
* Returns true if the affinity of @tsk was changed, false otherwise.
**/ **/
void cpuset_cpus_allowed_fallback(struct task_struct *tsk) bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{ {
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
const struct cpumask *cs_mask;
bool changed = false;
rcu_read_lock(); rcu_read_lock();
do_set_cpus_allowed(tsk, is_in_v2_mode() ? cs_mask = task_cs(tsk)->cpus_allowed;
task_cs(tsk)->cpus_allowed : cpu_possible_mask); if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
do_set_cpus_allowed(tsk, cs_mask);
changed = true;
}
rcu_read_unlock(); rcu_read_unlock();
/* /*
...@@ -3344,6 +3362,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) ...@@ -3344,6 +3362,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* select_fallback_rq() will fix things ups and set cpu_possible_mask * select_fallback_rq() will fix things ups and set cpu_possible_mask
* if required. * if required.
*/ */
return changed;
} }
void __init cpuset_init_current_mems_allowed(void) void __init cpuset_init_current_mems_allowed(void)
......
...@@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk) ...@@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk) void free_task(struct task_struct *tsk)
{ {
release_user_cpus_ptr(tsk);
scs_release(tsk); scs_release(tsk);
#ifndef CONFIG_THREAD_INFO_IN_TASK #ifndef CONFIG_THREAD_INFO_IN_TASK
...@@ -924,6 +925,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) ...@@ -924,6 +925,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif #endif
if (orig->cpus_ptr == &orig->cpus_mask) if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask; tsk->cpus_ptr = &tsk->cpus_mask;
dup_user_cpus_ptr(tsk, orig, node);
/* /*
* One for the user space visible state that goes away when reaped. * One for the user space visible state that goes away when reaped.
......
...@@ -993,6 +993,7 @@ int get_nohz_timer_target(void) ...@@ -993,6 +993,7 @@ int get_nohz_timer_target(void)
{ {
int i, cpu = smp_processor_id(), default_cpu = -1; int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd; struct sched_domain *sd;
const struct cpumask *hk_mask;
if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
if (!idle_cpu(cpu)) if (!idle_cpu(cpu))
...@@ -1000,10 +1001,11 @@ int get_nohz_timer_target(void) ...@@ -1000,10 +1001,11 @@ int get_nohz_timer_target(void)
default_cpu = cpu; default_cpu = cpu;
} }
hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
rcu_read_lock(); rcu_read_lock();
for_each_domain(cpu, sd) { for_each_domain(cpu, sd) {
for_each_cpu_and(i, sched_domain_span(sd), for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
housekeeping_cpumask(HK_FLAG_TIMER)) {
if (cpu == i) if (cpu == i)
continue; continue;
...@@ -1619,6 +1621,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) ...@@ -1619,6 +1621,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id); uclamp_rq_dec_id(rq, p, clamp_id);
} }
static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
if (!p->uclamp[clamp_id].active)
return;
uclamp_rq_dec_id(rq, p, clamp_id);
uclamp_rq_inc_id(rq, p, clamp_id);
/*
* Make sure to clear the idle flag if we've transiently reached 0
* active tasks on rq.
*/
if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
static inline void static inline void
uclamp_update_active(struct task_struct *p) uclamp_update_active(struct task_struct *p)
{ {
...@@ -1642,12 +1661,8 @@ uclamp_update_active(struct task_struct *p) ...@@ -1642,12 +1661,8 @@ uclamp_update_active(struct task_struct *p)
* affecting a valid clamp bucket, the next time it's enqueued, * affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value. * it will already see the updated clamp bucket value.
*/ */
for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id)
if (p->uclamp[clamp_id].active) { uclamp_rq_reinc_id(rq, p, clamp_id);
uclamp_rq_dec_id(rq, p, clamp_id);
uclamp_rq_inc_id(rq, p, clamp_id);
}
}
task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf);
} }
...@@ -2161,7 +2176,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ...@@ -2161,7 +2176,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
/* Non kernel threads are not allowed during either online or offline. */ /* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD)) if (!(p->flags & PF_KTHREAD))
return cpu_active(cpu); return cpu_active(cpu) && task_cpu_possible(cpu, p);
/* KTHREAD_IS_PER_CPU is always allowed. */ /* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p)) if (kthread_is_per_cpu(p))
...@@ -2468,6 +2483,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ...@@ -2468,6 +2483,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
__do_set_cpus_allowed(p, new_mask, 0); __do_set_cpus_allowed(p, new_mask, 0);
} }
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
if (!src->user_cpus_ptr)
return 0;
dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
if (!dst->user_cpus_ptr)
return -ENOMEM;
cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
return 0;
}
static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
{
struct cpumask *user_mask = NULL;
swap(p->user_cpus_ptr, user_mask);
return user_mask;
}
void release_user_cpus_ptr(struct task_struct *p)
{
kfree(clear_user_cpus_ptr(p));
}
/* /*
* This function is wildly self concurrent; here be dragons. * This function is wildly self concurrent; here be dragons.
* *
...@@ -2685,28 +2728,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag ...@@ -2685,28 +2728,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
} }
/* /*
* Change a given task's CPU affinity. Migrate the thread to a * Called with both p->pi_lock and rq->lock held; drops both before returning.
* proper CPU and schedule it away if the CPU it's executing on
* is removed from the allowed bitmask.
*
* NOTE: the caller must have a valid reference to the task, the
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/ */
static int __set_cpus_allowed_ptr(struct task_struct *p, static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
const struct cpumask *new_mask, const struct cpumask *new_mask,
u32 flags) u32 flags,
struct rq *rq,
struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{ {
const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask; const struct cpumask *cpu_valid_mask = cpu_active_mask;
bool kthread = p->flags & PF_KTHREAD;
struct cpumask *user_mask = NULL;
unsigned int dest_cpu; unsigned int dest_cpu;
struct rq_flags rf;
struct rq *rq;
int ret = 0; int ret = 0;
rq = task_rq_lock(p, &rf);
update_rq_clock(rq); update_rq_clock(rq);
if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { if (kthread || is_migration_disabled(p)) {
/* /*
* Kernel threads are allowed on online && !active CPUs, * Kernel threads are allowed on online && !active CPUs,
* however, during cpu-hot-unplug, even these might get pushed * however, during cpu-hot-unplug, even these might get pushed
...@@ -2720,6 +2761,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, ...@@ -2720,6 +2761,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
cpu_valid_mask = cpu_online_mask; cpu_valid_mask = cpu_online_mask;
} }
if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
ret = -EINVAL;
goto out;
}
/* /*
* Must re-check here, to close a race against __kthread_bind(), * Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag. * sched_setaffinity() is not guaranteed to observe the flag.
...@@ -2754,20 +2800,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, ...@@ -2754,20 +2800,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
__do_set_cpus_allowed(p, new_mask, flags); __do_set_cpus_allowed(p, new_mask, flags);
return affine_move_task(rq, p, &rf, dest_cpu, flags); if (flags & SCA_USER)
user_mask = clear_user_cpus_ptr(p);
ret = affine_move_task(rq, p, rf, dest_cpu, flags);
kfree(user_mask);
return ret;
out: out:
task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, rf);
return ret; return ret;
} }
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
* is removed from the allowed bitmask.
*
* NOTE: the caller must have a valid reference to the task, the
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, u32 flags)
{
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
}
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{ {
return __set_cpus_allowed_ptr(p, new_mask, 0); return __set_cpus_allowed_ptr(p, new_mask, 0);
} }
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
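set_cpus_allowed_ptr() is still the exported entry point after the split; as a reminder of how callers use it (the sketch below is illustrative, not from this series):

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Pin @worker to CPU 0; returns 0 or -errno, like the wrapper above. */
static int pin_worker_to_cpu0(struct task_struct *worker)
{
	return set_cpus_allowed_ptr(worker, cpumask_of(0));
}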
/*
* Change a given task's CPU affinity to the intersection of its current
* affinity mask and @subset_mask, writing the resulting mask to @new_mask
* and pointing @p->user_cpus_ptr to a copy of the old mask.
* If the resulting mask is empty, leave the affinity unchanged and return
* -EINVAL.
*/
static int restrict_cpus_allowed_ptr(struct task_struct *p,
struct cpumask *new_mask,
const struct cpumask *subset_mask)
{
struct cpumask *user_mask = NULL;
struct rq_flags rf;
struct rq *rq;
int err;
if (!p->user_cpus_ptr) {
user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
if (!user_mask)
return -ENOMEM;
}
rq = task_rq_lock(p, &rf);
/*
* Forcefully restricting the affinity of a deadline task is
* likely to cause problems, so fail and noisily override the
* mask entirely.
*/
if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
err = -EPERM;
goto err_unlock;
}
if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
err = -EINVAL;
goto err_unlock;
}
/*
* We're about to butcher the task affinity, so keep track of what
* the user asked for in case we're able to restore it later on.
*/
if (user_mask) {
cpumask_copy(user_mask, p->cpus_ptr);
p->user_cpus_ptr = user_mask;
}
return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
err_unlock:
task_rq_unlock(rq, p, &rf);
kfree(user_mask);
return err;
}
/*
* Restrict the CPU affinity of task @p so that it is a subset of
* task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
* old affinity mask. If the resulting mask is empty, we warn and walk
* up the cpuset hierarchy until we find a suitable mask.
*/
void force_compatible_cpus_allowed_ptr(struct task_struct *p)
{
cpumask_var_t new_mask;
const struct cpumask *override_mask = task_cpu_possible_mask(p);
alloc_cpumask_var(&new_mask, GFP_KERNEL);
/*
* __migrate_task() can fail silently in the face of concurrent
* offlining of the chosen destination CPU, so take the hotplug
* lock to ensure that the migration succeeds.
*/
cpus_read_lock();
if (!cpumask_available(new_mask))
goto out_set_mask;
if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
goto out_free_mask;
/*
* We failed to find a valid subset of the affinity mask for the
* task, so override it based on its cpuset hierarchy.
*/
cpuset_cpus_allowed(p, new_mask);
override_mask = new_mask;
out_set_mask:
if (printk_ratelimit()) {
printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
task_pid_nr(p), p->comm,
cpumask_pr_args(override_mask));
}
WARN_ON(set_cpus_allowed_ptr(p, override_mask));
out_free_mask:
cpus_read_unlock();
free_cpumask_var(new_mask);
}
static int
__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
/*
* Restore the affinity of a task @p which was previously restricted by a
* call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
* @p->user_cpus_ptr.
*
* It is the caller's responsibility to serialise this with any calls to
* force_compatible_cpus_allowed_ptr(@p).
*/
void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
{
struct cpumask *user_mask = p->user_cpus_ptr;
unsigned long flags;
/*
* Try to restore the old affinity mask. If this fails, then
* we free the mask explicitly to avoid it being inherited across
* a subsequent fork().
*/
if (!user_mask || !__sched_setaffinity(p, user_mask))
return;
raw_spin_lock_irqsave(&p->pi_lock, flags);
user_mask = clear_user_cpus_ptr(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
kfree(user_mask);
}
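force_compatible_cpus_allowed_ptr() and relax_compatible_cpus_allowed_ptr() are intended to be used as a pair by architecture code, e.g. around exec of a task that can only run on a subset of CPUs. A hedged sketch of such a caller, with hypothetical names (the caller must also serialise the two calls, as the comment above notes):

/* Hypothetical arch hook, invoked once the new exec image's constraints are known. */
static void arch_adjust_affinity_on_exec(struct task_struct *p, bool narrow)
{
	if (narrow)
		/* Clamp affinity to task_cpu_possible_mask(p), saving the user's mask. */
		force_compatible_cpus_allowed_ptr(p);
	else
		/* Restore the saved user mask, if any, and drop user_cpus_ptr. */
		relax_compatible_cpus_allowed_ptr(p);
}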
void set_task_cpu(struct task_struct *p, unsigned int new_cpu) void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{ {
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
...@@ -3112,9 +3316,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) ...@@ -3112,9 +3316,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */ /* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) { for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu)) if (is_cpu_allowed(p, dest_cpu))
continue;
if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
return dest_cpu; return dest_cpu;
} }
} }
...@@ -3131,8 +3333,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) ...@@ -3131,8 +3333,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* No more Mr. Nice Guy. */ /* No more Mr. Nice Guy. */
switch (state) { switch (state) {
case cpuset: case cpuset:
if (IS_ENABLED(CONFIG_CPUSETS)) { if (cpuset_cpus_allowed_fallback(p)) {
cpuset_cpus_allowed_fallback(p);
state = possible; state = possible;
break; break;
} }
...@@ -3144,10 +3345,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) ...@@ -3144,10 +3345,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
* *
* More yuck to audit. * More yuck to audit.
*/ */
do_set_cpus_allowed(p, cpu_possible_mask); do_set_cpus_allowed(p, task_cpu_possible_mask(p));
state = fail; state = fail;
break; break;
case fail: case fail:
BUG(); BUG();
break; break;
...@@ -5660,11 +5860,9 @@ static bool try_steal_cookie(int this, int that) ...@@ -5660,11 +5860,9 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation) if (p->core_occupation > dst->idle->core_occupation)
goto next; goto next;
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src, p, 0); deactivate_task(src, p, 0);
set_task_cpu(p, this); set_task_cpu(p, this);
activate_task(dst, p, 0); activate_task(dst, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
resched_curr(dst); resched_curr(dst);
...@@ -7300,6 +7498,16 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a ...@@ -7300,6 +7498,16 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
return -E2BIG; return -E2BIG;
} }
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
if (task_has_dl_policy(p))
__getparam_dl(p, attr);
else if (task_has_rt_policy(p))
attr->sched_priority = p->rt_priority;
else
attr->sched_nice = task_nice(p);
}
/** /**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority * sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question. * @pid: the pid in question.
...@@ -7361,6 +7569,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ...@@ -7361,6 +7569,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
rcu_read_unlock(); rcu_read_unlock();
if (likely(p)) { if (likely(p)) {
if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
get_params(p, &attr);
retval = sched_setattr(p, &attr); retval = sched_setattr(p, &attr);
put_task_struct(p); put_task_struct(p);
} }
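The SCHED_FLAG_KEEP_PARAMS handling above exists for callers that only want to touch a subset of attributes. A hedged userspace sketch (the wrapper name is made up; the raw syscall is used because glibc provides no sched_setattr() wrapper):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>	/* SCHED_FLAG_* */
#include <linux/sched/types.h>	/* struct sched_attr */

/* Update only the utilisation clamps of @pid, keeping policy and parameters. */
static int set_uclamp_only(pid_t pid, unsigned int umin, unsigned int umax)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_flags	= SCHED_FLAG_KEEP_ALL |
				  SCHED_FLAG_UTIL_CLAMP_MIN |
				  SCHED_FLAG_UTIL_CLAMP_MAX,
		.sched_util_min	= umin,
		.sched_util_max	= umax,
	};

	return syscall(SYS_sched_setattr, pid, &attr, 0);
}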
...@@ -7509,12 +7719,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ...@@ -7509,12 +7719,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy; kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork) if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
if (task_has_dl_policy(p)) get_params(p, &kattr);
__getparam_dl(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL;
else if (task_has_rt_policy(p))
kattr.sched_priority = p->rt_priority;
else
kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK
/* /*
...@@ -7535,9 +7741,76 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ...@@ -7535,9 +7741,76 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
return retval; return retval;
} }
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) #ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{ {
int ret = 0;
/*
* If the task isn't a deadline task or admission control is
* disabled then we don't care about affinity changes.
*/
if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
return 0;
/*
* Since bandwidth control happens on root_domain basis,
* if admission test is enabled, we only admit -deadline
* tasks allowed to run on all the CPUs in the task's
* root_domain.
*/
rcu_read_lock();
if (!cpumask_subset(task_rq(p)->rd->span, mask))
ret = -EBUSY;
rcu_read_unlock();
return ret;
}
#endif
static int
__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
{
int retval;
cpumask_var_t cpus_allowed, new_mask; cpumask_var_t cpus_allowed, new_mask;
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
return -ENOMEM;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_free_cpus_allowed;
}
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, mask, cpus_allowed);
retval = dl_task_check_affinity(p, new_mask);
if (retval)
goto out_free_new_mask;
again:
retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
if (retval)
goto out_free_new_mask;
cpuset_cpus_allowed(p, cpus_allowed);
if (!cpumask_subset(new_mask, cpus_allowed)) {
/*
* We must have raced with a concurrent cpuset update.
* Just reset the cpumask to the cpuset's cpus_allowed.
*/
cpumask_copy(new_mask, cpus_allowed);
goto again;
}
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
return retval;
}
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
struct task_struct *p; struct task_struct *p;
int retval; int retval;
...@@ -7557,68 +7830,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ...@@ -7557,68 +7830,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = -EINVAL; retval = -EINVAL;
goto out_put_task; goto out_put_task;
} }
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_put_task;
}
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_free_cpus_allowed;
}
retval = -EPERM;
if (!check_same_owner(p)) { if (!check_same_owner(p)) {
rcu_read_lock(); rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock(); rcu_read_unlock();
goto out_free_new_mask; retval = -EPERM;
goto out_put_task;
} }
rcu_read_unlock(); rcu_read_unlock();
} }
retval = security_task_setscheduler(p); retval = security_task_setscheduler(p);
if (retval) if (retval)
goto out_free_new_mask; goto out_put_task;
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, in_mask, cpus_allowed);
/*
* Since bandwidth control happens on root_domain basis,
* if admission test is enabled, we only admit -deadline
* tasks allowed to run on all the CPUs in the task's
* root_domain.
*/
#ifdef CONFIG_SMP
if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
rcu_read_lock();
if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
retval = -EBUSY;
rcu_read_unlock();
goto out_free_new_mask;
}
rcu_read_unlock();
}
#endif
again:
retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
if (!retval) { retval = __sched_setaffinity(p, in_mask);
cpuset_cpus_allowed(p, cpus_allowed);
if (!cpumask_subset(new_mask, cpus_allowed)) {
/*
* We must have raced with a concurrent cpuset
* update. Just reset the cpus_allowed to the
* cpuset's cpus_allowed
*/
cpumask_copy(new_mask, cpus_allowed);
goto again;
}
}
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task: out_put_task:
put_task_struct(p); put_task_struct(p);
return retval; return retval;
...@@ -9804,7 +10031,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, ...@@ -9804,7 +10031,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
* Prevent race between setting of cfs_rq->runtime_enabled and * Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs(). * unthrottle_offline_cfs_rqs().
*/ */
get_online_cpus(); cpus_read_lock();
mutex_lock(&cfs_constraints_mutex); mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota); ret = __cfs_schedulable(tg, period, quota);
if (ret) if (ret)
...@@ -9848,7 +10075,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, ...@@ -9848,7 +10075,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
cfs_bandwidth_usage_dec(); cfs_bandwidth_usage_dec();
out_unlock: out_unlock:
mutex_unlock(&cfs_constraints_mutex); mutex_unlock(&cfs_constraints_mutex);
put_online_cpus(); cpus_read_unlock();
return ret; return ret;
} }
...@@ -10099,6 +10326,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, ...@@ -10099,6 +10326,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
} }
#endif /* CONFIG_RT_GROUP_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return css_tg(css)->idle;
}
static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 idle)
{
return sched_group_set_idle(css_tg(css), idle);
}
#endif
static struct cftype cpu_legacy_files[] = { static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
{ {
...@@ -10106,6 +10347,11 @@ static struct cftype cpu_legacy_files[] = { ...@@ -10106,6 +10347,11 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_shares_read_u64, .read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64, .write_u64 = cpu_shares_write_u64,
}, },
{
.name = "idle",
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
#endif #endif
#ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH
{ {
...@@ -10313,6 +10559,12 @@ static struct cftype cpu_files[] = { ...@@ -10313,6 +10559,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_weight_nice_read_s64, .read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64, .write_s64 = cpu_weight_nice_write_s64,
}, },
{
.name = "idle",
.flags = CFTYPE_NOT_ON_ROOT,
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
#endif #endif
#ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH
{ {
......
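The new cpu.idle control file round-trips through sched_group_set_idle(), which appears later in the series; a hedged userspace sketch of flipping it (the cgroup directory is an assumption about a typical cgroup v2 mount):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

/* Mark the group as SCHED_IDLE by writing "1" to its cpu.idle file. */
static int mark_group_idle(const char *cgroup_dir /* hypothetical path */)
{
	char path[256];
	int fd, ret;

	snprintf(path, sizeof(path), "%s/cpu.idle", cgroup_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = (write(fd, "1", 1) == 1) ? 0 : -1;
	close(fd);
	return ret;
}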
...@@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused ...@@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
*/ */
raw_spin_rq_lock(rq); raw_spin_rq_lock(rq);
if (p->dl.dl_non_contending) { if (p->dl.dl_non_contending) {
update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl); sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0; p->dl.dl_non_contending = 0;
/* /*
...@@ -2741,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) ...@@ -2741,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags; dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
} }
...@@ -2754,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) ...@@ -2754,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_runtime = dl_se->dl_runtime; attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline; attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period; attr->sched_period = dl_se->dl_period;
attr->sched_flags = dl_se->flags; attr->sched_flags &= ~SCHED_DL_FLAGS;
attr->sched_flags |= dl_se->flags;
} }
/* /*
...@@ -2851,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) ...@@ -2851,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
if (dl_se->dl_runtime != attr->sched_runtime || if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period || dl_se->dl_period != attr->sched_period ||
dl_se->flags != attr->sched_flags) dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true; return true;
return false; return false;
......
...@@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void) ...@@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void)
{ {
int cpu, i; int cpu, i;
/*
* This can unfortunately be invoked before sched_debug_init() creates
* the debug directory. Don't touch sd_sysctl_cpus until then.
*/
if (!debugfs_sched)
return;
if (!cpumask_available(sd_sysctl_cpus)) { if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return; return;
...@@ -600,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) ...@@ -600,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over); cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
cfs_rq->idle_h_nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg", SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
......
...@@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) ...@@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
} }
} }
static int tg_is_idle(struct task_group *tg)
{
return tg->idle > 0;
}
static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
return cfs_rq->idle > 0;
}
static int se_is_idle(struct sched_entity *se)
{
if (entity_is_task(se))
return task_has_idle_policy(task_of(se));
return cfs_rq_is_idle(group_cfs_rq(se));
}
#else /* !CONFIG_FAIR_GROUP_SCHED */ #else /* !CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \ #define for_each_sched_entity(se) \
...@@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) ...@@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{ {
} }
static int tg_is_idle(struct task_group *tg)
{
return 0;
}
static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
return 0;
}
static int se_is_idle(struct sched_entity *se)
{
return 0;
}
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline static __always_inline
...@@ -1486,7 +1518,7 @@ static inline bool is_core_idle(int cpu) ...@@ -1486,7 +1518,7 @@ static inline bool is_core_idle(int cpu)
if (cpu == sibling) if (cpu == sibling)
continue; continue;
if (!idle_cpu(cpu)) if (!idle_cpu(sibling))
return false; return false;
} }
#endif #endif
...@@ -4841,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) ...@@ -4841,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
qcfs_rq->h_nr_running -= task_delta; qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta;
...@@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) ...@@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
update_load_avg(qcfs_rq, se, 0); update_load_avg(qcfs_rq, se, 0);
se_update_runnable(se); se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
qcfs_rq->h_nr_running -= task_delta; qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta;
} }
...@@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) ...@@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running; task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) { for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
if (se->on_rq) if (se->on_rq)
break; break;
cfs_rq = cfs_rq_of(se); enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
cfs_rq->h_nr_running += task_delta; qcfs_rq->h_nr_running += task_delta;
cfs_rq->idle_h_nr_running += idle_task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq)) if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle; goto unthrottle_throttle;
} }
for_each_sched_entity(se) { for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se); struct cfs_rq *qcfs_rq = cfs_rq_of(se);
update_load_avg(cfs_rq, se, UPDATE_TG); update_load_avg(qcfs_rq, se, UPDATE_TG);
se_update_runnable(se); se_update_runnable(se);
cfs_rq->h_nr_running += task_delta; if (cfs_rq_is_idle(group_cfs_rq(se)))
cfs_rq->idle_h_nr_running += idle_task_delta; idle_task_delta = cfs_rq->h_nr_running;
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq)) if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle; goto unthrottle_throttle;
/* /*
* One parent has been throttled and cfs_rq removed from the * One parent has been throttled and cfs_rq removed from the
* list. Add it back to not break the leaf list. * list. Add it back to not break the leaf list.
*/ */
if (throttled_hierarchy(cfs_rq)) if (throttled_hierarchy(qcfs_rq))
list_add_leaf_cfs_rq(cfs_rq); list_add_leaf_cfs_rq(qcfs_rq);
} }
/* At this point se is NULL and we are at root level*/ /* At this point se is NULL and we are at root level*/
...@@ -4949,9 +4993,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) ...@@ -4949,9 +4993,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
* assertion below. * assertion below.
*/ */
for_each_sched_entity(se) { for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se); struct cfs_rq *qcfs_rq = cfs_rq_of(se);
if (list_add_leaf_cfs_rq(cfs_rq)) if (list_add_leaf_cfs_rq(qcfs_rq))
break; break;
} }
...@@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++; cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running; cfs_rq->idle_h_nr_running += idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
/* end evaluation on encountering a throttled cfs_rq */ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq)) if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle; goto enqueue_throttle;
...@@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++; cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running; cfs_rq->idle_h_nr_running += idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
/* end evaluation on encountering a throttled cfs_rq */ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq)) if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle; goto enqueue_throttle;
...@@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--; cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
/* end evaluation on encountering a throttled cfs_rq */ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq)) if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle; goto dequeue_throttle;
...@@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--; cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
/* end evaluation on encountering a throttled cfs_rq */ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq)) if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle; goto dequeue_throttle;
...@@ -6249,7 +6305,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool ...@@ -6249,7 +6305,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
time = cpu_clock(this); time = cpu_clock(this);
} }
for_each_cpu_wrap(cpu, cpus, target) { for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) { if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu); i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits) if ((unsigned int)i < nr_cpumask_bits)
...@@ -6376,6 +6432,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) ...@@ -6376,6 +6432,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/* Check a recently used CPU as a potential idle candidate: */ /* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu; recent_used_cpu = p->recent_used_cpu;
p->recent_used_cpu = prev;
if (recent_used_cpu != prev && if (recent_used_cpu != prev &&
recent_used_cpu != target && recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) && cpus_share_cache(recent_used_cpu, target) &&
...@@ -6902,9 +6959,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) ...@@ -6902,9 +6959,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
} else if (wake_flags & WF_TTWU) { /* XXX always ? */ } else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */ /* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
if (want_affine)
current->recent_used_cpu = cpu;
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -7041,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) ...@@ -7041,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se) static void set_last_buddy(struct sched_entity *se)
{ {
if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) { for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq)) if (SCHED_WARN_ON(!se->on_rq))
return; return;
if (se_is_idle(se))
return;
cfs_rq_of(se)->last = se; cfs_rq_of(se)->last = se;
} }
} }
static void set_next_buddy(struct sched_entity *se) static void set_next_buddy(struct sched_entity *se)
{ {
if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) { for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq)) if (SCHED_WARN_ON(!se->on_rq))
return; return;
if (se_is_idle(se))
return;
cfs_rq_of(se)->next = se; cfs_rq_of(se)->next = se;
} }
} }
...@@ -7079,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ ...@@ -7079,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency; int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0; int next_buddy_marked = 0;
int cse_is_idle, pse_is_idle;
if (unlikely(se == pse)) if (unlikely(se == pse))
return; return;
...@@ -7123,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ ...@@ -7123,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return; return;
find_matching_se(&se, &pse); find_matching_se(&se, &pse);
update_curr(cfs_rq_of(se));
BUG_ON(!pse); BUG_ON(!pse);
cse_is_idle = se_is_idle(se);
pse_is_idle = se_is_idle(pse);
/*
* Preempt an idle group in favor of a non-idle group (and don't preempt
* in the inverse case).
*/
if (cse_is_idle && !pse_is_idle)
goto preempt;
if (cse_is_idle != pse_is_idle)
return;
update_curr(cfs_rq_of(se));
if (wakeup_preempt_entity(se, pse) == 1) { if (wakeup_preempt_entity(se, pse) == 1) {
/* /*
* Bias pick_next to pick the sched entity that is * Bias pick_next to pick the sched entity that is
...@@ -10217,9 +10283,11 @@ static inline int on_null_domain(struct rq *rq) ...@@ -10217,9 +10283,11 @@ static inline int on_null_domain(struct rq *rq)
static inline int find_new_ilb(void) static inline int find_new_ilb(void)
{ {
int ilb; int ilb;
const struct cpumask *hk_mask;
hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
for_each_cpu_and(ilb, nohz.idle_cpus_mask, for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
housekeeping_cpumask(HK_FLAG_MISC)) {
if (ilb == smp_processor_id()) if (ilb == smp_processor_id())
continue; continue;
...@@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, ...@@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
static DEFINE_MUTEX(shares_mutex); static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares) static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{ {
int i; int i;
lockdep_assert_held(&shares_mutex);
/* /*
* We can't change the weight of the root cgroup. * We can't change the weight of the root cgroup.
*/ */
...@@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) ...@@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
mutex_lock(&shares_mutex);
if (tg->shares == shares) if (tg->shares == shares)
goto done; return 0;
tg->shares = shares; tg->shares = shares;
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
...@@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) ...@@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf);
} }
done: return 0;
}
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int ret;
mutex_lock(&shares_mutex);
if (tg_is_idle(tg))
ret = -EINVAL;
else
ret = __sched_group_set_shares(tg, shares);
mutex_unlock(&shares_mutex);
return ret;
}
int sched_group_set_idle(struct task_group *tg, long idle)
{
int i;
if (tg == &root_task_group)
return -EINVAL;
if (idle < 0 || idle > 1)
return -EINVAL;
mutex_lock(&shares_mutex);
if (tg->idle == idle) {
mutex_unlock(&shares_mutex);
return 0;
}
tg->idle = idle;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
grp_cfs_rq->idle = idle;
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;
idle_task_delta = grp_cfs_rq->h_nr_running -
grp_cfs_rq->idle_h_nr_running;
if (!cfs_rq_is_idle(grp_cfs_rq))
idle_task_delta *= -1;
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (!se->on_rq)
break;
cfs_rq->idle_h_nr_running += idle_task_delta;
/* Already accounted at parent level and above. */
if (cfs_rq_is_idle(cfs_rq))
break;
}
next_cpu:
rq_unlock_irqrestore(rq, &rf);
}
/* Idle groups have minimum weight. */
if (tg_is_idle(tg))
__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
else
__sched_group_set_shares(tg, NICE_0_LOAD);
mutex_unlock(&shares_mutex); mutex_unlock(&shares_mutex);
return 0; return 0;
} }
#else /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { } void free_fair_sched_group(struct task_group *tg) { }
......
...@@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample) ...@@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample)
*/ */
#define SCHED_FLAG_SUGOV 0x10000000 #define SCHED_FLAG_SUGOV 0x10000000
#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
{ {
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
...@@ -394,6 +396,9 @@ struct task_group { ...@@ -394,6 +396,9 @@ struct task_group {
struct cfs_rq **cfs_rq; struct cfs_rq **cfs_rq;
unsigned long shares; unsigned long shares;
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* /*
* load_avg can be heavily contended at clock tick time, so put * load_avg can be heavily contended at clock tick time, so put
...@@ -503,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk); ...@@ -503,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se, extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next); struct cfs_rq *prev, struct cfs_rq *next);
...@@ -599,6 +606,9 @@ struct cfs_rq { ...@@ -599,6 +606,9 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list; struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */ struct task_group *tg; /* group that "owns" this runqueue */
/* Locally cached copy of our task_group's idle value */
int idle;
#ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled; int runtime_enabled;
s64 runtime_remaining; s64 runtime_remaining;
...@@ -2234,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); ...@@ -2234,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);
#define SCA_CHECK 0x01 #define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02 #define SCA_MIGRATE_DISABLE 0x02
#define SCA_MIGRATE_ENABLE 0x04 #define SCA_MIGRATE_ENABLE 0x04
#define SCA_USER 0x08
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
...@@ -2385,6 +2396,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); ...@@ -2385,6 +2396,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost; extern const_debug unsigned int sysctl_sched_migration_cost;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
extern unsigned int sysctl_sched_tunable_scaling;
extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
#endif
#ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SCHED_HRTICK
/* /*
......
...@@ -1482,6 +1482,8 @@ int sched_max_numa_distance; ...@@ -1482,6 +1482,8 @@ int sched_max_numa_distance;
static int *sched_domains_numa_distance; static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks; static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif #endif
/* /*
...@@ -1833,6 +1835,16 @@ void sched_init_numa(void) ...@@ -1833,6 +1835,16 @@ void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask; sched_domains_numa_masks[i][j] = mask;
for_each_node(k) { for_each_node(k) {
/*
* Distance information can be unreliable for
* offline nodes, defer building the node
* masks to its bringup.
* This relies on all unique distance values
* still being visible at init time.
*/
if (!node_online(j))
continue;
if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric"); sched_numa_warn("Node-distance not symmetric");
...@@ -1886,6 +1898,53 @@ void sched_init_numa(void) ...@@ -1886,6 +1898,53 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type(); init_numa_topology_type();
sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
if (!sched_numa_onlined_nodes)
return;
bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
for_each_online_node(i)
bitmap_set(sched_numa_onlined_nodes, i, 1);
}
static void __sched_domains_numa_masks_set(unsigned int node)
{
int i, j;
/*
* NUMA masks are not built for offline nodes in sched_init_numa().
* Thus, when a CPU of a never-onlined-before node gets plugged in,
* adding that new CPU to the right NUMA masks is not sufficient: the
* masks of that CPU's node must also be updated.
*/
if (test_bit(node, sched_numa_onlined_nodes))
return;
bitmap_set(sched_numa_onlined_nodes, node, 1);
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
if (!node_online(j) || node == j)
continue;
if (node_distance(j, node) > sched_domains_numa_distance[i])
continue;
/* Add remote nodes in our masks */
cpumask_or(sched_domains_numa_masks[i][node],
sched_domains_numa_masks[i][node],
sched_domains_numa_masks[0][j]);
}
}
/*
* A new node has been brought up, potentially changing the topology
* classification.
*
* Note that this is racy vs any use of sched_numa_topology_type :/
*/
init_numa_topology_type();
} }
void sched_domains_numa_masks_set(unsigned int cpu) void sched_domains_numa_masks_set(unsigned int cpu)
...@@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu) ...@@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)
int node = cpu_to_node(cpu); int node = cpu_to_node(cpu);
int i, j; int i, j;
__sched_domains_numa_masks_set(node);
for (i = 0; i < sched_domains_numa_levels; i++) { for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) { for (j = 0; j < nr_node_ids; j++) {
if (!node_online(j))
continue;
/* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i]) if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
} }
......