Commit 25834c73 authored by Peter Zijlstra's avatar Peter Zijlstra Committed by Ingo Molnar

sched: Fix a race between __kthread_bind() and sched_setaffinity()

Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it:

	__kthread_bind()
	  do_set_cpus_allowed()
						<SYSCALL>
						  sched_setaffinity()
						    if (p->flags & PF_NO_SETAFFINITIY)
						    set_cpus_allowed_ptr()
	  p->flags |= PF_NO_SETAFFINITY

Fix the bug by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed.
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: default avatarTejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dedekind1@gmail.com
Cc: juri.lelli@arm.com
Cc: mgorman@suse.de
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.orgSigned-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 7855a35a
...@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), ...@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
}) })
void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind(struct task_struct *k, unsigned int cpu);
void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
int kthread_stop(struct task_struct *k); int kthread_stop(struct task_struct *k);
bool kthread_should_stop(void); bool kthread_should_stop(void);
bool kthread_should_park(void); bool kthread_should_park(void);
......
...@@ -2203,13 +2203,6 @@ static inline void calc_load_enter_idle(void) { } ...@@ -2203,13 +2203,6 @@ static inline void calc_load_enter_idle(void) { }
static inline void calc_load_exit_idle(void) { } static inline void calc_load_exit_idle(void) { }
#endif /* CONFIG_NO_HZ_COMMON */ #endif /* CONFIG_NO_HZ_COMMON */
#ifndef CONFIG_CPUMASK_OFFSTACK
static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
return set_cpus_allowed_ptr(p, &new_mask);
}
#endif
/* /*
* Do not use outside of architecture code which knows its limitations. * Do not use outside of architecture code which knows its limitations.
* *
......
...@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), ...@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
} }
EXPORT_SYMBOL(kthread_create_on_node); EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{ {
/* Must have done schedule() in kthread() before we set_task_cpu */ unsigned long flags;
if (!wait_task_inactive(p, state)) { if (!wait_task_inactive(p, state)) {
WARN_ON(1); WARN_ON(1);
return; return;
} }
/* It's safe because the task is inactive. */ /* It's safe because the task is inactive. */
do_set_cpus_allowed(p, cpumask_of(cpu)); raw_spin_lock_irqsave(&p->pi_lock, flags);
do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY; p->flags |= PF_NO_SETAFFINITY;
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
__kthread_bind_mask(p, cpumask_of(cpu), state);
}
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
} }
/** /**
......
...@@ -1153,6 +1153,8 @@ static int migration_cpu_stop(void *data) ...@@ -1153,6 +1153,8 @@ static int migration_cpu_stop(void *data)
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{ {
lockdep_assert_held(&p->pi_lock);
if (p->sched_class->set_cpus_allowed) if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask); p->sched_class->set_cpus_allowed(p, new_mask);
...@@ -1169,7 +1171,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ...@@ -1169,7 +1171,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* task must not exit() & deallocate itself prematurely. The * task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held. * call is not atomic; no spinlocks may be held.
*/ */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check)
{ {
unsigned long flags; unsigned long flags;
struct rq *rq; struct rq *rq;
...@@ -1178,6 +1181,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ...@@ -1178,6 +1181,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
/*
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
if (check && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
if (cpumask_equal(&p->cpus_allowed, new_mask)) if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out; goto out;
...@@ -1214,6 +1226,11 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ...@@ -1214,6 +1226,11 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
return ret; return ret;
} }
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
return __set_cpus_allowed_ptr(p, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
void set_task_cpu(struct task_struct *p, unsigned int new_cpu) void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
...@@ -1595,6 +1612,15 @@ static void update_avg(u64 *avg, u64 sample) ...@@ -1595,6 +1612,15 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg; s64 diff = sample - *avg;
*avg += diff >> 3; *avg += diff >> 3;
} }
#else
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check)
{
return set_cpus_allowed_ptr(p, new_mask);
}
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
static void static void
...@@ -4340,7 +4366,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ...@@ -4340,7 +4366,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
} }
#endif #endif
again: again:
retval = set_cpus_allowed_ptr(p, new_mask); retval = __set_cpus_allowed_ptr(p, new_mask, true);
if (!retval) { if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed); cpuset_cpus_allowed(p, cpus_allowed);
...@@ -4865,7 +4891,8 @@ void init_idle(struct task_struct *idle, int cpu) ...@@ -4865,7 +4891,8 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
unsigned long flags; unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags); raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
__sched_fork(0, idle); __sched_fork(0, idle);
idle->state = TASK_RUNNING; idle->state = TASK_RUNNING;
...@@ -4891,7 +4918,8 @@ void init_idle(struct task_struct *idle, int cpu) ...@@ -4891,7 +4918,8 @@ void init_idle(struct task_struct *idle, int cpu)
#if defined(CONFIG_SMP) #if defined(CONFIG_SMP)
idle->on_cpu = 1; idle->on_cpu = 1;
#endif #endif
raw_spin_unlock_irqrestore(&rq->lock, flags); raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */ /* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu); init_idle_preempt_count(idle, cpu);
......
...@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool) ...@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool)
goto fail; goto fail;
set_user_nice(worker->task, pool->attrs->nice); set_user_nice(worker->task, pool->attrs->nice);
kthread_bind_mask(worker->task, pool->attrs->cpumask);
/* prevent userland from meddling with cpumask of workqueue workers */
worker->task->flags |= PF_NO_SETAFFINITY;
/* successful, attach the worker to the pool */ /* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool); worker_attach_to_pool(worker, pool);
...@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, ...@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
} }
wq->rescuer = rescuer; wq->rescuer = rescuer;
rescuer->task->flags |= PF_NO_SETAFFINITY; kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task); wake_up_process(rescuer->task);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment