Commit fabf318e authored by Peter Zijlstra, committed by Thomas Gleixner

sched: Fix fork vs hotplug vs cpuset namespaces

There are a number of issues:

1) TASK_WAKING vs cgroup_clone (cpusets)

copy_process():

  sched_fork()
    child->state = TASK_WAKING; /* waiting for wake_up_new_task() */
  if (current->nsproxy != p->nsproxy)
     ns_cgroup_clone()
       cgroup_clone()
         mutex_lock(inode->i_mutex)
         mutex_lock(cgroup_mutex)
         cgroup_attach_task()
	   ss->can_attach()
           ss->attach() [ -> cpuset_attach() ]
             cpuset_attach_task()
               set_cpus_allowed_ptr();
                 while (child->state == TASK_WAKING)
                   cpu_relax();
This will deadlock the system.
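
For reference, a minimal single-threaded C sketch of that ordering, outside the kernel; every name below is an illustrative stand-in for the real symbol:

#include <stdio.h>

enum task_state { TASK_RUNNING, TASK_WAKING };

struct task { enum task_state state; };

/* Stand-in for set_cpus_allowed_ptr(): waits for the child to leave TASK_WAKING. */
static void set_cpus_allowed_sketch(struct task *child)
{
	/* Bounded only so the sketch terminates; the kernel's loop has no bound. */
	for (long spins = 0; spins < 1000000; spins++)
		if (child->state != TASK_WAKING)
			return;
	printf("child is still TASK_WAKING; in the kernel this spin never ends\n");
}

/* Stand-in for wake_up_new_task(): the only thing that clears TASK_WAKING. */
static void wake_up_new_task_sketch(struct task *child)
{
	child->state = TASK_RUNNING;
}

int main(void)
{
	struct task child = { .state = TASK_WAKING };	/* sched_fork() */

	/*
	 * copy_process() -> ns_cgroup_clone() -> ... -> cpuset_attach_task()
	 * runs in the parent *before* copy_process() returns:
	 */
	set_cpus_allowed_sketch(&child);

	/*
	 * The parent only calls wake_up_new_task() after copy_process()
	 * returns, so in the kernel this point is never reached.
	 */
	wake_up_new_task_sketch(&child);
	return 0;
}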


2) cgroup_clone (cpusets) vs copy_process

Even if the above worked, we would still have:

copy_process():

  if (current->nsproxy != p->nsproxy)
     ns_cgroup_clone()
       cgroup_clone()
         mutex_lock(inode->i_mutex)
         mutex_lock(cgroup_mutex)
         cgroup_attach_task()
	   ss->can_attach()
           ss->attach() [ -> cpuset_attach() ]
             cpuset_attach_task()
               set_cpus_allowed_ptr();
  ...

  p->cpus_allowed = current->cpus_allowed

overwriting the cpuset-modified cpus_allowed.
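
A small sketch of that ordering with made-up masks (illustrative toy C, not the kernel's types):

#include <stdio.h>

/* Toy task with only the field that matters here; the mask values are made up. */
struct task { unsigned long cpus_allowed; };

int main(void)
{
	struct task parent = { .cpus_allowed = 0xf };	/* cpus 0-3 */
	struct task child  = parent;			/* dup_task_struct() copy */

	/* cgroup_clone() -> cpuset_attach_task() restricts the child ... */
	child.cpus_allowed = 0x1;			/* cpuset: cpu 0 only */

	/* ... then copy_process() re-copies the parent's mask over it: */
	child.cpus_allowed = parent.cpus_allowed;

	printf("child mask: %#lx (the cpuset's 0x1 was lost)\n",
	       child.cpus_allowed);
	return 0;
}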


3) fork() vs hotplug

  If we unplug the child's cpu after the sanity check, once the child
  has been attached to the task list but before wake_up_new_task() runs,
  the child ends up queued on an offline cpu.
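
That window, as a timeline in toy C (all names and values are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Toy stand-ins: a per-cpu online flag and a task that remembers its cpu. */
static bool cpu_online[NR_CPUS] = { true, true, true, true };

struct task { int cpu; };

int main(void)
{
	struct task child = { .cpu = 2 };	/* picked in sched_fork() */

	/* copy_process(): the sanity check passes, cpu 2 is still online */
	if (!cpu_online[child.cpu])
		child.cpu = 0;			/* fallback would happen here */

	/* child is now on the task list; hotplug then removes cpu 2 */
	cpu_online[2] = false;

	/* wake_up_new_task(): before this patch nothing re-validates the cpu */
	printf("waking child on cpu %d, online=%d\n",
	       child.cpu, (int)cpu_online[child.cpu]);
	return 0;
}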

Solve all these issues by moving fork cpu selection into
wake_up_new_task().
Reported-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1264106190.4283.1314.camel@laptop>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parent 6d558c3a
@@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
-	/*
-	 * The task hasn't been attached yet, so its cpus_allowed mask will
-	 * not be changed, nor will its assigned CPU.
-	 *
-	 * The cpus_allowed mask of the parent may have changed after it was
-	 * copied first time - so re-copy it here, then check the child's CPU
-	 * to ensure it is on a valid CPU (and if not, just force it back to
-	 * parent's CPU). This avoids alot of nasty races.
-	 */
-	p->cpus_allowed = current->cpus_allowed;
-	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
-	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
-			!cpu_online(task_cpu(p))))
-		set_task_cpu(p, smp_processor_id());
-
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
 		p->real_parent = current->real_parent;
@@ -2320,14 +2320,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * Called from:
- *
- *  - fork, @p is stable because it isn't on the tasklist yet
- *
- *  - exec, @p is unstable, retry loop
+ * Gets called from 3 sites (exec, fork, wakeup), since it is called without
+ * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
+ * by:
  *
- *  - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
- *    we should be good.
+ *  exec:           is unstable, retry loop
+ *  fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
  */
 static inline
 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2620,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	if (p->sched_class->task_fork)
 		p->sched_class->task_fork(p);
 
-#ifdef CONFIG_SMP
-	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
 	set_task_cpu(p, cpu);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,6 +2647,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
 	unsigned long flags;
 	struct rq *rq;
+	int cpu = get_cpu();
 
+#ifdef CONFIG_SMP
+	/*
+	 * Fork balancing, do it here and not earlier because:
+	 *  - cpus_allowed can change in the fork path
+	 *  - any previously selected cpu might disappear through hotplug
+	 *
+	 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
+	 * ->cpus_allowed is stable, we have preemption disabled, meaning
+	 * cpu_online_mask is stable.
+	 */
+	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+	set_task_cpu(p, cpu);
+#endif
+
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_WAKING);
@@ -2665,6 +2675,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_woken(rq, p);
 #endif
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -7139,14 +7150,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	 * the ->cpus_allowed mask from under waking tasks, which would be
 	 * possible when we change rq->lock in ttwu(), so synchronize against
 	 * TASK_WAKING to avoid that.
+	 *
+	 * Make an exception for freshly cloned tasks, since cpuset namespaces
+	 * might move the task about, we have to validate the target in
+	 * wake_up_new_task() anyway since the cpu might have gone away.
 	 */
 again:
-	while (p->state == TASK_WAKING)
+	while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
 		cpu_relax();
 
 	rq = task_rq_lock(p, &flags);
 
-	if (p->state == TASK_WAKING) {
+	if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
 		task_rq_unlock(rq, &flags);
 		goto again;
 	}