Commit 43768f7c authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'sched-urgent-2020-07-19' of...

Merge tag 'sched-urgent-2020-07-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into master

Pull scheduler fixes from Thomas Gleixner:
 "A set of scheduler fixes:

   - Plug a load average accounting race which was introduced with a
     recent optimization causing load average to show bogus numbers.

   - Fix the rseq CPU id initialization for new tasks. sched_fork() does
     not update the rseq CPU id so the id is the stale id of the parent
     task, which can cause user space data corruption.

   - Handle a 0 return value of task_h_load() correctly in the load
     balancer, which does not decrease imbalance and therefore pulls
     until the maximum number of loops is reached, which might be all
     tasks just created by a fork bomb"

* tag 'sched-urgent-2020-07-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: handle case of task_h_load() returning 0
  sched: Fix unreliable rseq cpu_id for new tasks
  sched: Fix loadavg accounting race
parents 9413cd77 01cfcde9
...@@ -114,10 +114,6 @@ struct task_group; ...@@ -114,10 +114,6 @@ struct task_group;
#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
(task->flags & PF_FROZEN) == 0 && \
(task->state & TASK_NOLOAD) == 0)
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/* /*
......
...@@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ...@@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
void activate_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags)
{ {
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags); enqueue_task(rq, p, flags);
p->on_rq = TASK_ON_RQ_QUEUED; p->on_rq = TASK_ON_RQ_QUEUED;
...@@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) ...@@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{ {
p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
dequeue_task(rq, p, flags); dequeue_task(rq, p, flags);
} }
...@@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ...@@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
lockdep_assert_held(&rq->lock); lockdep_assert_held(&rq->lock);
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load) if (p->sched_contributes_to_load)
rq->nr_uninterruptible--; rq->nr_uninterruptible--;
#ifdef CONFIG_SMP
if (wake_flags & WF_MIGRATED) if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED; en_flags |= ENQUEUE_MIGRATED;
#endif #endif
...@@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ...@@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* A similar smp_rmb() lives in try_invoke_on_locked_down_task(). * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
*/ */
smp_rmb(); smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags)) if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
goto unlock; goto unlock;
if (p->in_iowait) { if (p->in_iowait) {
...@@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ...@@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
} }
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
/* /*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0. * possible to, falsely, observe p->on_cpu == 0.
...@@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ...@@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* *
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock(). * __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
* schedule()'s deactivate_task() has 'happened' and p will no longer
* care about its own p->state. See the comment in __schedule().
*/ */
smp_rmb(); smp_acquire__after_ctrl_dep();
/*
* We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
* == 0), which means we need to do an enqueue, change p->state to
* TASK_WAKING such that we can unlock p->pi_lock before doing the
* enqueue, such as ttwu_queue_wakelist().
*/
p->state = TASK_WAKING;
/* /*
* If the owning (remote) CPU is still in the middle of schedule() with * If the owning (remote) CPU is still in the middle of schedule() with
...@@ -2962,6 +2965,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -2962,6 +2965,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU. * Silence PROVE_RCU.
*/ */
raw_spin_lock_irqsave(&p->pi_lock, flags); raw_spin_lock_irqsave(&p->pi_lock, flags);
rseq_migrate(p);
/* /*
* We're setting the CPU for the first time, we don't migrate, * We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu(). * so use __set_task_cpu().
...@@ -3026,6 +3030,7 @@ void wake_up_new_task(struct task_struct *p) ...@@ -3026,6 +3030,7 @@ void wake_up_new_task(struct task_struct *p)
* as we're not fully set-up yet. * as we're not fully set-up yet.
*/ */
p->recent_used_cpu = task_cpu(p); p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif #endif
rq = __task_rq_lock(p, &rf); rq = __task_rq_lock(p, &rf);
...@@ -4097,6 +4102,7 @@ static void __sched notrace __schedule(bool preempt) ...@@ -4097,6 +4102,7 @@ static void __sched notrace __schedule(bool preempt)
{ {
struct task_struct *prev, *next; struct task_struct *prev, *next;
unsigned long *switch_count; unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf; struct rq_flags rf;
struct rq *rq; struct rq *rq;
int cpu; int cpu;
...@@ -4113,12 +4119,22 @@ static void __sched notrace __schedule(bool preempt) ...@@ -4113,12 +4119,22 @@ static void __sched notrace __schedule(bool preempt)
local_irq_disable(); local_irq_disable();
rcu_note_context_switch(preempt); rcu_note_context_switch(preempt);
/* See deactivate_task() below. */
prev_state = prev->state;
/* /*
* Make sure that signal_pending_state()->signal_pending() below * Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up(). * done by the caller to avoid the race with signal_wake_up():
* *
* The membarrier system call requires a full memory barrier * __set_current_state(@state) signal_wake_up()
* schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
* wake_up_state(p, state)
* LOCK rq->lock LOCK p->pi_lock
* smp_mb__after_spinlock() smp_mb__after_spinlock()
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
* after coming from user-space, before storing to rq->curr. * after coming from user-space, before storing to rq->curr.
*/ */
rq_lock(rq, &rf); rq_lock(rq, &rf);
...@@ -4129,10 +4145,31 @@ static void __sched notrace __schedule(bool preempt) ...@@ -4129,10 +4145,31 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq); update_rq_clock(rq);
switch_count = &prev->nivcsw; switch_count = &prev->nivcsw;
if (!preempt && prev->state) { /*
if (signal_pending_state(prev->state, prev)) { * We must re-load prev->state in case ttwu_remote() changed it
* before we acquired rq->lock.
*/
if (!preempt && prev_state && prev_state == prev->state) {
if (signal_pending_state(prev_state, prev)) {
prev->state = TASK_RUNNING; prev->state = TASK_RUNNING;
} else { } else {
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev->flags & PF_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
/*
* __schedule() ttwu()
* prev_state = prev->state; if (READ_ONCE(p->on_rq) && ...)
* LOCK rq->lock goto out;
* smp_mb__after_spinlock(); smp_acquire__after_ctrl_dep();
* p->on_rq = 0; p->state = TASK_WAKING;
*
* After this, schedule() must not care about p->state any more.
*/
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) { if (prev->in_iowait) {
......
...@@ -4039,7 +4039,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) ...@@ -4039,7 +4039,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return; return;
} }
rq->misfit_task_load = task_h_load(p); /*
* Make sure that misfit_task_load will not be null even if
* task_h_load() returns 0.
*/
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
} }
#else /* CONFIG_SMP */ #else /* CONFIG_SMP */
...@@ -7638,7 +7642,14 @@ static int detach_tasks(struct lb_env *env) ...@@ -7638,7 +7642,14 @@ static int detach_tasks(struct lb_env *env)
switch (env->migration_type) { switch (env->migration_type) {
case migrate_load: case migrate_load:
load = task_h_load(p); /*
* Depending on the number of CPUs and tasks and the
* cgroup hierarchy, task_h_load() can return a null
* value. Make sure that env->imbalance decreases
* otherwise detach_tasks() will stop only after
* detaching up to loop_max tasks.
*/
load = max_t(unsigned long, task_h_load(p), 1);
if (sched_feat(LB_MIN) && if (sched_feat(LB_MIN) &&
load < 16 && !env->sd->nr_balance_failed) load < 16 && !env->sd->nr_balance_failed)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment