Commit e2defd02 authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Thiscontains misc fixes: preempt_schedule_common() and io_schedule()
  recursion fixes, sched/dl fixes, a completion_done() revert, two
  sched/rt fixes and a comment update patch"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/rt: Avoid obvious configuration fail
  sched/autogroup: Fix failure to set cpu.rt_runtime_us
  sched/dl: Do update_rq_clock() in yield_task_dl()
  sched: Prevent recursion in io_schedule()
  sched/completion: Serialize completion_done() with complete()
  sched: Fix preempt_schedule_common() triggering tracing recursion
  sched/dl: Prevent enqueue of a sleeping task in dl_task_timer()
  sched: Make dl_task_time() use task_rq_lock()
  sched: Clarify ordering between task_rq_lock() and move_queued_task()
parents b5aeca54 2636ed5f
@@ -363,9 +363,6 @@ extern void show_regs(struct pt_regs *);
  */
 extern void show_stack(struct task_struct *task, unsigned long *sp);
 
-void io_schedule(void);
-long io_schedule_timeout(long timeout);
-
 extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user);
@@ -422,6 +419,13 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 
+extern long io_schedule_timeout(long timeout);
+
+static inline void io_schedule(void)
+{
+        io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+}
+
 struct nsproxy;
 struct user_namespace;
...
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
          * so we don't have to move tasks around upon policy change,
          * or flail around trying to allocate bandwidth on the fly.
          * A bandwidth exception in __sched_setscheduler() allows
-         * the policy change to proceed. Thereafter, task_group()
-         * returns &root_task_group, so zero bandwidth is required.
+         * the policy change to proceed.
          */
         free_rt_sched_group(tg);
         tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
         if (tg != &root_task_group)
                 return false;
 
-        if (p->sched_class != &fair_sched_class)
-                return false;
-
         /*
          * We can only assume the task group can't go away on us if
          * autogroup_move_group() can see us on ->thread_group list.
...
@@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
          * first without taking the lock so we can
          * return early in the blocking case.
          */
-        if (!ACCESS_ONCE(x->done))
+        if (!READ_ONCE(x->done))
                 return 0;
 
         spin_lock_irqsave(&x->wait.lock, flags);
@@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
  */
 bool completion_done(struct completion *x)
 {
-        return !!ACCESS_ONCE(x->done);
+        if (!READ_ONCE(x->done))
+                return false;
+
+        /*
+         * If ->done, we need to wait for complete() to release ->wait.lock
+         * otherwise we can end up freeing the completion before complete()
+         * is done referencing it.
+         *
+         * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+         * the loads of ->done and ->wait.lock such that we cannot observe
+         * the lock before complete() acquires it while observing the ->done
+         * after it's acquired the lock.
+         */
+        smp_rmb();
+        spin_unlock_wait(&x->wait.lock);
+        return true;
 }
 EXPORT_SYMBOL(completion_done);
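
Note: the completion_done() change above is about object lifetime, not just the ACCESS_ONCE()/READ_ONCE() rename: a waiter must not report "done" (and then possibly free the completion) while complete() may still be inside ->wait.lock. Below is a minimal userspace sketch of the same rule; the names (struct waiter, wcomplete(), wdone()) are invented for illustration, C11 atomics stand in for READ_ONCE()/smp_rmb(), and spin_unlock_wait() is emulated by briefly taking and dropping the same pthread mutex.

/* build with: cc -std=c11 -pthread sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct waiter {
        pthread_mutex_t lock;
        atomic_int done;
};

/* like complete(): mark done while holding the lock */
static void wcomplete(struct waiter *w)
{
        pthread_mutex_lock(&w->lock);
        atomic_store_explicit(&w->done, 1, memory_order_release);
        pthread_mutex_unlock(&w->lock);
}

/*
 * like the fixed completion_done(): only report true once the completer
 * has also dropped the lock, so the caller may free the object afterwards
 */
static bool wdone(struct waiter *w)
{
        if (!atomic_load_explicit(&w->done, memory_order_acquire))
                return false;
        pthread_mutex_lock(&w->lock);   /* emulates spin_unlock_wait() */
        pthread_mutex_unlock(&w->lock);
        return true;
}

static void *completer(void *arg)
{
        wcomplete(arg);
        return NULL;
}

int main(void)
{
        struct waiter *w = malloc(sizeof(*w));
        pthread_t t;

        pthread_mutex_init(&w->lock, NULL);
        atomic_init(&w->done, 0);
        pthread_create(&t, NULL, completer, w);

        while (!wdone(w))
                ;                       /* poll until it is safe to free */

        pthread_mutex_destroy(&w->lock);
        free(w);                        /* safe: the completer is done with *w */
        pthread_join(t, NULL);
        return 0;
}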
@@ -306,66 +306,6 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-        __acquires(rq->lock)
-{
-        struct rq *rq;
-
-        lockdep_assert_held(&p->pi_lock);
-
-        for (;;) {
-                rq = task_rq(p);
-                raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                        return rq;
-                raw_spin_unlock(&rq->lock);
-
-                while (unlikely(task_on_rq_migrating(p)))
-                        cpu_relax();
-        }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
-        __acquires(p->pi_lock)
-        __acquires(rq->lock)
-{
-        struct rq *rq;
-
-        for (;;) {
-                raw_spin_lock_irqsave(&p->pi_lock, *flags);
-                rq = task_rq(p);
-                raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                        return rq;
-                raw_spin_unlock(&rq->lock);
-                raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-                while (unlikely(task_on_rq_migrating(p)))
-                        cpu_relax();
-        }
-}
-
-static void __task_rq_unlock(struct rq *rq)
-        __releases(rq->lock)
-{
-        raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
-        __releases(rq->lock)
-        __releases(p->pi_lock)
-{
-        raw_spin_unlock(&rq->lock);
-        raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
@@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
         preempt_disable();
 }
 
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
 {
         do {
                 __preempt_count_add(PREEMPT_ACTIVE);
@@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  */
-void __sched io_schedule(void)
-{
-        struct rq *rq = raw_rq();
-
-        delayacct_blkio_start();
-        atomic_inc(&rq->nr_iowait);
-        blk_flush_plug(current);
-        current->in_iowait = 1;
-        schedule();
-        current->in_iowait = 0;
-        atomic_dec(&rq->nr_iowait);
-        delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
 long __sched io_schedule_timeout(long timeout)
 {
-        struct rq *rq = raw_rq();
+        int old_iowait = current->in_iowait;
+        struct rq *rq;
         long ret;
 
+        current->in_iowait = 1;
+        if (old_iowait)
+                blk_schedule_flush_plug(current);
+        else
+                blk_flush_plug(current);
+
         delayacct_blkio_start();
+        rq = raw_rq();
         atomic_inc(&rq->nr_iowait);
-        blk_flush_plug(current);
-        current->in_iowait = 1;
         ret = schedule_timeout(timeout);
-        current->in_iowait = 0;
+        current->in_iowait = old_iowait;
         atomic_dec(&rq->nr_iowait);
         delayacct_blkio_end();
 
         return ret;
 }
 EXPORT_SYMBOL(io_schedule_timeout);
 
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
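
Note: the io_schedule_timeout() rewrite above is the recursion fix from the shortlog: flushing the block plug can itself need to wait for I/O, so the function now remembers the previous in_iowait value, takes the deferred flush path (blk_schedule_flush_plug()) when it is already inside an I/O wait, and restores the old value instead of clearing it. A rough userspace sketch of that guard pattern follows; io_wait(), flush_now() and flush_later() are invented names, and a thread-local flag stands in for current->in_iowait.

/* build with: cc -std=c11 sketch.c */
#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_io_wait;   /* stands in for current->in_iowait */

static void io_wait(void);

static void flush_later(void)
{
        printf("  deferring flush (already inside io_wait)\n");
}

static void flush_now(void)
{
        printf("  flushing; this may itself have to wait for I/O...\n");
        io_wait();              /* without the guard this would recurse forever */
}

static void io_wait(void)
{
        bool old = in_io_wait;  /* like old_iowait = current->in_iowait */

        in_io_wait = true;
        if (old)
                flush_later();  /* like blk_schedule_flush_plug() */
        else
                flush_now();    /* like blk_flush_plug() */

        /* ... the real code sleeps here ... */

        in_io_wait = old;       /* restore, don't just clear */
}

int main(void)
{
        io_wait();
        printf("done\n");
        return 0;
}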
@@ -7642,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 {
         struct task_struct *g, *p;
 
+        /*
+         * Autogroups do not have RT tasks; see autogroup_create().
+         */
+        if (task_group_is_autogroup(tg))
+                return 0;
+
         for_each_process_thread(g, p) {
                 if (rt_task(p) && task_group(p) == tg)
                         return 1;
@@ -7734,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 {
         int i, err = 0;
 
+        /*
+         * Disallowing the root group RT runtime is BAD, it would disallow the
+         * kernel creating (and or operating) RT threads.
+         */
+        if (tg == &root_task_group && rt_runtime == 0)
+                return -EINVAL;
+
+        /* No period doesn't make any sense. */
+        if (rt_period == 0)
+                return -EINVAL;
+
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
         err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7790,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
         rt_period = (u64)rt_period_us * NSEC_PER_USEC;
         rt_runtime = tg->rt_bandwidth.rt_runtime;
 
-        if (rt_period == 0)
-                return -EINVAL;
-
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
...
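
Note: from userspace, the two checks added to tg_set_rt_bandwidth() surface as -EINVAL when nonsensical values are written to the cgroup cpu controller files: a runtime of 0 on the root group, or a period of 0 on any group. A small illustration is below; the cgroup v1 mount point and the "mygroup" directory are assumptions made for the example and need adjusting for the system at hand.

/* build with: cc -std=c11 sketch.c */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_val(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return;
        }
        if (write(fd, val, strlen(val)) < 0)
                printf("%s <- %s: %s\n", path, val, strerror(errno));
        else
                printf("%s <- %s: ok\n", path, val);
        close(fd);
}

int main(void)
{
        /* assumed cgroup v1 cpu hierarchy; hypothetical paths */
        write_val("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "0");        /* root runtime 0: expect EINVAL */
        write_val("/sys/fs/cgroup/cpu/mygroup/cpu.rt_period_us", "0"); /* period 0: expect EINVAL */
        return 0;
}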
@@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                                                      struct sched_dl_entity,
                                                      dl_timer);
         struct task_struct *p = dl_task_of(dl_se);
+        unsigned long flags;
         struct rq *rq;
-again:
-        rq = task_rq(p);
-        raw_spin_lock(&rq->lock);
-
-        if (rq != task_rq(p)) {
-                /* Task was moved, retrying. */
-                raw_spin_unlock(&rq->lock);
-                goto again;
-        }
+
+        rq = task_rq_lock(current, &flags);
 
         /*
          * We need to take care of several possible races here:
@@ -541,6 +535,26 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
         sched_clock_tick();
         update_rq_clock(rq);
 
+        /*
+         * If the throttle happened during sched-out; like:
+         *
+         *   schedule()
+         *     deactivate_task()
+         *       dequeue_task_dl()
+         *         update_curr_dl()
+         *           start_dl_timer()
+         *         __dequeue_task_dl()
+         *     prev->on_rq = 0;
+         *
+         * We can be both throttled and !queued. Replenish the counter
+         * but do not enqueue -- wait for our wakeup to do that.
+         */
+        if (!task_on_rq_queued(p)) {
+                replenish_dl_entity(dl_se, dl_se);
+                goto unlock;
+        }
+
         enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
         if (dl_task(rq->curr))
                 check_preempt_curr_dl(rq, p, 0);
@@ -555,7 +569,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                 push_dl_task(rq);
 #endif
 unlock:
-        raw_spin_unlock(&rq->lock);
+        task_rq_unlock(rq, current, &flags);
 
         return HRTIMER_NORESTART;
 }
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
                 rq->curr->dl.dl_yielded = 1;
                 p->dl.runtime = 0;
         }
+        update_rq_clock(rq);
         update_curr_dl(rq);
 }
...
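
Note: the dl_task_timer() hunks above encode a general rule for asynchronous callbacks: take the proper lock (task_rq_lock() instead of a hand-rolled retry on rq->lock), re-check the task's state under it, and replenish but do not enqueue a task that is no longer queued. A minimal userspace sketch of that "re-validate under the lock before acting" shape follows; all names are invented, and a pthread mutex stands in for the rq lock while a plain function call stands in for the hrtimer firing.

/* build with: cc -std=c11 -pthread sketch.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct task {
        pthread_mutex_t lock;   /* stands in for the rq lock */
        bool queued;            /* stands in for task_on_rq_queued() */
        long runtime;
};

static void replenish(struct task *t)
{
        t->runtime = 1000;      /* refill the budget */
}

/* the "timer callback": only re-enqueue if the task is still queued */
static void timer_fired(struct task *t)
{
        pthread_mutex_lock(&t->lock);
        if (!t->queued) {
                /* throttled during sched-out: refill, let the wakeup enqueue */
                replenish(t);
                printf("replenished only (task is sleeping)\n");
        } else {
                replenish(t);
                printf("replenished and re-enqueued\n");
        }
        pthread_mutex_unlock(&t->lock);
}

int main(void)
{
        struct task t = { .lock = PTHREAD_MUTEX_INITIALIZER, .queued = false };

        timer_fired(&t);        /* task went to sleep: no enqueue */
        t.queued = true;
        timer_fired(&t);        /* task runnable: enqueue path */
        return 0;
}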
@@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
 
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+        __acquires(rq->lock)
+{
+        struct rq *rq;
+
+        lockdep_assert_held(&p->pi_lock);
+
+        for (;;) {
+                rq = task_rq(p);
+                raw_spin_lock(&rq->lock);
+                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+                        return rq;
+                raw_spin_unlock(&rq->lock);
+
+                while (unlikely(task_on_rq_migrating(p)))
+                        cpu_relax();
+        }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+        __acquires(p->pi_lock)
+        __acquires(rq->lock)
+{
+        struct rq *rq;
+
+        for (;;) {
+                raw_spin_lock_irqsave(&p->pi_lock, *flags);
+                rq = task_rq(p);
+                raw_spin_lock(&rq->lock);
+                /*
+                 *      move_queued_task()              task_rq_lock()
+                 *
+                 *      ACQUIRE (rq->lock)
+                 *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
+                 *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
+                 *      [S] ->cpu = new_cpu             [L] task_rq()
+                 *                                      [L] ->on_rq
+                 *      RELEASE (rq->lock)
+                 *
+                 * If we observe the old cpu in task_rq_lock, the acquire of
+                 * the old rq->lock will fully serialize against the stores.
+                 *
+                 * If we observe the new cpu in task_rq_lock, the acquire will
+                 * pair with the WMB to ensure we must then also see migrating.
+                 */
+                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+                        return rq;
+                raw_spin_unlock(&rq->lock);
+                raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+                while (unlikely(task_on_rq_migrating(p)))
+                        cpu_relax();
+        }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+        __releases(rq->lock)
+{
+        raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+        __releases(rq->lock)
+        __releases(p->pi_lock)
+{
+        raw_spin_unlock(&rq->lock);
+        raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
...
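
Note: the helpers moved into the scheduler header above all follow the same retry shape: read which rq the task currently belongs to, take that rq's lock, then re-check under the lock that the task has not moved (and is not mid-migration); otherwise drop everything, spin until the migration is over, and retry. Below is a userspace sketch of that pattern; the two-queue setup, item_lock() and the field names are invented for illustration, with a pthread mutex per queue and C11 atomics for the "home" pointer and migrating flag.

/* build with: cc -std=c11 -pthread sketch.c */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct queue {
        pthread_mutex_t lock;
};

struct item {
        _Atomic(struct queue *) home;   /* which queue currently owns us */
        atomic_bool migrating;          /* like task_on_rq_migrating() */
};

/* lock the queue @it currently resides on, like __task_rq_lock() */
static struct queue *item_lock(struct item *it)
{
        for (;;) {
                struct queue *q = atomic_load(&it->home);

                pthread_mutex_lock(&q->lock);
                /* re-check under the lock: still ours, and not mid-move? */
                if (q == atomic_load(&it->home) && !atomic_load(&it->migrating))
                        return q;
                pthread_mutex_unlock(&q->lock);

                while (atomic_load(&it->migrating))
                        sched_yield();  /* cpu_relax() stand-in */
        }
}

int main(void)
{
        struct queue a = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct item it;

        atomic_init(&it.home, &a);
        atomic_init(&it.migrating, false);

        struct queue *q = item_lock(&it);
        printf("locked the right queue: %s\n", q == &a ? "yes" : "no");
        pthread_mutex_unlock(&q->lock);
        return 0;
}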