Commit 2279f540 authored by Juri Lelli's avatar Juri Lelli Committed by Peter Zijlstra

sched/deadline: Fix priority inheritance with multiple scheduling classes

Glenn reported that "an application [he developed produces] a BUG in
deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested
PTHREAD_PRIO_INHERIT mutexes.  I believe the bug is triggered when a CFS
task that was boosted by a SCHED_DEADLINE task boosts another CFS task
(nested priority inheritance).

 ------------[ cut here ]------------
 kernel BUG at kernel/sched/deadline.c:1462!
 invalid opcode: 0000 [#1] PREEMPT SMP
 CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ...
 Hardware name: ...
 RIP: 0010:enqueue_task_dl+0x335/0x910
 Code: ...
 RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002
 RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500
 RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600
 RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078
 R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600
 R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600
 FS:  00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 PKRU: 55555554
 Call Trace:
  ? intel_pstate_update_util_hwp+0x13/0x170
  rt_mutex_setprio+0x1cc/0x4b0
  task_blocks_on_rt_mutex+0x225/0x260
  rt_spin_lock_slowlock_locked+0xab/0x2d0
  rt_spin_lock_slowlock+0x50/0x80
  hrtimer_grab_expiry_lock+0x20/0x30
  hrtimer_cancel+0x13/0x30
  do_nanosleep+0xa0/0x150
  hrtimer_nanosleep+0xe1/0x230
  ? __hrtimer_init_sleeper+0x60/0x60
  __x64_sys_nanosleep+0x8d/0xa0
  do_syscall_64+0x4a/0x100
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
 RIP: 0033:0x7fa58b52330d
 ...
 ---[ end trace 0000000000000002 ]—

He also provided a simple reproducer creating the situation below:

 So the execution order of locking steps are the following
 (N1 and N2 are non-deadline tasks. D1 is a deadline task. M1 and M2
 are mutexes that are enabled * with priority inheritance.)

 Time moves forward as this timeline goes down:

 N1              N2               D1
 |               |                |
 |               |                |
 Lock(M1)        |                |
 |               |                |
 |             Lock(M2)           |
 |               |                |
 |               |              Lock(M2)
 |               |                |
 |             Lock(M1)           |
 |             (!!bug triggered!) |

Daniel reported a similar situation as well, by just letting ksoftirqd
run with DEADLINE (and eventually block on a mutex).

Problem is that boosted entities (Priority Inheritance) use static
DEADLINE parameters of the top priority waiter. However, there might be
cases where top waiter could be a non-DEADLINE entity that is currently
boosted by a DEADLINE entity from a different lock chain (i.e., nested
priority chains involving entities of non-DEADLINE classes). In this
case, top waiter static DEADLINE parameters could be null (initialized
to 0 at fork()) and replenish_dl_entity() would hit a BUG().

Fix this by keeping track of the original donor and using its parameters
when a task is boosted.
Reported-by: default avatarGlenn Elliott <glenn@aurora.tech>
Reported-by: default avatarDaniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: default avatarJuri Lelli <juri.lelli@redhat.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: default avatarDaniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com
parent ec618b84
...@@ -551,7 +551,6 @@ struct sched_dl_entity { ...@@ -551,7 +551,6 @@ struct sched_dl_entity {
* overruns. * overruns.
*/ */
unsigned int dl_throttled : 1; unsigned int dl_throttled : 1;
unsigned int dl_boosted : 1;
unsigned int dl_yielded : 1; unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1; unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1; unsigned int dl_overrun : 1;
...@@ -570,6 +569,15 @@ struct sched_dl_entity { ...@@ -570,6 +569,15 @@ struct sched_dl_entity {
* time. * time.
*/ */
struct hrtimer inactive_timer; struct hrtimer inactive_timer;
#ifdef CONFIG_RT_MUTEXES
/*
* Priority Inheritance. When a DEADLINE scheduling entity is boosted
* pi_se points to the donor, otherwise points to the dl_se it belongs
* to (the original one/itself).
*/
struct sched_dl_entity *pi_se;
#endif
}; };
#ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK
......
...@@ -4912,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ...@@ -4912,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (!dl_prio(p->normal_prio) || if (!dl_prio(p->normal_prio) ||
(pi_task && dl_prio(pi_task->prio) && (pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) { dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1; p->dl.pi_se = pi_task->dl.pi_se;
queue_flag |= ENQUEUE_REPLENISH; queue_flag |= ENQUEUE_REPLENISH;
} else } else {
p->dl.dl_boosted = 0; p->dl.pi_se = &p->dl;
}
p->sched_class = &dl_sched_class; p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) { } else if (rt_prio(prio)) {
if (dl_prio(oldprio)) if (dl_prio(oldprio))
p->dl.dl_boosted = 0; p->dl.pi_se = &p->dl;
if (oldprio < prio) if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD; queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class; p->sched_class = &rt_sched_class;
} else { } else {
if (dl_prio(oldprio)) if (dl_prio(oldprio))
p->dl.dl_boosted = 0; p->dl.pi_se = &p->dl;
if (rt_prio(oldprio)) if (rt_prio(oldprio))
p->rt.timeout = 0; p->rt.timeout = 0;
p->sched_class = &fair_sched_class; p->sched_class = &fair_sched_class;
......
...@@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) ...@@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node); return !RB_EMPTY_NODE(&dl_se->rb_node);
} }
#ifdef CONFIG_RT_MUTEXES
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
{
return dl_se->pi_se;
}
static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return pi_of(dl_se) != dl_se;
}
#else
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
{
return dl_se;
}
static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return false;
}
#endif
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i) static inline struct dl_bw *dl_bw_of(int i)
{ {
...@@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) ...@@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq); struct rq *rq = rq_of_dl_rq(dl_rq);
WARN_ON(dl_se->dl_boosted); WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/* /*
...@@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) ...@@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* could happen are, typically, a entity voluntarily trying to overcome its * could happen are, typically, a entity voluntarily trying to overcome its
* runtime, or it just underestimated it during sched_setattr(). * runtime, or it just underestimated it during sched_setattr().
*/ */
static void replenish_dl_entity(struct sched_dl_entity *dl_se, static void replenish_dl_entity(struct sched_dl_entity *dl_se)
struct sched_dl_entity *pi_se)
{ {
struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq); struct rq *rq = rq_of_dl_rq(dl_rq);
BUG_ON(pi_se->dl_runtime <= 0); BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
/* /*
* This could be the case for a !-dl task that is boosted. * This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters. * Just go with full inherited parameters.
*/ */
if (dl_se->dl_deadline == 0) { if (dl_se->dl_deadline == 0) {
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
dl_se->runtime = pi_se->dl_runtime; dl_se->runtime = pi_of(dl_se)->dl_runtime;
} }
if (dl_se->dl_yielded && dl_se->runtime > 0) if (dl_se->dl_yielded && dl_se->runtime > 0)
...@@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, ...@@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* arbitrary large. * arbitrary large.
*/ */
while (dl_se->runtime <= 0) { while (dl_se->runtime <= 0) {
dl_se->deadline += pi_se->dl_period; dl_se->deadline += pi_of(dl_se)->dl_period;
dl_se->runtime += pi_se->dl_runtime; dl_se->runtime += pi_of(dl_se)->dl_runtime;
} }
/* /*
...@@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, ...@@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*/ */
if (dl_time_before(dl_se->deadline, rq_clock(rq))) { if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
printk_deferred_once("sched: DL replenish lagged too much\n"); printk_deferred_once("sched: DL replenish lagged too much\n");
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
dl_se->runtime = pi_se->dl_runtime; dl_se->runtime = pi_of(dl_se)->dl_runtime;
} }
if (dl_se->dl_yielded) if (dl_se->dl_yielded)
...@@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, ...@@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* task with deadline equal to period this is the same of using * task with deadline equal to period this is the same of using
* dl_period instead of dl_deadline in the equation above. * dl_period instead of dl_deadline in the equation above.
*/ */
static bool dl_entity_overflow(struct sched_dl_entity *dl_se, static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
struct sched_dl_entity *pi_se, u64 t)
{ {
u64 left, right; u64 left, right;
...@@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, ...@@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction * of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;). * (but still we want to give the user that illusion >;).
*/ */
left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) * right = ((dl_se->deadline - t) >> DL_SCALE) *
(pi_se->dl_runtime >> DL_SCALE); (pi_of(dl_se)->dl_runtime >> DL_SCALE);
return dl_time_before(right, left); return dl_time_before(right, left);
} }
...@@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) ...@@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
* Please refer to the comments update_dl_revised_wakeup() function to find * Please refer to the comments update_dl_revised_wakeup() function to find
* more about the Revised CBS rule. * more about the Revised CBS rule.
*/ */
static void update_dl_entity(struct sched_dl_entity *dl_se, static void update_dl_entity(struct sched_dl_entity *dl_se)
struct sched_dl_entity *pi_se)
{ {
struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq); struct rq *rq = rq_of_dl_rq(dl_rq);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) || if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_entity_overflow(dl_se, rq_clock(rq))) {
if (unlikely(!dl_is_implicit(dl_se) && if (unlikely(!dl_is_implicit(dl_se) &&
!dl_time_before(dl_se->deadline, rq_clock(rq)) && !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
!dl_se->dl_boosted)){ !is_dl_boosted(dl_se))) {
update_dl_revised_wakeup(dl_se, rq); update_dl_revised_wakeup(dl_se, rq);
return; return;
} }
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
dl_se->runtime = pi_se->dl_runtime; dl_se->runtime = pi_of(dl_se)->dl_runtime;
} }
} }
...@@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) ...@@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* The task might have been boosted by someone else and might be in the * The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled. * boosting/deboosting path, its not throttled.
*/ */
if (dl_se->dl_boosted) if (is_dl_boosted(dl_se))
goto unlock; goto unlock;
/* /*
...@@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) ...@@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* but do not enqueue -- wait for our wakeup to do that. * but do not enqueue -- wait for our wakeup to do that.
*/ */
if (!task_on_rq_queued(p)) { if (!task_on_rq_queued(p)) {
replenish_dl_entity(dl_se, dl_se); replenish_dl_entity(dl_se);
goto unlock; goto unlock;
} }
...@@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) ...@@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
if (dl_time_before(dl_se->deadline, rq_clock(rq)) && if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
return; return;
dl_se->dl_throttled = 1; dl_se->dl_throttled = 1;
if (dl_se->runtime > 0) if (dl_se->runtime > 0)
...@@ -1287,7 +1306,7 @@ static void update_curr_dl(struct rq *rq) ...@@ -1287,7 +1306,7 @@ static void update_curr_dl(struct rq *rq)
dl_se->dl_overrun = 1; dl_se->dl_overrun = 1;
__dequeue_task_dl(rq, curr, 0); __dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl)) if (!is_leftmost(curr, &rq->dl))
...@@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) ...@@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
} }
static void static void
enqueue_dl_entity(struct sched_dl_entity *dl_se, enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
struct sched_dl_entity *pi_se, int flags)
{ {
BUG_ON(on_dl_rq(dl_se)); BUG_ON(on_dl_rq(dl_se));
...@@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, ...@@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
*/ */
if (flags & ENQUEUE_WAKEUP) { if (flags & ENQUEUE_WAKEUP) {
task_contending(dl_se, flags); task_contending(dl_se, flags);
update_dl_entity(dl_se, pi_se); update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) { } else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se); replenish_dl_entity(dl_se);
} else if ((flags & ENQUEUE_RESTORE) && } else if ((flags & ENQUEUE_RESTORE) &&
dl_time_before(dl_se->deadline, dl_time_before(dl_se->deadline,
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
...@@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) ...@@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{ {
struct task_struct *pi_task = rt_mutex_get_top_task(p); if (is_dl_boosted(&p->dl)) {
struct sched_dl_entity *pi_se = &p->dl;
/*
* Use the scheduling parameters of the top pi-waiter task if:
* - we have a top pi-waiter which is a SCHED_DEADLINE task AND
* - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
* smaller than our deadline OR we are a !SCHED_DEADLINE task getting
* boosted due to a SCHED_DEADLINE pi-waiter).
* Otherwise we keep our runtime and deadline.
*/
if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
/* /*
* Because of delays in the detection of the overrun of a * Because of delays in the detection of the overrun of a
* thread's runtime, it might be the case that a thread * thread's runtime, it might be the case that a thread
...@@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) ...@@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* the throttle. * the throttle.
*/ */
p->dl.dl_throttled = 0; p->dl.dl_throttled = 0;
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
return; return;
} }
...@@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) ...@@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
return; return;
} }
enqueue_dl_entity(&p->dl, pi_se, flags); enqueue_dl_entity(&p->dl, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1) if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p); enqueue_pushable_dl_task(rq, p);
...@@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p) ...@@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0; dl_se->dl_bw = 0;
dl_se->dl_density = 0; dl_se->dl_density = 0;
dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0; dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0; dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0; dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0; dl_se->dl_overrun = 0;
#ifdef CONFIG_RT_MUTEXES
dl_se->pi_se = dl_se;
#endif
} }
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment