Commit 750a40d8 authored by Tejun Heo's avatar Tejun Heo

sched_ext: Synchronize bypass state changes with rq lock

While the BPF scheduler is being unloaded, the following warning messages
trigger sometimes:

 NOHZ tick-stop error: local softirq work is pending, handler #80!!!

This is caused by the CPU entering idle while there are pending softirqs.
The main culprit is the bypassing state assertion not being synchronized
with rq operations. As the BPF scheduler cannot be trusted in the disable
path, the first step is entering the bypass mode where the BPF scheduler is
ignored and scheduling becomes global FIFO.

This is implemented by turning scx_ops_bypassing() true. However, the
transition isn't synchronized against anything and it's possible for enqueue
and dispatch paths to have different ideas on whether bypass mode is on.

Make each rq track its own bypass state with SCX_RQ_BYPASSING which is
modified while rq is locked.

This removes most of the NOHZ tick-stop messages but not completely. I
believe the stragglers are from the sched core bug where pick_task_scx() can
be called without preceding balance_scx(). Once that bug is fixed, we should
verify that all occurrences of this error message are gone too.

v2: scx_enabled() test moved inside the for_each_possible_cpu() loop so that
    the per-cpu states are always synchronized with the global state.
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Reported-by: default avatarDavid Vernet <void@manifault.com>
parent 2d285d56
...@@ -1411,9 +1411,9 @@ static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, ...@@ -1411,9 +1411,9 @@ static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
} }
static bool scx_ops_bypassing(void) static bool scx_rq_bypassing(struct rq *rq)
{ {
return unlikely(atomic_read(&scx_ops_bypass_depth)); return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
} }
/** /**
...@@ -1948,7 +1948,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, ...@@ -1948,7 +1948,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (!scx_rq_online(rq)) if (!scx_rq_online(rq))
goto local; goto local;
if (scx_ops_bypassing()) if (scx_rq_bypassing(rq))
goto global; goto global;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
...@@ -2612,7 +2612,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) ...@@ -2612,7 +2612,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* bypassing test. * bypassing test.
*/ */
if ((prev->scx.flags & SCX_TASK_QUEUED) && if ((prev->scx.flags & SCX_TASK_QUEUED) &&
prev->scx.slice && !scx_ops_bypassing()) { prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP; rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks; goto has_tasks;
} }
...@@ -2625,7 +2625,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) ...@@ -2625,7 +2625,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (consume_dispatch_q(rq, &scx_dsq_global)) if (consume_dispatch_q(rq, &scx_dsq_global))
goto has_tasks; goto has_tasks;
if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
goto no_tasks; goto no_tasks;
dspc->rq = rq; dspc->rq = rq;
...@@ -2671,7 +2671,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev) ...@@ -2671,7 +2671,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* %SCX_OPS_ENQ_LAST is in effect. * %SCX_OPS_ENQ_LAST is in effect.
*/ */
if ((prev->scx.flags & SCX_TASK_QUEUED) && if ((prev->scx.flags & SCX_TASK_QUEUED) &&
(!static_branch_unlikely(&scx_ops_enq_last) || scx_ops_bypassing())) { (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP; rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks; goto has_tasks;
} }
...@@ -2863,7 +2864,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, ...@@ -2863,7 +2864,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* forcing a different task. Leave it at the head of the local * forcing a different task. Leave it at the head of the local
* DSQ. * DSQ.
*/ */
if (p->scx.slice && !scx_ops_bypassing()) { if (p->scx.slice && !scx_rq_bypassing(rq)) {
dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
return; return;
} }
...@@ -2928,7 +2929,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) ...@@ -2928,7 +2929,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
return NULL; return NULL;
if (unlikely(!p->scx.slice)) { if (unlikely(!p->scx.slice)) {
if (!scx_ops_bypassing() && !scx_warned_zero_slice) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
p->comm, p->pid); p->comm, p->pid);
scx_warned_zero_slice = true; scx_warned_zero_slice = true;
...@@ -2966,7 +2967,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, ...@@ -2966,7 +2967,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
* calling ops.core_sched_before(). Accesses are controlled by the * calling ops.core_sched_before(). Accesses are controlled by the
* verifier. * verifier.
*/ */
if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
(struct task_struct *)a, (struct task_struct *)a,
(struct task_struct *)b); (struct task_struct *)b);
...@@ -3325,7 +3326,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) ...@@ -3325,7 +3326,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
* While disabling, always resched and refresh core-sched timestamp as * While disabling, always resched and refresh core-sched timestamp as
* we can't trust the slice management or ops.core_sched_before(). * we can't trust the slice management or ops.core_sched_before().
*/ */
if (scx_ops_bypassing()) { if (scx_rq_bypassing(rq)) {
curr->scx.slice = 0; curr->scx.slice = 0;
touch_core_sched(rq, curr); touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(tick)) { } else if (SCX_HAS_OP(tick)) {
...@@ -3664,7 +3665,7 @@ bool scx_can_stop_tick(struct rq *rq) ...@@ -3664,7 +3665,7 @@ bool scx_can_stop_tick(struct rq *rq)
{ {
struct task_struct *p = rq->curr; struct task_struct *p = rq->curr;
if (scx_ops_bypassing()) if (scx_rq_bypassing(rq))
return false; return false;
if (p->sched_class != &ext_sched_class) if (p->sched_class != &ext_sched_class)
...@@ -4256,17 +4257,9 @@ static void scx_ops_bypass(bool bypass) ...@@ -4256,17 +4257,9 @@ static void scx_ops_bypass(bool bypass)
return; return;
} }
/*
* We need to guarantee that no tasks are on the BPF scheduler while
* bypassing. Either we see enabled or the enable path sees the
* increased bypass_depth before moving tasks to SCX.
*/
if (!scx_enabled())
return;
/* /*
* No task property is changing. We just need to make sure all currently * No task property is changing. We just need to make sure all currently
* queued tasks are re-queued according to the new scx_ops_bypassing() * queued tasks are re-queued according to the new scx_rq_bypassing()
* state. As an optimization, walk each rq's runnable_list instead of * state. As an optimization, walk each rq's runnable_list instead of
* the scx_tasks list. * the scx_tasks list.
* *
...@@ -4280,6 +4273,24 @@ static void scx_ops_bypass(bool bypass) ...@@ -4280,6 +4273,24 @@ static void scx_ops_bypass(bool bypass)
rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
rq->scx.flags |= SCX_RQ_BYPASSING;
} else {
WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
rq->scx.flags &= ~SCX_RQ_BYPASSING;
}
/*
* We need to guarantee that no tasks are on the BPF scheduler
* while bypassing. Either we see enabled or the enable path
* sees scx_rq_bypassing() before moving tasks to SCX.
*/
if (!scx_enabled()) {
rq_unlock_irqrestore(rq, &rf);
continue;
}
/* /*
* The use of list_for_each_entry_safe_reverse() is required * The use of list_for_each_entry_safe_reverse() is required
* because each task is going to be removed from and added back * because each task is going to be removed from and added back
...@@ -6397,17 +6408,17 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) ...@@ -6397,17 +6408,17 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
if (!ops_cpu_valid(cpu, NULL)) if (!ops_cpu_valid(cpu, NULL))
return; return;
local_irq_save(irq_flags);
this_rq = this_rq();
/* /*
* While bypassing for PM ops, IRQ handling may not be online which can * While bypassing for PM ops, IRQ handling may not be online which can
* lead to irq_work_queue() malfunction such as infinite busy wait for * lead to irq_work_queue() malfunction such as infinite busy wait for
* IRQ status update. Suppress kicking. * IRQ status update. Suppress kicking.
*/ */
if (scx_ops_bypassing()) if (scx_rq_bypassing(this_rq))
return; goto out;
local_irq_save(irq_flags);
this_rq = this_rq();
/* /*
* Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
......
...@@ -750,6 +750,7 @@ enum scx_rq_flags { ...@@ -750,6 +750,7 @@ enum scx_rq_flags {
SCX_RQ_ONLINE = 1 << 0, SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1, SCX_RQ_CAN_STOP_TICK = 1 << 1,
SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */ SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 3,
SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17, SCX_RQ_IN_BALANCE = 1 << 17,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment