Commit 0df340ce authored by Tejun Heo's avatar Tejun Heo

Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-6.12

Pull tip/sched/core to resolve the following four conflicts. While 2-4 are
simple context conflicts, 1 is a bit subtle and easy to resolve incorrectly.

1. 2c8d046d ("sched: Add normal_policy()")
   vs.
   faa42d29 ("sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy")

The former converts direct test on p->policy to use the helper
normal_policy(). The latter moves the p->policy test to a different
location. Resolve by converting the test on p->plicy in the new location to
use normal_policy().

2. a7a9fc54 ("sched_ext: Add boilerplate for extensible scheduler class")
   vs.
   a110a81c ("sched/deadline: Deferrable dl server")

Both add calls to put_prev_task_idle() and set_next_task_idle(). Simple
context conflict. Resolve by taking changes from both.

3. a7a9fc54 ("sched_ext: Add boilerplate for extensible scheduler class")
   vs.
   c2459100 ("sched/core: Add clearing of ->dl_server in put_prev_task_balance()")

The former changes for_each_class() itertion to use for_each_active_class().
The latter moves away the adjacent dl_server handling code. Simple context
conflict. Resolve by taking changes from both.

4. 60c27fb5 ("sched_ext: Implement sched_ext_ops.cpu_online/offline()")
   vs.
   31b164e2 ("sched/smt: Introduce sched_smt_present_inc/dec() helper")
   2f027354 ("sched/core: Introduce sched_set_rq_on/offline() helper")

The former adds scx_rq_deactivate() call. The latter two change code around
it. Simple context conflict. Resolve by taking changes from both.
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
parents e99129e5 cea5a347
......@@ -641,12 +641,26 @@ struct sched_dl_entity {
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
*
* @dl_server tells if this is a server entity.
*
* @dl_defer tells if this is a deferred or regular server. For
* now only defer server exists.
*
* @dl_defer_armed tells if the deferrable server is waiting
* for the replenishment timer to activate it.
*
* @dl_defer_running tells if the deferrable server is actually
* running, skipping the defer phase.
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
unsigned int dl_server : 1;
unsigned int dl_defer : 1;
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
......@@ -674,7 +688,8 @@ struct sched_dl_entity {
*/
struct rq *rq;
dl_server_has_tasks_f server_has_tasks;
dl_server_pick_f server_pick;
dl_server_pick_f server_pick_next;
dl_server_pick_f server_pick_task;
#ifdef CONFIG_RT_MUTEXES
/*
......
......@@ -163,6 +163,9 @@ static inline int __task_prio(const struct task_struct *p)
if (p->sched_class == &stop_sched_class) /* trumps deadline */
return -2;
if (p->dl_server)
return -1; /* deadline */
if (rt_prio(p->prio)) /* includes deadline */
return p->prio; /* [-1, 99] */
......@@ -195,8 +198,24 @@ static inline bool prio_less(const struct task_struct *a,
if (-pb < -pa)
return false;
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);
if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */
const struct sched_dl_entity *a_dl, *b_dl;
a_dl = &a->dl;
/*
* Since,'a' and 'b' can be CFS tasks served by DL server,
* __task_prio() can return -1 (for DL) even for those. In that
* case, get to the dl_server's DL entity.
*/
if (a->dl_server)
a_dl = a->dl_server;
b_dl = &b->dl;
if (b->dl_server)
b_dl = b->dl_server;
return !dl_time_before(a_dl->deadline, b_dl->deadline);
}
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return cfs_prio_less(a, b, in_fi);
......@@ -1280,7 +1299,7 @@ bool sched_can_stop_tick(struct rq *rq)
* dequeued by migrating while the constrained task continues to run.
* E.g. going from 2->1 without going through pick_next_task().
*/
if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
if (__need_bw_check(rq, rq->curr)) {
if (cfs_task_bw_constrained(rq->curr))
return false;
}
......@@ -2255,6 +2274,12 @@ void migrate_disable(void)
struct task_struct *p = current;
if (p->migration_disabled) {
#ifdef CONFIG_DEBUG_PREEMPT
/*
*Warn about overflow half-way through the range.
*/
WARN_ON_ONCE((s16)p->migration_disabled < 0);
#endif
p->migration_disabled++;
return;
}
......@@ -2273,14 +2298,20 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Check both overflow from migrate_disable() and superfluous
* migrate_enable().
*/
if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
return;
#endif
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
if (WARN_ON_ONCE(!p->migration_disabled))
return;
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
......@@ -4737,7 +4768,7 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq);
post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
......@@ -5855,6 +5886,14 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
#endif
put_prev_task(rq, prev);
/*
* We've updated @prev and no longer need the server link, clear it.
* Must be done before ->pick_next_task() because that can (re)set
* ->dl_server.
*/
if (prev->dl_server)
prev->dl_server = NULL;
}
/*
......@@ -5888,6 +5927,13 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = pick_next_task_idle(rq);
}
/*
* This is a normal CFS pick, but the previous could be a DL pick.
* Clear it as previous is no longer picked.
*/
if (prev->dl_server)
prev->dl_server = NULL;
/*
* This is the fast path; it cannot be a DL server pick;
* therefore even if @p == @prev, ->dl_server must be NULL.
......@@ -5901,14 +5947,6 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
restart:
put_prev_task_balance(rq, prev, rf);
/*
* We've updated @prev and no longer need the server link, clear it.
* Must be done before ->pick_next_task() because that can (re)set
* ->dl_server.
*/
if (prev->dl_server)
prev->dl_server = NULL;
for_each_active_class(class) {
p = class->pick_next_task(rq);
if (p) {
......@@ -7925,6 +7963,30 @@ void set_rq_offline(struct rq *rq)
}
}
static inline void sched_set_rq_online(struct rq *rq, int cpu)
{
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
rq_unlock_irqrestore(rq, &rf);
}
static inline void sched_set_rq_offline(struct rq *rq, int cpu)
{
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
rq_unlock_irqrestore(rq, &rf);
}
/*
* used to mark begin/end of suspend/resume:
*/
......@@ -7975,10 +8037,25 @@ static int cpuset_cpu_inactive(unsigned int cpu)
return 0;
}
static inline void sched_smt_present_inc(int cpu)
{
#ifdef CONFIG_SCHED_SMT
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_inc_cpuslocked(&sched_smt_present);
#endif
}
static inline void sched_smt_present_dec(int cpu)
{
#ifdef CONFIG_SCHED_SMT
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
#endif
}
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
/*
* Clear the balance_push callback and prepare to schedule
......@@ -7986,13 +8063,10 @@ int sched_cpu_activate(unsigned int cpu)
*/
balance_push_set(cpu, false);
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_inc_cpuslocked(&sched_smt_present);
#endif
sched_smt_present_inc(cpu);
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
......@@ -8012,12 +8086,7 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
rq_unlock_irqrestore(rq, &rf);
sched_set_rq_online(rq, cpu);
return 0;
}
......@@ -8025,7 +8094,6 @@ int sched_cpu_activate(unsigned int cpu)
int sched_cpu_deactivate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
int ret;
/*
......@@ -8056,22 +8124,16 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
rq_unlock_irqrestore(rq, &rf);
sched_set_rq_offline(rq, cpu);
scx_rq_deactivate(rq);
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
sched_smt_present_dec(cpu);
#ifdef CONFIG_SCHED_SMT
sched_core_cpu_deactivate(cpu);
#endif
......@@ -8081,6 +8143,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
sched_smt_present_inc(cpu);
sched_set_rq_online(rq, cpu);
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
sched_update_numa(cpu, true);
......@@ -8290,8 +8354,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
......@@ -8346,8 +8408,13 @@ void __init sched_init(void)
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
/*
* This is required for init cpu because rt.c:__enable_runtime()
* starts working after scheduler_running, which is not the case
* yet.
*/
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
......@@ -8379,6 +8446,7 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
......
......@@ -582,6 +582,12 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
}
stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
/*
* Because mul_u64_u64_div_u64() can approximate on some
* achitectures; enforce the constraint that: a*b/(b+c) <= a.
*/
if (unlikely(stime > rtime))
stime = rtime;
update:
/*
......
This diff is collapsed.
......@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = {
.release = seq_release,
};
enum dl_param {
DL_RUNTIME = 0,
DL_PERIOD,
};
static unsigned long fair_server_period_max = (1 << 22) * NSEC_PER_USEC; /* ~4 seconds */
static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos, enum dl_param param)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
struct rq *rq = cpu_rq(cpu);
u64 runtime, period;
size_t err;
int retval;
u64 value;
err = kstrtoull_from_user(ubuf, cnt, 10, &value);
if (err)
return err;
scoped_guard (rq_lock_irqsave, rq) {
runtime = rq->fair_server.dl_runtime;
period = rq->fair_server.dl_period;
switch (param) {
case DL_RUNTIME:
if (runtime == value)
break;
runtime = value;
break;
case DL_PERIOD:
if (value == period)
break;
period = value;
break;
}
if (runtime > period ||
period > fair_server_period_max ||
period < fair_server_period_min) {
return -EINVAL;
}
if (rq->cfs.h_nr_running) {
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
if (retval)
cnt = retval;
if (!runtime)
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
if (rq->cfs.h_nr_running)
dl_server_start(&rq->fair_server);
}
*ppos += cnt;
return cnt;
}
static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
{
unsigned long cpu = (unsigned long) m->private;
struct rq *rq = cpu_rq(cpu);
u64 value;
switch (param) {
case DL_RUNTIME:
value = rq->fair_server.dl_runtime;
break;
case DL_PERIOD:
value = rq->fair_server.dl_period;
break;
}
seq_printf(m, "%llu\n", value);
return 0;
}
static ssize_t
sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
}
static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
{
return sched_fair_server_show(m, v, DL_RUNTIME);
}
static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
}
static const struct file_operations fair_server_runtime_fops = {
.open = sched_fair_server_runtime_open,
.write = sched_fair_server_runtime_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static ssize_t
sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
}
static int sched_fair_server_period_show(struct seq_file *m, void *v)
{
return sched_fair_server_show(m, v, DL_PERIOD);
}
static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_fair_server_period_show, inode->i_private);
}
static const struct file_operations fair_server_period_fops = {
.open = sched_fair_server_period_open,
.write = sched_fair_server_period_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static struct dentry *debugfs_sched;
static void debugfs_fair_server_init(void)
{
struct dentry *d_fair;
unsigned long cpu;
d_fair = debugfs_create_dir("fair_server", debugfs_sched);
if (!d_fair)
return;
for_each_possible_cpu(cpu) {
struct dentry *d_cpu;
char buf[32];
snprintf(buf, sizeof(buf), "cpu%lu", cpu);
d_cpu = debugfs_create_dir(buf, d_fair);
debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
}
}
static __init int sched_init_debug(void)
{
struct dentry __maybe_unused *numa;
......@@ -374,6 +531,8 @@ static __init int sched_init_debug(void)
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
return 0;
}
late_initcall(sched_init_debug);
......@@ -641,8 +800,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, "\n");
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
root = __pick_root_entity(cfs_rq);
......@@ -669,8 +826,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
......@@ -730,9 +885,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
#ifdef CONFIG_RT_GROUP_SCHED
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
#endif
#undef PN
#undef PU
......
......@@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
static int se_is_idle(struct sched_entity *se)
{
return 0;
return task_has_idle_policy(task_of(se));
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
......@@ -1156,12 +1156,13 @@ s64 update_curr_common(struct rq *rq)
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
if (unlikely(!curr))
return;
delta_exec = update_curr_se(rq_of(cfs_rq), curr);
delta_exec = update_curr_se(rq, curr);
if (unlikely(delta_exec <= 0))
return;
......@@ -1169,8 +1170,19 @@ static void update_curr(struct cfs_rq *cfs_rq)
update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr))
update_curr_task(task_of(curr), delta_exec);
if (entity_is_task(curr)) {
struct task_struct *p = task_of(curr);
update_curr_task(p, delta_exec);
/*
* Any fair task that runs outside of fair_server should
* account against fair_server such that it can account for
* this time and possibly avoid running this period.
*/
if (p->dl_server != &rq->fair_server)
dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
......@@ -5766,6 +5778,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
long rq_h_nr_running = rq->cfs.h_nr_running;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
......@@ -5837,6 +5850,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, task_delta);
/* Stop the fair server if throttling resulted in no runnable tasks */
if (rq_h_nr_running && !rq->cfs.h_nr_running)
dl_server_stop(&rq->fair_server);
done:
/*
* Note: distribution will already see us throttled via the
......@@ -5855,6 +5871,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta;
long rq_h_nr_running = rq->cfs.h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq)];
......@@ -5924,6 +5941,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
goto unthrottle_throttle;
}
/* Start the fair server if un-throttling resulted in new runnable tasks */
if (!rq_h_nr_running && rq->cfs.h_nr_running)
dl_server_start(&rq->fair_server);
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
......@@ -6556,7 +6577,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
{
int cpu = cpu_of(rq);
if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
if (!cfs_bandwidth_used())
return;
if (!tick_nohz_full_cpu(cpu))
......@@ -6751,6 +6772,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
int rq_h_nr_running = rq->cfs.h_nr_running;
/*
* The code below (indirectly) updates schedutil which looks at
......@@ -6805,6 +6827,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
goto enqueue_throttle;
}
if (!rq_h_nr_running && rq->cfs.h_nr_running) {
/* Account for idle runtime */
if (!rq->nr_running)
dl_server_update_idle_time(rq, rq->curr);
dl_server_start(&rq->fair_server);
}
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
......@@ -6845,6 +6874,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
int rq_h_nr_running = rq->cfs.h_nr_running;
util_est_dequeue(&rq->cfs, p);
......@@ -6899,6 +6929,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, 1);
if (rq_h_nr_running && !rq->cfs.h_nr_running)
dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
......@@ -8382,16 +8415,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (test_tsk_need_resched(curr))
return;
/* Idle tasks are by definition preempted by non-idle tasks. */
if (unlikely(task_has_idle_policy(curr)) &&
likely(!task_has_idle_policy(p)))
goto preempt;
/*
* Batch and idle tasks do not preempt non-idle tasks (their preemption
* is driven by the tick):
*/
if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION))
if (!sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
......@@ -8401,7 +8425,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
pse_is_idle = se_is_idle(pse);
/*
* Preempt an idle group in favor of a non-idle group (and don't preempt
* Preempt an idle entity in favor of a non-idle entity (and don't preempt
* in the inverse case).
*/
if (cse_is_idle && !pse_is_idle)
......@@ -8409,9 +8433,14 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (cse_is_idle != pse_is_idle)
return;
/*
* BATCH and IDLE tasks do not preempt others.
*/
if (unlikely(!normal_policy(p->policy)))
return;
cfs_rq = cfs_rq_of(se);
update_curr(cfs_rq);
/*
* XXX pick_eevdf(cfs_rq) != se ?
*/
......@@ -8453,6 +8482,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
/*
* This can be called from directly from CFS's ->pick_task() or indirectly
* from DL's ->pick_task when fair server is enabled. In the indirect case,
* DL will set ->dl_server just after this function is called, so its Ok to
* clear. In the direct case, we are picking directly so we must clear it.
*/
task_of(se)->dl_server = NULL;
return task_of(se);
}
#endif
......@@ -8607,6 +8644,36 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
return pick_next_task_fair(rq, NULL, NULL);
}
static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
{
return !!dl_se->rq->cfs.nr_running;
}
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_SMP
return pick_task_fair(dl_se->rq);
#else
return NULL;
#endif
}
static struct task_struct *fair_server_pick_next(struct sched_dl_entity *dl_se)
{
return pick_next_task_fair(dl_se->rq, NULL, NULL);
}
void fair_server_init(struct rq *rq)
{
struct sched_dl_entity *dl_se = &rq->fair_server;
init_dl_entity(dl_se);
dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_next,
fair_server_pick_task);
}
/*
* Account for a descheduled task:
*/
......@@ -12693,22 +12760,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
struct sched_entity *se = &p->se, *curr;
struct cfs_rq *cfs_rq;
struct rq *rq = this_rq();
struct rq_flags rf;
rq_lock(rq, &rf);
update_rq_clock(rq);
set_task_max_allowed_capacity(p);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
update_curr(cfs_rq);
place_entity(cfs_rq, se, ENQUEUE_INITIAL);
rq_unlock(rq, &rf);
}
/*
......
......@@ -85,5 +85,3 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
SCHED_FEAT(HZ_BW, true)
......@@ -452,6 +452,7 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
dl_server_update_idle_time(rq, prev);
scx_update_idle(rq, false);
}
......@@ -460,6 +461,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
update_idle_core(rq);
scx_update_idle(rq, true);
schedstat_inc(rq->sched_goidle);
next->se.exec_start = rq_clock_task(rq);
}
#ifdef CONFIG_SMP
......
......@@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
......@@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void)
late_initcall(sched_rt_sysctl_init);
#endif
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
int i;
array = &rt_rq->active;
for (i = 0; i < MAX_RT_PRIO; i++) {
INIT_LIST_HEAD(array->queue + i);
__clear_bit(i, array->bitmap);
}
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
#ifdef CONFIG_RT_GROUP_SCHED
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
rt_rq->rt_runtime = 0;
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
#endif
}
#ifdef CONFIG_RT_GROUP_SCHED
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
......@@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
do_start_rt_bandwidth(rt_b);
}
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
int i;
array = &rt_rq->active;
for (i = 0; i < MAX_RT_PRIO; i++) {
INIT_LIST_HEAD(array->queue + i);
__clear_bit(i, array->bitmap);
}
/* delimiter for bit-search: */
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
rt_rq->rt_runtime = 0;
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}
#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
hrtimer_cancel(&rt_b->rt_period_timer);
......@@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg)
{
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
}
void free_rt_sched_group(struct task_group *tg)
......@@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!tg->rt_se)
goto err;
init_rt_bandwidth(&tg->rt_bandwidth,
ktime_to_ns(def_rt_bandwidth.rt_period), 0);
init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);
for_each_possible_cpu(i) {
rt_rq = kzalloc_node(sizeof(struct rt_rq),
......@@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
return &rt_rq->tg->rt_bandwidth;
}
#else /* !CONFIG_RT_GROUP_SCHED */
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
return rt_rq->rt_runtime;
}
static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
return ktime_to_ns(def_rt_bandwidth.rt_period);
}
typedef struct rt_rq *rt_rq_iter_t;
#define for_each_rt_rq(rt_rq, iter, rq) \
for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
return NULL;
}
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
if (!rt_rq->rt_nr_running)
return;
enqueue_top_rt_rq(rt_rq);
resched_curr(rq);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
return rt_rq->rt_throttled;
}
static inline const struct cpumask *sched_rt_period_mask(void)
{
return cpu_online_mask;
}
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
return &cpu_rq(cpu)->rt;
}
static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
return &def_rt_bandwidth;
}
#endif /* CONFIG_RT_GROUP_SCHED */
bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
......@@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
const struct cpumask *span;
span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
/*
* FIXME: isolated CPUs should really leave the root task group,
* whether they are isolcpus or were isolated via cpusets, lest
......@@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
*/
if (rt_b == &root_task_group.rt_bandwidth)
span = cpu_online_mask;
#endif
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
......@@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
return idle;
}
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
return rt_rq->highest_prio.curr;
#endif
return rt_task_of(rt_se)->prio;
}
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
......@@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
#else /* !CONFIG_RT_GROUP_SCHED */
typedef struct rt_rq *rt_rq_iter_t;
#define for_each_rt_rq(rt_rq, iter, rq) \
for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
return NULL;
}
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
if (!rt_rq->rt_nr_running)
return;
enqueue_top_rt_rq(rt_rq);
resched_curr(rq);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
return false;
}
static inline const struct cpumask *sched_rt_period_mask(void)
{
return cpu_online_mask;
}
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
return &cpu_rq(cpu)->rt;
}
#ifdef CONFIG_SMP
static void __enable_runtime(struct rq *rq) { }
static void __disable_runtime(struct rq *rq) { }
#endif
#endif /* CONFIG_RT_GROUP_SCHED */
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
return rt_rq->highest_prio.curr;
#endif
return rt_task_of(rt_se)->prio;
}
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
......@@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
s64 delta_exec;
if (curr->sched_class != &rt_sched_class)
......@@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq)
if (unlikely(delta_exec <= 0))
return;
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *rt_se = &curr->rt;
if (!rt_bandwidth_enabled())
return;
......@@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
#endif
}
static void
......@@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
start_rt_bandwidth(&def_rt_bandwidth);
}
static inline
......@@ -2912,19 +2903,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
unsigned long flags;
int i;
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_runtime = global_rt_runtime();
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
return 0;
}
#endif /* CONFIG_SYSCTL */
......@@ -2944,12 +2922,6 @@ static int sched_rt_global_validate(void)
static void sched_rt_do_global(void)
{
unsigned long flags;
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
......
......@@ -362,7 +362,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int dl_bw_check_overflow(int cpu);
extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
/*
* SCHED_DEADLINE supports servers (nested scheduling) with the following
* interface:
......@@ -388,7 +388,15 @@ extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick);
dl_server_pick_f pick_next,
dl_server_pick_f pick_task);
extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p);
extern void fair_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
#ifdef CONFIG_CGROUP_SCHED
......@@ -631,7 +639,6 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
......@@ -651,10 +658,6 @@ struct cfs_rq {
struct sched_entity *curr;
struct sched_entity *next;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
#endif
#ifdef CONFIG_SMP
/*
* CFS load tracking
......@@ -794,13 +797,13 @@ struct rt_rq {
#endif /* CONFIG_SMP */
int rt_queued;
#ifdef CONFIG_RT_GROUP_SCHED
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned int rt_nr_boosted;
struct rq *rq;
......@@ -1110,6 +1113,8 @@ struct rq {
struct scx_rq scx;
#endif
struct sched_dl_entity fair_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
struct list_head leaf_cfs_rq_list;
......@@ -1224,7 +1229,6 @@ struct rq {
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
......@@ -2619,7 +2623,6 @@ extern void init_sched_fair_class(void);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
......
......@@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
/*
* Because the rq is not a task, dl_add_task_root_domain() did not
* move the fair server bw to the rd if it already started.
* Add it now.
*/
if (rq->fair_server.dl_server)
__dl_server_attach_root(&rq->fair_server, rq);
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment