Commit b167fdff authored by Linus Torvalds

Merge tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
"Load-balancing improvements:

   - Improve NUMA balancing on AMD Zen systems for affine workloads.

   - Improve the handling of reduced-capacity CPUs in load-balancing.

   - Energy Model improvements: fix & refine all the energy fairness
     metrics (PELT), and remove the conservative threshold requiring 6%
     energy savings to migrate a task. Doing this improves power
     efficiency for most workloads, and also increases the reliability
     of energy-efficiency scheduling.

   - Optimize/tweak select_idle_cpu() to spend (much) less time
     searching for an idle CPU on overloaded systems. There are reports of
     several milliseconds spent there on large systems with large
     workloads ...

     [ Since the search logic changed, there might be behavioral side
       effects. ]

   - Improve NUMA imbalance behavior. On certain systems with spare
     capacity, initial placement of tasks is non-deterministic, and such
     an artificial placement imbalance can persist for a long time,
     hurting (and sometimes helping) performance.

     The fix is to make fork-time task placement consistent with runtime
     NUMA balancing placement.

     Note that some performance regressions were reported against this,
     caused by workloads that are not memory bandwidth limited, which
     benefit from the artificial locality of the placement bug(s). Mel
     Gorman's conclusion, with which we concur, was that consistency is
     better than random workload benefits from non-deterministic bugs:

        "Given there is no crystal ball and it's a tradeoff, I think
         it's better to be consistent and use similar logic at both fork
         time and runtime even if it doesn't have universal benefit."

   - Improve core scheduling by fixing a bug in
     sched_core_update_cookie() that caused unnecessary forced idling.

   - Improve wakeup-balancing by allowing same-LLC wakeup of idle CPUs
     for newly woken tasks.

   - Fix a newidle balancing bug that introduced unnecessary wakeup
     latencies.

  ABI improvements/fixes:

   - Do not check capabilities and do not issue capability check denial
     messages when a scheduler syscall doesn't require privileges. (Such
     as increasing niceness.)

   - Add forced-idle accounting to cgroups too.

   - Fix/improve the RSEQ ABI to not just silently accept unknown flags.
     (No existing tooling is known to have learned to rely on the
     previous behavior.)

   - Deprecate the (unused) RSEQ_CS_FLAG_NO_RESTART_ON_* flags.

  Optimizations:

   - Optimize & simplify leaf_cfs_rq_list()

   - Micro-optimize set_nr_{and_not,if}_polling() via try_cmpxchg().

  Misc fixes & cleanups:

   - Fix the RSEQ self-tests on RISC-V and Glibc 2.35 systems.

   - Fix a full-NOHZ bug that can in some cases result in the tick not
     being re-enabled when the last SCHED_RT task is gone from a
     runqueue but there's still SCHED_OTHER tasks around.

   - Various PREEMPT_RT related fixes.

   - Misc cleanups & smaller fixes"

* tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  rseq: Kill process when unknown flags are encountered in ABI structures
  rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* flags
  sched/core: Fix the bug that task won't enqueue into core tree when update cookie
  nohz/full, sched/rt: Fix missed tick-reenabling bug in dequeue_task_rt()
  sched/core: Always flush pending blk_plug
  sched/fair: fix case with reduced capacity CPU
  sched/core: Use try_cmpxchg in set_nr_{and_not,if}_polling
  sched/core: add forced idle accounting for cgroups
  sched/fair: Remove the energy margin in feec()
  sched/fair: Remove task_util from effective utilization in feec()
  sched/fair: Use the same cpumask per-PD throughout find_energy_efficient_cpu()
  sched/fair: Rename select_idle_mask to select_rq_mask
  sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util()
  sched/fair: Decay task PELT values during wakeup migration
  sched/fair: Provide u64 read for 32-bits arch helper
  sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of util_avg
  sched: only perform capability check on privileged operation
  sched: Remove unused function group_first_cpu()
  sched/fair: Remove redundant word " *"
  selftests/rseq: check if libc rseq support is registered
  ...
parents 0dd1cabe c17a6ff9
......@@ -71,34 +71,19 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power)
{
unsigned long max = 0, sum_util = 0;
unsigned long max, sum_util = 0;
int cpu;
for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
/*
* The capacity is the same for all CPUs belonging to
* the same perf domain, so a single call to
* arch_scale_cpu_capacity() is enough. However, we
* need the CPU parameter to be initialized by the
* loop, so the call ends up in this block.
*
* We can initialize 'max' with a cpumask_first() call
* before the loop but the bits computation is not
* worth given the arch_scale_cpu_capacity() just
* returns a value where the resulting assembly code
* will be optimized by the compiler.
*/
max = arch_scale_cpu_capacity(cpu);
sum_util += sched_cpu_util(cpu, max);
}
/*
* In the improbable case where all the CPUs of the perf
* domain are offline, 'max' will be zero and will lead to an
* illegal operation with a zero division.
* The capacity is the same for all CPUs belonging to
* the same perf domain.
*/
return max ? (power * ((sum_util << 10) / max)) >> 10 : 0;
max = arch_scale_cpu_capacity(cpumask_first(pd_mask));
for_each_cpu_and(cpu, pd_mask, cpu_online_mask)
sum_util += sched_cpu_util(cpu);
return (power * ((sum_util << 10) / max)) >> 10;
}
static u64 get_pd_power_uw(struct dtpm *dtpm)
......
......@@ -137,11 +137,9 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
int cpu_idx)
{
unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long util;
unsigned long util = sched_cpu_util(cpu);
util = sched_cpu_util(cpu, max);
return (util * 100) / max;
return (util * 100) / arch_scale_cpu_capacity(cpu);
}
#else /* !CONFIG_SMP */
static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
......
......@@ -288,6 +288,10 @@ struct css_set {
struct cgroup_base_stat {
struct task_cputime cputime;
#ifdef CONFIG_SCHED_CORE
u64 forceidle_sum;
#endif
};
/*
......
......@@ -28,6 +28,9 @@ enum cpu_usage_stat {
CPUTIME_STEAL,
CPUTIME_GUEST,
CPUTIME_GUEST_NICE,
#ifdef CONFIG_SCHED_CORE
CPUTIME_FORCEIDLE,
#endif
NR_STATS,
};
......@@ -115,4 +118,8 @@ extern void account_process_tick(struct task_struct *, int user);
extern void account_idle_ticks(unsigned long ticks);
#ifdef CONFIG_SCHED_CORE
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif
#endif /* _LINUX_KERNEL_STAT_H */
......@@ -2257,7 +2257,7 @@ static inline bool owner_on_cpu(struct task_struct *owner)
}
/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu, unsigned long max);
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
#ifdef CONFIG_RSEQ
......
......@@ -39,20 +39,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
}
extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return tsk->pi_blocked_on != NULL;
}
#else
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
}
# define rt_mutex_adjust_pi(p) do { } while (0)
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return false;
}
#endif
extern void normalize_rt_tasks(void);
......
......@@ -81,6 +81,7 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
int nr_idle_scan;
};
struct sched_domain {
......
......@@ -310,6 +310,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
......@@ -318,6 +321,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
......@@ -398,6 +404,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
#ifdef CONFIG_SCHED_CORE
case CPUTIME_FORCEIDLE:
rstatc->bstat.forceidle_sum += delta_exec;
break;
#endif
default:
break;
}
......@@ -411,8 +422,9 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
static void root_cgroup_cputime(struct task_cputime *cputime)
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
struct task_cputime *cputime = &bstat->cputime;
int i;
cputime->stime = 0;
......@@ -438,6 +450,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
#ifdef CONFIG_SCHED_CORE
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
}
}
......@@ -445,27 +461,43 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
struct task_cputime cputime;
struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
u64 forceidle_time;
#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
#ifdef CONFIG_SCHED_CORE
forceidle_time = cgrp->bstat.forceidle_sum;
#endif
cgroup_rstat_flush_release();
} else {
root_cgroup_cputime(&cputime);
usage = cputime.sum_exec_runtime;
utime = cputime.utime;
stime = cputime.stime;
root_cgroup_cputime(&bstat);
usage = bstat.cputime.sum_exec_runtime;
utime = bstat.cputime.utime;
stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
forceidle_time = bstat.forceidle_sum;
#endif
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
do_div(forceidle_time, NSEC_PER_USEC);
#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
#ifdef CONFIG_SCHED_CORE
seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}
......@@ -18,8 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
/*
*
......@@ -175,23 +176,15 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
u32 flags, event_mask;
int ret;
if (WARN_ON_ONCE(cs_flags & RSEQ_CS_NO_RESTART_FLAGS) || cs_flags)
return -EINVAL;
/* Get thread flags. */
ret = get_user(flags, &t->rseq->flags);
if (ret)
return ret;
/* Take critical section flags into account. */
flags |= cs_flags;
/*
* Restart on signal can only be inhibited when restart on
* preempt and restart on migrate are inhibited too. Otherwise,
* a preempted signal handler could fail to restart the prior
* execution context on sigreturn.
*/
if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
(flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
if (WARN_ON_ONCE(flags & RSEQ_CS_NO_RESTART_FLAGS) || flags)
return -EINVAL;
/*
......@@ -203,7 +196,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
t->rseq_event_mask = 0;
preempt_enable();
return !!(event_mask & ~flags);
return !!event_mask;
}
static int clear_rseq_cs(struct task_struct *t)
......
This diff is collapsed.
......@@ -56,7 +56,6 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
unsigned long old_cookie;
struct rq_flags rf;
struct rq *rq;
bool enqueued;
rq = task_rq_lock(p, &rf);
......@@ -68,14 +67,16 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
*/
SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
enqueued = sched_core_enqueued(p);
if (enqueued)
if (sched_core_enqueued(p))
sched_core_dequeue(rq, p, DEQUEUE_SAVE);
old_cookie = p->core_cookie;
p->core_cookie = cookie;
if (enqueued)
/*
* Consider the cases: !prev_cookie and !cookie.
*/
if (cookie && task_on_rq_queued(p))
sched_core_enqueue(rq, p);
/*
......@@ -277,7 +278,11 @@ void __sched_core_account_forceidle(struct rq *rq)
if (p == rq_i->idle)
continue;
__schedstat_add(p->stats.core_forceidle_sum, delta);
/*
* Note: this will account forceidle to the current cpu, even
* if it comes from our SMT sibling.
*/
__account_forceidle_time(p, delta);
}
}
......
......@@ -157,11 +157,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->max = max;
sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
}
......
......@@ -226,6 +226,21 @@ void account_idle_time(u64 cputime)
cpustat[CPUTIME_IDLE] += cputime;
}
#ifdef CONFIG_SCHED_CORE
/*
* Account for forceidle time due to core scheduling.
*
* REQUIRES: schedstat is enabled.
*/
void __account_forceidle_time(struct task_struct *p, u64 delta)
{
__schedstat_add(p->stats.core_forceidle_sum, delta);
task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
#endif
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
......
......@@ -30,14 +30,16 @@ static struct ctl_table sched_dl_sysctls[] = {
.data = &sysctl_sched_dl_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
.proc_handler = proc_douintvec_minmax,
.extra1 = (void *)&sysctl_sched_dl_period_min,
},
{
.procname = "sched_deadline_period_min_us",
.data = &sysctl_sched_dl_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
.proc_handler = proc_douintvec_minmax,
.extra2 = (void *)&sysctl_sched_dl_period_max,
},
{}
};
......
This diff is collapsed.
......@@ -60,7 +60,8 @@ SCHED_FEAT(TTWU_QUEUE, true)
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
SCHED_FEAT(SIS_PROP, true)
SCHED_FEAT(SIS_PROP, false)
SCHED_FEAT(SIS_UTIL, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
......
......@@ -61,6 +61,25 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
static inline u64 rq_clock_pelt(struct rq *rq)
{
lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_pelt - rq->lost_idle_time;
}
/* The rq is idle, we can sync to clock_task */
static inline void _update_idle_rq_clock_pelt(struct rq *rq)
{
rq->clock_pelt = rq_clock_task(rq);
u64_u32_store(rq->clock_idle, rq_clock(rq));
/* Paired with smp_rmb in migrate_se_pelt_lag() */
smp_wmb();
u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
}
/*
* The clock_pelt scales the time to reflect the effective amount of
* computation done during the running delta time but then sync back to
......@@ -76,8 +95,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
{
if (unlikely(is_idle_task(rq->curr))) {
/* The rq is idle, we can sync to clock_task */
rq->clock_pelt = rq_clock_task(rq);
_update_idle_rq_clock_pelt(rq);
return;
}
......@@ -130,17 +148,23 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
*/
if (util_sum >= divider)
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
_update_idle_rq_clock_pelt(rq);
}
static inline u64 rq_clock_pelt(struct rq *rq)
#ifdef CONFIG_CFS_BANDWIDTH
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
u64 throttled;
return rq->clock_pelt - rq->lost_idle_time;
if (unlikely(cfs_rq->throttle_count))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
}
#ifdef CONFIG_CFS_BANDWIDTH
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
......@@ -150,6 +174,7 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
......@@ -204,6 +229,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { }
static inline void
update_idle_rq_clock_pelt(struct rq *rq) { }
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
#endif
......@@ -480,7 +480,7 @@ static inline void rt_queue_push_tasks(struct rq *rq)
#endif /* CONFIG_SMP */
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
......@@ -601,7 +601,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (!rt_se) {
dequeue_top_rt_rq(rt_rq);
dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
}
......@@ -687,7 +687,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
dequeue_top_rt_rq(rt_rq);
dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
......@@ -1089,7 +1089,7 @@ static void update_curr_rt(struct rq *rq)
}
static void
dequeue_top_rt_rq(struct rt_rq *rt_rq)
dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
......@@ -1100,7 +1100,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
BUG_ON(!rq->nr_running);
sub_nr_running(rq, rt_rq->rt_nr_running);
sub_nr_running(rq, count);
rt_rq->rt_queued = 0;
}
......@@ -1486,18 +1486,21 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
unsigned int rt_nr_running;
for_each_sched_rt_entity(rt_se) {
rt_se->back = back;
back = rt_se;
}
dequeue_top_rt_rq(rt_rq_of_se(back));
rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se, flags);
}
dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
......
......@@ -520,6 +520,45 @@ struct cfs_bandwidth { };
#endif /* CONFIG_CGROUP_SCHED */
/*
* u64_u32_load/u64_u32_store
*
* Use a copy of a u64 value to protect against data race. This is only
* applicable for 32-bits architectures.
*/
#ifdef CONFIG_64BIT
# define u64_u32_load_copy(var, copy) var
# define u64_u32_store_copy(var, copy, val) (var = val)
#else
# define u64_u32_load_copy(var, copy) \
({ \
u64 __val, __val_copy; \
do { \
__val_copy = copy; \
/* \
* paired with u64_u32_store_copy(), ordering access \
* to var and copy. \
*/ \
smp_rmb(); \
__val = var; \
} while (__val != __val_copy); \
__val; \
})
# define u64_u32_store_copy(var, copy, val) \
do { \
typeof(val) __val = (val); \
var = __val; \
/* \
* paired with u64_u32_load_copy(), ordering access to var and \
* copy. \
*/ \
smp_wmb(); \
copy = __val; \
} while (0)
#endif
# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
......@@ -560,7 +599,7 @@ struct cfs_rq {
*/
struct sched_avg avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
u64 last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
......@@ -609,6 +648,10 @@ struct cfs_rq {
int runtime_enabled;
s64 runtime_remaining;
u64 throttled_pelt_idle;
#ifndef CONFIG_64BIT
u64 throttled_pelt_idle_copy;
#endif
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
......@@ -981,6 +1024,12 @@ struct rq {
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
unsigned long lost_idle_time;
u64 clock_pelt_idle;
u64 clock_idle;
#ifndef CONFIG_64BIT
u64 clock_pelt_idle_copy;
u64 clock_idle_copy;
#endif
atomic_t nr_iowait;
......@@ -1815,15 +1864,6 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
return to_cpumask(sg->sgc->cpumask);
}
/**
* group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
* @group: The group whose first CPU is to be returned.
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
return cpumask_first(sched_group_span(group));
}
extern int group_balance_cpu(struct sched_group *sg);
#ifdef CONFIG_SCHED_DEBUG
......@@ -2044,7 +2084,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
......@@ -2852,7 +2891,7 @@ enum cpu_util_type {
};
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long max, enum cpu_util_type type,
enum cpu_util_type type,
struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
......
......@@ -2316,23 +2316,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/*
* For a single LLC per node, allow an
* imbalance up to 25% of the node. This is an
* arbitrary cutoff based on SMT-2 to balance
* between memory bandwidth and avoiding
* premature sharing of HT resources and SMT-4
* or SMT-8 *may* benefit from a different
* cutoff.
* imbalance up to 12.5% of the node. This is an
* arbitrary cutoff based on two factors -- SMT and
* memory channels. For SMT-2, the intent is to
* avoid premature sharing of HT resources but
* SMT-4 or SMT-8 *may* benefit from a different
* cutoff. For memory channels, this is a very
* rough estimate of how many channels may be
* active and is based on recent CPUs with
* many cores.
*
* For multiple LLCs, allow an imbalance
* until multiple tasks would share an LLC
* on one node while LLCs on another node
* remain idle.
* remain idle. This assumes that there are
* enough logical CPUs per LLC to avoid SMT
* factors and that there is a correlation
* between LLCs and memory channels.
*/
nr_llcs = sd->span_weight / child->span_weight;
if (nr_llcs == 1)
imb = sd->span_weight >> 2;
imb = sd->span_weight >> 3;
else
imb = nr_llcs;
imb = max(1U, imb);
sd->imb_numa_nr = imb;
/* Set span based on the first NUMA domain. */
......
......@@ -86,7 +86,7 @@ do { \
#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
RSEQ_INJECT_ASM(1) \
"la "RSEQ_ASM_TMP_REG_1 ", " __rseq_str(cs_label) "\n" \
"la " RSEQ_ASM_TMP_REG_1 ", " __rseq_str(cs_label) "\n" \
REG_S RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(rseq_cs) "]\n" \
__rseq_str(label) ":\n"
......@@ -103,17 +103,17 @@ do { \
#define RSEQ_ASM_OP_CMPEQ(var, expect, label) \
REG_L RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \
"bne "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
"bne " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
__rseq_str(label) "\n"
#define RSEQ_ASM_OP_CMPEQ32(var, expect, label) \
"lw "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \
"bne "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
"lw " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \
"bne " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
__rseq_str(label) "\n"
#define RSEQ_ASM_OP_CMPNE(var, expect, label) \
REG_L RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \
"beq "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
"beq " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
__rseq_str(label) "\n"
#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
......@@ -127,12 +127,12 @@ do { \
REG_S RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n"
#define RSEQ_ASM_OP_R_LOAD_OFF(offset) \
"add "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(offset) "], " \
"add " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(offset) "], " \
RSEQ_ASM_TMP_REG_1 "\n" \
REG_L RSEQ_ASM_TMP_REG_1 ", (" RSEQ_ASM_TMP_REG_1 ")\n"
#define RSEQ_ASM_OP_R_ADD(count) \
"add "RSEQ_ASM_TMP_REG_1 ", " RSEQ_ASM_TMP_REG_1 \
"add " RSEQ_ASM_TMP_REG_1 ", " RSEQ_ASM_TMP_REG_1 \
", %[" __rseq_str(count) "]\n"
#define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label) \
......@@ -194,8 +194,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[v] "m" (*v),
[expect] "r" (expect),
[newv] "r" (newv)
......@@ -251,8 +251,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[v] "m" (*v),
[expectnot] "r" (expectnot),
[load] "m" (*load),
......@@ -301,8 +301,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[v] "m" (*v),
[count] "r" (count)
RSEQ_INJECT_INPUT
......@@ -352,8 +352,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[expect] "r" (expect),
[v] "m" (*v),
[newv] "r" (newv),
......@@ -411,8 +411,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[expect] "r" (expect),
[v] "m" (*v),
[newv] "r" (newv),
......@@ -472,8 +472,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[v] "m" (*v),
[expect] "r" (expect),
[v2] "m" (*v2),
......@@ -532,8 +532,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[expect] "r" (expect),
[v] "m" (*v),
[newv] "r" (newv),
......@@ -593,8 +593,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[expect] "r" (expect),
[v] "m" (*v),
[newv] "r" (newv),
......@@ -651,8 +651,8 @@ int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[current_cpu_id] "m" (__rseq_abi.cpu_id),
[rseq_cs] "m" (__rseq_abi.rseq_cs),
[current_cpu_id] "m" (rseq_get_abi()->cpu_id),
[rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[ptr] "r" (ptr),
[off] "er" (off),
[inc] "er" (inc)
......
......@@ -111,7 +111,8 @@ void rseq_init(void)
libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p) {
if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
*libc_rseq_size_p != 0) {
/* rseq registration owned by glibc */
rseq_offset = *libc_rseq_offset_p;
rseq_size = *libc_rseq_size_p;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment