Commit 8ad075c2, authored by Josh Don, committed by Peter Zijlstra

sched: Async unthrottling for cfs bandwidth

CFS bandwidth currently distributes new runtime and unthrottles cfs_rq's
inline in an hrtimer callback. Runtime distribution is a per-cpu
operation, and unthrottling is a per-cgroup operation, since a tg walk
is required. On machines with a large number of cpus and large cgroup
hierarchies, this cpus*cgroups work can be too much to do in a single
hrtimer callback: since IRQs are disabled, hard lockups may easily occur.
Specifically, we've found this scalability issue on configurations with
256 cpus, O(1000) cgroups in the hierarchy being throttled, and high
memory bandwidth usage.

To fix this, we can instead unthrottle cfs_rq's asynchronously via a
CSD. Each cpu is responsible for unthrottling itself, thus sharding the
total work more fairly across the system, and avoiding hard lockups.
Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221117005418.3499691-1-joshdon@google.com
parent 9a5322db
...@@ -5461,22 +5461,105 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) ...@@ -5461,22 +5461,105 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq); resched_curr(rq);
} }
static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) #ifdef CONFIG_SMP
/*
 * CSD callback that drains one runqueue's list of cfs_rq's awaiting an
 * asynchronous unthrottle.
 *
 * @arg: the struct rq whose cfsb_csd_list is to be processed.
 *
 * Takes the rq lock, unlinks every entry on rq->cfsb_csd_list and
 * unthrottles the ones that are still throttled.
 *
 * NOTE(review): the lock is taken with plain rq_lock(), and the direct
 * caller in destroy_cfs_bandwidth() disables interrupts around the call;
 * presumably this function must always run with IRQs off -- confirm.
 */
static void __cfsb_csd_unthrottle(void *arg)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cursor, *tmp;
struct rq *rq = arg;
struct rq_flags rf;
rq_lock(rq, &rf);
/*
* Since we hold rq lock we're safe from concurrent manipulation of
* the CSD list. However, this RCU critical section annotates the
* fact that we pair with sched_free_group_rcu(), so that we cannot
* race with group being freed in the window between removing it
* from the list and advancing to the next entry in the list.
*/
rcu_read_lock();
list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
throttled_csd_list) {
/*
 * list_del_init() leaves the node self-linked, so the
 * list_empty(&cfs_rq->throttled_csd_list) "already queued" tests
 * used by the queuing paths read true again for this cfs_rq.
 */
list_del_init(&cursor->throttled_csd_list);
/* Only unthrottle entries that are still throttled at this point. */
if (cfs_rq_throttled(cursor))
unthrottle_cfs_rq(cursor);
}
rcu_read_unlock();
rq_unlock(rq, &rf);
}
/*
 * Hand @cfs_rq to the CPU that owns it for unthrottling.
 *
 * A cfs_rq on the current CPU's runqueue is unthrottled immediately.
 * Otherwise it is appended to the owning runqueue's cfsb_csd_list, and an
 * asynchronous cross-call is issued only when that list was empty before
 * the append.
 */
static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
	struct rq *owner = rq_of(cfs_rq);
	struct list_head *pending = &owner->cfsb_csd_list;
	bool was_empty;

	/* Local runqueue: nothing to defer. */
	if (owner == this_rq()) {
		unthrottle_cfs_rq(cfs_rq);
		return;
	}

	/* Already enqueued */
	if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
		return;

	/* Sample emptiness before linking; only the first entry kicks the CSD. */
	was_empty = list_empty(pending);
	list_add_tail(&cfs_rq->throttled_csd_list, pending);
	if (!was_empty)
		return;

	smp_call_function_single_async(cpu_of(owner), &owner->cfsb_csd);
}
#else
/*
 * Variant used when CONFIG_SMP is not set: no CSD list is involved, the
 * cfs_rq is unthrottled synchronously.
 */
static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
unthrottle_cfs_rq(cfs_rq);
}
#endif
/*
 * Checked entry point for asynchronous unthrottling of @cfs_rq.
 *
 * Asserts (via lockdep) that the lock of @cfs_rq's runqueue is held.
 * Warns and returns without doing anything if @cfs_rq is not throttled or
 * its runtime_remaining is not positive; otherwise forwards to
 * __unthrottle_cfs_rq_async().
 */
static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
lockdep_assert_rq_held(rq_of(cfs_rq));
/* Callers are expected to have granted runtime to a throttled cfs_rq. */
if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
cfs_rq->runtime_remaining <= 0))
return;
__unthrottle_cfs_rq_async(cfs_rq);
}
static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
struct cfs_rq *local_unthrottle = NULL;
int this_cpu = smp_processor_id();
u64 runtime, remaining = 1; u64 runtime, remaining = 1;
bool throttled = false;
struct cfs_rq *cfs_rq;
struct rq_flags rf;
struct rq *rq;
rcu_read_lock(); rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) { throttled_list) {
struct rq *rq = rq_of(cfs_rq); rq = rq_of(cfs_rq);
struct rq_flags rf;
if (!remaining) {
throttled = true;
break;
}
rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq)) if (!cfs_rq_throttled(cfs_rq))
goto next; goto next;
/* By the above check, this should never be true */ #ifdef CONFIG_SMP
/* Already queued for async unthrottle */
if (!list_empty(&cfs_rq->throttled_csd_list))
goto next;
#endif
/* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock); raw_spin_lock(&cfs_b->lock);
...@@ -5490,16 +5573,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) ...@@ -5490,16 +5573,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime; cfs_rq->runtime_remaining += runtime;
/* we check whether we're throttled above */ /* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0) if (cfs_rq->runtime_remaining > 0) {
unthrottle_cfs_rq(cfs_rq); if (cpu_of(rq) != this_cpu ||
SCHED_WARN_ON(local_unthrottle))
unthrottle_cfs_rq_async(cfs_rq);
else
local_unthrottle = cfs_rq;
} else {
throttled = true;
}
next: next:
rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
} }
rcu_read_unlock(); rcu_read_unlock();
if (local_unthrottle) {
rq = cpu_rq(this_cpu);
rq_lock_irqsave(rq, &rf);
if (cfs_rq_throttled(local_unthrottle))
unthrottle_cfs_rq(local_unthrottle);
rq_unlock_irqrestore(rq, &rf);
}
return throttled;
} }
/* /*
...@@ -5544,10 +5641,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u ...@@ -5544,10 +5641,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) { while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags); raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */ /* we can't nest cfs_b->lock while distributing bandwidth */
distribute_cfs_runtime(cfs_b); throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags); raw_spin_lock_irqsave(&cfs_b->lock, flags);
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
} }
/* /*
...@@ -5824,6 +5919,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) ...@@ -5824,6 +5919,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{ {
cfs_rq->runtime_enabled = 0; cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_list);
#ifdef CONFIG_SMP
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
#endif
} }
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
...@@ -5840,12 +5938,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) ...@@ -5840,12 +5938,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
/*
 * Tear down the bandwidth state of @cfs_b.
 *
 * Returns early if the throttled_cfs_rq list head was never initialised.
 * Otherwise cancels the period and slack timers and, on CONFIG_SMP, runs
 * the CSD unthrottle handler inline for every possible CPU whose
 * cfsb_csd_list is non-empty.
 */
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{ {
/* Only referenced inside the CONFIG_SMP section below. */
int __maybe_unused i;
/* init_cfs_bandwidth() was not called */ /* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next) if (!cfs_b->throttled_cfs_rq.next)
return; return;
hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer); hrtimer_cancel(&cfs_b->slack_timer);
/*
* It is possible that we still have some cfs_rq's pending on a CSD
* list, though this race is very rare. In order for this to occur, we
* must have raced with the last task leaving the group while there
* exist throttled cfs_rq(s), and the period_timer must have queued the
* CSD item but the remote cpu has not yet processed it. To handle this,
* we can simply flush all pending CSD work inline here. We're
* guaranteed at this point that no additional cfs_rq of this group can
* join a CSD list.
*/
#ifdef CONFIG_SMP
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
unsigned long flags;
/* Unlocked peek: skip CPUs that have nothing queued. */
if (list_empty(&rq->cfsb_csd_list))
continue;
/*
 * NOTE(review): interrupts are disabled around the call presumably
 * because __cfsb_csd_unthrottle() takes the rq lock with plain
 * rq_lock() -- confirm.
 */
local_irq_save(flags);
__cfsb_csd_unthrottle(rq);
local_irq_restore(flags);
}
#endif
} }
/* /*
...@@ -12474,6 +12598,11 @@ __init void init_sched_fair_class(void) ...@@ -12474,6 +12598,11 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
#endif
} }
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
......
...@@ -645,6 +645,9 @@ struct cfs_rq { ...@@ -645,6 +645,9 @@ struct cfs_rq {
int throttled; int throttled;
int throttle_count; int throttle_count;
struct list_head throttled_list; struct list_head throttled_list;
#ifdef CONFIG_SMP
struct list_head throttled_csd_list;
#endif
#endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
}; };
...@@ -1154,6 +1157,11 @@ struct rq { ...@@ -1154,6 +1157,11 @@ struct rq {
/* Scratch cpumask to be temporarily used under rq_lock */ /* Scratch cpumask to be temporarily used under rq_lock */
cpumask_var_t scratch_mask; cpumask_var_t scratch_mask;
#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
}; };
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment