Commit df1e849a authored by Paul E. McKenney

rcu: Enable tick for nohz_full CPUs slow to provide expedited QS

An expedited grace period can be stalled by a nohz_full CPU looping
in kernel context.  This possibility is currently handled by some
carefully crafted checks in rcu_read_unlock_special() that enlist help
from ksoftirqd when permitted by the scheduler.  However, it is exactly
these checks that require that the scheduler avoid holding any of its rq or
pi locks across rcu_read_unlock() without also having held them across
the entire RCU read-side critical section.

It would therefore be very nice if expedited grace periods could
handle nohz_full CPUs looping in kernel context without such checks.
This commit therefore adds code to the expedited grace period's wait
and cleanup code that forces the scheduler-clock interrupt on for CPUs
that fail to quickly supply a quiescent state.  "Quickly" is currently
a hard-coded single-jiffy delay.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
parent 28f0361f
...@@ -109,8 +109,10 @@ enum tick_dep_bits { ...@@ -109,8 +109,10 @@ enum tick_dep_bits {
TICK_DEP_BIT_PERF_EVENTS = 1, TICK_DEP_BIT_PERF_EVENTS = 1,
TICK_DEP_BIT_SCHED = 2, TICK_DEP_BIT_SCHED = 2,
TICK_DEP_BIT_CLOCK_UNSTABLE = 3, TICK_DEP_BIT_CLOCK_UNSTABLE = 3,
TICK_DEP_BIT_RCU = 4 TICK_DEP_BIT_RCU = 4,
TICK_DEP_BIT_RCU_EXP = 5
}; };
#define TICK_DEP_BIT_MAX TICK_DEP_BIT_RCU_EXP
#define TICK_DEP_MASK_NONE 0 #define TICK_DEP_MASK_NONE 0
#define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER)
...@@ -118,6 +120,7 @@ enum tick_dep_bits { ...@@ -118,6 +120,7 @@ enum tick_dep_bits {
#define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED)
#define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
#define TICK_DEP_MASK_RCU (1 << TICK_DEP_BIT_RCU) #define TICK_DEP_MASK_RCU (1 << TICK_DEP_BIT_RCU)
#define TICK_DEP_MASK_RCU_EXP (1 << TICK_DEP_BIT_RCU_EXP)
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
extern bool tick_nohz_enabled; extern bool tick_nohz_enabled;
......
...@@ -182,6 +182,7 @@ struct rcu_data { ...@@ -182,6 +182,7 @@ struct rcu_data {
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */ bool rcu_forced_tick; /* Forced tick to provide QS. */
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
#ifdef CONFIG_RCU_FAST_NO_HZ #ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* All CPU's CBs lazy at idle start? */ bool all_lazy; /* All CPU's CBs lazy at idle start? */
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
......
...@@ -230,7 +230,9 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) ...@@ -230,7 +230,9 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
unsigned long mask, bool wake) unsigned long mask, bool wake)
{ {
int cpu;
unsigned long flags; unsigned long flags;
struct rcu_data *rdp;
raw_spin_lock_irqsave_rcu_node(rnp, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!(rnp->expmask & mask)) { if (!(rnp->expmask & mask)) {
...@@ -238,6 +240,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, ...@@ -238,6 +240,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
return; return;
} }
WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = false;
tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
}
__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
} }
...@@ -449,6 +458,26 @@ static void sync_rcu_exp_select_cpus(void) ...@@ -449,6 +458,26 @@ static void sync_rcu_exp_select_cpus(void)
flush_work(&rnp->rew.rew_work); flush_work(&rnp->rew.rew_work);
} }
/*
 * Wait for the expedited grace period to elapse, but for no longer than
 * tlimit, which is handed to swait_event_timeout_exclusive() as its
 * timeout (presumably a jiffies count -- confirm against callers).
 * Return true if the grace period has completed, or false if the time
 * limit was exceeded without the grace period elapsing.
 */
static bool synchronize_rcu_expedited_wait_once(long tlimit)
{
	long t;	/* Same type as tlimit so the wait result is not narrowed. */
	struct rcu_node *rnp_root = rcu_get_root();

	t = swait_event_timeout_exclusive(rcu_state.expedited_wq,
					  sync_rcu_exp_done_unlocked(rnp_root),
					  tlimit);
	/*
	 * Recheck the completion condition directly in case the wait
	 * returned without reporting success.
	 */
	if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root))
		return true;
	WARN_ON(t < 0);  /* workqueues should not be signaled. */
	return false;
}
/* /*
* Wait for the expedited grace period to elapse, issuing any needed * Wait for the expedited grace period to elapse, issuing any needed
* RCU CPU stall warnings along the way. * RCU CPU stall warnings along the way.
...@@ -460,22 +489,31 @@ static void synchronize_rcu_expedited_wait(void) ...@@ -460,22 +489,31 @@ static void synchronize_rcu_expedited_wait(void)
unsigned long jiffies_start; unsigned long jiffies_start;
unsigned long mask; unsigned long mask;
int ndetected; int ndetected;
struct rcu_data *rdp;
struct rcu_node *rnp; struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root(); struct rcu_node *rnp_root = rcu_get_root();
int ret;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_stall = rcu_jiffies_till_stall_check();
jiffies_start = jiffies; jiffies_start = jiffies;
if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
}
}
WARN_ON_ONCE(1);
}
for (;;) { for (;;) {
ret = swait_event_timeout_exclusive( if (synchronize_rcu_expedited_wait_once(jiffies_stall))
rcu_state.expedited_wq,
sync_rcu_exp_done_unlocked(rnp_root),
jiffies_stall);
if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root))
return; return;
WARN_ON(ret < 0); /* workqueues should not be signaled. */
if (rcu_cpu_stall_suppress) if (rcu_cpu_stall_suppress)
continue; continue;
panic_on_rcu_stall(); panic_on_rcu_stall();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment