Commit ec2c2976 authored by Paul E. McKenney's avatar Paul E. McKenney

rcu: Fix grace-period hangs from mid-init task resume

Without special fail-safe quiescent-state-propagation checks, grace-period
hangs can result from the following scenario:

1.	A task running on a given CPU is preempted in its RCU read-side
	critical section.

2.	That CPU goes offline, and there are now no online CPUs
	corresponding to that CPU's leaf rcu_node structure.

3.	The rcu_gp_init() function does the first phase of grace-period
	initialization, and sets the aforementioned leaf rcu_node
	structure's ->qsmaskinit field to all zeroes.  Because there
	is a blocked task, it does not propagate the zeroing of either
	->qsmaskinit or ->qsmaskinitnext up the rcu_node tree.

4.	The task resumes on some other CPU and exits its critical section.
	There is no grace period in progress, so the resulting quiescent
	state is not reported up the tree.

5.	The rcu_gp_init() function does the second phase of grace-period
	initialization, which results in the leaf rcu_node structure
	being initialized to expect no further quiescent states, but
	with that structure's parent expecting a quiescent-state report.

	The parent will never receive a quiescent state from this leaf
	rcu_node structure, so the grace period will hang, resulting in
	RCU CPU stall warnings.

It would be good to get rid of the special fail-safe quiescent-state
propagation checks.  This commit therefore checks the leaf rcu_node
structure's ->wait_blkd_tasks field during grace-period initialization.
If this flag is set, the rcu_report_qs_rnp() is invoked to immediately
report the possible quiescent state.  While in the neighborhood, this
commit also report quiescent states for any CPUs that went offline between
the two phases of grace-period initialization, thus reducing grace-period
delays and hopefully eventually allowing removal of offline-CPU checks
from the force-quiescent-state code path.
Signed-off-by: default avatarPaul E. McKenney <paulmck@linux.vnet.ibm.com>
parent 0b107d24
...@@ -154,6 +154,9 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); ...@@ -154,6 +154,9 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/ */
static int rcu_scheduler_fully_active __read_mostly; static int rcu_scheduler_fully_active __read_mostly;
static void
rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
struct rcu_node *rnp, unsigned long gps, unsigned long flags);
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
...@@ -1858,7 +1861,9 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) ...@@ -1858,7 +1861,9 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay)
*/ */
static bool rcu_gp_init(struct rcu_state *rsp) static bool rcu_gp_init(struct rcu_state *rsp)
{ {
unsigned long flags;
unsigned long oldmask; unsigned long oldmask;
unsigned long mask;
struct rcu_data *rdp; struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp); struct rcu_node *rnp = rcu_get_root(rsp);
...@@ -1951,7 +1956,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) ...@@ -1951,7 +1956,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
*/ */
rcu_for_each_node_breadth_first(rsp, rnp) { rcu_for_each_node_breadth_first(rsp, rnp) {
rcu_gp_slow(rsp, gp_init_delay); rcu_gp_slow(rsp, gp_init_delay);
raw_spin_lock_irq_rcu_node(rnp); raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp = this_cpu_ptr(rsp->rda); rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp); rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit; rnp->qsmask = rnp->qsmaskinit;
...@@ -1962,6 +1967,11 @@ static bool rcu_gp_init(struct rcu_state *rsp) ...@@ -1962,6 +1967,11 @@ static bool rcu_gp_init(struct rcu_state *rsp)
trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, trace_rcu_grace_period_init(rsp->name, rnp->gp_seq,
rnp->level, rnp->grplo, rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask); rnp->grphi, rnp->qsmask);
/* Quiescent states for tasks on any now-offline CPUs. */
mask = rnp->qsmask & ~rnp->qsmaskinitnext;
if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags);
else
raw_spin_unlock_irq_rcu_node(rnp); raw_spin_unlock_irq_rcu_node(rnp);
cond_resched_tasks_rcu_qs(); cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies); WRITE_ONCE(rsp->gp_activity, jiffies);
...@@ -2233,6 +2243,10 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) ...@@ -2233,6 +2243,10 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
* is the grace-period snapshot, which means that the quiescent states * is the grace-period snapshot, which means that the quiescent states
* are valid only if rnp->gp_seq is equal to gps. That structure's lock * are valid only if rnp->gp_seq is equal to gps. That structure's lock
* must be held upon entry, and it is released before return. * must be held upon entry, and it is released before return.
*
* As a special case, if mask is zero, the bit-already-cleared check is
* disabled. This allows propagating quiescent state due to resumed tasks
* during grace-period initialization.
*/ */
static void static void
rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
...@@ -2246,7 +2260,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, ...@@ -2246,7 +2260,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
/* Walk up the rcu_node hierarchy. */ /* Walk up the rcu_node hierarchy. */
for (;;) { for (;;) {
if (!(rnp->qsmask & mask) || rnp->gp_seq != gps) { if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) {
/* /*
* Our bit has already been cleared, or the * Our bit has already been cleared, or the
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment