/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 * Copyright IBM Corporation, 2008
 *
 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
 *	    Manfred Spraul <manfred@colorfullife.com>
 *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
 *
 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *	Documentation/RCU
 */

#define pr_fmt(fmt) "rcu: " fmt

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate_wait.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>

#include "tree.h"
#include "rcu.h"

#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutree."

/* Data structures. */

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data);
struct rcu_state rcu_state = {
	.level = { &rcu_state.node[0] },
	.gp_state = RCU_GP_IDLE,
	.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
	.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
	.name = RCU_NAME,
	.abbr = RCU_ABBR,
	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
	.ofl_lock = __SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
};

/* Dump rcu_node combining tree at boot to verify correct setup. */
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/* panic() on RCU Stall sysctl. */
int sysctl_panic_on_rcu_stall __read_mostly;

/*
 * The rcu_scheduler_active variable is initialized to the value
 * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before the
 * first task is spawned.  So when this variable is RCU_SCHEDULER_INACTIVE,
 * RCU can assume that there is but one task, allowing RCU to (for example)
 * optimize synchronize_rcu() to a simple barrier().  When this variable
 * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
 * to detect real grace periods.  This variable is also used to suppress
 * boot-time false positives from lockdep-RCU error checking.  Finally, it
 * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
 * is fully initialized, including all of its kthreads having been spawned.
 */
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);

/*
 * The rcu_scheduler_fully_active variable transitions from zero to one
 * during the early_initcall() processing, which is after the scheduler
 * is capable of creating new tasks.  So RCU processing (for example,
 * creating tasks for RCU priority boosting) must be delayed until after
 * rcu_scheduler_fully_active transitions from zero to one.  We also
 * currently delay invocation of any RCU callbacks until after this point.
 *
 * It might later prove better for people registering RCU callbacks during
 * early boot to take responsibility for these callbacks, but one step at
 * a time.
 */
static int rcu_scheduler_fully_active __read_mostly;

static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
			      unsigned long gps, unsigned long flags);
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_data *rdp);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);

/* rcuc/rcub kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0644);

/* Delay in jiffies for grace-period initialization delays, debug only. */

static int gp_preinit_delay;
module_param(gp_preinit_delay, int, 0444);
static int gp_init_delay;
module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);

/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
	return kthread_prio;
}
EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);

/*
 * Number of grace periods between delays, normalized by the duration of
 * the delay.  The longer the delay, the more the grace periods between
 * each delay.  The reason for this normalization is that it means that,
 * for non-zero delays, the overall slowdown of grace periods is constant
 * regardless of the duration of the delay.  This arrangement balances
 * the need for long delays to increase some race probabilities with the
 * need for fast grace periods to increase other race probabilities.
 */
#define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays. */

/*
 * Compute the mask of online CPUs for the specified rcu_node structure.
 * This will not be stable unless the rcu_node structure's ->lock is
 * held, but the bit corresponding to the current CPU will be stable
 * in most contexts.
 */
unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
	return READ_ONCE(rnp->qsmaskinitnext);
}

/*
 * Return true if an RCU grace period is in progress.  The READ_ONCE()s
 * permit this function to be invoked without holding the root rcu_node
 * structure's ->lock, but of course results can be subject to change.
 */
static int rcu_gp_in_progress(void)
{
	return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
}

void rcu_softirq_qs(void)
{
	rcu_qs();
	rcu_preempt_deferred_qs(current);
}

/*
 * Steal a bit from the bottom of ->dynticks for idle entry/exit
 * control.  Initially this is for TLB flushing.
 */
#define RCU_DYNTICK_CTRL_MASK 0x1
#define RCU_DYNTICK_CTRL_CTR  (RCU_DYNTICK_CTRL_MASK + 1)
#ifndef rcu_eqs_special_exit
#define rcu_eqs_special_exit() do { } while (0)
#endif
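/*
 * A rough sketch of the ->dynticks layout implied by the two macros above
 * and by rcu_dynticks_in_eqs() below (illustrative, not normative):
 *
 *	bit 0:      RCU_DYNTICK_CTRL_MASK, the "special action" request
 *	            (for example, a deferred TLB flush).
 *	bits 1..N:  the idle-entry/exit counter, advanced in steps of
 *	            RCU_DYNTICK_CTRL_CTR (0x2).
 *
 * A snapshot with the RCU_DYNTICK_CTRL_CTR bit clear (for example 0x0 or
 * 0x1) therefore denotes an extended quiescent state, while 0x2 or 0x3
 * denotes a CPU that RCU is currently watching.
 */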

static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
	.dynticks_nesting = 1,
	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};

/*
 * Record entry into an extended quiescent state.  This is only to be
 * called when not already in an extended quiescent state.
 */
static void rcu_dynticks_eqs_enter(void)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	int seq;

	/*
	 * CPUs seeing atomic_add_return() must see prior RCU read-side
	 * critical sections, and we also must force ordering with the
	 * next idle sojourn.
	 */
	seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
	/* Better be in an extended quiescent state! */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     (seq & RCU_DYNTICK_CTRL_CTR));
	/* Better not have special action (TLB flush) pending! */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     (seq & RCU_DYNTICK_CTRL_MASK));
}

/*
 * Record exit from an extended quiescent state.  This is only to be
 * called from an extended quiescent state.
 */
static void rcu_dynticks_eqs_exit(void)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	int seq;

	/*
	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
	 * and we also must force ordering with the next RCU read-side
	 * critical section.
	 */
	seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     !(seq & RCU_DYNTICK_CTRL_CTR));
	if (seq & RCU_DYNTICK_CTRL_MASK) {
		atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
		smp_mb__after_atomic(); /* _exit after clearing mask. */
		/* Prefer duplicate flushes to losing a flush. */
		rcu_eqs_special_exit();
	}
}

/*
 * Reset the current CPU's ->dynticks counter to indicate that the
 * newly onlined CPU is no longer in an extended quiescent state.
 * This will either leave the counter unchanged, or increment it
 * to the next non-quiescent value.
 *
 * The non-atomic test/increment sequence works because the upper bits
 * of the ->dynticks counter are manipulated only by the corresponding CPU,
 * or when the corresponding CPU is offline.
 */
static void rcu_dynticks_eqs_online(void)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

	if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
		return;
	atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
}

/*
 * Is the current CPU in an extended quiescent state?
 *
 * No ordering, as we are sampling CPU-local information.
 */
bool rcu_dynticks_curr_cpu_in_eqs(void)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

	return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
}

/*
 * Snapshot the ->dynticks counter with full ordering so as to allow
 * stable comparison of this counter with past and future snapshots.
 */
int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
{
	int snap = atomic_add_return(0, &rdtp->dynticks);

	return snap & ~RCU_DYNTICK_CTRL_MASK;
}

/*
 * Return true if the snapshot returned from rcu_dynticks_snap()
 * indicates that RCU is in an extended quiescent state.
 */
static bool rcu_dynticks_in_eqs(int snap)
{
	return !(snap & RCU_DYNTICK_CTRL_CTR);
}

/*
 * Return true if the CPU corresponding to the specified rcu_dynticks
 * structure has spent some time in an extended quiescent state since
 * rcu_dynticks_snap() returned the specified snapshot.
 */
static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
{
	return snap != rcu_dynticks_snap(rdtp);
}
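/*
 * Illustrative pairing of the helpers above, sketching how the
 * forced-quiescent-state scan uses them (see dyntick_save_progress_counter()
 * and rcu_implicit_dynticks_qs() later in this file):
 *
 *	int snap = rcu_dynticks_snap(rdtp);
 *
 *	if (rcu_dynticks_in_eqs(snap))
 *		;	// Idle right now: report a quiescent state immediately.
 *	else if (rcu_dynticks_in_eqs_since(rdtp, snap))
 *		;	// Passed through idle since the snapshot was taken.
 */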

/*
 * Set the special (bottom) bit of the specified CPU so that it
 * will take special action (such as flushing its TLB) on the
 * next exit from an extended quiescent state.  Returns true if
 * the bit was successfully set, or false if the CPU was not in
 * an extended quiescent state.
 */
bool rcu_eqs_special_set(int cpu)
{
	int old;
	int new;
	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

	do {
		old = atomic_read(&rdtp->dynticks);
		if (old & RCU_DYNTICK_CTRL_CTR)
			return false;
		new = old | RCU_DYNTICK_CTRL_MASK;
	} while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
	return true;
}
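/*
 * Hypothetical usage sketch (no such caller exists in this file): an
 * architecture wanting to defer TLB shootdowns for idle CPUs could call
 * rcu_eqs_special_set(cpu) instead of sending an IPI.  The bit set here is
 * noticed by rcu_dynticks_eqs_exit(), which clears it and invokes
 * rcu_eqs_special_exit(), so the architecture's override of that hook would
 * perform the deferred flush on the next exit from the extended quiescent
 * state.
 */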

/*
 * Let the RCU core know that this CPU has gone through the scheduler,
 * which is a quiescent state.  This is called when the need for a
 * quiescent state is urgent, so we burn an atomic operation and full
 * memory barriers to let the RCU core know about it, regardless of what
 * this CPU might (or might not) do in the near future.
 *
 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
 *
 * The caller must have disabled interrupts and must not be idle.
 */
static void rcu_momentary_dyntick_idle(void)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	int special;

	raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false);
	special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
	/* It is illegal to call this from idle state. */
	WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
	rcu_preempt_deferred_qs(current);
}

/**
 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
 *
 * If the current CPU is idle or running at a first-level (not nested)
 * interrupt from idle, return true.  The caller must have at least
 * disabled preemption.
 */
static int rcu_is_cpu_rrupt_from_idle(void)
{
	return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 &&
	       __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1;
}

/*
 * Register a quiescent state for all RCU flavors.  If there is an
 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
 * dyntick-idle quiescent state visible to other CPUs (but only for those
 * RCU flavors in desperate need of a quiescent state, which will normally
 * be none of them).  Either way, do a lightweight quiescent state for
 * all RCU flavors.
 *
 * The barrier() calls are redundant in the common case when this is
 * called externally, but just in case this is called from within this
 * file.
 *
 */
void rcu_all_qs(void)
{
	unsigned long flags;

	if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
		return;
	preempt_disable();
	/* Load rcu_urgent_qs before other flags. */
	if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
		preempt_enable();
		return;
	}
	this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
	barrier(); /* Avoid RCU read-side critical sections leaking down. */
	if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) {
		local_irq_save(flags);
		rcu_momentary_dyntick_idle();
		local_irq_restore(flags);
	}
	if (unlikely(raw_cpu_read(rcu_data.cpu_no_qs.b.exp)))
		rcu_qs();
	this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
	barrier(); /* Avoid RCU read-side critical sections leaking up. */
	preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);

#define DEFAULT_RCU_BLIMIT 10     /* Maximum callbacks per rcu_do_batch. */
static long blimit = DEFAULT_RCU_BLIMIT;
#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
static long qhimark = DEFAULT_RCU_QHIMARK;
#define DEFAULT_RCU_QLOMARK 100   /* Once only this many pending, use blimit. */
static long qlowmark = DEFAULT_RCU_QLOMARK;

module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);

static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;

static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp)
{
	ulong j;
	int ret = kstrtoul(val, 0, &j);

	if (!ret)
		WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j);
	return ret;
}

static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp)
{
	ulong j;
	int ret = kstrtoul(val, 0, &j);

	if (!ret)
		WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
	return ret;
}

static struct kernel_param_ops first_fqs_jiffies_ops = {
	.set = param_set_first_fqs_jiffies,
	.get = param_get_ulong,
};

static struct kernel_param_ops next_fqs_jiffies_ops = {
	.set = param_set_next_fqs_jiffies,
	.get = param_get_ulong,
};

module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644);
module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644);
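/*
 * Because MODULE_PARAM_PREFIX is "rcutree.", these settings are normally
 * given on the boot command line, for example (values purely illustrative):
 *
 *	rcutree.jiffies_till_first_fqs=2 rcutree.jiffies_till_next_fqs=2
 *
 * The setters above clamp values larger than HZ down to HZ, and the "next"
 * interval is forced to be at least one jiffy.
 */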
module_param(rcu_kick_kthreads, bool, 0644);

/*
 * How long the grace period must be before we start recruiting
 * quiescent-state help from rcu_note_context_switch().
 */
static ulong jiffies_till_sched_qs = HZ / 10;
module_param(jiffies_till_sched_qs, ulong, 0444);

static void force_qs_rnp(int (*f)(struct rcu_data *rsp));
static void force_quiescent_state(void);
static int rcu_pending(void);

/*
 * Return the number of RCU GPs completed thus far for debug & stats.
 */
unsigned long rcu_get_gp_seq(void)
{
	return READ_ONCE(rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(rcu_get_gp_seq);

/*
 * Return the number of RCU-sched GPs completed thus far for debug & stats.
 */
unsigned long rcu_sched_get_gp_seq(void)
{
	return rcu_get_gp_seq();
}
EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq);

/*
 * Return the number of RCU GPs completed thus far for debug & stats.
 * This is a transitional API and will soon be removed.
 */
unsigned long rcu_bh_get_gp_seq(void)
{
	return READ_ONCE(rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq);

/*
 * Return the number of RCU expedited batches completed thus far for
 * debug & stats.  Odd numbers mean that a batch is in progress, even
 * numbers mean idle.  The value returned will thus be roughly double
 * the cumulative batches since boot.
 */
unsigned long rcu_exp_batches_completed(void)
{
	return rcu_state.expedited_sequence;
}
EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);

/*
 * Return the number of RCU-sched expedited batches completed thus far
 * for debug & stats.  Similar to rcu_exp_batches_completed().
 */
unsigned long rcu_exp_batches_completed_sched(void)
{
	return rcu_state.expedited_sequence;
}
EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);

/*
 * Force a quiescent state.
 */
void rcu_force_quiescent_state(void)
{
	force_quiescent_state();
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);

/*
 * Force a quiescent state for RCU BH.
 */
void rcu_bh_force_quiescent_state(void)
{
	force_quiescent_state();
}
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);

/*
 * Force a quiescent state for RCU-sched.
 */
void rcu_sched_force_quiescent_state(void)
{
	rcu_force_quiescent_state();
}
EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);

/*
 * Show the state of the grace-period kthreads.
 */
void show_rcu_gp_kthreads(void)
{
	int cpu;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	pr_info("%s: wait state: %d ->state: %#lx\n", rcu_state.name,
		rcu_state.gp_state, rcu_state.gp_kthread->state);
	rcu_for_each_node_breadth_first(rnp) {
		if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
			continue;
		pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n",
			rnp->grplo, rnp->grphi, rnp->gp_seq,
			rnp->gp_seq_needed);
		if (!rcu_is_leaf_node(rnp))
			continue;
		for_each_leaf_node_possible_cpu(rnp, cpu) {
			rdp = per_cpu_ptr(&rcu_data, cpu);
			if (rdp->gpwrap ||
			    ULONG_CMP_GE(rcu_state.gp_seq,
					 rdp->gp_seq_needed))
				continue;
			pr_info("\tcpu %d ->gp_seq_needed %lu\n",
				cpu, rdp->gp_seq_needed);
		}
	}
	/* sched_show_task(rcu_state.gp_kthread); */
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);

/*
 * Send along grace-period-related data for rcutorture diagnostics.
 */
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
			    unsigned long *gp_seq)
{
	switch (test_type) {
	case RCU_FLAVOR:
	case RCU_BH_FLAVOR:
	case RCU_SCHED_FLAVOR:
		*flags = READ_ONCE(rcu_state.gp_flags);
		*gp_seq = rcu_seq_current(&rcu_state.gp_seq);
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);

/*
 * Return the root node of the specified rcu_state structure.
 */
static struct rcu_node *rcu_get_root(void)
{
	return &rcu_state.node[0];
}

/*
 * Enter an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 *
 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
 * the possibility of usermode upcalls having messed up our count
 * of interrupt nesting level during the prior busy period.
 */
static void rcu_eqs_enter(bool user)
{
	struct rcu_data *rdp;
	struct rcu_dynticks *rdtp;

	rdtp = this_cpu_ptr(&rcu_dynticks);
	WARN_ON_ONCE(rdtp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
	WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0);
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     rdtp->dynticks_nesting == 0);
	if (rdtp->dynticks_nesting != 1) {
		rdtp->dynticks_nesting--;
		return;
	}

	lockdep_assert_irqs_disabled();
	trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks);
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
	rdp = this_cpu_ptr(&rcu_data);
	do_nocb_deferred_wakeup(rdp);
	rcu_prepare_for_idle();
	rcu_preempt_deferred_qs(current);
	WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
	rcu_dynticks_eqs_enter();
	rcu_dynticks_task_enter();
}

/**
 * rcu_idle_enter - inform RCU that current CPU is entering idle
 *
 * Enter idle mode, in other words, -leave- the mode in which RCU
 * read-side critical sections can occur.  (Though RCU read-side
 * critical sections can occur in irq handlers in idle, a possibility
 * handled by irq_enter() and irq_exit().)
 *
 * If you add or remove a call to rcu_idle_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_idle_enter(void)
{
	lockdep_assert_irqs_disabled();
	rcu_eqs_enter(false);
}

#ifdef CONFIG_NO_HZ_FULL
/**
 * rcu_user_enter - inform RCU that we are resuming userspace.
 *
 * Enter RCU idle mode right before resuming userspace.  No use of RCU
 * is permitted between this call and rcu_user_exit(). This way the
 * CPU doesn't need to maintain the tick for RCU maintenance purposes
 * when the CPU runs in userspace.
 *
 * If you add or remove a call to rcu_user_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_user_enter(void)
{
	lockdep_assert_irqs_disabled();
	rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */

/*
 * If we are returning from the outermost NMI handler that interrupted an
 * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
 * to let the RCU grace-period handling know that the CPU is back to
 * being RCU-idle.
 *
 * If you add or remove a call to rcu_nmi_exit_common(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
static __always_inline void rcu_nmi_exit_common(bool irq)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

	/*
	 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
	 * (We are exiting an NMI handler, so RCU better be paying attention
	 * to us!)
	 */
	WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
	WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());

	/*
	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
	 * leave it in non-RCU-idle state.
	 */
	if (rdtp->dynticks_nmi_nesting != 1) {
		trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks);
		WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */
			   rdtp->dynticks_nmi_nesting - 2);
		return;
	}

	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
	trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks);
	WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */

	if (irq)
		rcu_prepare_for_idle();

	rcu_dynticks_eqs_enter();

	if (irq)
		rcu_dynticks_task_enter();
}

/**
 * rcu_nmi_exit - inform RCU of exit from NMI context
 *
 * If you add or remove a call to rcu_nmi_exit(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_nmi_exit(void)
{
	rcu_nmi_exit_common(false);
}

/**
 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
 *
 * Exit from an interrupt handler, which might possibly result in entering
 * idle mode, in other words, leaving the mode in which read-side critical
 * sections can occur.  The caller must have disabled interrupts.
 *
 * This code assumes that the idle loop never does anything that might
 * result in unbalanced calls to irq_enter() and irq_exit().  If your
 * architecture's idle loop violates this assumption, RCU will give you what
 * you deserve, good and hard.  But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 *
 * If you add or remove a call to rcu_irq_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_irq_exit(void)
{
	lockdep_assert_irqs_disabled();
	rcu_nmi_exit_common(true);
}

/*
 * Wrapper for rcu_irq_exit() where interrupts are enabled.
 *
 * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_irq_exit_irqson(void)
{
	unsigned long flags;

	local_irq_save(flags);
	rcu_irq_exit();
	local_irq_restore(flags);
}

/*
 * Exit an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 *
 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
 * allow for the possibility of usermode upcalls messing up our count of
 * interrupt nesting level during the busy period that is just now starting.
 */
static void rcu_eqs_exit(bool user)
{
	struct rcu_dynticks *rdtp;
	long oldval;

	lockdep_assert_irqs_disabled();
	rdtp = this_cpu_ptr(&rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
	if (oldval) {
		rdtp->dynticks_nesting++;
		return;
	}
	rcu_dynticks_task_exit();
	rcu_dynticks_eqs_exit();
	rcu_cleanup_after_idle();
	trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks);
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
	WRITE_ONCE(rdtp->dynticks_nesting, 1);
	WARN_ON_ONCE(rdtp->dynticks_nmi_nesting);
	WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
}

/**
 * rcu_idle_exit - inform RCU that current CPU is leaving idle
 *
 * Exit idle mode, in other words, -enter- the mode in which RCU
 * read-side critical sections can occur.
 *
 * If you add or remove a call to rcu_idle_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_idle_exit(void)
{
	unsigned long flags;

	local_irq_save(flags);
	rcu_eqs_exit(false);
	local_irq_restore(flags);
}

#ifdef CONFIG_NO_HZ_FULL
/**
 * rcu_user_exit - inform RCU that we are exiting userspace.
 *
 * Exit RCU idle mode while entering the kernel because it can
 * run a RCU read side critical section anytime.
 *
 * If you add or remove a call to rcu_user_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_user_exit(void)
{
	rcu_eqs_exit(1);
}
#endif /* CONFIG_NO_HZ_FULL */

/**
 * rcu_nmi_enter_common - inform RCU of entry to NMI context
 * @irq: Is this call from rcu_irq_enter?
 *
 * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
 * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
 * that the CPU is active.  This implementation permits nested NMIs, as
 * long as the nesting level does not overflow an int.  (You will probably
 * run out of stack space first.)
 *
 * If you add or remove a call to rcu_nmi_enter_common(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
static __always_inline void rcu_nmi_enter_common(bool irq)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	long incby = 2;

	/* Complain about underflow. */
	WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);

	/*
	 * If idle from RCU viewpoint, atomically increment ->dynticks
	 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
	 * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
	 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
	 * to be in the outermost NMI handler that interrupted an RCU-idle
	 * period (observation due to Andy Lutomirski).
	 */
	if (rcu_dynticks_curr_cpu_in_eqs()) {

		if (irq)
			rcu_dynticks_task_exit();

		rcu_dynticks_eqs_exit();

		if (irq)
			rcu_cleanup_after_idle();

		incby = 1;
	}
	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
			  rdtp->dynticks_nmi_nesting,
			  rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks);
	WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */
		   rdtp->dynticks_nmi_nesting + incby);
	barrier();
}
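/*
 * One way to read the nesting arithmetic above, starting from a CPU idling
 * with ->dynticks_nmi_nesting == 0: a first interrupt or NMI finds the CPU
 * in an extended quiescent state, so incby is 1 and the count becomes 1; an
 * NMI arriving on top of that handler finds the CPU non-idle, so incby stays
 * 2 and the count becomes 3.  The matching rcu_nmi_exit_common() calls
 * subtract 2 until the count reaches 1, at which point the final exit writes
 * 0 and re-enters the extended quiescent state.
 */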

/**
 * rcu_nmi_enter - inform RCU of entry to NMI context
 */
void rcu_nmi_enter(void)
{
	rcu_nmi_enter_common(false);
}

/**
 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
 *
 * Enter an interrupt handler, which might possibly result in exiting
 * idle mode, in other words, entering the mode in which read-side critical
 * sections can occur.  The caller must have disabled interrupts.
 *
 * Note that the Linux kernel is fully capable of entering an interrupt
 * handler that it never exits, for example when doing upcalls to user mode!
 * This code assumes that the idle loop never does upcalls to user mode.
 * If your architecture's idle loop does do upcalls to user mode (or does
 * anything else that results in unbalanced calls to the irq_enter() and
 * irq_exit() functions), RCU will give you what you deserve, good and hard.
 * But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 *
 * If you add or remove a call to rcu_irq_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_irq_enter(void)
{
	lockdep_assert_irqs_disabled();
	rcu_nmi_enter_common(true);
}

/*
 * Wrapper for rcu_irq_enter() where interrupts are enabled.
 *
 * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_irq_enter_irqson(void)
{
	unsigned long flags;

	local_irq_save(flags);
	rcu_irq_enter();
	local_irq_restore(flags);
}

/**
 * rcu_is_watching - see if RCU thinks that the current CPU is idle
 *
 * Return true if RCU is watching the running CPU, which means that this
 * CPU can safely enter RCU read-side critical sections.  In other words,
 * if the current CPU is not in its idle loop or is in an interrupt or
 * NMI handler, return true.
 */
bool notrace rcu_is_watching(void)
{
	bool ret;

	preempt_disable_notrace();
	ret = !rcu_dynticks_curr_cpu_in_eqs();
	preempt_enable_notrace();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_is_watching);

/*
 * If a holdout task is actually running, request an urgent quiescent
 * state from its CPU.  This is unsynchronized, so migrations can cause
 * the request to go to the wrong CPU.  Which is OK, all that will happen
 * is that the CPU's next context switch will be a bit slower and next
 * time around this task will generate another request.
 */
void rcu_request_urgent_qs_task(struct task_struct *t)
{
	int cpu;

	barrier();
	cpu = task_cpu(t);
	if (!task_curr(t))
		return; /* This task is not running on that CPU. */
	smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true);
}

#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)

/*
 * Is the current CPU online as far as RCU is concerned?
 *
 * Disable preemption to avoid false positives that could otherwise
 * happen due to the current CPU number being sampled, this task being
 * preempted, its old CPU being taken offline, resuming on some other CPU,
 * then determining that its old CPU is now offline.  Because there are
 * multiple flavors of RCU, and because this function can be called in the
 * midst of updating the flavors while a given CPU coming online or going
 * offline, it is necessary to check all flavors.  If any of the flavors
 * believe that given CPU is online, it is considered to be online.
 *
 * Disable checking if in an NMI handler because we cannot safely
 * report errors from NMI handlers anyway.  In addition, it is OK to use
 * RCU on an offline processor during initial boot, hence the check for
 * rcu_scheduler_fully_active.
 */
bool rcu_lockdep_current_cpu_online(void)
{
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	bool ret = false;

	if (in_nmi() || !rcu_scheduler_fully_active)
		return true;
	preempt_disable();
	rdp = this_cpu_ptr(&rcu_data);
	rnp = rdp->mynode;
	if (rdp->grpmask & rcu_rnp_online_cpus(rnp))
		ret = true;
	preempt_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);

#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */

/*
 * We are reporting a quiescent state on behalf of some other CPU, so
 * it is our responsibility to check for and handle potential overflow
 * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
 * After all, the CPU might be in deep idle state, and thus executing no
 * code whatsoever.
 */
static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
	raw_lockdep_assert_held_rcu_node(rnp);
	if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
			 rnp->gp_seq))
		WRITE_ONCE(rdp->gpwrap, true);
	if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
		rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
}

/*
 * Snapshot the specified CPU's dynticks counter so that we can later
 * credit them with an implicit quiescent state.  Return 1 if this CPU
 * is in dynticks idle mode, which is an extended quiescent state.
 */
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
	rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
	if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
		trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
		rcu_gpnum_ovf(rdp->mynode, rdp);
		return 1;
	}
	return 0;
}

/*
 * Handler for the irq_work request posted when a grace period has
 * gone on for too long, but not yet long enough for an RCU CPU
 * stall warning.  Set state appropriately, but just complain if
 * there is unexpected state on entry.
 */
static void rcu_iw_handler(struct irq_work *iwp)
{
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	rdp = container_of(iwp, struct rcu_data, rcu_iw);
	rnp = rdp->mynode;
	raw_spin_lock_rcu_node(rnp);
	if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
		rdp->rcu_iw_gp_seq = rnp->gp_seq;
		rdp->rcu_iw_pending = false;
	}
	raw_spin_unlock_rcu_node(rnp);
}

/*
 * Return true if the specified CPU has passed through a quiescent
 * state by virtue of being in or having passed through a dynticks
 * idle state since the last call to dyntick_save_progress_counter()
 * for this same CPU, or by virtue of having been offline.
 */
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
	unsigned long jtsq;
	bool *rnhqp;
	bool *ruqp;
	struct rcu_node *rnp = rdp->mynode;

	/*
	 * If the CPU passed through or entered a dynticks idle phase with
	 * no active irq/NMI handlers, then we can safely pretend that the CPU
	 * already acknowledged the request to pass through a quiescent
	 * state.  Either way, that CPU cannot possibly be in an RCU
	 * read-side critical section that started before the beginning
	 * of the current RCU grace period.
	 */
	if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
		trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
		rdp->dynticks_fqs++;
		rcu_gpnum_ovf(rnp, rdp);
		return 1;
	}

	/*
	 * Has this CPU encountered a cond_resched() since the beginning
	 * of the grace period?  For this to be the case, the CPU has to
	 * have noticed the current grace period.  This might not be the
	 * case for nohz_full CPUs looping in the kernel.
	 */
	jtsq = jiffies_till_sched_qs;
	ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
	if (time_after(jiffies, rcu_state.gp_start + jtsq) &&
	    READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
	    rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) {
		trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("rqc"));
		rcu_gpnum_ovf(rnp, rdp);
		return 1;
	} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
		/* Load rcu_qs_ctr before store to rcu_urgent_qs. */
		smp_store_release(ruqp, true);
	}

	/* If waiting too long on an offline CPU, complain. */
	if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) &&
	    time_after(jiffies, rcu_state.gp_start + HZ)) {
		bool onl;
		struct rcu_node *rnp1;

		WARN_ON(1);  /* Offline CPUs are supposed to report QS! */
		pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
			__func__, rnp->grplo, rnp->grphi, rnp->level,
			(long)rnp->gp_seq, (long)rnp->completedqs);
		for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
			pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
				__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
		onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
		pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
			__func__, rdp->cpu, ".o"[onl],
			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
		return 1; /* Break things loose after complaining. */
	}

	/*
	 * A CPU running for an extended time within the kernel can
	 * delay RCU grace periods.  When the CPU is in NO_HZ_FULL mode,
	 * even context-switching back and forth between a pair of
	 * in-kernel CPU-bound tasks cannot advance grace periods.
	 * So if the grace period is old enough, make the CPU pay attention.
	 * Note that the unsynchronized assignments to the per-CPU
	 * rcu_need_heavy_qs variable are safe.  Yes, setting of
	 * bits can be lost, but they will be set again on the next
	 * force-quiescent-state pass.  So lost bit sets do not result
	 * in incorrect behavior, merely in a grace period lasting
	 * a few jiffies longer than it might otherwise.  Because
	 * there are at most four threads involved, and because the
	 * updates are only once every few jiffies, the probability of
	 * lossage (and thus of slight grace-period extension) is
	 * quite low.
	 */
	rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
	if (!READ_ONCE(*rnhqp) &&
	    (time_after(jiffies, rcu_state.gp_start + jtsq) ||
	     time_after(jiffies, rcu_state.jiffies_resched))) {
		WRITE_ONCE(*rnhqp, true);
		/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
		smp_store_release(ruqp, true);
		rcu_state.jiffies_resched += jtsq; /* Re-enable beating. */
	}

	/*
	 * If more than halfway to RCU CPU stall-warning time, do a
	 * resched_cpu() to try to loosen things up a bit.  Also check to
	 * see if the CPU is getting hammered with interrupts, but only
	 * once per grace period, just to keep the IPIs down to a dull roar.
	 */
	if (jiffies - rcu_state.gp_start > rcu_jiffies_till_stall_check() / 2) {
		resched_cpu(rdp->cpu);
		if (IS_ENABLED(CONFIG_IRQ_WORK) &&
		    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
		    (rnp->ffmask & rdp->grpmask)) {
			init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
			rdp->rcu_iw_pending = true;
			rdp->rcu_iw_gp_seq = rnp->gp_seq;
			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
		}
	}

	return 0;
}

static void record_gp_stall_check_time(void)
{
	unsigned long j = jiffies;
	unsigned long j1;

	rcu_state.gp_start = j;
	j1 = rcu_jiffies_till_stall_check();
	/* Record ->gp_start before ->jiffies_stall. */
	smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
	rcu_state.jiffies_resched = j + j1 / 2;
	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
}

/*
 * Convert a ->gp_state value to a character string.
 */
static const char *gp_state_getname(short gs)
{
	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
		return "???";
	return gp_state_names[gs];
}

/*
 * Complain about starvation of grace-period kthread.
 */
static void rcu_check_gp_kthread_starvation(void)
{
	struct task_struct *gpk = rcu_state.gp_kthread;
	unsigned long j;

	j = jiffies - READ_ONCE(rcu_state.gp_activity);
	if (j > 2 * HZ) {
		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
		       rcu_state.name, j,
		       (long)rcu_seq_current(&rcu_state.gp_seq),
		       rcu_state.gp_flags,
		       gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
		       gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
		if (gpk) {
			pr_err("RCU grace-period kthread stack dump:\n");
			sched_show_task(gpk);
			wake_up_process(gpk);
		}
	}
}

/*
 * Dump stacks of all tasks running on stalled CPUs.  First try using
 * NMIs, but fall back to manual remote stack tracing on architectures
 * that don't support NMI-based stack dumps.  The NMI-triggered stack
 * traces are more accurate because they are printed by the target CPU.
 */
static void rcu_dump_cpu_stacks(void)
{
	int cpu;
	unsigned long flags;
	struct rcu_node *rnp;

	rcu_for_each_leaf_node(rnp) {
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		for_each_leaf_node_possible_cpu(rnp, cpu)
			if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
				if (!trigger_single_cpu_backtrace(cpu))
					dump_cpu_task(cpu);
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	}
}

/*
 * If too much time has passed in the current grace period, and if
 * so configured, go kick the relevant kthreads.
 */
static void rcu_stall_kick_kthreads(void)
{
	unsigned long j;

	if (!rcu_kick_kthreads)
		return;
	j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
	if (time_after(jiffies, j) && rcu_state.gp_kthread &&
	    (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
		WARN_ONCE(1, "Kicking %s grace-period kthread\n",
			  rcu_state.name);
		rcu_ftrace_dump(DUMP_ALL);
		wake_up_process(rcu_state.gp_kthread);
		WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
	}
}

static void panic_on_rcu_stall(void)
{
	if (sysctl_panic_on_rcu_stall)
		panic("RCU Stall\n");
}

static void print_other_cpu_stall(unsigned long gp_seq)
{
	int cpu;
	unsigned long flags;
	unsigned long gpa;
	unsigned long j;
	int ndetected = 0;
	struct rcu_node *rnp = rcu_get_root();
	long totqlen = 0;

	/* Kick and suppress, if so configured. */
	rcu_stall_kick_kthreads();
	if (rcu_cpu_stall_suppress)
		return;

	/*
	 * OK, time to rat on our buddy...
	 * See Documentation/RCU/stallwarn.txt for info on how to debug
	 * RCU CPU stall warnings.
	 */
	pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name);
	print_cpu_stall_info_begin();
	rcu_for_each_leaf_node(rnp) {
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		ndetected += rcu_print_task_stall(rnp);
		if (rnp->qsmask != 0) {
			for_each_leaf_node_possible_cpu(rnp, cpu)
				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
					print_cpu_stall_info(cpu);
					ndetected++;
				}
		}
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	}

	print_cpu_stall_info_end();
	for_each_possible_cpu(cpu)
		totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data,
							    cpu)->cblist);
	pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
	       smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
	if (ndetected) {
		rcu_dump_cpu_stacks();

		/* Complain about tasks blocking the grace period. */
		rcu_print_detail_task_stall();
	} else {
		if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
			pr_err("INFO: Stall ended before state dump start\n");
		} else {
			j = jiffies;
			gpa = READ_ONCE(rcu_state.gp_activity);
			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
			       rcu_state.name, j - gpa, j, gpa,
			       jiffies_till_next_fqs,
			       rcu_get_root()->qsmask);
			/* In this case, the current CPU might be at fault. */
			sched_show_task(current);
		}
	}
	/* Rewrite if needed in case of slow consoles. */
	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
		WRITE_ONCE(rcu_state.jiffies_stall,
			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);

	rcu_check_gp_kthread_starvation();

	panic_on_rcu_stall();

	force_quiescent_state();  /* Kick them all. */
}

static void print_cpu_stall(void)
{
	int cpu;
	unsigned long flags;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp = rcu_get_root();
	long totqlen = 0;

	/* Kick and suppress, if so configured. */
	rcu_stall_kick_kthreads();
	if (rcu_cpu_stall_suppress)
		return;

	/*
	 * OK, time to rat on ourselves...
	 * See Documentation/RCU/stallwarn.txt for info on how to debug
	 * RCU CPU stall warnings.
	 */
	pr_err("INFO: %s self-detected stall on CPU", rcu_state.name);
	print_cpu_stall_info_begin();
	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
	print_cpu_stall_info(smp_processor_id());
	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
	print_cpu_stall_info_end();
	for_each_possible_cpu(cpu)
		totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data,
							    cpu)->cblist);
	pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n",
		jiffies - rcu_state.gp_start,
		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);

	rcu_check_gp_kthread_starvation();

	rcu_dump_cpu_stacks();

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	/* Rewrite if needed in case of slow consoles. */
	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
		WRITE_ONCE(rcu_state.jiffies_stall,
			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	panic_on_rcu_stall();

	/*
	 * Attempt to revive the RCU machinery by forcing a context switch.
	 *
	 * A context switch would normally allow the RCU state machine to make
	 * progress and it could be we're stuck in kernel space without context
	 * switches for an entirely unreasonable amount of time.
	 */
	resched_cpu(smp_processor_id());
}

static void check_cpu_stall(struct rcu_data *rdp)
{
	unsigned long gs1;
	unsigned long gs2;
	unsigned long gps;
	unsigned long j;
	unsigned long jn;
	unsigned long js;
	struct rcu_node *rnp;

	if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
	    !rcu_gp_in_progress())
		return;
	rcu_stall_kick_kthreads();
	j = jiffies;

	/*
	 * Lots of memory barriers to reject false positives.
	 *
	 * The idea is to pick up rcu_state.gp_seq, then
	 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
	 * another copy of rcu_state.gp_seq.  These values are updated in
	 * the opposite order with memory barriers (or equivalent) during
	 * grace-period initialization and cleanup.  Now, a false positive
	 * can occur if we get a new value of rcu_state.gp_start and an old
	 * value of rcu_state.jiffies_stall.  But given the memory barriers,
	 * the only way that this can happen is if one grace period ends
	 * and another starts between these two fetches.  This is detected
	 * by comparing the second fetch of rcu_state.gp_seq with the
	 * previous fetch from rcu_state.gp_seq.
	 *
	 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
	 * and rcu_state.gp_start suffice to forestall false positives.
	 */
	gs1 = READ_ONCE(rcu_state.gp_seq);
	smp_rmb(); /* Pick up ->gp_seq first... */
	js = READ_ONCE(rcu_state.jiffies_stall);
	smp_rmb(); /* ...then ->jiffies_stall before the rest... */
	gps = READ_ONCE(rcu_state.gp_start);
	smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
	gs2 = READ_ONCE(rcu_state.gp_seq);
	if (gs1 != gs2 ||
	    ULONG_CMP_LT(j, js) ||
	    ULONG_CMP_GE(gps, js))
		return; /* No stall or GP completed since entering function. */
	rnp = rdp->mynode;
	jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
	if (rcu_gp_in_progress() &&
	    (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
	    cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {

		/* We haven't checked in, so go dump stack. */
		print_cpu_stall();

	} else if (rcu_gp_in_progress() &&
		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
		   cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {

		/* They had a few time units to dump stack, so complain. */
		print_other_cpu_stall(gs2);
	}
}

/**
 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
 *
 * Set the stall-warning timeout way off into the future, thus preventing
 * any RCU CPU stall-warning messages from appearing in the current set of
 * RCU grace periods.
 *
 * The caller must disable hard irqs.
 */
void rcu_cpu_stall_reset(void)
{
	WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
}

/* Trace-event wrapper function for trace_rcu_future_grace_period.  */
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
			      unsigned long gp_seq_req, const char *s)
{
	trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req,
				      rnp->level, rnp->grplo, rnp->grphi, s);
}

/*
 * rcu_start_this_gp - Request the start of a particular grace period
 * @rnp_start: The leaf node of the CPU from which to start.
 * @rdp: The rcu_data corresponding to the CPU from which to start.
 * @gp_seq_req: The gp_seq of the grace period to start.
 *
 * Start the specified grace period, as needed to handle newly arrived
 * callbacks.  The required future grace periods are recorded in each
 * rcu_node structure's ->gp_seq_needed field.  Returns true if there
 * is reason to awaken the grace-period kthread.
 *
 * The caller must hold the specified rcu_node structure's ->lock, which
 * is why the caller is responsible for waking the grace-period kthread.
 *
 * Returns true if the GP thread needs to be awakened else false.
 */
static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
			      unsigned long gp_seq_req)
{
	bool ret = false;
	struct rcu_node *rnp;

	/*
	 * Use funnel locking to either acquire the root rcu_node
	 * structure's lock or bail out if the need for this grace period
	 * has already been recorded -- or if that grace period has in
	 * fact already started.  If there is already a grace period in
	 * progress in a non-leaf node, no recording is needed because the
	 * end of the grace period will scan the leaf rcu_node structures.
	 * Note that rnp_start->lock must not be released.
	 */
	raw_lockdep_assert_held_rcu_node(rnp_start);
	trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
	for (rnp = rnp_start; 1; rnp = rnp->parent) {
		if (rnp != rnp_start)
			raw_spin_lock_rcu_node(rnp);
		if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
		    rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
		    (rnp != rnp_start &&
		     rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
			trace_rcu_this_gp(rnp, rdp, gp_seq_req,
					  TPS("Prestarted"));
1550 1551
			goto unlock_out;
		}
1552
		rnp->gp_seq_needed = gp_seq_req;
1553
		if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
1554
			/*
1555 1556 1557 1558
			 * We just marked the leaf or internal node, and a
			 * grace period is in progress, which means that
			 * rcu_gp_cleanup() will see the marking.  Bail to
			 * reduce contention.
1559
			 */
1560
			trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
1561
					  TPS("Startedleaf"));
1562 1563
			goto unlock_out;
		}
1564 1565 1566
		if (rnp != rnp_start && rnp->parent != NULL)
			raw_spin_unlock_rcu_node(rnp);
		if (!rnp->parent)
1567
			break;  /* At root, and perhaps also leaf. */
1568 1569
	}

	/* If GP already in progress, just leave, otherwise start one. */
	if (rcu_gp_in_progress()) {
		trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
		goto unlock_out;
	}
	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
	rcu_state.gp_req_activity = jiffies;
	if (!rcu_state.gp_kthread) {
		trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
		goto unlock_out;
	}
	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq"));
	ret = true;  /* Caller must wake GP kthread. */
unlock_out:
	/* Push furthest requested GP to leaf node and rcu_data structure. */
	if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
		rnp_start->gp_seq_needed = rnp->gp_seq_needed;
		rdp->gp_seq_needed = rnp->gp_seq_needed;
	}
	if (rnp != rnp_start)
		raw_spin_unlock_rcu_node(rnp);
	return ret;
}

/*
 * Clean up any old requests for the just-ended grace period.  Also return
 * whether any additional grace periods have been requested.
 */
static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
{
	bool needmore;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
	if (!needmore)
		rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
	trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
			  needmore ? TPS("CleanupMore") : TPS("Cleanup"));
	return needmore;
}

/*
 * Awaken the grace-period kthread for the specified flavor of RCU.
 * Don't do a self-awaken, and don't bother awakening when there is
 * nothing for the grace-period kthread to do (as in several CPUs
 * raced to awaken, and we lost), and finally don't try to awaken
 * a kthread that has not yet been created.
 */
static void rcu_gp_kthread_wake(void)
{
	if (current == rcu_state.gp_kthread ||
	    !READ_ONCE(rcu_state.gp_flags) ||
	    !rcu_state.gp_kthread)
		return;
	swake_up_one(&rcu_state.gp_wq);
}

/*
 * If there is room, assign a ->gp_seq number to any callbacks on this
 * CPU that have not already been assigned.  Also accelerate any callbacks
 * that were previously assigned a ->gp_seq number that has since proven
 * to be too conservative, which can happen if callbacks get assigned a
 * ->gp_seq number while RCU is idle, but with reference to a non-root
 * rcu_node structure.  This function is idempotent, so it does not hurt
 * to call it repeatedly.  Returns a flag saying whether we should awaken
 * the RCU grace-period kthread.
 *
 * The caller must hold rnp->lock with interrupts disabled.
 */
static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
	unsigned long gp_seq_req;
	bool ret = false;

	raw_lockdep_assert_held_rcu_node(rnp);

	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
		return false;

	/*
	 * Callbacks are often registered with incomplete grace-period
	 * information.  Something about the fact that getting exact
	 * information requires acquiring a global lock...  RCU therefore
	 * makes a conservative estimate of the grace period number at which
	 * a given callback will become ready to invoke.  The following
	 * code checks this estimate and improves it when possible, thus
	 * accelerating callback invocation to an earlier grace-period
	 * number.
	 */
	gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
	if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
		ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);

	/* Trace depending on how much we were able to accelerate. */
	if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
		trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB"));
	else
		trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB"));
	return ret;
}

/*
 * Similar to rcu_accelerate_cbs(), but does not require that the leaf
 * rcu_node structure's ->lock be held.  It consults the cached value
 * of ->gp_seq_needed in the rcu_data structure, and if that indicates
 * that a new grace-period request be made, invokes rcu_accelerate_cbs()
 * while holding the leaf rcu_node structure's ->lock.
 */
static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
					struct rcu_data *rdp)
{
	unsigned long c;
	bool needwake;

	lockdep_assert_irqs_disabled();
	c = rcu_seq_snap(&rcu_state.gp_seq);
	if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
		/* Old request still live, so mark recent callbacks. */
		(void)rcu_segcblist_accelerate(&rdp->cblist, c);
		return;
	}
	raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
	needwake = rcu_accelerate_cbs(rnp, rdp);
	raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
	if (needwake)
		rcu_gp_kthread_wake();
}

/*
 * Move any callbacks whose grace period has completed to the
 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
 * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
 * sublist.  This function is idempotent, so it does not hurt to
 * invoke it repeatedly.  As long as it is not invoked -too- often...
 * Returns true if the RCU grace-period kthread needs to be awakened.
 *
 * The caller must hold rnp->lock with interrupts disabled.
 */
static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
	raw_lockdep_assert_held_rcu_node(rnp);

	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
		return false;

	/*
	 * Find all callbacks whose ->gp_seq numbers indicate that they
	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
	 */
	rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);

	/* Classify any remaining callbacks. */
	return rcu_accelerate_cbs(rnp, rdp);
}

/*
 * Update CPU-local rcu_data state to record the beginnings and ends of
 * grace periods.  The caller must hold the ->lock of the leaf rcu_node
 * structure corresponding to the current CPU, and must have irqs disabled.
 * Returns true if the grace-period kthread needs to be awakened.
 */
static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
	bool ret;
	bool need_gp;

	raw_lockdep_assert_held_rcu_node(rnp);

	if (rdp->gp_seq == rnp->gp_seq)
		return false; /* Nothing to do. */

	/* Handle the ends of any preceding grace periods first. */
	if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
	    unlikely(READ_ONCE(rdp->gpwrap))) {
		ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */
		trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
	} else {
		ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */
	}

	/* Now handle the beginnings of any new-to-this-CPU grace periods. */
	if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
	    unlikely(READ_ONCE(rdp->gpwrap))) {
		/*
		 * If the current grace period is waiting for this CPU,
		 * set up to detect a quiescent state, otherwise don't
		 * go looking for one.
		 */
		trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
		need_gp = !!(rnp->qsmask & rdp->grpmask);
		rdp->cpu_no_qs.b.norm = need_gp;
		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
		rdp->core_needs_qs = need_gp;
		zero_cpu_stall_ticks(rdp);
	}
	rdp->gp_seq = rnp->gp_seq;  /* Remember new grace-period state. */
	if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap)
		rdp->gp_seq_needed = rnp->gp_seq_needed;
	WRITE_ONCE(rdp->gpwrap, false);
	rcu_gpnum_ovf(rnp, rdp);
	return ret;
}

static void note_gp_changes(struct rcu_data *rdp)
{
	unsigned long flags;
	bool needwake;
	struct rcu_node *rnp;

	local_irq_save(flags);
	rnp = rdp->mynode;
	if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) &&
	     !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
	    !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
		local_irq_restore(flags);
		return;
	}
	needwake = __note_gp_changes(rnp, rdp);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	if (needwake)
		rcu_gp_kthread_wake();
}

static void rcu_gp_slow(int delay)
{
	if (delay > 0 &&
	    !(rcu_seq_ctr(rcu_state.gp_seq) %
	      (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
		schedule_timeout_uninterruptible(delay);
}

1804
/*
1805
 * Initialize a new grace period.  Return false if no grace period required.
1806
 */
1807
static bool rcu_gp_init(void)
1808
{
1809
	unsigned long flags;
1810
	unsigned long oldmask;
1811
	unsigned long mask;
1812
	struct rcu_data *rdp;
1813
	struct rcu_node *rnp = rcu_get_root();
1814

1815
	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1816
	raw_spin_lock_irq_rcu_node(rnp);
1817
	if (!READ_ONCE(rcu_state.gp_flags)) {
1818
		/* Spurious wakeup, tell caller to go back to sleep.  */
		raw_spin_unlock_irq_rcu_node(rnp);
1820
		return false;
1821
	}
1822
	WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */
1823

1824
	if (WARN_ON_ONCE(rcu_gp_in_progress())) {
1825 1826 1827 1828
		/*
		 * Grace period already in progress, don't start another.
		 * Not supposed to be able to happen.
		 */
		raw_spin_unlock_irq_rcu_node(rnp);
1830
		return false;
1831 1832 1833
	}

	/* Advance to a new grace period and initialize state. */
1834
	record_gp_stall_check_time();
1835
	/* Record GP times before starting GP, hence rcu_seq_start(). */
1836 1837
	rcu_seq_start(&rcu_state.gp_seq);
	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
	raw_spin_unlock_irq_rcu_node(rnp);
1839

1840 1841 1842 1843 1844 1845
	/*
	 * Apply per-leaf buffered online and offline operations to the
	 * rcu_node tree.  Note that this new grace period need not wait
	 * for subsequent online CPUs, and that quiescent-state forcing
	 * will handle subsequent offline CPUs.
	 */
1846
	rcu_state.gp_state = RCU_GP_ONOFF;
1847
	rcu_for_each_leaf_node(rnp) {
1848
		spin_lock(&rcu_state.ofl_lock);
1849
		raw_spin_lock_irq_rcu_node(rnp);
1850 1851 1852
		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
		    !rnp->wait_blkd_tasks) {
			/* Nothing to do on this leaf rcu_node structure. */
			raw_spin_unlock_irq_rcu_node(rnp);
1854
			spin_unlock(&rcu_state.ofl_lock);
1855 1856 1857 1858 1859 1860 1861 1862 1863
			continue;
		}

		/* Record old state, apply changes to ->qsmaskinit field. */
		oldmask = rnp->qsmaskinit;
		rnp->qsmaskinit = rnp->qsmaskinitnext;

		/* If zero-ness of ->qsmaskinit changed, propagate up tree. */
		if (!oldmask != !rnp->qsmaskinit) {
1864 1865 1866 1867 1868 1869
			if (!oldmask) { /* First online CPU for rcu_node. */
				if (!rnp->wait_blkd_tasks) /* Ever offline? */
					rcu_init_new_rnp(rnp);
			} else if (rcu_preempt_has_tasks(rnp)) {
				rnp->wait_blkd_tasks = true; /* blocked tasks */
			} else { /* Last offline CPU and can propagate. */
1870
				rcu_cleanup_dead_rnp(rnp);
1871
			}
1872 1873 1874 1875 1876 1877 1878 1879
		}

		/*
		 * If all waited-on tasks from prior grace period are
		 * done, and if all this rcu_node structure's CPUs are
		 * still offline, propagate up the rcu_node tree and
		 * clear ->wait_blkd_tasks.  Otherwise, if one of this
		 * rcu_node structure's CPUs has since come back online,
1880
		 * simply clear ->wait_blkd_tasks.
1881 1882
		 */
		if (rnp->wait_blkd_tasks &&
1883
		    (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) {
1884
			rnp->wait_blkd_tasks = false;
1885 1886
			if (!rnp->qsmaskinit)
				rcu_cleanup_dead_rnp(rnp);
1887 1888
		}

		raw_spin_unlock_irq_rcu_node(rnp);
1890
		spin_unlock(&rcu_state.ofl_lock);
1891
	}
1892
	rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
1893 1894 1895

	/*
	 * Set the quiescent-state-needed bits in all the rcu_node
1896 1897 1898 1899 1900 1901
	 * structures for all currently online CPUs in breadth-first
	 * order, starting from the root rcu_node structure, relying on the
	 * layout of the tree within the rcu_state.node[] array.  Note that
	 * other CPUs will access only the leaves of the hierarchy, thus
	 * seeing that no grace period is in progress, at least until the
	 * corresponding leaf node has been initialized.
1902 1903 1904 1905
	 *
	 * The grace period cannot complete until the initialization
	 * process finishes, because this kthread handles both.
	 */
1906
	rcu_state.gp_state = RCU_GP_INIT;
1907
	rcu_for_each_node_breadth_first(rnp) {
1908
		rcu_gp_slow(gp_init_delay);
1909
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
1910
		rdp = this_cpu_ptr(&rcu_data);
1911
		rcu_preempt_check_blocked_tasks(rnp);
1912
		rnp->qsmask = rnp->qsmaskinit;
1913
		WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
1914
		if (rnp == rdp->mynode)
1915
			(void)__note_gp_changes(rnp, rdp);
1916
		rcu_preempt_boost_start_gp(rnp);
1917
		trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
1918 1919
					    rnp->level, rnp->grplo,
					    rnp->grphi, rnp->qsmask);
1920 1921
		/* Quiescent states for tasks on any now-offline CPUs. */
		mask = rnp->qsmask & ~rnp->qsmaskinitnext;
1922
		rnp->rcu_gp_init_mask = mask;
1923
		if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
1924
			rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
1925 1926
		else
			raw_spin_unlock_irq_rcu_node(rnp);
1927
		cond_resched_tasks_rcu_qs();
1928
		WRITE_ONCE(rcu_state.gp_activity, jiffies);
1929
	}
1930

1931
	return true;
1932
}
1933

1934
/*
1935
 * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
1936
 * time.
1937
 */
1938
static bool rcu_gp_fqs_check_wake(int *gfp)
1939
{
1940
	struct rcu_node *rnp = rcu_get_root();
1941 1942

	/* Someone like call_rcu() requested a force-quiescent-state scan. */
1943
	*gfp = READ_ONCE(rcu_state.gp_flags);
1944 1945 1946 1947 1948 1949 1950 1951 1952 1953
	if (*gfp & RCU_GP_FLAG_FQS)
		return true;

	/* The current grace period has completed. */
	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
		return true;

	return false;
}

1954 1955 1956
/*
 * Do one round of quiescent-state forcing.
 */
1957
static void rcu_gp_fqs(bool first_time)
1958
{
1959
	struct rcu_node *rnp = rcu_get_root();
1960

1961 1962
	WRITE_ONCE(rcu_state.gp_activity, jiffies);
	rcu_state.n_force_qs++;
1963
	if (first_time) {
1964
		/* Collect dyntick-idle snapshots. */
1965
		force_qs_rnp(dyntick_save_progress_counter);
1966 1967
	} else {
		/* Handle dyntick-idle and offline CPUs. */
1968
		force_qs_rnp(rcu_implicit_dynticks_qs);
1969 1970
	}
	/* Clear flag to prevent immediate re-entry. */
1971
	if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
1972
		raw_spin_lock_irq_rcu_node(rnp);
1973 1974
		WRITE_ONCE(rcu_state.gp_flags,
			   READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
		raw_spin_unlock_irq_rcu_node(rnp);
1976 1977 1978
	}
}

1979 1980 1981
/*
 * Clean up after the old grace period.
 */
1982
static void rcu_gp_cleanup(void)
1983 1984
{
	unsigned long gp_duration;
1985
	bool needgp = false;
1986
	unsigned long new_gp_seq;
1987
	struct rcu_data *rdp;
1988
	struct rcu_node *rnp = rcu_get_root();
1989
	struct swait_queue_head *sq;
1990

1991
	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1992
	raw_spin_lock_irq_rcu_node(rnp);
1993 1994 1995
	gp_duration = jiffies - rcu_state.gp_start;
	if (gp_duration > rcu_state.gp_max)
		rcu_state.gp_max = gp_duration;
1996

1997 1998 1999 2000 2001 2002 2003 2004
	/*
	 * We know the grace period is complete, but to everyone else
	 * it appears to still be ongoing.  But it is also the case
	 * that to everyone else it looks like there is nothing that
	 * they can do to advance the grace period.  It is therefore
	 * safe for us to drop the lock in order to mark the grace
	 * period as completed in all of the rcu_node structures.
	 */
	raw_spin_unlock_irq_rcu_node(rnp);
2006

2007
	/*
2008 2009 2010 2011 2012 2013 2014
	 * Propagate new ->gp_seq value to rcu_node structures so that
	 * other CPUs don't have to wait until the start of the next grace
	 * period to process their callbacks.  This also avoids some nasty
	 * RCU grace-period initialization races by forcing the end of
	 * the current grace period to be completely recorded in all of
	 * the rcu_node structures before the beginning of the next grace
	 * period is recorded in any of the rcu_node structures.
2015
	 */
2016
	new_gp_seq = rcu_state.gp_seq;
2017
	rcu_seq_end(&new_gp_seq);
2018
	rcu_for_each_node_breadth_first(rnp) {
2019
		raw_spin_lock_irq_rcu_node(rnp);
2020
		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
2021
			dump_blkd_tasks(rnp, 10);
2022
		WARN_ON_ONCE(rnp->qsmask);
2023
		WRITE_ONCE(rnp->gp_seq, new_gp_seq);
2024
		rdp = this_cpu_ptr(&rcu_data);
2025
		if (rnp == rdp->mynode)
2026
			needgp = __note_gp_changes(rnp, rdp) || needgp;
2027
		/* smp_mb() provided by prior unlock-lock pair. */
2028
		needgp = rcu_future_gp_cleanup(rnp) || needgp;
2029
		sq = rcu_nocb_gp_get(rnp);
		raw_spin_unlock_irq_rcu_node(rnp);
2031
		rcu_nocb_gp_cleanup(sq);
2032
		cond_resched_tasks_rcu_qs();
2033
		WRITE_ONCE(rcu_state.gp_activity, jiffies);
2034
		rcu_gp_slow(gp_cleanup_delay);
2035
	}
2036
	rnp = rcu_get_root();
2037
	raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
2038

2039
	/* Declare grace period done. */
2040 2041 2042
	rcu_seq_end(&rcu_state.gp_seq);
	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
	rcu_state.gp_state = RCU_GP_IDLE;
2043
	/* Check for GP requests since above loop. */
2044
	rdp = this_cpu_ptr(&rcu_data);
2045
	if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
2046
		trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed,
2047
				  TPS("CleanupMore"));
2048 2049
		needgp = true;
	}
2050
	/* Advance CBs to reduce false positives below. */
2051
	if (!rcu_accelerate_cbs(rnp, rdp) && needgp) {
2052 2053 2054 2055
		WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
		rcu_state.gp_req_activity = jiffies;
		trace_rcu_grace_period(rcu_state.name,
				       READ_ONCE(rcu_state.gp_seq),
2056
				       TPS("newreq"));
2057
	} else {
2058 2059
		WRITE_ONCE(rcu_state.gp_flags,
			   rcu_state.gp_flags & RCU_GP_FLAG_INIT);
2060
	}
	raw_spin_unlock_irq_rcu_node(rnp);
2062 2063 2064 2065 2066
}

/*
 * Body of kthread that handles grace periods.
 */
2067
static int __noreturn rcu_gp_kthread(void *unused)
2068
{
2069
	bool first_gp_fqs;
2070
	int gf;
2071
	unsigned long j;
2072
	int ret;
2073
	struct rcu_node *rnp = rcu_get_root();
2074

2075
	rcu_bind_gp_kthread();
2076 2077 2078 2079
	for (;;) {

		/* Handle grace-period start. */
		for (;;) {
2080 2081
			trace_rcu_grace_period(rcu_state.name,
					       READ_ONCE(rcu_state.gp_seq),
2082
					       TPS("reqwait"));
2083 2084 2085 2086 2087
			rcu_state.gp_state = RCU_GP_WAIT_GPS;
			swait_event_idle_exclusive(rcu_state.gp_wq,
					 READ_ONCE(rcu_state.gp_flags) &
					 RCU_GP_FLAG_INIT);
			rcu_state.gp_state = RCU_GP_DONE_GPS;
2088
			/* Locking provides needed memory barrier. */
2089
			if (rcu_gp_init())
2090
				break;
2091
			cond_resched_tasks_rcu_qs();
2092
			WRITE_ONCE(rcu_state.gp_activity, jiffies);
2093
			WARN_ON(signal_pending(current));
2094 2095
			trace_rcu_grace_period(rcu_state.name,
					       READ_ONCE(rcu_state.gp_seq),
2096
					       TPS("reqwaitsig"));
2097
		}
2098

2099
		/* Handle quiescent-state forcing. */
2100
		first_gp_fqs = true;
2101
		j = jiffies_till_first_fqs;
2102
		ret = 0;
2103
		for (;;) {
2104
			if (!ret) {
2105 2106
				rcu_state.jiffies_force_qs = jiffies + j;
				WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
2107 2108
					   jiffies + 3 * j);
			}
2109 2110
			trace_rcu_grace_period(rcu_state.name,
					       READ_ONCE(rcu_state.gp_seq),
2111
					       TPS("fqswait"));
2112 2113
			rcu_state.gp_state = RCU_GP_WAIT_FQS;
			ret = swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
2114
					rcu_gp_fqs_check_wake(&gf), j);
2115
			rcu_state.gp_state = RCU_GP_DOING_FQS;
2116
			/* Locking provides needed memory barriers. */
2117
			/* If grace period done, leave loop. */
2118
			if (!READ_ONCE(rnp->qsmask) &&
2119
			    !rcu_preempt_blocked_readers_cgp(rnp))
2120
				break;
2121
			/* If time for quiescent-state forcing, do it. */
2122
			if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) ||
2123
			    (gf & RCU_GP_FLAG_FQS)) {
2124 2125
				trace_rcu_grace_period(rcu_state.name,
						       READ_ONCE(rcu_state.gp_seq),
2126
						       TPS("fqsstart"));
2127
				rcu_gp_fqs(first_gp_fqs);
2128
				first_gp_fqs = false;
2129 2130
				trace_rcu_grace_period(rcu_state.name,
						       READ_ONCE(rcu_state.gp_seq),
2131
						       TPS("fqsend"));
2132
				cond_resched_tasks_rcu_qs();
2133
				WRITE_ONCE(rcu_state.gp_activity, jiffies);
2134 2135
				ret = 0; /* Force full wait till next FQS. */
				j = jiffies_till_next_fqs;
2136 2137
			} else {
				/* Deal with stray signal. */
2138
				cond_resched_tasks_rcu_qs();
2139
				WRITE_ONCE(rcu_state.gp_activity, jiffies);
2140
				WARN_ON(signal_pending(current));
2141 2142
				trace_rcu_grace_period(rcu_state.name,
						       READ_ONCE(rcu_state.gp_seq),
2143
						       TPS("fqswaitsig"));
2144 2145
				ret = 1; /* Keep old FQS timing. */
				j = jiffies;
2146 2147
				if (time_after(jiffies,
					       rcu_state.jiffies_force_qs))
2148 2149
					j = 1;
				else
2150
					j = rcu_state.jiffies_force_qs - j;
2151
			}
2152
		}
2153 2154

		/* Handle grace-period end. */
2155
		rcu_state.gp_state = RCU_GP_CLEANUP;
2156
		rcu_gp_cleanup();
2157
		rcu_state.gp_state = RCU_GP_CLEANED;
2158 2159 2160
	}
}

2161
/*
2162 2163 2164 2165 2166 2167 2168
 * Report a full set of quiescent states to the specified rcu_state data
 * structure.  Invoke rcu_gp_kthread_wake() to awaken the grace-period
 * kthread if another grace period is required.  Whether we wake
 * the grace-period kthread or it awakens itself for the next round
 * of quiescent-state forcing, that kthread will clean up after the
 * just-completed grace period.  Note that the caller must hold rnp->lock,
 * which is released before return.
2169
 */
2170
static void rcu_report_qs_rsp(unsigned long flags)
2171
	__releases(rcu_get_root()->lock)
2172
{
2173
	raw_lockdep_assert_held_rcu_node(rcu_get_root());
2174
	WARN_ON_ONCE(!rcu_gp_in_progress());
2175 2176
	WRITE_ONCE(rcu_state.gp_flags,
		   READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
2177
	raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
2178
	rcu_gp_kthread_wake();
2179 2180
}

2181
/*
2182 2183 2184
 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
 * Allows quiescent states for a group of CPUs to be reported at one go
 * to the specified rcu_node structure, though all the CPUs in the group
2185 2186 2187
 * must be represented by the same rcu_node structure (which need not be a
 * leaf rcu_node structure, though it often will be).  The gps parameter
 * is the grace-period snapshot, which means that the quiescent states
2188
 * are valid only if rnp->gp_seq is equal to gps.  That structure's lock
2189
 * must be held upon entry, and it is released before return.
2190 2191 2192 2193
 *
 * As a special case, if mask is zero, the bit-already-cleared check is
 * disabled.  This allows propagating quiescent state due to resumed tasks
 * during grace-period initialization.
2194
 */
2195 2196
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
			      unsigned long gps, unsigned long flags)
2197 2198
	__releases(rnp->lock)
{
2199
	unsigned long oldmask = 0;
2200
	struct rcu_node *rnp_c;
2201
	struct rcu_state __maybe_unused *rsp = &rcu_state;
2202

2203
	raw_lockdep_assert_held_rcu_node(rnp);
2204

2205 2206
	/* Walk up the rcu_node hierarchy. */
	for (;;) {
2207
		if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) {
2208

2209 2210 2211 2212
			/*
			 * Our bit has already been cleared, or the
			 * relevant grace period is already over, so done.
			 */
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2214 2215
			return;
		}
2216
		WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
2217
		WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
2218
			     rcu_preempt_blocked_readers_cgp(rnp));
2219
		rnp->qsmask &= ~mask;
2220
		trace_rcu_quiescent_state_report(rsp->name, rnp->gp_seq,
2221 2222 2223
						 mask, rnp->qsmask, rnp->level,
						 rnp->grplo, rnp->grphi,
						 !!rnp->gp_tasks);
2224
		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2225 2226

			/* Other bits still set at this level, so done. */
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2228 2229
			return;
		}
2230
		rnp->completedqs = rnp->gp_seq;
2231 2232 2233 2234 2235 2236 2237
		mask = rnp->grpmask;
		if (rnp->parent == NULL) {

			/* No more levels.  Exit loop holding root lock. */

			break;
		}
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2239
		rnp_c = rnp;
2240
		rnp = rnp->parent;
2241
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
2242
		oldmask = rnp_c->qsmask;
2243 2244 2245 2246
	}

	/*
	 * Get here if we are the last CPU to pass through a quiescent
2247
	 * state for this grace period.  Invoke rcu_report_qs_rsp()
2248
	 * to clean up and start the next grace period if one is needed.
2249
	 */
2250
	rcu_report_qs_rsp(flags); /* releases rnp->lock. */
2251 2252
}

2253 2254 2255 2256 2257 2258 2259
/*
 * Record a quiescent state for all tasks that were previously queued
 * on the specified rcu_node structure and that were blocking the current
 * RCU grace period.  The caller must hold the specified rnp->lock with
 * irqs disabled, and this lock is released upon return, but irqs remain
 * disabled.
 */
2260
static void __maybe_unused
2261
rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
2262 2263
	__releases(rnp->lock)
{
2264
	unsigned long gps;
2265 2266 2267
	unsigned long mask;
	struct rcu_node *rnp_p;

2268
	raw_lockdep_assert_held_rcu_node(rnp);
2269
	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
2270 2271
	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
	    rnp->qsmask != 0) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2273 2274 2275
		return;  /* Still need more quiescent states! */
	}

2276
	rnp->completedqs = rnp->gp_seq;
2277 2278 2279
	rnp_p = rnp->parent;
	if (rnp_p == NULL) {
		/*
2280 2281
		 * Only one rcu_node structure in the tree, so don't
		 * try to report up to its nonexistent parent!
2282
		 */
2283
		rcu_report_qs_rsp(flags);
2284 2285 2286
		return;
	}

2287 2288
	/* Report up the rest of the hierarchy, tracking current ->gp_seq. */
	gps = rnp->gp_seq;
2289
	mask = rnp->grpmask;
	raw_spin_unlock_rcu_node(rnp);	/* irqs remain disabled. */
2291
	raw_spin_lock_rcu_node(rnp_p);	/* irqs already disabled. */
2292
	rcu_report_qs_rnp(mask, rnp_p, gps, flags);
2293 2294
}

2295
/*
2296
 * Record a quiescent state for the specified CPU to that CPU's rcu_data
2297
 * structure.  This must be called from the specified CPU.
2298 2299
 */
static void
2300
rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
2301 2302 2303
{
	unsigned long flags;
	unsigned long mask;
2304
	bool needwake;
2305 2306 2307
	struct rcu_node *rnp;

	rnp = rdp->mynode;
2308
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
2309 2310
	if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
	    rdp->gpwrap) {
2311 2312

		/*
2313 2314 2315 2316
		 * The grace period in which this quiescent state was
		 * recorded has ended, so don't report it upwards.
		 * We will instead need a new quiescent state that lies
		 * within the current grace period.
2317
		 */
2318
		rdp->cpu_no_qs.b.norm = true;	/* need qs for new gp. */
2319
		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2321 2322 2323 2324
		return;
	}
	mask = rdp->grpmask;
	if ((rnp->qsmask & mask) == 0) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2326
	} else {
2327
		rdp->core_needs_qs = false;
2328 2329 2330 2331 2332

		/*
		 * This GP can't end until cpu checks in, so all of our
		 * callbacks can be processed during the next GP.
		 */
2333
		needwake = rcu_accelerate_cbs(rnp, rdp);
2334

2335
		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
2336
		/* ^^^ Released rnp->lock */
2337
		if (needwake)
2338
			rcu_gp_kthread_wake();
2339 2340 2341 2342 2343 2344 2345 2346 2347 2348
	}
}

/*
 * Check to see if there is a new grace period of which this CPU
 * is not yet aware, and if so, set up local rcu_data state for it.
 * Otherwise, see if this CPU has just passed through its first
 * quiescent state for this grace period, and record that fact if so.
 */
static void
2349
rcu_check_quiescent_state(struct rcu_data *rdp)
2350
{
2351
	/* Check for grace-period ends and beginnings. */
2352
	note_gp_changes(rdp);
2353 2354 2355 2356 2357

	/*
	 * Does this CPU still need to do its part for current grace period?
	 * If no, return and let the other CPUs do their part as well.
	 */
2358
	if (!rdp->core_needs_qs)
2359 2360 2361 2362 2363 2364
		return;

	/*
	 * Was there a quiescent state since the beginning of the grace
	 * period? If no, then exit and wait for the next call.
	 */
2365
	if (rdp->cpu_no_qs.b.norm)
2366 2367
		return;

2368 2369 2370 2371
	/*
	 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
	 * judge of that).
	 */
2372
	rcu_report_qs_rdp(rdp->cpu, rdp);
2373 2374
}

2375
/*
2376 2377
 * Near the end of the offline process.  Trace the fact that this CPU
 * is going offline.
2378
 */
2379
int rcutree_dying_cpu(unsigned int cpu)
2380
{
2381
	RCU_TRACE(bool blkd;)
2382
	RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);)
2383
	RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
2384

2385
	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2386
		return 0;
2387

2388
	RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);)
2389
	trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,
2390
			       blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
2391
	return 0;
2392 2393
}

2394 2395 2396 2397 2398 2399 2400 2401 2402
/*
 * All CPUs for the specified rcu_node structure have gone offline,
 * and all tasks that were preempted within an RCU read-side critical
 * section while running on one of those CPUs have since exited their RCU
 * read-side critical section.  Some other CPU is reporting this fact with
 * the specified rcu_node structure's ->lock held and interrupts disabled.
 * This function therefore goes up the tree of rcu_node structures,
 * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
 * the leaf rcu_node structure's ->qsmaskinit field has already been
2403
 * updated.
2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415
 *
 * This function does check that the specified rcu_node structure has
 * all CPUs offline and no blocked tasks, so it is OK to invoke it
 * prematurely.  That said, invoking it after the fact will cost you
 * a needless lock acquisition.  So once it has done its work, don't
 * invoke it again.
 */
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
{
	long mask;
	struct rcu_node *rnp = rnp_leaf;

2416
	raw_lockdep_assert_held_rcu_node(rnp_leaf);
2417
	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
2418 2419
	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
2420 2421 2422 2423 2424 2425
		return;
	for (;;) {
		mask = rnp->grpmask;
		rnp = rnp->parent;
		if (!rnp)
			break;
2426
		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
2427
		rnp->qsmaskinit &= ~mask;
2428 2429
		/* Between grace periods, so better already be zero! */
		WARN_ON_ONCE(rnp->qsmask);
2430
		if (rnp->qsmaskinit) {
			raw_spin_unlock_rcu_node(rnp);
			/* irqs remain disabled. */
2433 2434
			return;
		}
		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2436 2437 2438
	}
}

2439
/*
2440
 * The CPU has been completely removed, and some other CPU is reporting
2441 2442 2443
 * this fact from process context.  Do the remainder of the cleanup.
 * There can only be one CPU hotplug operation at a time, so no need for
 * explicit locking.
2444
 */
2445
int rcutree_dead_cpu(unsigned int cpu)
2446
{
2447
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2448
	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
2449

2450
	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2451
		return 0;
2452

2453
	/* Adjust any no-longer-needed kthreads. */
	rcu_boost_kthread_setaffinity(rnp, -1);
2455 2456 2457
	/* Do any needed no-CB deferred wakeups from this CPU. */
	do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
	return 0;
2458 2459 2460 2461 2462 2463
}

/*
 * Invoke any RCU callbacks that have made it to the end of their grace
 * period.  Throttle as specified by rdp->blimit.
 */
2464
static void rcu_do_batch(struct rcu_data *rdp)
2465 2466
{
	unsigned long flags;
2467 2468 2469
	struct rcu_head *rhp;
	struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
	long bl, count;
2470
	struct rcu_state *rsp = &rcu_state;
2471

2472
	/* If no callbacks are ready, just return. */
2473 2474 2475 2476 2477 2478
	if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
		trace_rcu_batch_start(rsp->name,
				      rcu_segcblist_n_lazy_cbs(&rdp->cblist),
				      rcu_segcblist_n_cbs(&rdp->cblist), 0);
		trace_rcu_batch_end(rsp->name, 0,
				    !rcu_segcblist_empty(&rdp->cblist),
2479 2480
				    need_resched(), is_idle_task(current),
				    rcu_is_callbacks_kthread());
2481
		return;
2482
	}
2483 2484 2485

	/*
	 * Extract the list of ready callbacks, disabling to prevent
2486 2487
	 * races with call_rcu() from interrupt handlers.  Leave the
	 * callback counts, as rcu_barrier() needs to be conservative.
2488 2489
	 */
	local_irq_save(flags);
2490
	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2491
	bl = rdp->blimit;
2492 2493 2494
	trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist),
			      rcu_segcblist_n_cbs(&rdp->cblist), bl);
	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
2495 2496 2497
	local_irq_restore(flags);

	/* Invoke callbacks. */
2498 2499 2500 2501 2502 2503 2504 2505 2506
	rhp = rcu_cblist_dequeue(&rcl);
	for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
		debug_rcu_head_unqueue(rhp);
		if (__rcu_reclaim(rsp->name, rhp))
			rcu_cblist_dequeued_lazy(&rcl);
		/*
		 * Stop only if limit reached and CPU has something to do.
		 * Note: The rcl structure counts down from zero.
		 */
2507
		if (-rcl.len >= bl &&
2508 2509
		    (need_resched() ||
		     (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2510 2511 2512 2513
			break;
	}

	local_irq_save(flags);
2514
	count = -rcl.len;
2515 2516
	trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(),
			    is_idle_task(current), rcu_is_callbacks_kthread());
2517

2518 2519
	/* Update counts and requeue any remaining callbacks. */
	rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2520
	smp_mb(); /* List handling before counting for rcu_barrier(). */
2521
	rcu_segcblist_insert_count(&rdp->cblist, &rcl);
2522 2523

	/* Reinstate batch limit if we have worked down the excess. */
2524 2525
	count = rcu_segcblist_n_cbs(&rdp->cblist);
	if (rdp->blimit == LONG_MAX && count <= qlowmark)
2526 2527
		rdp->blimit = blimit;

2528
	/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2529
	if (count == 0 && rdp->qlen_last_fqs_check != 0) {
2530 2531
		rdp->qlen_last_fqs_check = 0;
		rdp->n_force_qs_snap = rsp->n_force_qs;
2532 2533
	} else if (count < rdp->qlen_last_fqs_check - qhimark)
		rdp->qlen_last_fqs_check = count;
2534 2535 2536 2537 2538

	/*
	 * The following usually indicates a double call_rcu().  To track
	 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
	 */
2539
	WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
2540

2541 2542
	local_irq_restore(flags);

2543
	/* Re-invoke RCU core processing if there are callbacks remaining. */
2544
	if (rcu_segcblist_ready_cbs(&rdp->cblist))
2545
		invoke_rcu_core();
2546 2547 2548 2549 2550
}

/*
 * Check to see if this CPU is in a non-context-switch quiescent state
 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
2551
 * Also schedule RCU core processing.
2552
 *
2553
 * This function must be called from hardirq context.  It is normally
2554
 * invoked from the scheduling-clock interrupt.
2555
 */
2556
void rcu_check_callbacks(int user)
2557
{
2558
	trace_rcu_utilization(TPS("Start scheduler-tick"));
2559
	increment_cpu_stall_ticks();
2560
	rcu_flavor_check_callbacks(user);
2561
	if (rcu_pending())
2562
		invoke_rcu_core();
2563

2564
	trace_rcu_utilization(TPS("End scheduler-tick"));
2565 2566 2567 2568 2569
}

/*
 * Scan the leaf rcu_node structures, processing dyntick state for any that
 * have not yet encountered a quiescent state, using the function specified.
2570 2571
 * Also initiate boosting for any threads blocked on the root rcu_node.
 *
2572
 * The caller must have suppressed start of new grace periods.
2573
 */
2574
static void force_qs_rnp(int (*f)(struct rcu_data *rsp))
2575 2576 2577 2578
{
	int cpu;
	unsigned long flags;
	unsigned long mask;
2579
	struct rcu_node *rnp;
2580

2581
	rcu_for_each_leaf_node(rnp) {
2582
		cond_resched_tasks_rcu_qs();
2583
		mask = 0;
2584
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
2585
		if (rnp->qsmask == 0) {
2586
			if (!IS_ENABLED(CONFIG_PREEMPT) ||
2587 2588 2589 2590 2591 2592 2593 2594 2595 2596
			    rcu_preempt_blocked_readers_cgp(rnp)) {
				/*
				 * No point in scanning bits because they
				 * are all zero.  But we might need to
				 * priority-boost blocked readers.
				 */
				rcu_initiate_boost(rnp, flags);
				/* rcu_initiate_boost() releases rnp->lock */
				continue;
			}
2597 2598
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
			continue;
2599
		}
2600 2601
		for_each_leaf_node_possible_cpu(rnp, cpu) {
			unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
2602
			if ((rnp->qsmask & bit) != 0) {
2603
				if (f(per_cpu_ptr(&rcu_data, cpu)))
2604 2605
					mask |= bit;
			}
2606
		}
2607
		if (mask != 0) {
2608
			/* Idle/offline CPUs, report (releases rnp->lock). */
2609
			rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
2610 2611
		} else {
			/* Nothing to do here, so just drop the lock. */
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2613 2614 2615 2616 2617 2618 2619 2620
		}
	}
}

/*
 * Force quiescent states on reluctant CPUs, and also detect which
 * CPUs are in dyntick-idle mode.
 */
2621
static void force_quiescent_state(void)
2622 2623
{
	unsigned long flags;
2624 2625 2626
	bool ret;
	struct rcu_node *rnp;
	struct rcu_node *rnp_old = NULL;
2627
	struct rcu_state *rsp = &rcu_state;
2628 2629

	/* Funnel through hierarchy to reduce memory contention. */
2630
	rnp = __this_cpu_read(rcu_data.mynode);
2631
	for (; rnp != NULL; rnp = rnp->parent) {
2632
		ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
2633 2634 2635
		      !raw_spin_trylock(&rnp->fqslock);
		if (rnp_old != NULL)
			raw_spin_unlock(&rnp_old->fqslock);
2636
		if (ret)
2637 2638 2639
			return;
		rnp_old = rnp;
	}
2640
	/* rnp_old == rcu_get_root(), rnp == NULL. */
2641

2642
	/* Reached the root of the rcu_node tree, acquire lock. */
2643
	raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
2644
	raw_spin_unlock(&rnp_old->fqslock);
2645
	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
		raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2647
		return;  /* Someone beat us to it. */
2648
	}
2649
	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
	raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2651
	rcu_gp_kthread_wake();
2652 2653
}

2654 2655 2656 2657 2658
/*
 * This function checks for grace-period requests that fail to motivate
 * RCU to come out of its idle mode.
 */
static void
2659
rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp)
2660
{
2661
	const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ;
2662 2663
	unsigned long flags;
	unsigned long j;
2664
	struct rcu_node *rnp_root = rcu_get_root();
2665
	struct rcu_state *rsp = &rcu_state;
2666 2667
	static atomic_t warned = ATOMIC_INIT(0);

2668
	if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
2669
	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
2670 2671
		return;
	j = jiffies; /* Expensive access, and in common case don't get here. */
2672 2673
	if (time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) ||
	    time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) ||
2674 2675 2676 2677 2678
	    atomic_read(&warned))
		return;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	j = jiffies;
2679
	if (rcu_gp_in_progress() ||
2680
	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
2681 2682
	    time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) ||
	    time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) ||
2683 2684 2685 2686 2687 2688 2689 2690 2691
	    atomic_read(&warned)) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return;
	}
	/* Hold onto the leaf lock to make others see warned==1. */

	if (rnp_root != rnp)
		raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
	j = jiffies;
2692
	if (rcu_gp_in_progress() ||
2693
	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
2694 2695
	    time_before(j, rsp->gp_req_activity + gpssdelay) ||
	    time_before(j, rsp->gp_activity + gpssdelay) ||
2696 2697 2698 2699 2700
	    atomic_xchg(&warned, 1)) {
		raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return;
	}
2701
	pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n",
2702 2703
		 __func__, (long)READ_ONCE(rsp->gp_seq),
		 (long)READ_ONCE(rnp_root->gp_seq_needed),
2704
		 j - rsp->gp_req_activity, j - rsp->gp_activity,
2705
		 rsp->gp_flags, rsp->gp_state, rsp->name,
2706 2707 2708 2709 2710 2711 2712
		 rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL);
	WARN_ON(1);
	if (rnp_root != rnp)
		raw_spin_unlock_rcu_node(rnp_root);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}

2713
/*
2714 2715 2716
 * This does the RCU core processing work for the specified rcu_data
 * structures.  This may be called only from the CPU to whom the rdp
 * belongs.
2717
 */
2718
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
2719 2720
{
	unsigned long flags;
2721
	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
2722
	struct rcu_node *rnp = rdp->mynode;
2723

2724 2725 2726
	if (cpu_is_offline(smp_processor_id()))
		return;
	trace_rcu_utilization(TPS("Start RCU core"));
2727
	WARN_ON_ONCE(!rdp->beenonline);
2728

2729 2730 2731 2732 2733 2734
	/* Report any deferred quiescent states if preemption enabled. */
	if (!(preempt_count() & PREEMPT_MASK))
		rcu_preempt_deferred_qs(current);
	else if (rcu_preempt_need_deferred_qs(current))
		resched_cpu(rdp->cpu); /* Provoke future context switch. */

2735
	/* Update RCU state based on any recent quiescent states. */
2736
	rcu_check_quiescent_state(rdp);
2737

2738
	/* No grace period and unregistered callbacks? */
2739
	if (!rcu_gp_in_progress() &&
2740 2741
	    rcu_segcblist_is_enabled(&rdp->cblist)) {
		local_irq_save(flags);
2742
		if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
2743
			rcu_accelerate_cbs_unlocked(rnp, rdp);
2744
		local_irq_restore(flags);
2745 2746
	}

2747
	rcu_check_gp_start_stall(rnp, rdp);
2748

2749
	/* If there are callbacks ready, invoke them. */
2750
	if (rcu_segcblist_ready_cbs(&rdp->cblist))
2751
		invoke_rcu_callbacks(rdp);
2752 2753 2754

	/* Do any needed deferred wakeups of rcuo kthreads. */
	do_nocb_deferred_wakeup(rdp);
2755
	trace_rcu_utilization(TPS("End RCU core"));
2756 2757
}

2758
/*
2759 2760 2761
 * Schedule RCU callback invocation.  If the specified type of RCU
 * does not support RCU priority boosting, just do a direct call,
 * otherwise wake up the per-CPU kernel kthread.  Note that because we
2762
 * are running on the current CPU with softirqs disabled, the
2763
 * rcu_cpu_kthread_task cannot disappear out from under us.
2764
 */
2765
static void invoke_rcu_callbacks(struct rcu_data *rdp)
2766
{
2767 2768
	struct rcu_state *rsp = &rcu_state;

2769
	if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
2770
		return;
2771
	if (likely(!rsp->boost)) {
2772
		rcu_do_batch(rdp);
2773 2774
		return;
	}
2775
	invoke_rcu_callbacks_kthread();
2776 2777
}

2778
static void invoke_rcu_core(void)
2779
{
2780 2781
	if (cpu_online(smp_processor_id()))
		raise_softirq(RCU_SOFTIRQ);
2782 2783
}

2784 2785 2786
/*
 * Handle any core-RCU processing required by a call_rcu() invocation.
 */
2787 2788
static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
			    unsigned long flags)
2789
{
2790 2791 2792 2793
	/*
	 * If called from an extended quiescent state, invoke the RCU
	 * core in order to force a re-evaluation of RCU's idleness.
	 */
2794
	if (!rcu_is_watching())
2795 2796
		invoke_rcu_core();

2797
	/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
2798
	if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
2799
		return;
2800

2801 2802 2803 2804 2805 2806 2807
	/*
	 * Force the grace period if too many callbacks or too long waiting.
	 * Enforce hysteresis, and don't invoke force_quiescent_state()
	 * if some other CPU has recently done so.  Also, don't bother
	 * invoking force_quiescent_state() if the newly enqueued callback
	 * is the only one waiting for a grace period to complete.
	 */
2808 2809
	if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
		     rdp->qlen_last_fqs_check + qhimark)) {
2810 2811

		/* Are we ignoring a completed grace period? */
2812
		note_gp_changes(rdp);
2813 2814

		/* Start a new grace period if one not already started. */
2815
		if (!rcu_gp_in_progress()) {
2816
			rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
2817 2818 2819
		} else {
			/* Give the grace period a kick. */
			rdp->blimit = LONG_MAX;
2820
			if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
2821
			    rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
2822
				force_quiescent_state();
2823
			rdp->n_force_qs_snap = rcu_state.n_force_qs;
2824
			rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
2825
		}
2826
	}
2827 2828
}

2829 2830 2831 2832 2833 2834 2835
/*
 * RCU callback function to leak a callback.
 */
static void rcu_leak_callback(struct rcu_head *rhp)
{
}

2836 2837 2838 2839 2840 2841
/*
 * Helper function for call_rcu() and friends.  The cpu argument will
 * normally be -1, indicating "currently running CPU".  It may specify
 * a CPU only if that CPU is a no-CBs CPU.  Currently, only _rcu_barrier()
 * is expected to specify a CPU.
 */
2842
static void
2843
__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
2844 2845 2846
{
	unsigned long flags;
	struct rcu_data *rdp;
2847
	struct rcu_state __maybe_unused *rsp = &rcu_state;
2848

2849 2850 2851
	/* Misaligned rcu_head! */
	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));

2852
	if (debug_rcu_head_queue(head)) {
2853 2854 2855 2856 2857 2858 2859
		/*
		 * Probable double call_rcu(), so leak the callback.
		 * Use rcu:rcu_callback trace event to find the previous
		 * time callback was passed to __call_rcu().
		 */
		WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n",
			  head, head->func);
2860
		WRITE_ONCE(head->func, rcu_leak_callback);
2861 2862
		return;
	}
2863 2864 2865
	head->func = func;
	head->next = NULL;
	local_irq_save(flags);
2866
	rdp = this_cpu_ptr(&rcu_data);
2867 2868

	/* Add the callback to our list. */
2869
	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
2870 2871 2872
		int offline;

		if (cpu != -1)
2873
			rdp = per_cpu_ptr(&rcu_data, cpu);
2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886
		if (likely(rdp->mynode)) {
			/* Post-boot, so this should be for a no-CBs CPU. */
			offline = !__call_rcu_nocb(rdp, head, lazy, flags);
			WARN_ON_ONCE(offline);
			/* Offline CPU, _call_rcu() illegal, leak callback.  */
			local_irq_restore(flags);
			return;
		}
		/*
		 * Very early boot, before rcu_init().  Initialize if needed
		 * and then drop through to queue the callback.
		 */
		BUG_ON(cpu != -1);
2887
		WARN_ON_ONCE(!rcu_is_watching());
2888 2889
		if (rcu_segcblist_empty(&rdp->cblist))
			rcu_segcblist_init(&rdp->cblist);
2890
	}
2891 2892
	rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
	if (!lazy)
2893
		rcu_idle_count_callbacks_posted();
2894

2895 2896
	if (__is_kfree_rcu_offset((unsigned long)func))
		trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
2897 2898
					 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
					 rcu_segcblist_n_cbs(&rdp->cblist));
2899
	else
2900 2901 2902
		trace_rcu_callback(rsp->name, head,
				   rcu_segcblist_n_lazy_cbs(&rdp->cblist),
				   rcu_segcblist_n_cbs(&rdp->cblist));
2903

2904
	/* Go handle any RCU core processing required. */
2905
	__call_rcu_core(rdp, head, flags);
2906 2907 2908
	local_irq_restore(flags);
}

/**
 * call_rcu() - Queue an RCU callback for invocation after a grace period.
 * @head: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all pre-existing RCU read-side
 * critical sections have completed.  However, the callback function
 * might well execute concurrently with RCU read-side critical sections
 * that started after call_rcu() was invoked.  RCU read-side critical
 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
 * may be nested.  In addition, regions of code across which interrupts,
 * preemption, or softirqs have been disabled also serve as RCU read-side
 * critical sections.  This includes hardware interrupt handlers, softirq
 * handlers, and NMI handlers.
 *
 * Note that all CPUs must agree that the grace period extended beyond
 * all pre-existing RCU read-side critical section.  On systems with more
 * than one CPU, this means that when "func()" is invoked, each CPU is
 * guaranteed to have executed a full memory barrier since the end of its
 * last RCU read-side critical section whose beginning preceded the call
 * to call_rcu().  It also means that each CPU executing an RCU read-side
 * critical section that continues beyond the start of "func()" must have
 * executed a memory barrier after the call_rcu() but before the beginning
 * of that RCU read-side critical section.  Note that these guarantees
 * include CPUs that are offline, idle, or executing in user mode, as
 * well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
 * resulting RCU callback function "func()", then both CPU A and CPU B are
 * guaranteed to execute a full memory barrier during the time interval
 * between the call to call_rcu() and the invocation of "func()" -- even
 * if CPU A and CPU B are the same CPU (but again only if the system has
 * more than one CPU).
 */
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	__call_rcu(head, func, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
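
/*
 * Editorial example (illustrative sketch only, not part of the RCU
 * implementation): a typical call_rcu() usage pattern.  The structure
 * "struct foo", its list, and the update-side lock are hypothetical;
 * only call_rcu(), list_del_rcu(), and container_of() come from the
 * kernel proper.  The callback runs only after a full grace period,
 * so no pre-existing reader can still hold a reference to the
 * removed element when it is freed.
 *
 *	struct foo {
 *		struct list_head list;
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_reclaim(struct rcu_head *rhp)
 *	{
 *		struct foo *fp = container_of(rhp, struct foo, rcu);
 *
 *		kfree(fp);	// Safe: all pre-existing readers are done.
 *	}
 *
 *	static void foo_remove(struct foo *fp)
 *	{
 *		// Caller holds the update-side lock protecting the list.
 *		list_del_rcu(&fp->list);
 *		call_rcu(&fp->rcu, foo_reclaim);
 *	}
 */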

/**
 * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
 * @head: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
2954
 *
2955
 * This is transitional.
2956
 */
2957
void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
2958
{
2959
	call_rcu(head, func);
2960
}
2961
EXPORT_SYMBOL_GPL(call_rcu_sched);
2962

/*
 * Queue an RCU callback for lazy invocation after a grace period.
 * This will likely be later named something like "call_rcu_lazy()",
 * but this change will require some way of tagging the lazy RCU
 * callbacks in the list of pending callbacks. Until then, this
 * function may only be called from __kfree_rcu().
 */
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	__call_rcu(head, func, -1, 1);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
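
/*
 * Editorial example (illustrative sketch only): callers are not expected
 * to invoke kfree_call_rcu() directly.  They instead use the kfree_rcu()
 * macro, which computes the offset of the rcu_head within the enclosing
 * structure and reaches this function via __kfree_rcu().  The structure
 * "struct foo" below is hypothetical.
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_release(struct foo *fp)
 *	{
 *		// Equivalent to call_rcu() with a callback that just
 *		// does kfree(fp), but without a named callback function.
 *		kfree_rcu(fp, rcu);
 *	}
 */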

2976 2977 2978
/**
 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
 *
2979
 * This is transitional.
2980 2981 2982
 */
void synchronize_sched(void)
{
2983
	synchronize_rcu();
2984 2985 2986
}
EXPORT_SYMBOL_GPL(synchronize_sched);

/**
 * get_state_synchronize_rcu - Snapshot current RCU state
 *
 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
 * to determine whether or not a full grace period has elapsed in the
 * meantime.
 */
unsigned long get_state_synchronize_rcu(void)
{
	/*
	 * Any prior manipulation of RCU-protected data must happen
	 * before the load from ->gp_seq.
	 */
	smp_mb();  /* ^^^ */
	return rcu_seq_snap(&rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);

/**
 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
 *
 * @oldstate: return value from earlier call to get_state_synchronize_rcu()
 *
 * If a full RCU grace period has elapsed since the earlier call to
 * get_state_synchronize_rcu(), just return.  Otherwise, invoke
 * synchronize_rcu() to wait for a full grace period.
 *
 * Yes, this function does not take counter wrap into account.  But
 * counter wrap is harmless.  If the counter wraps, we have waited for
 * more than 2 billion grace periods (and way more on a 64-bit system!),
 * so waiting for one additional grace period should be just fine.
 */
void cond_synchronize_rcu(unsigned long oldstate)
{
	if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
		synchronize_rcu();
	else
		smp_mb(); /* Ensure GP ends before subsequent accesses. */
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
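
/*
 * Editorial example (illustrative sketch only): the intended pairing of
 * get_state_synchronize_rcu() and cond_synchronize_rcu().  The cookie is
 * snapshotted before some unrelated work; if a full grace period has
 * elapsed in the meantime, the later call returns immediately instead of
 * paying for another synchronize_rcu().  The function do_other_work() is
 * hypothetical.
 *
 *	unsigned long cookie;
 *
 *	cookie = get_state_synchronize_rcu();
 *	do_other_work();		// Hopefully spans a grace period.
 *	cond_synchronize_rcu(cookie);	// Blocks only if it did not.
 */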

3028 3029 3030
/**
 * get_state_synchronize_sched - Snapshot current RCU-sched state
 *
3031
 * This is transitional, and only used by rcutorture.
3032 3033 3034
 */
unsigned long get_state_synchronize_sched(void)
{
	return get_state_synchronize_rcu();
}
EXPORT_SYMBOL_GPL(get_state_synchronize_sched);

/**
 * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
 * @oldstate: return value from earlier call to get_state_synchronize_sched()
 *
 * This is transitional and only used by rcutorture.
 */
void cond_synchronize_sched(unsigned long oldstate)
{
	cond_synchronize_rcu(oldstate);
}
EXPORT_SYMBOL_GPL(cond_synchronize_sched);

/*
 * Check to see if there is any immediate RCU-related work to be done by
 * the current CPU, returning 1 if so and zero otherwise.  The checks are
 * in order of increasing expense: checks
 * that can be carried out against CPU-local state are performed first.
 * However, we must check for CPU stalls first, else we might not get
 * a chance.
 */
static int rcu_pending(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp = rdp->mynode;

	/* Check for CPU stalls, if enabled. */
	check_cpu_stall(rdp);

	/* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
	if (rcu_nohz_full_cpu())
		return 0;

	/* Is the RCU core waiting for a quiescent state from this CPU? */
	if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
		return 1;

	/* Does this CPU have callbacks ready to invoke? */
	if (rcu_segcblist_ready_cbs(&rdp->cblist))
		return 1;

	/* Has RCU gone idle with this CPU needing another grace period? */
	if (!rcu_gp_in_progress() &&
	    rcu_segcblist_is_enabled(&rdp->cblist) &&
	    !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
		return 1;

	/* Have RCU grace period completed or started?  */
	if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq ||
	    unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
		return 1;

	/* Does this CPU need a deferred NOCB wakeup? */
	if (rcu_nocb_need_deferred_wakeup(rdp))
		return 1;

	/* nothing to do */
	return 0;
}

/*
 * Return true if the current CPU has any callback.  If all_lazy is
 * non-NULL, store an indication of whether all callbacks are lazy.
 * (If there are no callbacks, all of them are deemed to be lazy.)
 */
static bool rcu_cpu_has_callbacks(bool *all_lazy)
{
	bool al = true;
	bool hc = false;
	struct rcu_data *rdp;

	rdp = this_cpu_ptr(&rcu_data);
	if (!rcu_segcblist_empty(&rdp->cblist)) {
		hc = true;
		if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist))
			al = false;
	}
	if (all_lazy)
		*all_lazy = al;
	return hc;
}

/*
 * Helper function for _rcu_barrier() tracing.  If tracing is disabled,
 * the compiler is expected to optimize this away.
 */
static void _rcu_barrier_trace(const char *s, int cpu, unsigned long done)
{
	trace_rcu_barrier(rcu_state.name, s, cpu,
			  atomic_read(&rcu_state.barrier_cpu_count), done);
}

/*
 * RCU callback function for _rcu_barrier().  If we are last, wake
 * up the task executing _rcu_barrier().
 */
static void rcu_barrier_callback(struct rcu_head *rhp)
{
	struct rcu_state *rsp = &rcu_state;

	if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
		_rcu_barrier_trace(TPS("LastCB"), -1, rsp->barrier_sequence);
		complete(&rsp->barrier_completion);
	} else {
		_rcu_barrier_trace(TPS("CB"), -1, rsp->barrier_sequence);
	}
}

/*
 * Called with preemption disabled, and from cross-cpu IRQ context.
 */
static void rcu_barrier_func(void *type)
{
	struct rcu_state *rsp = type;
	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);

	_rcu_barrier_trace(TPS("IRQ"), -1, rsp->barrier_sequence);
	rdp->barrier_head.func = rcu_barrier_callback;
	debug_rcu_head_queue(&rdp->barrier_head);
	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
		atomic_inc(&rsp->barrier_cpu_count);
	} else {
		debug_rcu_head_unqueue(&rdp->barrier_head);
		_rcu_barrier_trace(TPS("IRQNQ"), -1, rsp->barrier_sequence);
	}
}

/*
 * Orchestrate an RCU barrier operation, waiting for all previously
 * queued RCU callbacks to complete.
 */
static void _rcu_barrier(void)
{
	int cpu;
	struct rcu_data *rdp;
	struct rcu_state *rsp = &rcu_state;
	unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);

	_rcu_barrier_trace(TPS("Begin"), -1, s);

	/* Take mutex to serialize concurrent rcu_barrier() requests. */
	mutex_lock(&rsp->barrier_mutex);

	/* Did someone else do our work for us? */
	if (rcu_seq_done(&rsp->barrier_sequence, s)) {
		_rcu_barrier_trace(TPS("EarlyExit"), -1, rsp->barrier_sequence);
		smp_mb(); /* caller's subsequent code after above check. */
		mutex_unlock(&rsp->barrier_mutex);
		return;
	}

	/* Mark the start of the barrier operation. */
	rcu_seq_start(&rsp->barrier_sequence);
	_rcu_barrier_trace(TPS("Inc1"), -1, rsp->barrier_sequence);

	/*
	 * Initialize the count to one rather than to zero in order to
	 * avoid a too-soon return to zero in case of a short grace period
	 * (or preemption of this task).  Exclude CPU-hotplug operations
	 * to ensure that no offline CPU has callbacks queued.
	 */
	init_completion(&rsp->barrier_completion);
	atomic_set(&rsp->barrier_cpu_count, 1);
	get_online_cpus();

	/*
	 * Force each CPU with callbacks to register a new callback.
	 * When that callback is invoked, we will know that all of the
	 * corresponding CPU's preceding callbacks have been invoked.
	 */
	for_each_possible_cpu(cpu) {
		if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
			continue;
		rdp = per_cpu_ptr(&rcu_data, cpu);
		if (rcu_is_nocb_cpu(cpu)) {
			if (!rcu_nocb_cpu_needs_barrier(cpu)) {
				_rcu_barrier_trace(TPS("OfflineNoCB"), cpu,
						   rsp->barrier_sequence);
			} else {
				_rcu_barrier_trace(TPS("OnlineNoCB"), cpu,
						   rsp->barrier_sequence);
				smp_mb__before_atomic();
				atomic_inc(&rsp->barrier_cpu_count);
				__call_rcu(&rdp->barrier_head,
					   rcu_barrier_callback, cpu, 0);
			}
		} else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
			_rcu_barrier_trace(TPS("OnlineQ"), cpu,
					   rsp->barrier_sequence);
			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
		} else {
			_rcu_barrier_trace(TPS("OnlineNQ"), cpu,
					   rsp->barrier_sequence);
		}
	}
	put_online_cpus();

	/*
	 * Now that we have an rcu_barrier_callback() callback on each
	 * CPU, and thus each counted, remove the initial count.
	 */
	if (atomic_dec_and_test(&rsp->barrier_cpu_count))
		complete(&rsp->barrier_completion);

	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
	wait_for_completion(&rsp->barrier_completion);

	/* Mark the end of the barrier operation. */
	_rcu_barrier_trace(TPS("Inc2"), -1, rsp->barrier_sequence);
	rcu_seq_end(&rsp->barrier_sequence);

	/* Other rcu_barrier() invocations can now safely proceed. */
	mutex_unlock(&rsp->barrier_mutex);
}
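
/*
 * Worked example of the counting scheme above (illustrative only): suppose
 * CPUs 0 and 1 each have callbacks queued.  barrier_cpu_count starts at 1,
 * the two entrained rcu_barrier_callback() instances raise it to 3, dropping
 * the initial reference brings it back to 2, and the two callbacks later
 * take it to 1 and then 0, at which point the last callback complete()s
 * barrier_completion and the task waiting in _rcu_barrier() proceeds.  The
 * initial count of 1 is what keeps the count from prematurely reaching zero
 * if CPU 0's callback is invoked before CPU 1 has even been examined.
 */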

/**
 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
 */
void rcu_barrier_bh(void)
{
	_rcu_barrier();
}
EXPORT_SYMBOL_GPL(rcu_barrier_bh);

/**
 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
 *
 * Note that this primitive does not necessarily wait for an RCU grace period
 * to complete.  For example, if there are no RCU callbacks queued anywhere
 * in the system, then rcu_barrier() is within its rights to return
 * immediately, without waiting for anything, much less an RCU grace period.
 */
void rcu_barrier(void)
{
	_rcu_barrier();
}
EXPORT_SYMBOL_GPL(rcu_barrier);
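
/*
 * Illustrative usage sketch, not part of this file's logic: a hypothetical
 * module that posts callbacks with call_rcu() must wait for them all to be
 * invoked before the module text containing the callback functions can be
 * safely unloaded, typically from its module_exit() handler.  "foo_exit()"
 * and "foo_cleanup()" are made-up names for illustration.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		foo_cleanup();
 *		rcu_barrier();
 *	}
 *	module_exit(foo_exit);
 */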

/**
 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
 *
 * This is transitional.
 */
void rcu_barrier_sched(void)
{
	rcu_barrier();
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);

/*
 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the
 * first CPU in a given leaf rcu_node structure coming online.  The caller
 * must hold the corresponding leaf rcu_node ->lock with interrupts
 * disabled.
 */
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
{
	long mask;
	long oldmask;
	struct rcu_node *rnp = rnp_leaf;

	raw_lockdep_assert_held_rcu_node(rnp_leaf);
	WARN_ON_ONCE(rnp->wait_blkd_tasks);
	for (;;) {
		mask = rnp->grpmask;
		rnp = rnp->parent;
		if (rnp == NULL)
			return;
		raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
		oldmask = rnp->qsmaskinit;
		rnp->qsmaskinit |= mask;
		raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
		if (oldmask)
			return;
	}
}

/*
 * Do boot-time initialization of a CPU's per-CPU RCU data.
 */
static void __init
rcu_boot_init_percpu_data(int cpu)
{
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

	/* Set up local state, ensuring consistent view of global state. */
	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1);
	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks)));
	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
	rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
	rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
	rdp->cpu = cpu;
	rcu_boot_init_nocb_percpu_data(rdp);
}

/*
 * Invoked early in the CPU-online process, when pretty much all services
 * are available.  The incoming CPU is not present.
 *
 * Initializes a CPU's per-CPU RCU data.  Note that only one online or
 * offline event can be happening at a given time.  Note also that we can
 * accept some slop in the rcu_state.gp_seq access due to the fact that this
 * CPU cannot possibly have any RCU callbacks in flight yet.
 */
int rcutree_prepare_cpu(unsigned int cpu)
{
	unsigned long flags;
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	struct rcu_node *rnp = rcu_get_root();

	/* Set up local state, ensuring consistent view of global state. */
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rdp->qlen_last_fqs_check = 0;
	rdp->n_force_qs_snap = rcu_state.n_force_qs;
	rdp->blimit = blimit;
	if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
	    !init_nocb_callback_list(rdp))
		rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
	rdp->dynticks->dynticks_nesting = 1;	/* CPU not up, no tearing. */
	rcu_dynticks_eqs_online();
	raw_spin_unlock_rcu_node(rnp);		/* irqs remain disabled. */

	/*
	 * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
	 * propagation up the rcu_node tree will happen at the beginning
	 * of the next grace period.
	 */
	rnp = rdp->mynode;
	raw_spin_lock_rcu_node(rnp);		/* irqs already disabled. */
	rdp->beenonline = true;	 /* We have now been online. */
	rdp->gp_seq = rnp->gp_seq;
	rdp->gp_seq_needed = rnp->gp_seq;
	rdp->cpu_no_qs.b.norm = true;
	rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
	rdp->core_needs_qs = false;
	rdp->rcu_iw_pending = false;
	rdp->rcu_iw_gp_seq = rnp->gp_seq - 1;
	trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	rcu_prepare_kthreads(cpu);
	rcu_spawn_all_nocb_kthreads(cpu);

	return 0;
}

/*
 * Update RCU priority boot kthread affinity for CPU-hotplug changes.
 */
static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
{
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

	rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
}

/*
 * Near the end of the CPU-online process.  Pretty much all services
 * enabled, and the CPU is now very much alive.
 */
int rcutree_online_cpu(unsigned int cpu)
{
	unsigned long flags;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	rdp = per_cpu_ptr(&rcu_data, cpu);
	rnp = rdp->mynode;
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rnp->ffmask |= rdp->grpmask;
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	if (IS_ENABLED(CONFIG_TREE_SRCU))
		srcu_online_cpu(cpu);
	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
		return 0; /* Too early in boot for scheduler work. */
	sync_sched_exp_online_cleanup(cpu);
	rcutree_affinity_setting(cpu, -1);
	return 0;
}

/*
 * Near the beginning of the CPU-offline process.  The CPU is still very
 * much alive with pretty much all services enabled.
 */
int rcutree_offline_cpu(unsigned int cpu)
{
	unsigned long flags;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	rdp = per_cpu_ptr(&rcu_data, cpu);
	rnp = rdp->mynode;
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rnp->ffmask &= ~rdp->grpmask;
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	rcutree_affinity_setting(cpu, cpu);
	if (IS_ENABLED(CONFIG_TREE_SRCU))
		srcu_offline_cpu(cpu);
	return 0;
}

static DEFINE_PER_CPU(int, rcu_cpu_started);

/*
 * Mark the specified CPU as being online so that subsequent grace periods
 * (both expedited and normal) will wait on it.  Note that this means that
 * incoming CPUs are not allowed to use RCU read-side critical sections
 * until this function is called.  Failing to observe this restriction
 * will result in lockdep splats.
 *
 * Note that this function is special in that it is invoked directly
 * from the incoming CPU rather than from the cpuhp_step mechanism.
 * This is because this function must be invoked at a precise location.
 */
void rcu_cpu_starting(unsigned int cpu)
{
	unsigned long flags;
	unsigned long mask;
	int nbits;
	unsigned long oldmask;
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	struct rcu_state *rsp = &rcu_state;

	if (per_cpu(rcu_cpu_started, cpu))
		return;

	per_cpu(rcu_cpu_started, cpu) = 1;

	rdp = per_cpu_ptr(&rcu_data, cpu);
	rnp = rdp->mynode;
	mask = rdp->grpmask;
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rnp->qsmaskinitnext |= mask;
	oldmask = rnp->expmaskinitnext;
	rnp->expmaskinitnext |= mask;
	oldmask ^= rnp->expmaskinitnext;
	nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
	/* Allow lockless access for expedited grace periods. */
	smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
	rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
	rdp->rcu_onl_gp_seq = READ_ONCE(rsp->gp_seq);
	rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags);
	if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
		/* Report QS -after- changing ->qsmaskinitnext! */
		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
	} else {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	}
	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * The outgoing function has no further need of RCU, so remove it from
 * the rcu_node tree's ->qsmaskinitnext bit masks.
 *
 * Note that this function is special in that it is invoked directly
 * from the outgoing CPU rather than from the cpuhp_step mechanism.
 * This is because this function must be invoked at a precise location.
 */
void rcu_report_dead(unsigned int cpu)
{
	unsigned long flags;
	unsigned long mask;
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */

	/* QS for any half-done expedited RCU-sched GP. */
	preempt_disable();
	rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
	preempt_enable();
	rcu_preempt_deferred_qs(current);

	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
	mask = rdp->grpmask;
	spin_lock(&rcu_state.ofl_lock);
	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
	rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
	if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
		/* Report quiescent state -before- changing ->qsmaskinitnext! */
		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
	}
	rnp->qsmaskinitnext &= ~mask;
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	spin_unlock(&rcu_state.ofl_lock);

	per_cpu(rcu_cpu_started, cpu) = 0;
}

/*
 * The outgoing CPU has just passed through the dying-idle state, and we
 * are being invoked from the CPU that was IPIed to continue the offline
 * operation.  Migrate the outgoing CPU's callbacks to the current CPU.
 */
void rcutree_migrate_callbacks(int cpu)
{
	unsigned long flags;
	struct rcu_data *my_rdp;
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	struct rcu_node *rnp_root = rcu_get_root();
	bool needwake;

	if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
		return;  /* No callbacks to migrate. */

	local_irq_save(flags);
	my_rdp = this_cpu_ptr(&rcu_data);
	if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
		local_irq_restore(flags);
		return;
	}
	raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
	/* Leverage recent GPs and set GP for new callbacks. */
	needwake = rcu_advance_cbs(rnp_root, rdp) ||
		   rcu_advance_cbs(rnp_root, my_rdp);
	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
		     !rcu_segcblist_n_cbs(&my_rdp->cblist));
	raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
	if (needwake)
		rcu_gp_kthread_wake();
	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
		  !rcu_segcblist_empty(&rdp->cblist),
		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
		  rcu_segcblist_first_cb(&rdp->cblist));
}
#endif

/*
 * On non-huge systems, use expedited RCU grace periods to make suspend
 * and hibernation run faster.
 */
static int rcu_pm_notify(struct notifier_block *self,
			 unsigned long action, void *hcpu)
{
	switch (action) {
	case PM_HIBERNATION_PREPARE:
	case PM_SUSPEND_PREPARE:
		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
			rcu_expedite_gp();
		break;
	case PM_POST_HIBERNATION:
	case PM_POST_SUSPEND:
		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
			rcu_unexpedite_gp();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

/*
 * Spawn the kthreads that handle RCU's grace periods.
 */
static int __init rcu_spawn_gp_kthread(void)
{
	unsigned long flags;
	int kthread_prio_in = kthread_prio;
	struct rcu_node *rnp;
	struct sched_param sp;
	struct task_struct *t;

	/* Force priority into range. */
	if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
	    && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
		kthread_prio = 2;
	else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
		kthread_prio = 1;
	else if (kthread_prio < 0)
		kthread_prio = 0;
	else if (kthread_prio > 99)
		kthread_prio = 99;

	if (kthread_prio != kthread_prio_in)
		pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
			 kthread_prio, kthread_prio_in);

	rcu_scheduler_fully_active = 1;
	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
	BUG_ON(IS_ERR(t));
	rnp = rcu_get_root();
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rcu_state.gp_kthread = t;
	if (kthread_prio) {
		sp.sched_priority = kthread_prio;
		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
	}
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	wake_up_process(t);
	rcu_spawn_nocb_kthreads();
	rcu_spawn_boost_kthreads();
	return 0;
}
early_initcall(rcu_spawn_gp_kthread);

/*
 * This function is invoked towards the end of the scheduler's
 * initialization process.  Before this is called, the idle task might
 * contain synchronous grace-period primitives (during which time, this idle
 * task is booting the system, and such primitives are no-ops).  After this
 * function is called, any synchronous grace-period primitives are run as
 * expedited, with the requesting task driving the grace period forward.
 * A later core_initcall() rcu_set_runtime_mode() will switch to full
 * runtime RCU functionality.
 */
void rcu_scheduler_starting(void)
{
	WARN_ON(num_online_cpus() != 1);
	WARN_ON(nr_context_switches() > 0);
	rcu_test_sync_prims();
	rcu_scheduler_active = RCU_SCHEDULER_INIT;
	rcu_test_sync_prims();
}

/*
 * Helper function for rcu_init() that initializes one rcu_state structure.
 */
static void __init rcu_init_one(void)
{
	static const char * const buf[] = RCU_NODE_NAME_INIT;
	static const char * const fqs[] = RCU_FQS_NAME_INIT;
	static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
	static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];

	int levelspread[RCU_NUM_LVLS];		/* kids/node in each level. */
	int cpustride = 1;
	int i;
	int j;
	struct rcu_node *rnp;
	struct rcu_state *rsp = &rcu_state;

	BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */

	/* Silence gcc 4.8 false positive about array index out of range. */
	if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
		panic("rcu_init_one: rcu_num_lvls out of range");

	/* Initialize the level-tracking arrays. */

	for (i = 1; i < rcu_num_lvls; i++)
		rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1];
	rcu_init_levelspread(levelspread, num_rcu_lvl);

	/* Initialize the elements themselves, starting from the leaves. */

	for (i = rcu_num_lvls - 1; i >= 0; i--) {
		cpustride *= levelspread[i];
		rnp = rsp->level[i];
		for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
			raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
			lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
						   &rcu_node_class[i], buf[i]);
			raw_spin_lock_init(&rnp->fqslock);
			lockdep_set_class_and_name(&rnp->fqslock,
						   &rcu_fqs_class[i], fqs[i]);
			rnp->gp_seq = rsp->gp_seq;
			rnp->gp_seq_needed = rsp->gp_seq;
			rnp->completedqs = rsp->gp_seq;
			rnp->qsmask = 0;
			rnp->qsmaskinit = 0;
			rnp->grplo = j * cpustride;
			rnp->grphi = (j + 1) * cpustride - 1;
			if (rnp->grphi >= nr_cpu_ids)
				rnp->grphi = nr_cpu_ids - 1;
			if (i == 0) {
				rnp->grpnum = 0;
				rnp->grpmask = 0;
				rnp->parent = NULL;
			} else {
				rnp->grpnum = j % levelspread[i - 1];
				rnp->grpmask = 1UL << rnp->grpnum;
				rnp->parent = rsp->level[i - 1] +
					      j / levelspread[i - 1];
			}
			rnp->level = i;
			INIT_LIST_HEAD(&rnp->blkd_tasks);
			rcu_init_one_nocb(rnp);
			init_waitqueue_head(&rnp->exp_wq[0]);
			init_waitqueue_head(&rnp->exp_wq[1]);
			init_waitqueue_head(&rnp->exp_wq[2]);
			init_waitqueue_head(&rnp->exp_wq[3]);
			spin_lock_init(&rnp->exp_lock);
		}
	}

	init_swait_queue_head(&rsp->gp_wq);
	init_swait_queue_head(&rsp->expedited_wq);
	rnp = rcu_first_leaf_node();
	for_each_possible_cpu(i) {
		while (i > rnp->grphi)
			rnp++;
		per_cpu_ptr(&rcu_data, i)->mynode = rnp;
		rcu_boot_init_percpu_data(i);
	}
}

/*
 * Compute the rcu_node tree geometry from kernel parameters.  This cannot
 * replace the definitions in tree.h because those are needed to size
 * the ->node array in the rcu_state structure.
 */
static void __init rcu_init_geometry(void)
{
	ulong d;
	int i;
	int rcu_capacity[RCU_NUM_LVLS];

	/*
	 * Initialize any unspecified boot parameters.
	 * The default values of jiffies_till_first_fqs and
	 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
	 * value, which is a function of HZ, then adding one for each
	 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
	 */
	d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
	if (jiffies_till_first_fqs == ULONG_MAX)
		jiffies_till_first_fqs = d;
	if (jiffies_till_next_fqs == ULONG_MAX)
		jiffies_till_next_fqs = d;

	/* If the compile-time values are accurate, just leave. */
	if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
	    nr_cpu_ids == NR_CPUS)
		return;
	pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
		rcu_fanout_leaf, nr_cpu_ids);

	/*
	 * The boot-time rcu_fanout_leaf parameter must be at least two
	 * and cannot exceed the number of bits in the rcu_node masks.
	 * Complain and fall back to the compile-time values if this
	 * limit is exceeded.
	 */
	if (rcu_fanout_leaf < 2 ||
	    rcu_fanout_leaf > sizeof(unsigned long) * 8) {
		rcu_fanout_leaf = RCU_FANOUT_LEAF;
		WARN_ON(1);
		return;
	}

	/*
	 * Compute the number of nodes that can be handled by an rcu_node tree
	 * with the given number of levels.
	 */
	rcu_capacity[0] = rcu_fanout_leaf;
	for (i = 1; i < RCU_NUM_LVLS; i++)
		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;

	/*
	 * The tree must be able to accommodate the configured number of CPUs.
	 * If this limit is exceeded, fall back to the compile-time values.
	 */
	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
		rcu_fanout_leaf = RCU_FANOUT_LEAF;
		WARN_ON(1);
		return;
	}

	/* Calculate the number of levels in the tree. */
	for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
	}
	rcu_num_lvls = i + 1;

	/* Calculate the number of rcu_nodes at each level of the tree. */
	for (i = 0; i < rcu_num_lvls; i++) {
		int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
		num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
	}

	/* Calculate the total number of rcu_node structures. */
	rcu_num_nodes = 0;
	for (i = 0; i < rcu_num_lvls; i++)
		rcu_num_nodes += num_rcu_lvl[i];
}
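
/*
 * Worked example of the geometry computation above (illustrative only,
 * assuming RCU_FANOUT=64 and the default rcu_fanout_leaf=16): with
 * nr_cpu_ids=100, rcu_capacity[] begins {16, 1024, ...}, so two levels
 * suffice and rcu_num_lvls=2.  Then num_rcu_lvl[0] = DIV_ROUND_UP(100, 1024)
 * = 1 root node and num_rcu_lvl[1] = DIV_ROUND_UP(100, 16) = 7 leaf nodes,
 * for rcu_num_nodes = 8 rcu_node structures in total.
 */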

/*
 * Dump out the structure of the rcu_node combining tree associated
 * with the rcu_state structure.
 */
static void __init rcu_dump_rcu_node_tree(void)
{
	int level = 0;
	struct rcu_node *rnp;

	pr_info("rcu_node tree layout dump\n");
	pr_info(" ");
	rcu_for_each_node_breadth_first(rnp) {
		if (rnp->level != level) {
			pr_cont("\n");
			pr_info(" ");
			level = rnp->level;
		}
		pr_cont("%d:%d ^%d  ", rnp->grplo, rnp->grphi, rnp->grpnum);
	}
	pr_cont("\n");
}
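
/*
 * For the hypothetical two-level, 100-CPU geometry sketched above, the dump
 * might look something like the following (illustrative only; the exact
 * ranges depend on the configured fanout values):
 *
 *	rcu_node tree layout dump
 *	 0:99 ^0
 *	 0:15 ^0  16:31 ^1  32:47 ^2  48:63 ^3  64:79 ^4  80:95 ^5  96:99 ^6
 */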

struct workqueue_struct *rcu_gp_wq;
struct workqueue_struct *rcu_par_gp_wq;

void __init rcu_init(void)
{
	int cpu;

	rcu_early_boot_tests();

	rcu_bootup_announce();
	rcu_init_geometry();
	rcu_init_one();
	if (dump_tree)
		rcu_dump_rcu_node_tree();
	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);

	/*
	 * We don't need protection against CPU-hotplug here because
	 * this is called early in boot, before either interrupts
	 * or the scheduler are operational.
	 */
	pm_notifier(rcu_pm_notify, 0);
	for_each_online_cpu(cpu) {
		rcutree_prepare_cpu(cpu);
		rcu_cpu_starting(cpu);
		rcutree_online_cpu(cpu);
	}

	/* Create workqueue for expedited GPs and for Tree SRCU. */
	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
	WARN_ON(!rcu_gp_wq);
	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
	WARN_ON(!rcu_par_gp_wq);
}

#include "tree_exp.h"
#include "tree_plugin.h"