Commit fe5ac724 authored by Paul E. McKenney

rcu: Remove nohz_full full-system-idle state machine

The NO_HZ_FULL_SYSIDLE full-system-idle capability was added in 2013
by commit 0edd1b17 ("nohz_full: Add full-system-idle state machine"),
but has not been used.  This commit therefore removes it.

If it turns out to be needed later, this commit can always be reverted.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
parent f7a10a97
@@ -2520,11 +2520,7 @@ It is similarly socially unacceptable to interrupt an
 <tt>nohz_full</tt> CPU running in userspace.
 RCU must therefore track <tt>nohz_full</tt> userspace
 execution.
-And in
-<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
-kernels, RCU must separately track idle CPUs on the one hand and
-CPUs that are either idle or executing in userspace on the other.
-In both cases, RCU must be able to sample state at two points in
+RCU must therefore be able to sample state at two points in
 time, and be able to determine whether or not some other CPU spent
 any time idle and/or executing in userspace.
......
...@@ -854,15 +854,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) ...@@ -854,15 +854,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
#define kfree_rcu(ptr, rcu_head) \ #define kfree_rcu(ptr, rcu_head) \
__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
/* Only for use by adaptive-ticks code. */
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
bool rcu_sys_is_idle(void);
void rcu_sysidle_force_exit(void);
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
static inline bool rcu_sys_is_idle(void) { return false; }
static inline void rcu_sysidle_force_exit(void) { }
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
/* /*
* Place this after a lock-acquisition primitive to guarantee that * Place this after a lock-acquisition primitive to guarantee that
......
...@@ -270,10 +270,6 @@ void rcu_bh_qs(void) ...@@ -270,10 +270,6 @@ void rcu_bh_qs(void)
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
.dynticks_idle = ATOMIC_INIT(1),
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}; };
/* /*
...@@ -546,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644); ...@@ -546,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644);
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp); struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp, static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
int (*f)(struct rcu_data *rsp, bool *isidle,
unsigned long *maxj),
bool *isidle, unsigned long *maxj);
static void force_quiescent_state(struct rcu_state *rsp); static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void); static int rcu_pending(void);
...@@ -854,7 +847,6 @@ void rcu_idle_enter(void) ...@@ -854,7 +847,6 @@ void rcu_idle_enter(void)
local_irq_save(flags); local_irq_save(flags);
rcu_eqs_enter(false); rcu_eqs_enter(false);
rcu_sysidle_enter(0);
local_irq_restore(flags); local_irq_restore(flags);
} }
EXPORT_SYMBOL_GPL(rcu_idle_enter); EXPORT_SYMBOL_GPL(rcu_idle_enter);
...@@ -904,7 +896,6 @@ void rcu_irq_exit(void) ...@@ -904,7 +896,6 @@ void rcu_irq_exit(void)
trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
rdtp->dynticks_nesting--; rdtp->dynticks_nesting--;
} }
rcu_sysidle_enter(1);
} }
/* /*
...@@ -986,7 +977,6 @@ void rcu_idle_exit(void) ...@@ -986,7 +977,6 @@ void rcu_idle_exit(void)
local_irq_save(flags); local_irq_save(flags);
rcu_eqs_exit(false); rcu_eqs_exit(false);
rcu_sysidle_exit(0);
local_irq_restore(flags); local_irq_restore(flags);
} }
EXPORT_SYMBOL_GPL(rcu_idle_exit); EXPORT_SYMBOL_GPL(rcu_idle_exit);
...@@ -1038,7 +1028,6 @@ void rcu_irq_enter(void) ...@@ -1038,7 +1028,6 @@ void rcu_irq_enter(void)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else else
rcu_eqs_exit_common(oldval, true); rcu_eqs_exit_common(oldval, true);
rcu_sysidle_exit(1);
} }
/* /*
...@@ -1217,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) ...@@ -1217,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
* credit them with an implicit quiescent state. Return 1 if this CPU * credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state. * is in dynticks idle mode, which is an extended quiescent state.
*/ */
static int dyntick_save_progress_counter(struct rcu_data *rdp, static int dyntick_save_progress_counter(struct rcu_data *rdp)
bool *isidle, unsigned long *maxj)
{ {
rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
rcu_sysidle_check_cpu(rdp, isidle, maxj);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
...@@ -1238,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, ...@@ -1238,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
* idle state since the last call to dyntick_save_progress_counter() * idle state since the last call to dyntick_save_progress_counter()
* for this same CPU, or by virtue of having been offline. * for this same CPU, or by virtue of having been offline.
*/ */
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
bool *isidle, unsigned long *maxj)
{ {
unsigned long jtsq; unsigned long jtsq;
bool *rnhqp; bool *rnhqp;
...@@ -2105,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) ...@@ -2105,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
*/ */
static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
{ {
bool isidle = false;
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp); struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies); WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++; rsp->n_force_qs++;
if (first_time) { if (first_time) {
/* Collect dyntick-idle snapshots. */ /* Collect dyntick-idle snapshots. */
if (is_sysidle_rcu_state(rsp)) { force_qs_rnp(rsp, dyntick_save_progress_counter);
isidle = true;
maxj = jiffies - ULONG_MAX / 4;
}
force_qs_rnp(rsp, dyntick_save_progress_counter,
&isidle, &maxj);
rcu_sysidle_report_gp(rsp, isidle, maxj);
} else { } else {
/* Handle dyntick-idle and offline CPUs. */ /* Handle dyntick-idle and offline CPUs. */
isidle = true; force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
} }
/* Clear flag to prevent immediate re-entry. */ /* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
...@@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user) ...@@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user)
* *
* The caller must have suppressed start of new grace periods. * The caller must have suppressed start of new grace periods.
*/ */
static void force_qs_rnp(struct rcu_state *rsp, static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
int (*f)(struct rcu_data *rsp, bool *isidle,
unsigned long *maxj),
bool *isidle, unsigned long *maxj)
{ {
int cpu; int cpu;
unsigned long flags; unsigned long flags;
...@@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp, ...@@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
for_each_leaf_node_possible_cpu(rnp, cpu) { for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long bit = leaf_node_cpu_bit(rnp, cpu); unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) { if ((rnp->qsmask & bit) != 0) {
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) if (f(per_cpu_ptr(rsp->rda, cpu)))
mask |= bit; mask |= bit;
} }
} }
...@@ -3793,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) ...@@ -3793,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
!init_nocb_callback_list(rdp)) !init_nocb_callback_list(rdp))
rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_sysidle_init_percpu_data(rdp->dynticks);
rcu_dynticks_eqs_online(); rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
......
...@@ -45,14 +45,6 @@ struct rcu_dynticks { ...@@ -45,14 +45,6 @@ struct rcu_dynticks {
bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_urgent_qs; /* GP old need light quiescent state. */
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
long long dynticks_idle_nesting;
/* irq/process nesting level from idle. */
atomic_t dynticks_idle; /* Even value for idle, else odd. */
/* "Idle" excludes userspace execution. */
unsigned long dynticks_idle_jiffies;
/* End of last non-NMI non-idle period. */
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
#ifdef CONFIG_RCU_FAST_NO_HZ #ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* Are all CPU's CBs lazy? */ bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted; unsigned long nonlazy_posted;
...@@ -529,15 +521,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); ...@@ -529,15 +521,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp); static bool init_nocb_callback_list(struct rcu_data *rdp);
static void rcu_sysidle_enter(int irq);
static void rcu_sysidle_exit(int irq);
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj);
static bool is_sysidle_rcu_state(struct rcu_state *rsp);
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj);
static void rcu_bind_gp_kthread(void); static void rcu_bind_gp_kthread(void);
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void); static void rcu_dynticks_task_exit(void);
......
...@@ -2563,429 +2563,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) ...@@ -2563,429 +2563,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
#endif /* #ifdef CONFIG_NO_HZ_FULL */ #endif /* #ifdef CONFIG_NO_HZ_FULL */
} }
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
static int full_sysidle_state; /* Current system-idle state. */
#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
/*
* Invoked to note exit from irq or task transition to idle. Note that
* usermode execution does -not- count as idle here! After all, we want
* to detect full-system idle states, not RCU quiescent states and grace
* periods. The caller must have disabled interrupts.
*/
static void rcu_sysidle_enter(int irq)
{
unsigned long j;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_enter() invoked with irqs enabled!!!");
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
return;
/* Adjust nesting, check for fully idle. */
if (irq) {
rdtp->dynticks_idle_nesting--;
WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
if (rdtp->dynticks_idle_nesting != 0)
return; /* Still not fully idle. */
} else {
if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_idle_nesting = 0;
} else {
rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
return; /* Still not fully idle. */
}
}
/* Record start of fully idle period. */
j = jiffies;
WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
smp_mb__after_atomic();
WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
}
/*
* Unconditionally force exit from full system-idle state. This is
* invoked when a normal CPU exits idle, but must be called separately
* for the timekeeping CPU (tick_do_timer_cpu). The reason for this
* is that the timekeeping CPU is permitted to take scheduling-clock
* interrupts while the system is in system-idle state, and of course
* rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
* interrupt from any other type of interrupt.
*/
void rcu_sysidle_force_exit(void)
{
int oldstate = READ_ONCE(full_sysidle_state);
int newoldstate;
/*
* Each pass through the following loop attempts to exit full
* system-idle state. If contention proves to be a problem,
* a trylock-based contention tree could be used here.
*/
while (oldstate > RCU_SYSIDLE_SHORT) {
newoldstate = cmpxchg(&full_sysidle_state,
oldstate, RCU_SYSIDLE_NOT);
if (oldstate == newoldstate &&
oldstate == RCU_SYSIDLE_FULL_NOTED) {
rcu_kick_nohz_cpu(tick_do_timer_cpu);
return; /* We cleared it, done! */
}
oldstate = newoldstate;
}
smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
}
/*
* Invoked to note entry to irq or task transition from idle. Note that
* usermode execution does -not- count as idle here! The caller must
* have disabled interrupts.
*/
static void rcu_sysidle_exit(int irq)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_exit() invoked with irqs enabled!!!");
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
return;
/* Adjust nesting, check for already non-idle. */
if (irq) {
rdtp->dynticks_idle_nesting++;
WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
if (rdtp->dynticks_idle_nesting != 1)
return; /* Already non-idle. */
} else {
/*
* Allow for irq misnesting. Yes, it really is possible
* to enter an irq handler then never leave it, and maybe
* also vice versa. Handle both possibilities.
*/
if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
return; /* Already non-idle. */
} else {
rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
}
}
/* Record end of idle period. */
smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
smp_mb__after_atomic();
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
/*
* If we are the timekeeping CPU, we are permitted to be non-idle
* during a system-idle state. This must be the case, because
* the timekeeping CPU has to take scheduling-clock interrupts
* during the time that the system is transitioning to full
* system-idle state. This means that the timekeeping CPU must
* invoke rcu_sysidle_force_exit() directly if it does anything
* more than take a scheduling-clock interrupt.
*/
if (smp_processor_id() == tick_do_timer_cpu)
return;
/* Update system-idle state: We are clearly no longer fully idle! */
rcu_sysidle_force_exit();
}
/*
* Check to see if the current CPU is idle. Note that usermode execution
* does not count as idle. The caller must have disabled interrupts,
* and must be running on tick_do_timer_cpu.
*/
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj)
{
int cur;
unsigned long j;
struct rcu_dynticks *rdtp = rdp->dynticks;
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_check_cpu() invoked with irqs enabled!!!");
/* If there are no nohz_full= CPUs, don't check system-wide idleness. */
if (!tick_nohz_full_enabled())
return;
/*
* If some other CPU has already reported non-idle, if this is
* not the flavor of RCU that tracks sysidle state, or if this
* is an offline or the timekeeping CPU, nothing to do.
*/
if (!*isidle || rdp->rsp != rcu_state_p ||
cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
return;
/* Verify affinity of current kthread. */
WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
/* Pick up current idle and NMI-nesting counter and check. */
cur = atomic_read(&rdtp->dynticks_idle);
if (cur & 0x1) {
*isidle = false; /* We are not idle! */
return;
}
smp_mb(); /* Read counters before timestamps. */
/* Pick up timestamps. */
j = READ_ONCE(rdtp->dynticks_idle_jiffies);
/* If this CPU entered idle more recently, update maxj timestamp. */
if (ULONG_CMP_LT(*maxj, j))
*maxj = j;
}
/*
* Is this the flavor of RCU that is handling full-system idle?
*/
static bool is_sysidle_rcu_state(struct rcu_state *rsp)
{
return rsp == rcu_state_p;
}
/*
* Return a delay in jiffies based on the number of CPUs, rcu_node
* leaf fanout, and jiffies tick rate. The idea is to allow larger
* systems more time to transition to full-idle state in order to
* avoid the cache thrashing that otherwise occur on the state variable.
* Really small systems (less than a couple of tens of CPUs) should
* instead use a single global atomically incremented counter, and later
* versions of this will automatically reconfigure themselves accordingly.
*/
static unsigned long rcu_sysidle_delay(void)
{
if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
return 0;
return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
}
/*
* Advance the full-system-idle state. This is invoked when all of
* the non-timekeeping CPUs are idle.
*/
static void rcu_sysidle(unsigned long j)
{
/* Check the current state. */
switch (READ_ONCE(full_sysidle_state)) {
case RCU_SYSIDLE_NOT:
/* First time all are idle, so note a short idle period. */
WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
break;
case RCU_SYSIDLE_SHORT:
/*
* Idle for a bit, time to advance to next state?
* cmpxchg failure means race with non-idle, let them win.
*/
if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
(void)cmpxchg(&full_sysidle_state,
RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
break;
case RCU_SYSIDLE_LONG:
/*
* Do an additional check pass before advancing to full.
* cmpxchg failure means race with non-idle, let them win.
*/
if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
(void)cmpxchg(&full_sysidle_state,
RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
break;
default:
break;
}
}
/*
* Found a non-idle non-timekeeping CPU, so kick the system-idle state
* back to the beginning.
*/
static void rcu_sysidle_cancel(void)
{
smp_mb();
if (full_sysidle_state > RCU_SYSIDLE_SHORT)
WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
}
/*
* Update the sysidle state based on the results of a force-quiescent-state
* scan of the CPUs' dyntick-idle state.
*/
static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
unsigned long maxj, bool gpkt)
{
if (rsp != rcu_state_p)
return; /* Wrong flavor, ignore. */
if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
return; /* Running state machine from timekeeping CPU. */
if (isidle)
rcu_sysidle(maxj); /* More idle! */
else
rcu_sysidle_cancel(); /* Idle is over. */
}
/*
* Wrapper for rcu_sysidle_report() when called from the grace-period
* kthread's context.
*/
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
return;
rcu_sysidle_report(rsp, isidle, maxj, true);
}
/* Callback and function for forcing an RCU grace period. */
struct rcu_sysidle_head {
struct rcu_head rh;
int inuse;
};
static void rcu_sysidle_cb(struct rcu_head *rhp)
{
struct rcu_sysidle_head *rshp;
/*
* The following memory barrier is needed to replace the
* memory barriers that would normally be in the memory
* allocator.
*/
smp_mb(); /* grace period precedes setting inuse. */
rshp = container_of(rhp, struct rcu_sysidle_head, rh);
WRITE_ONCE(rshp->inuse, 0);
}
/*
* Check to see if the system is fully idle, other than the timekeeping CPU.
* The caller must have disabled interrupts. This is not intended to be
* called unless tick_nohz_full_enabled().
*/
bool rcu_sys_is_idle(void)
{
static struct rcu_sysidle_head rsh;
int rss = READ_ONCE(full_sysidle_state);
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sys_is_idle() invoked with irqs enabled!!!");
if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
return false;
/* Handle small-system case by doing a full scan of CPUs. */
if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
int oldrss = rss - 1;
/*
* One pass to advance to each state up to _FULL.
* Give up if any pass fails to advance the state.
*/
while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
int cpu;
bool isidle = true;
unsigned long maxj = jiffies - ULONG_MAX / 4;
struct rcu_data *rdp;
/* Scan all the CPUs looking for nonidle CPUs. */
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
if (!isidle)
break;
}
rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
oldrss = rss;
rss = READ_ONCE(full_sysidle_state);
}
}
/* If this is the first observation of an idle period, record it. */
if (rss == RCU_SYSIDLE_FULL) {
rss = cmpxchg(&full_sysidle_state,
RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
return rss == RCU_SYSIDLE_FULL;
}
smp_mb(); /* ensure rss load happens before later caller actions. */
/* If already fully idle, tell the caller (in case of races). */
if (rss == RCU_SYSIDLE_FULL_NOTED)
return true;
/*
* If we aren't there yet, and a grace period is not in flight,
* initiate a grace period. Either way, tell the caller that
* we are not there yet. We use an xchg() rather than an assignment
* to make up for the memory barriers that would otherwise be
* provided by the memory allocator.
*/
if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
!rcu_gp_in_progress(rcu_state_p) &&
!rsh.inuse && xchg(&rsh.inuse, 1) == 0)
call_rcu(&rsh.rh, rcu_sysidle_cb);
return false;
}
/*
* Initialize dynticks sysidle state for CPUs coming online.
*/
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
{
rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
}
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
static void rcu_sysidle_enter(int irq)
{
}
static void rcu_sysidle_exit(int irq)
{
}
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj)
{
}
static bool is_sysidle_rcu_state(struct rcu_state *rsp)
{
return false;
}
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
}
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
{
}
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
/* /*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing? * grace-period kthread will do force_quiescent_state() processing?
...@@ -3016,13 +2593,7 @@ static void rcu_bind_gp_kthread(void) ...@@ -3016,13 +2593,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled()) if (!tick_nohz_full_enabled())
return; return;
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
cpu = tick_do_timer_cpu;
if (cpu >= 0 && cpu < nr_cpu_ids)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
housekeeping_affine(current); housekeeping_affine(current);
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
} }
/* Record the current task on dyntick-idle entry. */ /* Record the current task on dyntick-idle entry. */
......
...@@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL ...@@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL
Note the boot CPU will still be kept outside the range to Note the boot CPU will still be kept outside the range to
handle the timekeeping duty. handle the timekeeping duty.
config NO_HZ_FULL_SYSIDLE
bool "Detect full-system idle state for full dynticks system"
depends on NO_HZ_FULL
default n
help
At least one CPU must keep the scheduling-clock tick running for
timekeeping purposes whenever there is a non-idle CPU, where
"non-idle" also includes dynticks CPUs as long as they are
running non-idle tasks. Because the underlying adaptive-tick
support cannot distinguish between all CPUs being idle and
all CPUs each running a single task in dynticks mode, the
underlying support simply ensures that there is always a CPU
handling the scheduling-clock tick, whether or not all CPUs
are idle. This Kconfig option enables scalable detection of
the all-CPUs-idle state, thus allowing the scheduling-clock
tick to be disabled when all CPUs are idle. Note that scalable
detection of the all-CPUs-idle state means that larger systems
will be slower to declare the all-CPUs-idle state.
Say Y if you would like to help debug all-CPUs-idle detection.
Say N if you are unsure.
config NO_HZ_FULL_SYSIDLE_SMALL
int "Number of CPUs above which large-system approach is used"
depends on NO_HZ_FULL_SYSIDLE
range 1 NR_CPUS
default 8
help
The full-system idle detection mechanism takes a lazy approach
on large systems, as is required to attain decent scalability.
However, on smaller systems, scalability is not anywhere near as
large a concern as is energy efficiency. The sysidle subsystem
therefore uses a fast but non-scalable algorithm for small
systems and a lazier but scalable algorithm for large systems.
This Kconfig parameter defines the number of CPUs in the largest
system that will be considered to be "small".
The default value will be fine in most cases. Battery-powered
systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
numbers of CPUs, and (3) are suffering from battery-lifetime
problems due to long sysidle latencies might wish to experiment
with larger values for this Kconfig parameter. On the other
hand, they might be even better served by disabling NO_HZ_FULL
entirely, given that NO_HZ_FULL is intended for HPC and
real-time workloads that at present do not tend to be run on
battery-powered systems.
Take the default if you are unsure.
config NO_HZ config NO_HZ
bool "Old Idle dynticks config" bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
......
...@@ -8,7 +8,6 @@ CONFIG_HZ_PERIODIC=n ...@@ -8,7 +8,6 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_IDLE=n
CONFIG_NO_HZ_FULL=y CONFIG_NO_HZ_FULL=y
CONFIG_NO_HZ_FULL_ALL=n CONFIG_NO_HZ_FULL_ALL=n
CONFIG_NO_HZ_FULL_SYSIDLE=y
CONFIG_RCU_FAST_NO_HZ=n CONFIG_RCU_FAST_NO_HZ=n
CONFIG_RCU_TRACE=y CONFIG_RCU_TRACE=y
CONFIG_HOTPLUG_CPU=y CONFIG_HOTPLUG_CPU=y
......
...@@ -18,7 +18,6 @@ CONFIG_PROVE_RCU ...@@ -18,7 +18,6 @@ CONFIG_PROVE_RCU
In common code tested by TREE_RCU test cases. In common code tested by TREE_RCU test cases.
CONFIG_NO_HZ_FULL_SYSIDLE
CONFIG_RCU_NOCB_CPU CONFIG_RCU_NOCB_CPU
Meaningless for TINY_RCU. Meaningless for TINY_RCU.
......
...@@ -9,8 +9,7 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one. ...@@ -9,8 +9,7 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one.
CONFIG_HOTPLUG_CPU -- Do half. (Every second.) CONFIG_HOTPLUG_CPU -- Do half. (Every second.)
CONFIG_HZ_PERIODIC -- Do one. CONFIG_HZ_PERIODIC -- Do one.
CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.) CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.)
CONFIG_NO_HZ_FULL -- Do two, one with CONFIG_NO_HZ_FULL_SYSIDLE. CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement.
CONFIG_NO_HZ_FULL_SYSIDLE -- Do one.
CONFIG_PREEMPT -- Do half. (First three and #8.) CONFIG_PREEMPT -- Do half. (First three and #8.)
CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not. CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING. CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
...@@ -48,10 +47,6 @@ CONFIG_64BIT ...@@ -48,10 +47,6 @@ CONFIG_64BIT
Used only to check CONFIG_RCU_FANOUT value, inspection suffices. Used only to check CONFIG_RCU_FANOUT value, inspection suffices.
CONFIG_NO_HZ_FULL_SYSIDLE_SMALL
Defer until Frederic uses this.
CONFIG_PREEMPT_COUNT CONFIG_PREEMPT_COUNT
CONFIG_PREEMPT_RCU CONFIG_PREEMPT_RCU
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment