Commit d1b222c6 authored by Paul E. McKenney's avatar Paul E. McKenney

rcu/nocb: Add bypass callback queueing

Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs
takes advantage of unrelated grace periods, thus reducing the memory
footprint in the face of floods of call_rcu() invocations.  However,
the ->cblist field is a more-complex rcu_segcblist structure which must
be protected via locking.  Even though there are only three entities
which can acquire this lock (the CPU invoking call_rcu(), the no-CBs
grace-period kthread, and the no-CBs callbacks kthread), the contention
on this lock is excessive under heavy stress.

This commit therefore greatly reduces contention by provisioning
an rcu_cblist structure field named ->nocb_bypass within the
rcu_data structure.  Each no-CBs CPU is permitted only a limited
number of enqueues onto the ->cblist per jiffy, controlled by a new
nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to
about 16 enqueues per millisecond (16 * 1000 / HZ).  When that limit is
exceeded, the CPU instead enqueues onto the new ->nocb_bypass.

The ->nocb_bypass is flushed into the ->cblist every jiffy or when
the number of callbacks on ->nocb_bypass exceeds qhimark, whichever
happens first.  During call_rcu() floods, this flushing is carried out
by the CPU during the course of its call_rcu() invocations.  However,
a CPU could simply stop invoking call_rcu() at any time.  The no-CBs
grace-period kthread therefore carries out less-aggressive flushing
(every few jiffies or when the number of callbacks on ->nocb_bypass
exceeds (2 * qhimark), whichever comes first).  This means that the
no-CBs grace-period kthread cannot be permitted to do unbounded waits
while there are callbacks on ->nocb_bypass.  A ->nocb_bypass_timer is
used to provide the needed wakeups.

[ paulmck: Apply Coverity feedback reported by Colin Ian King. ]
Signed-off-by: default avatarPaul E. McKenney <paulmck@linux.ibm.com>
parent eda669a6
......@@ -36,6 +36,36 @@ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
WRITE_ONCE(rclp->len, rclp->len + 1);
}
/*
* Flush the second rcu_cblist structure onto the first one, obliterating
* any contents of the first. If rhp is non-NULL, enqueue it as the sole
* element of the second rcu_cblist structure, but ensuring that the second
* rcu_cblist structure, if initially non-empty, always appears non-empty
* throughout the process. If rdp is NULL, the second rcu_cblist structure
* is instead initialized to empty.
*/
void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
struct rcu_cblist *srclp,
struct rcu_head *rhp)
{
drclp->head = srclp->head;
if (drclp->head)
drclp->tail = srclp->tail;
else
drclp->tail = &drclp->head;
drclp->len = srclp->len;
drclp->len_lazy = srclp->len_lazy;
if (!rhp) {
rcu_cblist_init(srclp);
} else {
rhp->next = NULL;
srclp->head = rhp;
srclp->tail = &rhp->next;
WRITE_ONCE(srclp->len, 1);
srclp->len_lazy = 0;
}
}
/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
......
......@@ -25,6 +25,10 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
}
void rcu_cblist_init(struct rcu_cblist *rclp);
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
struct rcu_cblist *srclp,
struct rcu_head *rhp);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
......@@ -92,6 +96,7 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
}
void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
......
......@@ -1251,6 +1251,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
unsigned long gp_seq_req;
bool ret = false;
rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
......@@ -1292,7 +1293,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
unsigned long c;
bool needwake;
lockdep_assert_irqs_disabled();
rcu_lockdep_assert_cblist_protected(rdp);
c = rcu_seq_snap(&rcu_state.gp_seq);
if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
/* Old request still live, so mark recent callbacks. */
......@@ -1318,6 +1319,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
*/
static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
......@@ -1341,6 +1343,7 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
struct rcu_data *rdp)
{
rcu_lockdep_assert_cblist_protected(rdp);
if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
!raw_spin_trylock_rcu_node(rnp))
return;
......@@ -2187,7 +2190,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
count != 0 && rcu_segcblist_empty(&rdp->cblist));
rcu_nocb_unlock_irqrestore(rdp, flags);
......@@ -2564,8 +2569,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
}
rcu_nocb_lock(rdp);
was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
return; // Enqueued onto ->nocb_bypass, so just leave.
/* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
......@@ -2839,6 +2845,7 @@ static void rcu_barrier_func(void *unused)
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
rcu_nocb_lock(rdp);
WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
......@@ -3192,6 +3199,7 @@ void rcutree_migrate_callbacks(int cpu)
my_rdp = this_cpu_ptr(&rcu_data);
my_rnp = my_rdp->mynode;
rcu_nocb_lock(my_rdp); /* irqs already disabled. */
WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
/* Leverage recent GPs and set GP for new callbacks. */
needwake = rcu_advance_cbs(my_rnp, rdp) ||
......
......@@ -200,18 +200,26 @@ struct rcu_data {
atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
/* The following fields are used by call_rcu, hence own cacheline. */
raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */
unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */
unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */
int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */
/* The following fields are used by GP kthread, hence own cacheline. */
raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
bool nocb_gp_sleep;
/* Is the nocb GP thread asleep? */
struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
bool nocb_gp_sleep; /* Is the nocb GP thread asleep? */
struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
struct task_struct *nocb_cb_kthread;
struct rcu_data *nocb_next_cb_rdp;
/* Next rcu_data in wakeup chain. */
/* The following fields are used by CB kthread, hence new cachline. */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
/* GP rdp takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
......@@ -419,6 +427,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
unsigned long j);
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
bool *was_alldone, unsigned long flags);
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
......@@ -430,19 +442,15 @@ static void rcu_nocb_lock(struct rcu_data *rdp);
static void rcu_nocb_unlock(struct rcu_data *rdp);
static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
unsigned long flags);
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
#define rcu_nocb_lock_irqsave(rdp, flags) \
do { \
if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) { \
if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
local_irq_save(flags); \
} else if (!raw_spin_trylock_irqsave(&(rdp)->nocb_lock, (flags))) {\
atomic_inc(&(rdp)->nocb_lock_contended); \
smp_mb__after_atomic(); /* atomic_inc() before lock. */ \
else \
raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
smp_mb__before_atomic(); /* atomic_dec() after lock. */ \
atomic_dec(&(rdp)->nocb_lock_contended); \
} \
} while (0)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment