Commit be45bf53 authored by Peter Zijlstra, committed by Ingo Molnar

watchdog/softlockup: Fix cpu_stop_queue_work() double-queue bug

When scheduling is delayed for longer than the softlockup interrupt
period it is possible to double-queue the cpu_stop_work, causing list
corruption.

Cure this by adding a completion to track the cpu_stop_work's
progress.
Reported-by: kernel test robot <lkp@intel.com>
Tested-by: Rong Chen <rong.a.chen@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 9cf57731 ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work")
Link: http://lkml.kernel.org/r/20180713104208.GW2494@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent fdf2ceb7
...@@ -330,6 +330,9 @@ static void watchdog_interrupt_count(void) ...@@ -330,6 +330,9 @@ static void watchdog_interrupt_count(void)
__this_cpu_inc(hrtimer_interrupts); __this_cpu_inc(hrtimer_interrupts);
} }
static DEFINE_PER_CPU(struct completion, softlockup_completion);
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
/* /*
* The watchdog thread function - touches the timestamp. * The watchdog thread function - touches the timestamp.
* *
...@@ -343,12 +346,11 @@ static int softlockup_fn(void *data) ...@@ -343,12 +346,11 @@ static int softlockup_fn(void *data)
__this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts)); __this_cpu_read(hrtimer_interrupts));
__touch_watchdog(); __touch_watchdog();
complete(this_cpu_ptr(&softlockup_completion));
return 0; return 0;
} }
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
/* watchdog kicker functions */ /* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{ {
...@@ -364,9 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) ...@@ -364,9 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
watchdog_interrupt_count(); watchdog_interrupt_count();
/* kick the softlockup detector */ /* kick the softlockup detector */
stop_one_cpu_nowait(smp_processor_id(), if (completion_done(this_cpu_ptr(&softlockup_completion))) {
softlockup_fn, NULL, reinit_completion(this_cpu_ptr(&softlockup_completion));
this_cpu_ptr(&softlockup_stop_work)); stop_one_cpu_nowait(smp_processor_id(),
softlockup_fn, NULL,
this_cpu_ptr(&softlockup_stop_work));
}
/* .. and repeat */ /* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
...@@ -467,9 +472,13 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) ...@@ -467,9 +472,13 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
static void watchdog_enable(unsigned int cpu) static void watchdog_enable(unsigned int cpu)
{ {
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
struct completion *done = this_cpu_ptr(&softlockup_completion);
WARN_ON_ONCE(cpu != smp_processor_id()); WARN_ON_ONCE(cpu != smp_processor_id());
init_completion(done);
complete(done);
/* /*
* Start the timer first to prevent the NMI watchdog triggering * Start the timer first to prevent the NMI watchdog triggering
* before the timer has a chance to fire. * before the timer has a chance to fire.
...@@ -499,6 +508,7 @@ static void watchdog_disable(unsigned int cpu) ...@@ -499,6 +508,7 @@ static void watchdog_disable(unsigned int cpu)
*/ */
watchdog_nmi_disable(cpu); watchdog_nmi_disable(cpu);
hrtimer_cancel(hrtimer); hrtimer_cancel(hrtimer);
wait_for_completion(this_cpu_ptr(&softlockup_completion));
} }
static int softlockup_stop_fn(void *data) static int softlockup_stop_fn(void *data)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment