Commit 4eec42f3 authored by Mandeep Singh Baines, committed by Ingo Molnar

watchdog: Change the default timeout and configure nmi watchdog period based on watchdog_thresh

Before the conversion of the NMI watchdog to perf event, the
watchdog timeout was 5 seconds. Now it is 60 seconds. For my
particular application, netbooks, 5 seconds was a better
timeout. With a short timeout, we catch faults earlier and are
able to send back a panic. With a 60 second timeout, the user is
unlikely to wait and will instead hit the power button, causing
us to lose the panic info.

This change configures the NMI period to watchdog_thresh and
sets the softlockup_thresh to watchdog_thresh * 2. In addition,
watchdog_thresh was reduced to 10 seconds as suggested by Ingo
Molnar.
Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Cc: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/1306127423-3347-4-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20110517071642.GF22305@elte.hu>
parent 586692a5
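
The coupling described in the message is easiest to see with the numbers plugged in: with the new default of 10, the hard-lockup NMI period is ~10 seconds while the soft-lockup threshold becomes 20 seconds. A minimal standalone C sketch of that arithmetic (illustrative only; these variable names are not kernel identifiers):

#include <stdio.h>

int main(void)
{
	int watchdog_thresh = 10;			/* new default, seconds */
	int softlockup_thresh = watchdog_thresh * 2;	/* coupled soft threshold */

	printf("hardlockup threshold: %d s\n", watchdog_thresh);   /* 10 */
	printf("softlockup threshold: %d s\n", softlockup_thresh); /* 20 */
	return 0;
}

Before this patch, both detectors were effectively pinned to the single 60-second default.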
arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
 #include <linux/delay.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(void)
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
 {
-	return (u64)(cpu_khz) * 1000 * 60;
+	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
 }
 #endif
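
The value returned above is a perf sample period measured in CPU cycles: cpu_khz * 1000 is cycles per second, so counting that many cycles takes roughly watchdog_thresh seconds (approximate, since cpu_khz is the boot-time calibration and frequency scaling skews it). A standalone sketch of the computation, with an assumed 2 GHz clock:

#include <stdio.h>

int main(void)
{
	unsigned int cpu_khz = 2000000;	/* assumed 2 GHz CPU, for illustration */
	int watchdog_thresh = 10;	/* seconds */

	/* cycles per second * seconds = cycles until the perf NMI fires */
	unsigned long long period =
		(unsigned long long)cpu_khz * 1000 * watchdog_thresh;

	printf("sample_period = %llu cycles\n", period);	/* 20000000000 */
	return 0;
}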
include/linux/nmi.h
@@ -45,7 +45,7 @@ static inline bool trigger_all_cpu_backtrace(void)
 #ifdef CONFIG_LOCKUP_DETECTOR
 int hw_nmi_is_cpu_stuck(struct pt_regs *);
-u64 hw_nmi_get_sample_period(void);
+u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_enabled;
 extern int watchdog_thresh;
 struct ctl_table;
kernel/watchdog.c
@@ -28,7 +28,7 @@
 #include <linux/perf_event.h>
 
 int watchdog_enabled = 1;
-int __read_mostly watchdog_thresh = 60;
+int __read_mostly watchdog_thresh = 10;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
 __setup("nosoftlockup", nosoftlockup_setup);
 /*  */
 
+/*
+ * Hard-lockup warnings should be triggered after just a few seconds. Soft-
+ * lockups can have false positives under extreme conditions. So we generally
+ * want a higher threshold for soft lockups than for hard lockups. So we couple
+ * the thresholds with a factor: we make the soft threshold twice the amount of
+ * time the hard threshold is.
+ */
+static int get_softlockup_thresh(void)
+{
+	return watchdog_thresh * 2;
+}
 /*
  * Returns seconds, approximately. We don't need nanosecond
@@ -110,7 +121,7 @@ static unsigned long get_sample_period(void)
 	 * increment before the hardlockup detector generates
 	 * a warning
 	 */
-	return watchdog_thresh * (NSEC_PER_SEC / 5);
+	return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
 }
 
 /* Commands for resetting the watchdog */
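
With the defaults above, get_sample_period() now returns 20 * (NSEC_PER_SEC / 5) = 4 seconds' worth of nanoseconds, i.e. the soft-lockup hrtimer gets five chances to fire within each 20-second window. A standalone check of that arithmetic (a sketch, not kernel code):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL	/* as in the kernel's time headers */

int main(void)
{
	int watchdog_thresh = 10;		/* seconds, new default */
	int soft_thresh = watchdog_thresh * 2;	/* get_softlockup_thresh() */

	/* hrtimer period: 1/5 of the soft-lockup window, in nanoseconds */
	unsigned long long sample_period = soft_thresh * (NSEC_PER_SEC / 5);

	printf("hrtimer period: %llu ns (%llu s)\n",
	       sample_period, sample_period / NSEC_PER_SEC);	/* 4 s */
	return 0;
}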
@@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)
 	unsigned long now = get_timestamp(smp_processor_id());
 
 	/* Warn about unreasonable delays: */
-	if (time_after(now, touch_ts + watchdog_thresh))
+	if (time_after(now, touch_ts + get_softlockup_thresh()))
 		return now - touch_ts;
 
 	return 0;
@@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu)
 	/* Try to register using hardware perf events */
 	wd_attr = &wd_hw_attr;
-	wd_attr->sample_period = hw_nmi_get_sample_period();
+	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
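
Taken together: the perf NMI now checks for a stuck hrtimer roughly every 10 seconds, while is_softlockup() warns once a CPU has gone more than 20 seconds without the watchdog task touching its timestamp. A sketch of that comparison with second-granularity timestamps (the helper name is illustrative, not the kernel's):

#include <stdio.h>

static const unsigned long soft_thresh = 20;	/* watchdog_thresh * 2 */

/* Mirrors the is_softlockup() check: returns the stall length, or 0. */
static unsigned long softlockup_delay(unsigned long now, unsigned long touch_ts)
{
	if (now > touch_ts + soft_thresh)	/* stands in for time_after() */
		return now - touch_ts;
	return 0;
}

int main(void)
{
	printf("%lu\n", softlockup_delay(115, 100));	/* 0: within the window */
	printf("%lu\n", softlockup_delay(125, 100));	/* 25: report a 25 s stall */
	return 0;
}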