[PATCH] hangcheck-timer

Patch from: Joel Becker <Joel.Becker@oracle.com> This kernel module will detect long durations when jiffies has failed to increment, and will reboot the machine in response. Joel says: "Here's why Oracle wants such a thing. We run clusters. Imagine a two node cluster. Node1 pauses completely for some reason. There are multiple reasons this can happen. A bad driver can udelay() for 90 seconds (qla used to do this). zVM on S/390 can page Linux out for minutes at a time. Anything that causes the box to freeze. Jiffies does *not* count during this, so when Node1 returns it feels that no time has passed. Node2, however, has been counting time. When Node1 goes away, the Oracle cluster manager starts looking for it. After a timeout, it gives up. It then recovers any in-progress transactions from Node1. After that, it starts new operations, modifying the data in ways that Node1 has no idea about (it's still out to lunch). When Node1 finally returns (udelay() ends, zVM pages it in, whatever), any I/O that it has queued or is about to queue will get sent to the disk. Oops, you've just corrupted your shared data. hangcheck-timer should catch this and reboot the box. This is why Oracle wants this driver. We figure that such functionality would be beneficial to others as well, so we posted to l-k. We'd all hope that driver writers don't udelay() for 90s, but S/390 with zVM is still around. Some folks might want to notice when it happens. I am sure other things exist that trigger the same symptoms."

[PATCH] hangcheck-timer
Patch from: Joel Becker <Joel.Becker@oracle.com> This kernel module will detect long durations when jiffies has failed to increment, and will reboot the machine in response. Joel says: "Here's why Oracle wants such a thing. We run clusters. Imagine a two node cluster. Node1 pauses completely for some reason. There are multiple reasons this can happen. A bad driver can udelay() for 90 seconds (qla used to do this). zVM on S/390 can page Linux out for minutes at a time. Anything that causes the box to freeze. Jiffies does *not* count during this, so when Node1 returns it feels that no time has passed. Node2, however, has been counting time. When Node1 goes away, the Oracle cluster manager starts looking for it. After a timeout, it gives up. It then recovers any in-progress transactions from Node1. After that, it starts new operations, modifying the data in ways that Node1 has no idea about (it's still out to lunch). When Node1 finally returns (udelay() ends, zVM pages it in, whatever), any I/O that it has queued or is about to queue will get sent to the disk. Oops, you've just corrupted your shared data. hangcheck-timer should catch this and reboot the box. This is why Oracle wants this driver. We figure that such functionality would be beneficial to others as well, so we posted to l-k. We'd all hope that driver writers don't udelay() for 90s, but S/390 with zVM is still around. Some folks might want to notice when it happens. I am sure other things exist that trigger the same symptoms."
5dd7d1b6 · Andrew Morton · Linus Torvalds · 46052b73 · 5dd7d1b6 · 5dd7d1b6
Commit 5dd7d1b6 authored Feb 03, 2003 by Andrew Morton Committed by Linus Torvalds Feb 03, 2003
Hide whitespace changes
Inline Side-by-side

Showing with 135 additions and 0 deletions

drivers/char/Kconfig drivers/char/Kconfig +7 -0

drivers/char/Makefile drivers/char/Makefile +1 -0

drivers/char/hangcheck-timer.c drivers/char/hangcheck-timer.c +127 -0

No files found.
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -992,5 +992,12 @@ config RAW_DRIVER
 	  Once bound, I/O against /dev/raw/rawN uses efficient zero-copy I/O. 
 	  See the raw(8) manpage for more details.

+config HANGCHECK_TIMER
+	tristate "Hangcheck timer"
+	help
+	  The hangcheck-timer module detects when the system has gone
+	  out to lunch past a certain margin.  It can reboot the system
+	  or merely print a warning.
+
 endmenu

--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -77,6 +77,7 @@ obj-$(CONFIG_DRM) += drm/
 obj-$(CONFIG_PCMCIA) += pcmcia/
 obj-$(CONFIG_IPMI_HANDLER) += ipmi/

+obj-$(CONFIG_HANGCHECK_TIMER) += hangcheck-timer.o

 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c qtronixmap.c

--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
+/*
+ * hangcheck-timer.c
+ *
+ * Driver for a little io fencing timer.
+ *
+ * Copyright (C) 2002 Oracle Corporation.  All rights reserved.
+ *
+ * Author: Joel Becker <joel.becker@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * The hangcheck-timer driver uses the TSC to catch delays that
+ * jiffies does not notice.  A timer is set.  When the timer fires, it
+ * checks whether it was delayed and if that delay exceeds a given
+ * margin of error.  The hangcheck_tick module parameter takes the timer
+ * duration in seconds.  The hangcheck_margin parameter defines the
+ * margin of error, in seconds.  The defaults are 180 seconds for the
+ * timer and 60 seconds for the margin of error.  IOW, a timer is set
+ * for 180 seconds.  When the timer fires, the callback checks the
+ * actual duration that the timer waited.  If the duration exceeds the
+ * allotted time and margin (here 180 + 60, or 240 seconds), the machine
+ * is restarted if hangcheck_reboot is set; otherwise a warning is
+ * printed.  A healthy machine will have the duration match the
+ * expected timeout very closely.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+
+#define VERSION_STR "0.5.0"
+
+#define DEFAULT_IOFENCE_MARGIN 60	/* Default fudge factor, in seconds */
+#define DEFAULT_IOFENCE_TICK 180	/* Default timer timeout, in seconds */
+
+static int hangcheck_tick = DEFAULT_IOFENCE_TICK;
+static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN;
+static int hangcheck_reboot;  /* Defaults to not reboot */
+
+/*
+ * Driver options.  hangcheck_tick and hangcheck_margin are given in
+ * seconds; hangcheck_reboot selects between restarting the machine
+ * (nonzero) and only logging a message (zero) when the margin is
+ * exceeded.
+ */
+module_param(hangcheck_tick, int, 0);
+MODULE_PARM_DESC(hangcheck_tick, "Timer delay.");
+module_param(hangcheck_margin, int, 0);
+MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire.");
+module_param(hangcheck_reboot, int, 0);
+MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded.");
+
+MODULE_AUTHOR("Joel Becker");
+MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin.");
+MODULE_LICENSE("GPL");
+
+
+/*
+ * hangcheck_tsc holds the get_cycles() value sampled each time the
+ * timer is (re)armed, i.e. the last time it was scheduled.
+ * hangcheck_tsc_margin is the threshold that hangcheck_fire() compares
+ * the elapsed get_cycles() delta against; it is computed once in
+ * hangcheck_init() from hangcheck_tick and hangcheck_margin.
+ */
+static unsigned long long hangcheck_tsc, hangcheck_tsc_margin;
+
+static void hangcheck_fire(unsigned long);
+
+static struct timer_list hangcheck_ticktock =
+		TIMER_INITIALIZER(hangcheck_fire, 0, 0);
+
+/*
+ * hangcheck_fire - timer callback: check how long the timer really took.
+ * @data: timer cookie (unused).
+ *
+ * Compares the current get_cycles() value against the one saved when
+ * the timer was last armed.  If more than hangcheck_tsc_margin has
+ * elapsed, either restart the machine (hangcheck_reboot nonzero) or log
+ * a critical message.  The timer is then re-armed for another
+ * hangcheck_tick seconds and the reference sample is refreshed.
+ */
+static void hangcheck_fire(unsigned long data)
+{
+	unsigned long long cur_tsc, tsc_diff;
+
+	cur_tsc = get_cycles();
+
+	/*
+	 * Unsigned subtraction is performed modulo 2^64, so this single
+	 * expression gives the right delta both in the normal case and
+	 * after a 64-bit counter wrap.  It also yields 0 when the two
+	 * samples are equal (e.g. a get_cycles() that returns a constant)
+	 * instead of a spurious, enormous delta that would trip the
+	 * margin check on every tick.
+	 */
+	tsc_diff = cur_tsc - hangcheck_tsc;
+
+	if (tsc_diff > hangcheck_tsc_margin) {
+		if (hangcheck_reboot) {
+			printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n");
+			machine_restart(NULL);
+		} else {
+			printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n");
+		}
+	}
+
+	/* Re-arm first, then take the reference sample for the next check. */
+	mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
+	hangcheck_tsc = get_cycles();
+}
+
+
+/*
+ * hangcheck_init - validate the module parameters and start the timer.
+ *
+ * Returns 0 on success, or -EINVAL if hangcheck_tick is not positive or
+ * hangcheck_margin is negative.
+ */
+static int __init hangcheck_init(void)
+{
+	/*
+	 * A non-positive tick would re-arm the timer for "now" (or the
+	 * past) on every expiry, and a negative tick + margin sum would
+	 * wrap to a huge unsigned threshold that can never be exceeded,
+	 * silently disabling the check.  Refuse to load instead.
+	 */
+	if (hangcheck_tick <= 0 || hangcheck_margin < 0) {
+		printk(KERN_ERR "Hangcheck: invalid parameters (tick is %d seconds, margin is %d seconds).\n",
+		       hangcheck_tick, hangcheck_margin);
+		return -EINVAL;
+	}
+
+	printk(KERN_INFO "Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n",
+	       VERSION_STR, hangcheck_tick, hangcheck_margin);
+
+	/*
+	 * Convert (tick + margin) seconds into the units hangcheck_fire()
+	 * compares against: seconds * HZ * loops_per_jiffy.
+	 * NOTE(review): this treats loops_per_jiffy as the per-jiffy rate
+	 * of get_cycles(); confirm that holds on the target architecture.
+	 */
+	hangcheck_tsc_margin = hangcheck_margin + hangcheck_tick;
+	hangcheck_tsc_margin *= HZ;
+	hangcheck_tsc_margin *= current_cpu_data.loops_per_jiffy;
+
+	/* Take the reference sample, then arm the first timeout. */
+	hangcheck_tsc = get_cycles();
+	mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
+
+	return 0;
+}
+
+/*
+ * hangcheck_exit - module unload hook (registered via module_exit).
+ * Removes the hangcheck_ticktock timer with del_timer_sync().
+ */
+static void __exit hangcheck_exit(void)
+{
+	del_timer_sync(&hangcheck_ticktock);
+}
+
+module_init(hangcheck_init);
+module_exit(hangcheck_exit);