Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar: "The main changes in this cycle were: - add the 'Corrected Errors Collector' kernel feature which collect and monitor correctable errors statistics and will preemptively (soft-)offline physical pages that have a suspiciously high error count. - handle MCE errors during kexec() more gracefully - factor out and deprecate the /dev/mcelog driver - ... plus misc fixes and cleanpus" * 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Check MCi_STATUS[MISCV] for usable addr on Intel only ACPI/APEI: Use setup_deferrable_timer() x86/mce: Update notifier priority check x86/mce: Enable PPIN for Knights Landing/Mill x86/mce: Do not register notifiers with invalid prio x86/mce: Factor out and deprecate the /dev/mcelog driver RAS: Add a Corrected Errors Collector x86/mce: Rename mce_log to mce_log_buffer x86/mce: Rename mce_log()'s argument x86/mce: Init some CPU features early x86/mce: Handle broadcasted MCE gracefully with kexec

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Ingo Molnar: "The main changes in this cycle were: - add the 'Corrected Errors Collector' kernel feature which collect and monitor correctable errors statistics and will preemptively (soft-)offline physical pages that have a suspiciously high error count. - handle MCE errors during kexec() more gracefully - factor out and deprecate the /dev/mcelog driver - ... plus misc fixes and cleanpus" * 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Check MCi_STATUS[MISCV] for usable addr on Intel only ACPI/APEI: Use setup_deferrable_timer() x86/mce: Update notifier priority check x86/mce: Enable PPIN for Knights Landing/Mill x86/mce: Do not register notifiers with invalid prio x86/mce: Factor out and deprecate the /dev/mcelog driver RAS: Add a Corrected Errors Collector x86/mce: Rename mce_log to mce_log_buffer x86/mce: Rename mce_log()'s argument x86/mce: Init some CPU features early x86/mce: Handle broadcasted MCE gracefully with kexec
3dee9fb2 · Linus Torvalds · 7c8c03bf · c6a9583f · 3dee9fb2 · 3dee9fb2
Commit 3dee9fb2 authored May 01, 2017 by Linus Torvalds
18 changed files
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3177,6 +3177,12 @@
 	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
 			See Documentation/blockdev/ramdisk.txt.
+	ras=option[,option,...]	[KNL] RAS-specific options
+		cec_disable	[X86]
+				Disable the Correctable Errors Collector,
+				see CONFIG_RAS_CEC help text.
 	rcu_nocbs=	[KNL]
 			The argument is a cpu list, as described above.

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1042,6 +1042,14 @@ config X86_MCE
 	  The action the kernel takes depends on the severity of the problem,
 	  ranging from warning messages to halting the machine.
+config X86_MCELOG_LEGACY
+	bool "Support for deprecated /dev/mcelog character device"
+	depends on X86_MCE
+	---help---
+	  Enable support for /dev/mcelog which is needed by the old mcelog
+	  userspace logging daemon. Consider switching to the new generation
+	  rasdaemon solution.
 config X86_MCE_INTEL
 	def_bool y
 	prompt "Intel MCE features"
@@ -1071,7 +1079,7 @@ config X86_MCE_THRESHOLD
 	def_bool y
 config X86_MCE_INJECT
-	depends on X86_MCE && X86_LOCAL_APIC
+	depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
 	tristate "Machine check injector support"
 	---help---
 	  Provide support for injecting machine checks for testing purposes.

--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -128,7 +128,7 @@
 * debugging tools.  Each entry is only valid when its finished flag
 * is set.
 */
-struct mce_log {
+struct mce_log_buffer {
 	char signature[12]; /* "MACHINECHECK" */
 	unsigned len;	    /* = MCE_LOG_LEN */
 	unsigned next;
@@ -191,10 +191,12 @@ extern struct mca_config mca_cfg;
 extern struct mca_msr_regs msr_ops;
 enum mce_notifier_prios {
-	MCE_PRIO_SRAO		= INT_MAX,
+	MCE_PRIO_FIRST		= INT_MAX,
-	MCE_PRIO_EXTLOG		= INT_MAX - 1,
+	MCE_PRIO_SRAO		= INT_MAX - 1,
-	MCE_PRIO_NFIT		= INT_MAX - 2,
+	MCE_PRIO_EXTLOG		= INT_MAX - 2,
-	MCE_PRIO_EDAC		= INT_MAX - 3,
+	MCE_PRIO_NFIT		= INT_MAX - 3,
+	MCE_PRIO_EDAC		= INT_MAX - 4,
+	MCE_PRIO_MCELOG		= 1,
 	MCE_PRIO_LOWEST		= 0,
 };

--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -15,6 +15,7 @@ struct machine_ops {
 };
 extern struct machine_ops machine_ops;
+extern int crashing_cpu;
 void native_machine_crash_shutdown(struct pt_regs *regs);
 void native_machine_shutdown(void);

--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
 obj-$(CONFIG_ACPI_APEI)		+= mce-apei.o
+obj-$(CONFIG_X86_MCELOG_LEGACY)	+= dev-mcelog.o
--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include "mce-internal.h"
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+#define mce_log_get_idx_check(p) \
+({ \
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+			 !lockdep_is_held(&mce_chrdev_read_mutex), \
+			 "suspicious mce_log_get_idx_check() usage"); \
+	smp_load_acquire(&(p)); \
+})
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+static struct mce_log_buffer mcelog = {
+	.signature	= MCE_LOG_SIGNATURE,
+	.len		= MCE_LOG_LEN,
+	.recordlen	= sizeof(struct mce),
+};
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+/* User mode helper program triggered by machine check event */
+extern char			mce_helper[128];
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	unsigned int next, entry;
+	wmb();
+	for (;;) {
+		entry = mce_log_get_idx_check(mcelog.next);
+		for (;;) {
+			/*
+			 * When the buffer fills up discard new entries.
+			 * Assume that the earlier errors are the more
+			 * interesting ones:
+			 */
+			if (entry >= MCE_LOG_LEN) {
+				set_bit(MCE_OVERFLOW,
+					(unsigned long *)&mcelog.flags);
+				return NOTIFY_OK;
+			}
+			/* Old left over entry. Skip: */
+			if (mcelog.entry[entry].finished) {
+				entry++;
+				continue;
+			}
+			break;
+		}
+		smp_rmb();
+		next = entry + 1;
+		if (cmpxchg(&mcelog.next, entry, next) == entry)
+			break;
+	}
+	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+	wmb();
+	mcelog.entry[entry].finished = 1;
+	wmb();
+	/* wake processes polling /dev/mcelog */
+	wake_up_interruptible(&mce_chrdev_wait);
+	return NOTIFY_OK;
+}
+static struct notifier_block dev_mcelog_nb = {
+	.notifier_call	= dev_mce_log,
+	.priority	= MCE_PRIO_MCELOG,
+};
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+void mce_work_trigger(void)
+{
+	if (mce_helper[0])
+		schedule_work(&mce_trigger_work);
+}
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+	strcpy(buf, mce_helper);
+	strcat(buf, "\n");
+	return strlen(mce_helper) + 1;
+}
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+				const char *buf, size_t siz)
+{
+	char *p;
+	strncpy(mce_helper, buf, sizeof(mce_helper));
+	mce_helper[sizeof(mce_helper)-1] = 0;
+	p = strchr(mce_helper, '\n');
+	if (p)
+		*p = 0;
+	return strlen(mce_helper) + !!p;
+}
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count;	/* #times opened */
+static int mce_chrdev_open_exclu;	/* already open exclusive? */
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+	spin_lock(&mce_chrdev_state_lock);
+	if (mce_chrdev_open_exclu ||
+	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+		spin_unlock(&mce_chrdev_state_lock);
+		return -EBUSY;
+	}
+	if (file->f_flags & O_EXCL)
+		mce_chrdev_open_exclu = 1;
+	mce_chrdev_open_count++;
+	spin_unlock(&mce_chrdev_state_lock);
+	return nonseekable_open(inode, file);
+}
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+	spin_lock(&mce_chrdev_state_lock);
+	mce_chrdev_open_count--;
+	mce_chrdev_open_exclu = 0;
+	spin_unlock(&mce_chrdev_state_lock);
+	return 0;
+}
+static void collect_tscs(void *data)
+{
+	unsigned long *cpu_tsc = (unsigned long *)data;
+	cpu_tsc[smp_processor_id()] = rdtsc();
+}
+static int mce_apei_read_done;
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+	int rc;
+	u64 record_id;
+	struct mce m;
+	if (usize < sizeof(struct mce))
+		return -EINVAL;
+	rc = apei_read_mce(&m, &record_id);
+	/* Error or no more MCE record */
+	if (rc <= 0) {
+		mce_apei_read_done = 1;
+		/*
+		 * When ERST is disabled, mce_chrdev_read() should return
+		 * "no record" instead of "no device."
+		 */
+		if (rc == -ENODEV)
+			return 0;
+		return rc;
+	}
+	rc = -EFAULT;
+	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+		return rc;
+	/*
+	 * In fact, we should have cleared the record after that has
+	 * been flushed to the disk or sent to network in
+	 * /sbin/mcelog, but we have no interface to support that now,
+	 * so just clear it to avoid duplication.
+	 */
+	rc = apei_clear_mce(record_id);
+	if (rc) {
+		mce_apei_read_done = 1;
+		return rc;
+	}
+	*ubuf += sizeof(struct mce);
+	return 0;
+}
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+				size_t usize, loff_t *off)
+{
+	char __user *buf = ubuf;
+	unsigned long *cpu_tsc;
+	unsigned prev, next;
+	int i, err;
+	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+	if (!cpu_tsc)
+		return -ENOMEM;
+	mutex_lock(&mce_chrdev_read_mutex);
+	if (!mce_apei_read_done) {
+		err = __mce_read_apei(&buf, usize);
+		if (err || buf != ubuf)
+			goto out;
+	}
+	next = mce_log_get_idx_check(mcelog.next);
+	/* Only supports full reads right now */
+	err = -EINVAL;
+	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+		goto out;
+	err = 0;
+	prev = 0;
+	do {
+		for (i = prev; i < next; i++) {
+			unsigned long start = jiffies;
+			struct mce *m = &mcelog.entry[i];
+			while (!m->finished) {
+				if (time_after_eq(jiffies, start + 2)) {
+					memset(m, 0, sizeof(*m));
+					goto timeout;
+				}
+				cpu_relax();
+			}
+			smp_rmb();
+			err |= copy_to_user(buf, m, sizeof(*m));
+			buf += sizeof(*m);
+timeout:
+			;
+		}
+		memset(mcelog.entry + prev, 0,
+		       (next - prev) * sizeof(struct mce));
+		prev = next;
+		next = cmpxchg(&mcelog.next, prev, 0);
+	} while (next != prev);
+	synchronize_sched();
+	/*
+	 * Collect entries that were still getting written before the
+	 * synchronize.
+	 */
+	on_each_cpu(collect_tscs, cpu_tsc, 1);
+	for (i = next; i < MCE_LOG_LEN; i++) {
+		struct mce *m = &mcelog.entry[i];
+		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+			err |= copy_to_user(buf, m, sizeof(*m));
+			smp_rmb();
+			buf += sizeof(*m);
+			memset(m, 0, sizeof(*m));
+		}
+	}
+	if (err)
+		err = -EFAULT;
+out:
+	mutex_unlock(&mce_chrdev_read_mutex);
+	kfree(cpu_tsc);
+	return err ? err : buf - ubuf;
+}
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &mce_chrdev_wait, wait);
+	if (READ_ONCE(mcelog.next))
+		return POLLIN | POLLRDNORM;
+	if (!mce_apei_read_done && apei_check_mce())
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+				unsigned long arg)
+{
+	int __user *p = (int __user *)arg;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	switch (cmd) {
+	case MCE_GET_RECORD_LEN:
+		return put_user(sizeof(struct mce), p);
+	case MCE_GET_LOG_LEN:
+		return put_user(MCE_LOG_LEN, p);
+	case MCE_GETCLEAR_FLAGS: {
+		unsigned flags;
+		do {
+			flags = mcelog.flags;
+		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+		return put_user(flags, p);
+	}
+	default:
+		return -ENOTTY;
+	}
+}
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+			    size_t usize, loff_t *off);
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+			     const char __user *ubuf,
+			     size_t usize, loff_t *off))
+{
+	mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+				size_t usize, loff_t *off)
+{
+	if (mce_write)
+		return mce_write(filp, ubuf, usize, off);
+	else
+		return -EINVAL;
+}
+static const struct file_operations mce_chrdev_ops = {
+	.open			= mce_chrdev_open,
+	.release		= mce_chrdev_release,
+	.read			= mce_chrdev_read,
+	.write			= mce_chrdev_write,
+	.poll			= mce_chrdev_poll,
+	.unlocked_ioctl		= mce_chrdev_ioctl,
+	.llseek			= no_llseek,
+};
+static struct miscdevice mce_chrdev_device = {
+	MISC_MCELOG_MINOR,
+	"mcelog",
+	&mce_chrdev_ops,
+};
+static __init int dev_mcelog_init_device(void)
+{
+	int err;
+	/* register character device /dev/mcelog */
+	err = misc_register(&mce_chrdev_device);
+	if (err) {
+		pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+		return err;
+	}
+	mce_register_decode_chain(&dev_mcelog_nb);
+	return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
 		m1->addr != m2->addr ||
 		m1->misc != m2->misc;
 }
+extern struct device_attribute dev_attr_trigger;
+#ifdef CONFIG_X86_MCELOG_LEGACY
+extern void mce_work_trigger(void);
+#else
+static inline void mce_work_trigger(void)	{ }
+#endif
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
 	case INTEL_FAM6_BROADWELL_XEON_D:
 	case INTEL_FAM6_BROADWELL_X:
 	case INTEL_FAM6_SKYLAKE_X:
+	case INTEL_FAM6_XEON_PHI_KNL:
+	case INTEL_FAM6_XEON_PHI_KNM:
 		if (rdmsrl_safe(MSR_PPIN_CTL, &val))
 			return;

--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
 #endif
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
 #if defined(CONFIG_SMP)
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
 static nmi_shootdown_cb shootdown_callback;
 static atomic_t waiting_for_crash_ipi;

--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
 	  aspects of the MCE handling code.
 	  WARNING: Do not even assume this interface is staying stable!
+config RAS_CEC
+	bool "Correctable Errors Collector"
+	depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
+	---help---
+	  This is a small cache which collects correctable memory errors per 4K
+	  page PFN and counts their repeated occurrence. Once the counter for a
+	  PFN overflows, we try to soft-offline that page as we take it to mean
+	  that it has reached a relatively high error count and would probably
+	  be best if we don't use it anymore.
+	  Bear in mind that this is absolutely useless if your platform doesn't
+	  have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -1005,9 +1005,8 @@ static int ghes_probe(struct platform_device *ghes_dev)
 	switch (generic->notify.type) {
 	case ACPI_HEST_NOTIFY_POLLED:
-		ghes->timer.function = ghes_poll_func;
+		setup_deferrable_timer(&ghes->timer, ghes_poll_func,
-		ghes->timer.data = (unsigned long)ghes;
+				       (unsigned long)ghes);
-		init_timer_deferrable(&ghes->timer);
 		ghes_add_timer(ghes);
 		break;
 	case ACPI_HEST_NOTIFY_EXTERNAL:

--- a/drivers/ras/Makefile
+++ b/drivers/ras/Makefile
-obj-$(CONFIG_RAS) += ras.o debugfs.o
+obj-$(CONFIG_RAS)	+= ras.o debugfs.o
+obj-$(CONFIG_RAS_CEC)	+= cec.o
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
--- a/drivers/ras/debugfs.c
+++ b/drivers/ras/debugfs.c
 #include <linux/debugfs.h>
-static struct dentry *ras_debugfs_dir;
+struct dentry *ras_debugfs_dir;
 static atomic_t trace_count = ATOMIC_INIT(0);

--- a/drivers/ras/debugfs.h
+++ b/drivers/ras/debugfs.h
+#ifndef __RAS_DEBUGFS_H__
+#define __RAS_DEBUGFS_H__
+#include <linux/debugfs.h>
+extern struct dentry *ras_debugfs_dir;
+#endif /* __RAS_DEBUGFS_H__ */
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,14 @@ subsys_initcall(ras_init);
 EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+int __init parse_ras_param(char *str)
+{
+#ifdef CONFIG_RAS_CEC
+	parse_cec_param(str);
+#endif
+	return 1;
+}
+__setup("ras", parse_ras_param);
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
 #ifndef __RAS_H__
 #define __RAS_H__
+#include <asm/errno.h>
 #ifdef CONFIG_DEBUG_FS
 int ras_userspace_consumers(void);
 void ras_debugfs_init(void);
 int ras_add_daemon_trace(void);
 #else
 static inline int ras_userspace_consumers(void) { return 0; }
-static inline void ras_debugfs_init(void) { return; }
+static inline void ras_debugfs_init(void) { }
 static inline int ras_add_daemon_trace(void) { return 0; }
 #endif
+#ifdef CONFIG_RAS_CEC
+void __init cec_init(void);
+int __init parse_cec_param(char *str);
+int cec_add_elem(u64 pfn);
+#else
+static inline void __init cec_init(void)	{ }
+static inline int cec_add_elem(u64 pfn)		{ return -ENODEV; }
 #endif
+#endif /* __RAS_H__ */