Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Thomas Gleixner: "The RAS updates for the 4.13 merge window: - Cleanup of the MCE injection facility (Borislav Petkov) - Rework of the AMD/SMCA handling (Yazen Ghannam) - Enhancements for ACPI/APEI to handle new notification types (Shiju Jose) - atomic_t to refcount_t conversion (Elena Reshetova) - A few fixes and enhancements all over the place" * 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: RAS/CEC: Check the correct variable in the debugfs error handling x86/mce: Always save severity in machine_check_poll() x86/MCE, xen/mcelog: Make /dev/mcelog registration messages more precise x86/mce: Update bootlog description to reflect behavior on AMD x86/mce: Don't disable MCA banks when offlining a CPU on AMD x86/mce/mce-inject: Preset the MCE injection struct x86/mce: Clean up include files x86/mce: Get rid of register_mce_write_callback() x86/mce: Merge mce_amd_inj into mce-inject x86/mce/AMD: Use saved threshold block info in interrupt handler x86/mce/AMD: Use msr_stat when clearing MCA_STATUS x86/mce/AMD: Carve out SMCA bank configuration x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers x86/mce: Convert threshold_bank.cpus from atomic_t to refcount_t RAS: Make local function parse_ras_param() static ACPI/APEI: Handle GSIV and GPIO notification types

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Thomas Gleixner: "The RAS updates for the 4.13 merge window: - Cleanup of the MCE injection facility (Borislav Petkov) - Rework of the AMD/SMCA handling (Yazen Ghannam) - Enhancements for ACPI/APEI to handle new notification types (Shiju Jose) - atomic_t to refcount_t conversion (Elena Reshetova) - A few fixes and enhancements all over the place" * 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: RAS/CEC: Check the correct variable in the debugfs error handling x86/mce: Always save severity in machine_check_poll() x86/MCE, xen/mcelog: Make /dev/mcelog registration messages more precise x86/mce: Update bootlog description to reflect behavior on AMD x86/mce: Don't disable MCA banks when offlining a CPU on AMD x86/mce/mce-inject: Preset the MCE injection struct x86/mce: Clean up include files x86/mce: Get rid of register_mce_write_callback() x86/mce: Merge mce_amd_inj into mce-inject x86/mce/AMD: Use saved threshold block info in interrupt handler x86/mce/AMD: Use msr_stat when clearing MCA_STATUS x86/mce/AMD: Carve out SMCA bank configuration x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers x86/mce: Convert threshold_bank.cpus from atomic_t to refcount_t RAS: Make local function parse_ras_param() static ACPI/APEI: Handle GSIV and GPIO notification types
4422d80e · Linus Torvalds · 9a9594ef · 32288daf · 4422d80e · 4422d80e
Commit 4422d80e authored Jul 03, 2017 by Linus Torvalds
18 changed files
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -36,7 +36,8 @@ Machine check
 		to broadcast MCEs.
   mce=bootlog
 		Enable logging of machine checks left over from booting.
-		Disabled by default on AMD because some BIOS leave bogus ones.
+		Disabled by default on AMD Fam10h and older because some BIOS
+		leave bogus ones.
 		If your BIOS doesn't do that it's a good idea to enable though
 		to make sure you log even machine check events that result
 		in a reboot. On Intel systems it is enabled by default.

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1085,7 +1085,7 @@ config X86_MCE_THRESHOLD
 	def_bool y
 config X86_MCE_INJECT
-	depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
+	depends on X86_MCE && X86_LOCAL_APIC && DEBUG_FS
 	tristate "Machine check injector support"
 	---help---
 	  Provide support for injecting machine checks for testing purposes.

--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -257,8 +257,6 @@ drivers-$(CONFIG_PM) += arch/x86/power/
 drivers-$(CONFIG_FB) += arch/x86/video/
-drivers-$(CONFIG_RAS) += arch/x86/ras/
 ####
 # boot loader support. Several targets are kept for legacy purposes

--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,6 +3,7 @@
 #include <linux/ioport.h>
 #include <linux/pci.h>
+#include <linux/refcount.h>
 struct amd_nb_bus_dev_range {
 	u8 bus;
@@ -55,7 +56,7 @@ struct threshold_bank {
 	struct threshold_block	*blocks;
 	/* initialized to the number of CPUs on the node sharing this bank */
-	atomic_t		cpus;
+	refcount_t		cpus;
 };
 struct amd_northbridge {

--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -285,10 +285,6 @@ int mce_notify_irq(void);
 DECLARE_PER_CPU(struct mce, injectm);
-extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
-				    const char __user *ubuf,
-				    size_t usize, loff_t *off));
 /* Disable CMCI/polling for MCA bank claimed by firmware */
 extern void mce_disable_bank(int bank);

--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -907,8 +907,13 @@ static inline int mpx_disable_management(void)
 }
 #endif /* CONFIG_X86_INTEL_MPX */
+#ifdef CONFIG_CPU_SUP_AMD
 extern u16 amd_get_nb_id(int cpu);
 extern u32 amd_get_nodes_per_socket(void);
+#else
+static inline u16 amd_get_nb_id(int cpu)		{ return 0; }
+static inline u32 amd_get_nodes_per_socket(void)	{ return 0; }
+#endif
 static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
 {

--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -17,6 +17,8 @@
 #include "mce-internal.h"
+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
 static DEFINE_MUTEX(mce_chrdev_read_mutex);
 static char mce_helper[128];
@@ -345,24 +347,49 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
 	}
 }
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+void mce_register_injector_chain(struct notifier_block *nb)
-			    size_t usize, loff_t *off);
+{
+	blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+void mce_unregister_injector_chain(struct notifier_block *nb)
-			     const char __user *ubuf,
-			     size_t usize, loff_t *off))
 {
-	mce_write = fn;
+	blocking_notifier_chain_unregister(&mce_injector_chain, nb);
 }
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
 static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
 				size_t usize, loff_t *off)
 {
-	if (mce_write)
+	struct mce m;
-		return mce_write(filp, ubuf, usize, off);
-	else
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/*
+	 * There are some cases where real MSR reads could slip
+	 * through.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+		return -EIO;
+	if ((unsigned long)usize > sizeof(struct mce))
+		usize = sizeof(struct mce);
+	if (copy_from_user(&m, ubuf, usize))
+		return -EFAULT;
+	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
 		return -EINVAL;
+	/*
+	 * Need to give user space some time to set everything up,
+	 * so do it a jiffie or two later everywhere.
+	 */
+	schedule_timeout(2);
+	blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+	return usize;
 }
 static const struct file_operations mce_chrdev_ops = {
@@ -388,9 +415,15 @@ static __init int dev_mcelog_init_device(void)
 	/* register character device /dev/mcelog */
 	err = misc_register(&mce_chrdev_device);
 	if (err) {
+		if (err == -EBUSY)
+			/* Xen dom0 might have registered the device already. */
+			pr_info("Unable to init device /dev/mcelog, already registered");
+		else
 			pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
 		return err;
 	}
 	mce_register_decode_chain(&dev_mcelog_nb);
 	return 0;
 }

--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -10,23 +10,105 @@
 * Authors:
 * Andi Kleen
 * Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010-17:  Borislav Petkov <bp@alien8.de>
+ *			   Advanced Micro Devices Inc.
 */
-#include <linux/uaccess.h>
-#include <linux/module.h>
+#include <linux/cpu.h>
-#include <linux/timer.h>
+#include <linux/debugfs.h>
 #include <linux/kernel.h>
-#include <linux/string.h>
+#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/preempt.h>
-#include <linux/smp.h>
 #include <linux/notifier.h>
-#include <linux/kdebug.h>
+#include <linux/pci.h>
-#include <linux/cpu.h>
+#include <linux/uaccess.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
+#include <asm/amd_nb.h>
-#include <asm/mce.h>
 #include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
 #include <asm/nmi.h>
+#include <asm/smp.h>
+#include "mce-internal.h"
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+static u8 n_banks;
+#define MAX_FLAG_OPT_SIZE	3
+#define NBCFG			0x44
+enum injection_type {
+	SW_INJ = 0,	/* SW injection, simply decode the error */
+	HW_INJ,		/* Trigger a #MC */
+	DFR_INT_INJ,    /* Trigger Deferred error interrupt */
+	THR_INT_INJ,    /* Trigger threshold interrupt */
+	N_INJ_TYPES,
+};
+static const char * const flags_options[] = {
+	[SW_INJ] = "sw",
+	[HW_INJ] = "hw",
+	[DFR_INT_INJ] = "df",
+	[THR_INT_INJ] = "th",
+	NULL
+};
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+#define MCE_INJECT_SET(reg)						\
+static int inj_##reg##_set(void *data, u64 val)				\
+{									\
+	struct mce *m = (struct mce *)data;				\
+									\
+	m->reg = val;							\
+	return 0;							\
+}
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+#define MCE_INJECT_GET(reg)						\
+static int inj_##reg##_get(void *data, u64 *val)			\
+{									\
+	struct mce *m = (struct mce *)data;				\
+									\
+	*val = m->reg;							\
+	return 0;							\
+}
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+static void setup_inj_struct(struct mce *m)
+{
+	memset(m, 0, sizeof(struct mce));
+	m->cpuvendor = boot_cpu_data.x86_vendor;
+}
 /* Update fake mce registers on current CPU. */
 static void inject_mce(struct mce *m)
@@ -143,7 +225,7 @@ static int raise_local(void)
 	return ret;
 }
-static void raise_mce(struct mce *m)
+static void __maybe_unused raise_mce(struct mce *m)
 {
 	int context = MCJ_CTX(m->inject_flags);
@@ -198,55 +280,454 @@ static void raise_mce(struct mce *m)
 	}
 }
-/* Error injection interface */
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
-static ssize_t mce_write(struct file *filp, const char __user *ubuf,
+			    void *data)
-			 size_t usize, loff_t *off)
 {
-	struct mce m;
+	struct mce *m = (struct mce *)data;
-	if (!capable(CAP_SYS_ADMIN))
+	if (!m)
-		return -EPERM;
+		return NOTIFY_DONE;
-	/*
-	 * There are some cases where real MSR reads could slip
+	mutex_lock(&mce_inject_mutex);
-	 * through.
+	raise_mce(m);
+	mutex_unlock(&mce_inject_mutex);
+	return NOTIFY_DONE;
+}
+static struct notifier_block inject_nb = {
+	.notifier_call  = mce_inject_raise,
+};
+/*
+ * Caller needs to make sure this cpu doesn't disappear
+ * from under us, i.e.: get_cpu/put_cpu.
 */
-	if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
-		return -EIO;
+{
+	u32 l, h;
+	int err;
+	err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+	if (err) {
+		pr_err("%s: error reading HWCR\n", __func__);
+		return err;
+	}
+	enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+	err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+	if (err)
+		pr_err("%s: error writing HWCR\n", __func__);
+	return err;
+}
+static int __set_inj(const char *buf)
+{
+	int i;
+	for (i = 0; i < N_INJ_TYPES; i++) {
+		if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+			inj_type = i;
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
-	if ((unsigned long)usize > sizeof(struct mce))
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
-		usize = sizeof(struct mce);
+			  size_t cnt, loff_t *ppos)
-	if (copy_from_user(&m, ubuf, usize))
+{
+	char buf[MAX_FLAG_OPT_SIZE];
+	int n;
+	n = sprintf(buf, "%s\n", flags_options[inj_type]);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+			   size_t cnt, loff_t *ppos)
+{
+	char buf[MAX_FLAG_OPT_SIZE], *__buf;
+	int err;
+	if (cnt > MAX_FLAG_OPT_SIZE)
+		return -EINVAL;
+	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
-	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+	buf[cnt - 1] = 0;
+	/* strip whitespace */
+	__buf = strstrip(buf);
+	err = __set_inj(__buf);
+	if (err) {
+		pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+		return err;
+	}
+	*ppos += cnt;
+	return cnt;
+}
+static const struct file_operations flags_fops = {
+	.read           = flags_read,
+	.write          = flags_write,
+	.llseek         = generic_file_llseek,
+};
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+static int inj_extcpu_set(void *data, u64 val)
+{
+	struct mce *m = (struct mce *)data;
+	if (val >= nr_cpu_ids || !cpu_online(val)) {
+		pr_err("%s: Invalid CPU: %llu\n", __func__, val);
 		return -EINVAL;
+	}
+	m->extcpu = val;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+static void trigger_mce(void *info)
+{
+	asm volatile("int $18");
+}
+static void trigger_dfr_int(void *info)
+{
+	asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+static void trigger_thr_int(void *info)
+{
+	asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+static u32 get_nbc_for_node(int node_id)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+	u32 cores_per_node;
+	cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
+	return cores_per_node * node_id;
+}
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+	struct amd_northbridge *nb;
+	struct pci_dev *F3;
+	u32 val;
+	int err;
+	nb = node_to_amd_nb(nid);
+	if (!nb)
+		return;
+	F3 = nb->misc;
+	if (!F3)
+		return;
+	err = pci_read_config_dword(F3, NBCFG, &val);
+	if (err) {
+		pr_err("%s: Error reading F%dx%03x.\n",
+		       __func__, PCI_FUNC(F3->devfn), NBCFG);
+		return;
+	}
+	if (val & BIT(27))
+		return;
+	pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+	       __func__);
+	val |= BIT(27);
+	err = pci_write_config_dword(F3, NBCFG, val);
+	if (err)
+		pr_err("%s: Error writing F%dx%03x.\n",
+		       __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+static void prepare_msrs(void *info)
+{
+	struct mce m = *(struct mce *)info;
+	u8 b = m.bank;
+	wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	if (boot_cpu_has(X86_FEATURE_SMCA)) {
+		if (m.inject_flags == DFR_INT_INJ) {
+			wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+			wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+		} else {
+			wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+			wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+		}
+		wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+		wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+	} else {
+		wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+		wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+		wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+	}
+}
+static void do_inject(void)
+{
+	u64 mcg_status = 0;
+	unsigned int cpu = i_mce.extcpu;
+	u8 b = i_mce.bank;
+	rdtscll(i_mce.tsc);
+	if (i_mce.misc)
+		i_mce.status |= MCI_STATUS_MISCV;
+	if (i_mce.synd)
+		i_mce.status |= MCI_STATUS_SYNDV;
+	if (inj_type == SW_INJ) {
+		mce_inject_log(&i_mce);
+		return;
+	}
+	/* prep MCE global settings for the injection */
+	mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+	if (!(i_mce.status & MCI_STATUS_PCC))
+		mcg_status |= MCG_STATUS_RIPV;
 	/*
-	 * Need to give user space some time to set everything up,
+	 * Ensure necessary status bits for deferred errors:
-	 * so do it a jiffie or two later everywhere.
+	 * - MCx_STATUS[Deferred]: make sure it is a deferred error
+	 * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
 	 */
-	schedule_timeout(2);
+	if (inj_type == DFR_INT_INJ) {
+		i_mce.status |= MCI_STATUS_DEFERRED;
+		i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
+	}
+	/*
+	 * For multi node CPUs, logging and reporting of bank 4 errors happens
+	 * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+	 * Fam10h and later BKDGs.
+	 */
+	if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+	    b == 4 &&
+	    boot_cpu_data.x86 < 0x17) {
+		toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
+		cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+	}
+	get_online_cpus();
+	if (!cpu_online(cpu))
+		goto err;
+	toggle_hw_mce_inject(cpu, true);
+	i_mce.mcgstatus = mcg_status;
+	i_mce.inject_flags = inj_type;
+	smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+	toggle_hw_mce_inject(cpu, false);
+	switch (inj_type) {
+	case DFR_INT_INJ:
+		smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+		break;
+	case THR_INT_INJ:
+		smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+		break;
+	default:
+		smp_call_function_single(cpu, trigger_mce, NULL, 0);
+	}
+err:
+	put_online_cpus();
-	mutex_lock(&mce_inject_mutex);
-	raise_mce(&m);
-	mutex_unlock(&mce_inject_mutex);
-	return usize;
 }
-static int inject_init(void)
+/*
+ * This denotes into which bank we're injecting and triggers
+ * the injection, at the same time.
+ */
+static int inj_bank_set(void *data, u64 val)
 {
+	struct mce *m = (struct mce *)data;
+	if (val >= n_banks) {
+		pr_err("Non-existent MCE bank: %llu\n", val);
+		return -EINVAL;
+	}
+	m->bank = val;
+	do_inject();
+	return 0;
+}
+MCE_INJECT_GET(bank);
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t  - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t    format only. Safe to use.\n"
+"\t  - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t    handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t    is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t    before injecting.\n"
+"\t  - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t    error APIC interrupt handler to handle the error if the feature is \n"
+"\t    is present in hardware. \n"
+"\t  - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t    APIC interrupt handler to handle the error. \n"
+"\n";
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+					readme_msg, strlen(readme_msg));
+}
+static const struct file_operations readme_fops = {
+	.read		= inj_readme_read,
+};
+static struct dfs_node {
+	char *name;
+	struct dentry *d;
+	const struct file_operations *fops;
+	umode_t perm;
+} dfs_fls[] = {
+	{ .name = "status",	.fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+	{ .name = "misc",	.fops = &misc_fops,   .perm = S_IRUSR | S_IWUSR },
+	{ .name = "addr",	.fops = &addr_fops,   .perm = S_IRUSR | S_IWUSR },
+	{ .name = "synd",	.fops = &synd_fops,   .perm = S_IRUSR | S_IWUSR },
+	{ .name = "bank",	.fops = &bank_fops,   .perm = S_IRUSR | S_IWUSR },
+	{ .name = "flags",	.fops = &flags_fops,  .perm = S_IRUSR | S_IWUSR },
+	{ .name = "cpu",	.fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+	{ .name = "README",	.fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+static int __init debugfs_init(void)
+{
+	unsigned int i;
+	u64 cap;
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
+	n_banks = cap & MCG_BANKCNT_MASK;
+	dfs_inj = debugfs_create_dir("mce-inject", NULL);
+	if (!dfs_inj)
+		return -EINVAL;
+	for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
+		dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
+						    dfs_fls[i].perm,
+						    dfs_inj,
+						    &i_mce,
+						    dfs_fls[i].fops);
+		if (!dfs_fls[i].d)
+			goto err_dfs_add;
+	}
+	return 0;
+err_dfs_add:
+	while (i-- > 0)
+		debugfs_remove(dfs_fls[i].d);
+	debugfs_remove(dfs_inj);
+	dfs_inj = NULL;
+	return -ENODEV;
+}
+static int __init inject_init(void)
+{
+	int err;
 	if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
 		return -ENOMEM;
+	err = debugfs_init();
+	if (err) {
+		free_cpumask_var(mce_inject_cpumask);
+		return err;
+	}
+	register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+	mce_register_injector_chain(&inject_nb);
+	setup_inj_struct(&i_mce);
 	pr_info("Machine check injector initialized\n");
-	register_mce_write_callback(mce_write);
-	register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
-				"mce_notify");
 	return 0;
 }
+static void __exit inject_exit(void)
+{
+	mce_unregister_injector_chain(&inject_nb);
+	unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+	debugfs_remove_recursive(dfs_inj);
+	dfs_inj = NULL;
+	memset(&dfs_fls, 0, sizeof(dfs_fls));
+	free_cpumask_var(mce_inject_cpumask);
+}
 module_init(inject_init);
-/*
+module_exit(inject_exit);
- * Cannot tolerate unloading currently because we cannot
- * guarantee all openers of mce_chrdev will get a reference to us.
- */
 MODULE_LICENSE("GPL");
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -100,7 +100,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
 extern struct device_attribute dev_attr_trigger;
 #ifdef CONFIG_X86_MCELOG_LEGACY
-extern void mce_work_trigger(void);
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
 #else
 static inline void mce_work_trigger(void)	{ }
+static inline void mce_register_injector_chain(struct notifier_block *nb)	{ }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb)	{ }
 #endif
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -673,7 +673,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
 	bool error_seen = false;
 	struct mce m;
-	int severity;
 	int i;
 	this_cpu_inc(mce_poll_count);
@@ -710,11 +709,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		mce_read_aux(&m, i);
-		severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
+		m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
-		if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
-			if (m.status & MCI_STATUS_ADDRV)
-				m.severity = severity;
 		/*
 		 * Don't get the IP here because it's unlikely to
@@ -1550,7 +1545,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 			 */
 			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
 		}
-		if (c->x86 < 17 && cfg->bootlog < 0) {
+		if (c->x86 < 0x11 && cfg->bootlog < 0) {
 			/*
 			 * Lots of broken BIOS around that don't clear them
 			 * by default and leave crap in there. Don't log:
@@ -1832,7 +1827,8 @@ void mce_disable_bank(int bank)
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
- * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
+	and older.
 * mce=nobootlog Don't log MCEs from before booting.
 * mce=bios_cmci_threshold Don't program the CMCI threshold
 * mce=recovery force enable memcpy_mcsafe()
@@ -1912,12 +1908,13 @@ static void mce_disable_error_reporting(void)
 static void vendor_disable_error_reporting(void)
 {
 	/*
-	 * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
+	 * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
 	 * Disabling them for just a single offlined CPU is bad, since it will
 	 * inhibit reporting for all shared resources on the socket like the
 	 * last level cache (LLC), the integrated memory controller (iMC), etc.
 	 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
 		return;
 	mce_disable_error_reporting();

--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void)
 }
 void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-static void get_smca_bank_info(unsigned int bank)
+static void smca_configure(unsigned int bank, unsigned int cpu)
 {
-	unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+	unsigned int i, hwid_mcatype;
 	struct smca_hwid *s_hwid;
-	u32 high, instance_id;
+	u32 high, low;
+	u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+	/* Set appropriate bits in MCA_CONFIG */
+	if (!rdmsr_safe(smca_config, &low, &high)) {
+		/*
+		 * OS is required to set the MCAX bit to acknowledge that it is
+		 * now using the new MSR ranges and new registers under each
+		 * bank. It also means that the OS will configure deferred
+		 * errors in the new MCx_CONFIG register. If the bit is not set,
+		 * uncorrectable errors will cause a system panic.
+		 *
+		 * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+		 */
+		high |= BIT(0);
+		/*
+		 * SMCA sets the Deferred Error Interrupt type per bank.
+		 *
+		 * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+		 * if the DeferredIntType bit field is available.
+		 *
+		 * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+		 * high portion of the MSR). OS should set this to 0x1 to enable
+		 * APIC based interrupt. First, check that no interrupt has been
+		 * set.
+		 */
+		if ((low & BIT(5)) && !((high >> 5) & 0x3))
+			high |= BIT(5);
+		wrmsr(smca_config, low, high);
+	}
 	/* Collect bank_info using CPU 0 for now. */
 	if (cpu)
 		return;
-	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
+	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
 		pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
 		return;
 	}
@@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank)
 			     smca_get_name(s_hwid->bank_type));
 			smca_banks[bank].hwid = s_hwid;
-			smca_banks[bank].id = instance_id;
+			smca_banks[bank].id = low;
 			smca_banks[bank].sysfs_id = s_hwid->count++;
 			break;
 		}
@@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
 			int offset, u32 misc_high)
 {
 	unsigned int cpu = smp_processor_id();
-	u32 smca_low, smca_high, smca_addr;
+	u32 smca_low, smca_high;
 	struct threshold_block b;
 	int new;
@@ -457,51 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
 		goto set_offset;
 	}
-	smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
-	if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
-		/*
-		 * OS is required to set the MCAX bit to acknowledge that it is
-		 * now using the new MSR ranges and new registers under each
-		 * bank. It also means that the OS will configure deferred
-		 * errors in the new MCx_CONFIG register. If the bit is not set,
-		 * uncorrectable errors will cause a system panic.
-		 *
-		 * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
-		 */
-		smca_high |= BIT(0);
-		/*
-		 * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR}
-		 * registers with the option of additionally logging to
-		 * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set.
-		 *
-		 * This bit is usually set by BIOS to retain the old behavior
-		 * for OSes that don't use the new registers. Linux supports the
-		 * new registers so let's disable that additional logging here.
-		 *
-		 * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high
-		 * portion of the MSR).
-		 */
-		smca_high &= ~BIT(2);
-		/*
-		 * SMCA sets the Deferred Error Interrupt type per bank.
-		 *
-		 * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
-		 * if the DeferredIntType bit field is available.
-		 *
-		 * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
-		 * high portion of the MSR). OS should set this to 0x1 to enable
-		 * APIC based interrupt. First, check that no interrupt has been
-		 * set.
-		 */
-		if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
-			smca_high |= BIT(5);
-		wrmsr(smca_addr, smca_low, smca_high);
-	}
 	/* Gather LVT offset for thresholding: */
 	if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
 		goto out;
@@ -530,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
 		if (mce_flags.smca)
-			get_smca_bank_info(bank);
+			smca_configure(bank, cpu);
 		for (block = 0; block < NR_BLOCKS; ++block) {
 			address = get_block_address(cpu, address, low, high, bank, block);
@@ -755,37 +741,19 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
 }
 EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
-static void
+static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
-__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 {
-	u32 msr_status = msr_ops.status(bank);
-	u32 msr_addr = msr_ops.addr(bank);
 	struct mce m;
-	u64 status;
-	WARN_ON_ONCE(deferred_err && threshold_err);
-	if (deferred_err && mce_flags.smca) {
-		msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank);
-		msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
-	}
-	rdmsrl(msr_status, status);
-	if (!(status & MCI_STATUS_VAL))
-		return;
 	mce_setup(&m);
 	m.status = status;
+	m.misc   = misc;
 	m.bank   = bank;
 	m.tsc	 = rdtsc();
-	if (threshold_err)
-		m.misc = misc;
 	if (m.status & MCI_STATUS_ADDRV) {
-		rdmsrl(msr_addr, m.addr);
+		m.addr = addr;
 		/*
 		 * Extract [55:<lsb>] where lsb is the least significant
@@ -806,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 	}
 	mce_log(&m);
-	wrmsrl(msr_status, 0);
 }
 static inline void __smp_deferred_error_interrupt(void)
@@ -832,86 +798,125 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
 	exiting_ack_irq();
 }
-/* APIC interrupt handler for deferred errors */
+/*
-static void amd_deferred_error_interrupt(void)
+ * Returns true if the logged error is deferred, false otherwise.
+ */
+static inline bool
+_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
 {
-	unsigned int bank;
+	u64 status, addr = 0;
-	u32 msr_status;
-	u64 status;
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+	rdmsrl(msr_stat, status);
-		msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
+	if (!(status & MCI_STATUS_VAL))
-					      : msr_ops.status(bank);
+		return false;
-		rdmsrl(msr_status, status);
+	if (status & MCI_STATUS_ADDRV)
+		rdmsrl(msr_addr, addr);
-		if (!(status & MCI_STATUS_VAL) ||
+	__log_error(bank, status, addr, misc);
-		    !(status & MCI_STATUS_DEFERRED))
-			continue;
-		__log_error(bank, true, false, 0);
+	wrmsrl(msr_stat, 0);
-		break;
-	}
+	return status & MCI_STATUS_DEFERRED;
 }
 /*
- * APIC Interrupt Handler
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ *    clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ *    log it.
 */
+static void log_error_deferred(unsigned int bank)
+{
+	bool defrd;
-/*
+	defrd = _log_error_bank(bank, msr_ops.status(bank),
- * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
+					msr_ops.addr(bank), 0);
- * the interrupt goes off when error_count reaches threshold_limit.
- * the handler will simply log mcelog w/ software defined bank number.
+	if (!mce_flags.smca)
+		return;
+	/* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+	if (defrd) {
+		wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+		return;
+	}
+	/*
+	 * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
+	 * for a valid error.
 	 */
+	_log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
+			      MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
+}
-static void amd_threshold_interrupt(void)
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
 {
-	u32 low = 0, high = 0, address = 0;
+	unsigned int bank;
-	unsigned int bank, block, cpu = smp_processor_id();
-	struct thresh_restart tr;
-	/* assume first bank caused it */
+	for (bank = 0; bank < mca_cfg.banks; ++bank)
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		log_error_deferred(bank);
-		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+}
-			continue;
-		for (block = 0; block < NR_BLOCKS; ++block) {
-			address = get_block_address(cpu, address, low, high, bank, block);
-			if (!address)
-				break;
-			if (rdmsr_safe(address, &low, &high))
+static void log_error_thresholding(unsigned int bank, u64 misc)
-				break;
+{
+	_log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+}
-			if (!(high & MASK_VALID_HI)) {
+static void log_and_reset_block(struct threshold_block *block)
-				if (block)
+{
-					continue;
+	struct thresh_restart tr;
-				else
+	u32 low = 0, high = 0;
-					break;
-			}
-			if (!(high & MASK_CNTP_HI)  ||
+	if (!block)
-			     (high & MASK_LOCKED_HI))
+		return;
-				continue;
-			/*
+	if (rdmsr_safe(block->address, &low, &high))
-			 * Log the machine check that caused the threshold
-			 * event.
-			 */
-			if (high & MASK_OVERFLOW_HI)
-				goto log;
-		}
-	}
 		return;
-log:
+	if (!(high & MASK_OVERFLOW_HI))
-	__log_error(bank, false, true, ((u64)high << 32) | low);
+		return;
+	/* Log the MCE which caused the threshold event. */
+	log_error_thresholding(block->bank, ((u64)high << 32) | low);
 	/* Reset threshold block after logging error. */
 	memset(&tr, 0, sizeof(tr));
-	tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+	tr.b = block;
 	threshold_restart_bank(&tr);
 }
+/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+	struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+	unsigned int bank, cpu = smp_processor_id();
+	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+			continue;
+		first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+		if (!first_block)
+			continue;
+		/*
+		 * The first block is also the head of the list. Check it first
+		 * before iterating over the rest.
+		 */
+		log_and_reset_block(first_block);
+		list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
+			log_and_reset_block(block);
+	}
+}
 /*
 * Sysfs Interface
 */
@@ -1202,7 +1207,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
 				goto out;
 			per_cpu(threshold_banks, cpu)[bank] = b;
-			atomic_inc(&b->cpus);
+			refcount_inc(&b->cpus);
 			err = __threshold_add_blocks(b);
@@ -1225,7 +1230,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	per_cpu(threshold_banks, cpu)[bank] = b;
 	if (is_shared_bank(bank)) {
-		atomic_set(&b->cpus, 1);
+		refcount_set(&b->cpus, 1);
 		/* nb is already initialized, see above */
 		if (nb) {
@@ -1289,7 +1294,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		goto free_out;
 	if (is_shared_bank(bank)) {
-		if (!atomic_dec_and_test(&b->cpus)) {
+		if (!refcount_dec_and_test(&b->cpus)) {
 			__threshold_remove_blocks(b);
 			per_cpu(threshold_banks, cpu)[bank] = NULL;
 			return;

--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
-config MCE_AMD_INJ
-	tristate "Simple MCE injection interface for AMD processors"
-	depends on RAS && X86_MCE && DEBUG_FS && AMD_NB
-	default n
-	help
-	  This is a simple debugfs interface to inject MCEs and test different
-	  aspects of the MCE handling code.
-	  WARNING: Do not even assume this interface is staying stable!
 config RAS_CEC
 	bool "Correctable Errors Collector"
 	depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
@@ -20,4 +10,3 @@ config RAS_CEC
 	  Bear in mind that this is absolutely useless if your platform doesn't
 	  have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
--- a/arch/x86/ras/Makefile
+++ b/arch/x86/ras/Makefile
-obj-$(CONFIG_MCE_AMD_INJ)		+= mce_amd_inj.o
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
-/*
- * A simple MCE injection facility for testing different aspects of the RAS
- * code. This driver should be built as module so that it can be loaded
- * on production kernels for testing purposes.
- *
- * This file may be distributed under the terms of the GNU General Public
- * License version 2.
- *
- * Copyright (c) 2010-15:  Borislav Petkov <bp@alien8.de>
- *			Advanced Micro Devices Inc.
- */
-#include <linux/kobject.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-#include <linux/pci.h>
-#include <asm/mce.h>
-#include <asm/smp.h>
-#include <asm/amd_nb.h>
-#include <asm/irq_vectors.h>
-#include "../kernel/cpu/mcheck/mce-internal.h"
-/*
- * Collect all the MCi_XXX settings
- */
-static struct mce i_mce;
-static struct dentry *dfs_inj;
-static u8 n_banks;
-#define MAX_FLAG_OPT_SIZE	3
-#define NBCFG			0x44
-enum injection_type {
-	SW_INJ = 0,	/* SW injection, simply decode the error */
-	HW_INJ,		/* Trigger a #MC */
-	DFR_INT_INJ,    /* Trigger Deferred error interrupt */
-	THR_INT_INJ,    /* Trigger threshold interrupt */
-	N_INJ_TYPES,
-};
-static const char * const flags_options[] = {
-	[SW_INJ] = "sw",
-	[HW_INJ] = "hw",
-	[DFR_INT_INJ] = "df",
-	[THR_INT_INJ] = "th",
-	NULL
-};
-/* Set default injection to SW_INJ */
-static enum injection_type inj_type = SW_INJ;
-#define MCE_INJECT_SET(reg)						\
-static int inj_##reg##_set(void *data, u64 val)				\
-{									\
-	struct mce *m = (struct mce *)data;				\
-									\
-	m->reg = val;							\
-	return 0;							\
-}
-MCE_INJECT_SET(status);
-MCE_INJECT_SET(misc);
-MCE_INJECT_SET(addr);
-MCE_INJECT_SET(synd);
-#define MCE_INJECT_GET(reg)						\
-static int inj_##reg##_get(void *data, u64 *val)			\
-{									\
-	struct mce *m = (struct mce *)data;				\
-									\
-	*val = m->reg;							\
-	return 0;							\
-}
-MCE_INJECT_GET(status);
-MCE_INJECT_GET(misc);
-MCE_INJECT_GET(addr);
-MCE_INJECT_GET(synd);
-DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
-/*
- * Caller needs to be make sure this cpu doesn't disappear
- * from under us, i.e.: get_cpu/put_cpu.
- */
-static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
-{
-	u32 l, h;
-	int err;
-	err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
-	if (err) {
-		pr_err("%s: error reading HWCR\n", __func__);
-		return err;
-	}
-	enable ? (l |= BIT(18)) : (l &= ~BIT(18));
-	err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
-	if (err)
-		pr_err("%s: error writing HWCR\n", __func__);
-	return err;
-}
-static int __set_inj(const char *buf)
-{
-	int i;
-	for (i = 0; i < N_INJ_TYPES; i++) {
-		if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
-			inj_type = i;
-			return 0;
-		}
-	}
-	return -EINVAL;
-}
-static ssize_t flags_read(struct file *filp, char __user *ubuf,
-			  size_t cnt, loff_t *ppos)
-{
-	char buf[MAX_FLAG_OPT_SIZE];
-	int n;
-	n = sprintf(buf, "%s\n", flags_options[inj_type]);
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
-}
-static ssize_t flags_write(struct file *filp, const char __user *ubuf,
-			   size_t cnt, loff_t *ppos)
-{
-	char buf[MAX_FLAG_OPT_SIZE], *__buf;
-	int err;
-	if (cnt > MAX_FLAG_OPT_SIZE)
-		return -EINVAL;
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt - 1] = 0;
-	/* strip whitespace */
-	__buf = strstrip(buf);
-	err = __set_inj(__buf);
-	if (err) {
-		pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
-		return err;
-	}
-	*ppos += cnt;
-	return cnt;
-}
-static const struct file_operations flags_fops = {
-	.read           = flags_read,
-	.write          = flags_write,
-	.llseek         = generic_file_llseek,
-};
-/*
- * On which CPU to inject?
- */
-MCE_INJECT_GET(extcpu);
-static int inj_extcpu_set(void *data, u64 val)
-{
-	struct mce *m = (struct mce *)data;
-	if (val >= nr_cpu_ids || !cpu_online(val)) {
-		pr_err("%s: Invalid CPU: %llu\n", __func__, val);
-		return -EINVAL;
-	}
-	m->extcpu = val;
-	return 0;
-}
-DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
-static void trigger_mce(void *info)
-{
-	asm volatile("int $18");
-}
-static void trigger_dfr_int(void *info)
-{
-	asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
-}
-static void trigger_thr_int(void *info)
-{
-	asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
-}
-static u32 get_nbc_for_node(int node_id)
-{
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-	u32 cores_per_node;
-	cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
-	return cores_per_node * node_id;
-}
-static void toggle_nb_mca_mst_cpu(u16 nid)
-{
-	struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
-	u32 val;
-	int err;
-	if (!F3)
-		return;
-	err = pci_read_config_dword(F3, NBCFG, &val);
-	if (err) {
-		pr_err("%s: Error reading F%dx%03x.\n",
-		       __func__, PCI_FUNC(F3->devfn), NBCFG);
-		return;
-	}
-	if (val & BIT(27))
-		return;
-	pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
-	       __func__);
-	val |= BIT(27);
-	err = pci_write_config_dword(F3, NBCFG, val);
-	if (err)
-		pr_err("%s: Error writing F%dx%03x.\n",
-		       __func__, PCI_FUNC(F3->devfn), NBCFG);
-}
-static void prepare_msrs(void *info)
-{
-	struct mce m = *(struct mce *)info;
-	u8 b = m.bank;
-	wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
-	if (boot_cpu_has(X86_FEATURE_SMCA)) {
-		if (m.inject_flags == DFR_INT_INJ) {
-			wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
-			wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
-		} else {
-			wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
-			wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
-		}
-		wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
-		wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
-	} else {
-		wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
-		wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
-		wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
-	}
-}
-static void do_inject(void)
-{
-	u64 mcg_status = 0;
-	unsigned int cpu = i_mce.extcpu;
-	u8 b = i_mce.bank;
-	rdtscll(i_mce.tsc);
-	if (i_mce.misc)
-		i_mce.status |= MCI_STATUS_MISCV;
-	if (i_mce.synd)
-		i_mce.status |= MCI_STATUS_SYNDV;
-	if (inj_type == SW_INJ) {
-		mce_inject_log(&i_mce);
-		return;
-	}
-	/* prep MCE global settings for the injection */
-	mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
-	if (!(i_mce.status & MCI_STATUS_PCC))
-		mcg_status |= MCG_STATUS_RIPV;
-	/*
-	 * Ensure necessary status bits for deferred errors:
-	 * - MCx_STATUS[Deferred]: make sure it is a deferred error
-	 * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
-	 */
-	if (inj_type == DFR_INT_INJ) {
-		i_mce.status |= MCI_STATUS_DEFERRED;
-		i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
-	}
-	/*
-	 * For multi node CPUs, logging and reporting of bank 4 errors happens
-	 * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
-	 * Fam10h and later BKDGs.
-	 */
-	if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
-	    b == 4 &&
-	    boot_cpu_data.x86 < 0x17) {
-		toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
-		cpu = get_nbc_for_node(amd_get_nb_id(cpu));
-	}
-	get_online_cpus();
-	if (!cpu_online(cpu))
-		goto err;
-	toggle_hw_mce_inject(cpu, true);
-	i_mce.mcgstatus = mcg_status;
-	i_mce.inject_flags = inj_type;
-	smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
-	toggle_hw_mce_inject(cpu, false);
-	switch (inj_type) {
-	case DFR_INT_INJ:
-		smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
-		break;
-	case THR_INT_INJ:
-		smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
-		break;
-	default:
-		smp_call_function_single(cpu, trigger_mce, NULL, 0);
-	}
-err:
-	put_online_cpus();
-}
-/*
- * This denotes into which bank we're injecting and triggers
- * the injection, at the same time.
- */
-static int inj_bank_set(void *data, u64 val)
-{
-	struct mce *m = (struct mce *)data;
-	if (val >= n_banks) {
-		pr_err("Non-existent MCE bank: %llu\n", val);
-		return -EINVAL;
-	}
-	m->bank = val;
-	do_inject();
-	return 0;
-}
-MCE_INJECT_GET(bank);
-DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
-static const char readme_msg[] =
-"Description of the files and their usages:\n"
-"\n"
-"Note1: i refers to the bank number below.\n"
-"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
-"as they mirror the hardware registers.\n"
-"\n"
-"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
-"\t attributes of the error which caused the MCE.\n"
-"\n"
-"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
-"\t used for error thresholding purposes and its validity is indicated by\n"
-"\t MCi_STATUS[MiscV].\n"
-"\n"
-"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
-"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
-"\n"
-"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
-"\t associated with the error.\n"
-"\n"
-"cpu:\t The CPU to inject the error on.\n"
-"\n"
-"bank:\t Specify the bank you want to inject the error into: the number of\n"
-"\t banks in a processor varies and is family/model-specific, therefore, the\n"
-"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
-"\t injection.\n"
-"\n"
-"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
-"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
-"\t for AMD processors.\n"
-"\n"
-"\t Allowed error injection types:\n"
-"\t  - \"sw\": Software error injection. Decode error to a human-readable \n"
-"\t    format only. Safe to use.\n"
-"\t  - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
-"\t    handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
-"\t    is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
-"\t    before injecting.\n"
-"\t  - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
-"\t    error APIC interrupt handler to handle the error if the feature is \n"
-"\t    is present in hardware. \n"
-"\t  - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
-"\t    APIC interrupt handler to handle the error. \n"
-"\n";
-static ssize_t
-inj_readme_read(struct file *filp, char __user *ubuf,
-		       size_t cnt, loff_t *ppos)
-{
-	return simple_read_from_buffer(ubuf, cnt, ppos,
-					readme_msg, strlen(readme_msg));
-}
-static const struct file_operations readme_fops = {
-	.read		= inj_readme_read,
-};
-static struct dfs_node {
-	char *name;
-	struct dentry *d;
-	const struct file_operations *fops;
-	umode_t perm;
-} dfs_fls[] = {
-	{ .name = "status",	.fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
-	{ .name = "misc",	.fops = &misc_fops,   .perm = S_IRUSR | S_IWUSR },
-	{ .name = "addr",	.fops = &addr_fops,   .perm = S_IRUSR | S_IWUSR },
-	{ .name = "synd",	.fops = &synd_fops,   .perm = S_IRUSR | S_IWUSR },
-	{ .name = "bank",	.fops = &bank_fops,   .perm = S_IRUSR | S_IWUSR },
-	{ .name = "flags",	.fops = &flags_fops,  .perm = S_IRUSR | S_IWUSR },
-	{ .name = "cpu",	.fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
-	{ .name = "README",	.fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
-};
-static int __init init_mce_inject(void)
-{
-	unsigned int i;
-	u64 cap;
-	rdmsrl(MSR_IA32_MCG_CAP, cap);
-	n_banks = cap & MCG_BANKCNT_MASK;
-	dfs_inj = debugfs_create_dir("mce-inject", NULL);
-	if (!dfs_inj)
-		return -EINVAL;
-	for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
-		dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
-						    dfs_fls[i].perm,
-						    dfs_inj,
-						    &i_mce,
-						    dfs_fls[i].fops);
-		if (!dfs_fls[i].d)
-			goto err_dfs_add;
-	}
-	return 0;
-err_dfs_add:
-	while (i-- > 0)
-		debugfs_remove(dfs_fls[i].d);
-	debugfs_remove(dfs_inj);
-	dfs_inj = NULL;
-	return -ENODEV;
-}
-static void __exit exit_mce_inject(void)
-{
-	debugfs_remove_recursive(dfs_inj);
-	dfs_inj = NULL;
-	memset(&dfs_fls, 0, sizeof(dfs_fls));
-}
-module_init(init_mce_inject);
-module_exit(exit_mce_inject);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Borislav Petkov <bp@alien8.de>");
-MODULE_AUTHOR("AMD Inc.");
-MODULE_DESCRIPTION("MCE injection facility for RAS testing");
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -89,14 +89,14 @@ bool ghes_disable;
 module_param_named(disable, ghes_disable, bool, 0);
 /*
- * All error sources notified with SCI shares one notifier function,
+ * All error sources notified with HED (Hardware Error Device) share a
- * so they need to be linked and checked one by one.  This is applied
+ * single notifier callback, so they need to be linked and checked one
- * to NMI too.
+ * by one. This holds true for NMI too.
 *
 * RCU is used for these lists, so ghes_list_mutex is only used for
 * list changing, not for traversing.
 */
-static LIST_HEAD(ghes_sci);
+static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 /*
@@ -702,14 +702,14 @@ static irqreturn_t ghes_irq_func(int irq, void *data)
 	return IRQ_HANDLED;
 }
-static int ghes_notify_sci(struct notifier_block *this,
+static int ghes_notify_hed(struct notifier_block *this, unsigned long event,
-				  unsigned long event, void *data)
+			   void *data)
 {
 	struct ghes *ghes;
 	int ret = NOTIFY_DONE;
 	rcu_read_lock();
-	list_for_each_entry_rcu(ghes, &ghes_sci, list) {
+	list_for_each_entry_rcu(ghes, &ghes_hed, list) {
 		if (!ghes_proc(ghes))
 			ret = NOTIFY_OK;
 	}
@@ -718,8 +718,8 @@ static int ghes_notify_sci(struct notifier_block *this,
 	return ret;
 }
-static struct notifier_block ghes_notifier_sci = {
+static struct notifier_block ghes_notifier_hed = {
-	.notifier_call = ghes_notify_sci,
+	.notifier_call = ghes_notify_hed,
 };
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
@@ -966,7 +966,10 @@ static int ghes_probe(struct platform_device *ghes_dev)
 	case ACPI_HEST_NOTIFY_POLLED:
 	case ACPI_HEST_NOTIFY_EXTERNAL:
 	case ACPI_HEST_NOTIFY_SCI:
+	case ACPI_HEST_NOTIFY_GSIV:
+	case ACPI_HEST_NOTIFY_GPIO:
 		break;
 	case ACPI_HEST_NOTIFY_NMI:
 		if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
 			pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
@@ -1024,13 +1027,17 @@ static int ghes_probe(struct platform_device *ghes_dev)
 			goto err_edac_unreg;
 		}
 		break;
 	case ACPI_HEST_NOTIFY_SCI:
+	case ACPI_HEST_NOTIFY_GSIV:
+	case ACPI_HEST_NOTIFY_GPIO:
 		mutex_lock(&ghes_list_mutex);
-		if (list_empty(&ghes_sci))
+		if (list_empty(&ghes_hed))
-			register_acpi_hed_notifier(&ghes_notifier_sci);
+			register_acpi_hed_notifier(&ghes_notifier_hed);
-		list_add_rcu(&ghes->list, &ghes_sci);
+		list_add_rcu(&ghes->list, &ghes_hed);
 		mutex_unlock(&ghes_list_mutex);
 		break;
 	case ACPI_HEST_NOTIFY_NMI:
 		ghes_nmi_add(ghes);
 		break;
@@ -1066,14 +1073,18 @@ static int ghes_remove(struct platform_device *ghes_dev)
 	case ACPI_HEST_NOTIFY_EXTERNAL:
 		free_irq(ghes->irq, ghes);
 		break;
 	case ACPI_HEST_NOTIFY_SCI:
+	case ACPI_HEST_NOTIFY_GSIV:
+	case ACPI_HEST_NOTIFY_GPIO:
 		mutex_lock(&ghes_list_mutex);
 		list_del_rcu(&ghes->list);
-		if (list_empty(&ghes_sci))
+		if (list_empty(&ghes_hed))
-			unregister_acpi_hed_notifier(&ghes_notifier_sci);
+			unregister_acpi_hed_notifier(&ghes_notifier_hed);
 		mutex_unlock(&ghes_list_mutex);
 		synchronize_rcu();
 		break;
 	case ACPI_HEST_NOTIFY_NMI:
 		ghes_nmi_remove(ghes);
 		break;

--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -481,7 +481,7 @@ static int __init create_debugfs_nodes(void)
 	count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
 				    &count_threshold, &count_threshold_ops);
-	if (!decay) {
+	if (!count) {
 		pr_warn("Error creating count_threshold debugfs node!\n");
 		goto err;
 	}

--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -29,7 +29,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
-int __init parse_ras_param(char *str)
+static int __init parse_ras_param(char *str)
 {
 #ifdef CONFIG_RAS_CEC
 	parse_cec_param(str);

--- a/drivers/xen/mcelog.c
+++ b/drivers/xen/mcelog.c
@@ -408,6 +408,8 @@ static int __init xen_late_init_mcelog(void)
 	if (ret)
 		goto deregister;
+	pr_info("/dev/mcelog registered by Xen\n");
 	return 0;
 deregister: