Commit e606d81d authored by Linus Torvalds

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "The main changes were:

   - Lots of enhancements for AMD SMCA (Scalable MCA
     features/extensions) systems: extract, decode and print more
     hardware error information and add matching support on the
     injection/testing side as well. (Yazen Ghannam)

   - Various MCE handling improvements on modern Intel Xeons. (Tony
     Luck)

   - Plus misc fixes and enhancements"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
  x86/RAS/mce_amd_inj: Remove debugfs dir recursively on exit
  x86/RAS/mce_amd_inj: Fix signed wrap around when decrementing index 'i'
  x86/RAS/mce_amd_inj: Fix some W= warnings
  x86/MCE/AMD, EDAC: Handle reserved bank 4 on Fam17h properly
  x86/mce/AMD: Extract the error address on SMCA systems
  x86/mce, EDAC/mce_amd: Print MCA_SYND and MCA_IPID during MCE on SMCA systems
  x86/mce/AMD: Save MCA_IPID in MCE struct on SMCA systems
  x86/mce/AMD: Ensure the deferred error interrupt is of type APIC on SMCA systems
  x86/mce/AMD: Update sysfs bank names for SMCA systems
  x86/mce/AMD, EDAC/mce_amd: Define and use tables for known SMCA IP types
  EDAC/mce_amd: Use SMCA prefix for error descriptions arrays
  EDAC/mce_amd: Add missing SMCA error descriptions
  x86/mce/AMD: Read MSRs on the CPU allocating the threshold blocks
  x86/RAS: Add syndrome support to mce_amd_inj
  EDAC/mce_amd: Print syndrome register value on SMCA systems
  x86/mce: Add support for new MCA_SYND register
  x86/mce/AMD: Use msr_ops.misc() in allocate_threshold_blocks()
  x86/mce: Drop X86_FEATURE_MCE_RECOVERY and the related model string test
  x86/mce: Improve memcpy_mcsafe()
  x86/mce: Add PCI quirks to identify Xeons with machine check recovery
  ...
parents 12b7bcb4 b199ac6c
...@@ -106,7 +106,6 @@ ...@@ -106,7 +106,6 @@
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
......
...@@ -40,9 +40,10 @@ ...@@ -40,9 +40,10 @@
#define MCI_STATUS_AR (1ULL<<55) /* Action required */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */
/* AMD-specific bits */ /* AMD-specific bits */
#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
/* /*
* McaX field if set indicates a given bank supports MCA extensions: * McaX field if set indicates a given bank supports MCA extensions:
...@@ -110,6 +111,7 @@ ...@@ -110,6 +111,7 @@
#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
...@@ -119,6 +121,7 @@ ...@@ -119,6 +121,7 @@
#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
...@@ -334,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected, ...@@ -334,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected,
* Scalable MCA. * Scalable MCA.
*/ */
#ifdef CONFIG_X86_MCE_AMD #ifdef CONFIG_X86_MCE_AMD
enum amd_ip_types {
SMCA_F17H_CORE = 0, /* Core errors */ /* These may be used by multiple smca_hwid_mcatypes */
SMCA_DF, /* Data Fabric */ enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
SMCA_EX, /* Execution Unit */
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */ SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */ SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */ SMCA_PSP, /* Platform Security Processor */
SMCA_SMU, /* System Management Unit */ SMCA_SMU, /* System Management Unit */
N_AMD_IP_TYPES N_SMCA_BANK_TYPES
}; };
struct amd_hwid { struct smca_bank_name {
const char *name; const char *name; /* Short name for sysfs */
unsigned int hwid; const char *long_name; /* Long name for pretty-printing */
}; };
extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES];
enum amd_core_mca_blocks { #define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype)
SMCA_LS = 0, /* Load Store */
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 cache */
SMCA_DE, /* Decoder unit */
RES, /* Reserved */
SMCA_EX, /* Execution unit */
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 cache */
N_CORE_MCA_BLOCKS
};
extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; struct smca_hwid_mcatype {
unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
u32 hwid_mcatype; /* (hwid,mcatype) tuple */
u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */
};
enum amd_df_mca_blocks { struct smca_bank_info {
SMCA_CS = 0, /* Coherent Slave */ struct smca_hwid_mcatype *type;
SMCA_PIE, /* Power management, Interrupts, etc */ u32 type_instance;
N_DF_BLOCKS
}; };
extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; extern struct smca_bank_info smca_banks[MAX_NR_BANKS];
#endif #endif
#endif /* _ASM_X86_MCE_H */ #endif /* _ASM_X86_MCE_H */
...@@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) ...@@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
{ {
if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
return memcpy_mcsafe(dst, src, n); return memcpy_mcsafe(dst, src, n);
memcpy(dst, src, n);
return 0;
} }
/** /**
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#define _ASM_X86_STRING_64_H #define _ASM_X86_STRING_64_H
#ifdef __KERNEL__ #ifdef __KERNEL__
#include <linux/jump_label.h>
/* Written 2002 by Andi Kleen */ /* Written 2002 by Andi Kleen */
...@@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct); ...@@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct);
#define memset(s, c, n) __memset(s, c, n) #define memset(s, c, n) __memset(s, c, n)
#endif #endif
__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
DECLARE_STATIC_KEY_FALSE(mcsafe_key);
/** /**
* memcpy_mcsafe - copy memory with indication if a machine check happened * memcpy_mcsafe - copy memory with indication if a machine check happened
* *
...@@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct); ...@@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct);
* @cnt: number of bytes to copy * @cnt: number of bytes to copy
* *
* Low level memory copy function that catches machine checks * Low level memory copy function that catches machine checks
* We only call into the "safe" function on systems that can
* actually do machine check recovery. Everyone else can just
* use memcpy().
* *
* Return 0 for success, -EFAULT for fail * Return 0 for success, -EFAULT for fail
*/ */
int memcpy_mcsafe(void *dst, const void *src, size_t cnt); static __always_inline __must_check int
memcpy_mcsafe(void *dst, const void *src, size_t cnt)
{
#ifdef CONFIG_X86_MCE
if (static_branch_unlikely(&mcsafe_key))
return memcpy_mcsafe_unrolled(dst, src, cnt);
else
#endif
memcpy(dst, src, cnt);
return 0;
}
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
......
...@@ -26,6 +26,8 @@ struct mce { ...@@ -26,6 +26,8 @@ struct mce {
__u32 socketid; /* CPU socket ID */ __u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */ __u32 apicid; /* CPU initial apic ID */
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
}; };
#define MCE_GET_RECORD_LEN _IOR('M', 1, int) #define MCE_GET_RECORD_LEN _IOR('M', 1, int)
......
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/irq_work.h> #include <linux/irq_work.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/jump_label.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/traps.h> #include <asm/traps.h>
...@@ -292,6 +293,13 @@ static void print_mce(struct mce *m) ...@@ -292,6 +293,13 @@ static void print_mce(struct mce *m)
if (m->misc) if (m->misc)
pr_cont("MISC %llx ", m->misc); pr_cont("MISC %llx ", m->misc);
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
pr_cont("\n"); pr_cont("\n");
/* /*
* Note this output is parsed by external tools and old fields * Note this output is parsed by external tools and old fields
...@@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i) ...@@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i)
{ {
if (m->status & MCI_STATUS_MISCV) if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i)); m->misc = mce_rdmsrl(msr_ops.misc(i));
if (m->status & MCI_STATUS_ADDRV) { if (m->status & MCI_STATUS_ADDRV) {
m->addr = mce_rdmsrl(msr_ops.addr(i)); m->addr = mce_rdmsrl(msr_ops.addr(i));
...@@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i) ...@@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i)
m->addr >>= shift; m->addr >>= shift;
m->addr <<= shift; m->addr <<= shift;
} }
/*
* Extract [55:<lsb>] where lsb is the least significant
* *valid* bit of the address bits.
*/
if (mce_flags.smca) {
u8 lsb = (m->addr >> 56) & 0x3f;
m->addr &= GENMASK_ULL(55, lsb);
}
}
if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
if (m->status & MCI_STATUS_SYNDV)
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
} }
} }
...@@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) ...@@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45) if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu; quirk_no_way_out = quirk_sandybridge_ifu;
/*
* MCG_CAP.MCG_SER_P is necessary but not sufficient to know
* whether this processor will actually generate recoverable
* machine checks. Check to see if this is an E7 model Xeon.
* We can't do a model number check because E5 and E7 use the
* same model number. E5 doesn't support recovery, E7 does.
*/
if (mca_cfg.recovery || (mca_cfg.ser &&
!strncmp(c->x86_model_id,
"Intel(R) Xeon(R) CPU E7-", 24)))
set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
} }
if (cfg->monarch_timeout < 0) if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0; cfg->monarch_timeout = 0;
...@@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank) ...@@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank)
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting. * mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold * mce=bios_cmci_threshold Don't program the CMCI threshold
* mce=recovery force enable memcpy_mcsafe()
*/ */
static int __init mcheck_enable(char *str) static int __init mcheck_enable(char *str)
{ {
...@@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void) ...@@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void)
static int __init mcheck_debugfs_init(void) { return -EINVAL; } static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif #endif
DEFINE_STATIC_KEY_FALSE(mcsafe_key);
EXPORT_SYMBOL_GPL(mcsafe_key);
static int __init mcheck_late_init(void) static int __init mcheck_late_init(void)
{ {
if (mca_cfg.recovery)
static_branch_inc(&mcsafe_key);
mcheck_debugfs_init(); mcheck_debugfs_init();
/* /*
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/init.h> #include <linux/init.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/string.h>
#include <asm/amd_nb.h> #include <asm/amd_nb.h>
#include <asm/apic.h> #include <asm/apic.h>
...@@ -63,34 +64,71 @@ static const char * const th_names[] = { ...@@ -63,34 +64,71 @@ static const char * const th_names[] = {
"execution_unit", "execution_unit",
}; };
/* Define HWID to IP type mappings for Scalable MCA */ static const char * const smca_umc_block_names[] = {
struct amd_hwid amd_hwids[] = { "dram_ecc",
[SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, "misc_umc"
[SMCA_DF] = { "data_fabric", 0x2E },
[SMCA_UMC] = { "umc", 0x96 },
[SMCA_PB] = { "param_block", 0x5 },
[SMCA_PSP] = { "psp", 0xFF },
[SMCA_SMU] = { "smu", 0x1 },
}; };
EXPORT_SYMBOL_GPL(amd_hwids);
struct smca_bank_name smca_bank_names[] = {
const char * const amd_core_mcablock_names[] = { [SMCA_LS] = { "load_store", "Load Store Unit" },
[SMCA_LS] = "load_store", [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_IF] = "insn_fetch", [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
[SMCA_L2_CACHE] = "l2_cache", [SMCA_DE] = { "decode_unit", "Decode Unit" },
[SMCA_DE] = "decode_unit", [SMCA_EX] = { "execution_unit", "Execution Unit" },
[RES] = "", [SMCA_FP] = { "floating_point", "Floating Point Unit" },
[SMCA_EX] = "execution_unit", [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
[SMCA_FP] = "floating_point", [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
[SMCA_L3_CACHE] = "l3_cache", [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
[SMCA_UMC] = { "umc", "Unified Memory Controller" },
[SMCA_PB] = { "param_block", "Parameter Block" },
[SMCA_PSP] = { "psp", "Platform Security Processor" },
[SMCA_SMU] = { "smu", "System Management Unit" },
}; };
EXPORT_SYMBOL_GPL(amd_core_mcablock_names); EXPORT_SYMBOL_GPL(smca_bank_names);
static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype, xec_bitmap } */
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
{ SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
{ SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
{ SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
{ SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
{ SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
{ SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
/* Data Fabric MCA types */
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
/* Parameter Block MCA type */
{ SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
/* Platform Security Processor MCA type */
{ SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
const char * const amd_df_mcablock_names[] = { /* System Management Unit MCA type */
[SMCA_CS] = "coherent_slave", { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
[SMCA_PIE] = "pie",
}; };
EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
struct smca_bank_info smca_banks[MAX_NR_BANKS];
EXPORT_SYMBOL_GPL(smca_banks);
/*
* In SMCA enabled processors, we can have multiple banks for a given IP type.
* So to define a unique name for each bank, we use a temp c-string to append
* the MCA_IPID[InstanceId] to type's name in get_name().
*
* InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
* is greater than 8 plus 1 (for underscore) plus length of longest type name.
*/
#define MAX_MCATYPE_NAME_LEN 30
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
...@@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; ...@@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
* CPU Initialization * CPU Initialization
*/ */
static void get_smca_bank_info(unsigned int bank)
{
unsigned int i, hwid_mcatype, cpu = smp_processor_id();
struct smca_hwid_mcatype *type;
u32 high, instanceId;
u16 hwid, mcatype;
/* Collect bank_info using CPU 0 for now. */
if (cpu)
return;
if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
hwid = high & MCI_IPID_HWID;
mcatype = (high & MCI_IPID_MCATYPE) >> 16;
hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
type = &smca_hwid_mcatypes[i];
if (hwid_mcatype == type->hwid_mcatype) {
smca_banks[bank].type = type;
smca_banks[bank].type_instance = instanceId;
break;
}
}
}
struct thresh_restart { struct thresh_restart {
struct threshold_block *b; struct threshold_block *b;
int reset; int reset;
...@@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) ...@@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high); wrmsr(MSR_CU_DEF_ERR, low, high);
} }
static u32 get_block_address(u32 current_addr, u32 low, u32 high, static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block) unsigned int bank, unsigned int block)
{ {
u32 addr = 0, offset = 0; u32 addr = 0, offset = 0;
...@@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, ...@@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
*/ */
u32 low, high; u32 low, high;
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return addr; return addr;
if (!(low & MCI_CONFIG_MCAX)) if (!(low & MCI_CONFIG_MCAX))
return addr; return addr;
if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO)) (low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
} }
...@@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, ...@@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
*/ */
smca_high &= ~BIT(2); smca_high &= ~BIT(2);
/*
* SMCA sets the Deferred Error Interrupt type per bank.
*
* MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
* if the DeferredIntType bit field is available.
*
* MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
* high portion of the MSR). OS should set this to 0x1 to enable
* APIC based interrupt. First, check that no interrupt has been
* set.
*/
if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
smca_high |= BIT(5);
wrmsr(smca_addr, smca_low, smca_high); wrmsr(smca_addr, smca_low, smca_high);
} }
...@@ -421,12 +503,15 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, ...@@ -421,12 +503,15 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
void mce_amd_feature_init(struct cpuinfo_x86 *c) void mce_amd_feature_init(struct cpuinfo_x86 *c)
{ {
u32 low = 0, high = 0, address = 0; u32 low = 0, high = 0, address = 0;
unsigned int bank, block; unsigned int bank, block, cpu = smp_processor_id();
int offset = -1; int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) { for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
get_smca_bank_info(bank);
for (block = 0; block < NR_BLOCKS; ++block) { for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block); address = get_block_address(cpu, address, low, high, bank, block);
if (!address) if (!address)
break; break;
...@@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) ...@@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
if (threshold_err) if (threshold_err)
m.misc = misc; m.misc = misc;
if (m.status & MCI_STATUS_ADDRV) if (m.status & MCI_STATUS_ADDRV) {
rdmsrl(msr_addr, m.addr); rdmsrl(msr_addr, m.addr);
/*
* Extract [55:<lsb>] where lsb is the least significant
* *valid* bit of the address bits.
*/
if (mce_flags.smca) {
u8 lsb = (m.addr >> 56) & 0x3f;
m.addr &= GENMASK_ULL(55, lsb);
}
}
if (mce_flags.smca) {
rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
if (m.status & MCI_STATUS_SYNDV)
rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
}
mce_log(&m); mce_log(&m);
wrmsrl(msr_status, 0); wrmsrl(msr_status, 0);
...@@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void) ...@@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void)
static void amd_threshold_interrupt(void) static void amd_threshold_interrupt(void)
{ {
u32 low = 0, high = 0, address = 0; u32 low = 0, high = 0, address = 0;
int cpu = smp_processor_id(); unsigned int bank, block, cpu = smp_processor_id();
unsigned int bank, block;
/* assume first bank caused it */ /* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) { for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank))) if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue; continue;
for (block = 0; block < NR_BLOCKS; ++block) { for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block); address = get_block_address(cpu, address, low, high, bank, block);
if (!address) if (!address)
break; break;
...@@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = { ...@@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = {
.default_attrs = default_attrs, .default_attrs = default_attrs,
}; };
/*
 * Pick the name used for a threshold bank (@b == NULL) or for one of its
 * blocks (@b != NULL).
 *
 * Without SMCA: bank4_names(b) for a block of bank 4, else th_names[bank].
 *
 * With SMCA: NULL when no type has been recorded for the bank in
 * smca_banks[]. For a block of a UMC bank, the entry of
 * smca_umc_block_names[] selected by b->block, or NULL when b->block is
 * past the end of that table. Otherwise "<short type name>_<instance, hex>"
 * is formatted into the file-scope buffer buf_mcatype and that buffer is
 * returned, so each such call overwrites the previous result.
 */
static const char *get_name(unsigned int bank, struct threshold_block *b)
{
	unsigned int type_idx;

	if (mce_flags.smca) {
		if (!smca_banks[bank].type)
			return NULL;

		type_idx = smca_banks[bank].type->bank_type;

		if (!b || type_idx != SMCA_UMC) {
			/* Bank-level (or non-UMC) name: type name plus instance id. */
			snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
				 "%s_%x", smca_bank_names[type_idx].name,
				 smca_banks[bank].type_instance);
			return buf_mcatype;
		}

		/* UMC blocks have fixed names, indexed by block number. */
		if (b->block >= ARRAY_SIZE(smca_umc_block_names))
			return NULL;

		return smca_umc_block_names[b->block];
	}

	/* Non-SMCA naming. */
	if (b && bank == 4)
		return bank4_names(b);

	return th_names[bank];
}
static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
unsigned int block, u32 address) unsigned int block, u32 address)
{ {
...@@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, ...@@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
err = kobject_init_and_add(&b->kobj, &threshold_ktype, err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj, per_cpu(threshold_banks, cpu)[bank]->kobj,
(bank == 4 ? bank4_names(b) : th_names[bank])); get_name(bank, b));
if (err) if (err)
goto out_free; goto out_free;
recurse: recurse:
address = get_block_address(address, low, high, bank, ++block); address = get_block_address(cpu, address, low, high, bank, ++block);
if (!address) if (!address)
return 0; return 0;
...@@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) ...@@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
struct device *dev = per_cpu(mce_device, cpu); struct device *dev = per_cpu(mce_device, cpu);
struct amd_northbridge *nb = NULL; struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL; struct threshold_bank *b = NULL;
const char *name = th_names[bank]; const char *name = get_name(bank, NULL);
int err = 0; int err = 0;
if (is_shared_bank(bank)) { if (is_shared_bank(bank)) {
...@@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) ...@@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
} }
} }
err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
if (!err) if (!err)
goto out; goto out;
......
...@@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, ...@@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
amd_disable_seq_and_redirect_scrub); amd_disable_seq_and_redirect_scrub);
#endif #endif
#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
#include <linux/jump_label.h>
#include <asm/string_64.h>
/*
 * Ivy Bridge, Haswell, Broadwell
 *
 * Reads the config-space dword at offset 0x84 of @pdev and, when bit 4
 * (mask 0x10) is set, increments mcsafe_key.
 * NOTE(review): presumably 0x84 is the CAPID0 register and bit 4 marks
 * machine-check recovery support -- confirm against the platform datasheet.
 */
static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
{
	u32 cap;

	pci_read_config_dword(pdev, 0x84, &cap);

	if (!(cap & 0x10))
		return;

	static_branch_inc(&mcsafe_key);
}
/*
 * Skylake
 *
 * Reads the config-space dword at offset 0x84 of @pdev and, when bits 6
 * and 7 (mask 0xc0) are both set, increments mcsafe_key.
 * NOTE(review): presumably 0x84 is the CAPID0 register and both bits are
 * needed to signal machine-check recovery support -- confirm against the
 * platform datasheet.
 */
static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
{
	u32 cap;

	pci_read_config_dword(pdev, 0x84, &cap);

	if ((cap & 0xc0) != 0xc0)
		return;

	static_branch_inc(&mcsafe_key);
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
#endif
...@@ -38,7 +38,7 @@ EXPORT_SYMBOL(__copy_user_nocache); ...@@ -38,7 +38,7 @@ EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_from_user);
EXPORT_SYMBOL(_copy_to_user); EXPORT_SYMBOL(_copy_to_user);
EXPORT_SYMBOL_GPL(memcpy_mcsafe); EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled);
EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(clear_page);
......
...@@ -181,11 +181,11 @@ ENDPROC(memcpy_orig) ...@@ -181,11 +181,11 @@ ENDPROC(memcpy_orig)
#ifndef CONFIG_UML #ifndef CONFIG_UML
/* /*
* memcpy_mcsafe - memory copy with machine check exception handling * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
* Note that we only catch machine checks when reading the source addresses. * Note that we only catch machine checks when reading the source addresses.
* Writes to target are posted and don't generate machine checks. * Writes to target are posted and don't generate machine checks.
*/ */
ENTRY(memcpy_mcsafe) ENTRY(memcpy_mcsafe_unrolled)
cmpl $8, %edx cmpl $8, %edx
/* Less than 8 bytes? Go to byte copy loop */ /* Less than 8 bytes? Go to byte copy loop */
jb .L_no_whole_words jb .L_no_whole_words
...@@ -273,7 +273,7 @@ ENTRY(memcpy_mcsafe) ...@@ -273,7 +273,7 @@ ENTRY(memcpy_mcsafe)
.L_done_memcpy_trap: .L_done_memcpy_trap:
xorq %rax, %rax xorq %rax, %rax
ret ret
ENDPROC(memcpy_mcsafe) ENDPROC(memcpy_mcsafe_unrolled)
.section .fixup, "ax" .section .fixup, "ax"
/* Return -EFAULT for any failure */ /* Return -EFAULT for any failure */
......
...@@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val) \ ...@@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val) \
MCE_INJECT_SET(status); MCE_INJECT_SET(status);
MCE_INJECT_SET(misc); MCE_INJECT_SET(misc);
MCE_INJECT_SET(addr); MCE_INJECT_SET(addr);
MCE_INJECT_SET(synd);
#define MCE_INJECT_GET(reg) \ #define MCE_INJECT_GET(reg) \
static int inj_##reg##_get(void *data, u64 *val) \ static int inj_##reg##_get(void *data, u64 *val) \
...@@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val) \ ...@@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val) \
MCE_INJECT_GET(status); MCE_INJECT_GET(status);
MCE_INJECT_GET(misc); MCE_INJECT_GET(misc);
MCE_INJECT_GET(addr); MCE_INJECT_GET(addr);
MCE_INJECT_GET(synd);
DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
/* /*
* Caller needs to make sure this cpu doesn't disappear * Caller needs to make sure this cpu doesn't disappear
...@@ -243,27 +246,27 @@ static void toggle_nb_mca_mst_cpu(u16 nid) ...@@ -243,27 +246,27 @@ static void toggle_nb_mca_mst_cpu(u16 nid)
static void prepare_msrs(void *info) static void prepare_msrs(void *info)
{ {
struct mce i_mce = *(struct mce *)info; struct mce m = *(struct mce *)info;
u8 b = i_mce.bank; u8 b = m.bank;
wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus); wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (boot_cpu_has(X86_FEATURE_SMCA)) { if (boot_cpu_has(X86_FEATURE_SMCA)) {
if (i_mce.inject_flags == DFR_INT_INJ) { if (m.inject_flags == DFR_INT_INJ) {
wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status); wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr); wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
} else { } else {
wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status); wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr); wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
} }
wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc); wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
} else { } else {
wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status); wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr); wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc); wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
} }
} }
static void do_inject(void) static void do_inject(void)
...@@ -275,6 +278,9 @@ static void do_inject(void) ...@@ -275,6 +278,9 @@ static void do_inject(void)
if (i_mce.misc) if (i_mce.misc)
i_mce.status |= MCI_STATUS_MISCV; i_mce.status |= MCI_STATUS_MISCV;
if (i_mce.synd)
i_mce.status |= MCI_STATUS_SYNDV;
if (inj_type == SW_INJ) { if (inj_type == SW_INJ) {
mce_inject_log(&i_mce); mce_inject_log(&i_mce);
return; return;
...@@ -301,7 +307,9 @@ static void do_inject(void) ...@@ -301,7 +307,9 @@ static void do_inject(void)
* only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
* Fam10h and later BKDGs. * Fam10h and later BKDGs.
*/ */
if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) { if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
b == 4 &&
boot_cpu_data.x86 < 0x17) {
toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
cpu = get_nbc_for_node(amd_get_nb_id(cpu)); cpu = get_nbc_for_node(amd_get_nb_id(cpu));
} }
...@@ -371,6 +379,9 @@ static const char readme_msg[] = ...@@ -371,6 +379,9 @@ static const char readme_msg[] =
"\t used for error thresholding purposes and its validity is indicated by\n" "\t used for error thresholding purposes and its validity is indicated by\n"
"\t MCi_STATUS[MiscV].\n" "\t MCi_STATUS[MiscV].\n"
"\n" "\n"
"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
"\n"
"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" "addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
"\t associated with the error.\n" "\t associated with the error.\n"
"\n" "\n"
...@@ -420,6 +431,7 @@ static struct dfs_node { ...@@ -420,6 +431,7 @@ static struct dfs_node {
{ .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
...@@ -428,7 +440,7 @@ static struct dfs_node { ...@@ -428,7 +440,7 @@ static struct dfs_node {
static int __init init_mce_inject(void) static int __init init_mce_inject(void)
{ {
int i; unsigned int i;
u64 cap; u64 cap;
rdmsrl(MSR_IA32_MCG_CAP, cap); rdmsrl(MSR_IA32_MCG_CAP, cap);
...@@ -452,26 +464,22 @@ static int __init init_mce_inject(void) ...@@ -452,26 +464,22 @@ static int __init init_mce_inject(void)
return 0; return 0;
err_dfs_add: err_dfs_add:
while (--i >= 0) while (i-- > 0)
debugfs_remove(dfs_fls[i].d); debugfs_remove(dfs_fls[i].d);
debugfs_remove(dfs_inj); debugfs_remove(dfs_inj);
dfs_inj = NULL; dfs_inj = NULL;
return -ENOMEM; return -ENODEV;
} }
static void __exit exit_mce_inject(void) static void __exit exit_mce_inject(void)
{ {
int i;
for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) debugfs_remove_recursive(dfs_inj);
debugfs_remove(dfs_fls[i].d); dfs_inj = NULL;
memset(&dfs_fls, 0, sizeof(dfs_fls)); memset(&dfs_fls, 0, sizeof(dfs_fls));
debugfs_remove(dfs_inj);
dfs_inj = NULL;
} }
module_init(init_mce_inject); module_init(init_mce_inject);
module_exit(exit_mce_inject); module_exit(exit_mce_inject);
......
...@@ -148,12 +148,12 @@ static const char * const mc6_mce_desc[] = { ...@@ -148,12 +148,12 @@ static const char * const mc6_mce_desc[] = {
}; };
/* Scalable MCA error strings */ /* Scalable MCA error strings */
static const char * const f17h_ls_mce_desc[] = { static const char * const smca_ls_mce_desc[] = {
"Load queue parity", "Load queue parity",
"Store queue parity", "Store queue parity",
"Miss address buffer payload parity", "Miss address buffer payload parity",
"L1 TLB parity", "L1 TLB parity",
"", /* reserved */ "Reserved",
"DC tag error type 6", "DC tag error type 6",
"DC tag error type 1", "DC tag error type 1",
"Internal error type 1", "Internal error type 1",
...@@ -172,7 +172,7 @@ static const char * const f17h_ls_mce_desc[] = { ...@@ -172,7 +172,7 @@ static const char * const f17h_ls_mce_desc[] = {
"L2 fill data error", "L2 fill data error",
}; };
static const char * const f17h_if_mce_desc[] = { static const char * const smca_if_mce_desc[] = {
"microtag probe port parity error", "microtag probe port parity error",
"IC microtag or full tag multi-hit error", "IC microtag or full tag multi-hit error",
"IC full tag parity", "IC full tag parity",
...@@ -185,19 +185,22 @@ static const char * const f17h_if_mce_desc[] = { ...@@ -185,19 +185,22 @@ static const char * const f17h_if_mce_desc[] = {
"BPQ snoop parity on Thread 1", "BPQ snoop parity on Thread 1",
"L1 BTB multi-match error", "L1 BTB multi-match error",
"L2 BTB multi-match error", "L2 BTB multi-match error",
"L2 Cache Response Poison error",
"System Read Data error",
}; };
static const char * const f17h_l2_mce_desc[] = { static const char * const smca_l2_mce_desc[] = {
"L2M tag multi-way-hit error", "L2M tag multi-way-hit error",
"L2M tag ECC error", "L2M tag ECC error",
"L2M data ECC error", "L2M data ECC error",
"HW assert", "HW assert",
}; };
static const char * const f17h_de_mce_desc[] = { static const char * const smca_de_mce_desc[] = {
"uop cache tag parity error", "uop cache tag parity error",
"uop cache data parity error", "uop cache data parity error",
"Insn buffer parity error", "Insn buffer parity error",
"uop queue parity error",
"Insn dispatch queue parity error", "Insn dispatch queue parity error",
"Fetch address FIFO parity", "Fetch address FIFO parity",
"Patch RAM data parity", "Patch RAM data parity",
...@@ -205,7 +208,7 @@ static const char * const f17h_de_mce_desc[] = { ...@@ -205,7 +208,7 @@ static const char * const f17h_de_mce_desc[] = {
"uop buffer parity" "uop buffer parity"
}; };
static const char * const f17h_ex_mce_desc[] = { static const char * const smca_ex_mce_desc[] = {
"Watchdog timeout error", "Watchdog timeout error",
"Phy register file parity", "Phy register file parity",
"Flag register file parity", "Flag register file parity",
...@@ -214,18 +217,22 @@ static const char * const f17h_ex_mce_desc[] = { ...@@ -214,18 +217,22 @@ static const char * const f17h_ex_mce_desc[] = {
"EX payload parity", "EX payload parity",
"Checkpoint queue parity", "Checkpoint queue parity",
"Retire dispatch queue parity", "Retire dispatch queue parity",
"Retire status queue parity error",
"Scheduling queue parity error",
"Branch buffer queue parity error",
}; };
static const char * const f17h_fp_mce_desc[] = { static const char * const smca_fp_mce_desc[] = {
"Physical register file parity", "Physical register file parity",
"Freelist parity error", "Freelist parity error",
"Schedule queue parity", "Schedule queue parity",
"NSQ parity error", "NSQ parity error",
"Retire queue parity", "Retire queue parity",
"Status register file parity", "Status register file parity",
"Hardware assertion",
}; };
static const char * const f17h_l3_mce_desc[] = { static const char * const smca_l3_mce_desc[] = {
"Shadow tag macro ECC error", "Shadow tag macro ECC error",
"Shadow tag macro multi-way-hit error", "Shadow tag macro multi-way-hit error",
"L3M tag ECC error", "L3M tag ECC error",
...@@ -236,7 +243,7 @@ static const char * const f17h_l3_mce_desc[] = { ...@@ -236,7 +243,7 @@ static const char * const f17h_l3_mce_desc[] = {
"L3 HW assert", "L3 HW assert",
}; };
static const char * const f17h_cs_mce_desc[] = { static const char * const smca_cs_mce_desc[] = {
"Illegal request from transport layer", "Illegal request from transport layer",
"Address violation", "Address violation",
"Security violation", "Security violation",
...@@ -248,14 +255,14 @@ static const char * const f17h_cs_mce_desc[] = { ...@@ -248,14 +255,14 @@ static const char * const f17h_cs_mce_desc[] = {
"ECC error on probe filter access", "ECC error on probe filter access",
}; };
static const char * const f17h_pie_mce_desc[] = { static const char * const smca_pie_mce_desc[] = {
"HW assert", "HW assert",
"Internal PIE register security violation", "Internal PIE register security violation",
"Error on GMI link", "Error on GMI link",
"Poison data written to internal PIE register", "Poison data written to internal PIE register",
}; };
static const char * const f17h_umc_mce_desc[] = { static const char * const smca_umc_mce_desc[] = {
"DRAM ECC error", "DRAM ECC error",
"Data poison error on DRAM", "Data poison error on DRAM",
"SDP parity error", "SDP parity error",
...@@ -264,18 +271,39 @@ static const char * const f17h_umc_mce_desc[] = { ...@@ -264,18 +271,39 @@ static const char * const f17h_umc_mce_desc[] = {
"Write data CRC error", "Write data CRC error",
}; };
static const char * const f17h_pb_mce_desc[] = { static const char * const smca_pb_mce_desc[] = {
"Parameter Block RAM ECC error", "Parameter Block RAM ECC error",
}; };
static const char * const f17h_psp_mce_desc[] = { static const char * const smca_psp_mce_desc[] = {
"PSP RAM ECC or parity error", "PSP RAM ECC or parity error",
}; };
static const char * const f17h_smu_mce_desc[] = { static const char * const smca_smu_mce_desc[] = {
"SMU RAM ECC or parity error", "SMU RAM ECC or parity error",
}; };
/*
 * Pairs one table of error-description strings with the number of entries
 * in that table, so a lookup can be bounds-checked before indexing.
 */
struct smca_mce_desc {
/* Table of description strings; the decoder indexes it by extended error code. */
const char * const *descs;
/* Number of entries in descs. */
unsigned int num_descs;
};
/*
 * Error-description tables for the known SMCA bank types, indexed by the
 * bank-type constant. Each slot records the string table together with its
 * length so callers can range-check an extended error code before using it
 * as an index.
 */
static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS] = {
		.descs     = smca_ls_mce_desc,
		.num_descs = ARRAY_SIZE(smca_ls_mce_desc),
	},
	[SMCA_IF] = {
		.descs     = smca_if_mce_desc,
		.num_descs = ARRAY_SIZE(smca_if_mce_desc),
	},
	[SMCA_L2_CACHE] = {
		.descs     = smca_l2_mce_desc,
		.num_descs = ARRAY_SIZE(smca_l2_mce_desc),
	},
	[SMCA_DE] = {
		.descs     = smca_de_mce_desc,
		.num_descs = ARRAY_SIZE(smca_de_mce_desc),
	},
	[SMCA_EX] = {
		.descs     = smca_ex_mce_desc,
		.num_descs = ARRAY_SIZE(smca_ex_mce_desc),
	},
	[SMCA_FP] = {
		.descs     = smca_fp_mce_desc,
		.num_descs = ARRAY_SIZE(smca_fp_mce_desc),
	},
	[SMCA_L3_CACHE] = {
		.descs     = smca_l3_mce_desc,
		.num_descs = ARRAY_SIZE(smca_l3_mce_desc),
	},
	[SMCA_CS] = {
		.descs     = smca_cs_mce_desc,
		.num_descs = ARRAY_SIZE(smca_cs_mce_desc),
	},
	[SMCA_PIE] = {
		.descs     = smca_pie_mce_desc,
		.num_descs = ARRAY_SIZE(smca_pie_mce_desc),
	},
	[SMCA_UMC] = {
		.descs     = smca_umc_mce_desc,
		.num_descs = ARRAY_SIZE(smca_umc_mce_desc),
	},
	[SMCA_PB] = {
		.descs     = smca_pb_mce_desc,
		.num_descs = ARRAY_SIZE(smca_pb_mce_desc),
	},
	[SMCA_PSP] = {
		.descs     = smca_psp_mce_desc,
		.num_descs = ARRAY_SIZE(smca_psp_mce_desc),
	},
	[SMCA_SMU] = {
		.descs     = smca_smu_mce_desc,
		.num_descs = ARRAY_SIZE(smca_smu_mce_desc),
	},
};
static bool f12h_mc0_mce(u16 ec, u8 xec) static bool f12h_mc0_mce(u16 ec, u8 xec)
{ {
bool ret = false; bool ret = false;
...@@ -820,175 +848,35 @@ static void decode_mc6_mce(struct mce *m) ...@@ -820,175 +848,35 @@ static void decode_mc6_mce(struct mce *m)
pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
} }
/*
 * Print a one-line description of a core-complex MCA error.
 *
 * @ip_name:  label printed in front of "Error: ".
 * @xec:      extended error code, used as the index into the per-block
 *            description table.
 * @mca_type: selects which per-block description table is consulted.
 *
 * The "<ip_name> Error: " prefix is always emitted first; every exit path
 * then finishes that line with pr_cont(), either with the matching
 * description or with a diagnostic when the type or code is not known.
 */
static void decode_f17h_core_errors(const char *ip_name, u8 xec,
				    unsigned int mca_type)
{
	const char * const *descs;
	size_t count;

	pr_emerg(HW_ERR "%s Error: ", ip_name);

	switch (mca_type) {
	case SMCA_LS:
		/* Code 4 is the reserved (empty) slot of the LS table. */
		if (xec == 0x4) {
			pr_cont("Unrecognized LS MCA error code.\n");
			return;
		}
		descs = f17h_ls_mce_desc;
		count = ARRAY_SIZE(f17h_ls_mce_desc);
		break;

	case SMCA_IF:
		descs = f17h_if_mce_desc;
		count = ARRAY_SIZE(f17h_if_mce_desc);
		break;

	case SMCA_L2_CACHE:
		descs = f17h_l2_mce_desc;
		count = ARRAY_SIZE(f17h_l2_mce_desc);
		break;

	case SMCA_DE:
		descs = f17h_de_mce_desc;
		count = ARRAY_SIZE(f17h_de_mce_desc);
		break;

	case SMCA_EX:
		descs = f17h_ex_mce_desc;
		count = ARRAY_SIZE(f17h_ex_mce_desc);
		break;

	case SMCA_FP:
		descs = f17h_fp_mce_desc;
		count = ARRAY_SIZE(f17h_fp_mce_desc);
		break;

	case SMCA_L3_CACHE:
		descs = f17h_l3_mce_desc;
		count = ARRAY_SIZE(f17h_l3_mce_desc);
		break;

	default:
		pr_cont("Corrupted MCA core error info.\n");
		return;
	}

	/* Reject codes that fall past the end of the selected table. */
	if (xec >= count) {
		pr_cont("Unrecognized %s MCA bank error code.\n",
			amd_core_mcablock_names[mca_type]);
		return;
	}

	pr_cont("%s.\n", descs[xec]);
}
/*
 * Print a one-line description of a Data Fabric MCA error.
 *
 * @xec:      extended error code, used as the index into the description
 *            table of the selected block.
 * @mca_type: selects the CS or the PIE description table; any other value
 *            is reported as corrupted info.
 *
 * The "Data Fabric Error: " prefix is emitted first and each exit path
 * completes that line with pr_cont().
 */
static void decode_df_errors(u8 xec, unsigned int mca_type)
{
	const char * const *descs;
	size_t count;

	pr_emerg(HW_ERR "Data Fabric Error: ");

	if (mca_type == SMCA_CS) {
		descs = f17h_cs_mce_desc;
		count = ARRAY_SIZE(f17h_cs_mce_desc);
	} else if (mca_type == SMCA_PIE) {
		descs = f17h_pie_mce_desc;
		count = ARRAY_SIZE(f17h_pie_mce_desc);
	} else {
		pr_cont("Corrupted MCA Data Fabric info.\n");
		return;
	}

	/* Reject codes that fall past the end of the selected table. */
	if (xec >= count) {
		pr_cont("Unrecognized %s MCA bank error code.\n",
			amd_df_mcablock_names[mca_type]);
		return;
	}

	pr_cont("%s.\n", descs[xec]);
}
/* Decode errors according to Scalable MCA specification */ /* Decode errors according to Scalable MCA specification */
static void decode_smca_errors(struct mce *m) static void decode_smca_errors(struct mce *m)
{ {
u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); struct smca_hwid_mcatype *type;
unsigned int hwid, mca_type, i; unsigned int bank_type;
u8 xec = XEC(m->status, xec_mask);
const char * const *error_desc_array;
const char *ip_name; const char *ip_name;
u32 low, high; u8 xec = XEC(m->status, xec_mask);
size_t len;
if (rdmsr_safe(addr, &low, &high)) { if (m->bank >= ARRAY_SIZE(smca_banks))
pr_emerg("Invalid IP block specified, error information is unreliable.\n");
return; return;
}
hwid = high & MCI_IPID_HWID;
mca_type = (high & MCI_IPID_MCATYPE) >> 16;
pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
/*
* Based on hwid and mca_type values, decode errors from respective IPs.
* Note: mca_type values make sense only in the context of an hwid.
*/
for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
if (amd_hwids[i].hwid == hwid)
break;
switch (i) {
case SMCA_F17H_CORE:
ip_name = (mca_type == SMCA_L3_CACHE) ?
"L3 Cache" : "F17h Core";
return decode_f17h_core_errors(ip_name, xec, mca_type);
break;
case SMCA_DF:
return decode_df_errors(xec, mca_type);
break;
case SMCA_UMC: if (boot_cpu_data.x86 >= 0x17 && m->bank == 4)
error_desc_array = f17h_umc_mce_desc; pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");
len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
break;
case SMCA_PB: type = smca_banks[m->bank].type;
error_desc_array = f17h_pb_mce_desc; if (!type)
len = ARRAY_SIZE(f17h_pb_mce_desc) - 1; return;
break;
case SMCA_PSP: bank_type = type->bank_type;
error_desc_array = f17h_psp_mce_desc; ip_name = smca_bank_names[bank_type].long_name;
len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
break;
case SMCA_SMU: pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
error_desc_array = f17h_smu_mce_desc;
len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
break;
default: /* Only print the decode of valid error codes */
pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid); if (xec < smca_mce_descs[bank_type].num_descs &&
return; (type->xec_bitmap & BIT_ULL(xec))) {
}
ip_name = amd_hwids[i].name;
pr_emerg(HW_ERR "%s Error: ", ip_name); pr_emerg(HW_ERR "%s Error: ", ip_name);
pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
if (xec > len) {
pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
return;
} }
pr_cont("%s.\n", error_desc_array[xec]);
} }
static inline void amd_decode_err_code(u16 ec) static inline void amd_decode_err_code(u16 ec)
...@@ -1078,6 +966,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ...@@ -1078,6 +966,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
u32 low, high; u32 low, high;
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
if (!rdmsr_safe(addr, &low, &high) && if (!rdmsr_safe(addr, &low, &high) &&
(low & MCI_CONFIG_MCAX)) (low & MCI_CONFIG_MCAX))
pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
...@@ -1091,12 +981,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ...@@ -1091,12 +981,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
pr_cont("]: 0x%016llx\n", m->status); pr_cont("]: 0x%016llx\n", m->status);
if (m->status & MCI_STATUS_ADDRV) if (m->status & MCI_STATUS_ADDRV)
pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr);
if (boot_cpu_has(X86_FEATURE_SMCA)) { if (boot_cpu_has(X86_FEATURE_SMCA)) {
if (m->status & MCI_STATUS_SYNDV)
pr_cont(", Syndrome: 0x%016llx", m->synd);
pr_cont(", IPID: 0x%016llx", m->ipid);
pr_cont("\n");
decode_smca_errors(m); decode_smca_errors(m);
goto err_code; goto err_code;
} } else
pr_cont("\n");
if (!fam_ops) if (!fam_ops)
goto err_code; goto err_code;
......
...@@ -269,9 +269,15 @@ struct static_key_false { ...@@ -269,9 +269,15 @@ struct static_key_false {
#define DEFINE_STATIC_KEY_TRUE(name) \ #define DEFINE_STATIC_KEY_TRUE(name) \
struct static_key_true name = STATIC_KEY_TRUE_INIT struct static_key_true name = STATIC_KEY_TRUE_INIT
#define DECLARE_STATIC_KEY_TRUE(name) \
extern struct static_key_true name
#define DEFINE_STATIC_KEY_FALSE(name) \ #define DEFINE_STATIC_KEY_FALSE(name) \
struct static_key_false name = STATIC_KEY_FALSE_INIT struct static_key_false name = STATIC_KEY_FALSE_INIT
#define DECLARE_STATIC_KEY_FALSE(name) \
extern struct static_key_false name
#define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \ #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \
struct static_key_true name[count] = { \ struct static_key_true name[count] = { \
[0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \ [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \
......
...@@ -20,6 +20,8 @@ TRACE_EVENT(mce_record, ...@@ -20,6 +20,8 @@ TRACE_EVENT(mce_record,
__field( u64, status ) __field( u64, status )
__field( u64, addr ) __field( u64, addr )
__field( u64, misc ) __field( u64, misc )
__field( u64, synd )
__field( u64, ipid )
__field( u64, ip ) __field( u64, ip )
__field( u64, tsc ) __field( u64, tsc )
__field( u64, walltime ) __field( u64, walltime )
...@@ -38,6 +40,8 @@ TRACE_EVENT(mce_record, ...@@ -38,6 +40,8 @@ TRACE_EVENT(mce_record,
__entry->status = m->status; __entry->status = m->status;
__entry->addr = m->addr; __entry->addr = m->addr;
__entry->misc = m->misc; __entry->misc = m->misc;
__entry->synd = m->synd;
__entry->ipid = m->ipid;
__entry->ip = m->ip; __entry->ip = m->ip;
__entry->tsc = m->tsc; __entry->tsc = m->tsc;
__entry->walltime = m->time; __entry->walltime = m->time;
...@@ -50,11 +54,12 @@ TRACE_EVENT(mce_record, ...@@ -50,11 +54,12 @@ TRACE_EVENT(mce_record,
__entry->cpuvendor = m->cpuvendor; __entry->cpuvendor = m->cpuvendor;
), ),
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, ADDR/MISC: %016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x",
__entry->cpu, __entry->cpu,
__entry->mcgcap, __entry->mcgstatus, __entry->mcgcap, __entry->mcgstatus,
__entry->bank, __entry->status, __entry->bank, __entry->status,
__entry->addr, __entry->misc, __entry->ipid,
__entry->addr, __entry->misc, __entry->synd,
__entry->cs, __entry->ip, __entry->cs, __entry->ip,
__entry->tsc, __entry->tsc,
__entry->cpuvendor, __entry->cpuid, __entry->cpuvendor, __entry->cpuid,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment