Commit 07f2d8c6 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS changes from Ingo Molnar:
 "The main changes in this cycle were:

   - Simplify the CMCI storm logic on Intel CPUs after yet another
     report about a race in the code (Borislav Petkov)

   - Enable the MCE threshold irq on AMD CPUs by default (Aravind
     Gopalakrishnan)

   - Add AMD-specific MCE-severity grading function.  Further error
     recovery actions will be based on its output (Aravind Gopalakrishnan)

   - Documentation updates (Borislav Petkov)

   - ... assorted fixes and cleanups"

* 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce/severity: Fix warning about indented braces
  x86/mce: Define mce_severity function pointer
  x86/mce: Add an AMD severities-grading function
  x86/mce: Reindent __mcheck_cpu_apply_quirks() properly
  x86/mce: Use safe MSR accesses for AMD quirk
  x86/MCE/AMD: Enable thresholding interrupts by default if supported
  x86/MCE: Make mce_panic() fatal machine check msg in the same pattern
  x86/MCE/intel: Cleanup CMCI storm logic
  Documentation/acpi/einj: Correct and streamline text
  x86/MCE/AMD: Drop bogus const modifier from AMD's bank4_names()
parents ee799f41 cee8f5a6
This diff is collapsed.
......@@ -116,6 +116,12 @@ struct mca_config {
u32 rip_msr;
};
struct mce_vendor_flags {
__u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
__reserved_0 : 63;
};
extern struct mce_vendor_flags mce_flags;
extern struct mca_config mca_cfg;
extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);
......@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
void mcheck_vendor_init_severity(void);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_vendor_init_severity(void) {}
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
......@@ -183,11 +191,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
enum mcp_flags {
MCP_TIMESTAMP = (1 << 0), /* log time stamp */
MCP_UC = (1 << 1), /* log uncorrected errors */
MCP_DONTLOG = (1 << 2), /* only clear, don't log */
MCP_TIMESTAMP = BIT(0), /* log time stamp */
MCP_UC = BIT(1), /* log uncorrected errors */
MCP_DONTLOG = BIT(2), /* only clear, don't log */
};
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
int mce_notify_irq(void);
......
......@@ -14,6 +14,7 @@ enum severity_level {
};
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
......@@ -23,20 +24,20 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
unsigned long cmci_intel_adjust_timer(unsigned long interval);
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif
......
......@@ -186,7 +186,61 @@ static int error_context(struct mce *m)
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
/*
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
*/
static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
{
enum context ctx = error_context(m);
/* Processor Context Corrupt, no need to fumble too much, die! */
if (m->status & MCI_STATUS_PCC)
return MCE_PANIC_SEVERITY;
if (m->status & MCI_STATUS_UC) {
/*
* On older systems where overflow_recov flag is not present, we
* should simply panic if an error overflow occurs. If
* overflow_recov flag is present and set, then software can try
* to at least kill process to prolong system operation.
*/
if (mce_flags.overflow_recov) {
/* software can try to contain */
if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
return MCE_PANIC_SEVERITY;
/* kill current process */
return MCE_AR_SEVERITY;
} else {
/* at least one error was not logged */
if (m->status & MCI_STATUS_OVER)
return MCE_PANIC_SEVERITY;
}
/*
* For any other case, return MCE_UC_SEVERITY so that we log the
* error and exit #MC handler.
*/
return MCE_UC_SEVERITY;
}
/*
* deferred error: poll handler catches these and adds to mce_ring so
* memory-failure can take recovery actions.
*/
if (m->status & MCI_STATUS_DEFERRED)
return MCE_DEFERRED_SEVERITY;
/*
* corrected error: poll handler catches these and passes responsibility
* of decoding the error to EDAC
*/
return MCE_KEEP_SEVERITY;
}
static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m);
......@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
}
}
/* Default to mce_severity_intel */
int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
mce_severity_intel;
void __init mcheck_vendor_init_severity(void)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
mce_severity = mce_severity_amd;
}
#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
......
......@@ -65,6 +65,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
.bootlog = -1,
......@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
/* CMCI storm detection filter */
static DEFINE_PER_CPU(unsigned long, mce_polled_error);
/*
* MCA banks polled by the period polling timer for corrected events.
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
......@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
bool error_logged = false;
struct mce m;
int severity;
int i;
......@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
this_cpu_write(mce_polled_error, 1);
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
......@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
error_logged = true;
mce_log(&m);
}
/*
* Clear state for this bank.
......@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
sync_core();
return error_logged;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
......@@ -813,7 +816,7 @@ static void mce_reign(void)
* other CPUs.
*/
if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Fatal Machine check", m, msg);
mce_panic("Fatal machine check", m, msg);
/*
* For UC somewhere we let the CPU who detects it handle it.
......@@ -826,7 +829,7 @@ static void mce_reign(void)
* source or one CPU is hung. Panic.
*/
if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Machine check from unknown source", NULL, NULL);
mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
* Now clear all the mces_seen so that they don't reappear on
......@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
static unsigned long check_interval = 5 * 60; /* 5 minutes */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
......@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
return interval;
}
static unsigned long (*mce_adjust_timer)(unsigned long interval) =
mce_adjust_timer_default;
static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
static int cmc_error_seen(void)
static void __restart_timer(struct timer_list *t, unsigned long interval)
{
unsigned long *v = this_cpu_ptr(&mce_polled_error);
unsigned long when = jiffies + interval;
unsigned long flags;
local_irq_save(flags);
if (timer_pending(t)) {
if (time_before(when, t->expires))
mod_timer_pinned(t, when);
} else {
t->expires = round_jiffies(when);
add_timer_on(t, smp_processor_id());
}
return test_and_clear_bit(0, v);
local_irq_restore(flags);
}
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
int cpu = smp_processor_id();
unsigned long iv;
int notify;
WARN_ON(smp_processor_id() != data);
WARN_ON(cpu != data);
iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
this_cpu_ptr(&mce_poll_banks));
mce_intel_cmci_poll();
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
if (mce_intel_cmci_poll()) {
iv = mce_adjust_timer(iv);
goto done;
}
}
/*
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
* Alert userspace if needed. If we logged an MCE, reduce the polling
* interval, otherwise increase the polling interval.
*/
iv = __this_cpu_read(mce_next_interval);
notify = mce_notify_irq();
notify |= cmc_error_seen();
if (notify) {
if (mce_notify_irq())
iv = max(iv / 2, (unsigned long) HZ/100);
} else {
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
iv = mce_adjust_timer(iv);
}
done:
__this_cpu_write(mce_next_interval, iv);
/* Might have become 0 after CMCI storm subsided */
if (iv) {
t->expires = jiffies + iv;
add_timer_on(t, smp_processor_id());
}
__restart_timer(t, iv);
}
/*
......@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data)
void mce_timer_kick(unsigned long interval)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);
if (timer_pending(t)) {
if (time_before(when, t->expires))
mod_timer_pinned(t, when);
} else {
t->expires = round_jiffies(when);
add_timer_on(t, smp_processor_id());
}
__restart_timer(t, interval);
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
}
......@@ -1528,6 +1533,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;
/*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
*/
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
/*
* Turn off MC4_MISC thresholding banks on those models since
* they're not supported there.
......@@ -1535,7 +1547,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 0x15 &&
(c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
int i;
u64 val, hwcr;
u64 hwcr;
bool need_toggle;
u32 msrs[] = {
0x00000413, /* MC4_MISC0 */
......@@ -1550,15 +1562,9 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (need_toggle)
wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
for (i = 0; i < ARRAY_SIZE(msrs); i++) {
rdmsrl(msrs[i], val);
/* CntP bit set? */
if (val & BIT_64(62)) {
val &= ~BIT_64(62);
wrmsrl(msrs[i], val);
}
}
/* Clear CntP bit safely */
for (i = 0; i < ARRAY_SIZE(msrs); i++)
msr_clear_bit(msrs[i], 62);
/* restore old settings */
if (need_toggle)
......@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
mce_adjust_timer = mce_intel_adjust_timer;
mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
break;
default:
break;
......@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mcheck_vendor_init_severity();
return 0;
}
......
......@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
return (bank == 4);
}
static const char * const bank4_names(struct threshold_block *b)
static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
/* MSR4_MISC0 */
......@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!b.interrupt_capable)
goto init;
b.interrupt_enable = 1;
new = (high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce(offset, new);
......@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
log:
mce_setup(&m);
rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
if (!(m.status & MCI_STATUS_VAL))
return;
m.misc = ((u64)high << 32) | low;
m.bank = bank;
mce_log(&m);
......@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
if (b->interrupt_capable)
if (b->interrupt_capable) {
threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
else
b->interrupt_enable = 1;
} else {
threshold_ktype.default_attrs[2] = NULL;
}
INIT_LIST_HEAD(&b->miscj);
......
......@@ -38,6 +38,15 @@
*/
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
* CMCI storm detection backoff counter
*
* During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
* encountered an error. If not, we decrement it by one. We signal the end of
* the CMCI storm when it reaches 0.
*/
static DEFINE_PER_CPU(int, cmci_backoff_cnt);
/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
......@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
#define CMCI_STORM_INTERVAL (1 * HZ)
#define CMCI_STORM_INTERVAL (HZ)
#define CMCI_STORM_THRESHOLD 15
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
......@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
void mce_intel_cmci_poll(void)
bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
return false;
/*
* Reset the counter if we've logged an error in the last poll
* during the storm.
*/
if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
else
this_cpu_dec(cmci_backoff_cnt);
return true;
}
void mce_intel_hcpu_update(unsigned long cpu)
......@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
unsigned long mce_intel_adjust_timer(unsigned long interval)
unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
int r;
if (interval < CMCI_POLL_INTERVAL)
return interval;
if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
(__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
mce_notify_irq();
return CMCI_STORM_INTERVAL;
}
switch (__this_cpu_read(cmci_storm_state)) {
case CMCI_STORM_ACTIVE:
/*
* We switch back to interrupt mode once the poll timer has
* silenced itself. That means no events recorded and the
* timer interval is back to our poll interval.
* silenced itself. That means no events recorded and the timer
* interval is back to our poll interval.
*/
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
r = atomic_sub_return(1, &cmci_storm_on_cpus);
if (r == 0)
if (!atomic_sub_return(1, &cmci_storm_on_cpus))
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
/* FALLTHROUGH */
case CMCI_STORM_SUBSIDED:
/*
* We wait for all cpus to go back to SUBSIDED
* state. When that happens we switch back to
* interrupt mode.
* We wait for all CPUs to go back to SUBSIDED state. When that
* happens we switch back to interrupt mode.
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
......@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
}
return CMCI_POLL_INTERVAL;
default:
/*
* We have shiny weather. Let the poll do whatever it
* thinks.
*/
/* We have shiny weather. Let the poll do whatever it thinks. */
return interval;
}
}
......@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_POLL_INTERVAL);
mce_timer_kick(CMCI_STORM_INTERVAL);
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
if (r == 1)
pr_notice("CMCI storm detected: switching to poll mode\n");
......@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
{
if (cmci_storm_detect())
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
......@@ -286,6 +306,7 @@ void cmci_recheck(void)
if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment