Commit 3dee9fb2 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "The main changes in this cycle were:

   - add the 'Corrected Errors Collector' kernel feature which collects
     and monitors correctable error statistics and will preemptively
     (soft-)offline physical pages that have a suspiciously high error
     count.

   - handle MCE errors during kexec() more gracefully

   - factor out and deprecate the /dev/mcelog driver

   - ... plus misc fixes and cleanups"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Check MCi_STATUS[MISCV] for usable addr on Intel only
  ACPI/APEI: Use setup_deferrable_timer()
  x86/mce: Update notifier priority check
  x86/mce: Enable PPIN for Knights Landing/Mill
  x86/mce: Do not register notifiers with invalid prio
  x86/mce: Factor out and deprecate the /dev/mcelog driver
  RAS: Add a Corrected Errors Collector
  x86/mce: Rename mce_log to mce_log_buffer
  x86/mce: Rename mce_log()'s argument
  x86/mce: Init some CPU features early
  x86/mce: Handle broadcasted MCE gracefully with kexec
parents 7c8c03bf c6a9583f
...@@ -3177,6 +3177,12 @@ ...@@ -3177,6 +3177,12 @@
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
See Documentation/blockdev/ramdisk.txt. See Documentation/blockdev/ramdisk.txt.
ras=option[,option,...] [KNL] RAS-specific options
cec_disable [X86]
Disable the Correctable Errors Collector,
see CONFIG_RAS_CEC help text.
rcu_nocbs= [KNL] rcu_nocbs= [KNL]
The argument is a cpu list, as described above. The argument is a cpu list, as described above.
......
...@@ -1042,6 +1042,14 @@ config X86_MCE ...@@ -1042,6 +1042,14 @@ config X86_MCE
The action the kernel takes depends on the severity of the problem, The action the kernel takes depends on the severity of the problem,
ranging from warning messages to halting the machine. ranging from warning messages to halting the machine.
config X86_MCELOG_LEGACY
bool "Support for deprecated /dev/mcelog character device"
depends on X86_MCE
---help---
Enable support for /dev/mcelog which is needed by the old mcelog
userspace logging daemon. Consider switching to the new generation
rasdaemon solution.
config X86_MCE_INTEL config X86_MCE_INTEL
def_bool y def_bool y
prompt "Intel MCE features" prompt "Intel MCE features"
...@@ -1071,7 +1079,7 @@ config X86_MCE_THRESHOLD ...@@ -1071,7 +1079,7 @@ config X86_MCE_THRESHOLD
def_bool y def_bool y
config X86_MCE_INJECT config X86_MCE_INJECT
depends on X86_MCE && X86_LOCAL_APIC depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
tristate "Machine check injector support" tristate "Machine check injector support"
---help--- ---help---
Provide support for injecting machine checks for testing purposes. Provide support for injecting machine checks for testing purposes.
......
...@@ -128,7 +128,7 @@ ...@@ -128,7 +128,7 @@
* debugging tools. Each entry is only valid when its finished flag * debugging tools. Each entry is only valid when its finished flag
* is set. * is set.
*/ */
struct mce_log { struct mce_log_buffer {
char signature[12]; /* "MACHINECHECK" */ char signature[12]; /* "MACHINECHECK" */
unsigned len; /* = MCE_LOG_LEN */ unsigned len; /* = MCE_LOG_LEN */
unsigned next; unsigned next;
...@@ -191,10 +191,12 @@ extern struct mca_config mca_cfg; ...@@ -191,10 +191,12 @@ extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops; extern struct mca_msr_regs msr_ops;
enum mce_notifier_prios { enum mce_notifier_prios {
MCE_PRIO_SRAO = INT_MAX, MCE_PRIO_FIRST = INT_MAX,
MCE_PRIO_EXTLOG = INT_MAX - 1, MCE_PRIO_SRAO = INT_MAX - 1,
MCE_PRIO_NFIT = INT_MAX - 2, MCE_PRIO_EXTLOG = INT_MAX - 2,
MCE_PRIO_EDAC = INT_MAX - 3, MCE_PRIO_NFIT = INT_MAX - 3,
MCE_PRIO_EDAC = INT_MAX - 4,
MCE_PRIO_MCELOG = 1,
MCE_PRIO_LOWEST = 0, MCE_PRIO_LOWEST = 0,
}; };
......
...@@ -15,6 +15,7 @@ struct machine_ops { ...@@ -15,6 +15,7 @@ struct machine_ops {
}; };
extern struct machine_ops machine_ops; extern struct machine_ops machine_ops;
extern int crashing_cpu;
void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_crash_shutdown(struct pt_regs *regs);
void native_machine_shutdown(void); void native_machine_shutdown(void);
......
...@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o ...@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
obj-$(CONFIG_ACPI_APEI) += mce-apei.o obj-$(CONFIG_ACPI_APEI) += mce-apei.o
obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
/*
* /dev/mcelog driver
*
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
* Copyright 2008 Intel Corporation
* Author: Andi Kleen
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include "mce-internal.h"
/* Held by mce_chrdev_read() for the whole duration of a read of /dev/mcelog. */
static DEFINE_MUTEX(mce_chrdev_read_mutex);
/* Path of the user mode helper; an empty string means no helper is run. */
static char mce_helper[128];
/* argv handed to call_usermodehelper(): just the helper path itself. */
static char *mce_helper_argv[2] = { mce_helper, NULL };
/*
 * Load the log index @p with acquire semantics (smp_load_acquire()).
 *
 * Triggers an RCU lockdep warning when the caller is neither inside an
 * RCU-sched read-side section nor holding mce_chrdev_read_mutex.
 */
#define mce_log_get_idx_check(p) \
({ \
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
!lockdep_is_held(&mce_chrdev_read_mutex), \
"suspicious mce_log_get_idx_check() usage"); \
smp_load_acquire(&(p)); \
})
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
* separate MCEs from kernel messages to avoid bogus bug reports.
*
* This is the record buffer behind /dev/mcelog: dev_mce_log() appends
* entries to it and mce_chrdev_read() copies them out and clears them.
*/
static struct mce_log_buffer mcelog = {
.signature = MCE_LOG_SIGNATURE,
.len = MCE_LOG_LEN,
.recordlen = sizeof(struct mce),
};
/* Woken by dev_mce_log() after a record is stored; waited on via poll(). */
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
/*
 * User mode helper program triggered by machine check event.
 *
 * NOTE(review): mce_helper is already defined as a static array earlier in
 * this file, so this extern redeclaration looks redundant - confirm before
 * removing it.
 */
extern char mce_helper[128];
/*
 * Decode-chain notifier callback: append one MCE record to the mcelog buffer.
 *
 * @nb:   notifier block this callback was registered with (not used here)
 * @val:  notifier value (not used here)
 * @data: pointer to the struct mce to store; it is dereferenced without a
 *        NULL check
 *
 * A free slot is claimed by advancing mcelog.next with cmpxchg(). The record
 * is then copied in, marked as finished, and sleepers on mce_chrdev_wait are
 * woken up. When no free slot is left, MCE_OVERFLOW is set in mcelog.flags
 * and the record is dropped.
 *
 * Always returns NOTIFY_OK.
 */
static int dev_mce_log(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
unsigned int next, entry;
wmb();
for (;;) {
entry = mce_log_get_idx_check(mcelog.next);
for (;;) {
/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
*/
if (entry >= MCE_LOG_LEN) {
set_bit(MCE_OVERFLOW,
(unsigned long *)&mcelog.flags);
return NOTIFY_OK;
}
/* Old left over entry. Skip: */
if (mcelog.entry[entry].finished) {
entry++;
continue;
}
break;
}
smp_rmb();
next = entry + 1;
/*
 * Claim slot 'entry'. If mcelog.next changed in the meantime, start over
 * from the current index.
 */
if (cmpxchg(&mcelog.next, entry, next) == entry)
break;
}
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
wmb();
/* Mark the slot valid only after the copy; the reader waits on ->finished. */
mcelog.entry[entry].finished = 1;
wmb();
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
return NOTIFY_OK;
}
/*
 * Notifier block that hooks dev_mce_log() into the MCE decode chain with
 * priority MCE_PRIO_MCELOG; registered from dev_mcelog_init_device().
 */
static struct notifier_block dev_mcelog_nb = {
.notifier_call = dev_mce_log,
.priority = MCE_PRIO_MCELOG,
};
/*
 * Work handler: start the user mode helper named in mce_helper without
 * waiting for it (UMH_NO_WAIT). The return value of call_usermodehelper()
 * is not checked.
 */
static void mce_do_trigger(struct work_struct *work)
{
call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}
/* Work item that runs mce_do_trigger(); queued by mce_work_trigger(). */
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
/*
 * Schedule the user mode helper, provided a helper path has been set.
 * Does nothing while mce_helper is the empty string.
 */
void mce_work_trigger(void)
{
	if (mce_helper[0] == '\0')
		return;

	schedule_work(&mce_trigger_work);
}
/*
 * sysfs show handler for the "trigger" attribute.
 *
 * Writes the current helper path followed by a newline into @buf and
 * returns the number of characters written (path length plus one).
 */
static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n", mce_helper);
}
/*
 * sysfs store handler for the "trigger" attribute.
 *
 * Copies at most sizeof(mce_helper) - 1 characters of @buf into mce_helper
 * and cuts the result at the first newline.
 *
 * Returns the length of the stored path, plus one if a newline was stripped.
 */
static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
			   const char *buf, size_t siz)
{
	char *newline;
	size_t consumed;

	/* strncpy() may leave the array unterminated, so terminate it here. */
	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper) - 1] = 0;

	newline = strchr(mce_helper, '\n');
	if (newline != NULL)
		*newline = 0;

	consumed = strlen(mce_helper);
	if (newline != NULL)
		consumed++;

	return consumed;
}
/* "trigger" device attribute (mode 0644) served by show_trigger()/set_trigger(). */
DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
/*
* mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
*/
/* Taken around every update of the two open-state counters below. */
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count; /* #times opened */
static int mce_chrdev_open_exclu; /* already open exclusive? */
/*
 * open() handler for /dev/mcelog.
 *
 * Refuses the open with -EBUSY when the device is already held exclusively,
 * or when O_EXCL is requested while other opens exist. Otherwise the open
 * is counted (and recorded as exclusive if O_EXCL was given) and the file
 * is set up through nonseekable_open().
 */
static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	const int want_excl = !!(file->f_flags & O_EXCL);
	int busy;

	spin_lock(&mce_chrdev_state_lock);

	busy = mce_chrdev_open_exclu || (mce_chrdev_open_count && want_excl);
	if (!busy) {
		if (want_excl)
			mce_chrdev_open_exclu = 1;
		mce_chrdev_open_count++;
	}

	spin_unlock(&mce_chrdev_state_lock);

	if (busy)
		return -EBUSY;

	return nonseekable_open(inode, file);
}
/*
 * release() handler for /dev/mcelog: drop one open reference and clear the
 * exclusive-open marker. Always returns 0.
 */
static int mce_chrdev_release(struct inode *inode, struct file *file)
{
spin_lock(&mce_chrdev_state_lock);
mce_chrdev_open_count--;
/* Cleared unconditionally, whether or not this open was the exclusive one. */
mce_chrdev_open_exclu = 0;
spin_unlock(&mce_chrdev_state_lock);
return 0;
}
/*
 * Store the current TSC value of the executing CPU into its slot of the
 * array passed in @data (indexed by CPU number).
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;
	int this_cpu = smp_processor_id();

	tsc_by_cpu[this_cpu] = rdtsc();
}
/* Set once APEI reports an error or has no further records to hand out. */
static int mce_apei_read_done;

/*
 * Collect MCE record of previous boot in persistent storage via APEI ERST.
 *
 * @ubuf:  in/out cursor into the user buffer; advanced by one record when a
 *         record was delivered
 * @usize: size of the user buffer in bytes
 *
 * Returns 0 when a record was delivered or none is available, -EINVAL when
 * the buffer cannot hold one record, -EFAULT when the copy to user space
 * fails, or the error reported by apei_read_mce()/apei_clear_mce().
 */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	struct mce rec;
	u64 rec_id;
	int ret;

	if (usize < sizeof(rec))
		return -EINVAL;

	ret = apei_read_mce(&rec, &rec_id);
	if (ret <= 0) {
		/* Error or no more MCE record */
		mce_apei_read_done = 1;

		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		return ret == -ENODEV ? 0 : ret;
	}

	if (copy_to_user(*ubuf, &rec, sizeof(rec)))
		return -EFAULT;

	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	ret = apei_clear_mce(rec_id);
	if (ret) {
		mce_apei_read_done = 1;
		return ret;
	}

	*ubuf += sizeof(rec);

	return 0;
}
/*
 * read() handler for /dev/mcelog: copy logged MCE records to user space and
 * clear them from the buffer.
 *
 * @filp:  file being read (not used here)
 * @ubuf:  user buffer to fill
 * @usize: size of @ubuf in bytes
 * @off:   file offset; must be 0
 *
 * While APEI records are still pending, one such record is delivered per
 * call and the in-memory log is left untouched. Otherwise the whole log is
 * drained, which requires @usize to cover MCE_LOG_LEN records.
 *
 * Returns the number of bytes copied, or a negative error:
 *   -ENOMEM  the per-CPU TSC scratch array could not be allocated
 *   -EINVAL  non-zero offset or a buffer too small for the full log
 *   -EFAULT  a copy to user space failed
 *   or the error returned by __mce_read_apei().
 */
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
size_t usize, loff_t *off)
{
char __user *buf = ubuf;
unsigned long *cpu_tsc;
unsigned prev, next;
int i, err;
cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
if (!cpu_tsc)
return -ENOMEM;
mutex_lock(&mce_chrdev_read_mutex);
if (!mce_apei_read_done) {
err = __mce_read_apei(&buf, usize);
/* Stop on error, or once an APEI record has been handed out. */
if (err || buf != ubuf)
goto out;
}
next = mce_log_get_idx_check(mcelog.next);
/* Only supports full reads right now */
err = -EINVAL;
if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
goto out;
err = 0;
prev = 0;
do {
for (i = prev; i < next; i++) {
unsigned long start = jiffies;
struct mce *m = &mcelog.entry[i];
/*
 * Wait for the writer to finish this slot. After two jiffies give up,
 * wipe the slot and skip it.
 */
while (!m->finished) {
if (time_after_eq(jiffies, start + 2)) {
memset(m, 0, sizeof(*m));
goto timeout;
}
cpu_relax();
}
smp_rmb();
err |= copy_to_user(buf, m, sizeof(*m));
buf += sizeof(*m);
timeout:
;
}
memset(mcelog.entry + prev, 0,
(next - prev) * sizeof(struct mce));
prev = next;
/*
 * Reset the index to 0 if it still equals what was drained. If it moved
 * on, cmpxchg() returns the newer index and the loop drains those
 * entries as well.
 */
next = cmpxchg(&mcelog.next, prev, 0);
} while (next != prev);
synchronize_sched();
/*
* Collect entries that were still getting written before the
* synchronize.
*/
on_each_cpu(collect_tscs, cpu_tsc, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
struct mce *m = &mcelog.entry[i];
/* Only take finished entries stamped before that CPU's TSC sample. */
if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
err |= copy_to_user(buf, m, sizeof(*m));
smp_rmb();
buf += sizeof(*m);
memset(m, 0, sizeof(*m));
}
}
/* Any non-zero copy_to_user() result collapses into -EFAULT. */
if (err)
err = -EFAULT;
out:
mutex_unlock(&mce_chrdev_read_mutex);
kfree(cpu_tsc);
return err ? err : buf - ubuf;
}
/*
 * poll() handler for /dev/mcelog.
 *
 * Reports the device readable (POLLIN | POLLRDNORM) when the log index is
 * non-zero, or when APEI has not been exhausted yet and apei_check_mce()
 * reports a pending record. Returns 0 otherwise.
 */
static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	unsigned int mask = 0;

	poll_wait(file, &mce_chrdev_wait, wait);

	if (READ_ONCE(mcelog.next) ||
	    (!mce_apei_read_done && apei_check_mce()))
		mask = POLLIN | POLLRDNORM;

	return mask;
}
static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
unsigned long arg)
{
int __user *p = (int __user *)arg;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case MCE_GET_RECORD_LEN:
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
return put_user(MCE_LOG_LEN, p);
case MCE_GETCLEAR_FLAGS: {
unsigned flags;
do {
flags = mcelog.flags;
} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
return put_user(flags, p);
}
default:
return -ENOTTY;
}
}
/* Write backend for /dev/mcelog; stays NULL until a callback is registered. */
static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off);
/*
 * Install @fn as the handler that mce_chrdev_write() forwards writes to.
 * A later call simply replaces the previously stored callback.
 */
void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
const char __user *ubuf,
size_t usize, loff_t *off))
{
mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
/*
 * write() handler for /dev/mcelog: hand the request to the registered write
 * callback, or fail with -EINVAL when none has been registered.
 */
static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
				size_t usize, loff_t *off)
{
	if (!mce_write)
		return -EINVAL;

	return mce_write(filp, ubuf, usize, off);
}
/* File operations of the "mcelog" misc device; seeking is not supported. */
static const struct file_operations mce_chrdev_ops = {
.open = mce_chrdev_open,
.release = mce_chrdev_release,
.read = mce_chrdev_read,
.write = mce_chrdev_write,
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
.llseek = no_llseek,
};
static struct miscdevice mce_chrdev_device = {
MISC_MCELOG_MINOR,
"mcelog",
&mce_chrdev_ops,
};
/*
 * Initcall: register the /dev/mcelog misc device and, on success, hook
 * dev_mce_log() into the MCE decode chain.
 *
 * Returns 0 on success or the error from misc_register(); in the error case
 * the notifier is not registered.
 */
static __init int dev_mcelog_init_device(void)
{
	/* register character device /dev/mcelog */
	int rc = misc_register(&mce_chrdev_device);

	if (!rc) {
		mce_register_decode_chain(&dev_mcelog_nb);
		return 0;
	}

	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", rc);

	return rc;
}
device_initcall_sync(dev_mcelog_init_device);
...@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) ...@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
m1->addr != m2->addr || m1->addr != m2->addr ||
m1->misc != m2->misc; m1->misc != m2->misc;
} }
extern struct device_attribute dev_attr_trigger;
#ifdef CONFIG_X86_MCELOG_LEGACY
extern void mce_work_trigger(void);
#else
static inline void mce_work_trigger(void) { }
#endif
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include <linux/poll.h> #include <linux/poll.h>
#include <linux/nmi.h> #include <linux/nmi.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/mm.h> #include <linux/mm.h>
...@@ -49,20 +50,11 @@ ...@@ -49,20 +50,11 @@
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/mce.h> #include <asm/mce.h>
#include <asm/msr.h> #include <asm/msr.h>
#include <asm/reboot.h>
#include "mce-internal.h" #include "mce-internal.h"
static DEFINE_MUTEX(mce_chrdev_read_mutex); static DEFINE_MUTEX(mce_log_mutex);
static int mce_chrdev_open_count; /* #times opened */
#define mce_log_get_idx_check(p) \
({ \
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
!lockdep_is_held(&mce_chrdev_read_mutex), \
"suspicious mce_log_get_idx_check() usage"); \
smp_load_acquire(&(p)); \
})
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include <trace/events/mce.h> #include <trace/events/mce.h>
...@@ -87,14 +79,8 @@ struct mca_config mca_cfg __read_mostly = { ...@@ -87,14 +79,8 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1 .monarch_timeout = -1
}; };
/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen); static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
static int cpu_missing; static int cpu_missing;
/* /*
...@@ -145,80 +131,36 @@ void mce_setup(struct mce *m) ...@@ -145,80 +131,36 @@ void mce_setup(struct mce *m)
DEFINE_PER_CPU(struct mce, injectm); DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm);
/* void mce_log(struct mce *m)
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
* separate MCEs from kernel messages to avoid bogus bug reports.
*/
static struct mce_log mcelog = {
.signature = MCE_LOG_SIGNATURE,
.len = MCE_LOG_LEN,
.recordlen = sizeof(struct mce),
};
void mce_log(struct mce *mce)
{ {
unsigned next, entry; if (!mce_gen_pool_add(m))
/* Emit the trace record: */
trace_mce_record(mce);
if (!mce_gen_pool_add(mce))
irq_work_queue(&mce_irq_work); irq_work_queue(&mce_irq_work);
wmb();
for (;;) {
entry = mce_log_get_idx_check(mcelog.next);
for (;;) {
/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
*/
if (entry >= MCE_LOG_LEN) {
set_bit(MCE_OVERFLOW,
(unsigned long *)&mcelog.flags);
return;
}
/* Old left over entry. Skip: */
if (mcelog.entry[entry].finished) {
entry++;
continue;
}
break;
}
smp_rmb();
next = entry + 1;
if (cmpxchg(&mcelog.next, entry, next) == entry)
break;
}
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
wmb();
mcelog.entry[entry].finished = 1;
wmb();
set_bit(0, &mce_need_notify);
} }
void mce_inject_log(struct mce *m) void mce_inject_log(struct mce *m)
{ {
mutex_lock(&mce_chrdev_read_mutex); mutex_lock(&mce_log_mutex);
mce_log(m); mce_log(m);
mutex_unlock(&mce_chrdev_read_mutex); mutex_unlock(&mce_log_mutex);
} }
EXPORT_SYMBOL_GPL(mce_inject_log); EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb; static struct notifier_block mce_srao_nb;
/*
* We run the default notifier if we have only the SRAO, the first and the
* default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
* notifiers registered on the chain.
*/
#define NUM_DEFAULT_NOTIFIERS 3
static atomic_t num_notifiers; static atomic_t num_notifiers;
void mce_register_decode_chain(struct notifier_block *nb) void mce_register_decode_chain(struct notifier_block *nb)
{ {
atomic_inc(&num_notifiers); if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
return;
WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); atomic_inc(&num_notifiers);
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
} }
...@@ -510,7 +452,6 @@ static void mce_schedule_work(void) ...@@ -510,7 +452,6 @@ static void mce_schedule_work(void)
static void mce_irq_work_cb(struct irq_work *entry) static void mce_irq_work_cb(struct irq_work *entry)
{ {
mce_notify_irq();
mce_schedule_work(); mce_schedule_work();
} }
...@@ -539,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs) ...@@ -539,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs)
*/ */
static int mce_usable_address(struct mce *m) static int mce_usable_address(struct mce *m)
{ {
if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) if (!(m->status & MCI_STATUS_ADDRV))
return 0; return 0;
/* Checks after this one are Intel-specific: */ /* Checks after this one are Intel-specific: */
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 1; return 1;
if (!(m->status & MCI_STATUS_MISCV))
return 0;
if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0; return 0;
if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0; return 0;
return 1; return 1;
} }
/*
 * Tell whether the status word of @m describes a memory error, using the
 * vendor-specific encoding of the boot CPU. Returns false for vendors other
 * than AMD and Intel.
 */
static bool memory_error(struct mce *m)
{
	u8 xec;

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_AMD:
		/* ErrCodeExt[20:16] */
		xec = (m->status >> 16) & 0x1f;

		return xec == 0x0 || xec == 0x8;

	case X86_VENDOR_INTEL:
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just gives more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;

	default:
		return false;
	}
}
/*
 * Offer the error in @m to the collector through cec_add_elem().
 *
 * Returns true only when the record qualified and cec_add_elem() returned
 * 0; returns false for a NULL @m, a non-qualifying record, or a non-zero
 * cec_add_elem() result.
 */
static bool cec_add_mce(struct mce *m)
{
	if (!m)
		return false;

	/* We eat only correctable DRAM errors with usable addresses. */
	if (!memory_error(m))
		return false;

	if (m->status & MCI_STATUS_UC)
		return false;

	if (!mce_usable_address(m))
		return false;

	return !cec_add_elem(m->addr >> PAGE_SHIFT);
}
static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
if (!m)
return NOTIFY_DONE;
if (cec_add_mce(m))
return NOTIFY_STOP;
/* Emit the trace record: */
trace_mce_record(m);
set_bit(0, &mce_need_notify);
mce_notify_irq();
return NOTIFY_DONE;
}
/* Notifier block for mce_first_notifier(), using priority MCE_PRIO_FIRST. */
static struct notifier_block first_nb = {
.notifier_call = mce_first_notifier,
.priority = MCE_PRIO_FIRST,
};
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data) void *data)
{ {
...@@ -582,15 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, ...@@ -582,15 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
if (!m) if (!m)
return NOTIFY_DONE; return NOTIFY_DONE;
/* if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
* Run the default notifier if we have only the SRAO
* notifier and us registered.
*/
if (atomic_read(&num_notifiers) > 2)
return NOTIFY_DONE;
/* Don't print when mcelog is running */
if (mce_chrdev_open_count > 0)
return NOTIFY_DONE; return NOTIFY_DONE;
__print_mce(m); __print_mce(m);
...@@ -643,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i) ...@@ -643,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i)
} }
} }
static bool memory_error(struct mce *m)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return (xec == 0x0 || xec == 0x8);
} else if (c->x86_vendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
* indicating a memory error. Bit 8 is used for indicating a
* cache hierarchy error. The combination of bit 2 and bit 3
* is used for indicating a `generic' cache hierarchy error
* But we can't just blindly check the above bits, because if
* bit 11 is set, then it is a bus/interconnect error - and
* either way the above bits just gives more detail on what
* bus/interconnect error happened. Note that bit 12 can be
* ignored, as it's the "filter" bit.
*/
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
}
return false;
}
DEFINE_PER_CPU(unsigned, mce_poll_count); DEFINE_PER_CPU(unsigned, mce_poll_count);
/* /*
...@@ -1122,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) ...@@ -1122,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* on Intel. * on Intel.
*/ */
int lmce = 1; int lmce = 1;
int cpu = smp_processor_id();
/* If this CPU is offline, just bail out. */ /*
if (cpu_is_offline(smp_processor_id())) { * Cases where we avoid rendezvous handler timeout:
* 1) If this CPU is offline.
*
* 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
* skip those CPUs which remain looping in the 1st kernel - see
* crash_nmi_callback().
*
* Note: there still is a small window between kexec-ing and the new,
* kdump kernel establishing a new #MC handler where a broadcasted MCE
* might not get handled properly.
*/
if (cpu_is_offline(cpu) ||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus; u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
...@@ -1394,13 +1386,6 @@ static void mce_timer_delete_all(void) ...@@ -1394,13 +1386,6 @@ static void mce_timer_delete_all(void)
del_timer_sync(&per_cpu(mce_timer, cpu)); del_timer_sync(&per_cpu(mce_timer, cpu));
} }
static void mce_do_trigger(struct work_struct *work)
{
call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
/* /*
* Notify the user(s) about new machine check events. * Notify the user(s) about new machine check events.
* Can be called from interrupt context, but not from machine check/NMI * Can be called from interrupt context, but not from machine check/NMI
...@@ -1412,11 +1397,7 @@ int mce_notify_irq(void) ...@@ -1412,11 +1397,7 @@ int mce_notify_irq(void)
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
if (test_and_clear_bit(0, &mce_need_notify)) { if (test_and_clear_bit(0, &mce_need_notify)) {
/* wake processes polling /dev/mcelog */ mce_work_trigger();
wake_up_interruptible(&mce_chrdev_wait);
if (mce_helper[0])
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit)) if (__ratelimit(&ratelimit))
pr_info(HW_ERR "Machine check events logged\n"); pr_info(HW_ERR "Machine check events logged\n");
...@@ -1683,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) ...@@ -1683,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
return 0; return 0;
} }
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) /*
* Init basic CPU features needed for early decoding of MCEs.
*/
static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{ {
switch (c->x86_vendor) { if (c->x86_vendor == X86_VENDOR_AMD) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD: {
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
/*
* Install proper ops for Scalable MCA enabled processors
*/
if (mce_flags.smca) { if (mce_flags.smca) {
msr_ops.ctl = smca_ctl_reg; msr_ops.ctl = smca_ctl_reg;
msr_ops.status = smca_status_reg; msr_ops.status = smca_status_reg;
msr_ops.addr = smca_addr_reg; msr_ops.addr = smca_addr_reg;
msr_ops.misc = smca_misc_reg; msr_ops.misc = smca_misc_reg;
} }
mce_amd_feature_init(c); }
}
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD: {
mce_amd_feature_init(c);
break; break;
} }
...@@ -1793,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) ...@@ -1793,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
machine_check_vector = do_machine_check; machine_check_vector = do_machine_check;
__mcheck_cpu_init_early(c);
__mcheck_cpu_init_generic(); __mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c); __mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks(); __mcheck_cpu_init_clear_banks();
...@@ -1818,251 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c) ...@@ -1818,251 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
} }
/*
* mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
*/
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_exclu; /* already open exclusive? */
static int mce_chrdev_open(struct inode *inode, struct file *file)
{
spin_lock(&mce_chrdev_state_lock);
if (mce_chrdev_open_exclu ||
(mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
spin_unlock(&mce_chrdev_state_lock);
return -EBUSY;
}
if (file->f_flags & O_EXCL)
mce_chrdev_open_exclu = 1;
mce_chrdev_open_count++;
spin_unlock(&mce_chrdev_state_lock);
return nonseekable_open(inode, file);
}
static int mce_chrdev_release(struct inode *inode, struct file *file)
{
spin_lock(&mce_chrdev_state_lock);
mce_chrdev_open_count--;
mce_chrdev_open_exclu = 0;
spin_unlock(&mce_chrdev_state_lock);
return 0;
}
static void collect_tscs(void *data)
{
unsigned long *cpu_tsc = (unsigned long *)data;
cpu_tsc[smp_processor_id()] = rdtsc();
}
static int mce_apei_read_done;
/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
int rc;
u64 record_id;
struct mce m;
if (usize < sizeof(struct mce))
return -EINVAL;
rc = apei_read_mce(&m, &record_id);
/* Error or no more MCE record */
if (rc <= 0) {
mce_apei_read_done = 1;
/*
* When ERST is disabled, mce_chrdev_read() should return
* "no record" instead of "no device."
*/
if (rc == -ENODEV)
return 0;
return rc;
}
rc = -EFAULT;
if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
return rc;
/*
* In fact, we should have cleared the record after that has
* been flushed to the disk or sent to network in
* /sbin/mcelog, but we have no interface to support that now,
* so just clear it to avoid duplication.
*/
rc = apei_clear_mce(record_id);
if (rc) {
mce_apei_read_done = 1;
return rc;
}
*ubuf += sizeof(struct mce);
return 0;
}
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
size_t usize, loff_t *off)
{
char __user *buf = ubuf;
unsigned long *cpu_tsc;
unsigned prev, next;
int i, err;
cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
if (!cpu_tsc)
return -ENOMEM;
mutex_lock(&mce_chrdev_read_mutex);
if (!mce_apei_read_done) {
err = __mce_read_apei(&buf, usize);
if (err || buf != ubuf)
goto out;
}
next = mce_log_get_idx_check(mcelog.next);
/* Only supports full reads right now */
err = -EINVAL;
if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
goto out;
err = 0;
prev = 0;
do {
for (i = prev; i < next; i++) {
unsigned long start = jiffies;
struct mce *m = &mcelog.entry[i];
while (!m->finished) {
if (time_after_eq(jiffies, start + 2)) {
memset(m, 0, sizeof(*m));
goto timeout;
}
cpu_relax();
}
smp_rmb();
err |= copy_to_user(buf, m, sizeof(*m));
buf += sizeof(*m);
timeout:
;
}
memset(mcelog.entry + prev, 0,
(next - prev) * sizeof(struct mce));
prev = next;
next = cmpxchg(&mcelog.next, prev, 0);
} while (next != prev);
synchronize_sched();
/*
* Collect entries that were still getting written before the
* synchronize.
*/
on_each_cpu(collect_tscs, cpu_tsc, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
struct mce *m = &mcelog.entry[i];
if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
err |= copy_to_user(buf, m, sizeof(*m));
smp_rmb();
buf += sizeof(*m);
memset(m, 0, sizeof(*m));
}
}
if (err)
err = -EFAULT;
out:
mutex_unlock(&mce_chrdev_read_mutex);
kfree(cpu_tsc);
return err ? err : buf - ubuf;
}
static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_chrdev_wait, wait);
if (READ_ONCE(mcelog.next))
return POLLIN | POLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
return POLLIN | POLLRDNORM;
return 0;
}
static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
unsigned long arg)
{
int __user *p = (int __user *)arg;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case MCE_GET_RECORD_LEN:
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
return put_user(MCE_LOG_LEN, p);
case MCE_GETCLEAR_FLAGS: {
unsigned flags;
do {
flags = mcelog.flags;
} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
return put_user(flags, p);
}
default:
return -ENOTTY;
}
}
static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off);
void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
const char __user *ubuf,
size_t usize, loff_t *off))
{
mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off)
{
if (mce_write)
return mce_write(filp, ubuf, usize, off);
else
return -EINVAL;
}
static const struct file_operations mce_chrdev_ops = {
.open = mce_chrdev_open,
.release = mce_chrdev_release,
.read = mce_chrdev_read,
.write = mce_chrdev_write,
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
.llseek = no_llseek,
};
static struct miscdevice mce_chrdev_device = {
MISC_MCELOG_MINOR,
"mcelog",
&mce_chrdev_ops,
};
static void __mce_disable_bank(void *arg) static void __mce_disable_bank(void *arg)
{ {
int bank = *((int *)arg); int bank = *((int *)arg);
...@@ -2136,6 +1878,7 @@ __setup("mce", mcheck_enable); ...@@ -2136,6 +1878,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void) int __init mcheck_init(void)
{ {
mcheck_intel_therm_init(); mcheck_intel_therm_init();
mce_register_decode_chain(&first_nb);
mce_register_decode_chain(&mce_srao_nb); mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb); mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity(); mcheck_vendor_init_severity();
...@@ -2280,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, ...@@ -2280,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return size; return size;
} }
/*
 * sysfs show handler for the "trigger" attribute: emit the current
 * mce_helper string followed by a newline and return the number of bytes
 * produced (helper length plus the newline).
 */
static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	size_t len = strlen(mce_helper);

	strcpy(buf, mce_helper);
	buf[len] = '\n';
	buf[len + 1] = '\0';

	return len + 1;
}
/*
 * sysfs store handler for the "trigger" attribute: copy @buf into
 * mce_helper (truncating to the buffer size), cut the string at the first
 * newline and return the resulting length, plus one if a newline was
 * stripped.
 */
static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
			   const char *buf, size_t siz)
{
	char *newline;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	/* strncpy() does not terminate on truncation. */
	mce_helper[sizeof(mce_helper) - 1] = 0;

	newline = strchr(mce_helper, '\n');
	if (!newline)
		return strlen(mce_helper);

	*newline = 0;
	return strlen(mce_helper) + 1;
}
static ssize_t set_ignore_ce(struct device *s, static ssize_t set_ignore_ce(struct device *s,
struct device_attribute *attr, struct device_attribute *attr,
const char *buf, size_t size) const char *buf, size_t size)
...@@ -2359,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s, ...@@ -2359,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s,
return ret; return ret;
} }
static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
...@@ -2382,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { ...@@ -2382,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
static struct device_attribute *mce_device_attrs[] = { static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr, &dev_attr_tolerant.attr,
&dev_attr_check_interval.attr, &dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
&dev_attr_trigger, &dev_attr_trigger,
#endif
&dev_attr_monarch_timeout.attr, &dev_attr_monarch_timeout.attr,
&dev_attr_dont_log_ce.attr, &dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr, &dev_attr_ignore_ce.attr,
...@@ -2556,7 +2277,6 @@ static __init void mce_init_banks(void) ...@@ -2556,7 +2277,6 @@ static __init void mce_init_banks(void)
static __init int mcheck_init_device(void) static __init int mcheck_init_device(void)
{ {
enum cpuhp_state hp_online;
int err; int err;
if (!mce_available(&boot_cpu_data)) { if (!mce_available(&boot_cpu_data)) {
...@@ -2584,21 +2304,11 @@ static __init int mcheck_init_device(void) ...@@ -2584,21 +2304,11 @@ static __init int mcheck_init_device(void)
mce_cpu_online, mce_cpu_pre_down); mce_cpu_online, mce_cpu_pre_down);
if (err < 0) if (err < 0)
goto err_out_online; goto err_out_online;
hp_online = err;
register_syscore_ops(&mce_syscore_ops); register_syscore_ops(&mce_syscore_ops);
/* register character device /dev/mcelog */
err = misc_register(&mce_chrdev_device);
if (err)
goto err_register;
return 0; return 0;
err_register:
unregister_syscore_ops(&mce_syscore_ops);
cpuhp_remove_state(hp_online);
err_out_online: err_out_online:
cpuhp_remove_state(CPUHP_X86_MCE_DEAD); cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
...@@ -2606,7 +2316,7 @@ static __init int mcheck_init_device(void) ...@@ -2606,7 +2316,7 @@ static __init int mcheck_init_device(void)
free_cpumask_var(mce_device_initialized); free_cpumask_var(mce_device_initialized);
err_out: err_out:
pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); pr_err("Unable to init MCE device (rc: %d)\n", err);
return err; return err;
} }
...@@ -2685,6 +2395,7 @@ static int __init mcheck_late_init(void) ...@@ -2685,6 +2395,7 @@ static int __init mcheck_late_init(void)
static_branch_inc(&mcsafe_key); static_branch_inc(&mcsafe_key);
mcheck_debugfs_init(); mcheck_debugfs_init();
cec_init();
/* /*
* Flush out everything that has been logged during early boot, now that * Flush out everything that has been logged during early boot, now that
......
...@@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) ...@@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_XEON_D: case INTEL_FAM6_BROADWELL_XEON_D:
case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
if (rdmsrl_safe(MSR_PPIN_CTL, &val)) if (rdmsrl_safe(MSR_PPIN_CTL, &val))
return; return;
......
...@@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs) ...@@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
#endif #endif
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
#if defined(CONFIG_SMP) #if defined(CONFIG_SMP)
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback; static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi; static atomic_t waiting_for_crash_ipi;
......
...@@ -7,3 +7,17 @@ config MCE_AMD_INJ ...@@ -7,3 +7,17 @@ config MCE_AMD_INJ
aspects of the MCE handling code. aspects of the MCE handling code.
WARNING: Do not even assume this interface is staying stable! WARNING: Do not even assume this interface is staying stable!
config RAS_CEC
bool "Correctable Errors Collector"
depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
---help---
This is a small cache which collects correctable memory errors per 4K
page PFN and counts their repeated occurrence. Once the counter for a
PFN overflows, we try to soft-offline that page as we take it to mean
that it has reached a relatively high error count and would probably
be best if we don't use it anymore.
Bear in mind that this is absolutely useless if your platform doesn't
have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
...@@ -1005,9 +1005,8 @@ static int ghes_probe(struct platform_device *ghes_dev) ...@@ -1005,9 +1005,8 @@ static int ghes_probe(struct platform_device *ghes_dev)
switch (generic->notify.type) { switch (generic->notify.type) {
case ACPI_HEST_NOTIFY_POLLED: case ACPI_HEST_NOTIFY_POLLED:
ghes->timer.function = ghes_poll_func; setup_deferrable_timer(&ghes->timer, ghes_poll_func,
ghes->timer.data = (unsigned long)ghes; (unsigned long)ghes);
init_timer_deferrable(&ghes->timer);
ghes_add_timer(ghes); ghes_add_timer(ghes);
break; break;
case ACPI_HEST_NOTIFY_EXTERNAL: case ACPI_HEST_NOTIFY_EXTERNAL:
......
obj-$(CONFIG_RAS) += ras.o debugfs.o obj-$(CONFIG_RAS) += ras.o debugfs.o
obj-$(CONFIG_RAS_CEC) += cec.o
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <asm/mce.h>
#include "debugfs.h"
/*
* RAS Correctable Errors Collector
*
* This is a simple gadget which collects correctable errors and counts their
* occurrence per physical page address.
*
* We've opted for possibly the simplest data structure to collect those - an
* array of the size of a memory page. It stores 512 u64's with the following
* structure:
*
* [63 ... PFN ... 12 | 11 ... generation ... 10 | 9 ... count ... 0]
*
* The generation occupies the two highest-order bits of the low 12 bits. Both
* bits are set (11b) on every insertion. During the course of each entry's
* existence, the generation field gets decremented during spring cleaning to
* 10b, then 01b and then 00b.
*
* This way we're employing the natural numeric ordering to make sure that newly
* inserted/touched elements have higher 12-bit counts (which we've manufactured)
* and thus iterating over the array initially won't kick out those elements
* which were inserted last.
*
* Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of
* elements entered into the array, during which, we're decaying all elements.
* If, after decay, an element gets inserted again, its generation is set to 11b
* to make sure it has a higher numerical count than other, older elements and
* thus emulate an LRU-like behavior when deleting elements to free up space
* in the page.
*
* When an element reaches its max count of count_threshold, we try to poison
* it by assuming that errors triggered count_threshold times in a single page
* are excessive and that page shouldn't be used anymore. count_threshold is
* initialized to COUNT_MASK which is the maximum.
*
* That error event entry causes cec_add_elem() to return !0 value and thus
* signal to its callers to log the error.
*
* To the question why we've chosen a page and moving elements around with
* memmove(), it is because it is a very simple structure to handle and max data
* movement is 4K which on highly optimized modern CPUs is almost unnoticeable.
* We wanted to avoid the pointer traversal of more complex structures like a
* linked list or some sort of a balancing search tree.
*
* Deleting an element takes O(n) but since it is only a single page, it should
* be fast enough and it shouldn't happen all too often depending on error
* patterns.
*/
#undef pr_fmt
#define pr_fmt(fmt) "RAS: " fmt
/*
* We use DECAY_BITS bits of PAGE_SHIFT bits for counting decay, i.e., how long
* elements have stayed in the array without having been accessed again.
*/
#define DECAY_BITS 2
#define DECAY_MASK ((1ULL << DECAY_BITS) - 1)
#define MAX_ELEMS (PAGE_SIZE / sizeof(u64))
/*
* Threshold amount of inserted elements after which we start spring
* cleaning.
*/
#define CLEAN_ELEMS (MAX_ELEMS >> DECAY_BITS)
/* Bits which count the number of errors happened in this 4K page. */
#define COUNT_BITS (PAGE_SHIFT - DECAY_BITS)
#define COUNT_MASK ((1ULL << COUNT_BITS) - 1)
#define FULL_COUNT_MASK (PAGE_SIZE - 1)
/*
* u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
*/
#define PFN(e) ((e) >> PAGE_SHIFT)
#define DECAY(e) (((e) >> COUNT_BITS) & DECAY_MASK)
#define COUNT(e) ((unsigned int)(e) & COUNT_MASK)
#define FULL_COUNT(e) ((e) & (PAGE_SIZE - 1))
/*
 * State of the collector. There is a single instance, ce_arr.
 *
 * @array holds the packed per-PFN entries (PFN | decay | count, see the
 * PFN()/DECAY()/COUNT() helpers) and @n says how many of its slots are in
 * use. The remaining members are bookkeeping counters plus the flags word.
 */
static struct ce_array {
u64 *array; /* container page */
unsigned int n; /* number of elements in the array */
unsigned int decay_count; /*
* number of element insertions/increments
* since the last spring cleaning.
*/
u64 pfns_poisoned; /*
* number of PFNs which got poisoned.
*/
u64 ces_entered; /*
* The number of correctable errors
* entered into the collector.
*/
u64 decays_done; /*
* Times we did spring cleaning.
*/
/* Individual flag bits, also accessible as one 32-bit word via @flags. */
union {
struct {
__u32 disabled : 1, /* cmdline disabled */
__resv : 31;
};
__u32 flags;
};
} ce_arr;
static DEFINE_MUTEX(ce_mutex);
static u64 dfs_pfn;
/* Amount of errors after which we offline */
static unsigned int count_threshold = COUNT_MASK;
/*
 * The timer "decays" element count each timer_interval which is 24hrs by
 * default.
 *
 * All intervals are expressed in seconds. The expansions are parenthesized
 * so that the macros evaluate as a single value in any expression, not just
 * in the comparisons and assignments used today.
 */
#define CEC_TIMER_DEFAULT_INTERVAL	(24 * 60 * 60)		/* 24 hrs */
#define CEC_TIMER_MIN_INTERVAL		(1 * 60 * 60)		/* 1h */
#define CEC_TIMER_MAX_INTERVAL		(30 * 24 * 60 * 60)	/* one month */

static struct timer_list cec_timer;
static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL;
/*
 * Age every element of @ca by one generation. The DECAY_BITS-wide decay
 * field of each entry is decremented unless it is already zero; insertion
 * and any later access reset it to the maximum again.
 *
 * Also restarts the insertion counter that triggers cleaning and bumps the
 * statistics counter of cleanings done.
 */
static void do_spring_cleaning(struct ce_array *ca)
{
	int idx;

	for (idx = 0; idx < ca->n; idx++) {
		u8 gen = DECAY(ca->array[idx]);

		if (gen) {
			gen--;

			/* Replace the old decay field with the aged one. */
			ca->array[idx] &= ~(DECAY_MASK << COUNT_BITS);
			ca->array[idx] |= (gen << COUNT_BITS);
		}
	}

	ca->decays_done++;
	ca->decay_count = 0;
}
/*
 * (Re)arm timer @t to fire @interval seconds from now, with the expiry
 * passed through round_jiffies().
 */
static void cec_mod_timer(struct timer_list *t, unsigned long interval)
{
	unsigned long expires = interval * HZ + jiffies;

	expires = round_jiffies(expires);
	mod_timer(t, expires);
}
/*
 * Timer callback: decay the array passed in @data (a struct ce_array
 * pointer cast to unsigned long) and re-arm the timer for the next
 * timer_interval.
 */
static void cec_timer_fn(unsigned long data)
{
	do_spring_cleaning((struct ce_array *)data);

	cec_mod_timer(&cec_timer, timer_interval);
}
/*
 * Binary-search the PFN-sorted array of @ca for @pfn.
 *
 * @to: if non-NULL, receives the index of @pfn when it is present, otherwise
 *      the index of the smallest element which is >= than @pfn, i.e. the
 *      slot where @pfn has to be inserted to keep the array sorted. That
 *      slot can be ca->n (append at the end).
 *
 * Return the index of the pfn if found, otherwise -ENOKEY.
 *
 * Only the slots [0, ca->n) are ever read: the slot at index ca->n is either
 * stale data left behind by a deletion or lies past the end of the page, so
 * it must never be used to decide whether @pfn was found.
 */
static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
{
	int min = 0, max = ca->n - 1;

	while (min <= max) {
		int mid = (min + max) >> 1;
		u64 this_pfn = PFN(ca->array[mid]);

		if (this_pfn < pfn) {
			min = mid + 1;
		} else if (this_pfn > pfn) {
			max = mid - 1;
		} else {
			if (to)
				*to = mid;

			return mid;
		}
	}

	/*
	 * Not found: the loop ended with min > max, so min indexes the first
	 * element bigger than @pfn (or ca->n), which is the insertion slot.
	 */
	if (to)
		*to = min;

	return -ENOKEY;
}
/*
 * Look up @pfn in @ca, handling the empty array without searching.
 *
 * @to must not be NULL (a warning is emitted if it is); it receives the
 * index reported by __find_elem(), or 0 when the array is empty.
 *
 * Returns the element index if found, -ENOKEY otherwise.
 */
static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
{
	WARN_ON(!to);

	if (ca->n)
		return __find_elem(ca, pfn, to);

	*to = 0;
	return -ENOKEY;
}
/*
 * Remove the element at index @idx from @ca by sliding the elements that
 * follow it down by one slot, then shrink the element count.
 */
static void del_elem(struct ce_array *ca, int idx)
{
	unsigned int tail = ca->n - (idx + 1);

	/* Nothing to move when the last element is the one being deleted. */
	if (tail)
		memmove(&ca->array[idx], &ca->array[idx + 1],
			tail * sizeof(u64));

	ca->n--;
}
/*
 * Evict the least recently used element of @ca, i.e. the one with the
 * smallest combined decay+count value (FULL_COUNT), and return its PFN.
 * Ties are resolved in favour of the lowest index.
 *
 * The caller holds ce_mutex and guarantees that the array is not empty.
 *
 * The PFN is read before the element is deleted: afterwards the slot holds
 * the following element (or stale data if the last one was removed).
 */
static u64 del_lru_elem_unlocked(struct ce_array *ca)
{
	unsigned int min = FULL_COUNT_MASK;
	int i, min_idx = 0;
	u64 pfn;

	for (i = 0; i < ca->n; i++) {
		unsigned int this = FULL_COUNT(ca->array[i]);

		if (min > this) {
			min = this;
			min_idx = i;
		}
	}

	pfn = PFN(ca->array[min_idx]);

	del_elem(ca, min_idx);

	return pfn;
}
/*
 * Locked wrapper around del_lru_elem_unlocked() operating on ce_arr.
 *
 * We return the 0th pfn in the error case (empty array) under the assumption
 * that it cannot be poisoned and excessive CEs in there are a serious deal
 * anyway.
 *
 * The emptiness check is done with ce_mutex held so that the array cannot
 * become empty between the check and the deletion.
 */
static u64 __maybe_unused del_lru_elem(void)
{
	struct ce_array *ca = &ce_arr;
	u64 pfn = 0;

	mutex_lock(&ce_mutex);
	if (ca->n)
		pfn = del_lru_elem_unlocked(ca);
	mutex_unlock(&ce_mutex);

	return pfn;
}
/*
 * cec_add_elem - account one correctable error against @pfn.
 *
 * If @pfn is not tracked yet, it is inserted with a count of 1 (evicting the
 * LRU element first when the array is full). If it is tracked and below
 * count_threshold, its count is incremented and its decay field refreshed.
 * Once the threshold has been reached, the page is queued for soft-offlining
 * (if the pfn is valid) and the element is dropped from the array.
 *
 * Returns:
 *   -ENODEV  the collector is not initialized or is disabled,
 *    0       the error was recorded,
 *    1       the offlining threshold was reached for @pfn.
 */
int cec_add_elem(u64 pfn)
{
	struct ce_array *ca = &ce_arr;
	unsigned int to, count;
	int ret;

	/*
	 * We can be called very early on the identify_cpu() path where we are
	 * not initialized yet. We ignore the error for simplicity.
	 */
	if (!ce_arr.array || ce_arr.disabled)
		return -ENODEV;

	mutex_lock(&ce_mutex);

	/* The statistics are shared state too: only touch them locked. */
	ca->ces_entered++;

	/* Array full, free the LRU slot. */
	if (ca->n == MAX_ELEMS)
		WARN_ON(!del_lru_elem_unlocked(ca));

	ret = find_elem(ca, pfn, &to);
	if (ret < 0) {
		/*
		 * Shift range [to-end] to make room for one more element.
		 */
		memmove((void *)&ca->array[to + 1],
			(void *)&ca->array[to],
			(ca->n - to) * sizeof(u64));

		/* New entry: full decay value, count of one. */
		ca->array[to] = (pfn << PAGE_SHIFT) |
				(DECAY_MASK << COUNT_BITS) | 1;

		ca->n++;

		ret = 0;
		goto decay;
	}

	count = COUNT(ca->array[to]);

	if (count < count_threshold) {
		/* Refresh the decay field and count this occurrence. */
		ca->array[to] |= (DECAY_MASK << COUNT_BITS);
		ca->array[to]++;

		ret = 0;
		goto decay;
	}

	/*
	 * Threshold reached. The element at @to was found by PFN, so its PFN
	 * is @pfn itself.
	 */
	if (!pfn_valid(pfn)) {
		pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
	} else {
		/* We have reached max count for this page, soft-offline it. */
		pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
		memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
		ca->pfns_poisoned++;
	}

	del_elem(ca, to);

	/*
	 * Return a >0 value to denote that we've reached the offlining
	 * threshold.
	 */
	ret = 1;

	goto unlock;

decay:
	ca->decay_count++;

	if (ca->decay_count >= CLEAN_ELEMS)
		do_spring_cleaning(ca);

unlock:
	mutex_unlock(&ce_mutex);

	return ret;
}
/*
 * Generic debugfs getter: @data is treated as a pointer to a u64 whose
 * value is copied into @val. Always returns 0.
 */
static int u64_get(void *data, u64 *val)
{
	u64 *src = (u64 *)data;

	*val = *src;

	return 0;
}
/*
 * debugfs setter for the "pfn" file: remember the written value in the u64
 * behind @data and feed it to the collector as if an error had been
 * reported for that pfn. Returns whatever cec_add_elem() returns.
 */
static int pfn_set(void *data, u64 val)
{
	u64 *last = (u64 *)data;

	*last = val;

	return cec_add_elem(val);
}
DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
/*
 * debugfs setter for the "decay_interval" file.
 *
 * @val is the new decay interval in seconds and must lie within
 * [CEC_TIMER_MIN_INTERVAL, CEC_TIMER_MAX_INTERVAL]; anything else is
 * rejected with -EINVAL and leaves the current interval untouched. On
 * success the timer is re-armed with the new interval and 0 is returned.
 *
 * Validation happens before anything is stored: the file is registered with
 * @data pointing at timer_interval, so storing first would let a rejected
 * value take effect at the next timer expiry.
 */
static int decay_interval_set(void *data, u64 val)
{
	if (val < CEC_TIMER_MIN_INTERVAL || val > CEC_TIMER_MAX_INTERVAL)
		return -EINVAL;

	*(u64 *)data = val;
	timer_interval = val;

	cec_mod_timer(&cec_timer, timer_interval);

	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
/*
 * debugfs accessors for the "count_threshold" file.
 *
 * count_threshold is an unsigned int, so it must not be accessed through the
 * u64 pointer the generic helpers use: that would read and write 8 bytes of
 * a 4-byte object. Both accessors therefore operate on the variable itself
 * and ignore @data.
 */
static int count_threshold_get(void *data, u64 *val)
{
	*val = count_threshold;

	return 0;
}

/*
 * Set the number of errors after which a page is offlined. Values above
 * COUNT_MASK, the largest count an element can hold, are clamped to it.
 * Always returns 0.
 */
static int count_threshold_set(void *data, u64 val)
{
	if (val > COUNT_MASK)
		val = COUNT_MASK;

	count_threshold = val;

	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, count_threshold_get, count_threshold_set, "%lld\n");
/*
 * seq_file show callback for the "array" debugfs file: print every element
 * (index, PFN, combined decay+count bits) followed by the collector
 * statistics, all under ce_mutex. Warns if the elements are found not to be
 * sorted by ascending PFN. Always returns 0.
 */
static int array_dump(struct seq_file *m, void *v)
{
	struct ce_array *ca = &ce_arr;
	u64 last_pfn = 0;
	int idx;

	mutex_lock(&ce_mutex);

	seq_printf(m, "{ n: %d\n", ca->n);

	for (idx = 0; idx < ca->n; idx++) {
		u64 elem = ca->array[idx];
		u64 cur_pfn = PFN(elem);

		seq_printf(m, " %03d: [%016llx|%03llx]\n", idx, cur_pfn, FULL_COUNT(elem));

		/* The array is expected to stay sorted by PFN. */
		WARN_ON(last_pfn > cur_pfn);
		last_pfn = cur_pfn;
	}

	seq_printf(m, "}\n");

	seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
		   ca->ces_entered, ca->pfns_poisoned);
	seq_printf(m, "Flags: 0x%x\n", ca->flags);
	seq_printf(m, "Timer interval: %lld seconds\n", timer_interval);
	seq_printf(m, "Decays: %lld\n", ca->decays_done);
	seq_printf(m, "Action threshold: %d\n", count_threshold);

	mutex_unlock(&ce_mutex);

	return 0;
}
/* open() for the "array" debugfs file: bind array_dump() as the show routine. */
static int array_open(struct inode *inode, struct file *filp)
{
	int rc = single_open(filp, array_dump, NULL);

	return rc;
}
/* File operations of the read-only "array" debugfs file (seq_file based). */
static const struct file_operations array_ops = {
	.owner		= THIS_MODULE,
	.llseek		= seq_lseek,
	.read		= seq_read,
	.open		= array_open,
	.release	= single_release,
};
/*
 * Create the "cec" debugfs directory below ras_debugfs_dir and populate it
 * with the "pfn", "array", "decay_interval" and "count_threshold" files.
 *
 * Returns 0 on success and a non-zero value on failure: -1 if the directory
 * could not be created, 1 if any of the files could not be created, in which
 * case the directory and everything in it is removed again.
 */
static int __init create_debugfs_nodes(void)
{
	struct dentry *d, *pfn, *decay, *count, *array;

	d = debugfs_create_dir("cec", ras_debugfs_dir);
	if (!d) {
		pr_warn("Error creating cec debugfs node!\n");
		return -1;
	}

	pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
	if (!pfn) {
		pr_warn("Error creating pfn debugfs node!\n");
		goto err;
	}

	array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
	if (!array) {
		pr_warn("Error creating array debugfs node!\n");
		goto err;
	}

	decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
				    &timer_interval, &decay_interval_ops);
	if (!decay) {
		pr_warn("Error creating decay_interval debugfs node!\n");
		goto err;
	}

	count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
				    &count_threshold, &count_threshold_ops);
	/* Check the dentry just created, not the decay_interval one. */
	if (!count) {
		pr_warn("Error creating count_threshold debugfs node!\n");
		goto err;
	}

	return 0;

err:
	debugfs_remove_recursive(d);

	return 1;
}
/*
 * Initialize the collector unless it was disabled: allocate the zeroed
 * container page, create the debugfs files and start the decay timer with
 * the default interval. Each failing step makes the function return early.
 */
void __init cec_init(void)
{
	unsigned long page;

	if (ce_arr.disabled)
		return;

	page = get_zeroed_page(GFP_KERNEL);
	ce_arr.array = (void *)page;
	if (!page) {
		pr_err("Error allocating CE array page!\n");
		return;
	}

	if (create_debugfs_nodes() != 0)
		return;

	setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr);
	cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL);

	pr_info("Correctable Errors collector initialized.\n");
}
/*
 * Parse the collector's part of the "ras=" command line parameter.
 *
 * @str: the parameter value, optionally still carrying the leading '='.
 *
 * Recognizes "cec_disable", which marks the collector as disabled. The whole
 * word has to match and must be followed by the end of the string or by a
 * ',' separating it from further options; a mere common prefix such as
 * "cec_dis" or "cec_display" is not accepted.
 *
 * Returns 1 if the option was recognized, 0 otherwise (including @str being
 * NULL).
 */
int __init parse_cec_param(char *str)
{
	const size_t len = strlen("cec_disable");

	if (!str)
		return 0;

	if (*str == '=')
		str++;

	if (strncmp(str, "cec_disable", len))
		return 0;

	/* Reject longer words that merely start with "cec_disable". */
	if (str[len] != '\0' && str[len] != ',')
		return 0;

	ce_arr.disabled = 1;

	return 1;
}
#include <linux/debugfs.h> #include <linux/debugfs.h>
static struct dentry *ras_debugfs_dir; struct dentry *ras_debugfs_dir;
static atomic_t trace_count = ATOMIC_INIT(0); static atomic_t trace_count = ATOMIC_INIT(0);
......
#ifndef __RAS_DEBUGFS_H__
#define __RAS_DEBUGFS_H__
#include <linux/debugfs.h>
extern struct dentry *ras_debugfs_dir;
#endif /* __RAS_DEBUGFS_H__ */
...@@ -27,3 +27,14 @@ subsys_initcall(ras_init); ...@@ -27,3 +27,14 @@ subsys_initcall(ras_init);
EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
#endif #endif
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
/*
 * Handler for the "ras" kernel command line parameter.
 *
 * @str: the text following "ras" on the command line.
 *
 * When CONFIG_RAS_CEC is set, the string is handed to parse_cec_param(); its
 * result is not evaluated. Always returns 1.
 */
int __init parse_ras_param(char *str)
{
#ifdef CONFIG_RAS_CEC
/* Let the Correctable Errors Collector pick up its own options. */
parse_cec_param(str);
#endif
return 1;
}
__setup("ras", parse_ras_param);
#ifndef __RAS_H__ #ifndef __RAS_H__
#define __RAS_H__ #define __RAS_H__
#include <asm/errno.h>
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
int ras_userspace_consumers(void); int ras_userspace_consumers(void);
void ras_debugfs_init(void); void ras_debugfs_init(void);
int ras_add_daemon_trace(void); int ras_add_daemon_trace(void);
#else #else
static inline int ras_userspace_consumers(void) { return 0; } static inline int ras_userspace_consumers(void) { return 0; }
static inline void ras_debugfs_init(void) { return; } static inline void ras_debugfs_init(void) { }
static inline int ras_add_daemon_trace(void) { return 0; } static inline int ras_add_daemon_trace(void) { return 0; }
#endif #endif
#ifdef CONFIG_RAS_CEC
void __init cec_init(void);
int __init parse_cec_param(char *str);
int cec_add_elem(u64 pfn);
#else
static inline void __init cec_init(void) { }
static inline int cec_add_elem(u64 pfn) { return -ENODEV; }
#endif #endif
#endif /* __RAS_H__ */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment