Commit 090bc5a2 authored by Linus Torvalds

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "Boris is on vacation so I'm sending the RAS bits this time. The main
  changes were:

   - Various RAS/CEC improvements and fixes by Borislav Petkov:
       - error insertion fixes
       - offlining latency fix
       - memory leak fix
       - additional sanity checks
       - cleanups
       - debug output improvements

   - More SMCA enhancements by Yazen Ghannam:
        - make banks truly per-CPU, as they are in the hardware
       - don't over-cache certain registers
       - make the number of MCA banks per-CPU variable

     The long-term goal with these changes is to support future
     heterogeneous SMCA extensions.

   - Misc fixes and improvements"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Do not check return value of debugfs_create functions
  x86/MCE: Determine MCA banks' init state properly
  x86/MCE: Make the number of MCA banks a per-CPU variable
  x86/MCE/AMD: Don't cache block addresses on SMCA systems
  x86/MCE: Make mce_banks a per-CPU array
  x86/MCE: Make struct mce_banks[] static
  RAS/CEC: Add copyright
  RAS/CEC: Add CONFIG_RAS_CEC_DEBUG and move CEC debug features there
  RAS/CEC: Dump the different array element sections
  RAS/CEC: Rename count_threshold to action_threshold
  RAS/CEC: Sanity-check array on every insertion
  RAS/CEC: Fix potential memory leak
  RAS/CEC: Do not set decay value on error
  RAS/CEC: Check count_threshold unconditionally
  RAS/CEC: Fix pfn insertion
parents e1928328 6e4f929e
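
A minimal sketch of the per-CPU bank bookkeeping this series converges on (not part of the patch: the helper name for_each_bank_on_cpu() is made up for illustration, while mce_num_banks and bank_map are the per-CPU symbols used in the diff below; loops previously bounded by the global mca_cfg.banks are now bounded per CPU):

#include <linux/percpu.h>

/* One bank count and one bank-presence bitmap per CPU, as in the series. */
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
DEFINE_PER_CPU(unsigned int, bank_map);

/* Hypothetical helper: walk only the banks that exist on @cpu. */
static void for_each_bank_on_cpu(unsigned int cpu)
{
	unsigned int bank;

	for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
			continue;	/* bank not set up on this CPU */

		/* ... operate on this CPU's view of @bank ... */
	}
}

On the local CPU the same bound is read with this_cpu_read(mce_num_banks), as mce_amd_feature_init() and the interrupt handlers do in the diff below.
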
@@ -99,11 +99,6 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PCIE] = { "pcie", "PCI Express Unit" },
};
static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
{
[0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
};
static const char *smca_get_name(enum smca_bank_types t)
{
if (t >= N_SMCA_BANK_TYPES)
@@ -197,6 +192,9 @@ static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
/* Map of banks that have more than MCA_MISC0 available. */
static DEFINE_PER_CPU(u32, smca_misc_banks_map);
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
@@ -206,6 +204,28 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
{
u32 low, high;
/*
* For SMCA enabled processors, BLKPTR field of the first MISC register
* (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
*/
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return;
if (!(low & MCI_CONFIG_MCAX))
return;
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high))
return;
if (low & MASK_BLKPTR_LO)
per_cpu(smca_misc_banks_map, cpu) |= BIT(bank);
}
static void smca_configure(unsigned int bank, unsigned int cpu)
{
unsigned int i, hwid_mcatype;
@@ -243,6 +263,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
wrmsr(smca_config, low, high);
}
smca_set_misc_banks_map(bank, cpu);
/* Return early if this bank was already initialized. */
if (smca_banks[bank].hwid)
return;
@@ -453,50 +475,29 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
static u32 smca_get_block_address(unsigned int bank, unsigned int block)
static u32 smca_get_block_address(unsigned int bank, unsigned int block,
unsigned int cpu)
{
u32 low, high;
u32 addr = 0;
if (smca_get_bank_type(bank) == SMCA_RESERVED)
return addr;
if (!block)
return MSR_AMD64_SMCA_MCx_MISC(bank);
/* Check our cache first: */
if (smca_bank_addrs[bank][block] != -1)
return smca_bank_addrs[bank][block];
/*
* For SMCA enabled processors, BLKPTR field of the first MISC register
* (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
*/
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
goto out;
if (!(low & MCI_CONFIG_MCAX))
goto out;
if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank)))
return 0;
out:
smca_bank_addrs[bank][block] = addr;
return addr;
return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
}
static u32 get_block_address(u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
unsigned int bank, unsigned int block,
unsigned int cpu)
{
u32 addr = 0, offset = 0;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
return addr;
if (mce_flags.smca)
return smca_get_block_address(bank, block);
return smca_get_block_address(bank, block, cpu);
/* Fall back to method we used for older processors: */
switch (block) {
@@ -624,18 +625,19 @@ void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block, cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (mce_flags.smca)
smca_configure(bank, cpu);
disable_err_thresholding(c, bank);
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block);
address = get_block_address(address, low, high, bank, block, cpu);
if (!address)
break;
@@ -973,7 +975,7 @@ static void amd_deferred_error_interrupt(void)
{
unsigned int bank;
for (bank = 0; bank < mca_cfg.banks; ++bank)
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
log_error_deferred(bank);
}
@@ -1014,7 +1016,7 @@ static void amd_threshold_interrupt(void)
struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
unsigned int bank, cpu = smp_processor_id();
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
@@ -1201,7 +1203,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
u32 low, high;
int err;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
return 0;
if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
@@ -1252,7 +1254,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
if (err)
goto out_free;
recurse:
address = get_block_address(address, low, high, bank, ++block);
address = get_block_address(address, low, high, bank, ++block, cpu);
if (!address)
return 0;
@@ -1435,7 +1437,7 @@ int mce_threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
@@ -1456,14 +1458,14 @@ int mce_threshold_create_device(unsigned int cpu)
if (bp)
return 0;
bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),
bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *),
GFP_KERNEL);
if (!bp)
return -ENOMEM;
per_cpu(threshold_banks, cpu) = bp;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
err = threshold_create_bank(cpu, bank);
......
This diff is collapsed.
@@ -645,7 +645,6 @@ static const struct file_operations readme_fops = {
static struct dfs_node {
char *name;
struct dentry *d;
const struct file_operations *fops;
umode_t perm;
} dfs_fls[] = {
@@ -659,49 +658,23 @@ static struct dfs_node {
{ .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
};
static int __init debugfs_init(void)
static void __init debugfs_init(void)
{
unsigned int i;
dfs_inj = debugfs_create_dir("mce-inject", NULL);
if (!dfs_inj)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
dfs_fls[i].perm,
dfs_inj,
&i_mce,
dfs_fls[i].fops);
if (!dfs_fls[i].d)
goto err_dfs_add;
}
return 0;
err_dfs_add:
while (i-- > 0)
debugfs_remove(dfs_fls[i].d);
debugfs_remove(dfs_inj);
dfs_inj = NULL;
return -ENODEV;
for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj,
&i_mce, dfs_fls[i].fops);
}
static int __init inject_init(void)
{
int err;
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
err = debugfs_init();
if (err) {
free_cpumask_var(mce_inject_cpumask);
return err;
}
debugfs_init();
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
mce_register_injector_chain(&inject_nb);
......
@@ -22,17 +22,8 @@ enum severity_level {
extern struct blocking_notifier_head x86_mce_decoder_chain;
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
u64 ctl; /* subevents to enable */
unsigned char init; /* initialise bank? */
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
};
struct mce_evt_llist {
struct llist_node llnode;
struct mce mce;
@@ -47,7 +38,6 @@ struct llist_node *mce_gen_pool_prepare_records(void);
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
@@ -128,7 +118,6 @@ struct mca_config {
bios_cmci_threshold : 1,
__reserved : 59;
u8 banks;
s8 bootlog;
int tolerant;
int monarch_timeout;
@@ -137,6 +126,7 @@ };
};
extern struct mca_config mca_cfg;
DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
struct mce_vendor_flags {
/*
......
@@ -400,21 +400,13 @@ static const struct file_operations severities_coverage_fops = {
static int __init severities_debugfs_init(void)
{
struct dentry *dmce, *fsev;
struct dentry *dmce;
dmce = mce_get_debugfs_dir();
if (!dmce)
goto err_out;
fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
&severities_coverage_fops);
if (!fsev)
goto err_out;
debugfs_create_file("severities-coverage", 0444, dmce, NULL,
&severities_coverage_fops);
return 0;
err_out:
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
@@ -11,3 +11,13 @@ config RAS_CEC
Bear in mind that this is absolutely useless if your platform doesn't
have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
config RAS_CEC_DEBUG
bool "CEC debugging machinery"
default n
depends on RAS_CEC
help
Add extra files to (debugfs)/ras/cec to test the correctable error
collector feature. "pfn" is a writable file that allows the user to
simulate an error in a particular page frame. "array" is a read-only
file that dumps out the current state of all pages logged so far.
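
A rough user-space illustration of the help text above — a sketch rather than a shipped test: it assumes debugfs is mounted at the conventional /sys/kernel/debug and CONFIG_RAS_CEC_DEBUG=y; only the "pfn" and "array" file names come from the patch:

/* Hypothetical test program; paths assume the default debugfs mount point. */
#include <stdio.h>

int main(void)
{
	FILE *f;
	char line[256];

	/* Simulate a correctable error in an arbitrary page frame. */
	f = fopen("/sys/kernel/debug/ras/cec/pfn", "w");
	if (!f)
		return 1;
	fprintf(f, "0x1234\n");
	fclose(f);

	/* Dump the collector's current view of all logged pages. */
	f = fopen("/sys/kernel/debug/ras/cec/array", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return 0;
}
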
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2017-2019 Borislav Petkov, SUSE Labs.
*/
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
@@ -37,9 +40,9 @@
* thus emulate an LRU-like behavior when deleting elements to free up space
* in the page.
*
* When an element reaches its max count of count_threshold, we try to poison
* it by assuming that errors triggered count_threshold times in a single page
* are excessive and that page shouldn't be used anymore. count_threshold is
* When an element reaches its max count of action_threshold, we try to poison
* it by assuming that errors triggered action_threshold times in a single page
* are excessive and that page shouldn't be used anymore. action_threshold is
* initialized to COUNT_MASK which is the maximum.
*
* That error event entry causes cec_add_elem() to return !0 value and thus
@@ -122,7 +125,7 @@ static DEFINE_MUTEX(ce_mutex);
static u64 dfs_pfn;
/* Amount of errors after which we offline */
static unsigned int count_threshold = COUNT_MASK;
static u64 action_threshold = COUNT_MASK;
/* Each element "decays" each decay_interval which is 24hrs by default. */
#define CEC_DECAY_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */
@@ -276,11 +279,39 @@ static u64 __maybe_unused del_lru_elem(void)
return pfn;
}
static bool sanity_check(struct ce_array *ca)
{
bool ret = false;
u64 prev = 0;
int i;
for (i = 0; i < ca->n; i++) {
u64 this = PFN(ca->array[i]);
if (WARN(prev > this, "prev: 0x%016llx <-> this: 0x%016llx\n", prev, this))
ret = true;
prev = this;
}
if (!ret)
return ret;
pr_info("Sanity check dump:\n{ n: %d\n", ca->n);
for (i = 0; i < ca->n; i++) {
u64 this = PFN(ca->array[i]);
pr_info(" %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
}
pr_info("}\n");
return ret;
}
int cec_add_elem(u64 pfn)
{
struct ce_array *ca = &ce_arr;
unsigned int to;
unsigned int to = 0;
int count, ret = 0;
/*
@@ -294,6 +325,7 @@ int cec_add_elem(u64 pfn)
ca->ces_entered++;
/* Array full, free the LRU slot. */
if (ca->n == MAX_ELEMS)
WARN_ON(!del_lru_elem_unlocked(ca));
@@ -306,24 +338,17 @@ int cec_add_elem(u64 pfn)
(void *)&ca->array[to],
(ca->n - to) * sizeof(u64));
ca->array[to] = (pfn << PAGE_SHIFT) |
(DECAY_MASK << COUNT_BITS) | 1;
ca->array[to] = pfn << PAGE_SHIFT;
ca->n++;
ret = 0;
goto decay;
}
count = COUNT(ca->array[to]);
if (count < count_threshold) {
ca->array[to] |= (DECAY_MASK << COUNT_BITS);
ca->array[to]++;
/* Add/refresh element generation and increment count */
ca->array[to] |= DECAY_MASK << COUNT_BITS;
ca->array[to]++;
ret = 0;
} else {
/* Check action threshold and soft-offline, if reached. */
count = COUNT(ca->array[to]);
if (count >= action_threshold) {
u64 pfn = ca->array[to] >> PAGE_SHIFT;
if (!pfn_valid(pfn)) {
@@ -338,20 +363,21 @@ int cec_add_elem(u64 pfn)
del_elem(ca, to);
/*
* Return a >0 value to denote that we've reached the offlining
* threshold.
* Return a >0 value to callers, to denote that we've reached
* the offlining threshold.
*/
ret = 1;
goto unlock;
}
decay:
ca->decay_count++;
if (ca->decay_count >= CLEAN_ELEMS)
do_spring_cleaning(ca);
WARN_ON_ONCE(sanity_check(ca));
unlock:
mutex_unlock(&ce_mutex);
@@ -369,45 +395,48 @@ static int pfn_set(void *data, u64 val)
{
*(u64 *)data = val;
return cec_add_elem(val);
cec_add_elem(val);
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
static int decay_interval_set(void *data, u64 val)
{
*(u64 *)data = val;
if (val < CEC_DECAY_MIN_INTERVAL)
return -EINVAL;
if (val > CEC_DECAY_MAX_INTERVAL)
return -EINVAL;
*(u64 *)data = val;
decay_interval = val;
cec_mod_work(decay_interval);
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
static int count_threshold_set(void *data, u64 val)
static int action_threshold_set(void *data, u64 val)
{
*(u64 *)data = val;
if (val > COUNT_MASK)
val = COUNT_MASK;
count_threshold = val;
action_threshold = val;
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");
DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n");
static const char * const bins[] = { "00", "01", "10", "11" };
static int array_dump(struct seq_file *m, void *v)
{
struct ce_array *ca = &ce_arr;
u64 prev = 0;
int i;
mutex_lock(&ce_mutex);
@@ -416,11 +445,8 @@ static int array_dump(struct seq_file *m, void *v)
for (i = 0; i < ca->n; i++) {
u64 this = PFN(ca->array[i]);
seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
WARN_ON(prev > this);
prev = this;
seq_printf(m, " %3d: [%016llx|%s|%03llx]\n",
i, this, bins[DECAY(ca->array[i])], COUNT(ca->array[i]));
}
seq_printf(m, "}\n");
@@ -433,7 +459,7 @@ static int array_dump(struct seq_file *m, void *v)
seq_printf(m, "Decay interval: %lld seconds\n", decay_interval);
seq_printf(m, "Decays: %lld\n", ca->decays_done);
seq_printf(m, "Action threshold: %d\n", count_threshold);
seq_printf(m, "Action threshold: %lld\n", action_threshold);
mutex_unlock(&ce_mutex);
@@ -463,18 +489,6 @@ static int __init create_debugfs_nodes(void)
return -1;
}
pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
if (!pfn) {
pr_warn("Error creating pfn debugfs node!\n");
goto err;
}
array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
if (!array) {
pr_warn("Error creating array debugfs node!\n");
goto err;
}
decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
&decay_interval, &decay_interval_ops);
if (!decay) {
@@ -482,13 +496,27 @@ static int __init create_debugfs_nodes(void)
goto err;
}
count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
&count_threshold, &count_threshold_ops);
count = debugfs_create_file("action_threshold", S_IRUSR | S_IWUSR, d,
&action_threshold, &action_threshold_ops);
if (!count) {
pr_warn("Error creating count_threshold debugfs node!\n");
pr_warn("Error creating action_threshold debugfs node!\n");
goto err;
}
if (!IS_ENABLED(CONFIG_RAS_CEC_DEBUG))
return 0;
pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
if (!pfn) {
pr_warn("Error creating pfn debugfs node!\n");
goto err;
}
array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
if (!array) {
pr_warn("Error creating array debugfs node!\n");
goto err;
}
return 0;
@@ -509,8 +537,10 @@ void __init cec_init(void)
return;
}
if (create_debugfs_nodes())
if (create_debugfs_nodes()) {
free_page((unsigned long)ce_arr.array);
return;
}
INIT_DELAYED_WORK(&cec_work, cec_work_fn);
schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
......