Commit 345fb0a9 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'edac_for_4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp

Pull EDAC updates from Borislav Petkov:

 - Make amd64_edac still load on a machine with unpopulated nodes +
   cleanups (Yazen Ghannam)

 - Expose per-DIMM error counts in sysfs (Aaron Miller)

 - Add T2080 l2-cache support to mpc85xx (Chris Packham)

 - Random other small improvements/cleanups/fixlets

* tag 'edac_for_4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp:
  EDAC, mce_amd: Print IPID and Syndrome on a separate line
  EDAC, amd64: Bump driver version
  MAINTAINERS, EDAC: Update email for Thor Thayer
  EDAC, fsl_ddr: Make locally used symbols static
  EDAC, mpc85xx: Add T2080 l2-cache support
  EDAC, amd64: Add x86cpuid sanity check during init
  EDAC, amd64: Don't treat ECC disabled as failure
  EDAC: Add routine to check if MC devices list is empty
  EDAC, amd64: Remove unused printing macros
  EDAC, amd64: Rework messages in ecc_enabled()
  EDAC, amd64: Move global code out of instance functions
  EDAC, amd64: Free unused memory when init_one_instance() fails
  EDAC, mce_amd: Give more context to deferred error message
  EDAC, i7300: Test for the second channel properly
  EDAC, sb_edac: Get rid of ->show_interleave_mode()
  EDAC: Expose per-DIMM error counts in sysfs
  EDAC, amd64: Save and return err code from probe_one_instance()
  EDAC, i82975x: Add ioremap_nocache() error handling
  EDAC: Fix typos in enum mem_type comments
  EDAC: Make dev_attr_sdram_scrub_rate static
parents 507b5007 75bf2f64
......@@ -138,3 +138,20 @@ Contact: Mauro Carvalho Chehab <m.chehab@samsung.com>
Description: This attribute file will display what type of memory is
currently on this csrow. Normally, either buffered or
unbuffered memory (for example, Unbuffered-DDR3).
What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count
Date: October 2016
Contact: linux-edac@vger.kernel.org
Description: This attribute file displays the total count of correctable
errors that have occurred on this DIMM. This count is very important
to examine. CEs provide early indications that a DIMM is beginning
to fail. This count field should be monitored for non-zero values
and report such information to the system administrator.
What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count
Date: October 2016
Contact: linux-edac@vger.kernel.org
Description: This attribute file displays the total count of uncorrectable
errors that have occurred on this DIMM. If panic_on_ue is set, this
counter will not have a chance to increment, since EDAC will panic the
system
......@@ -438,11 +438,13 @@ A typical EDAC system has the following structure under
│   │   ├── ce_count
│   │   ├── ce_noinfo_count
│   │   ├── dimm0
│   │   │   ├── dimm_ce_count
│   │   │   ├── dimm_dev_type
│   │   │   ├── dimm_edac_mode
│   │   │   ├── dimm_label
│   │   │   ├── dimm_location
│   │   │   ├── dimm_mem_type
│   │   │   ├── dimm_ue_count
│   │   │   ├── size
│   │   │   └── uevent
│   │   ├── max_location
......@@ -457,11 +459,13 @@ A typical EDAC system has the following structure under
│   │   ├── ce_count
│   │   ├── ce_noinfo_count
│   │   ├── dimm0
│   │   │   ├── dimm_ce_count
│   │   │   ├── dimm_dev_type
│   │   │   ├── dimm_edac_mode
│   │   │   ├── dimm_label
│   │   │   ├── dimm_location
│   │   │   ├── dimm_mem_type
│   │   │   ├── dimm_ue_count
│   │   │   ├── size
│   │   │   └── uevent
│   │   ├── max_location
......@@ -483,6 +487,22 @@ this ``X`` memory module:
This attribute file displays, in count of megabytes, the memory
that this csrow contains.
- ``dimm_ue_count`` - Uncorrectable Errors count attribute file
This attribute file displays the total count of uncorrectable
errors that have occurred on this DIMM. If panic_on_ue is set
this counter will not have a chance to increment, since EDAC
will panic the system.
- ``dimm_ce_count`` - Correctable Errors count attribute file
This attribute file displays the total count of correctable
errors that have occurred on this DIMM. This count is very
important to examine. CEs provide early indications that a
DIMM is beginning to fail. This count field should be
monitored for non-zero values and report such information
to the system administrator.
- ``dimm_dev_type`` - Device type attribute file
This attribute file will display what type of DRAM device is
......
......@@ -643,7 +643,7 @@ S: Maintained
F: drivers/gpio/gpio-altera.c
ALTERA SYSTEM RESOURCE DRIVER FOR ARRIA10 DEVKIT
M: Thor Thayer <tthayer@opensource.altera.com>
M: Thor Thayer <thor.thayer@linux.intel.com>
S: Maintained
F: drivers/gpio/gpio-altera-a10sr.c
F: drivers/mfd/altera-a10sr.c
......@@ -1788,7 +1788,7 @@ S: Maintained
F: drivers/clk/socfpga/
ARM/SOCFPGA EDAC SUPPORT
M: Thor Thayer <tthayer@opensource.altera.com>
M: Thor Thayer <thor.thayer@linux.intel.com>
S: Maintained
F: drivers/edac/altera_edac.
......
......@@ -678,5 +678,6 @@ L2_1: l2-cache-controller@c20000 {
compatible = "fsl,t2080-l2-cache-controller";
reg = <0xc20000 0x40000>;
next-level-cache = <&cpc>;
interrupts = <16 2 1 9>;
};
};
......@@ -3065,6 +3065,8 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid)
/* Check whether at least one UMC is enabled: */
if (umc_en_mask)
ecc_en = umc_en_mask == ecc_en_mask;
else
edac_dbg(0, "Node %d: No enabled UMCs.\n", nid);
/* Assume UMC MCA banks are enabled. */
nb_mce_en = true;
......@@ -3075,14 +3077,15 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid)
nb_mce_en = nb_mce_bank_enabled_on_node(nid);
if (!nb_mce_en)
amd64_notice("NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n",
edac_dbg(0, "NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n",
MSR_IA32_MCG_CTL, nid);
}
amd64_info("DRAM ECC %s.\n", (ecc_en ? "enabled" : "disabled"));
amd64_info("Node %d: DRAM ECC %s.\n",
nid, (ecc_en ? "enabled" : "disabled"));
if (!ecc_en || !nb_mce_en) {
amd64_notice("%s", ecc_msg);
amd64_info("%s", ecc_msg);
return false;
}
return true;
......@@ -3300,15 +3303,6 @@ static int init_one_instance(unsigned int nid)
goto err_add_mc;
}
/* register stuff with EDAC MCE */
if (report_gart_errors)
amd_report_gart_errors(true);
if (pvt->umc)
amd_register_ecc_decoder(decode_umc_error);
else
amd_register_ecc_decoder(decode_bus_error);
return 0;
err_add_mc:
......@@ -3342,7 +3336,7 @@ static int probe_one_instance(unsigned int nid)
ecc_stngs[nid] = s;
if (!ecc_enabled(F3, nid)) {
ret = -ENODEV;
ret = 0;
if (!ecc_enable_override)
goto err_enable;
......@@ -3363,6 +3357,8 @@ static int probe_one_instance(unsigned int nid)
if (boot_cpu_data.x86 < 0x17)
restore_ecc_error_reporting(s, nid, F3);
goto err_enable;
}
return ret;
......@@ -3396,14 +3392,6 @@ static void remove_one_instance(unsigned int nid)
free_mc_sibling_devs(pvt);
/* unregister from EDAC MCE */
amd_report_gart_errors(false);
if (pvt->umc)
amd_unregister_ecc_decoder(decode_umc_error);
else
amd_unregister_ecc_decoder(decode_bus_error);
kfree(ecc_stngs[nid]);
ecc_stngs[nid] = NULL;
......@@ -3452,8 +3440,11 @@ static int __init amd64_edac_init(void)
int err = -ENODEV;
int i;
if (!x86_match_cpu(amd64_cpuids))
return -ENODEV;
if (amd_cache_northbridges() < 0)
goto err_ret;
return -ENODEV;
opstate_init();
......@@ -3466,14 +3457,30 @@ static int __init amd64_edac_init(void)
if (!msrs)
goto err_free;
for (i = 0; i < amd_nb_num(); i++)
if (probe_one_instance(i)) {
for (i = 0; i < amd_nb_num(); i++) {
err = probe_one_instance(i);
if (err) {
/* unwind properly */
while (--i >= 0)
remove_one_instance(i);
goto err_pci;
}
}
if (!edac_has_mcs()) {
err = -ENODEV;
goto err_pci;
}
/* register stuff with EDAC MCE */
if (report_gart_errors)
amd_report_gart_errors(true);
if (boot_cpu_data.x86 >= 0x17)
amd_register_ecc_decoder(decode_umc_error);
else
amd_register_ecc_decoder(decode_bus_error);
setup_pci_device();
......@@ -3493,7 +3500,6 @@ static int __init amd64_edac_init(void)
kfree(ecc_stngs);
ecc_stngs = NULL;
err_ret:
return err;
}
......@@ -3504,6 +3510,14 @@ static void __exit amd64_edac_exit(void)
if (pci_ctl)
edac_pci_release_generic_ctl(pci_ctl);
/* unregister from EDAC MCE */
amd_report_gart_errors(false);
if (boot_cpu_data.x86 >= 0x17)
amd_unregister_ecc_decoder(decode_umc_error);
else
amd_unregister_ecc_decoder(decode_bus_error);
for (i = 0; i < amd_nb_num(); i++)
remove_one_instance(i);
......
......@@ -16,19 +16,14 @@
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/edac.h>
#include <asm/cpu_device_id.h>
#include <asm/msr.h>
#include "edac_module.h"
#include "mce_amd.h"
#define amd64_debug(fmt, arg...) \
edac_printk(KERN_DEBUG, "amd64", fmt, ##arg)
#define amd64_info(fmt, arg...) \
edac_printk(KERN_INFO, "amd64", fmt, ##arg)
#define amd64_notice(fmt, arg...) \
edac_printk(KERN_NOTICE, "amd64", fmt, ##arg)
#define amd64_warn(fmt, arg...) \
edac_printk(KERN_WARNING, "amd64", "Warning: " fmt, ##arg)
......@@ -90,7 +85,7 @@
* sections 3.5.4 and 3.5.5 for more information.
*/
#define EDAC_AMD64_VERSION "3.4.0"
#define EDAC_AMD64_VERSION "3.5.0"
#define EDAC_MOD_STR "amd64_edac"
/* Extended Model from CPUID, for CPU Revision numbers */
......
......@@ -453,6 +453,20 @@ void edac_mc_free(struct mem_ctl_info *mci)
}
EXPORT_SYMBOL_GPL(edac_mc_free);
bool edac_has_mcs(void)
{
bool ret;
mutex_lock(&mem_ctls_mutex);
ret = list_empty(&mc_devices);
mutex_unlock(&mem_ctls_mutex);
return !ret;
}
EXPORT_SYMBOL_GPL(edac_has_mcs);
/* Caller must hold mem_ctls_mutex */
static struct mem_ctl_info *__find_mci_by_dev(struct device *dev)
{
......
......@@ -148,6 +148,15 @@ extern int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
*/
extern void edac_mc_free(struct mem_ctl_info *mci);
/**
* edac_has_mcs() - Check if any MCs have been allocated.
*
* Returns:
* True if MC instances have been registered successfully.
* False otherwise.
*/
extern bool edac_has_mcs(void);
/**
* edac_mc_find() - Search for a mem_ctl_info structure whose index is @idx.
*
......
......@@ -569,6 +569,40 @@ static ssize_t dimmdev_edac_mode_show(struct device *dev,
return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
}
static ssize_t dimmdev_ce_count_show(struct device *dev,
struct device_attribute *mattr,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
u32 count;
int off;
off = EDAC_DIMM_OFF(dimm->mci->layers,
dimm->mci->n_layers,
dimm->location[0],
dimm->location[1],
dimm->location[2]);
count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][off];
return sprintf(data, "%u\n", count);
}
static ssize_t dimmdev_ue_count_show(struct device *dev,
struct device_attribute *mattr,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
u32 count;
int off;
off = EDAC_DIMM_OFF(dimm->mci->layers,
dimm->mci->n_layers,
dimm->location[0],
dimm->location[1],
dimm->location[2]);
count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][off];
return sprintf(data, "%u\n", count);
}
/* dimm/rank attribute files */
static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
dimmdev_label_show, dimmdev_label_store);
......@@ -577,6 +611,8 @@ static DEVICE_ATTR(size, S_IRUGO, dimmdev_size_show, NULL);
static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
static DEVICE_ATTR(dimm_ce_count, S_IRUGO, dimmdev_ce_count_show, NULL);
static DEVICE_ATTR(dimm_ue_count, S_IRUGO, dimmdev_ue_count_show, NULL);
/* attributes of the dimm<id>/rank<id> object */
static struct attribute *dimm_attrs[] = {
......@@ -586,6 +622,8 @@ static struct attribute *dimm_attrs[] = {
&dev_attr_dimm_mem_type.attr,
&dev_attr_dimm_dev_type.attr,
&dev_attr_dimm_edac_mode.attr,
&dev_attr_dimm_ce_count.attr,
&dev_attr_dimm_ue_count.attr,
NULL,
};
......@@ -831,7 +869,7 @@ static DEVICE_ATTR(ce_count, S_IRUGO, mci_ce_count_show, NULL);
static DEVICE_ATTR(max_location, S_IRUGO, mci_max_location_show, NULL);
/* memory scrubber attribute file */
DEVICE_ATTR(sdram_scrub_rate, 0, mci_sdram_scrub_rate_show,
static DEVICE_ATTR(sdram_scrub_rate, 0, mci_sdram_scrub_rate_show,
mci_sdram_scrub_rate_store); /* umode set later in is_visible */
static struct attribute *mci_attrs[] = {
......
......@@ -145,11 +145,11 @@ static ssize_t fsl_mc_inject_ctrl_store(struct device *dev,
return 0;
}
DEVICE_ATTR(inject_data_hi, S_IRUGO | S_IWUSR,
static DEVICE_ATTR(inject_data_hi, S_IRUGO | S_IWUSR,
fsl_mc_inject_data_hi_show, fsl_mc_inject_data_hi_store);
DEVICE_ATTR(inject_data_lo, S_IRUGO | S_IWUSR,
static DEVICE_ATTR(inject_data_lo, S_IRUGO | S_IWUSR,
fsl_mc_inject_data_lo_show, fsl_mc_inject_data_lo_store);
DEVICE_ATTR(inject_ctrl, S_IRUGO | S_IWUSR,
static DEVICE_ATTR(inject_ctrl, S_IRUGO | S_IWUSR,
fsl_mc_inject_ctrl_show, fsl_mc_inject_ctrl_store);
static struct attribute *fsl_ddr_dev_attrs[] = {
......
......@@ -304,7 +304,6 @@ static const char *ferr_global_lo_name[] = {
#define REDMEMA 0xdc
#define REDMEMB 0x7c
#define IS_SECOND_CH(v) ((v) * (1 << 17))
#define RECMEMA 0xe0
#define RECMEMA_BANK(v) (((v) >> 12) & 7)
......@@ -483,8 +482,9 @@ static void i7300_process_fbd_error(struct mem_ctl_info *mci)
pci_read_config_dword(pvt->pci_dev_16_1_fsb_addr_map,
REDMEMB, &value);
channel = (branch << 1);
if (IS_SECOND_CH(value))
channel++;
/* Second channel ? */
channel += !!(value & BIT(17));
/* Clear the error bit */
pci_write_config_dword(pvt->pci_dev_16_1_fsb_addr_map,
......
......@@ -494,6 +494,10 @@ static int i82975x_probe1(struct pci_dev *pdev, int dev_idx)
}
mchbar &= 0xffffc000; /* bits 31:14 used for 16K window */
mch_window = ioremap_nocache(mchbar, 0x1000);
if (!mch_window) {
edac_dbg(3, "error ioremapping MCHBAR!\n");
goto fail0;
}
#ifdef i82975x_DEBUG_IOMEM
i82975x_printk(KERN_INFO, "MCHBAR real = %0x, remapped = %p\n",
......
......@@ -937,7 +937,7 @@ static const char *decode_error_status(struct mce *m)
}
if (m->status & MCI_STATUS_DEFERRED)
return "Deferred error.";
return "Deferred error, no action required.";
return "Corrected error, no action required.";
}
......@@ -991,20 +991,19 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
pr_cont("]: 0x%016llx\n", m->status);
if (m->status & MCI_STATUS_ADDRV)
pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr);
pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
if (boot_cpu_has(X86_FEATURE_SMCA)) {
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
if (m->status & MCI_STATUS_SYNDV)
pr_cont(", Syndrome: 0x%016llx", m->synd);
pr_cont(", IPID: 0x%016llx", m->ipid);
pr_cont("\n");
decode_smca_errors(m);
goto err_code;
} else
pr_cont("\n");
}
if (!fam_ops)
goto err_code;
......
......@@ -629,6 +629,7 @@ static const struct of_device_id mpc85xx_l2_err_of_match[] = {
{ .compatible = "fsl,p1020-l2-cache-controller", },
{ .compatible = "fsl,p1021-l2-cache-controller", },
{ .compatible = "fsl,p2020-l2-cache-controller", },
{ .compatible = "fsl,t2080-l2-cache-controller", },
{},
};
MODULE_DEVICE_TABLE(of, mpc85xx_l2_err_of_match);
......
......@@ -304,7 +304,6 @@ struct sbridge_info {
u64 (*rir_limit)(u32 reg);
u64 (*sad_limit)(u32 reg);
u32 (*interleave_mode)(u32 reg);
char* (*show_interleave_mode)(u32 reg);
u32 (*dram_attr)(u32 reg);
const u32 *dram_rule;
const u32 *interleave_list;
......@@ -811,11 +810,6 @@ static u32 interleave_mode(u32 reg)
return GET_BITFIELD(reg, 1, 1);
}
char *show_interleave_mode(u32 reg)
{
return interleave_mode(reg) ? "8:6" : "[8:6]XOR[18:16]";
}
static u32 dram_attr(u32 reg)
{
return GET_BITFIELD(reg, 2, 3);
......@@ -831,29 +825,16 @@ static u32 knl_interleave_mode(u32 reg)
return GET_BITFIELD(reg, 1, 2);
}
static char *knl_show_interleave_mode(u32 reg)
{
char *s;
switch (knl_interleave_mode(reg)) {
case 0:
s = "use address bits [8:6]";
break;
case 1:
s = "use address bits [10:8]";
break;
case 2:
s = "use address bits [14:12]";
break;
case 3:
s = "use address bits [32:30]";
break;
default:
WARN_ON(1);
break;
}
static const char * const knl_intlv_mode[] = {
"[8:6]", "[10:8]", "[14:12]", "[32:30]"
};
return s;
static const char *get_intlv_mode_str(u32 reg, enum type t)
{
if (t == KNIGHTS_LANDING)
return knl_intlv_mode[knl_interleave_mode(reg)];
else
return interleave_mode(reg) ? "[8:6]" : "[8:6]XOR[18:16]";
}
static u32 dram_attr_knl(u32 reg)
......@@ -1810,7 +1791,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci)
show_dram_attr(pvt->info.dram_attr(reg)),
gb, (mb*1000)/1024,
((u64)tmp_mb) << 20L,
pvt->info.show_interleave_mode(reg),
get_intlv_mode_str(reg, pvt->info.type),
reg);
prv = limit;
......@@ -3227,7 +3208,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.rir_limit = rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
pvt->info.show_interleave_mode = show_interleave_mode;
pvt->info.dram_attr = dram_attr;
pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule);
pvt->info.interleave_list = ibridge_interleave_list;
......@@ -3251,7 +3231,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.rir_limit = rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
pvt->info.show_interleave_mode = show_interleave_mode;
pvt->info.dram_attr = dram_attr;
pvt->info.max_sad = ARRAY_SIZE(sbridge_dram_rule);
pvt->info.interleave_list = sbridge_interleave_list;
......@@ -3275,7 +3254,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.rir_limit = haswell_rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
pvt->info.show_interleave_mode = show_interleave_mode;
pvt->info.dram_attr = dram_attr;
pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule);
pvt->info.interleave_list = ibridge_interleave_list;
......@@ -3299,7 +3277,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.rir_limit = haswell_rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
pvt->info.show_interleave_mode = show_interleave_mode;
pvt->info.dram_attr = dram_attr;
pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule);
pvt->info.interleave_list = ibridge_interleave_list;
......@@ -3323,7 +3300,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.rir_limit = NULL;
pvt->info.sad_limit = knl_sad_limit;
pvt->info.interleave_mode = knl_interleave_mode;
pvt->info.show_interleave_mode = knl_show_interleave_mode;
pvt->info.dram_attr = dram_attr_knl;
pvt->info.max_sad = ARRAY_SIZE(knl_dram_rule);
pvt->info.interleave_list = knl_interleave_list;
......
......@@ -190,8 +190,8 @@ static inline char *mc_event_error_type(const unsigned int err_type)
* part of the memory details to the memory controller.
* @MEM_RMBS: Rambus DRAM, used on a few Pentium III/IV controllers.
* @MEM_DDR2: DDR2 RAM, as described at JEDEC JESD79-2F.
* Those memories are labed as "PC2-" instead of "PC" to
* differenciate from DDR.
* Those memories are labeled as "PC2-" instead of "PC" to
* differentiate from DDR.
* @MEM_FB_DDR2: Fully-Buffered DDR2, as described at JEDEC Std No. 205
* and JESD206.
* Those memories are accessed per DIMM slot, and not by
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment