Commit 4fb6fde7 authored by Aaron Miller's avatar Aaron Miller Committed by Borislav Petkov

EDAC: Expose per-DIMM error counts in sysfs

The old csrowX sysfs directories have per-csrow error counters, but the
new dimmX directories do not currently expose error counts.

EDAC already keeps these counts, add them to sysfs so per-DIMM counts
are still available when CONFIG_EDAC_LEGACY_SYSFS=n.
Signed-off-by: default avatarAaron Miller <aaronmiller@fb.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20161103220153.3997328-1-aaronmiller@fb.comSigned-off-by: default avatarBorislav Petkov <bp@suse.de>
parent 2287c636
......@@ -138,3 +138,20 @@ Contact: Mauro Carvalho Chehab <m.chehab@samsung.com>
Description: This attribute file will display what type of memory is
currently on this csrow. Normally, either buffered or
unbuffered memory (for example, Unbuffered-DDR3).
What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count
Date: October 2016
Contact: linux-edac@vger.kernel.org
Description: This attribute file displays the total count of correctable
errors that have occurred on this DIMM. This count is very important
to examine. CEs provide early indications that a DIMM is beginning
to fail. This count field should be monitored for non-zero values
and report such information to the system administrator.
What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count
Date: October 2016
Contact: linux-edac@vger.kernel.org
Description: This attribute file displays the total count of uncorrectable
errors that have occurred on this DIMM. If panic_on_ue is set, this
counter will not have a chance to increment, since EDAC will panic the
system
......@@ -438,11 +438,13 @@ A typical EDAC system has the following structure under
│   │   ├── ce_count
│   │   ├── ce_noinfo_count
│   │   ├── dimm0
│   │   │   ├── dimm_ce_count
│   │   │   ├── dimm_dev_type
│   │   │   ├── dimm_edac_mode
│   │   │   ├── dimm_label
│   │   │   ├── dimm_location
│   │   │   ├── dimm_mem_type
│   │   │   ├── dimm_ue_count
│   │   │   ├── size
│   │   │   └── uevent
│   │   ├── max_location
......@@ -457,11 +459,13 @@ A typical EDAC system has the following structure under
│   │   ├── ce_count
│   │   ├── ce_noinfo_count
│   │   ├── dimm0
│   │   │   ├── dimm_ce_count
│   │   │   ├── dimm_dev_type
│   │   │   ├── dimm_edac_mode
│   │   │   ├── dimm_label
│   │   │   ├── dimm_location
│   │   │   ├── dimm_mem_type
│   │   │   ├── dimm_ue_count
│   │   │   ├── size
│   │   │   └── uevent
│   │   ├── max_location
......@@ -483,6 +487,22 @@ this ``X`` memory module:
This attribute file displays, in count of megabytes, the memory
that this csrow contains.
- ``dimm_ue_count`` - Uncorrectable Errors count attribute file
This attribute file displays the total count of uncorrectable
errors that have occurred on this DIMM. If panic_on_ue is set
this counter will not have a chance to increment, since EDAC
will panic the system.
- ``dimm_ce_count`` - Correctable Errors count attribute file
This attribute file displays the total count of correctable
errors that have occurred on this DIMM. This count is very
important to examine. CEs provide early indications that a
DIMM is beginning to fail. This count field should be
monitored for non-zero values and report such information
to the system administrator.
- ``dimm_dev_type`` - Device type attribute file
This attribute file will display what type of DRAM device is
......
......@@ -569,6 +569,40 @@ static ssize_t dimmdev_edac_mode_show(struct device *dev,
return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
}
static ssize_t dimmdev_ce_count_show(struct device *dev,
struct device_attribute *mattr,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
u32 count;
int off;
off = EDAC_DIMM_OFF(dimm->mci->layers,
dimm->mci->n_layers,
dimm->location[0],
dimm->location[1],
dimm->location[2]);
count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][off];
return sprintf(data, "%u\n", count);
}
static ssize_t dimmdev_ue_count_show(struct device *dev,
struct device_attribute *mattr,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
u32 count;
int off;
off = EDAC_DIMM_OFF(dimm->mci->layers,
dimm->mci->n_layers,
dimm->location[0],
dimm->location[1],
dimm->location[2]);
count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][off];
return sprintf(data, "%u\n", count);
}
/* dimm/rank attribute files */
static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
dimmdev_label_show, dimmdev_label_store);
......@@ -577,6 +611,8 @@ static DEVICE_ATTR(size, S_IRUGO, dimmdev_size_show, NULL);
static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
static DEVICE_ATTR(dimm_ce_count, S_IRUGO, dimmdev_ce_count_show, NULL);
static DEVICE_ATTR(dimm_ue_count, S_IRUGO, dimmdev_ue_count_show, NULL);
/* attributes of the dimm<id>/rank<id> object */
static struct attribute *dimm_attrs[] = {
......@@ -586,6 +622,8 @@ static struct attribute *dimm_attrs[] = {
&dev_attr_dimm_mem_type.attr,
&dev_attr_dimm_dev_type.attr,
&dev_attr_dimm_edac_mode.attr,
&dev_attr_dimm_ce_count.attr,
&dev_attr_dimm_ue_count.attr,
NULL,
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment