EDAC: Expose per-DIMM error counts in sysfs

The old csrowX sysfs directories have per-csrow error counters, but the
new dimmX directories do not currently expose error counts.

EDAC already keeps these counts; add them to sysfs so that per-DIMM counts
are still available when CONFIG_EDAC_LEGACY_SYSFS=n.

Signed-off-by: Aaron Miller <aaronmiller@fb.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20161103220153.3997328-1-aaronmiller@fb.com
Signed-off-by: Borislav Petkov <bp@suse.de>
This commit is contained in:
Aaron Miller 2016-11-03 15:01:53 -07:00 committed by Borislav Petkov
parent 2287c63643
commit 4fb6fde74d
3 changed files with 75 additions and 0 deletions

View File

@ -138,3 +138,20 @@ Contact: Mauro Carvalho Chehab <m.chehab@samsung.com>
Description: This attribute file will display what type of memory is
currently on this csrow. Normally, either buffered or
unbuffered memory (for example, Unbuffered-DDR3).
What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count
Date: October 2016
Contact: linux-edac@vger.kernel.org
Description: This attribute file displays the total count of correctable
errors that have occurred on this DIMM. This count is very important
to examine. CEs provide early indications that a DIMM is beginning
to fail. This count field should be monitored for non-zero values,
and any such values should be reported to the system administrator.
What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count
Date: October 2016
Contact: linux-edac@vger.kernel.org
Description: This attribute file displays the total count of uncorrectable
errors that have occurred on this DIMM. If panic_on_ue is set, this
counter will not have a chance to increment, since EDAC will panic the
system.

View File

@ -438,11 +438,13 @@ A typical EDAC system has the following structure under
│   │   ├── ce_count
│   │   ├── ce_noinfo_count
│   │   ├── dimm0
│   │   │   ├── dimm_ce_count
│   │   │   ├── dimm_dev_type
│   │   │   ├── dimm_edac_mode
│   │   │   ├── dimm_label
│   │   │   ├── dimm_location
│   │   │   ├── dimm_mem_type
│   │   │   ├── dimm_ue_count
│   │   │   ├── size
│   │   │   └── uevent
│   │   ├── max_location
@ -457,11 +459,13 @@ A typical EDAC system has the following structure under
│   │   ├── ce_count
│   │   ├── ce_noinfo_count
│   │   ├── dimm0
│   │   │   ├── dimm_ce_count
│   │   │   ├── dimm_dev_type
│   │   │   ├── dimm_edac_mode
│   │   │   ├── dimm_label
│   │   │   ├── dimm_location
│   │   │   ├── dimm_mem_type
│   │   │   ├── dimm_ue_count
│   │   │   ├── size
│   │   │   └── uevent
│   │   ├── max_location
@ -483,6 +487,22 @@ this ``X`` memory module:
This attribute file displays, in count of megabytes, the memory
that this csrow contains.
- ``dimm_ue_count`` - Uncorrectable Errors count attribute file
This attribute file displays the total count of uncorrectable
errors that have occurred on this DIMM. If panic_on_ue is set,
this counter will not have a chance to increment, since EDAC
will panic the system.
- ``dimm_ce_count`` - Correctable Errors count attribute file
This attribute file displays the total count of correctable
errors that have occurred on this DIMM. This count is very
important to examine. CEs provide early indications that a
DIMM is beginning to fail. This count field should be
monitored for non-zero values, and any such values should be
reported to the system administrator.
- ``dimm_dev_type`` - Device type attribute file
This attribute file will display what type of DRAM device is

View File

@ -569,6 +569,40 @@ static ssize_t dimmdev_edac_mode_show(struct device *dev,
return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
}
/*
 * sysfs "show" handler for the dimm_ce_count attribute.
 *
 * Resolves the DIMM behind @dev, turns its three location coordinates into
 * an offset with EDAC_DIMM_OFF() and reads the matching entry from the
 * controller's ce_per_layer table for the last layer (n_layers - 1).
 *
 * The value is written to @data as an unsigned decimal followed by a
 * newline; the return value is whatever sprintf() reports.
 * @mattr is not used.
 */
static ssize_t dimmdev_ce_count_show(struct device *dev,
				     struct device_attribute *mattr,
				     char *data)
{
	struct dimm_info *d = to_dimm(dev);
	int idx = EDAC_DIMM_OFF(d->mci->layers, d->mci->n_layers,
				d->location[0], d->location[1],
				d->location[2]);
	u32 ce_count = d->mci->ce_per_layer[d->mci->n_layers-1][idx];

	return sprintf(data, "%u\n", ce_count);
}
/*
 * sysfs "show" handler for the dimm_ue_count attribute.
 *
 * Resolves the DIMM behind @dev, turns its three location coordinates into
 * an offset with EDAC_DIMM_OFF() and reads the matching entry from the
 * controller's ue_per_layer table for the last layer (n_layers - 1).
 *
 * The value is written to @data as an unsigned decimal followed by a
 * newline; the return value is whatever sprintf() reports.
 * @mattr is not used.
 */
static ssize_t dimmdev_ue_count_show(struct device *dev,
				     struct device_attribute *mattr,
				     char *data)
{
	struct dimm_info *d = to_dimm(dev);
	int idx = EDAC_DIMM_OFF(d->mci->layers, d->mci->n_layers,
				d->location[0], d->location[1],
				d->location[2]);
	u32 ue_count = d->mci->ue_per_layer[d->mci->n_layers-1][idx];

	return sprintf(data, "%u\n", ue_count);
}
/* dimm/rank attribute files */
static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
dimmdev_label_show, dimmdev_label_store);
@ -577,6 +611,8 @@ static DEVICE_ATTR(size, S_IRUGO, dimmdev_size_show, NULL);
static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
/* per-DIMM error counters: S_IRUGO with a NULL store handler, so show-only */
static DEVICE_ATTR(dimm_ce_count, S_IRUGO, dimmdev_ce_count_show, NULL);
static DEVICE_ATTR(dimm_ue_count, S_IRUGO, dimmdev_ue_count_show, NULL);
/* attributes of the dimm<id>/rank<id> object */
static struct attribute *dimm_attrs[] = {
@ -586,6 +622,8 @@ static struct attribute *dimm_attrs[] = {
&dev_attr_dimm_mem_type.attr,
&dev_attr_dimm_dev_type.attr,
&dev_attr_dimm_edac_mode.attr,
&dev_attr_dimm_ce_count.attr,
&dev_attr_dimm_ue_count.attr,
NULL,
};