Commit ad6e1605 authored by Qiuxu Zhuo's avatar Qiuxu Zhuo Committed by Borislav Petkov

EDAC, skx_edac: Add address translation for non-volatile DIMMs

Currently, this driver doesn't support address translation for
non-volatile DIMMs.

The ACPI ADXL DSM method provides address translation for both volatile
and non-volatile DIMMs. Enable it to use the ACPI DSM methods if they
are supported and there are non-volatile DIMMs populated on the system.
Co-developed-by: default avatarTony Luck <tony.luck@intel.com>
Signed-off-by: default avatarQiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: default avatarBorislav Petkov <bp@suse.de>
CC: Mauro Carvalho Chehab <mchehab@kernel.org>
CC: arozansk@redhat.com
CC: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1540106336-5212-1-git-send-email-qiuxu.zhuo@intel.com
parent 36168d71
......@@ -234,6 +234,7 @@ config EDAC_SKX
depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y
select DMI
select ACPI_ADXL
help
Support for error detection and correction the Intel
Skylake server Integrated Memory Controllers. If your
......
......@@ -26,6 +26,7 @@
#include <linux/bitmap.h>
#include <linux/math64.h>
#include <linux/mod_devicetable.h>
#include <linux/adxl.h>
#include <acpi/nfit.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
......@@ -35,6 +36,7 @@
#include "edac_module.h"
#define EDAC_MOD_STR "skx_edac"
#define MSG_SIZE 1024
/*
* Debug macros
......@@ -54,6 +56,29 @@
static LIST_HEAD(skx_edac_list);
static u64 skx_tolm, skx_tohm;
static char *skx_msg;
static unsigned int nvdimm_count;
enum {
INDEX_SOCKET,
INDEX_MEMCTRL,
INDEX_CHANNEL,
INDEX_DIMM,
INDEX_MAX
};
static const char * const component_names[] = {
[INDEX_SOCKET] = "ProcessorSocketId",
[INDEX_MEMCTRL] = "MemoryControllerId",
[INDEX_CHANNEL] = "ChannelId",
[INDEX_DIMM] = "DimmSlotId",
};
static int component_indices[ARRAY_SIZE(component_names)];
static int adxl_component_count;
static const char * const *adxl_component_names;
static u64 *adxl_values;
static char *adxl_msg;
#define NUM_IMC 2 /* memory controllers per socket */
#define NUM_CHANNELS 3 /* channels per memory controller */
......@@ -393,6 +418,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
u16 flags;
u64 size = 0;
nvdimm_count++;
dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc,
imc->src_id, 0);
......@@ -941,12 +968,46 @@ static void teardown_skx_debug(void)
}
#endif /*CONFIG_EDAC_DEBUG*/
static bool skx_adxl_decode(struct decoded_addr *res)
{
int i, len = 0;
if (res->addr >= skx_tohm || (res->addr >= skx_tolm &&
res->addr < BIT_ULL(32))) {
edac_dbg(0, "Address 0x%llx out of range\n", res->addr);
return false;
}
if (adxl_decode(res->addr, adxl_values)) {
edac_dbg(0, "Failed to decode 0x%llx\n", res->addr);
return false;
}
res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]];
for (i = 0; i < adxl_component_count; i++) {
if (adxl_values[i] == ~0x0ull)
continue;
len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx",
adxl_component_names[i], adxl_values[i]);
if (MSG_SIZE - len <= 0)
break;
}
return true;
}
static void skx_mce_output_error(struct mem_ctl_info *mci,
const struct mce *m,
struct decoded_addr *res)
{
enum hw_event_mc_err_type tp_event;
char *type, *optype, msg[256];
char *type, *optype;
bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
bool overflow = GET_BITFIELD(m->status, 62, 62);
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
......@@ -1007,22 +1068,47 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
break;
}
}
if (adxl_component_count) {
snprintf(skx_msg, MSG_SIZE, "%s%s err_code:%04x:%04x %s",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode, adxl_msg);
} else {
snprintf(skx_msg, MSG_SIZE,
"%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode,
res->socket, res->imc, res->rank,
res->bank_group, res->bank_address, res->row, res->column);
}
snprintf(msg, sizeof(msg),
"%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode,
res->socket, res->imc, res->rank,
res->bank_group, res->bank_address, res->row, res->column);
edac_dbg(0, "%s\n", msg);
edac_dbg(0, "%s\n", skx_msg);
/* Call the helper to output message */
edac_mc_handle_error(tp_event, mci, core_err_cnt,
m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
res->channel, res->dimm, -1,
optype, msg);
optype, skx_msg);
}
static struct mem_ctl_info *get_mci(int src_id, int lmc)
{
struct skx_dev *d;
if (lmc > NUM_IMC - 1) {
skx_printk(KERN_ERR, "Bad lmc %d\n", lmc);
return NULL;
}
list_for_each_entry(d, &skx_edac_list, list) {
if (d->imc[0].src_id == src_id)
return d->imc[lmc].mci;
}
skx_printk(KERN_ERR, "No mci for src_id %d lmc %d\n", src_id, lmc);
return NULL;
}
static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
......@@ -1040,10 +1126,23 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
return NOTIFY_DONE;
memset(&res, 0, sizeof(res));
res.addr = mce->addr;
if (!skx_decode(&res))
if (adxl_component_count) {
if (!skx_adxl_decode(&res))
return NOTIFY_DONE;
mci = get_mci(res.socket, res.imc);
} else {
if (!skx_decode(&res))
return NOTIFY_DONE;
mci = res.dev->imc[res.imc].mci;
}
if (!mci)
return NOTIFY_DONE;
mci = res.dev->imc[res.imc].mci;
if (mce->mcgstatus & MCG_STATUS_MCIP)
type = "Exception";
......@@ -1094,6 +1193,62 @@ static void skx_remove(void)
}
}
static void __init skx_adxl_get(void)
{
const char * const *names;
int i, j;
names = adxl_get_component_names();
if (!names) {
skx_printk(KERN_NOTICE, "No firmware support for address translation.");
skx_printk(KERN_CONT, " Only decoding DDR4 address!\n");
return;
}
for (i = 0; i < INDEX_MAX; i++) {
for (j = 0; names[j]; j++) {
if (!strcmp(component_names[i], names[j])) {
component_indices[i] = j;
break;
}
}
if (!names[j])
goto err;
}
adxl_component_names = names;
while (*names++)
adxl_component_count++;
adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values),
GFP_KERNEL);
if (!adxl_values) {
adxl_component_count = 0;
return;
}
adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
if (!adxl_msg) {
adxl_component_count = 0;
kfree(adxl_values);
}
return;
err:
skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ",
component_names[i]);
for (j = 0; names[j]; j++)
skx_printk(KERN_CONT, "%s ", names[j]);
skx_printk(KERN_CONT, "\n");
}
static void __exit skx_adxl_put(void)
{
kfree(adxl_values);
kfree(adxl_msg);
}
/*
* skx_init:
* make sure we are running on the correct cpu model
......@@ -1158,6 +1313,15 @@ static int __init skx_init(void)
}
}
skx_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
if (!skx_msg) {
rc = -ENOMEM;
goto fail;
}
if (nvdimm_count)
skx_adxl_get();
/* Ensure that the OPSTATE is set correctly for POLL or NMI */
opstate_init();
......@@ -1176,6 +1340,9 @@ static void __exit skx_exit(void)
edac_dbg(2, "\n");
mce_unregister_decode_chain(&skx_mce_dec);
skx_remove();
if (nvdimm_count)
skx_adxl_put();
kfree(skx_msg);
teardown_skx_debug();
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment