Commit ad6e1605, authored by Qiuxu Zhuo, committed by Borislav Petkov

EDAC, skx_edac: Add address translation for non-volatile DIMMs

Currently, this driver doesn't support address translation for
non-volatile DIMMs.

The ACPI ADXL DSM method provides address translation for both volatile
and non-volatile DIMMs. Enable it to use the ACPI DSM methods if they
are supported and there are non-volatile DIMMs populated on the system.
Co-developed-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Mauro Carvalho Chehab <mchehab@kernel.org>
CC: arozansk@redhat.com
CC: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1540106336-5212-1-git-send-email-qiuxu.zhuo@intel.com
parent 36168d71
...@@ -234,6 +234,7 @@ config EDAC_SKX ...@@ -234,6 +234,7 @@ config EDAC_SKX
depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y
select DMI select DMI
select ACPI_ADXL
help help
Support for error detection and correction the Intel Support for error detection and correction the Intel
Skylake server Integrated Memory Controllers. If your Skylake server Integrated Memory Controllers. If your
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <linux/bitmap.h> #include <linux/bitmap.h>
#include <linux/math64.h> #include <linux/math64.h>
#include <linux/mod_devicetable.h> #include <linux/mod_devicetable.h>
#include <linux/adxl.h>
#include <acpi/nfit.h> #include <acpi/nfit.h>
#include <asm/cpu_device_id.h> #include <asm/cpu_device_id.h>
#include <asm/intel-family.h> #include <asm/intel-family.h>
...@@ -35,6 +36,7 @@ ...@@ -35,6 +36,7 @@
#include "edac_module.h" #include "edac_module.h"
#define EDAC_MOD_STR "skx_edac" #define EDAC_MOD_STR "skx_edac"
#define MSG_SIZE 1024
/* /*
* Debug macros * Debug macros
...@@ -54,6 +56,29 @@ ...@@ -54,6 +56,29 @@
static LIST_HEAD(skx_edac_list); static LIST_HEAD(skx_edac_list);
static u64 skx_tolm, skx_tohm; static u64 skx_tolm, skx_tohm;
/* Buffer of MSG_SIZE bytes, allocated in skx_init(), holding the error text handed to the EDAC core. */
static char *skx_msg;
/* Incremented on every call to get_nvdimm_info(); a non-zero value makes skx_init() call skx_adxl_get(). */
static unsigned int nvdimm_count;
/* Positions in component_names[] / component_indices[] of the components this driver needs. */
enum {
INDEX_SOCKET,
INDEX_MEMCTRL,
INDEX_CHANNEL,
INDEX_DIMM,
INDEX_MAX
};
/* Component names that skx_adxl_get() looks up in the list returned by adxl_get_component_names(). */
static const char * const component_names[] = {
[INDEX_SOCKET] = "ProcessorSocketId",
[INDEX_MEMCTRL] = "MemoryControllerId",
[INDEX_CHANNEL] = "ChannelId",
[INDEX_DIMM] = "DimmSlotId",
};
/* For each INDEX_* entry, the position of that component in the ADXL name list (and so in adxl_values[]). */
static int component_indices[ARRAY_SIZE(component_names)];
/* Number of ADXL components; stays 0 when ADXL decoding is not in use, which selects the skx_decode() path. */
static int adxl_component_count;
/* NULL-terminated name list obtained from adxl_get_component_names(). */
static const char * const *adxl_component_names;
/* Array of adxl_component_count values filled by adxl_decode(). */
static u64 *adxl_values;
/* Buffer of MSG_SIZE bytes holding the " name:0xvalue" list built by skx_adxl_decode(). */
static char *adxl_msg;
#define NUM_IMC 2 /* memory controllers per socket */ #define NUM_IMC 2 /* memory controllers per socket */
#define NUM_CHANNELS 3 /* channels per memory controller */ #define NUM_CHANNELS 3 /* channels per memory controller */
...@@ -393,6 +418,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, ...@@ -393,6 +418,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
u16 flags; u16 flags;
u64 size = 0; u64 size = 0;
nvdimm_count++;
dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc, dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc,
imc->src_id, 0); imc->src_id, 0);
...@@ -941,12 +968,46 @@ static void teardown_skx_debug(void) ...@@ -941,12 +968,46 @@ static void teardown_skx_debug(void)
} }
#endif /*CONFIG_EDAC_DEBUG*/ #endif /*CONFIG_EDAC_DEBUG*/
static bool skx_adxl_decode(struct decoded_addr *res)
{
int i, len = 0;
if (res->addr >= skx_tohm || (res->addr >= skx_tolm &&
res->addr < BIT_ULL(32))) {
edac_dbg(0, "Address 0x%llx out of range\n", res->addr);
return false;
}
if (adxl_decode(res->addr, adxl_values)) {
edac_dbg(0, "Failed to decode 0x%llx\n", res->addr);
return false;
}
res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]];
for (i = 0; i < adxl_component_count; i++) {
if (adxl_values[i] == ~0x0ull)
continue;
len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx",
adxl_component_names[i], adxl_values[i]);
if (MSG_SIZE - len <= 0)
break;
}
return true;
}
static void skx_mce_output_error(struct mem_ctl_info *mci, static void skx_mce_output_error(struct mem_ctl_info *mci,
const struct mce *m, const struct mce *m,
struct decoded_addr *res) struct decoded_addr *res)
{ {
enum hw_event_mc_err_type tp_event; enum hw_event_mc_err_type tp_event;
char *type, *optype, msg[256]; char *type, *optype;
bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
bool overflow = GET_BITFIELD(m->status, 62, 62); bool overflow = GET_BITFIELD(m->status, 62, 62);
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
...@@ -1007,22 +1068,47 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, ...@@ -1007,22 +1068,47 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
break; break;
} }
} }
if (adxl_component_count) {
snprintf(msg, sizeof(msg), snprintf(skx_msg, MSG_SIZE, "%s%s err_code:%04x:%04x %s",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode, adxl_msg);
} else {
snprintf(skx_msg, MSG_SIZE,
"%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x", "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
overflow ? " OVERFLOW" : "", overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "", (uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode, mscod, errcode,
res->socket, res->imc, res->rank, res->socket, res->imc, res->rank,
res->bank_group, res->bank_address, res->row, res->column); res->bank_group, res->bank_address, res->row, res->column);
}
edac_dbg(0, "%s\n", msg); edac_dbg(0, "%s\n", skx_msg);
/* Call the helper to output message */ /* Call the helper to output message */
edac_mc_handle_error(tp_event, mci, core_err_cnt, edac_mc_handle_error(tp_event, mci, core_err_cnt,
m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
res->channel, res->dimm, -1, res->channel, res->dimm, -1,
optype, msg); optype, skx_msg);
}
/*
 * get_mci: look up the mem_ctl_info for a source id / memory controller pair.
 *
 * @src_id: value compared against imc[0].src_id of each device on
 *          skx_edac_list.
 * @lmc:    index into that device's imc[] array; must be in
 *          [0, NUM_IMC - 1].
 *
 * Returns the matching mci pointer, or NULL (after logging an error) when
 * @lmc is out of range or no device matches @src_id.
 */
static struct mem_ctl_info *get_mci(int src_id, int lmc)
{
	struct skx_dev *d;

	/*
	 * Reject negative values as well: callers derive lmc from decoded
	 * firmware data, and a negative index would read before imc[].
	 */
	if (lmc < 0 || lmc > NUM_IMC - 1) {
		skx_printk(KERN_ERR, "Bad lmc %d\n", lmc);
		return NULL;
	}

	list_for_each_entry(d, &skx_edac_list, list) {
		if (d->imc[0].src_id == src_id)
			return d->imc[lmc].mci;
	}

	skx_printk(KERN_ERR, "No mci for src_id %d lmc %d\n", src_id, lmc);

	return NULL;
}
static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
...@@ -1040,10 +1126,23 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, ...@@ -1040,10 +1126,23 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
return NOTIFY_DONE; return NOTIFY_DONE;
memset(&res, 0, sizeof(res));
res.addr = mce->addr; res.addr = mce->addr;
if (adxl_component_count) {
if (!skx_adxl_decode(&res))
return NOTIFY_DONE;
mci = get_mci(res.socket, res.imc);
} else {
if (!skx_decode(&res)) if (!skx_decode(&res))
return NOTIFY_DONE; return NOTIFY_DONE;
mci = res.dev->imc[res.imc].mci; mci = res.dev->imc[res.imc].mci;
}
if (!mci)
return NOTIFY_DONE;
if (mce->mcgstatus & MCG_STATUS_MCIP) if (mce->mcgstatus & MCG_STATUS_MCIP)
type = "Exception"; type = "Exception";
...@@ -1094,6 +1193,62 @@ static void skx_remove(void) ...@@ -1094,6 +1193,62 @@ static void skx_remove(void)
} }
} }
/*
 * skx_adxl_get: set up ADXL-based address translation.
 *
 * Fetches the component name list from adxl_get_component_names(), records
 * in component_indices[] where each entry of component_names[] sits in that
 * list, counts the components and allocates adxl_values and adxl_msg.
 *
 * On any failure adxl_component_count is left at (or reset to) 0, which
 * makes the MCE handler use skx_decode() instead, and no allocated buffer
 * is left behind with a dangling pointer.
 */
static void __init skx_adxl_get(void)
{
	const char * const *names;
	int i, j;

	names = adxl_get_component_names();
	if (!names) {
		skx_printk(KERN_NOTICE, "No firmware support for address translation.");
		skx_printk(KERN_CONT, " Only decoding DDR4 address!\n");
		return;
	}

	/* Every required component must be present in the firmware list. */
	for (i = 0; i < INDEX_MAX; i++) {
		for (j = 0; names[j]; j++) {
			if (!strcmp(component_names[i], names[j])) {
				component_indices[i] = j;
				break;
			}
		}

		if (!names[j])
			goto err;
	}

	adxl_component_names = names;
	while (*names++)
		adxl_component_count++;

	adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values),
			      GFP_KERNEL);
	if (!adxl_values) {
		adxl_component_count = 0;
		return;
	}

	adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
	if (!adxl_msg) {
		adxl_component_count = 0;
		kfree(adxl_values);
		/*
		 * skx_adxl_put() frees adxl_values again at module exit;
		 * clear the pointer so that becomes a no-op, not a double free.
		 */
		adxl_values = NULL;
	}

	return;
err:
	/* names still points at the list head here, so dump it in full. */
	skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ",
		   component_names[i]);
	for (j = 0; names[j]; j++)
		skx_printk(KERN_CONT, "%s ", names[j]);
	skx_printk(KERN_CONT, "\n");
}
/*
 * skx_adxl_put: release the two buffers allocated by skx_adxl_get().
 */
static void __exit skx_adxl_put(void)
{
	kfree(adxl_msg);
	kfree(adxl_values);
}
/* /*
* skx_init: * skx_init:
* make sure we are running on the correct cpu model * make sure we are running on the correct cpu model
...@@ -1158,6 +1313,15 @@ static int __init skx_init(void) ...@@ -1158,6 +1313,15 @@ static int __init skx_init(void)
} }
} }
skx_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
if (!skx_msg) {
rc = -ENOMEM;
goto fail;
}
if (nvdimm_count)
skx_adxl_get();
/* Ensure that the OPSTATE is set correctly for POLL or NMI */ /* Ensure that the OPSTATE is set correctly for POLL or NMI */
opstate_init(); opstate_init();
...@@ -1176,6 +1340,9 @@ static void __exit skx_exit(void) ...@@ -1176,6 +1340,9 @@ static void __exit skx_exit(void)
edac_dbg(2, "\n"); edac_dbg(2, "\n");
mce_unregister_decode_chain(&skx_mce_dec); mce_unregister_decode_chain(&skx_mce_dec);
skx_remove(); skx_remove();
if (nvdimm_count)
skx_adxl_put();
kfree(skx_msg);
teardown_skx_debug(); teardown_skx_debug();
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment