Commit d9de5ce8 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'edac_updates_for_v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras

Pull EDAC updates from Borislav Petkov:

 - Add a driver for the RAS functionality on Xilinx's on chip memory
   controller

 - Add support for decoding errors from the first and second level
   memory on SKL-based hardware

 - Add support for the memory controllers in Intel Granite Rapids and
   Emerald Rapids machines

 - First round of amd64_edac driver simplification and removal of
   unneeded functionality

 - The usual cleanups and fixes

* tag 'edac_updates_for_v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  EDAC/amd64: Shut up an -Werror,-Wsometimes-uninitialized clang false positive
  EDAC/amd64: Remove early_channel_count()
  EDAC/amd64: Remove PCI Function 0
  EDAC/amd64: Remove PCI Function 6
  EDAC/amd64: Remove scrub rate control for Family 17h and later
  EDAC/amd64: Don't set up EDAC PCI control on Family 17h+
  EDAC/i10nm: Add driver decoder for Sapphire Rapids server
  EDAC/i10nm: Add Intel Granite Rapids server support
  EDAC/i10nm: Make more configurations CPU model specific
  EDAC/i10nm: Add Intel Emerald Rapids server support
  EDAC/skx_common: Delete duplicated and unreachable code
  EDAC/skx_common: Enable EDAC support for the "near" memory
  EDAC/qcom: Add platform_device_id table for module autoloading
  EDAC/zynqmp: Add EDAC support for Xilinx ZynqMP OCM
  dt-bindings: edac: Add bindings for Xilinx ZynqMP OCM
parents 0246725d 28980db9
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Xilinx Zynqmp OCM(On-Chip Memory) Controller
maintainers:
- Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
- Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
description: |
The OCM supports 64-bit wide ECC functionality to detect multi-bit errors
and recover from a single-bit memory fault.On a write, if all bytes are
being written, the ECC is generated and written into the ECC RAM along with
the write-data that is written into the data RAM. If one or more bytes are
not written, then the read operation results in an correctable error or
uncorrectable error.
properties:
compatible:
const: xlnx,zynqmp-ocmc-1.0
reg:
maxItems: 1
interrupts:
maxItems: 1
required:
- compatible
- reg
- interrupts
additionalProperties: false
examples:
- |
#include <dt-bindings/interrupt-controller/arm-gic.h>
memory-controller@ff960000 {
compatible = "xlnx,zynqmp-ocmc-1.0";
reg = <0xff960000 0x1000>;
interrupts = <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>;
};
......@@ -22743,6 +22743,13 @@ F: Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml
F: drivers/dma/xilinx/xilinx_dpdma.c
F: include/dt-bindings/dma/xlnx-zynqmp-dpdma.h
XILINX ZYNQMP OCM EDAC DRIVER
M: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
M: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
S: Maintained
F: Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml
F: drivers/edac/zynqmp_edac.c
XILINX ZYNQMP PSGTR PHY DRIVER
M: Anurag Kumar Vulisha <anurag.kumar.vulisha@xilinx.com>
M: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
......
......@@ -542,4 +542,12 @@ config EDAC_DMC520
Support for error detection and correction on the
SoCs with ARM DMC-520 DRAM controller.
config EDAC_ZYNQMP
tristate "Xilinx ZynqMP OCM Controller"
depends on ARCH_ZYNQMP || COMPILE_TEST
help
This driver supports error detection and correction for the
Xilinx ZynqMP OCM (On Chip Memory) controller. It can also be
built as a module. In that case it will be called zynqmp_edac.
endif # EDAC
......@@ -84,3 +84,4 @@ obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o
obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o
obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o
obj-$(CONFIG_EDAC_DMC520) += dmc520_edac.o
obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o
This diff is collapsed.
......@@ -114,22 +114,6 @@
#define PCI_DEVICE_ID_AMD_16H_NB_F2 0x1532
#define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581
#define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582
#define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460
#define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F6 0x1496
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F6 0x144e
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F6 0x1446
#define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650
#define PCI_DEVICE_ID_AMD_19H_DF_F6 0x1656
#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F0 0x14ad
#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F6 0x14b3
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F6 0x1670
/*
* Function 1 - Address Map
......@@ -215,8 +199,6 @@
#define DCT_SEL_HI 0x114
#define F15H_M60H_SCRCTRL 0x1C8
#define F17H_SCR_BASE_ADDR 0x48
#define F17H_SCR_LIMIT_ADDR 0x4C
/*
* Function 3 - Misc Control
......@@ -356,7 +338,7 @@ struct amd64_pvt {
struct low_ops *ops;
/* pci_device handles which we utilize */
struct pci_dev *F0, *F1, *F2, *F3, *F6;
struct pci_dev *F1, *F2, *F3;
u16 mc_node_id; /* MC index of this MC node */
u8 fam; /* CPU family */
......@@ -364,7 +346,6 @@ struct amd64_pvt {
u8 stepping; /* ... stepping */
int ext_model; /* extended model value of this node */
int channel_count;
/* Raw registers */
u32 dclr0; /* DRAM Configuration Low DCT0 reg */
......@@ -484,7 +465,6 @@ struct ecc_settings {
* functions and per device encoding/decoding logic.
*/
struct low_ops {
int (*early_channel_count) (struct amd64_pvt *pvt);
void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr,
struct err_info *);
int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct,
......@@ -503,7 +483,7 @@ struct amd64_family_flags {
struct amd64_family_type {
const char *ctl_name;
u16 f0_id, f1_id, f2_id, f6_id;
u16 f1_id, f2_id;
/* Maximum number of memory controllers per die/node. */
u8 max_mcs;
struct amd64_family_flags flags;
......
This diff is collapsed.
......@@ -396,12 +396,19 @@ static int qcom_llcc_edac_remove(struct platform_device *pdev)
return 0;
}
static const struct platform_device_id qcom_llcc_edac_id_table[] = {
{ .name = "qcom_llcc_edac" },
{}
};
MODULE_DEVICE_TABLE(platform, qcom_llcc_edac_id_table);
static struct platform_driver qcom_llcc_edac_driver = {
.probe = qcom_llcc_edac_probe,
.remove = qcom_llcc_edac_remove,
.driver = {
.name = "qcom_llcc_edac",
},
.id_table = qcom_llcc_edac_id_table,
};
module_platform_driver(qcom_llcc_edac_driver);
......
......@@ -560,44 +560,28 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
tp_event = HW_EVENT_ERR_CORRECTED;
}
/*
* According to Intel Architecture spec vol 3B,
* Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
* memory errors should fit one of these masks:
* 000f 0000 1mmm cccc (binary)
* 000f 0010 1mmm cccc (binary) [RAM used as cache]
* where:
* f = Correction Report Filtering Bit. If 1, subsequent errors
* won't be shown
* mmm = error type
* cccc = channel
* If the mask doesn't match, report an error to the parsing logic
*/
if (!((errcode & 0xef80) == 0x80 || (errcode & 0xef80) == 0x280)) {
optype = "Can't parse: it is not a mem";
} else {
switch (optypenum) {
case 0:
optype = "generic undef request error";
break;
case 1:
optype = "memory read error";
break;
case 2:
optype = "memory write error";
break;
case 3:
optype = "addr/cmd error";
break;
case 4:
optype = "memory scrubbing error";
scrub_err = true;
break;
default:
optype = "reserved";
break;
}
switch (optypenum) {
case 0:
optype = "generic undef request error";
break;
case 1:
optype = "memory read error";
break;
case 2:
optype = "memory write error";
break;
case 3:
optype = "addr/cmd error";
break;
case 4:
optype = "memory scrubbing error";
scrub_err = true;
break;
default:
optype = "reserved";
break;
}
if (res->decoded_by_adxl) {
len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s",
overflow ? " OVERFLOW" : "",
......@@ -632,12 +616,18 @@ static bool skx_error_in_1st_level_mem(const struct mce *m)
if (!skx_mem_cfg_2lm)
return false;
errcode = GET_BITFIELD(m->status, 0, 15);
errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
if ((errcode & 0xef80) != 0x280)
return false;
return errcode == MCACOD_EXT_MEM_ERR;
}
return true;
static bool skx_error_in_mem(const struct mce *m)
{
u32 errcode;
errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR);
}
int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
......@@ -651,8 +641,8 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
if (mce->kflags & MCE_HANDLED_CEC)
return NOTIFY_DONE;
/* ignore unless this is memory related with an address */
if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
/* Ignore unless this is memory related with an address */
if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
return NOTIFY_DONE;
memset(&res, 0, sizeof(res));
......
......@@ -33,7 +33,7 @@
#define SKX_NUM_CHANNELS 3 /* Channels per memory controller */
#define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */
#define I10NM_NUM_DDR_IMC 4
#define I10NM_NUM_DDR_IMC 12
#define I10NM_NUM_DDR_CHANNELS 2
#define I10NM_NUM_DDR_DIMMS 2
......@@ -56,6 +56,30 @@
#define MCI_MISC_ECC_MODE(m) (((m) >> 59) & 15)
#define MCI_MISC_ECC_DDRT 8 /* read from DDRT */
/*
* According to Intel Architecture spec vol 3B,
* Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
* memory errors should fit one of these masks:
* 000f 0000 1mmm cccc (binary)
* 000f 0010 1mmm cccc (binary) [RAM used as cache]
* where:
* f = Correction Report Filtering Bit. If 1, subsequent errors
* won't be shown
* mmm = error type
* cccc = channel
*/
#define MCACOD_MEM_ERR_MASK 0xef80
/*
* Errors from either the memory of the 1-level memory system or the
* 2nd level memory (the slow "far" memory) of the 2-level memory system.
*/
#define MCACOD_MEM_CTL_ERR 0x80
/*
* Errors from the 1st level memory (the fast "near" memory as cache)
* of the 2-level memory system.
*/
#define MCACOD_EXT_MEM_ERR 0x280
/*
* Each cpu socket contains some pci devices that provide global
* information, and also some that are local to each of the two
......@@ -105,7 +129,8 @@ struct skx_pvt {
enum type {
SKX,
I10NM,
SPR
SPR,
GNR
};
enum {
......@@ -149,19 +174,47 @@ struct decoded_addr {
bool decoded_by_adxl;
};
struct pci_bdf {
u32 bus : 8;
u32 dev : 5;
u32 fun : 3;
};
struct res_config {
enum type type;
/* Configuration agent device ID */
unsigned int decs_did;
/* Default bus number configuration register offset */
int busno_cfg_offset;
/* DDR memory controllers per socket */
int ddr_imc_num;
/* DDR channels per DDR memory controller */
int ddr_chan_num;
/* DDR DIMMs per DDR memory channel */
int ddr_dimm_num;
/* Per DDR channel memory-mapped I/O size */
int ddr_chan_mmio_sz;
/* HBM memory controllers per socket */
int hbm_imc_num;
/* HBM channels per HBM memory controller */
int hbm_chan_num;
/* HBM DIMMs per HBM memory channel */
int hbm_dimm_num;
/* Per HBM channel memory-mapped I/O size */
int hbm_chan_mmio_sz;
bool support_ddr5;
/* SAD device number and function number */
unsigned int sad_all_devfn;
/* SAD device BDF */
struct pci_bdf sad_all_bdf;
/* PCU device BDF */
struct pci_bdf pcu_cr3_bdf;
/* UTIL device BDF */
struct pci_bdf util_all_bdf;
/* URACU device BDF */
struct pci_bdf uracu_bdf;
/* DDR mdev device BDF */
struct pci_bdf ddr_mdev_bdf;
/* HBM mdev device BDF */
struct pci_bdf hbm_mdev_bdf;
int sad_all_offset;
/* Offsets of retry_rd_err_log registers */
u32 *offsets_scrub;
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment