Commit 3b566b30 authored by Yazen Ghannam's avatar Yazen Ghannam Committed by Borislav Petkov (AMD)

RAS/AMD/ATL: Add MI300 row retirement support

DRAM row retirement depends on model-specific information that is best
done within the AMD Address Translation Library.

Export a generic wrapper function for other modules to use. Add any
model-specific helpers here.
Signed-off-by: default avatarYazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: default avatarBorislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com
parent 0e4fd816
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
config AMD_ATL config AMD_ATL
tristate "AMD Address Translation Library" tristate "AMD Address Translation Library"
depends on AMD_NB && X86_64 && RAS depends on AMD_NB && X86_64 && RAS
depends on MEMORY_FAILURE
default N default N
help help
This library includes support for implementation-specific This library includes support for implementation-specific
......
...@@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) ...@@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
return addr; return addr;
} }
/*
* When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
* all memory within that DRAM row. This applies to the memory with a DRAM
* bank.
*
* To find the memory addresses, loop through permutations of the DRAM column
* bits and find the System Physical address of each. The column bits are used
* to calculate the intermediate Normalized address, so all permutations should
* be checked.
*
* See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
*/
#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL))
static void retire_row_mi300(struct atl_err *a_err)
{
unsigned long addr;
struct page *p;
u8 col;
for (col = 0; col < MI300_NUM_COL; col++) {
a_err->addr &= ~MI300_UMC_MCA_COL;
a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
if (IS_ERR_VALUE(addr))
continue;
addr = PHYS_PFN(addr);
/*
* Skip invalid or already poisoned pages to avoid unnecessary
* error messages from memory_failure().
*/
p = pfn_to_online_page(addr);
if (!p)
continue;
if (PageHWPoison(p))
continue;
memory_failure(addr, 0);
}
}
void amd_retire_dram_row(struct atl_err *a_err)
{
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
return retire_row_mi300(a_err);
}
EXPORT_SYMBOL_GPL(amd_retire_dram_row);
static unsigned long get_addr(unsigned long addr) static unsigned long get_addr(unsigned long addr)
{ {
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
......
...@@ -45,8 +45,10 @@ struct atl_err { ...@@ -45,8 +45,10 @@ struct atl_err {
#if IS_ENABLED(CONFIG_AMD_ATL) #if IS_ENABLED(CONFIG_AMD_ATL)
void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)); void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
void amd_atl_unregister_decoder(void); void amd_atl_unregister_decoder(void);
void amd_retire_dram_row(struct atl_err *err);
unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err); unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
#else #else
static inline void amd_retire_dram_row(struct atl_err *err) { }
static inline unsigned long static inline unsigned long
amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
#endif /* CONFIG_AMD_ATL */ #endif /* CONFIG_AMD_ATL */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment