Commit afb617f3 authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: add interface to check mca umc status

Add interface to check mca umc status.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6c23f3d1
...@@ -27,6 +27,16 @@ ...@@ -27,6 +27,16 @@
#include "umc/umc_6_7_0_offset.h" #include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h" #include "umc/umc_6_7_0_sh_mask.h"
/**
 * amdgpu_mca_is_deferred_error - check whether an MCA status is a deferred error
 * @adev: amdgpu device whose umc.ras callback table is consulted
 * @mc_status: MCA status value to classify
 *
 * Forwards the status to the optional check_ecc_err_status() callback,
 * asking it about AMDGPU_MCA_ERROR_TYPE_DE. The status is passed by
 * address because the callback takes an opaque pointer.
 *
 * Return: the callback's result, or false when no callback is available.
 */
static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev,
					uint64_t mc_status)
{
	/*
	 * NOTE(review): assumes umc.ras can be NULL when no UMC RAS backend
	 * has been registered -- confirm against the init paths. Guard the
	 * pointer itself before reading the callback out of it.
	 */
	if (!adev->umc.ras || !adev->umc.ras->check_ecc_err_status)
		return false;

	return adev->umc.ras->check_ecc_err_status(adev,
			AMDGPU_MCA_ERROR_TYPE_DE, &mc_status);
}
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr, uint64_t mc_status_addr,
unsigned long *error_count) unsigned long *error_count)
...@@ -257,7 +267,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo ...@@ -257,7 +267,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
amdgpu_ras_error_statistic_ue_count(err_data, amdgpu_ras_error_statistic_ue_count(err_data,
&mcm_info, &err_addr, (uint64_t)count); &mcm_info, &err_addr, (uint64_t)count);
else { else {
if (!!(MCA_REG__STATUS__DEFERRED(entry->regs[MCA_REG_IDX_STATUS]))) if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]))
amdgpu_ras_error_statistic_de_count(err_data, amdgpu_ras_error_statistic_de_count(err_data,
&mcm_info, &err_addr, (uint64_t)count); &mcm_info, &err_addr, (uint64_t)count);
else else
......
...@@ -65,6 +65,7 @@ enum amdgpu_mca_ip { ...@@ -65,6 +65,7 @@ enum amdgpu_mca_ip {
enum amdgpu_mca_error_type { enum amdgpu_mca_error_type {
AMDGPU_MCA_ERROR_TYPE_UE = 0, AMDGPU_MCA_ERROR_TYPE_UE = 0,
AMDGPU_MCA_ERROR_TYPE_CE, AMDGPU_MCA_ERROR_TYPE_CE,
AMDGPU_MCA_ERROR_TYPE_DE,
}; };
struct amdgpu_mca_ras_block { struct amdgpu_mca_ras_block {
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#ifndef __AMDGPU_UMC_H__ #ifndef __AMDGPU_UMC_H__
#define __AMDGPU_UMC_H__ #define __AMDGPU_UMC_H__
#include "amdgpu_ras.h" #include "amdgpu_ras.h"
#include "amdgpu_mca.h"
/* /*
* (addr / 256) * 4096, the higher 26 bits in ErrorAddr * (addr / 256) * 4096, the higher 26 bits in ErrorAddr
* is the index of 4KB block * is the index of 4KB block
...@@ -64,6 +64,8 @@ struct amdgpu_umc_ras { ...@@ -64,6 +64,8 @@ struct amdgpu_umc_ras {
void *ras_error_status); void *ras_error_status);
void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev, void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev,
void *ras_error_status); void *ras_error_status);
bool (*check_ecc_err_status)(struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, void *ras_error_status);
/* support different eeprom table version for different asic */ /* support different eeprom table version for different asic */
void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr); void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr);
}; };
......
...@@ -422,6 +422,25 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade ...@@ -422,6 +422,25 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
} }
} }
/*
 * Test a UMC MCA status value against one requested error type.
 *
 * @adev:             device handle, passed through to the per-type helpers
 * @type:             which error class to test for (UE, CE or DE)
 * @ras_error_status: pointer to the uint64_t status value to examine
 *
 * Returns the result of the matching umc_v12_0_is_*_error() helper, or
 * false for any error type not listed below.
 */
static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
			enum amdgpu_mca_error_type type, void *ras_error_status)
{
	const uint64_t *status_ptr = ras_error_status;
	uint64_t status = *status_ptr;
	bool matched = false;

	switch (type) {
	case AMDGPU_MCA_ERROR_TYPE_UE:
		matched = umc_v12_0_is_uncorrectable_error(adev, status);
		break;
	case AMDGPU_MCA_ERROR_TYPE_CE:
		matched = umc_v12_0_is_correctable_error(adev, status);
		break;
	case AMDGPU_MCA_ERROR_TYPE_DE:
		matched = umc_v12_0_is_deferred_error(adev, status);
		break;
	default:
		/* Unknown error type: report "no match". */
		break;
	}

	return matched;
}
static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev) static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
{ {
amdgpu_umc_loop_channels(adev, amdgpu_umc_loop_channels(adev,
...@@ -507,5 +526,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = { ...@@ -507,5 +526,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count, .ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address, .ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
}; };
...@@ -2557,9 +2557,9 @@ static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct ...@@ -2557,9 +2557,9 @@ static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct
return 0; return 0;
} }
if ((type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0)) || if (umc_v12_0_is_deferred_error(adev, status0) ||
(type == AMDGPU_MCA_ERROR_TYPE_CE && (umc_v12_0_is_correctable_error(adev, status0) || umc_v12_0_is_uncorrectable_error(adev, status0) ||
umc_v12_0_is_deferred_error(adev, status0)))) umc_v12_0_is_correctable_error(adev, status0))
*count = 1; *count = 1;
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment