Commit 9f91e983, authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: MCA supports recording umc address information

MCA supports recording umc address information.

V2:
  Move the err_addr variable from struct ras_err_node to
  struct ras_err_info.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 731b2f6e
...@@ -218,6 +218,7 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st ...@@ -218,6 +218,7 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data) int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
{ {
struct amdgpu_smuio_mcm_config_info mcm_info; struct amdgpu_smuio_mcm_config_info mcm_info;
struct ras_err_addr err_addr = {0};
struct mca_bank_set mca_set; struct mca_bank_set mca_set;
struct mca_bank_node *node; struct mca_bank_node *node;
struct mca_bank_entry *entry; struct mca_bank_entry *entry;
...@@ -246,10 +247,18 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo ...@@ -246,10 +247,18 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
mcm_info.socket_id = entry->info.socket_id; mcm_info.socket_id = entry->info.socket_id;
mcm_info.die_id = entry->info.aid; mcm_info.die_id = entry->info.aid;
if (blk == AMDGPU_RAS_BLOCK__UMC) {
err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS];
err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID];
err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
}
if (type == AMDGPU_MCA_ERROR_TYPE_UE) if (type == AMDGPU_MCA_ERROR_TYPE_UE)
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count); amdgpu_ras_error_statistic_ue_count(err_data,
&mcm_info, &err_addr, (uint64_t)count);
else else
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count); amdgpu_ras_error_statistic_ce_count(err_data,
&mcm_info, &err_addr, (uint64_t)count);
} }
out_mca_release: out_mca_release:
......
...@@ -1156,8 +1156,10 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s ...@@ -1156,8 +1156,10 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
for_each_ras_error(err_node, err_data) { for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info; err_info = &err_node->err_info;
amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count); amdgpu_ras_error_statistic_ce_count(&obj->err_data,
amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count); &err_info->mcm_info, NULL, err_info->ce_count);
amdgpu_ras_error_statistic_ue_count(&obj->err_data,
&err_info->mcm_info, NULL, err_info->ue_count);
} }
} else { } else {
/* for legacy asic path which doesn't has error source info */ /* for legacy asic path which doesn't has error source info */
...@@ -3691,7 +3693,8 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct ...@@ -3691,7 +3693,8 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct
} }
static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data, static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info) struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr)
{ {
struct ras_err_node *err_node; struct ras_err_node *err_node;
...@@ -3705,6 +3708,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d ...@@ -3705,6 +3708,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
if (err_addr)
memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
err_data->err_list_count++; err_data->err_list_count++;
list_add_tail(&err_node->node, &err_data->err_node_list); list_add_tail(&err_node->node, &err_data->err_node_list);
list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp); list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
...@@ -3713,7 +3719,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d ...@@ -3713,7 +3719,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
} }
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count) struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count)
{ {
struct ras_err_info *err_info; struct ras_err_info *err_info;
...@@ -3723,7 +3730,7 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, ...@@ -3723,7 +3730,7 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
if (!count) if (!count)
return 0; return 0;
err_info = amdgpu_ras_error_get_info(err_data, mcm_info); err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
if (!err_info) if (!err_info)
return -EINVAL; return -EINVAL;
...@@ -3734,7 +3741,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, ...@@ -3734,7 +3741,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
} }
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count) struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count)
{ {
struct ras_err_info *err_info; struct ras_err_info *err_info;
...@@ -3744,7 +3752,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, ...@@ -3744,7 +3752,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
if (!count) if (!count)
return 0; return 0;
err_info = amdgpu_ras_error_get_info(err_data, mcm_info); err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
if (!err_info) if (!err_info)
return -EINVAL; return -EINVAL;
......
...@@ -452,10 +452,17 @@ struct ras_fs_data { ...@@ -452,10 +452,17 @@ struct ras_fs_data {
char debugfs_name[32]; char debugfs_name[32];
}; };
/*
 * Register values recorded alongside a RAS error entry; an instance is
 * embedded in struct ras_err_info as the err_addr member.
 */
struct ras_err_addr {
uint64_t err_status; /* copied from the MCA bank's MCA_REG_IDX_STATUS register */
uint64_t err_ipid; /* copied from the MCA bank's MCA_REG_IDX_IPID register */
uint64_t err_addr; /* copied from the MCA bank's MCA_REG_IDX_ADDR register */
};
struct ras_err_info { struct ras_err_info {
struct amdgpu_smuio_mcm_config_info mcm_info; struct amdgpu_smuio_mcm_config_info mcm_info;
u64 ce_count; u64 ce_count;
u64 ue_count; u64 ue_count;
struct ras_err_addr err_addr;
}; };
struct ras_err_node { struct ras_err_node {
...@@ -806,8 +813,10 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, ...@@ -806,8 +813,10 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
int amdgpu_ras_error_data_init(struct ras_err_data *err_data); int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
void amdgpu_ras_error_data_fini(struct ras_err_data *err_data); void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count);
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count);
#endif #endif
...@@ -1313,10 +1313,10 @@ static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct a ...@@ -1313,10 +1313,10 @@ static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct a
switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) { switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
case AMDGPU_MCA_ERROR_TYPE_UE: case AMDGPU_MCA_ERROR_TYPE_UE:
amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL); amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, NULL, 1ULL);
break; break;
case AMDGPU_MCA_ERROR_TYPE_CE: case AMDGPU_MCA_ERROR_TYPE_CE:
amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL); amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 1ULL);
break; break;
default: default:
break; break;
......
...@@ -3828,8 +3828,8 @@ static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev, ...@@ -3828,8 +3828,8 @@ static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev,
/* the caller should make sure initialize value of /* the caller should make sure initialize value of
* err_data->ue_count and err_data->ce_count * err_data->ue_count and err_data->ce_count
*/ */
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count); amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
} }
static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev, static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev,
......
...@@ -652,8 +652,8 @@ static void mmhub_v1_8_inst_query_ras_error_count(struct amdgpu_device *adev, ...@@ -652,8 +652,8 @@ static void mmhub_v1_8_inst_query_ras_error_count(struct amdgpu_device *adev,
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
&ue_count); &ue_count);
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count); amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
} }
static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev, static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev,
......
...@@ -2156,7 +2156,7 @@ static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev, ...@@ -2156,7 +2156,7 @@ static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev,
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
&ue_count); &ue_count);
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
} }
static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev, static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev,
......
...@@ -166,8 +166,8 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev, ...@@ -166,8 +166,8 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count); umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count);
umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count); umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count);
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count); amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment