Commit 0795b5d2 authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: Support retiring multiple MCA error address pages

Support retiring multiple MCA error address pages in
one in-band query for umc v12_0.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent afb617f3
...@@ -3920,8 +3920,7 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct ...@@ -3920,8 +3920,7 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct
} }
static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data, static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, struct amdgpu_smuio_mcm_config_info *mcm_info)
struct ras_err_addr *err_addr)
{ {
struct ras_err_node *err_node; struct ras_err_node *err_node;
...@@ -3933,10 +3932,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d ...@@ -3933,10 +3932,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
if (!err_node) if (!err_node)
return NULL; return NULL;
memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); INIT_LIST_HEAD(&err_node->err_info.err_addr_list);
if (err_addr) memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
err_data->err_list_count++; err_data->err_list_count++;
list_add_tail(&err_node->node, &err_data->err_node_list); list_add_tail(&err_node->node, &err_data->err_node_list);
...@@ -3945,6 +3943,29 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d ...@@ -3945,6 +3943,29 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
return &err_node->err_info; return &err_node->err_info;
} }
/**
 * amdgpu_ras_add_mca_err_addr - queue a copy of an MCA error address record
 * @err_info: error info whose err_addr_list receives the new entry
 * @err_addr: record to copy from; only err_status, err_ipid and err_addr
 *            are read, its own list linkage is not touched
 *
 * Allocates a zero-initialised struct ras_err_addr with GFP_KERNEL, copies
 * the three register values from @err_addr into it and appends it to the
 * tail of @err_info->err_addr_list.
 *
 * The function returns void: if the allocation fails nothing is added and
 * the caller is not informed.
 */
void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
{
	struct ras_err_addr *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

	if (entry) {
		INIT_LIST_HEAD(&entry->node);

		/* Copy only the payload; the new entry gets its own linkage. */
		entry->err_addr = err_addr->err_addr;
		entry->err_ipid = err_addr->err_ipid;
		entry->err_status = err_addr->err_status;

		list_add_tail(&entry->node, &err_info->err_addr_list);
	}
}
/**
 * amdgpu_ras_del_mca_err_addr - unlink and free one MCA error address record
 * @err_info: error info the record belongs to (not referenced by the body)
 * @mca_err_addr: record to remove; it is freed with kfree() and must not be
 *                used after this call
 *
 * Removes @mca_err_addr from the list it is linked on via its node member
 * and then releases its memory.
 */
void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
				 struct ras_err_addr *mca_err_addr)
{
	/* Unlink first so the list never points at freed memory. */
	list_del(&mca_err_addr->node);

	kfree(mca_err_addr);
}
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count) struct ras_err_addr *err_addr, u64 count)
...@@ -3957,10 +3978,13 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, ...@@ -3957,10 +3978,13 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
if (!count) if (!count)
return 0; return 0;
err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr); err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
if (!err_info) if (!err_info)
return -EINVAL; return -EINVAL;
if (err_addr && err_addr->err_status)
amdgpu_ras_add_mca_err_addr(err_info, err_addr);
err_info->ue_count += count; err_info->ue_count += count;
err_data->ue_count += count; err_data->ue_count += count;
...@@ -3979,7 +4003,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, ...@@ -3979,7 +4003,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
if (!count) if (!count)
return 0; return 0;
err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr); err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
if (!err_info) if (!err_info)
return -EINVAL; return -EINVAL;
...@@ -4001,10 +4025,13 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, ...@@ -4001,10 +4025,13 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
if (!count) if (!count)
return 0; return 0;
err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr); err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
if (!err_info) if (!err_info)
return -EINVAL; return -EINVAL;
if (err_addr && err_addr->err_status)
amdgpu_ras_add_mca_err_addr(err_info, err_addr);
err_info->de_count += count; err_info->de_count += count;
err_data->de_count += count; err_data->de_count += count;
......
...@@ -480,6 +480,7 @@ struct ras_fs_data { ...@@ -480,6 +480,7 @@ struct ras_fs_data {
}; };
struct ras_err_addr { struct ras_err_addr {
struct list_head node;
uint64_t err_status; uint64_t err_status;
uint64_t err_ipid; uint64_t err_ipid;
uint64_t err_addr; uint64_t err_addr;
...@@ -490,7 +491,7 @@ struct ras_err_info { ...@@ -490,7 +491,7 @@ struct ras_err_info {
u64 ce_count; u64 ce_count;
u64 ue_count; u64 ue_count;
u64 de_count; u64 de_count;
struct ras_err_addr err_addr; struct list_head err_addr_list;
}; };
struct ras_err_node { struct ras_err_node {
...@@ -862,4 +863,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk) ...@@ -862,4 +863,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
struct aca_handle *handle, char *buf, void *data); struct aca_handle *handle, char *buf, void *data);
void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
struct ras_err_addr *err_addr);
void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
struct ras_err_addr *mca_err_addr);
#endif #endif
...@@ -382,29 +382,33 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade ...@@ -382,29 +382,33 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
{ {
struct ras_err_node *err_node; struct ras_err_node *err_node;
uint64_t mc_umc_status; uint64_t mc_umc_status;
struct ras_err_info *err_info;
struct ras_err_addr *mca_err_addr, *tmp;
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
for_each_ras_error(err_node, err_data) { for_each_ras_error(err_node, err_data) {
mc_umc_status = err_node->err_info.err_addr.err_status; err_info = &err_node->err_info;
if (!mc_umc_status) if (list_empty(&err_info->err_addr_list))
continue; continue;
if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) || list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {
umc_v12_0_is_deferred_error(adev, mc_umc_status)) { mc_umc_status = mca_err_addr->err_status;
if (mc_umc_status &&
(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
umc_v12_0_is_deferred_error(adev, mc_umc_status))) {
uint64_t mca_addr, err_addr, mca_ipid; uint64_t mca_addr, err_addr, mca_ipid;
uint32_t InstanceIdLo; uint32_t InstanceIdLo;
struct amdgpu_smuio_mcm_config_info *mcm_info;
mcm_info = &err_node->err_info.mcm_info; mca_addr = mca_err_addr->err_addr;
mca_addr = err_node->err_info.err_addr.err_addr; mca_ipid = mca_err_addr->err_ipid;
mca_ipid = err_node->err_info.err_addr.err_ipid;
err_addr = REG_GET_FIELD(mca_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); err_addr = REG_GET_FIELD(mca_addr,
MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo); InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n", dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
mca_ipid, mca_ipid,
mcm_info->die_id, err_info->mcm_info.die_id,
MCA_IPID_LO_2_UMC_INST(InstanceIdLo), MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
MCA_IPID_LO_2_UMC_CH(InstanceIdLo), MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
err_addr); err_addr);
...@@ -413,11 +417,11 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade ...@@ -413,11 +417,11 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
err_data, err_addr, err_data, err_addr,
MCA_IPID_LO_2_UMC_CH(InstanceIdLo), MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
MCA_IPID_LO_2_UMC_INST(InstanceIdLo), MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
mcm_info->die_id); err_info->mcm_info.die_id);
}
/* Clear umc error address content */ /* Delete error address node from list and free memory */
memset(&err_node->err_info.err_addr, amdgpu_ras_del_mca_err_addr(err_info, mca_err_addr);
0, sizeof(err_node->err_info.err_addr));
} }
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment