Commit fdcb279d authored by Stanley.Yang's avatar Stanley.Yang Committed by Alex Deucher

drm/amdgpu: query umc error info from ecc_table v2

If the SMU supports ECCTABLE, the driver can message the SMU to retrieve the
ecc_table, then query UMC error info from that ECCTABLE.

v2:
    restructure the source code to make the logic clearer
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent edd79420
...@@ -892,6 +892,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev, ...@@ -892,6 +892,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
} }
} }
/*
 * Gather UMC ECC error information into @err_data.
 *
 * The SMU firmware may expose an ECC table; when it does, error info is
 * parsed from that table, otherwise the driver falls back to querying
 * the UMC registers directly.
 */
static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret;

	/*
	 * Decide which query path to use based on whether the SMU
	 * can hand back an ECC table.
	 */
	ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));

	if (ret == 0) {
		/* SMU provided the ECC table; parse error info from it. */
		if (adev->umc.ras_funcs &&
		    adev->umc.ras_funcs->ecc_info_query_ras_error_count)
			adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
		if (adev->umc.ras_funcs &&
		    adev->umc.ras_funcs->ecc_info_query_ras_error_address)
			adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
	} else if (ret == -EOPNOTSUPP) {
		/* No SMU support; read the UMC registers directly. */
		if (adev->umc.ras_funcs &&
		    adev->umc.ras_funcs->query_ras_error_count)
			adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
		/*
		 * umc query_ras_error_address is also responsible for
		 * clearing the error status.
		 */
		if (adev->umc.ras_funcs &&
		    adev->umc.ras_funcs->query_ras_error_address)
			adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
	}
}
/* query/inject/cure begin */ /* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev, int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info) struct ras_query_if *info)
...@@ -905,15 +937,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, ...@@ -905,15 +937,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
switch (info->head.block) { switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC: case AMDGPU_RAS_BLOCK__UMC:
if (adev->umc.ras_funcs && amdgpu_ras_get_ecc_info(adev, &err_data);
adev->umc.ras_funcs->query_ras_error_count)
adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
/* umc query_ras_error_address is also responsible for clearing
* error status
*/
if (adev->umc.ras_funcs &&
adev->umc.ras_funcs->query_ras_error_address)
adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
break; break;
case AMDGPU_RAS_BLOCK__SDMA: case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) { if (adev->sdma.funcs->query_ras_error_count) {
......
...@@ -94,8 +94,11 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -94,8 +94,11 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
{ {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
ret = smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras_funcs && if (adev->umc.ras_funcs &&
adev->umc.ras_funcs->query_ras_error_count) adev->umc.ras_funcs->query_ras_error_count)
adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status); adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
...@@ -119,6 +122,31 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -119,6 +122,31 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
*/ */
adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status); adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
} }
} else if (!ret) {
if (adev->umc.ras_funcs &&
adev->umc.ras_funcs->ecc_info_query_ras_error_count)
adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, ras_error_status);
if (adev->umc.ras_funcs &&
adev->umc.ras_funcs->ecc_info_query_ras_error_address &&
adev->umc.max_ras_err_cnt_per_query) {
err_data->err_addr =
kcalloc(adev->umc.max_ras_err_cnt_per_query,
sizeof(struct eeprom_table_record), GFP_KERNEL);
/* still call query_ras_error_address to clear error status
* even NOMEM error is encountered
*/
if(!err_data->err_addr)
dev_warn(adev->dev, "Failed to alloc memory for "
"umc error address record!\n");
/* umc query_ras_error_address is also responsible for clearing
* error status
*/
adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, ras_error_status);
}
}
/* only uncorrectable error needs gpu reset */ /* only uncorrectable error needs gpu reset */
if (err_data->ue_count) { if (err_data->ue_count) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment