Commit 9c97bf88 authored by Candice Li, committed by Alex Deucher

drm/amdgpu: Do bad page retirement for deferred errors

Bad page retirement also needs to be performed for deferred errors.

v2: Drop unused dev_info.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Signed-off-by: Candice Li <candice.li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent bbcbfd43
...@@ -93,6 +93,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, ...@@ -93,6 +93,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0; int ret = 0;
unsigned long err_count;
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
...@@ -147,16 +148,13 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, ...@@ -147,16 +148,13 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
} }
/* only uncorrectable error needs gpu reset */ /* only uncorrectable error needs gpu reset */
if (err_data->ue_count) { if (err_data->ue_count || err_data->de_count) {
dev_info(adev->dev, "%ld uncorrectable hardware errors " err_count = err_data->ue_count + err_data->de_count;
"detected in UMC block\n",
err_data->ue_count);
if ((amdgpu_bad_page_threshold != 0) && if ((amdgpu_bad_page_threshold != 0) &&
err_data->err_addr_cnt) { err_data->err_addr_cnt) {
amdgpu_ras_add_bad_pages(adev, err_data->err_addr, amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
err_data->err_addr_cnt); err_data->err_addr_cnt);
amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count)); amdgpu_ras_save_bad_pages(adev, &err_count);
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment