Commit 87d2b92f authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: save umc error records

save umc error records to ras bad page array

v2: add bad pages before gpu reset, so the recovery path can reserve and persist them
v3: add NULL check for adev->umc.funcs
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 78ad00c9
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -347,7 +347,7 @@ struct ras_err_data {
 	unsigned long ue_count;
 	unsigned long ce_count;
 	unsigned long err_addr_cnt;
-	uint64_t *err_addr;
+	struct eeprom_table_record *err_addr;
 };
 
 struct ras_err_handler_data {
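For reference, the new element type of err_addr is the persistent bad-page record used by the RAS EEPROM code. The sketch below is reconstructed purely from the fields this patch fills in umc_v6_1_query_error_address(); the authoritative definition lives in amdgpu_ras_eeprom.h and may differ in field order, enum name, and packing.

	/* Sketch only: field set taken from the writes in this patch. */
	struct eeprom_table_record {
		uint64_t address;	/* raw error address reported by the umc */
		uint64_t retired_page;	/* page frame number to retire */
		uint64_t ts;		/* timestamp, seconds since the epoch */
		enum amdgpu_ras_eeprom_err_type err_type;
		unsigned char cu;	/* compute unit, unused for umc errors */
		unsigned char mem_channel;	/* umc channel the error hit */
		unsigned char mcumc_id;	/* umc instance, from umc_v6_1_get_umc_inst() */
	};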
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -247,21 +247,43 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 		struct ras_err_data *err_data,
 		struct amdgpu_iv_entry *entry)
 {
-	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
-		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-		if (adev->umc.funcs->query_ras_error_count)
-			adev->umc.funcs->query_ras_error_count(adev, err_data);
-		/* umc query_ras_error_address is also responsible for clearing
-		 * error status
-		 */
-		if (adev->umc.funcs->query_ras_error_address)
-			adev->umc.funcs->query_ras_error_address(adev, err_data);
+	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+		return AMDGPU_RAS_SUCCESS;
 
-		/* only uncorrectable error needs gpu reset */
-		if (err_data->ue_count)
-			amdgpu_ras_reset_gpu(adev, 0);
+	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+	if (adev->umc.funcs &&
+	    adev->umc.funcs->query_ras_error_count)
+		adev->umc.funcs->query_ras_error_count(adev, err_data);
+
+	if (adev->umc.funcs &&
+	    adev->umc.funcs->query_ras_error_address &&
+	    adev->umc.max_ras_err_cnt_per_query) {
+		err_data->err_addr =
+			kcalloc(adev->umc.max_ras_err_cnt_per_query,
+				sizeof(struct eeprom_table_record), GFP_KERNEL);
+		/* still call query_ras_error_address to clear error status
+		 * even NOMEM error is encountered
+		 */
+		if (!err_data->err_addr)
+			DRM_WARN("Failed to alloc memory for umc error address record!\n");
+
+		/* umc query_ras_error_address is also responsible for clearing
+		 * error status
+		 */
+		adev->umc.funcs->query_ras_error_address(adev, err_data);
+	}
+
+	/* only uncorrectable error needs gpu reset */
+	if (err_data->ue_count) {
+		if (err_data->err_addr_cnt &&
+		    amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+						err_data->err_addr_cnt))
+			DRM_WARN("Failed to add ras bad page!\n");
+
+		amdgpu_ras_reset_gpu(adev, 0);
 	}
 
+	kfree(err_data->err_addr);
 	return AMDGPU_RAS_SUCCESS;
 }
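The callback now owns a short-lived staging buffer for the records. Distilled, the lifecycle the hunk above introduces looks like this (a sketch with the GFX early-out and error paths trimmed; all names are from the patch):

	/* 1. allocate worst-case room for one query */
	err_data->err_addr = kcalloc(adev->umc.max_ras_err_cnt_per_query,
				     sizeof(struct eeprom_table_record), GFP_KERNEL);
	/* 2. the hw query fills records and bumps err_addr_cnt,
	 *    and clears the umc error status as a side effect
	 */
	adev->umc.funcs->query_ras_error_address(adev, err_data);
	/* 3. hand the records to the ras core *before* triggering the
	 *    reset, so the recovery path can reserve and persist them
	 */
	if (err_data->ue_count && err_data->err_addr_cnt)
		amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
					 err_data->err_addr_cnt);
	/* 4. the buffer is only a staging area; free it afterwards */
	kfree(err_data->err_addr);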
drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
@@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
 			RSMU_UMC_INDEX_MODE_EN, 0);
 }
 
+static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
+{
+	uint32_t rsmu_umc_index;
+
+	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
+			mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
+	return REG_GET_FIELD(rsmu_umc_index,
+			RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
+			RSMU_UMC_INDEX_INSTANCE);
+}
+
 static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
 						   uint32_t umc_reg_offset,
 						   unsigned long *error_count)
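umc_v6_1_get_umc_inst() reads the RSMU index register and extracts the instance field that is recorded as mcumc_id. REG_GET_FIELD is amdgpu's usual mask-and-shift helper; conceptually it expands to the following (modulo an indirection through REG_FIELD_MASK/REG_FIELD_SHIFT in amdgpu.h, using the <reg>__<field>_MASK / __SHIFT names from the generated register headers):

	/* Conceptual expansion: mask out the field's bits, then shift
	 * them down to bit 0.
	 */
	#define REG_GET_FIELD(value, reg, field) \
		(((value) & reg##__##field##_MASK) >> reg##__##field##__SHIFT)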
@@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 					 uint32_t umc_reg_offset, uint32_t channel_index)
 {
 	uint32_t lsb, mc_umc_status_addr;
-	uint64_t mc_umc_status, err_addr;
+	uint64_t mc_umc_status, err_addr, retired_page;
+	struct eeprom_table_record *err_rec;
 
 	mc_umc_status_addr =
 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 		return;
 	}
 
+	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
 	mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);
 
 	/* calculate error address if ue/ce error is detected */
@@ -191,13 +204,25 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 		err_addr &= ~((0x1ULL << lsb) - 1);
 
 		/* translate umc channel address to soc pa, 3 parts are included */
-		err_data->err_addr[err_data->err_addr_cnt] =
-			ADDR_OF_8KB_BLOCK(err_addr) |
-			ADDR_OF_256B_BLOCK(channel_index) |
-			OFFSET_IN_256B_BLOCK(err_addr);
+		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
+				ADDR_OF_256B_BLOCK(channel_index) |
+				OFFSET_IN_256B_BLOCK(err_addr);
 
-		err_data->err_addr_cnt++;
+		/* we only save ue error information currently, ce is skipped */
+		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
+				== 1) {
+			err_rec->address = err_addr;
+			/* page frame address is saved */
+			err_rec->retired_page = retired_page >> PAGE_SHIFT;
+			err_rec->ts = (uint64_t)ktime_get_real_seconds();
+			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+			err_rec->cu = 0;
+			err_rec->mem_channel = channel_index;
+			err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev);
+
+			err_data->err_addr_cnt++;
+		}
 	}
 
 	/* clear umc status */
 	WREG64_UMC(mc_umc_status_addr + umc_reg_offset, 0x0ULL);
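Note that the soc physical address is assembled from three parts (the 8 KiB block the UMC reports, the 256 B slot selected by the channel interleave, and the offset inside that slot), and only its page frame number is persisted. A quick worked example, assuming 4 KiB pages (PAGE_SHIFT == 12); the address value is made up for illustration:

	/* retired_page is a byte address; the record stores its page frame.
	 * Dropping the low 12 bits removes the offset within the page:
	 * 0x123456789 >> 12 == 0x123456, so page 0x123456 is what
	 * amdgpu_ras_add_bad_pages() will later reserve and retire.
	 */
	err_rec->retired_page = retired_page >> PAGE_SHIFT;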