Commit 3f975d0f authored by Stanley.Yang, committed by Alex Deucher

drm/amdgpu: update athub interrupt harvesting handle

A GCEA/MMHUB EA error should not result in a DF freeze. This is
fixed in the next generation, but for some reason a GCEA/MMHUB
EA error does result in a DF freeze on previous generations, so
the driver should avoid reporting a GCEA/MMHUB EA error as a HW
fatal error in the kernel message and instead read the GCEA/MMHUB
err status registers.
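
A condensed sketch of the reporting path this patch introduces (lifted
from the diff below, not a new interface): the RAS recovery work now
dispatches to a per-IP query_ras_error_status hook, which reads the EA
err status registers and only warns on a non-zero value instead of
escalating to a HW fatal error.

        /* Condensed from amdgpu_ras_error_status_query() in the diff below;
         * only the GFX and MMHUB blocks provide the hook at this point.
         */
        switch (info->head.block) {
        case AMDGPU_RAS_BLOCK__GFX:
                if (adev->gfx.funcs->query_ras_error_status)
                        adev->gfx.funcs->query_ras_error_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__MMHUB:
                if (adev->mmhub.funcs->query_ras_error_status)
                        adev->mmhub.funcs->query_ras_error_status(adev);
                break;
        default:
                break;
        }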

Changes from V1:
    make the query_ras_error_status function more general
    make reading the mmhub err status register more friendly

Changes from V2:
    move the ras error status query function into the do_recovery workqueue

Changes from V3:
    remove useless code from V2, print the GCEA error status
    instance number
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent d117413f
@@ -217,6 +217,7 @@ struct amdgpu_gfx_funcs {
         int (*query_ras_error_count) (struct amdgpu_device *adev, void *ras_error_status);
         void (*reset_ras_error_count) (struct amdgpu_device *adev);
         void (*init_spm_golden)(struct amdgpu_device *adev);
+        void (*query_ras_error_status) (struct amdgpu_device *adev);
 };
 
 struct sq_work {
...
@@ -40,6 +40,7 @@ struct amdgpu_mmhub_funcs {
                                 uint64_t page_table_base);
         void (*update_power_gating)(struct amdgpu_device *adev,
                                 bool enable);
+        void (*query_ras_error_status)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_mmhub {
...
@@ -1498,6 +1498,45 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
         }
 }
 
+/* Parse RdRspStatus and WrRspStatus */
+void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+                struct ras_query_if *info)
+{
+        /*
+         * Only two block need to query read/write
+         * RspStatus at current state
+         */
+        switch (info->head.block) {
+        case AMDGPU_RAS_BLOCK__GFX:
+                if (adev->gfx.funcs->query_ras_error_status)
+                        adev->gfx.funcs->query_ras_error_status(adev);
+                break;
+        case AMDGPU_RAS_BLOCK__MMHUB:
+                if (adev->mmhub.funcs->query_ras_error_status)
+                        adev->mmhub.funcs->query_ras_error_status(adev);
+                break;
+        default:
+                break;
+        }
+}
+
+static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
+{
+        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+        struct ras_manager *obj;
+
+        if (!con)
+                return;
+
+        list_for_each_entry(obj, &con->head, node) {
+                struct ras_query_if info = {
+                        .head = obj->head,
+                };
+
+                amdgpu_ras_error_status_query(adev, &info);
+        }
+}
+
 /* recovery begin */
 
 /* return 0 on success.
@@ -1568,8 +1607,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                 }
 
                 list_for_each_entry(remote_adev,
-                                device_list_handle, gmc.xgmi.head)
+                                device_list_handle, gmc.xgmi.head) {
+                        amdgpu_ras_query_err_status(remote_adev);
                         amdgpu_ras_log_on_err_counter(remote_adev);
+                }
 
                 amdgpu_put_xgmi_hive(hive);
         }
...
@@ -2075,6 +2075,7 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
         .ras_error_inject = &gfx_v9_4_ras_error_inject,
         .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
         .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
+        .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
...
@@ -992,3 +992,32 @@ int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
 
         return ret;
 }
+
+static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs =
+        { SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 };
+
+void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
+{
+        uint32_t i, j;
+        uint32_t reg_value;
+
+        if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+                return;
+
+        mutex_lock(&adev->grbm_idx_mutex);
+
+        for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) {
+                for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance;
+                     j++) {
+                        gfx_v9_4_select_se_sh(adev, i, 0, j);
+                        reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
+                                gfx_v9_4_rdrsp_status_regs));
+                        if (reg_value)
+                                dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
+                                                j, reg_value);
+                }
+        }
+
+        gfx_v9_4_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+        mutex_unlock(&adev->grbm_idx_mutex);
+}
@@ -34,4 +34,6 @@ int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
 
 void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev);
 
+void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev);
+
 #endif /* __GFX_V9_4_H__ */
@@ -1624,6 +1624,34 @@ static void mmhub_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
         }
 }
 
+static const struct soc15_reg_entry mmhub_v9_4_err_status_regs[] = {
+        { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_ERR_STATUS), 0, 0, 0 },
+        { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_ERR_STATUS), 0, 0, 0 },
+        { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_ERR_STATUS), 0, 0, 0 },
+        { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA3_ERR_STATUS), 0, 0, 0 },
+        { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA4_ERR_STATUS), 0, 0, 0 },
+        { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA5_ERR_STATUS), 0, 0, 0 },
+        { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA6_ERR_STATUS), 0, 0, 0 },
+        { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA7_ERR_STATUS), 0, 0, 0 },
+};
+
+static void mmhub_v9_4_query_ras_error_status(struct amdgpu_device *adev)
+{
+        int i;
+        uint32_t reg_value;
+
+        if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB))
+                return;
+
+        for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) {
+                reg_value =
+                        RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i]));
+                if (reg_value)
+                        dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n",
+                                        i, reg_value);
+        }
+}
+
 const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = {
         .ras_late_init = amdgpu_mmhub_ras_late_init,
         .query_ras_error_count = mmhub_v9_4_query_ras_error_count,
@@ -1636,4 +1664,5 @@ const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = {
         .set_clockgating = mmhub_v9_4_set_clockgating,
         .get_clockgating = mmhub_v9_4_get_clockgating,
         .setup_vm_pt_regs = mmhub_v9_4_setup_vm_pt_regs,
+        .query_ras_error_status = mmhub_v9_4_query_ras_error_status,
 };
@@ -205,6 +205,8 @@
 #define mmGCEA_EDC_CNT2_BASE_IDX                0
 #define mmGCEA_EDC_CNT3                         0x071b
 #define mmGCEA_EDC_CNT3_BASE_IDX                0
+#define mmGCEA_ERR_STATUS                       0x0712
+#define mmGCEA_ERR_STATUS_BASE_IDX              0
 
 // addressBlock: gc_gfxudec
 // base address: 0x30000
...