Commit 6c47a79b authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: perform mode2 reset for sdma fed error on gfx v11_0_3

Perform a mode2 reset when an SDMA FED error is reported on gfx v11_0_3.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 5d062270
...@@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) ...@@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
/* Perform full reset in fatal error mode */ /* Perform full reset in fatal error mode */
if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
else else {
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
reset_context.method = AMD_RESET_METHOD_MODE2;
}
}
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
} }
atomic_set(&ras->in_recovery, 0); atomic_set(&ras->in_recovery, 0);
......
...@@ -339,6 +339,8 @@ enum amdgpu_ras_ret { ...@@ -339,6 +339,8 @@ enum amdgpu_ras_ret {
#define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1) #define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1)
#define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2) #define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2)
#define AMDGPU_RAS_GPU_RESET_MODE2_RESET (0x1 << 0)
struct amdgpu_ras_err_status_reg_entry { struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip; uint32_t hwip;
uint32_t ip_inst; uint32_t ip_inst;
...@@ -427,6 +429,9 @@ struct amdgpu_ras { ...@@ -427,6 +429,9 @@ struct amdgpu_ras {
/* Indicates smu whether need update bad channel info */ /* Indicates smu whether need update bad channel info */
bool update_channel_flag; bool update_channel_flag;
/* Record special requirements of gpu reset caller */
uint32_t gpu_reset_flags;
}; };
struct ras_fs_data { struct ras_fs_data {
......
...@@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev, ...@@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
/* Workaround: when vmid and pasid are both zero, trigger gpu reset in KGD. */ /* Workaround: when vmid and pasid are both zero, trigger gpu reset in KGD. */
if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) && if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
(entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) && (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
!entry->vmid && !entry->pasid) !entry->vmid && !entry->pasid) {
uint32_t rlc_status0 = 0;
rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA0_FED_ERR) ||
REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA1_FED_ERR)) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
}
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
}
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment