Commit bb5c7235 authored by Wenhui Sheng's avatar Wenhui Sheng Committed by Alex Deucher

drm/amdgpu: RAS emergency restart logic refine

If we are in a RAS-triggered situation and
BACO isn't supported, an emergency restart is needed,
and this code is only needed for some specific
cases (vega20 with a given smu fw version).

After we add smu mode1 reset for sienna cichlid, we
need to share AMD_RESET_METHOD_MODE1 with psp mode1 reset,
so in amdgpu_device_gpu_recover we would need to differentiate
which mode1 reset we are using, then decide whether it's
a full reset, and then decide whether an emergency restart is
needed; the logic would become much more complex.

After discussion with Hawking, move emergency restart logic
to an independent function.
Signed-off-by: Likun Gao <Likun.Gao@amd.com>
Signed-off-by: Wenhui Sheng <Wenhui.Sheng@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent ea8139d8
...@@ -4245,18 +4245,19 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4245,18 +4245,19 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive = NULL; struct amdgpu_hive_info *hive = NULL;
struct amdgpu_device *tmp_adev = NULL; struct amdgpu_device *tmp_adev = NULL;
int i, r = 0; int i, r = 0;
bool in_ras_intr = amdgpu_ras_intr_triggered(); bool need_emergency_restart = false;
bool use_baco =
(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
true : false;
bool audio_suspended = false; bool audio_suspended = false;
/**
* Special case: RAS triggered and full reset isn't supported
*/
need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
/* /*
* Flush RAM to disk so that after reboot * Flush RAM to disk so that after reboot
* the user can read log and see why the system rebooted. * the user can read log and see why the system rebooted.
*/ */
if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) { if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
DRM_WARN("Emergency reboot."); DRM_WARN("Emergency reboot.");
ksys_sync_helper(); ksys_sync_helper();
...@@ -4264,7 +4265,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4264,7 +4265,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
} }
dev_info(adev->dev, "GPU %s begin!\n", dev_info(adev->dev, "GPU %s begin!\n",
(in_ras_intr && !use_baco) ? "jobs stop":"reset"); need_emergency_restart ? "jobs stop":"reset");
/* /*
* Here we trylock to avoid chain of resets executing from * Here we trylock to avoid chain of resets executing from
...@@ -4336,7 +4337,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4336,7 +4337,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
amdgpu_fbdev_set_suspend(tmp_adev, 1); amdgpu_fbdev_set_suspend(tmp_adev, 1);
/* disable ras on ALL IPs */ /* disable ras on ALL IPs */
if (!(in_ras_intr && !use_baco) && if (!need_emergency_restart &&
amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev); amdgpu_ras_suspend(tmp_adev);
...@@ -4348,12 +4349,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4348,12 +4349,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
drm_sched_stop(&ring->sched, job ? &job->base : NULL); drm_sched_stop(&ring->sched, job ? &job->base : NULL);
if (in_ras_intr && !use_baco) if (need_emergency_restart)
amdgpu_job_stop_all_jobs_on_sched(&ring->sched); amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
} }
} }
if (in_ras_intr && !use_baco) if (need_emergency_restart)
goto skip_sched_resume; goto skip_sched_resume;
/* /*
...@@ -4430,7 +4431,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4430,7 +4431,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
skip_sched_resume: skip_sched_resume:
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/*unlock kfd: SRIOV would do it separately */ /*unlock kfd: SRIOV would do it separately */
if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev)) if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_post_reset(tmp_adev); amdgpu_amdkfd_post_reset(tmp_adev);
if (audio_suspended) if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev); amdgpu_device_resume_display_audio(tmp_adev);
......
...@@ -2131,3 +2131,14 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) ...@@ -2131,3 +2131,14 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
} }
} }
/*
 * amdgpu_ras_need_emergency_restart - decide whether an emergency restart
 * is required for this device.
 *
 * @adev: device to evaluate
 *
 * Only a CHIP_VEGA20 device whose adev->pm.fw_version is at or below
 * 0x283400 can yield true. For such a device the result is true when the
 * ASIC reset method is not AMD_RESET_METHOD_BACO and
 * amdgpu_ras_intr_triggered() reports true. Every other device yields false.
 *
 * Return: true if an emergency restart is needed, false otherwise.
 */
bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
{
	/* Not the affected ASIC / firmware combination: nothing special to do. */
	if (adev->asic_type != CHIP_VEGA20 || adev->pm.fw_version > 0x283400)
		return false;

	/* With BACO as the reset method no emergency restart is requested. */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
		return false;

	return amdgpu_ras_intr_triggered();
}
...@@ -633,4 +633,5 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); ...@@ -633,4 +633,5 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready); void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);
bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev);
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment