Commit b823821f authored by Le Ma's avatar Le Ma Committed by Alex Deucher

drm/amdgpu: support full gpu reset workflow when ras err_event_athub occurs

This athub fatal error can be recovered by baco without system-level reboot,
so add a mode to use baco for the recovery. Not affect the default psp reset
situations for now.
Signed-off-by: default avatarLe Ma <le.ma@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent ce316fa5
...@@ -4018,12 +4018,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4018,12 +4018,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_device *tmp_adev = NULL; struct amdgpu_device *tmp_adev = NULL;
int i, r = 0; int i, r = 0;
bool in_ras_intr = amdgpu_ras_intr_triggered(); bool in_ras_intr = amdgpu_ras_intr_triggered();
bool use_baco =
(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
true : false;
/* /*
* Flush RAM to disk so that after reboot * Flush RAM to disk so that after reboot
* the user can read log and see why the system rebooted. * the user can read log and see why the system rebooted.
*/ */
if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) { if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
DRM_WARN("Emergency reboot."); DRM_WARN("Emergency reboot.");
...@@ -4034,7 +4037,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4034,7 +4037,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
need_full_reset = job_signaled = false; need_full_reset = job_signaled = false;
INIT_LIST_HEAD(&device_list); INIT_LIST_HEAD(&device_list);
dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset"); dev_info(adev->dev, "GPU %s begin!\n",
(in_ras_intr && !use_baco) ? "jobs stop":"reset");
cancel_delayed_work_sync(&adev->delayed_init_work); cancel_delayed_work_sync(&adev->delayed_init_work);
...@@ -4101,7 +4105,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4101,7 +4105,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
amdgpu_unregister_gpu_instance(tmp_adev); amdgpu_unregister_gpu_instance(tmp_adev);
/* disable ras on ALL IPs */ /* disable ras on ALL IPs */
if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev)) if (!(in_ras_intr && !use_baco) &&
amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev); amdgpu_ras_suspend(tmp_adev);
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
...@@ -4112,13 +4117,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4112,13 +4117,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
drm_sched_stop(&ring->sched, job ? &job->base : NULL); drm_sched_stop(&ring->sched, job ? &job->base : NULL);
if (in_ras_intr) if (in_ras_intr && !use_baco)
amdgpu_job_stop_all_jobs_on_sched(&ring->sched); amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
} }
} }
if (in_ras_intr) if (in_ras_intr && !use_baco)
goto skip_sched_resume; goto skip_sched_resume;
/* /*
...@@ -4212,7 +4217,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4212,7 +4217,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
skip_sched_resume: skip_sched_resume:
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/*unlock kfd: SRIOV would do it separately */ /*unlock kfd: SRIOV would do it separately */
if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev)) if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_post_reset(tmp_adev); amdgpu_amdkfd_post_reset(tmp_adev);
amdgpu_device_unlock_adev(tmp_adev); amdgpu_device_unlock_adev(tmp_adev);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment