Commit acd89fca authored by Andrey Grodzovsky's avatar Andrey Grodzovsky Committed by Alex Deucher

drm/amdgpu: Block all job scheduling activity during DPC recovery

DPC recovery involves ASIC reset just as normal GPU recovery so block
SW GPU schedulers and wait on all concurrent GPU resets.
Signed-off-by: default avatarAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Acked-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent bf36b52e
......@@ -4745,6 +4745,20 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
return 0;
}
static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
{
int i;
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
if (!ring || !ring->sched.thread)
continue;
cancel_delayed_work_sync(&ring->sched.work_tdr);
}
}
/**
* amdgpu_pci_error_detected - Called when a PCI error is detected.
* @pdev: PCI device struct
......@@ -4758,15 +4772,37 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
{
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
int i;
DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
switch (state) {
case pci_channel_io_normal:
return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen:
/* Fatal error, prepare for slot reset */
amdgpu_device_lock_adev(adev);
case pci_channel_io_frozen:
/*
* Cancel and wait for all TDRs in progress if failing to
* set adev->in_gpu_reset in amdgpu_device_lock_adev
*
* Locking adev->reset_sem will prevent any external access
* to GPU during PCI error recovery
*/
while (!amdgpu_device_lock_adev(adev, NULL))
amdgpu_cancel_all_tdr(adev);
/*
* Block any work scheduling as we do for regular GPU reset
* for the duration of the recovery
*/
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
if (!ring || !ring->sched.thread)
continue;
drm_sched_stop(&ring->sched, NULL);
}
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
/* Permanent error, prepare for device removal */
......@@ -4899,8 +4935,21 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
int i;
amdgpu_device_unlock_adev(adev);
DRM_INFO("PCI error: resume callback!!\n");
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
if (!ring || !ring->sched.thread)
continue;
drm_sched_resubmit_jobs(&ring->sched);
drm_sched_start(&ring->sched, true);
}
amdgpu_device_unlock_adev(adev);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment