Commit df99ac0f authored by Jesse Zhang's avatar Jesse Zhang Committed by Alex Deucher

drm/amd/amdgpu:Fix compute ring unable to detect hang.

When compute fence did not signal, compute ring cannot detect hardware hang
because its timeout value is set to be infinite by default.

In SR-IOV and passthrough mode, if user does not declare custome timeout
value for compute ring, then use gfx ring timeout value as default. So
that when there is a ture hardware hang, compute ring can detect it.
Signed-off-by: default avatarJesse Zhang <zhexi.zhang@amd.com>
Reviewed-by: default avatarChristian König <christian.koenig@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 90a08351
...@@ -1025,12 +1025,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) ...@@ -1025,12 +1025,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
amdgpu_device_check_block_size(adev); amdgpu_device_check_block_size(adev);
ret = amdgpu_device_get_job_timeout_settings(adev);
if (ret) {
dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
return ret;
}
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
return ret; return ret;
...@@ -2745,6 +2739,12 @@ int amdgpu_device_init(struct amdgpu_device *adev, ...@@ -2745,6 +2739,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r) if (r)
return r; return r;
r = amdgpu_device_get_job_timeout_settings(adev);
if (r) {
dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
return r;
}
/* doorbell bar mapping and doorbell index init*/ /* doorbell bar mapping and doorbell index init*/
amdgpu_device_doorbell_init(adev); amdgpu_device_doorbell_init(adev);
......
...@@ -1341,10 +1341,15 @@ int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) ...@@ -1341,10 +1341,15 @@ int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
/* /*
* By default timeout for non compute jobs is 10000. * By default timeout for non compute jobs is 10000.
* And there is no timeout enforced on compute jobs. * And there is no timeout enforced on compute jobs.
* In SR-IOV or passthrough mode, timeout for compute
* jobs are 10000 by default.
*/ */
adev->gfx_timeout = msecs_to_jiffies(10000); adev->gfx_timeout = msecs_to_jiffies(10000);
adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
adev->compute_timeout = adev->gfx_timeout;
else
adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) {
while ((timeout_setting = strsep(&input, ",")) && while ((timeout_setting = strsep(&input, ",")) &&
......
...@@ -462,18 +462,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, ...@@ -462,18 +462,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
timeout = adev->gfx_timeout; timeout = adev->gfx_timeout;
break; break;
case AMDGPU_RING_TYPE_COMPUTE: case AMDGPU_RING_TYPE_COMPUTE:
/* timeout = adev->compute_timeout;
* For non-sriov case, no timeout enforce
* on compute ring by default. Unless user
* specifies a timeout for compute ring.
*
* For sriov case, always use the timeout
* as gfx ring
*/
if (!amdgpu_sriov_vf(ring->adev))
timeout = adev->compute_timeout;
else
timeout = adev->gfx_timeout;
break; break;
case AMDGPU_RING_TYPE_SDMA: case AMDGPU_RING_TYPE_SDMA:
timeout = adev->sdma_timeout; timeout = adev->sdma_timeout;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment