Commit 4d9f771e authored by Luben Tuikov, committed by Alex Deucher

drm/amdgpu: Return error if no RAS

In amdgpu_ras_query_error_count(), return an error
if the device doesn't support RAS. With an error
return available, the function no longer has to
set the values behind the integer pointers (if
set) regardless of whether RAS is supported or
not. That unconditional write was a side effect
of the old interface; with this change the side
effect is mitigated.

Also, if no pointers are set, don't count, since
we've no way of reporting the counts.

Also, give this function a kernel-doc.

Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Reported-by: Tom Rix <trix@redhat.com>
Fixes: a46751fb ("drm/amdgpu: Fix RAS function interface")
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
Reviewed-by: Alexander Deucher <Alexander.Deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent b5840166
...@@ -1047,8 +1047,18 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ...@@ -1047,8 +1047,18 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
return ret; return ret;
} }
/* get the total error counts on all IPs */ /**
void amdgpu_ras_query_error_count(struct amdgpu_device *adev, * amdgpu_ras_query_error_count -- Get error counts of all IPs
* @adev: pointer to AMD GPU device
* @ce_count: pointer to an integer to be set to the count of correctable errors.
* @ue_count: pointer to an integer to be set to the count of uncorrectable
* errors.
*
* If set, @ce_count or @ue_count, count and return the corresponding
* error counts in those integer pointers. Return 0 if the device
* supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
*/
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
unsigned long *ce_count, unsigned long *ce_count,
unsigned long *ue_count) unsigned long *ue_count)
{ {
...@@ -1057,7 +1067,12 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, ...@@ -1057,7 +1067,12 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
unsigned long ce, ue; unsigned long ce, ue;
if (!adev->ras_enabled || !con) if (!adev->ras_enabled || !con)
return; return -EOPNOTSUPP;
/* Don't count since no reporting.
*/
if (!ce_count && !ue_count)
return 0;
ce = 0; ce = 0;
ue = 0; ue = 0;
...@@ -1065,9 +1080,11 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, ...@@ -1065,9 +1080,11 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
struct ras_query_if info = { struct ras_query_if info = {
.head = obj->head, .head = obj->head,
}; };
int res;
if (amdgpu_ras_query_error_status(adev, &info)) res = amdgpu_ras_query_error_status(adev, &info);
return; if (res)
return res;
ce += info.ce_count; ce += info.ce_count;
ue += info.ue_count; ue += info.ue_count;
...@@ -1078,6 +1095,8 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, ...@@ -1078,6 +1095,8 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
if (ue_count) if (ue_count)
*ue_count = ue; *ue_count = ue;
return 0;
} }
/* query/inject/cure end */ /* query/inject/cure end */
...@@ -2145,9 +2164,10 @@ static void amdgpu_ras_counte_dw(struct work_struct *work) ...@@ -2145,9 +2164,10 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
/* Cache new values. /* Cache new values.
*/ */
amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
atomic_set(&con->ras_ce_count, ce_count); atomic_set(&con->ras_ce_count, ce_count);
atomic_set(&con->ras_ue_count, ue_count); atomic_set(&con->ras_ue_count, ue_count);
}
pm_runtime_mark_last_busy(dev->dev); pm_runtime_mark_last_busy(dev->dev);
Out: Out:
...@@ -2320,9 +2340,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, ...@@ -2320,9 +2340,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
/* Those are the cached values at init. /* Those are the cached values at init.
*/ */
amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
atomic_set(&con->ras_ce_count, ce_count); atomic_set(&con->ras_ce_count, ce_count);
atomic_set(&con->ras_ue_count, ue_count); atomic_set(&con->ras_ue_count, ue_count);
}
return 0; return 0;
cleanup: cleanup:
......
...@@ -491,7 +491,7 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, ...@@ -491,7 +491,7 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
void amdgpu_ras_resume(struct amdgpu_device *adev); void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev); void amdgpu_ras_suspend(struct amdgpu_device *adev);
void amdgpu_ras_query_error_count(struct amdgpu_device *adev, int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
unsigned long *ce_count, unsigned long *ce_count,
unsigned long *ue_count); unsigned long *ue_count);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment