Commit 2a460963 authored by Candice Li's avatar Candice Li Committed by Alex Deucher

drm/amdgpu: Resolve RAS GFX error count issue after cold boot on Arcturus

Adjust the sequence for ras late init and separate ras reset error status
from query status.

v2: squash in fix from Candice
Signed-off-by: default avatarCandice Li <candice.li@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 28caf8c4
...@@ -594,17 +594,20 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value) ...@@ -594,17 +594,20 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value)
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{ {
int r; int r;
r = amdgpu_ras_block_late_init(adev, ras_block);
if (r)
return r;
if (amdgpu_ras_is_supported(adev, ras_block->block)) { if (amdgpu_ras_is_supported(adev, ras_block->block)) {
if (!amdgpu_persistent_edc_harvesting_supported(adev)) if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX); amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);
r = amdgpu_ras_block_late_init(adev, ras_block);
if (r)
return r;
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0); r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
if (r) if (r)
goto late_fini; goto late_fini;
} else {
amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
} }
return 0; return 0;
......
...@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, ...@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
if (amdgpu_ras_query_error_status(obj->adev, &info)) if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL; return -EINVAL;
/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
}
s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
"ue", info.ue_count, "ue", info.ue_count,
"ce", info.ce_count); "ce", info.ce_count);
...@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, ...@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
if (amdgpu_ras_query_error_status(obj->adev, &info)) if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL; return -EINVAL;
if (obj->adev->asic_type == CHIP_ALDEBARAN) { if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
DRM_WARN("Failed to reset error counter and error status"); dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
} }
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
...@@ -1027,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, ...@@ -1027,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
} }
} }
if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_status(adev, info->head.block);
return 0; return 0;
} }
...@@ -1149,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, ...@@ -1149,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
if (res) if (res)
return res; return res;
if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
if (amdgpu_ras_reset_error_status(adev, info.head.block))
dev_warn(adev->dev, "Failed to reset error counter and error status");
}
ce += info.ce_count; ce += info.ce_count;
ue += info.ue_count; ue += info.ue_count;
} }
...@@ -1792,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) ...@@ -1792,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
continue; continue;
amdgpu_ras_query_error_status(adev, &info); amdgpu_ras_query_error_status(adev, &info);
if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
if (amdgpu_ras_reset_error_status(adev, info.head.block))
dev_warn(adev->dev, "Failed to reset error counter and error status");
}
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment