Commit fec8c524 authored by Tao Zhou's avatar Tao Zhou Committed by Alex Deucher

drm/amdgpu: save error count in RAS poison handler

Otherwise the RAS error count couldn't be queried from sysfs.
Signed-off-by: default avatarTao Zhou <tao.zhou1@amd.com>
Reviewed-by: default avatarStanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 45e3d1db
...@@ -727,7 +727,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo ...@@ -727,7 +727,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
/* CPU MCA will handle page retirement if connected_to_cpu is 1 */ /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
if (!adev->gmc.xgmi.connected_to_cpu) if (!adev->gmc.xgmi.connected_to_cpu)
amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); amdgpu_umc_poison_handler(adev, &err_data, reset);
else if (reset) else if (reset)
amdgpu_amdkfd_gpu_reset(adev); amdgpu_amdkfd_gpu_reset(adev);
} }
...@@ -23,79 +23,7 @@ ...@@ -23,79 +23,7 @@
#include "amdgpu_ras.h" #include "amdgpu_ras.h"
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry)
{
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
}
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
int r;
struct ras_fs_if fs_info = {
.sysfs_name = "umc_err_count",
};
struct ras_ih_if ih_info = {
.cb = amdgpu_umc_process_ras_data_cb,
};
if (!adev->umc.ras_if) {
adev->umc.ras_if =
kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
if (!adev->umc.ras_if)
return -ENOMEM;
adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
adev->umc.ras_if->sub_block_index = 0;
}
ih_info.head = fs_info.head = *adev->umc.ras_if;
r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
&fs_info, &ih_info);
if (r)
goto free;
if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
if (r)
goto late_fini;
} else {
r = 0;
goto free;
}
/* ras init of specific umc version */
if (adev->umc.ras_funcs &&
adev->umc.ras_funcs->err_cnt_init)
adev->umc.ras_funcs->err_cnt_init(adev);
return 0;
late_fini:
amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
kfree(adev->umc.ras_if);
adev->umc.ras_if = NULL;
return r;
}
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
{
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
adev->umc.ras_if) {
struct ras_common_if *ras_if = adev->umc.ras_if;
struct ras_ih_if ih_info = {
.head = *ras_if,
.cb = amdgpu_umc_process_ras_data_cb,
};
amdgpu_ras_late_fini(adev, ras_if, &ih_info);
kfree(ras_if);
}
}
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status, void *ras_error_status,
struct amdgpu_iv_entry *entry, struct amdgpu_iv_entry *entry,
bool reset) bool reset)
...@@ -180,6 +108,100 @@ int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, ...@@ -180,6 +108,100 @@ int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
return AMDGPU_RAS_SUCCESS; return AMDGPU_RAS_SUCCESS;
} }
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
void *ras_error_status,
bool reset)
{
int ret;
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__UMC,
};
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
ret =
amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
if (ret == AMDGPU_RAS_SUCCESS && obj) {
obj->err_data.ue_count += err_data->ue_count;
obj->err_data.ce_count += err_data->ce_count;
}
return ret;
}
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry)
{
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
}
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
int r;
struct ras_fs_if fs_info = {
.sysfs_name = "umc_err_count",
};
struct ras_ih_if ih_info = {
.cb = amdgpu_umc_process_ras_data_cb,
};
if (!adev->umc.ras_if) {
adev->umc.ras_if =
kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
if (!adev->umc.ras_if)
return -ENOMEM;
adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
adev->umc.ras_if->sub_block_index = 0;
}
ih_info.head = fs_info.head = *adev->umc.ras_if;
r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
&fs_info, &ih_info);
if (r)
goto free;
if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
if (r)
goto late_fini;
} else {
r = 0;
goto free;
}
/* ras init of specific umc version */
if (adev->umc.ras_funcs &&
adev->umc.ras_funcs->err_cnt_init)
adev->umc.ras_funcs->err_cnt_init(adev);
return 0;
late_fini:
amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
kfree(adev->umc.ras_if);
adev->umc.ras_if = NULL;
return r;
}
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
{
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
adev->umc.ras_if) {
struct ras_common_if *ras_if = adev->umc.ras_if;
struct ras_ih_if ih_info = {
.head = *ras_if,
.cb = amdgpu_umc_process_ras_data_cb,
};
amdgpu_ras_late_fini(adev, ras_if, &ih_info);
kfree(ras_if);
}
}
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source, struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
......
...@@ -78,9 +78,8 @@ struct amdgpu_umc { ...@@ -78,9 +78,8 @@ struct amdgpu_umc {
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
void amdgpu_umc_ras_fini(struct amdgpu_device *adev); void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
void *ras_error_status, void *ras_error_status,
struct amdgpu_iv_entry *entry,
bool reset); bool reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source, struct amdgpu_irq_src *source,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment