Commit 5b1270be authored by Yang Wang, committed by Alex Deucher

drm/amdgpu: add ras_err_info to identify RAS error source

Introduce "ras_err_info" to better identify a RAS error source.

NOTE:
For legacy chips, keep the original RAS error print format.

v1:
RAS errors may come from different dies during a RAS error query,
therefore, need a new data structure to identify the source of RAS ERROR.

v2:
- use new data structure 'amdgpu_smuio_mcm_config_info' instead of
  ras_err_id (in v1 patch)
- refine ras error dump function name
- refine ras error dump log format
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6a1c31c7
This diff is collapsed.
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <linux/list.h> #include <linux/list.h>
#include "ta_ras_if.h" #include "ta_ras_if.h"
#include "amdgpu_ras_eeprom.h" #include "amdgpu_ras_eeprom.h"
#include "amdgpu_smuio.h"
struct amdgpu_iv_entry; struct amdgpu_iv_entry;
...@@ -443,13 +444,29 @@ struct ras_fs_data { ...@@ -443,13 +444,29 @@ struct ras_fs_data {
char debugfs_name[32]; char debugfs_name[32];
}; };
/*
 * Per-source RAS error statistics. One instance exists for each distinct
 * error source (socket/die pair), so errors reported by different dies
 * during a single RAS query can be told apart.
 */
struct ras_err_info {
/* which socket/die reported these errors */
struct amdgpu_smuio_mcm_config_info mcm_info;
/* correctable error count attributed to this source */
u64 ce_count;
/* uncorrectable error count attributed to this source */
u64 ue_count;
};
/*
 * List wrapper around one ras_err_info; nodes are linked into
 * ras_err_data::err_node_list and walked with for_each_ras_error().
 */
struct ras_err_node {
struct list_head node; /* linkage into err_node_list */
struct ras_err_info err_info; /* per-source error counts */
};
struct ras_err_data { struct ras_err_data {
unsigned long ue_count; unsigned long ue_count;
unsigned long ce_count; unsigned long ce_count;
unsigned long err_addr_cnt; unsigned long err_addr_cnt;
struct eeprom_table_record *err_addr; struct eeprom_table_record *err_addr;
u32 err_list_count;
struct list_head err_node_list;
}; };
/*
 * Iterate over every per-source error entry in @err_data.
 * @err_node: struct ras_err_node * cursor; @err_data: struct ras_err_data *.
 */
#define for_each_ras_error(err_node, err_data) \
list_for_each_entry(err_node, &(err_data)->err_node_list, node)
struct ras_err_handler_data { struct ras_err_handler_data {
/* point to bad page records array */ /* point to bad page records array */
struct eeprom_table_record *bps; struct eeprom_table_record *bps;
...@@ -773,4 +790,12 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, ...@@ -773,4 +790,12 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
const struct amdgpu_ras_err_status_reg_entry *reg_list, const struct amdgpu_ras_err_status_reg_entry *reg_list,
uint32_t reg_list_size, uint32_t reg_list_size,
uint32_t instance); uint32_t instance);
/*
 * Init/teardown for the per-source error list embedded in ras_err_data.
 * _init must succeed before the err_data is used; _fini releases every
 * ras_err_node on err_node_list (callers in this patch pair them with
 * goto-style cleanup).
 */
int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
/*
 * Record @count correctable (ce) / uncorrectable (ue) errors against the
 * source identified by @mcm_info. Returns 0 on success, negative errno
 * otherwise — presumably allocating a node on first sight of a source;
 * confirm against the implementation in amdgpu_ras.c.
 */
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
#endif #endif
...@@ -30,6 +30,11 @@ enum amdgpu_pkg_type { ...@@ -30,6 +30,11 @@ enum amdgpu_pkg_type {
AMDGPU_PKG_TYPE_UNKNOWN, AMDGPU_PKG_TYPE_UNKNOWN,
}; };
/*
 * Physical location of a RAS error source on a multi-chip module:
 * the socket and the die within that socket that reported the error.
 */
struct amdgpu_smuio_mcm_config_info {
int socket_id; /* socket the report came from */
int die_id; /* die within that socket */
};
struct amdgpu_smuio_funcs { struct amdgpu_smuio_funcs {
u32 (*get_rom_index_offset)(struct amdgpu_device *adev); u32 (*get_rom_index_offset)(struct amdgpu_device *adev);
u32 (*get_rom_data_offset)(struct amdgpu_device *adev); u32 (*get_rom_data_offset)(struct amdgpu_device *adev);
......
...@@ -45,8 +45,12 @@ static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, ...@@ -45,8 +45,12 @@ static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
{ {
struct ras_err_data err_data = {0, 0, 0, NULL}; struct ras_err_data err_data;
int ret = AMDGPU_RAS_FAIL; int ret;
ret = amdgpu_ras_error_data_init(&err_data);
if (ret)
return ret;
err_data.err_addr = err_data.err_addr =
kcalloc(adev->umc.max_ras_err_cnt_per_query, kcalloc(adev->umc.max_ras_err_cnt_per_query,
...@@ -54,7 +58,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, ...@@ -54,7 +58,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
if (!err_data.err_addr) { if (!err_data.err_addr) {
dev_warn(adev->dev, dev_warn(adev->dev,
"Failed to alloc memory for umc error record in MCA notifier!\n"); "Failed to alloc memory for umc error record in MCA notifier!\n");
return AMDGPU_RAS_FAIL; ret = AMDGPU_RAS_FAIL;
goto out_fini_err_data;
} }
/* /*
...@@ -63,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, ...@@ -63,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
ch_inst, umc_inst); ch_inst, umc_inst);
if (ret) if (ret)
goto out; goto out_free_err_addr;
if (amdgpu_bad_page_threshold != 0) { if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr, amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
...@@ -71,8 +76,12 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, ...@@ -71,8 +76,12 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
amdgpu_ras_save_bad_pages(adev, NULL); amdgpu_ras_save_bad_pages(adev, NULL);
} }
out: out_free_err_addr:
kfree(err_data.err_addr); kfree(err_data.err_addr);
out_fini_err_data:
amdgpu_ras_error_data_fini(&err_data);
return ret; return ret;
} }
...@@ -182,18 +191,24 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) ...@@ -182,18 +191,24 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
} }
if (!amdgpu_sriov_vf(adev)) { if (!amdgpu_sriov_vf(adev)) {
struct ras_err_data err_data = {0, 0, 0, NULL}; struct ras_err_data err_data;
struct ras_common_if head = { struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__UMC, .block = AMDGPU_RAS_BLOCK__UMC,
}; };
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
ret = amdgpu_ras_error_data_init(&err_data);
if (ret)
return ret;
ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
if (ret == AMDGPU_RAS_SUCCESS && obj) { if (ret == AMDGPU_RAS_SUCCESS && obj) {
obj->err_data.ue_count += err_data.ue_count; obj->err_data.ue_count += err_data.ue_count;
obj->err_data.ce_count += err_data.ce_count; obj->err_data.ce_count += err_data.ce_count;
} }
amdgpu_ras_error_data_fini(&err_data);
} else { } else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler) if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
adev->virt.ops->ras_poison_handler(adev); adev->virt.ops->ras_poison_handler(adev);
......
...@@ -365,9 +365,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device ...@@ -365,9 +365,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
{ {
uint32_t bif_doorbell_intr_cntl; uint32_t bif_doorbell_intr_cntl;
struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
struct ras_err_data err_data = {0, 0, 0, NULL}; struct ras_err_data err_data;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
if (amdgpu_ras_error_data_init(&err_data))
return;
if (adev->asic_type == CHIP_ALDEBARAN) if (adev->asic_type == CHIP_ALDEBARAN)
bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL_ALDE); bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL_ALDE);
else else
...@@ -418,6 +421,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device ...@@ -418,6 +421,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
*/ */
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
} }
amdgpu_ras_error_data_fini(&err_data);
} }
static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
......
...@@ -560,9 +560,12 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device ...@@ -560,9 +560,12 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device
{ {
uint32_t bif_doorbell_intr_cntl; uint32_t bif_doorbell_intr_cntl;
struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
struct ras_err_data err_data = {0, 0, 0, NULL}; struct ras_err_data err_data;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
if (amdgpu_ras_error_data_init(&err_data))
return;
bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
if (REG_GET_FIELD(bif_doorbell_intr_cntl, if (REG_GET_FIELD(bif_doorbell_intr_cntl,
...@@ -607,6 +610,8 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device ...@@ -607,6 +610,8 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device
*/ */
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
} }
amdgpu_ras_error_data_fini(&err_data);
} }
static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.