Commit 04c4fcd2 authored by Yang Wang, committed by Alex Deucher

drm/amdgpu: add amdgpu ras aca query interface

v1:
add ACA error query interface

v2:
Add a new helper function to determine whether to use ACA or MCA.
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 0c54e457
......@@ -577,6 +577,9 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
{
struct amdgpu_aca *aca = &adev->aca;
if (!amdgpu_aca_is_enabled(adev))
return 0;
return add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data);
}
......@@ -613,6 +616,11 @@ static void aca_manager_fini(struct aca_handle_manager *mgr)
remove_aca(handle);
}
bool amdgpu_aca_is_enabled(struct amdgpu_device *adev)
{
return adev->aca.is_enabled;
}
int amdgpu_aca_init(struct amdgpu_device *adev)
{
struct amdgpu_aca *aca = &adev->aca;
......
......@@ -172,6 +172,7 @@ struct aca_smu_funcs {
/* Per-device ACA state embedded in struct amdgpu_device (adev->aca). */
struct amdgpu_aca {
struct aca_handle_manager mgr; /* list/manager of registered ACA handles */
const struct aca_smu_funcs *smu_funcs; /* SMU callbacks (e.g. set_debug_mode); NULL until registered */
bool is_enabled; /* true when the ACA path is used instead of legacy MCA; defaults to false */
};
struct aca_info {
......@@ -183,6 +184,7 @@ struct aca_info {
int amdgpu_aca_init(struct amdgpu_device *adev);
void amdgpu_aca_fini(struct amdgpu_device *adev);
void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs);
bool amdgpu_aca_is_enabled(struct amdgpu_device *adev);
int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info);
int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size);
......
......@@ -1167,6 +1167,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
}
}
static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
{
struct ras_common_if head;
memset(&head, 0, sizeof(head));
head.block = blk;
return amdgpu_ras_find_obj(adev, &head);
}
/*
 * amdgpu_ras_bind_aca - attach an ACA handle to a RAS block
 * @adev: amdgpu device handle
 * @blk: RAS block to bind
 * @aca_info: ACA bank operations/info for this block
 * @data: opaque per-handle data stored with the handle
 *
 * Returns -EINVAL when the block has no ras_manager object, otherwise the
 * result of amdgpu_aca_add_handle().
 */
int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
			const struct aca_info *aca_info, void *data)
{
	struct ras_manager *obj = get_ras_manager(adev, blk);

	if (!obj)
		return -EINVAL;

	return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data);
}
/*
 * amdgpu_ras_unbind_aca - detach the ACA handle from a RAS block
 * @adev: amdgpu device handle
 * @blk: RAS block to unbind
 *
 * Returns -EINVAL when the block has no ras_manager object, 0 on success.
 */
int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
{
	struct ras_manager *obj = get_ras_manager(adev, blk);

	if (!obj)
		return -EINVAL;

	amdgpu_aca_remove_handle(&obj->aca_handle);

	return 0;
}
/*
 * amdgpu_aca_log_ras_error_data - fetch ACA error data for one RAS block
 * @adev: amdgpu device handle
 * @blk: RAS block to query
 * @type: ACA error type (UE/CE) to collect
 * @err_data: output error-data accumulator
 *
 * Returns -EINVAL when the block has no ras_manager object, otherwise the
 * result of amdgpu_aca_get_error_data().
 */
static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
					 enum aca_error_type type, struct ras_err_data *err_data)
{
	struct ras_manager *obj = get_ras_manager(adev, blk);

	if (!obj)
		return -EINVAL;

	return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data);
}
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info,
struct ras_err_data *err_data,
......@@ -1174,6 +1221,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
{
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL;
int ret;
if (blk == AMDGPU_RAS_BLOCK_COUNT)
return -EINVAL;
......@@ -1202,11 +1250,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
block_obj->hw_ops->query_ras_error_status(adev);
}
}
} else {
if (amdgpu_aca_is_enabled(adev)) {
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
if (ret)
return ret;
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
if (ret)
return ret;
} else {
/* FIXME: add code to check return value later */
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
}
}
return 0;
}
......@@ -1254,6 +1312,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
struct amdgpu_hive_info *hive;
int hive_ras_recovery = 0;
......@@ -1264,7 +1323,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
}
if (!amdgpu_ras_is_supported(adev, block) ||
!amdgpu_ras_get_mca_debug_mode(adev))
!amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP;
hive = amdgpu_get_xgmi_hive(adev);
......@@ -1276,7 +1335,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
/* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
hive_ras_recovery) &&
mca_funcs && mca_funcs->mca_set_debug_mode)
((smu_funcs && smu_funcs->set_debug_mode) ||
(mca_funcs && mca_funcs->mca_set_debug_mode)))
return -EOPNOTSUPP;
if (block_obj->hw_ops->reset_ras_error_count)
......@@ -1772,6 +1832,9 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
}
}
if (amdgpu_aca_is_enabled(adev))
amdgpu_aca_smu_debugfs_init(adev, dir);
else
amdgpu_mca_smu_debugfs_init(adev, dir);
}
......@@ -2753,6 +2816,9 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
/* aca is disabled by default */
adev->aca.is_enabled = false;
}
static void amdgpu_ras_counte_dw(struct work_struct *work)
......@@ -3142,6 +3208,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
if (amdgpu_aca_is_enabled(adev))
amdgpu_ras_set_aca_debug_mode(adev, false);
else
amdgpu_ras_set_mca_debug_mode(adev, false);
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
......@@ -3425,7 +3494,7 @@ int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
if (con) {
ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
if (!ret)
con->is_mca_debug_mode = enable;
con->is_aca_debug_mode = enable;
}
return ret;
......@@ -3437,24 +3506,29 @@ int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable)
int ret = 0;
if (con) {
if (amdgpu_aca_is_enabled(adev))
ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
else
ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
if (!ret)
con->is_mca_debug_mode = enable;
con->is_aca_debug_mode = enable;
}
return ret;
}
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!con)
return false;
if (mca_funcs && mca_funcs->mca_set_debug_mode)
return con->is_mca_debug_mode;
if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) ||
(!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode))
return con->is_aca_debug_mode;
else
return true;
}
......@@ -3464,15 +3538,16 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
if (!con) {
*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
return false;
}
if (mca_funcs && mca_funcs->mca_set_debug_mode)
if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode))
*error_query_mode =
(con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
(con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
else
*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
......
......@@ -455,7 +455,7 @@ struct amdgpu_ras {
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
/* Record status of smu mca debug mode */
bool is_mca_debug_mode;
bool is_aca_debug_mode;
/* Record special requirements of gpu reset caller */
uint32_t gpu_reset_flags;
......@@ -543,6 +543,8 @@ struct ras_manager {
struct ras_ih_data ih_data;
struct ras_err_data err_data;
struct aca_handle aca_handle;
};
struct ras_badpage {
......@@ -794,9 +796,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *mode);
......@@ -834,4 +836,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count);
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances);
int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
const struct aca_info *aca_info, void *data);
int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk);
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment