Commit d261b0ab authored by Ofir Bitton, committed by Oded Gabbay

accel/habanalabs/gaudi2: include block id in ECC error reporting

During ECC event handling, Memory wrapper id was mistakenly
printed as block id. Fix the print and in addition fetch the actual
block-id from firmware.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 10d260f6
...@@ -7834,16 +7834,29 @@ static void gaudi2_print_event(struct hl_device *hdev, u16 event_type, ...@@ -7834,16 +7834,29 @@ static void gaudi2_print_event(struct hl_device *hdev, u16 event_type,
static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
struct hl_eq_ecc_data *ecc_data) struct hl_eq_ecc_data *ecc_data)
{ {
u64 ecc_address = 0, ecc_syndrom = 0; u64 ecc_address = 0, ecc_syndrome = 0;
u8 memory_wrapper_idx = 0; u8 memory_wrapper_idx = 0;
bool has_block_id = false;
u16 block_id;
if (!hl_is_fw_sw_ver_below(hdev, 1, 12))
has_block_id = true;
ecc_address = le64_to_cpu(ecc_data->ecc_address); ecc_address = le64_to_cpu(ecc_data->ecc_address);
ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom); ecc_syndrome = le64_to_cpu(ecc_data->ecc_syndrom);
memory_wrapper_idx = ecc_data->memory_wrapper_idx; memory_wrapper_idx = ecc_data->memory_wrapper_idx;
gaudi2_print_event(hdev, event_type, !ecc_data->is_critical, if (has_block_id) {
"ECC error detected. address: %#llx. Syndrom: %#llx. block id %u. critical %u.", block_id = le16_to_cpu(ecc_data->block_id);
ecc_address, ecc_syndrom, memory_wrapper_idx, ecc_data->is_critical); gaudi2_print_event(hdev, event_type, !ecc_data->is_critical,
"ECC error detected. address: %#llx. Syndrome: %#llx. wrapper id %u. block id %#x. critical %u.",
ecc_address, ecc_syndrome, memory_wrapper_idx, block_id,
ecc_data->is_critical);
} else {
gaudi2_print_event(hdev, event_type, !ecc_data->is_critical,
"ECC error detected. address: %#llx. Syndrome: %#llx. wrapper id %u. critical %u.",
ecc_address, ecc_syndrome, memory_wrapper_idx, ecc_data->is_critical);
}
return !!ecc_data->is_critical; return !!ecc_data->is_critical;
} }
......
...@@ -69,7 +69,8 @@ struct hl_eq_ecc_data { ...@@ -69,7 +69,8 @@ struct hl_eq_ecc_data {
__le64 ecc_syndrom; __le64 ecc_syndrom;
__u8 memory_wrapper_idx; __u8 memory_wrapper_idx;
__u8 is_critical; __u8 is_critical;
__u8 pad[6]; __le16 block_id;
__u8 pad[4];
}; };
enum hl_sm_sei_cause { enum hl_sm_sei_cause {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment