Commit d1958dce, authored by Farah Kassabri, committed by Oded Gabbay

accel/habanalabs: fix EQ heartbeat mechanism

Stop rescheduling another heartbeat check when the EQ heartbeat check fails,
as it generates confusing logs in dmesg saying that the heartbeat failed.
Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 42422993
...@@ -1044,20 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev) ...@@ -1044,20 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
return (vendor_id == PCI_VENDOR_ID_HABANALABS); return (vendor_id == PCI_VENDOR_ID_HABANALABS);
} }
static void hl_device_eq_heartbeat(struct hl_device *hdev) static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
{ {
u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
struct asic_fixed_properties *prop = &hdev->asic_prop; struct asic_fixed_properties *prop = &hdev->asic_prop;
if (!prop->cpucp_info.eq_health_check_supported) if (!prop->cpucp_info.eq_health_check_supported)
return; return 0;
if (hdev->eq_heartbeat_received) { if (hdev->eq_heartbeat_received) {
hdev->eq_heartbeat_received = false; hdev->eq_heartbeat_received = false;
} else { } else {
dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask); return -EIO;
} }
return 0;
} }
static void hl_device_heartbeat(struct work_struct *work) static void hl_device_heartbeat(struct work_struct *work)
...@@ -1074,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work) ...@@ -1074,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work)
/* /*
* For EQ health check need to check if driver received the heartbeat eq event * For EQ health check need to check if driver received the heartbeat eq event
* in order to validate the eq is working. * in order to validate the eq is working.
* Only if both the EQ is healthy and we managed to send the next heartbeat reschedule.
*/ */
hl_device_eq_heartbeat(hdev); if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev)))
if (!hdev->asic_funcs->send_heartbeat(hdev))
goto reschedule; goto reschedule;
if (hl_device_operational(hdev, NULL)) if (hl_device_operational(hdev, NULL))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment