git.ipfire.org Git - thirdparty/linux.git/commitdiff
accel/habanalabs: fix EQ heartbeat mechanism
author: Farah Kassabri <fkassabri@habana.ai>
Tue, 31 Oct 2023 10:20:36 +0000 (12:20 +0200)
committer: Oded Gabbay <ogabbay@kernel.org>
Tue, 19 Dec 2023 09:09:43 +0000 (11:09 +0200)
Stop rescheduling another heartbeat check when the EQ heartbeat check fails,
as it generates confusing logs in dmesg saying that the heartbeat fails.

Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/accel/habanalabs/common/device.c

index d9447aeb3937e6c039d5f365853b129ee58a0665..6bf5f1d0d0057bc667b3ed2b04fa133bd63be330 100644 (file)
@@ -1044,20 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
        return (vendor_id == PCI_VENDOR_ID_HABANALABS);
 }
 
-static void hl_device_eq_heartbeat(struct hl_device *hdev)
+static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
 {
-       u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
        struct asic_fixed_properties *prop = &hdev->asic_prop;
 
        if (!prop->cpucp_info.eq_health_check_supported)
-               return;
+               return 0;
 
        if (hdev->eq_heartbeat_received) {
                hdev->eq_heartbeat_received = false;
        } else {
                dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
-               hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
+               return -EIO;
        }
+
+       return 0;
 }
 
 static void hl_device_heartbeat(struct work_struct *work)
@@ -1074,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work)
        /*
         * For EQ health check need to check if driver received the heartbeat eq event
         * in order to validate the eq is working.
+        * Only if both the EQ is healthy and we managed to send the next heartbeat reschedule.
         */
-       hl_device_eq_heartbeat(hdev);
-
-       if (!hdev->asic_funcs->send_heartbeat(hdev))
+       if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev)))
                goto reschedule;
 
        if (hl_device_operational(hdev, NULL))