git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
accel/habanalabs: disable device access after CPLD_SHUTDOWN
author Konstantin Sinyuk <konstantin.sinyuk@intel.com>
Thu, 15 Aug 2024 08:24:24 +0000 (11:24 +0300)
committer Koby Elbaz <koby.elbaz@intel.com>
Thu, 25 Sep 2025 06:09:28 +0000 (09:09 +0300)
After a CPLD shutdown event the device becomes unusable. Prevent further
device access once this event is received.

Signed-off-by: Konstantin Sinyuk <konstantin.sinyuk@intel.com>
Reviewed-by: Koby Elbaz <koby.elbaz@intel.com>
Signed-off-by: Koby Elbaz <koby.elbaz@intel.com>
drivers/accel/habanalabs/common/device.c
drivers/accel/habanalabs/common/habanalabs.h

index 80fa08bf57bdce281ae763fa4427ef127b15ff21..c6830c8026acf5572232356131025edc797d292f 100644 (file)
@@ -1630,6 +1630,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
        from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);
        reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
 
+       if (hdev->cpld_shutdown) {
+               dev_err(hdev->dev, "Cannot reset device, cpld is shutdown! Device is NOT usable\n");
+               return -EIO;
+       }
+
        if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) {
                dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
                return 0;
@@ -2576,6 +2581,14 @@ void hl_device_fini(struct hl_device *hdev)
        if (rc)
                dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
 
+       /* Reset the H/W (if it is accessible). It will be in idle state after this returns */
+       if (!hdev->cpld_shutdown) {
+               rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+               if (rc)
+                       dev_err(hdev->dev,
+                               "hw_fini failed in device fini while removing device %d\n", rc);
+       }
+
        hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
 
        /* Release kernel context */
@@ -2943,3 +2956,15 @@ void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *eve
 
        mutex_unlock(&clk_throttle->lock);
 }
+
+void hl_eq_cpld_shutdown_event_handle(struct hl_device *hdev, u16 event_id, u64 *event_mask)
+{
+       hl_handle_critical_hw_err(hdev, event_id, event_mask);
+       *event_mask |= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
+
+       /* Avoid any new accesses to the H/W, including later resets (see hl_device_reset()) */
+       hdev->disabled = true;
+       hdev->cpld_shutdown = true;
+       hl_cn_hard_reset_prepare(hdev);
+       hl_cn_stop(hdev);
+}
index 122ade172bb498f2967adea7b56170fcff59337f..33aa385c2693f1bb5174d59cae3dcb6ed0a8454e 100644 (file)
@@ -3386,6 +3386,7 @@ struct eq_heartbeat_debug_info {
  *                    addresses.
  * @is_in_dram_scrub: true if dram scrub operation is on going.
  * @disabled: is device disabled.
+ * @cpld_shutdown: is cpld shutdown.
  * @late_init_done: is late init stage was done during initialization.
  * @hwmon_initialized: is H/W monitor sensors was initialized.
  * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
@@ -3562,6 +3563,7 @@ struct hl_device {
        u16                             cpu_pci_msb_addr;
        u8                              is_in_dram_scrub;
        u8                              disabled;
+       u8                              cpld_shutdown;
        u8                              late_init_done;
        u8                              hwmon_initialized;
        u8                              reset_on_lockup;
@@ -4119,6 +4121,7 @@ void hl_init_cpu_for_irq(struct hl_device *hdev);
 void hl_set_irq_affinity(struct hl_device *hdev, int irq);
 void hl_eq_heartbeat_event_handle(struct hl_device *hdev);
 void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *event_mask);
+void hl_eq_cpld_shutdown_event_handle(struct hl_device *hdev, u16 event_id, u64 *event_mask);
 
 #ifdef CONFIG_DEBUG_FS