From: Konstantin Sinyuk Date: Thu, 15 Aug 2024 08:24:24 +0000 (+0300) Subject: accel/habanalabs: disable device access after CPLD_SHUTDOWN X-Git-Tag: v6.18-rc1~134^2~1^2~8 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=083c53a85490b4cf5ed63876b09c067358085e20;p=thirdparty%2Fkernel%2Fstable.git accel/habanalabs: disable device access after CPLD_SHUTDOWN After a CPLD shutdown event the device becomes unusable. Prevent further device access once this event is received. Signed-off-by: Konstantin Sinyuk Reviewed-by: Koby Elbaz Signed-off-by: Koby Elbaz --- diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 80fa08bf57bdc..c6830c8026acf 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1630,6 +1630,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR); reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release; + if (hdev->cpld_shutdown) { + dev_err(hdev->dev, "Cannot reset device, cpld is shutdown! Device is NOT usable\n"); + return -EIO; + } + if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) { dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n"); return 0; @@ -2576,6 +2581,14 @@ void hl_device_fini(struct hl_device *hdev) if (rc) dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc); + /* Reset the H/W (if it is accessible). 
It will be in idle state after this returns */ + if (!hdev->cpld_shutdown) { + rc = hdev->asic_funcs->hw_fini(hdev, true, false); + if (rc) + dev_err(hdev->dev, + "hw_fini failed in device fini while removing device %d\n", rc); + } + hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; /* Release kernel context */ @@ -2943,3 +2956,15 @@ void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *eve mutex_unlock(&clk_throttle->lock); } + +void hl_eq_cpld_shutdown_event_handle(struct hl_device *hdev, u16 event_id, u64 *event_mask) +{ + hl_handle_critical_hw_err(hdev, event_id, event_mask); + *event_mask |= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; + + /* Avoid any new accesses to the H/W */ + hdev->disabled = true; + hdev->cpld_shutdown = true; + hl_cn_hard_reset_prepare(hdev); + hl_cn_stop(hdev); +} diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 122ade172bb49..33aa385c2693f 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3386,6 +3386,7 @@ struct eq_heartbeat_debug_info { * addresses. * @is_in_dram_scrub: true if dram scrub operation is on going. * @disabled: is device disabled. + * @cpld_shutdown: is cpld shutdown. * @late_init_done: is late init stage was done during initialization. * @hwmon_initialized: is H/W monitor sensors was initialized. 
* @reset_on_lockup: true if a reset should be done in case of stuck CS, false @@ -3562,6 +3563,7 @@ struct hl_device { u16 cpu_pci_msb_addr; u8 is_in_dram_scrub; u8 disabled; + u8 cpld_shutdown; u8 late_init_done; u8 hwmon_initialized; u8 reset_on_lockup; @@ -4119,6 +4121,7 @@ void hl_init_cpu_for_irq(struct hl_device *hdev); void hl_set_irq_affinity(struct hl_device *hdev, int irq); void hl_eq_heartbeat_event_handle(struct hl_device *hdev); void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *event_mask); +void hl_eq_cpld_shutdown_event_handle(struct hl_device *hdev, u16 event_id, u64 *event_mask); #ifdef CONFIG_DEBUG_FS