git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
accel/ivpu: Perform engine reset instead of device recovery on TDR
Author: Karol Wachowski <karol.wachowski@linux.intel.com>
Wed, 18 Mar 2026 09:39:27 +0000 (10:39 +0100)
Committer: Karol Wachowski <karol.wachowski@linux.intel.com>
Fri, 20 Mar 2026 07:03:11 +0000 (08:03 +0100)
Replace full device recovery on TDR timeout with per-context abort,
allowing individual context handling instead of resetting the entire
device.

Extend ivpu_jsm_reset_engine() to return the list of contexts impacted
by the engine reset and use that information to abort only the affected
contexts.

Only check for potentially faulty contexts when the engine reset was not
triggered by an MMU fault or a job completion error status. This prevents
misidentifying non-guilty contexts that happened to be running at the
time of the fault.

Trigger full device recovery if the engine reset was triggered by a job
completion timeout but no contexts were marked as impacted by it, as in
that case there is no way to identify the guilty context.

Add an engine reset counter to debugfs for engine reset bookkeeping,
for debugging and testing purposes.

Reviewed-by: Lizhi Hou <lizhi.hou@amd.com>
Signed-off-by: Karol Wachowski <karol.wachowski@linux.intel.com>
Link: https://patch.msgid.link/20260318093927.4080303-1-karol.wachowski@linux.intel.com
drivers/accel/ivpu/ivpu_debugfs.c
drivers/accel/ivpu/ivpu_drv.c
drivers/accel/ivpu/ivpu_drv.h
drivers/accel/ivpu/ivpu_job.c
drivers/accel/ivpu/ivpu_jsm_msg.c
drivers/accel/ivpu/ivpu_jsm_msg.h
drivers/accel/ivpu/ivpu_mmu.c
drivers/accel/ivpu/ivpu_pm.c
drivers/accel/ivpu/ivpu_pm.h

index a09f54fc43020686f3c65b0cdfa8ec74ced98722..189dbe94cf14109974945a15d61bc96f4ec70fcc 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #include <linux/debugfs.h>
@@ -127,6 +127,14 @@ static int firewall_irq_counter_show(struct seq_file *s, void *v)
        return 0;
 }
 
+static int engine_reset_counter_show(struct seq_file *s, void *v)
+{
+       struct ivpu_device *vdev = seq_to_ivpu(s);
+
+       seq_printf(s, "%d\n", atomic_read(&vdev->pm->engine_reset_counter));
+       return 0;
+}
+
 static const struct drm_debugfs_info vdev_debugfs_list[] = {
        {"bo_list", bo_list_show, 0},
        {"fw_name", fw_name_show, 0},
@@ -137,6 +145,7 @@ static const struct drm_debugfs_info vdev_debugfs_list[] = {
        {"reset_counter", reset_counter_show, 0},
        {"reset_pending", reset_pending_show, 0},
        {"firewall_irq_counter", firewall_irq_counter_show, 0},
+       {"engine_reset_counter", engine_reset_counter_show, 0},
 };
 
 static int dvfs_mode_get(void *data, u64 *dvfs_mode)
@@ -352,8 +361,9 @@ static const struct file_operations ivpu_force_recovery_fops = {
 static int ivpu_reset_engine_fn(void *data, u64 val)
 {
        struct ivpu_device *vdev = (struct ivpu_device *)data;
+       struct vpu_jsm_msg resp;
 
-       return ivpu_jsm_reset_engine(vdev, (u32)val);
+       return ivpu_jsm_reset_engine(vdev, (u32)val, &resp);
 }
 
 DEFINE_DEBUGFS_ATTRIBUTE(ivpu_reset_engine_fops, NULL, ivpu_reset_engine_fn, "0x%02llx\n");
index dd3a486df5f1629b4615d52678efe1e07ca4b48d..2801378e3e19271cea63880d67ba0130d134fb5e 100644 (file)
@@ -665,6 +665,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
        vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
        atomic64_set(&vdev->unique_id_counter, 0);
        atomic_set(&vdev->job_timeout_counter, 0);
+       atomic_set(&vdev->faults_detected, 0);
        xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
        xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
        xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
index 6378e23e0c97635fcb99819a121e6997dea0a060..b739738c45666cd842ff51d25a8b4b6988e9d454 100644 (file)
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2020-2025 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #ifndef __IVPU_DRV_H__
@@ -168,6 +168,7 @@ struct ivpu_device {
        struct xarray submitted_jobs_xa;
        struct ivpu_ipc_consumer job_done_consumer;
        atomic_t job_timeout_counter;
+       atomic_t faults_detected;
 
        atomic64_t unique_id_counter;
 
index f0154dfa6ddcec18961c8681ffd4c566a7ab8343..521931d1f7fcaf6cb7e2becda09b7e025356023b 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020-2025 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #include <drm/drm_file.h>
@@ -607,6 +607,7 @@ bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_
                 * status and ensure both are handled in the same way
                 */
                job->file_priv->has_mmu_faults = true;
+               atomic_set(&vdev->faults_detected, 1);
                queue_work(system_percpu_wq, &vdev->context_abort_work);
                return true;
        }
@@ -1115,6 +1116,51 @@ void ivpu_job_done_consumer_fini(struct ivpu_device *vdev)
        ivpu_ipc_consumer_del(vdev, &vdev->job_done_consumer);
 }
 
+static int reset_engine_and_mark_faulty_contexts(struct ivpu_device *vdev)
+{
+       u32 num_impacted_contexts;
+       struct vpu_jsm_msg resp;
+       int ret;
+       u32 i;
+
+       ret = ivpu_jsm_reset_engine(vdev, 0, &resp);
+       if (ret)
+               return ret;
+
+       /*
+        * If faults are detected, ignore guilty contexts from engine reset as NPU may not be stuck
+        * and could return currently running good context and faulty contexts are already marked
+        */
+       if (atomic_cmpxchg(&vdev->faults_detected, 1, 0) == 1)
+               return 0;
+
+       num_impacted_contexts = resp.payload.engine_reset_done.num_impacted_contexts;
+
+       ivpu_warn_ratelimited(vdev, "Engine reset performed, impacted contexts: %u\n",
+                             num_impacted_contexts);
+
+       if (!in_range(num_impacted_contexts, 1, VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS - 1)) {
+               ivpu_pm_trigger_recovery(vdev, "Cannot determine guilty contexts");
+               return -EIO;
+       }
+
+       /* No faults detected, NPU likely got stuck. Mark returned contexts as guilty */
+       guard(mutex)(&vdev->context_list_lock);
+
+       for (i = 0; i < num_impacted_contexts; i++) {
+               u32 ssid = resp.payload.engine_reset_done.impacted_contexts[i].host_ssid;
+               struct ivpu_file_priv *file_priv = xa_load(&vdev->context_xa, ssid);
+
+               if (file_priv) {
+                       mutex_lock(&file_priv->lock);
+                       file_priv->has_mmu_faults = true;
+                       mutex_unlock(&file_priv->lock);
+               }
+       }
+
+       return 0;
+}
+
 void ivpu_context_abort_work_fn(struct work_struct *work)
 {
        struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work);
@@ -1127,7 +1173,7 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
                return;
 
        if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW)
-               if (ivpu_jsm_reset_engine(vdev, 0))
+               if (reset_engine_and_mark_faulty_contexts(vdev))
                        goto runtime_put;
 
        mutex_lock(&vdev->context_list_lock);
index 0256b2dfefc10cb2eb6b3d136c2a7e3dc9fdd2d2..07b1d6f615a911a4ce28eeeb1aeaf35ccce40db4 100644 (file)
@@ -151,10 +151,9 @@ int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat)
        return ret;
 }
 
-int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
+int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *resp)
 {
        struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_ENGINE_RESET };
-       struct vpu_jsm_msg resp;
        int ret;
 
        if (engine != VPU_ENGINE_COMPUTE)
@@ -162,14 +161,17 @@ int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
 
        req.payload.engine_reset.engine_idx = engine;
 
-       ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp,
+       ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, resp,
                                    VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
        if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret);
                ivpu_pm_trigger_recovery(vdev, "Engine reset failed");
+               return ret;
        }
 
-       return ret;
+       atomic_inc(&vdev->pm->engine_reset_counter);
+
+       return 0;
 }
 
 int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id)
@@ -554,6 +556,15 @@ int ivpu_jsm_dct_disable(struct ivpu_device *vdev)
 }
 
 int ivpu_jsm_state_dump(struct ivpu_device *vdev)
+{
+       struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP };
+       struct vpu_jsm_msg resp;
+
+       return ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_STATE_DUMP_RSP, &resp,
+                                             VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
+}
+
+int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev)
 {
        struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP };
 
index 9e84d3526a14635cde1d9b3dfe5ef471df67ec90..a74f5a0b0d9354c94a537afc7c2ea26dcce4d997 100644 (file)
@@ -14,7 +14,7 @@ int ivpu_jsm_register_db(struct ivpu_device *vdev, u32 ctx_id, u32 db_id,
                         u64 jobq_base, u32 jobq_size);
 int ivpu_jsm_unregister_db(struct ivpu_device *vdev, u32 db_id);
 int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat);
-int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine);
+int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *response);
 int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id);
 int ivpu_jsm_dyndbg_control(struct ivpu_device *vdev, char *command, size_t size);
 int ivpu_jsm_trace_get_capability(struct ivpu_device *vdev, u32 *trace_destination_mask,
@@ -44,5 +44,6 @@ int ivpu_jsm_metric_streamer_info(struct ivpu_device *vdev, u64 metric_group_mas
 int ivpu_jsm_dct_enable(struct ivpu_device *vdev, u32 active_us, u32 inactive_us);
 int ivpu_jsm_dct_disable(struct ivpu_device *vdev);
 int ivpu_jsm_state_dump(struct ivpu_device *vdev);
+int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev);
 
 #endif
index e1baf6b64935137ac26967adc90f05f2b0f28534..41efd8985fa67fbc9e0ffd179f62a64639c3c9d2 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #include <linux/circ_buf.h>
@@ -964,6 +964,7 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev)
                file_priv = xa_load(&vdev->context_xa, ssid);
                if (file_priv) {
                        if (!READ_ONCE(file_priv->has_mmu_faults)) {
+                               atomic_set(&vdev->faults_detected, 1);
                                ivpu_mmu_dump_event(vdev, event);
                                WRITE_ONCE(file_priv->has_mmu_faults, true);
                        }
index d20144a21e090d619d89364a86b65959ad0f6ddb..83da9b297f3782a12bc3002cdfb41050feef3b23 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #include <linux/highmem.h>
@@ -166,7 +166,7 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
        ivpu_pm_reset_begin(vdev);
 
        if (!pm_runtime_status_suspended(vdev->drm.dev)) {
-               ivpu_jsm_state_dump(vdev);
+               ivpu_jsm_state_dump_no_reply(vdev);
                ivpu_dev_coredump(vdev);
                ivpu_suspend(vdev);
        }
@@ -205,23 +205,25 @@ static void ivpu_job_timeout_work(struct work_struct *work)
 
        if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
                ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
-               goto recovery;
+               goto abort;
        }
 
        inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
        if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
                ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
                         inference_max_retries);
-               goto recovery;
+               goto abort;
        }
 
        vdev->fw->last_heartbeat = heartbeat;
        ivpu_start_job_timeout_detection(vdev);
        return;
 
-recovery:
+abort:
        atomic_set(&vdev->job_timeout_counter, 0);
-       ivpu_pm_trigger_recovery(vdev, "TDR");
+       ivpu_jsm_state_dump(vdev);
+       ivpu_dev_coredump(vdev);
+       queue_work(system_percpu_wq, &vdev->context_abort_work);
 }
 
 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
@@ -404,6 +406,7 @@ void ivpu_pm_init(struct ivpu_device *vdev)
        init_rwsem(&pm->reset_lock);
        atomic_set(&pm->reset_pending, 0);
        atomic_set(&pm->reset_counter, 0);
+       atomic_set(&pm->engine_reset_counter, 0);
 
        INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
        INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
index 00f2a01e3df66c22a92505edb045dff70c134862..2f07bb0b43be34307e5695865eb0f72c7d48d452 100644 (file)
@@ -18,6 +18,7 @@ struct ivpu_pm_info {
        struct rw_semaphore reset_lock;
        atomic_t reset_counter;
        atomic_t reset_pending;
+       atomic_t engine_reset_counter;
        u8 dct_active_percent;
 };