git.ipfire.org Git - thirdparty/linux.git/commitdiff
accel/ivpu: Trigger device recovery on engine reset/resume failure
author: Karol Wachowski <karol.wachowski@intel.com>
Wed, 28 May 2025 15:42:53 +0000 (17:42 +0200)
committer: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Thu, 5 Jun 2025 12:36:56 +0000 (14:36 +0200)
Trigger full device recovery when the driver fails to restore device state
via engine reset and resume operations. This is necessary because, even if
submissions from a faulty context are blocked, the NPU may still process
previously submitted faulty jobs if the engine reset fails to abort them.
Such jobs can continue to generate faults and occupy device resources.
When engine reset is ineffective, the only way to recover is to perform
a full device recovery.

Fixes: dad945c27a42 ("accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW")
Cc: stable@vger.kernel.org # v6.15+
Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
Reviewed-by: Lizhi Hou <lizhi.hou@amd.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Link: https://lore.kernel.org/r/20250528154253.500556-1-jacek.lawrynowicz@linux.intel.com
drivers/accel/ivpu/ivpu_job.c
drivers/accel/ivpu/ivpu_jsm_msg.c

index 1c8e283ad985427cf66a6018c5a112b05ed8950d..fae8351aa330908d9acbd7636c98a6a6a5215486 100644 (file)
@@ -986,7 +986,8 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
                return;
 
        if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW)
-               ivpu_jsm_reset_engine(vdev, 0);
+               if (ivpu_jsm_reset_engine(vdev, 0))
+                       return;
 
        mutex_lock(&vdev->context_list_lock);
        xa_for_each(&vdev->context_xa, ctx_id, file_priv) {
@@ -1009,7 +1010,8 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
        if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW)
                goto runtime_put;
 
-       ivpu_jsm_hws_resume_engine(vdev, 0);
+       if (ivpu_jsm_hws_resume_engine(vdev, 0))
+               return;
        /*
         * In hardware scheduling mode NPU already has stopped processing jobs
         * and won't send us any further notifications, thus we have to free job related resources
index 219ab8afefabde6945f1766aab8baa5f40e4459d..0256b2dfefc10cb2eb6b3d136c2a7e3dc9fdd2d2 100644 (file)
@@ -7,6 +7,7 @@
 #include "ivpu_hw.h"
 #include "ivpu_ipc.h"
 #include "ivpu_jsm_msg.h"
+#include "ivpu_pm.h"
 #include "vpu_jsm_api.h"
 
 const char *ivpu_jsm_msg_type_to_str(enum vpu_ipc_msg_type type)
@@ -163,8 +164,10 @@ int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
 
        ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp,
                                    VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
-       if (ret)
+       if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret);
+               ivpu_pm_trigger_recovery(vdev, "Engine reset failed");
+       }
 
        return ret;
 }
@@ -354,8 +357,10 @@ int ivpu_jsm_hws_resume_engine(struct ivpu_device *vdev, u32 engine)
 
        ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE, &resp,
                                    VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
-       if (ret)
+       if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to resume engine %d: %d\n", engine, ret);
+               ivpu_pm_trigger_recovery(vdev, "Engine resume failed");
+       }
 
        return ret;
 }