]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
accel/ivpu: Add FW state dump on TDR
authorTomasz Rusinowicz <tomasz.rusinowicz@intel.com>
Mon, 30 Sep 2024 19:52:59 +0000 (21:52 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 27 Feb 2025 12:30:15 +0000 (04:30 -0800)
[ Upstream commit 5e162f872d7af8f041b143536617ab2563ea7de5 ]

Send a JSM state dump message at the beginning of the TDR handler. This
allows the FW to collect debug info in the FW log before the state of the
NPU is lost, making it possible to analyze the cause of a TDR.

Wait for a predefined timeout (10 ms) so the FW has a chance to write its
debug logs. We cannot wait for a JSM response at this point because IRQs
are already disabled before the TDR handler is invoked.

Signed-off-by: Tomasz Rusinowicz <tomasz.rusinowicz@intel.com>
Reviewed-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240930195322.461209-9-jacek.lawrynowicz@linux.intel.com
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Stable-dep-of: 41a2d8286c90 ("accel/ivpu: Fix error handling in recovery/reset")
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/accel/ivpu/ivpu_drv.h
drivers/accel/ivpu/ivpu_hw.c
drivers/accel/ivpu/ivpu_ipc.c
drivers/accel/ivpu/ivpu_ipc.h
drivers/accel/ivpu/ivpu_jsm_msg.c
drivers/accel/ivpu/ivpu_jsm_msg.h
drivers/accel/ivpu/ivpu_pm.c

index 63f13b697eed719367c57f699041fd7d2b9c8337..2b30cc2e9272e40f9a17e47ae7c1f8996f21efb1 100644 (file)
@@ -152,6 +152,7 @@ struct ivpu_device {
                int tdr;
                int autosuspend;
                int d0i3_entry_msg;
+               int state_dump_msg;
        } timeout;
 };
 
index e69c0613513f111c08ce00b4b33be0e826d77a2b..08b3cef58fd2d7190d89c5835253b0f46710f139 100644 (file)
@@ -89,12 +89,14 @@ static void timeouts_init(struct ivpu_device *vdev)
                vdev->timeout.tdr = 2000000;
                vdev->timeout.autosuspend = -1;
                vdev->timeout.d0i3_entry_msg = 500;
+               vdev->timeout.state_dump_msg = 10;
        } else if (ivpu_is_simics(vdev)) {
                vdev->timeout.boot = 50;
                vdev->timeout.jsm = 500;
                vdev->timeout.tdr = 10000;
                vdev->timeout.autosuspend = -1;
                vdev->timeout.d0i3_entry_msg = 100;
+               vdev->timeout.state_dump_msg = 10;
        } else {
                vdev->timeout.boot = 1000;
                vdev->timeout.jsm = 500;
@@ -104,6 +106,7 @@ static void timeouts_init(struct ivpu_device *vdev)
                else
                        vdev->timeout.autosuspend = 100;
                vdev->timeout.d0i3_entry_msg = 5;
+               vdev->timeout.state_dump_msg = 10;
        }
 }
 
index 29b723039a345988225e0b7bb3956bb26aec8a4d..13c8a12162e89ecd440d16d98f23751845e6ea1f 100644 (file)
@@ -353,6 +353,32 @@ rpm_put:
        return ret;
 }
 
+/*
+ * Send a JSM request on @channel and then sleep for @timeout_ms instead of
+ * waiting for a reply. A temporary IPC consumer is registered for the
+ * duration of the call and a runtime PM reference is held around it.
+ *
+ * Returns the negative value from ivpu_rpm_get() if that fails, otherwise
+ * the result of ivpu_ipc_send() (non-zero is treated as a send failure).
+ */
+int ivpu_ipc_send_and_wait(struct ivpu_device *vdev, struct vpu_jsm_msg *req,
+                          u32 channel, unsigned long timeout_ms)
+{
+       struct ivpu_ipc_consumer consumer;
+       int err = ivpu_rpm_get(vdev);
+
+       if (err < 0)
+               return err;
+
+       ivpu_ipc_consumer_add(vdev, &consumer, channel, NULL);
+
+       err = ivpu_ipc_send(vdev, &consumer, req);
+       if (!err) {
+               /* Message is out: give the receiver a fixed amount of time. */
+               msleep(timeout_ms);
+       } else {
+               ivpu_warn_ratelimited(vdev, "IPC send failed: %d\n", err);
+       }
+
+       /* Undo the setup in reverse order on both success and failure. */
+       ivpu_ipc_consumer_del(vdev, &consumer);
+       ivpu_rpm_put(vdev);
+       return err;
+}
+
+
 static bool
 ivpu_ipc_match_consumer(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons,
                        struct ivpu_ipc_hdr *ipc_hdr, struct vpu_jsm_msg *jsm_msg)
index fb4de7fb8210ea7a1953c6f92dbebde6e676ef74..b4dfb504679bac3c4537e2fb337a67dffa632601 100644 (file)
@@ -107,5 +107,7 @@ int ivpu_ipc_send_receive_internal(struct ivpu_device *vdev, struct vpu_jsm_msg
 int ivpu_ipc_send_receive(struct ivpu_device *vdev, struct vpu_jsm_msg *req,
                          enum vpu_ipc_msg_type expected_resp, struct vpu_jsm_msg *resp,
                          u32 channel, unsigned long timeout_ms);
+int ivpu_ipc_send_and_wait(struct ivpu_device *vdev, struct vpu_jsm_msg *req,
+                          u32 channel, unsigned long timeout_ms);
 
 #endif /* __IVPU_IPC_H__ */
index 88105963c1b2889e65980e2e23d2f10e6b7e0533..f7618b605f021974721999fa1da9ca53bcaec1fa 100644 (file)
@@ -555,3 +555,11 @@ int ivpu_jsm_dct_disable(struct ivpu_device *vdev)
        return ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_DCT_DISABLE_DONE, &resp,
                                              VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
 }
+
+/*
+ * Send a VPU_JSM_MSG_STATE_DUMP request on the async command channel and
+ * pause for vdev->timeout.state_dump_msg milliseconds afterwards.
+ *
+ * Returns whatever ivpu_ipc_send_and_wait() returns.
+ */
+int ivpu_jsm_state_dump(struct ivpu_device *vdev)
+{
+       struct vpu_jsm_msg dump_req = { .type = VPU_JSM_MSG_STATE_DUMP };
+       unsigned long wait_ms = vdev->timeout.state_dump_msg;
+
+       return ivpu_ipc_send_and_wait(vdev, &dump_req, VPU_IPC_CHAN_ASYNC_CMD, wait_ms);
+}
index e4e42c0ff6e65614b36eb7ff52395d6628d9b0e1..9e84d3526a14635cde1d9b3dfe5ef471df67ec90 100644 (file)
@@ -43,4 +43,6 @@ int ivpu_jsm_metric_streamer_info(struct ivpu_device *vdev, u64 metric_group_mas
                                  u64 buffer_size, u32 *sample_size, u64 *info_size);
 int ivpu_jsm_dct_enable(struct ivpu_device *vdev, u32 active_us, u32 inactive_us);
 int ivpu_jsm_dct_disable(struct ivpu_device *vdev);
+int ivpu_jsm_state_dump(struct ivpu_device *vdev);
+
 #endif
index 0110f5ee7d069b8c452e60281751ca0ed41bf11b..848d7468d48ce5bdd76d8e4f3bf9e15cf72fa42b 100644 (file)
@@ -124,6 +124,7 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
        if (ret)
                ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
 
+       ivpu_jsm_state_dump(vdev);
        ivpu_dev_coredump(vdev);
 
        atomic_inc(&vdev->pm->reset_counter);