// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
*/
#include <linux/debugfs.h>
return 0;
}
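+/* Report the cumulative engine reset count through debugfs */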
+static int engine_reset_counter_show(struct seq_file *s, void *v)
+{
+ struct ivpu_device *vdev = seq_to_ivpu(s);
+
+ seq_printf(s, "%d\n", atomic_read(&vdev->pm->engine_reset_counter));
+ return 0;
+}
+
static const struct drm_debugfs_info vdev_debugfs_list[] = {
{"bo_list", bo_list_show, 0},
{"fw_name", fw_name_show, 0},
{"reset_counter", reset_counter_show, 0},
{"reset_pending", reset_pending_show, 0},
{"firewall_irq_counter", firewall_irq_counter_show, 0},
+ {"engine_reset_counter", engine_reset_counter_show, 0},
};
static int dvfs_mode_get(void *data, u64 *dvfs_mode)
static int ivpu_reset_engine_fn(void *data, u64 val)
{
struct ivpu_device *vdev = (struct ivpu_device *)data;
+ struct vpu_jsm_msg resp;
- return ivpu_jsm_reset_engine(vdev, (u32)val);
+ return ivpu_jsm_reset_engine(vdev, (u32)val, &resp);
}
DEFINE_DEBUGFS_ATTRIBUTE(ivpu_reset_engine_fops, NULL, ivpu_reset_engine_fn, "0x%02llx\n");
vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
atomic64_set(&vdev->unique_id_counter, 0);
atomic_set(&vdev->job_timeout_counter, 0);
+ atomic_set(&vdev->faults_detected, 0);
xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2020-2025 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
*/
#ifndef __IVPU_DRV_H__
struct xarray submitted_jobs_xa;
struct ivpu_ipc_consumer job_done_consumer;
atomic_t job_timeout_counter;
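+ /* Set when an MMU fault is detected; cleared when consumed by engine reset handling */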
+ atomic_t faults_detected;
atomic64_t unique_id_counter;
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright (C) 2020-2025 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
*/
#include <drm/drm_file.h>
* status and ensure both are handled in the same way
*/
job->file_priv->has_mmu_faults = true;
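+ /* Record the fault so a following engine reset ignores FW-reported contexts */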
+ atomic_set(&vdev->faults_detected, 1);
queue_work(system_percpu_wq, &vdev->context_abort_work);
return true;
}
ivpu_ipc_consumer_del(vdev, &vdev->job_done_consumer);
}
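+/*
+ * Reset the compute engine and, unless MMU faults have already identified
+ * the guilty contexts, mark every context reported by the firmware as
+ * faulty so the context abort path can clean it up.
+ */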
+static int reset_engine_and_mark_faulty_contexts(struct ivpu_device *vdev)
+{
+ u32 num_impacted_contexts;
+ struct vpu_jsm_msg resp;
+ int ret;
+ u32 i;
+
+ ret = ivpu_jsm_reset_engine(vdev, 0, &resp);
+ if (ret)
+ return ret;
+
+ /*
+ * If MMU faults were detected before this reset, ignore the contexts
+ * reported by the firmware: the NPU may not actually be stuck and could
+ * report the currently running, innocent context. The faulty contexts
+ * were already marked on the fault path.
+ */
+ if (atomic_cmpxchg(&vdev->faults_detected, 1, 0) == 1)
+ return 0;
+
+ num_impacted_contexts = resp.payload.engine_reset_done.num_impacted_contexts;
+
+ ivpu_warn_ratelimited(vdev, "Engine reset performed, impacted contexts: %u\n",
+ num_impacted_contexts);
+
+ if (!in_range(num_impacted_contexts, 1, VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS - 1)) {
+ ivpu_pm_trigger_recovery(vdev, "Cannot determine guilty contexts");
+ return -EIO;
+ }
+
+ /* No faults detected, NPU likely got stuck. Mark returned contexts as guilty */
+ guard(mutex)(&vdev->context_list_lock);
+
+ for (i = 0; i < num_impacted_contexts; i++) {
+ u32 ssid = resp.payload.engine_reset_done.impacted_contexts[i].host_ssid;
+ struct ivpu_file_priv *file_priv = xa_load(&vdev->context_xa, ssid);
+
+ if (file_priv) {
+ mutex_lock(&file_priv->lock);
+ file_priv->has_mmu_faults = true;
+ mutex_unlock(&file_priv->lock);
+ }
+ }
+
+ return 0;
+}
+
void ivpu_context_abort_work_fn(struct work_struct *work)
{
struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work);
return;
if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW)
- if (ivpu_jsm_reset_engine(vdev, 0))
+ if (reset_engine_and_mark_faulty_contexts(vdev))
goto runtime_put;
mutex_lock(&vdev->context_list_lock);
return ret;
}
-int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
+int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *resp)
{
struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_ENGINE_RESET };
- struct vpu_jsm_msg resp;
int ret;
if (engine != VPU_ENGINE_COMPUTE)
return -EINVAL;
req.payload.engine_reset.engine_idx = engine;
- ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp,
+ ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, resp,
VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
if (ret) {
ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret);
ivpu_pm_trigger_recovery(vdev, "Engine reset failed");
+ return ret;
}
- return ret;
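+ /* Count only resets that the firmware actually acknowledged */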
+ atomic_inc(&vdev->pm->engine_reset_counter);
+
+ return 0;
}
int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id)
}
int ivpu_jsm_state_dump(struct ivpu_device *vdev)
+{
+ struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP };
+ struct vpu_jsm_msg resp;
+
+ return ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_STATE_DUMP_RSP, &resp,
+ VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
+}
+
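+/*
+ * Variant for paths such as recovery where the firmware may be unable to
+ * respond; the request is sent without waiting for a reply.
+ */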
+int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev)
{
struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP };
u64 jobq_base, u32 jobq_size);
int ivpu_jsm_unregister_db(struct ivpu_device *vdev, u32 db_id);
int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat);
-int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine);
+int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *response);
int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id);
int ivpu_jsm_dyndbg_control(struct ivpu_device *vdev, char *command, size_t size);
int ivpu_jsm_trace_get_capability(struct ivpu_device *vdev, u32 *trace_destination_mask,
int ivpu_jsm_dct_enable(struct ivpu_device *vdev, u32 active_us, u32 inactive_us);
int ivpu_jsm_dct_disable(struct ivpu_device *vdev);
int ivpu_jsm_state_dump(struct ivpu_device *vdev);
+int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev);
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
*/
#include <linux/circ_buf.h>
file_priv = xa_load(&vdev->context_xa, ssid);
if (file_priv) {
if (!READ_ONCE(file_priv->has_mmu_faults)) {
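+ /* Flag the fault for engine reset handling before marking the context */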
+ atomic_set(&vdev->faults_detected, 1);
ivpu_mmu_dump_event(vdev, event);
WRITE_ONCE(file_priv->has_mmu_faults, true);
}
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
*/
#include <linux/highmem.h>
ivpu_pm_reset_begin(vdev);
if (!pm_runtime_status_suspended(vdev->drm.dev)) {
- ivpu_jsm_state_dump(vdev);
+ ivpu_jsm_state_dump_no_reply(vdev);
ivpu_dev_coredump(vdev);
ivpu_suspend(vdev);
}
if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
- goto recovery;
+ goto abort;
}
inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
inference_max_retries);
- goto recovery;
+ goto abort;
}
vdev->fw->last_heartbeat = heartbeat;
ivpu_start_job_timeout_detection(vdev);
return;
-recovery:
+abort:
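+ /*
+ * Capture firmware state for the coredump, then hand the job off to the
+ * context abort worker instead of triggering full device recovery.
+ */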
atomic_set(&vdev->job_timeout_counter, 0);
- ivpu_pm_trigger_recovery(vdev, "TDR");
+ ivpu_jsm_state_dump(vdev);
+ ivpu_dev_coredump(vdev);
+ queue_work(system_percpu_wq, &vdev->context_abort_work);
}
void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
init_rwsem(&pm->reset_lock);
atomic_set(&pm->reset_pending, 0);
atomic_set(&pm->reset_counter, 0);
+ atomic_set(&pm->engine_reset_counter, 0);
INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
struct rw_semaphore reset_lock;
atomic_t reset_counter;
atomic_t reset_pending;
+ atomic_t engine_reset_counter;
u8 dct_active_percent;
};