--- /dev/null
+From c3b0ec0fe0c7ebc4eb42ba60f7340ecdb7aae1a2 Mon Sep 17 00:00:00 2001
+From: Tomasz Rusinowicz <tomasz.rusinowicz@intel.com>
+Date: Mon, 30 Sep 2024 21:53:09 +0200
+Subject: accel/ivpu: Make DB_ID and JOB_ID allocations incremental
+
+From: Tomasz Rusinowicz <tomasz.rusinowicz@intel.com>
+
+commit c3b0ec0fe0c7ebc4eb42ba60f7340ecdb7aae1a2 upstream.
+
+Save the last used ID and use it as the lower bound for subsequent
+allocations, so IDs are handed out incrementally rather than always
+starting from the minimum. This should decrease the rate at which IDs
+are reused, making debugging easier.
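+
+The job ID layout is unchanged: the context ID stays in bits 31:8
+(IVPU_JOB_ID_CONTEXT_MASK) and the per-context job number in bits 7:0
+(IVPU_JOB_ID_JOB_MASK); only the order in which IDs are handed out
+changes. In pseudocode, the ivpu_id_alloc() helper added below behaves
+roughly like this (a simplified sketch, not the exact driver code):
+
+	/* Try [last_id + 1, max] first; wrap back to the default
+	 * minimum and retry once if that window is exhausted.
+	 */
+	ret = __xa_alloc(xa, id, entry, *limit, GFP_KERNEL);
+	if (ret) {
+		limit->min = default_limit.min;
+		ret = __xa_alloc(xa, id, entry, *limit, GFP_KERNEL);
+	}
+	if (!ret) {
+		limit->min = *id + 1;
+		if (limit->min > limit->max)
+			limit->min = default_limit.min;
+	}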
+
+Signed-off-by: Tomasz Rusinowicz <tomasz.rusinowicz@intel.com>
+Reviewed-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20240930195322.461209-19-jacek.lawrynowicz@linux.intel.com
+Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/accel/ivpu/ivpu_drv.c | 9 +++++++++
+ drivers/accel/ivpu/ivpu_drv.h | 7 +++++++
+ drivers/accel/ivpu/ivpu_job.c | 37 +++++++++++++++++++++++++++----------
+ 3 files changed, 43 insertions(+), 10 deletions(-)
+
+--- a/drivers/accel/ivpu/ivpu_drv.c
++++ b/drivers/accel/ivpu/ivpu_drv.c
+@@ -260,6 +260,11 @@ static int ivpu_open(struct drm_device *
+ if (ret)
+ goto err_xa_erase;
+
++ file_priv->default_job_limit.min = FIELD_PREP(IVPU_JOB_ID_CONTEXT_MASK,
++ (file_priv->ctx.id - 1));
++ file_priv->default_job_limit.max = file_priv->default_job_limit.min | IVPU_JOB_ID_JOB_MASK;
++ file_priv->job_limit = file_priv->default_job_limit;
++
+ mutex_unlock(&vdev->context_list_lock);
+ drm_dev_exit(idx);
+
+@@ -607,6 +612,10 @@ static int ivpu_dev_init(struct ivpu_dev
+ lockdep_set_class(&vdev->submitted_jobs_xa.xa_lock, &submitted_jobs_xa_lock_class_key);
+ INIT_LIST_HEAD(&vdev->bo_list);
+
++ vdev->default_db_limit.min = IVPU_MIN_DB;
++ vdev->default_db_limit.max = IVPU_MAX_DB;
++ vdev->db_limit = vdev->default_db_limit;
++
+ ret = drmm_mutex_init(&vdev->drm, &vdev->context_list_lock);
+ if (ret)
+ goto err_xa_destroy;
+--- a/drivers/accel/ivpu/ivpu_drv.h
++++ b/drivers/accel/ivpu/ivpu_drv.h
+@@ -46,6 +46,9 @@
+ #define IVPU_MIN_DB 1
+ #define IVPU_MAX_DB 255
+
++#define IVPU_JOB_ID_JOB_MASK GENMASK(7, 0)
++#define IVPU_JOB_ID_CONTEXT_MASK GENMASK(31, 8)
++
+ #define IVPU_NUM_ENGINES 2
+ #define IVPU_NUM_PRIORITIES 4
+ #define IVPU_NUM_CMDQS_PER_CTX (IVPU_NUM_ENGINES * IVPU_NUM_PRIORITIES)
+@@ -136,6 +139,8 @@ struct ivpu_device {
+ struct xa_limit context_xa_limit;
+
+ struct xarray db_xa;
++ struct xa_limit db_limit;
++ struct xa_limit default_db_limit;
+
+ struct mutex bo_list_lock; /* Protects bo_list */
+ struct list_head bo_list;
+@@ -171,6 +176,8 @@ struct ivpu_file_priv {
+ struct mutex ms_lock; /* Protects ms_instance_list, ms_info_bo */
+ struct list_head ms_instance_list;
+ struct ivpu_bo *ms_info_bo;
++ struct xa_limit job_limit;
++ struct xa_limit default_job_limit;
+ bool has_mmu_faults;
+ bool bound;
+ bool aborted;
+--- a/drivers/accel/ivpu/ivpu_job.c
++++ b/drivers/accel/ivpu/ivpu_job.c
+@@ -21,8 +21,6 @@
+ #include "vpu_boot_api.h"
+
+ #define CMD_BUF_IDX 0
+-#define JOB_ID_JOB_MASK GENMASK(7, 0)
+-#define JOB_ID_CONTEXT_MASK GENMASK(31, 8)
+ #define JOB_MAX_BUFFER_COUNT 65535
+
+ static void ivpu_cmdq_ring_db(struct ivpu_device *vdev, struct ivpu_cmdq *cmdq)
+@@ -77,9 +75,28 @@ static void ivpu_preemption_buffers_free
+ ivpu_bo_free(cmdq->secondary_preempt_buf);
+ }
+
++static int ivpu_id_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit *limit,
++ const struct xa_limit default_limit)
++{
++ int ret;
++
++ ret = __xa_alloc(xa, id, entry, *limit, GFP_KERNEL);
++ if (ret) {
++ limit->min = default_limit.min;
++ ret = __xa_alloc(xa, id, entry, *limit, GFP_KERNEL);
++ if (ret)
++ return ret;
++ }
++
++ limit->min = *id + 1;
++ if (limit->min > limit->max)
++ limit->min = default_limit.min;
++
++ return ret;
++}
++
+ static struct ivpu_cmdq *ivpu_cmdq_alloc(struct ivpu_file_priv *file_priv)
+ {
+- struct xa_limit db_xa_limit = {.max = IVPU_MAX_DB, .min = IVPU_MIN_DB};
+ struct ivpu_device *vdev = file_priv->vdev;
+ struct ivpu_cmdq *cmdq;
+ int ret;
+@@ -88,7 +105,10 @@ static struct ivpu_cmdq *ivpu_cmdq_alloc
+ if (!cmdq)
+ return NULL;
+
+- ret = xa_alloc(&vdev->db_xa, &cmdq->db_id, NULL, db_xa_limit, GFP_KERNEL);
++ xa_lock(&vdev->db_xa); /* lock here to protect db_limit */
++ ret = ivpu_id_alloc(&vdev->db_xa, &cmdq->db_id, NULL, &vdev->db_limit,
++ vdev->default_db_limit);
++ xa_unlock(&vdev->db_xa);
+ if (ret) {
+ ivpu_err(vdev, "Failed to allocate doorbell id: %d\n", ret);
+ goto err_free_cmdq;
+@@ -519,7 +539,6 @@ static int ivpu_job_submit(struct ivpu_j
+ {
+ struct ivpu_file_priv *file_priv = job->file_priv;
+ struct ivpu_device *vdev = job->vdev;
+- struct xa_limit job_id_range;
+ struct ivpu_cmdq *cmdq;
+ bool is_first_job;
+ int ret;
+@@ -530,7 +549,7 @@ static int ivpu_job_submit(struct ivpu_j
+
+ mutex_lock(&file_priv->lock);
+
+- cmdq = ivpu_cmdq_acquire(job->file_priv, job->engine_idx, priority);
++ cmdq = ivpu_cmdq_acquire(file_priv, job->engine_idx, priority);
+ if (!cmdq) {
+ ivpu_warn_ratelimited(vdev, "Failed to get job queue, ctx %d engine %d prio %d\n",
+ file_priv->ctx.id, job->engine_idx, priority);
+@@ -538,12 +557,10 @@ static int ivpu_job_submit(struct ivpu_j
+ goto err_unlock_file_priv;
+ }
+
+- job_id_range.min = FIELD_PREP(JOB_ID_CONTEXT_MASK, (file_priv->ctx.id - 1));
+- job_id_range.max = job_id_range.min | JOB_ID_JOB_MASK;
+-
+ xa_lock(&vdev->submitted_jobs_xa);
+ is_first_job = xa_empty(&vdev->submitted_jobs_xa);
+- ret = __xa_alloc(&vdev->submitted_jobs_xa, &job->job_id, job, job_id_range, GFP_KERNEL);
++ ret = ivpu_id_alloc(&vdev->submitted_jobs_xa, &job->job_id, job, &file_priv->job_limit,
++ file_priv->default_job_limit);
+ if (ret) {
+ ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n",
+ file_priv->ctx.id);
--- /dev/null
+From a4293cc75348409f998c991c48cbe5532c438114 Mon Sep 17 00:00:00 2001
+From: Andrzej Kacprowski <Andrzej.Kacprowski@intel.com>
+Date: Mon, 30 Sep 2024 21:52:52 +0200
+Subject: accel/ivpu: Update VPU FW API headers
+
+From: Andrzej Kacprowski <Andrzej.Kacprowski@intel.com>
+
+commit a4293cc75348409f998c991c48cbe5532c438114 upstream.
+
+This commit bumps:
+ - Boot API from 3.24.0 to 3.26.3
+ - JSM API from 3.16.0 to 3.25.0
+
+Signed-off-by: Andrzej Kacprowski <Andrzej.Kacprowski@intel.com>
+Co-developed-by: Tomasz Rusinowicz <tomasz.rusinowicz@intel.com>
+Signed-off-by: Tomasz Rusinowicz <tomasz.rusinowicz@intel.com>
+Reviewed-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20240930195322.461209-2-jacek.lawrynowicz@linux.intel.com
+Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/accel/ivpu/ivpu_job.c | 2
+ drivers/accel/ivpu/ivpu_jsm_msg.c | 3
+ drivers/accel/ivpu/vpu_boot_api.h | 43 +++--
+ drivers/accel/ivpu/vpu_jsm_api.h | 303 ++++++++++++++++++++++++++++++++------
+ 4 files changed, 292 insertions(+), 59 deletions(-)
+
+--- a/drivers/accel/ivpu/ivpu_job.c
++++ b/drivers/accel/ivpu/ivpu_job.c
+@@ -352,7 +352,7 @@ static int ivpu_cmdq_push_job(struct ivp
+ return -EBUSY;
+ }
+
+- entry = &cmdq->jobq->job[tail];
++ entry = &cmdq->jobq->slot[tail].job;
+ entry->batch_buf_addr = job->cmd_buf_vpu_addr;
+ entry->job_id = job->job_id;
+ entry->flags = 0;
+--- a/drivers/accel/ivpu/ivpu_jsm_msg.c
++++ b/drivers/accel/ivpu/ivpu_jsm_msg.c
+@@ -48,9 +48,10 @@ const char *ivpu_jsm_msg_type_to_str(enu
+ IVPU_CASE_TO_STR(VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE);
+ IVPU_CASE_TO_STR(VPU_JSM_MSG_STATE_DUMP);
+ IVPU_CASE_TO_STR(VPU_JSM_MSG_STATE_DUMP_RSP);
+- IVPU_CASE_TO_STR(VPU_JSM_MSG_BLOB_DEINIT);
++ IVPU_CASE_TO_STR(VPU_JSM_MSG_BLOB_DEINIT_DEPRECATED);
+ IVPU_CASE_TO_STR(VPU_JSM_MSG_DYNDBG_CONTROL);
+ IVPU_CASE_TO_STR(VPU_JSM_MSG_JOB_DONE);
++ IVPU_CASE_TO_STR(VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED);
+ IVPU_CASE_TO_STR(VPU_JSM_MSG_ENGINE_RESET_DONE);
+ IVPU_CASE_TO_STR(VPU_JSM_MSG_ENGINE_PREEMPT_DONE);
+ IVPU_CASE_TO_STR(VPU_JSM_MSG_REGISTER_DB_DONE);
+--- a/drivers/accel/ivpu/vpu_boot_api.h
++++ b/drivers/accel/ivpu/vpu_boot_api.h
+@@ -1,13 +1,12 @@
+ /* SPDX-License-Identifier: MIT */
+ /*
+- * Copyright (c) 2020-2023, Intel Corporation.
++ * Copyright (c) 2020-2024, Intel Corporation.
+ */
+
+ #ifndef VPU_BOOT_API_H
+ #define VPU_BOOT_API_H
+
+ /*
+- * =========== FW API version information beginning ================
+ * The below values will be used to construct the version info this way:
+ * fw_bin_header->api_version[VPU_BOOT_API_VER_ID] = (VPU_BOOT_API_VER_MAJOR << 16) |
+ * VPU_BOOT_API_VER_MINOR;
+@@ -27,19 +26,18 @@
+ * Minor version changes when API backward compatibility is preserved.
+ * Resets to 0 if Major version is incremented.
+ */
+-#define VPU_BOOT_API_VER_MINOR 24
++#define VPU_BOOT_API_VER_MINOR 26
+
+ /*
+ * API header changed (field names, documentation, formatting) but API itself has not been changed
+ */
+-#define VPU_BOOT_API_VER_PATCH 0
++#define VPU_BOOT_API_VER_PATCH 3
+
+ /*
+ * Index in the API version table
+ * Must be unique for each API
+ */
+ #define VPU_BOOT_API_VER_INDEX 0
+-/* ------------ FW API version information end ---------------------*/
+
+ #pragma pack(push, 4)
+
+@@ -164,8 +162,6 @@ enum vpu_trace_destination {
+ /* VPU 30xx HW component IDs are sequential, so define first and last IDs. */
+ #define VPU_TRACE_PROC_BIT_30XX_FIRST VPU_TRACE_PROC_BIT_LRT
+ #define VPU_TRACE_PROC_BIT_30XX_LAST VPU_TRACE_PROC_BIT_SHV_15
+-#define VPU_TRACE_PROC_BIT_KMB_FIRST VPU_TRACE_PROC_BIT_30XX_FIRST
+-#define VPU_TRACE_PROC_BIT_KMB_LAST VPU_TRACE_PROC_BIT_30XX_LAST
+
+ struct vpu_boot_l2_cache_config {
+ u8 use;
+@@ -199,6 +195,17 @@ struct vpu_warm_boot_section {
+ */
+ #define POWER_PROFILE_SURVIVABILITY 0x1
+
++/**
++ * Enum for dvfs_mode boot param.
++ */
++enum vpu_governor {
++ VPU_GOV_DEFAULT = 0, /* Default Governor for the system */
++ VPU_GOV_MAX_PERFORMANCE = 1, /* Maximum performance governor */
++ VPU_GOV_ON_DEMAND = 2, /* On Demand frequency control governor */
++ VPU_GOV_POWER_SAVE = 3, /* Power save governor */
++ VPU_GOV_ON_DEMAND_PRIORITY_AWARE = 4 /* On Demand priority based governor */
++};
++
+ struct vpu_boot_params {
+ u32 magic;
+ u32 vpu_id;
+@@ -301,7 +308,14 @@ struct vpu_boot_params {
+ u32 temp_sensor_period_ms;
+ /** PLL ratio for efficient clock frequency */
+ u32 pn_freq_pll_ratio;
+- /** DVFS Mode: Default: 0, Max Performance: 1, On Demand: 2, Power Save: 3 */
++ /**
++ * DVFS Mode:
++ * 0 - Default, DVFS mode selected by the firmware
++ * 1 - Max Performance
++ * 2 - On Demand
++ * 3 - Power Save
++ * 4 - On Demand Priority Aware
++ */
+ u32 dvfs_mode;
+ /**
+ * Depending on DVFS Mode:
+@@ -332,8 +346,8 @@ struct vpu_boot_params {
+ u64 d0i3_entry_vpu_ts;
+ /*
+ * The system time of the host operating system in microseconds.
+- * E.g the number of microseconds since 1st of January 1970, or whatever date the
+- * host operating system uses to maintain system time.
++ * E.g the number of microseconds since 1st of January 1970, or whatever
++ * date the host operating system uses to maintain system time.
+ * This value will be used to track system time on the VPU.
+ * The KMD is required to update this value on every VPU reset.
+ */
+@@ -382,10 +396,7 @@ struct vpu_boot_params {
+ u32 pad6[734];
+ };
+
+-/*
+- * Magic numbers set between host and vpu to detect corruptio of tracing init
+- */
+-
++/* Magic numbers set between host and vpu to detect corruption of tracing init */
+ #define VPU_TRACING_BUFFER_CANARY (0xCAFECAFE)
+
+ /* Tracing buffer message format definitions */
+@@ -405,7 +416,9 @@ struct vpu_tracing_buffer_header {
+ u32 host_canary_start;
+ /* offset from start of buffer for trace entries */
+ u32 read_index;
+- u32 pad_to_cache_line_size_0[14];
++ /* keeps track of wrapping on the reader side */
++ u32 read_wrap_count;
++ u32 pad_to_cache_line_size_0[13];
+ /* End of first cache line */
+
+ /**
+--- a/drivers/accel/ivpu/vpu_jsm_api.h
++++ b/drivers/accel/ivpu/vpu_jsm_api.h
+@@ -22,7 +22,7 @@
+ /*
+ * Minor version changes when API backward compatibility is preserved.
+ */
+-#define VPU_JSM_API_VER_MINOR 16
++#define VPU_JSM_API_VER_MINOR 25
+
+ /*
+ * API header changed (field names, documentation, formatting) but API itself has not been changed
+@@ -36,7 +36,7 @@
+
+ /*
+ * Number of Priority Bands for Hardware Scheduling
+- * Bands: RealTime, Focus, Normal, Idle
++ * Bands: Idle(0), Normal(1), Focus(2), RealTime(3)
+ */
+ #define VPU_HWS_NUM_PRIORITY_BANDS 4
+
+@@ -74,6 +74,7 @@
+ #define VPU_JSM_STATUS_MVNCI_INTERNAL_ERROR 0xCU
+ /* Job status returned when the job was preempted mid-inference */
+ #define VPU_JSM_STATUS_PREEMPTED_MID_INFERENCE 0xDU
++#define VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW 0xEU
+
+ /*
+ * Host <-> VPU IPC channels.
+@@ -86,18 +87,58 @@
+ /*
+ * Job flags bit masks.
+ */
+-#define VPU_JOB_FLAGS_NULL_SUBMISSION_MASK 0x00000001
+-#define VPU_JOB_FLAGS_PRIVATE_DATA_MASK 0xFF000000
++enum {
++ /*
++ * Null submission mask.
++ * When set, batch buffer's commands are not processed but returned as
++ * successful immediately, except fences and timestamps.
++ * When cleared, batch buffer's commands are processed normally.
++ * Used for testing and profiling purposes.
++ */
++ VPU_JOB_FLAGS_NULL_SUBMISSION_MASK = (1 << 0U),
++ /*
++ * Inline command mask.
++ * When set, the object in job queue is an inline command (see struct vpu_inline_cmd below).
++ * When cleared, the object in job queue is a job (see struct vpu_job_queue_entry below).
++ */
++ VPU_JOB_FLAGS_INLINE_CMD_MASK = (1 << 1U),
++ /*
++ * VPU private data mask.
++ * Reserved for the VPU to store private data about the job (or inline command)
++ * while being processed.
++ */
++ VPU_JOB_FLAGS_PRIVATE_DATA_MASK = 0xFFFF0000U
++};
+
+ /*
+- * Sizes of the reserved areas in jobs, in bytes.
++ * Job queue flags bit masks.
+ */
+-#define VPU_JOB_RESERVED_BYTES 8
++enum {
++ /*
++ * No job done notification mask.
++ * When set, indicates that no job done notification should be sent for any
++ * job from this queue. When cleared, indicates that job done notification
++ * should be sent for every job completed from this queue.
++ */
++ VPU_JOB_QUEUE_FLAGS_NO_JOB_DONE_MASK = (1 << 0U),
++ /*
++ * Native fence usage mask.
++ * When set, indicates that job queue uses native fences (as inline commands
++ * in job queue). Such queues may also use legacy fences (as commands in batch buffers).
++ * When cleared, indicates the job queue only uses legacy fences.
++ * NOTE: For queues using native fences, VPU expects that all jobs in the queue
++ * are immediately followed by an inline command object. This object is expected
++ * to be a fence signal command in most cases, but can also be a NOP in case the host
++ * does not need per-job fence signalling. Other inline commands objects can be
++ * inserted between "job and inline command" pairs.
++ */
++ VPU_JOB_QUEUE_FLAGS_USE_NATIVE_FENCE_MASK = (1 << 1U),
+
+-/*
+- * Sizes of the reserved areas in job queues, in bytes.
+- */
+-#define VPU_JOB_QUEUE_RESERVED_BYTES 52
++ /*
++ * Enable turbo mode for testing NPU performance; not recommended for regular usage.
++ */
++ VPU_JOB_QUEUE_FLAGS_TURBO_MODE = (1 << 2U)
++};
+
+ /*
+ * Max length (including trailing NULL char) of trace entity name (e.g., the
+@@ -141,23 +182,112 @@
+ #define VPU_HWS_INVALID_CMDQ_HANDLE 0ULL
+
+ /*
++ * Inline commands types.
++ */
++/*
++ * NOP.
++ * VPU does nothing other than consuming the inline command object.
++ */
++#define VPU_INLINE_CMD_TYPE_NOP 0x0
++/*
++ * Fence wait.
++ * VPU waits for the fence current value to reach monitored value.
++ * Fence wait operations are executed upon job dispatching. While waiting for
++ * the fence to be satisfied, VPU blocks fetching of the next objects in the queue.
++ * Jobs present in the queue prior to the fence wait object may be processed
++ * concurrently.
++ */
++#define VPU_INLINE_CMD_TYPE_FENCE_WAIT 0x1
++/*
++ * Fence signal.
++ * VPU sets the fence current value to the provided value. If new current value
++ * is equal to or higher than monitored value, VPU sends fence signalled notification
++ * to the host. Fence signal operations are executed upon completion of all the jobs
++ * present in the queue prior to them, and in-order relative to each other in the queue.
++ * But jobs in-between them may be processed concurrently and may complete out-of-order.
++ */
++#define VPU_INLINE_CMD_TYPE_FENCE_SIGNAL 0x2
++
++/*
++ * Job scheduling priority bands for both hardware scheduling and OS scheduling.
++ */
++enum vpu_job_scheduling_priority_band {
++ VPU_JOB_SCHEDULING_PRIORITY_BAND_IDLE = 0,
++ VPU_JOB_SCHEDULING_PRIORITY_BAND_NORMAL = 1,
++ VPU_JOB_SCHEDULING_PRIORITY_BAND_FOCUS = 2,
++ VPU_JOB_SCHEDULING_PRIORITY_BAND_REALTIME = 3,
++ VPU_JOB_SCHEDULING_PRIORITY_BAND_COUNT = 4,
++};
++
++/*
+ * Job format.
++ * Jobs defines the actual workloads to be executed by a given engine.
+ */
+ struct vpu_job_queue_entry {
+- u64 batch_buf_addr; /**< Address of VPU commands batch buffer */
+- u32 job_id; /**< Job ID */
+- u32 flags; /**< Flags bit field, see VPU_JOB_FLAGS_* above */
+- u64 root_page_table_addr; /**< Address of root page table to use for this job */
+- u64 root_page_table_update_counter; /**< Page tables update events counter */
+- u64 primary_preempt_buf_addr;
++ /**< Address of VPU commands batch buffer */
++ u64 batch_buf_addr;
++ /**< Job ID */
++ u32 job_id;
++ /**< Flags bit field, see VPU_JOB_FLAGS_* above */
++ u32 flags;
++ /**
++ * Doorbell ring timestamp taken by KMD from SoC's global system clock, in
++ * microseconds. NPU can convert this value to its own fixed clock's timebase,
++ * to match other profiling timestamps.
++ */
++ u64 doorbell_timestamp;
++ /**< Extra id for job tracking, used only in the firmware perf traces */
++ u64 host_tracking_id;
+ /**< Address of the primary preemption buffer to use for this job */
+- u32 primary_preempt_buf_size;
++ u64 primary_preempt_buf_addr;
+ /**< Size of the primary preemption buffer to use for this job */
+- u32 secondary_preempt_buf_size;
++ u32 primary_preempt_buf_size;
+ /**< Size of secondary preemption buffer to use for this job */
+- u64 secondary_preempt_buf_addr;
++ u32 secondary_preempt_buf_size;
+ /**< Address of secondary preemption buffer to use for this job */
+- u8 reserved_0[VPU_JOB_RESERVED_BYTES];
++ u64 secondary_preempt_buf_addr;
++ u64 reserved_0;
++};
++
++/*
++ * Inline command format.
++ * Inline commands are the commands executed at scheduler level (typically,
++ * synchronization directives). Inline command and job objects must be of
++ * the same size and have flags field at same offset.
++ */
++struct vpu_inline_cmd {
++ u64 reserved_0;
++ /* Inline command type, see VPU_INLINE_CMD_TYPE_* defines. */
++ u32 type;
++ /* Flags bit field, see VPU_JOB_FLAGS_* above. */
++ u32 flags;
++ /* Inline command payload. Depends on inline command type. */
++ union {
++ /* Fence (wait and signal) commands' payload. */
++ struct {
++ /* Fence object handle. */
++ u64 fence_handle;
++ /* User VA of the current fence value. */
++ u64 current_value_va;
++ /* User VA of the monitored fence value (read-only). */
++ u64 monitored_value_va;
++ /* Value to wait for or write in fence location. */
++ u64 value;
++ /* User VA of the log buffer in which to add log entry on completion. */
++ u64 log_buffer_va;
++ } fence;
++ /* Other commands do not have a payload. */
++ /* Payload definition for future inline commands can be inserted here. */
++ u64 reserved_1[6];
++ } payload;
++};
++
++/*
++ * Job queue slots can be populated either with job objects or inline command objects.
++ */
++union vpu_jobq_slot {
++ struct vpu_job_queue_entry job;
++ struct vpu_inline_cmd inline_cmd;
+ };
+
+ /*
+@@ -167,7 +297,21 @@ struct vpu_job_queue_header {
+ u32 engine_idx;
+ u32 head;
+ u32 tail;
+- u8 reserved_0[VPU_JOB_QUEUE_RESERVED_BYTES];
++ u32 flags;
++ /* Set to 1 to indicate priority_band field is valid */
++ u32 priority_band_valid;
++ /*
++ * Priority for the work of this job queue, valid only if the HWS is NOT used
++ * and the `priority_band_valid` is set to 1. It is applied only during
++ * the VPU_JSM_MSG_REGISTER_DB message processing.
++ * The device firmware might use the `priority_band` to optimize the power
++ * management logic, but it will not affect the order of jobs.
++ * Available priority bands: @see enum vpu_job_scheduling_priority_band
++ */
++ u32 priority_band;
++ /* Inside realtime band assigns a further priority, limited to 0..31 range */
++ u32 realtime_priority_level;
++ u32 reserved_0[9];
+ };
+
+ /*
+@@ -175,7 +319,7 @@ struct vpu_job_queue_header {
+ */
+ struct vpu_job_queue {
+ struct vpu_job_queue_header header;
+- struct vpu_job_queue_entry job[];
++ union vpu_jobq_slot slot[];
+ };
+
+ /**
+@@ -197,9 +341,7 @@ enum vpu_trace_entity_type {
+ struct vpu_hws_log_buffer_header {
+ /* Written by VPU after adding a log entry. Initialised by host to 0. */
+ u32 first_free_entry_index;
+- /* Incremented by VPU every time the VPU overwrites the 0th entry;
+- * initialised by host to 0.
+- */
++ /* Incremented by VPU every time the VPU writes the 0th entry; initialised by host to 0. */
+ u32 wraparound_count;
+ /*
+ * This is the number of buffers that can be stored in the log buffer provided by the host.
+@@ -230,14 +372,80 @@ struct vpu_hws_log_buffer_entry {
+ u64 operation_data[2];
+ };
+
++/* Native fence log buffer types. */
++enum vpu_hws_native_fence_log_type {
++ VPU_HWS_NATIVE_FENCE_LOG_TYPE_WAITS = 1,
++ VPU_HWS_NATIVE_FENCE_LOG_TYPE_SIGNALS = 2
++};
++
++/* HWS native fence log buffer header. */
++struct vpu_hws_native_fence_log_header {
++ union {
++ struct {
++ /* Index of the first free entry in buffer. */
++ u32 first_free_entry_idx;
++ /* Incremented each time NPU wraps around the buffer to write next entry. */
++ u32 wraparound_count;
++ };
++ /* Field allowing atomic update of both fields above. */
++ u64 atomic_wraparound_and_entry_idx;
++ };
++ /* Log buffer type, see enum vpu_hws_native_fence_log_type. */
++ u64 type;
++ /* Allocated number of entries in the log buffer. */
++ u64 entry_nb;
++ u64 reserved[2];
++};
++
++/* Native fence log operation types. */
++enum vpu_hws_native_fence_log_op {
++ VPU_HWS_NATIVE_FENCE_LOG_OP_SIGNAL_EXECUTED = 0,
++ VPU_HWS_NATIVE_FENCE_LOG_OP_WAIT_UNBLOCKED = 1
++};
++
++/* HWS native fence log entry. */
++struct vpu_hws_native_fence_log_entry {
++ /* Newly signaled/unblocked fence value. */
++ u64 fence_value;
++ /* Native fence object handle to which this operation belongs. */
++ u64 fence_handle;
++ /* Operation type, see enum vpu_hws_native_fence_log_op. */
++ u64 op_type;
++ u64 reserved_0;
++ /*
++ * VPU_HWS_NATIVE_FENCE_LOG_OP_WAIT_UNBLOCKED only: Timestamp at which fence
++ * wait was started (in NPU SysTime).
++ */
++ u64 fence_wait_start_ts;
++ u64 reserved_1;
++ /* Timestamp at which fence operation was completed (in NPU SysTime). */
++ u64 fence_end_ts;
++};
++
++/* Native fence log buffer. */
++struct vpu_hws_native_fence_log_buffer {
++ struct vpu_hws_native_fence_log_header header;
++ struct vpu_hws_native_fence_log_entry entry[];
++};
++
+ /*
+ * Host <-> VPU IPC messages types.
+ */
+ enum vpu_ipc_msg_type {
+ VPU_JSM_MSG_UNKNOWN = 0xFFFFFFFF,
++
+ /* IPC Host -> Device, Async commands */
+ VPU_JSM_MSG_ASYNC_CMD = 0x1100,
+ VPU_JSM_MSG_ENGINE_RESET = VPU_JSM_MSG_ASYNC_CMD,
++ /**
++ * Preempt engine. The NPU stops (preempts) all the jobs currently
++ * executing on the target engine making the engine become idle and ready to
++ * execute new jobs.
++ * NOTE: The NPU does not remove unstarted jobs (if any) from job queues of
++ * the target engine, but it stops processing them (until the queue doorbell
++ * is rung again); the host is responsible to reset the job queue, either
++ * after preemption or when resubmitting jobs to the queue.
++ */
+ VPU_JSM_MSG_ENGINE_PREEMPT = 0x1101,
+ VPU_JSM_MSG_REGISTER_DB = 0x1102,
+ VPU_JSM_MSG_UNREGISTER_DB = 0x1103,
+@@ -323,9 +531,10 @@ enum vpu_ipc_msg_type {
+ * NOTE: Please introduce new ASYNC commands before this one. *
+ */
+ VPU_JSM_MSG_STATE_DUMP = 0x11FF,
++
+ /* IPC Host -> Device, General commands */
+ VPU_JSM_MSG_GENERAL_CMD = 0x1200,
+- VPU_JSM_MSG_BLOB_DEINIT = VPU_JSM_MSG_GENERAL_CMD,
++ VPU_JSM_MSG_BLOB_DEINIT_DEPRECATED = VPU_JSM_MSG_GENERAL_CMD,
+ /**
+ * Control dyndbg behavior by executing a dyndbg command; equivalent to
+ * Linux command: `echo '<dyndbg_cmd>' > <debugfs>/dynamic_debug/control`.
+@@ -335,8 +544,12 @@ enum vpu_ipc_msg_type {
+ * Perform the save procedure for the D0i3 entry
+ */
+ VPU_JSM_MSG_PWR_D0I3_ENTER = 0x1202,
++
+ /* IPC Device -> Host, Job completion */
+ VPU_JSM_MSG_JOB_DONE = 0x2100,
++ /* IPC Device -> Host, Fence signalled */
++ VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED = 0x2101,
++
+ /* IPC Device -> Host, Async command completion */
+ VPU_JSM_MSG_ASYNC_CMD_DONE = 0x2200,
+ VPU_JSM_MSG_ENGINE_RESET_DONE = VPU_JSM_MSG_ASYNC_CMD_DONE,
+@@ -422,6 +635,7 @@ enum vpu_ipc_msg_type {
+ * NOTE: Please introduce new ASYNC responses before this one. *
+ */
+ VPU_JSM_MSG_STATE_DUMP_RSP = 0x22FF,
++
+ /* IPC Device -> Host, General command completion */
+ VPU_JSM_MSG_GENERAL_CMD_DONE = 0x2300,
+ VPU_JSM_MSG_BLOB_DEINIT_DONE = VPU_JSM_MSG_GENERAL_CMD_DONE,
+@@ -600,11 +814,6 @@ struct vpu_jsm_metric_streamer_update {
+ u64 next_buffer_size;
+ };
+
+-struct vpu_ipc_msg_payload_blob_deinit {
+- /* 64-bit unique ID for the blob to be de-initialized. */
+- u64 blob_id;
+-};
+-
+ struct vpu_ipc_msg_payload_job_done {
+ /* Engine to which the job was submitted. */
+ u32 engine_idx;
+@@ -622,6 +831,21 @@ struct vpu_ipc_msg_payload_job_done {
+ u64 cmdq_id;
+ };
+
++/*
++ * Notification message upon native fence signalling.
++ * @see VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED
++ */
++struct vpu_ipc_msg_payload_native_fence_signalled {
++ /* Engine ID. */
++ u32 engine_idx;
++ /* Host SSID. */
++ u32 host_ssid;
++ /* CMDQ ID */
++ u64 cmdq_id;
++ /* Fence object handle. */
++ u64 fence_handle;
++};
++
+ struct vpu_jsm_engine_reset_context {
+ /* Host SSID */
+ u32 host_ssid;
+@@ -700,11 +924,6 @@ struct vpu_ipc_msg_payload_get_power_lev
+ u8 power_limit[16];
+ };
+
+-struct vpu_ipc_msg_payload_blob_deinit_done {
+- /* 64-bit unique ID for the blob de-initialized. */
+- u64 blob_id;
+-};
+-
+ /* HWS priority band setup request / response */
+ struct vpu_ipc_msg_payload_hws_priority_band_setup {
+ /*
+@@ -794,7 +1013,10 @@ struct vpu_ipc_msg_payload_hws_set_conte
+ u32 reserved_0;
+ /* Command queue id */
+ u64 cmdq_id;
+- /* Priority band to assign to work of this context */
++ /*
++ * Priority band to assign to work of this context.
++ * Available priority bands: @see enum vpu_job_scheduling_priority_band
++ */
+ u32 priority_band;
+ /* Inside realtime band assigns a further priority */
+ u32 realtime_priority_level;
+@@ -869,9 +1091,7 @@ struct vpu_ipc_msg_payload_hws_set_sched
+ */
+ u64 notify_index;
+ /*
+- * Enable extra events to be output to log for debug of scheduling algorithm.
+- * Interpreted by VPU as a boolean to enable or disable, expected values are
+- * 0 and 1.
++ * Field is now deprecated, will be removed when KMD is updated to support removal
+ */
+ u32 enable_extra_events;
+ /* Zero Padding */
+@@ -1243,10 +1463,10 @@ union vpu_ipc_msg_payload {
+ struct vpu_jsm_metric_streamer_start metric_streamer_start;
+ struct vpu_jsm_metric_streamer_stop metric_streamer_stop;
+ struct vpu_jsm_metric_streamer_update metric_streamer_update;
+- struct vpu_ipc_msg_payload_blob_deinit blob_deinit;
+ struct vpu_ipc_msg_payload_ssid_release ssid_release;
+ struct vpu_jsm_hws_register_db hws_register_db;
+ struct vpu_ipc_msg_payload_job_done job_done;
++ struct vpu_ipc_msg_payload_native_fence_signalled native_fence_signalled;
+ struct vpu_ipc_msg_payload_engine_reset_done engine_reset_done;
+ struct vpu_ipc_msg_payload_engine_preempt_done engine_preempt_done;
+ struct vpu_ipc_msg_payload_register_db_done register_db_done;
+@@ -1254,7 +1474,6 @@ union vpu_ipc_msg_payload {
+ struct vpu_ipc_msg_payload_query_engine_hb_done query_engine_hb_done;
+ struct vpu_ipc_msg_payload_get_power_level_count_done get_power_level_count_done;
+ struct vpu_jsm_metric_streamer_done metric_streamer_done;
+- struct vpu_ipc_msg_payload_blob_deinit_done blob_deinit_done;
+ struct vpu_ipc_msg_payload_trace_config trace_config;
+ struct vpu_ipc_msg_payload_trace_capability_rsp trace_capability;
+ struct vpu_ipc_msg_payload_trace_get_name trace_get_name;
--- /dev/null
+From ae7af7d8dc2a13a427aa90d003fe4fb2c168342a Mon Sep 17 00:00:00 2001
+From: Karol Wachowski <karol.wachowski@intel.com>
+Date: Thu, 17 Oct 2024 16:58:12 +0200
+Subject: accel/ivpu: Use xa_alloc_cyclic() instead of custom function
+
+From: Karol Wachowski <karol.wachowski@intel.com>
+
+commit ae7af7d8dc2a13a427aa90d003fe4fb2c168342a upstream.
+
+Remove the custom ivpu_id_alloc() wrapper used for ID allocation and
+replace it with the standard xa_alloc_cyclic() API.
+
+The idea behind ivpu_id_alloc() was to hand out monotonically increasing
+IDs so the driver is easier to debug, because the same IDs are not
+immediately reused. The same behavior can be achieved with the standard
+Linux API.
+Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
+Reviewed-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
+Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20241017145817.121590-7-jacek.lawrynowicz@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/accel/ivpu/ivpu_drv.c | 11 ++++-------
+ drivers/accel/ivpu/ivpu_drv.h | 4 ++--
+ drivers/accel/ivpu/ivpu_job.c | 34 ++++++----------------------------
+ 3 files changed, 12 insertions(+), 37 deletions(-)
+
+--- a/drivers/accel/ivpu/ivpu_drv.c
++++ b/drivers/accel/ivpu/ivpu_drv.c
+@@ -260,10 +260,8 @@ static int ivpu_open(struct drm_device *
+ if (ret)
+ goto err_xa_erase;
+
+- file_priv->default_job_limit.min = FIELD_PREP(IVPU_JOB_ID_CONTEXT_MASK,
+- (file_priv->ctx.id - 1));
+- file_priv->default_job_limit.max = file_priv->default_job_limit.min | IVPU_JOB_ID_JOB_MASK;
+- file_priv->job_limit = file_priv->default_job_limit;
++ file_priv->job_limit.min = FIELD_PREP(IVPU_JOB_ID_CONTEXT_MASK, (file_priv->ctx.id - 1));
++ file_priv->job_limit.max = file_priv->job_limit.min | IVPU_JOB_ID_JOB_MASK;
+
+ mutex_unlock(&vdev->context_list_lock);
+ drm_dev_exit(idx);
+@@ -612,9 +610,8 @@ static int ivpu_dev_init(struct ivpu_dev
+ lockdep_set_class(&vdev->submitted_jobs_xa.xa_lock, &submitted_jobs_xa_lock_class_key);
+ INIT_LIST_HEAD(&vdev->bo_list);
+
+- vdev->default_db_limit.min = IVPU_MIN_DB;
+- vdev->default_db_limit.max = IVPU_MAX_DB;
+- vdev->db_limit = vdev->default_db_limit;
++ vdev->db_limit.min = IVPU_MIN_DB;
++ vdev->db_limit.max = IVPU_MAX_DB;
+
+ ret = drmm_mutex_init(&vdev->drm, &vdev->context_list_lock);
+ if (ret)
+--- a/drivers/accel/ivpu/ivpu_drv.h
++++ b/drivers/accel/ivpu/ivpu_drv.h
+@@ -140,7 +140,7 @@ struct ivpu_device {
+
+ struct xarray db_xa;
+ struct xa_limit db_limit;
+- struct xa_limit default_db_limit;
++ u32 db_next;
+
+ struct mutex bo_list_lock; /* Protects bo_list */
+ struct list_head bo_list;
+@@ -177,7 +177,7 @@ struct ivpu_file_priv {
+ struct list_head ms_instance_list;
+ struct ivpu_bo *ms_info_bo;
+ struct xa_limit job_limit;
+- struct xa_limit default_job_limit;
++ u32 job_id_next;
+ bool has_mmu_faults;
+ bool bound;
+ bool aborted;
+--- a/drivers/accel/ivpu/ivpu_job.c
++++ b/drivers/accel/ivpu/ivpu_job.c
+@@ -75,26 +75,6 @@ static void ivpu_preemption_buffers_free
+ ivpu_bo_free(cmdq->secondary_preempt_buf);
+ }
+
+-static int ivpu_id_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit *limit,
+- const struct xa_limit default_limit)
+-{
+- int ret;
+-
+- ret = __xa_alloc(xa, id, entry, *limit, GFP_KERNEL);
+- if (ret) {
+- limit->min = default_limit.min;
+- ret = __xa_alloc(xa, id, entry, *limit, GFP_KERNEL);
+- if (ret)
+- return ret;
+- }
+-
+- limit->min = *id + 1;
+- if (limit->min > limit->max)
+- limit->min = default_limit.min;
+-
+- return ret;
+-}
+-
+ static struct ivpu_cmdq *ivpu_cmdq_alloc(struct ivpu_file_priv *file_priv)
+ {
+ struct ivpu_device *vdev = file_priv->vdev;
+@@ -105,11 +85,9 @@ static struct ivpu_cmdq *ivpu_cmdq_alloc
+ if (!cmdq)
+ return NULL;
+
+- xa_lock(&vdev->db_xa); /* lock here to protect db_limit */
+- ret = ivpu_id_alloc(&vdev->db_xa, &cmdq->db_id, NULL, &vdev->db_limit,
+- vdev->default_db_limit);
+- xa_unlock(&vdev->db_xa);
+- if (ret) {
++ ret = xa_alloc_cyclic(&vdev->db_xa, &cmdq->db_id, NULL, vdev->db_limit, &vdev->db_next,
++ GFP_KERNEL);
++ if (ret < 0) {
+ ivpu_err(vdev, "Failed to allocate doorbell id: %d\n", ret);
+ goto err_free_cmdq;
+ }
+@@ -559,9 +537,9 @@ static int ivpu_job_submit(struct ivpu_j
+
+ xa_lock(&vdev->submitted_jobs_xa);
+ is_first_job = xa_empty(&vdev->submitted_jobs_xa);
+- ret = ivpu_id_alloc(&vdev->submitted_jobs_xa, &job->job_id, job, &file_priv->job_limit,
+- file_priv->default_job_limit);
+- if (ret) {
++ ret = __xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit,
++ &file_priv->job_id_next, GFP_KERNEL);
++ if (ret < 0) {
+ ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n",
+ file_priv->ctx.id);
+ ret = -EBUSY;