From 53bd03d86ee941ea1f4231bb921c9c3f2f13d43d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 20 May 2025 11:10:15 +0200 Subject: [PATCH] 6.14-stable patches added patches: accel-ivpu-dump-only-first-mmu-fault-from-single-context.patch accel-ivpu-fix-missing-mmu-events-from-reserved-ssid.patch accel-ivpu-fix-missing-mmu-events-if-file_priv-is-unbound.patch accel-ivpu-flush-pending-jobs-of-device-s-workqueues.patch accel-ivpu-move-parts-of-mmu-event-irq-handling-to-thread-handler.patch accel-ivpu-use-workqueue-for-irq-handling.patch drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch --- ...-first-mmu-fault-from-single-context.patch | 135 +++++++++ ...issing-mmu-events-from-reserved-ssid.patch | 50 ++++ ...g-mmu-events-if-file_priv-is-unbound.patch | 48 +++ ...-pending-jobs-of-device-s-workqueues.patch | 49 ++++ ...event-irq-handling-to-thread-handler.patch | 248 ++++++++++++++++ ...-ivpu-use-workqueue-for-irq-handling.patch | 277 ++++++++++++++++++ ...h-the-gsc-worker-from-the-reset-path.patch | 179 +++++++++++ ...dition-in-unaccepted-memory-handling.patch | 132 +++++++++ queue-6.14/series | 8 + 9 files changed, 1126 insertions(+) create mode 100644 queue-6.14/accel-ivpu-dump-only-first-mmu-fault-from-single-context.patch create mode 100644 queue-6.14/accel-ivpu-fix-missing-mmu-events-from-reserved-ssid.patch create mode 100644 queue-6.14/accel-ivpu-fix-missing-mmu-events-if-file_priv-is-unbound.patch create mode 100644 queue-6.14/accel-ivpu-flush-pending-jobs-of-device-s-workqueues.patch create mode 100644 queue-6.14/accel-ivpu-move-parts-of-mmu-event-irq-handling-to-thread-handler.patch create mode 100644 queue-6.14/accel-ivpu-use-workqueue-for-irq-handling.patch create mode 100644 queue-6.14/drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch create mode 100644 queue-6.14/mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch diff --git 
a/queue-6.14/accel-ivpu-dump-only-first-mmu-fault-from-single-context.patch b/queue-6.14/accel-ivpu-dump-only-first-mmu-fault-from-single-context.patch new file mode 100644 index 0000000000..8cf39ca543 --- /dev/null +++ b/queue-6.14/accel-ivpu-dump-only-first-mmu-fault-from-single-context.patch @@ -0,0 +1,135 @@ +From 0240fa18d247c99a1967f2fed025296a89a1c5f5 Mon Sep 17 00:00:00 2001 +From: Karol Wachowski +Date: Tue, 7 Jan 2025 18:32:29 +0100 +Subject: accel/ivpu: Dump only first MMU fault from single context + +From: Karol Wachowski + +commit 0240fa18d247c99a1967f2fed025296a89a1c5f5 upstream. + +Stop dumping consecutive faults from an already faulty context immediately, +instead of waiting for the context abort thread handler (IRQ handler bottom +half) to abort currently executing jobs. + +Remove 'R' (record events) bit from context descriptor of a faulty +context to prevent future faults generation. + +This change speeds up the IRQ handler by eliminating the need to print the +fault content repeatedly. Additionally, it prevents flooding dmesg with +errors, which was occurring due to the delay in the bottom half of the +handler stopping fault-generating jobs. 
+ +Signed-off-by: Karol Wachowski +Signed-off-by: Maciej Falkowski +Reviewed-by: Jacek Lawrynowicz +Signed-off-by: Jacek Lawrynowicz +Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-7-maciej.falkowski@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_mmu.c | 51 ++++++++++++++++++++++++++++++---- + drivers/accel/ivpu/ivpu_mmu_context.c | 13 -------- + drivers/accel/ivpu/ivpu_mmu_context.h | 2 - + 3 files changed, 46 insertions(+), 20 deletions(-) + +--- a/drivers/accel/ivpu/ivpu_mmu.c ++++ b/drivers/accel/ivpu/ivpu_mmu.c +@@ -870,23 +870,64 @@ static u32 *ivpu_mmu_get_event(struct iv + return evt; + } + ++static int ivpu_mmu_disable_events(struct ivpu_device *vdev, u32 ssid) ++{ ++ struct ivpu_mmu_info *mmu = vdev->mmu; ++ struct ivpu_mmu_cdtab *cdtab = &mmu->cdtab; ++ u64 *entry; ++ u64 val; ++ ++ if (ssid > IVPU_MMU_CDTAB_ENT_COUNT) ++ return -EINVAL; ++ ++ entry = cdtab->base + (ssid * IVPU_MMU_CDTAB_ENT_SIZE); ++ ++ val = READ_ONCE(entry[0]); ++ val &= ~IVPU_MMU_CD_0_R; ++ WRITE_ONCE(entry[0], val); ++ ++ if (!ivpu_is_force_snoop_enabled(vdev)) ++ clflush_cache_range(entry, IVPU_MMU_CDTAB_ENT_SIZE); ++ ++ ivpu_mmu_cmdq_write_cfgi_all(vdev); ++ ++ return 0; ++} ++ + void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev) + { ++ struct ivpu_file_priv *file_priv; ++ u32 last_ssid = -1; + u32 *event; + u32 ssid; + + ivpu_dbg(vdev, IRQ, "MMU event queue\n"); + +- while ((event = ivpu_mmu_get_event(vdev)) != NULL) { +- ivpu_mmu_dump_event(vdev, event); +- ++ while ((event = ivpu_mmu_get_event(vdev))) { + ssid = FIELD_GET(IVPU_MMU_EVT_SSID_MASK, event[0]); ++ ++ if (ssid == last_ssid) ++ continue; ++ ++ xa_lock(&vdev->context_xa); ++ file_priv = xa_load(&vdev->context_xa, ssid); ++ if (file_priv) { ++ if (file_priv->has_mmu_faults) { ++ event = NULL; ++ } else { ++ ivpu_mmu_disable_events(vdev, ssid); ++ file_priv->has_mmu_faults = true; ++ } ++ } ++ xa_unlock(&vdev->context_xa); ++ ++ if (event) ++ 
ivpu_mmu_dump_event(vdev, event); ++ + if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) { + ivpu_pm_trigger_recovery(vdev, "MMU event"); + return; + } +- +- ivpu_mmu_user_context_mark_invalid(vdev, ssid); + REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, vdev->mmu->evtq.cons); + } + +--- a/drivers/accel/ivpu/ivpu_mmu_context.c ++++ b/drivers/accel/ivpu/ivpu_mmu_context.c +@@ -635,16 +635,3 @@ void ivpu_mmu_reserved_context_fini(stru + ivpu_mmu_cd_clear(vdev, vdev->rctx.id); + ivpu_mmu_context_fini(vdev, &vdev->rctx); + } +- +-void ivpu_mmu_user_context_mark_invalid(struct ivpu_device *vdev, u32 ssid) +-{ +- struct ivpu_file_priv *file_priv; +- +- xa_lock(&vdev->context_xa); +- +- file_priv = xa_load(&vdev->context_xa, ssid); +- if (file_priv) +- file_priv->has_mmu_faults = true; +- +- xa_unlock(&vdev->context_xa); +-} +--- a/drivers/accel/ivpu/ivpu_mmu_context.h ++++ b/drivers/accel/ivpu/ivpu_mmu_context.h +@@ -37,8 +37,6 @@ void ivpu_mmu_global_context_fini(struct + int ivpu_mmu_reserved_context_init(struct ivpu_device *vdev); + void ivpu_mmu_reserved_context_fini(struct ivpu_device *vdev); + +-void ivpu_mmu_user_context_mark_invalid(struct ivpu_device *vdev, u32 ssid); +- + int ivpu_mmu_context_insert_node(struct ivpu_mmu_context *ctx, const struct ivpu_addr_range *range, + u64 size, struct drm_mm_node *node); + void ivpu_mmu_context_remove_node(struct ivpu_mmu_context *ctx, struct drm_mm_node *node); diff --git a/queue-6.14/accel-ivpu-fix-missing-mmu-events-from-reserved-ssid.patch b/queue-6.14/accel-ivpu-fix-missing-mmu-events-from-reserved-ssid.patch new file mode 100644 index 0000000000..549f0f1c4d --- /dev/null +++ b/queue-6.14/accel-ivpu-fix-missing-mmu-events-from-reserved-ssid.patch @@ -0,0 +1,50 @@ +From 353b8f48390d36b39276ff6af61464ec64cd4d5c Mon Sep 17 00:00:00 2001 +From: Karol Wachowski +Date: Tue, 7 Jan 2025 18:32:31 +0100 +Subject: accel/ivpu: Fix missing MMU events from reserved SSID + +From: Karol Wachowski + +commit 353b8f48390d36b39276ff6af61464ec64cd4d5c 
upstream. + +Generate recovery when fault from reserved context is detected. +Add Abort (A) bit to reserved (1) SSID to ensure NPU also receives a fault. + +There is no way to create a file_priv with reserved SSID +but it is still possible to receive MMU faults from that SSID +as it is a default NPU HW setting. Such situation will occur if +FW freed context related resources but still performed access to DRAM. + +Signed-off-by: Karol Wachowski +Signed-off-by: Maciej Falkowski +Reviewed-by: Jacek Lawrynowicz +Signed-off-by: Jacek Lawrynowicz +Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-9-maciej.falkowski@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_mmu.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/drivers/accel/ivpu/ivpu_mmu.c ++++ b/drivers/accel/ivpu/ivpu_mmu.c +@@ -725,8 +725,8 @@ static int ivpu_mmu_cdtab_entry_set(stru + cd[2] = 0; + cd[3] = 0x0000000000007444; + +- /* For global context generate memory fault on VPU */ +- if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) ++ /* For global and reserved contexts generate memory fault on VPU */ ++ if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID || ssid == IVPU_RESERVED_CONTEXT_MMU_SSID) + cd[0] |= IVPU_MMU_CD_0_A; + + if (valid) +@@ -945,7 +945,8 @@ void ivpu_mmu_irq_evtq_handler(struct iv + + while ((event = ivpu_mmu_get_event(vdev))) { + ssid = FIELD_GET(IVPU_MMU_EVT_SSID_MASK, *event); +- if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) { ++ if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID || ++ ssid == IVPU_RESERVED_CONTEXT_MMU_SSID) { + ivpu_mmu_dump_event(vdev, event); + ivpu_pm_trigger_recovery(vdev, "MMU event"); + return; diff --git a/queue-6.14/accel-ivpu-fix-missing-mmu-events-if-file_priv-is-unbound.patch b/queue-6.14/accel-ivpu-fix-missing-mmu-events-if-file_priv-is-unbound.patch new file mode 100644 index 0000000000..1243c9da58 --- /dev/null +++ b/queue-6.14/accel-ivpu-fix-missing-mmu-events-if-file_priv-is-unbound.patch @@ -0,0 +1,48 @@ 
+From 2f5bbea1807a064a1e4c1b385c8cea4f37bb4b17 Mon Sep 17 00:00:00 2001 +From: Karol Wachowski +Date: Wed, 29 Jan 2025 13:56:33 +0100 +Subject: accel/ivpu: Fix missing MMU events if file_priv is unbound + +From: Karol Wachowski + +commit 2f5bbea1807a064a1e4c1b385c8cea4f37bb4b17 upstream. + +Move the ivpu_mmu_discard_events() function to the common portion of +the abort work function. This ensures it is called only once, even if +there are no faulty contexts in context_xa, to guarantee that MMU events +are discarded and new events are not missed. + +Reviewed-by: Jacek Lawrynowicz +Signed-off-by: Karol Wachowski +Reviewed-by: Jeffrey Hugo +Signed-off-by: Jacek Lawrynowicz +Link: https://patchwork.freedesktop.org/patch/msgid/20250129125636.1047413-4-jacek.lawrynowicz@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_job.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/accel/ivpu/ivpu_job.c ++++ b/drivers/accel/ivpu/ivpu_job.c +@@ -369,7 +369,6 @@ void ivpu_context_abort_locked(struct iv + ivpu_jsm_context_release(vdev, file_priv->ctx.id); + + ivpu_mmu_disable_ssid_events(vdev, file_priv->ctx.id); +- ivpu_mmu_discard_events(vdev); + + file_priv->aborted = true; + } +@@ -872,6 +871,13 @@ void ivpu_context_abort_work_fn(struct w + } + mutex_unlock(&vdev->context_list_lock); + ++ /* ++ * We will not receive new MMU event interrupts until existing events are discarded ++ * however, we want to discard these events only after aborting the faulty context ++ * to avoid generating new faults from that context ++ */ ++ ivpu_mmu_discard_events(vdev); ++ + if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW) + return; + diff --git a/queue-6.14/accel-ivpu-flush-pending-jobs-of-device-s-workqueues.patch b/queue-6.14/accel-ivpu-flush-pending-jobs-of-device-s-workqueues.patch new file mode 100644 index 0000000000..d5c2b48165 --- /dev/null +++ b/queue-6.14/accel-ivpu-flush-pending-jobs-of-device-s-workqueues.patch @@ -0,0 
+1,49 @@ +From 683e9fa1c885a0cffbc10b459a7eee9df92af1c1 Mon Sep 17 00:00:00 2001 +From: Maciej Falkowski +Date: Tue, 1 Apr 2025 17:57:55 +0200 +Subject: accel/ivpu: Flush pending jobs of device's workqueues + +From: Maciej Falkowski + +commit 683e9fa1c885a0cffbc10b459a7eee9df92af1c1 upstream. + +Use flush_work() instead of cancel_work_sync() for driver IRQ +workqueues to guarantee that remaining pending work +will be handled. + +This resolves two issues that were encountered where a driver was left +in an incorrect state as the bottom-half was canceled: + +1. Cancelling context-abort of a job that is still executing and + is causing translation faults which is going to cause additional TDRs + +2. Cancelling bottom-half of a DCT (duty-cycle throttling) request + which will cause a device to not be adjusted to an external frequency + request. + +Fixes: bc3e5f48b7ee ("accel/ivpu: Use workqueue for IRQ handling") +Signed-off-by: Maciej Falkowski +Reviewed-by: Lizhi Hou +Reviewed-by: Jeff Hugo +Signed-off-by: Jacek Lawrynowicz +Link: https://lore.kernel.org/r/20250401155755.4049156-1-maciej.falkowski@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_drv.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/accel/ivpu/ivpu_drv.c ++++ b/drivers/accel/ivpu/ivpu_drv.c +@@ -420,9 +420,9 @@ void ivpu_prepare_for_reset(struct ivpu_ + { + ivpu_hw_irq_disable(vdev); + disable_irq(vdev->irq); +- cancel_work_sync(&vdev->irq_ipc_work); +- cancel_work_sync(&vdev->irq_dct_work); +- cancel_work_sync(&vdev->context_abort_work); ++ flush_work(&vdev->irq_ipc_work); ++ flush_work(&vdev->irq_dct_work); ++ flush_work(&vdev->context_abort_work); + ivpu_ipc_disable(vdev); + ivpu_mmu_disable(vdev); + } diff --git a/queue-6.14/accel-ivpu-move-parts-of-mmu-event-irq-handling-to-thread-handler.patch b/queue-6.14/accel-ivpu-move-parts-of-mmu-event-irq-handling-to-thread-handler.patch new file mode 100644 index 0000000000..bbf18ec723 --- 
/dev/null +++ b/queue-6.14/accel-ivpu-move-parts-of-mmu-event-irq-handling-to-thread-handler.patch @@ -0,0 +1,248 @@ +From 4480912f3f8b8a1fbb5ae12c5c547fd094ec4197 Mon Sep 17 00:00:00 2001 +From: Karol Wachowski +Date: Tue, 7 Jan 2025 18:32:30 +0100 +Subject: accel/ivpu: Move parts of MMU event IRQ handling to thread handler + +From: Karol Wachowski + +commit 4480912f3f8b8a1fbb5ae12c5c547fd094ec4197 upstream. + +To prevent looping infinitely in MMU event handler we stop +generating new events by removing 'R' (record) bit from context +descriptor, but to ensure this change has effect KMD has to perform +configuration invalidation followed by sync command. + +Because of that move parts of the interrupt handler that can take longer +to a thread not to block in interrupt handler for too long. +This includes: + * disabling event queue for the time KMD updates MMU event queue consumer + to ensure proper synchronization between MMU and KMD + + * removal of 'R' (record) bit from context descriptor to ensure no more + faults are recorded until that context is destroyed + +Signed-off-by: Karol Wachowski +Signed-off-by: Maciej Falkowski +Reviewed-by: Jacek Lawrynowicz +Signed-off-by: Jacek Lawrynowicz +Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-8-maciej.falkowski@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_job.c | 7 ++- + drivers/accel/ivpu/ivpu_mmu.c | 93 +++++++++++++++++++++++++++--------------- + drivers/accel/ivpu/ivpu_mmu.h | 2 + 3 files changed, 69 insertions(+), 33 deletions(-) + +--- a/drivers/accel/ivpu/ivpu_job.c ++++ b/drivers/accel/ivpu/ivpu_job.c +@@ -17,6 +17,7 @@ + #include "ivpu_ipc.h" + #include "ivpu_job.h" + #include "ivpu_jsm_msg.h" ++#include "ivpu_mmu.h" + #include "ivpu_pm.h" + #include "ivpu_trace.h" + #include "vpu_boot_api.h" +@@ -360,12 +361,16 @@ void ivpu_context_abort_locked(struct iv + struct ivpu_device *vdev = file_priv->vdev; + + lockdep_assert_held(&file_priv->lock); ++ 
ivpu_dbg(vdev, JOB, "Context ID: %u abort\n", file_priv->ctx.id); + + ivpu_cmdq_fini_all(file_priv); + + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) + ivpu_jsm_context_release(vdev, file_priv->ctx.id); + ++ ivpu_mmu_disable_ssid_events(vdev, file_priv->ctx.id); ++ ivpu_mmu_discard_events(vdev); ++ + file_priv->aborted = true; + } + +@@ -849,8 +854,8 @@ void ivpu_context_abort_work_fn(struct w + { + struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work); + struct ivpu_file_priv *file_priv; +- unsigned long ctx_id; + struct ivpu_job *job; ++ unsigned long ctx_id; + unsigned long id; + + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) +--- a/drivers/accel/ivpu/ivpu_mmu.c ++++ b/drivers/accel/ivpu/ivpu_mmu.c +@@ -20,6 +20,12 @@ + #define IVPU_MMU_REG_CR0 0x00200020u + #define IVPU_MMU_REG_CR0ACK 0x00200024u + #define IVPU_MMU_REG_CR0ACK_VAL_MASK GENMASK(31, 0) ++#define IVPU_MMU_REG_CR0_ATSCHK_MASK BIT(4) ++#define IVPU_MMU_REG_CR0_CMDQEN_MASK BIT(3) ++#define IVPU_MMU_REG_CR0_EVTQEN_MASK BIT(2) ++#define IVPU_MMU_REG_CR0_PRIQEN_MASK BIT(1) ++#define IVPU_MMU_REG_CR0_SMMUEN_MASK BIT(0) ++ + #define IVPU_MMU_REG_CR1 0x00200028u + #define IVPU_MMU_REG_CR2 0x0020002cu + #define IVPU_MMU_REG_IRQ_CTRL 0x00200050u +@@ -141,12 +147,6 @@ + #define IVPU_MMU_IRQ_EVTQ_EN BIT(2) + #define IVPU_MMU_IRQ_GERROR_EN BIT(0) + +-#define IVPU_MMU_CR0_ATSCHK BIT(4) +-#define IVPU_MMU_CR0_CMDQEN BIT(3) +-#define IVPU_MMU_CR0_EVTQEN BIT(2) +-#define IVPU_MMU_CR0_PRIQEN BIT(1) +-#define IVPU_MMU_CR0_SMMUEN BIT(0) +- + #define IVPU_MMU_CR1_TABLE_SH GENMASK(11, 10) + #define IVPU_MMU_CR1_TABLE_OC GENMASK(9, 8) + #define IVPU_MMU_CR1_TABLE_IC GENMASK(7, 6) +@@ -596,7 +596,7 @@ static int ivpu_mmu_reset(struct ivpu_de + REGV_WR32(IVPU_MMU_REG_CMDQ_PROD, 0); + REGV_WR32(IVPU_MMU_REG_CMDQ_CONS, 0); + +- val = IVPU_MMU_CR0_CMDQEN; ++ val = REG_SET_FLD(IVPU_MMU_REG_CR0, CMDQEN, 0); + ret = ivpu_mmu_reg_write_cr0(vdev, val); + if (ret) + return ret; +@@ 
-617,12 +617,12 @@ static int ivpu_mmu_reset(struct ivpu_de + REGV_WR32(IVPU_MMU_REG_EVTQ_PROD_SEC, 0); + REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, 0); + +- val |= IVPU_MMU_CR0_EVTQEN; ++ val = REG_SET_FLD(IVPU_MMU_REG_CR0, EVTQEN, val); + ret = ivpu_mmu_reg_write_cr0(vdev, val); + if (ret) + return ret; + +- val |= IVPU_MMU_CR0_ATSCHK; ++ val = REG_SET_FLD(IVPU_MMU_REG_CR0, ATSCHK, val); + ret = ivpu_mmu_reg_write_cr0(vdev, val); + if (ret) + return ret; +@@ -631,7 +631,7 @@ static int ivpu_mmu_reset(struct ivpu_de + if (ret) + return ret; + +- val |= IVPU_MMU_CR0_SMMUEN; ++ val = REG_SET_FLD(IVPU_MMU_REG_CR0, SMMUEN, val); + return ivpu_mmu_reg_write_cr0(vdev, val); + } + +@@ -870,7 +870,47 @@ static u32 *ivpu_mmu_get_event(struct iv + return evt; + } + +-static int ivpu_mmu_disable_events(struct ivpu_device *vdev, u32 ssid) ++static int ivpu_mmu_evtq_set(struct ivpu_device *vdev, bool enable) ++{ ++ u32 val = REGV_RD32(IVPU_MMU_REG_CR0); ++ ++ if (enable) ++ val = REG_SET_FLD(IVPU_MMU_REG_CR0, EVTQEN, val); ++ else ++ val = REG_CLR_FLD(IVPU_MMU_REG_CR0, EVTQEN, val); ++ REGV_WR32(IVPU_MMU_REG_CR0, val); ++ ++ return REGV_POLL_FLD(IVPU_MMU_REG_CR0ACK, VAL, val, IVPU_MMU_REG_TIMEOUT_US); ++} ++ ++static int ivpu_mmu_evtq_enable(struct ivpu_device *vdev) ++{ ++ return ivpu_mmu_evtq_set(vdev, true); ++} ++ ++static int ivpu_mmu_evtq_disable(struct ivpu_device *vdev) ++{ ++ return ivpu_mmu_evtq_set(vdev, false); ++} ++ ++void ivpu_mmu_discard_events(struct ivpu_device *vdev) ++{ ++ /* ++ * Disable event queue (stop MMU from updating the producer) ++ * to allow synchronization of consumer and producer indexes ++ */ ++ ivpu_mmu_evtq_disable(vdev); ++ ++ vdev->mmu->evtq.cons = REGV_RD32(IVPU_MMU_REG_EVTQ_PROD_SEC); ++ REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, vdev->mmu->evtq.cons); ++ vdev->mmu->evtq.prod = REGV_RD32(IVPU_MMU_REG_EVTQ_PROD_SEC); ++ ++ ivpu_mmu_evtq_enable(vdev); ++ ++ drm_WARN_ON_ONCE(&vdev->drm, vdev->mmu->evtq.cons != vdev->mmu->evtq.prod); ++} ++ ++int 
ivpu_mmu_disable_ssid_events(struct ivpu_device *vdev, u32 ssid) + { + struct ivpu_mmu_info *mmu = vdev->mmu; + struct ivpu_mmu_cdtab *cdtab = &mmu->cdtab; +@@ -890,6 +930,7 @@ static int ivpu_mmu_disable_events(struc + clflush_cache_range(entry, IVPU_MMU_CDTAB_ENT_SIZE); + + ivpu_mmu_cmdq_write_cfgi_all(vdev); ++ ivpu_mmu_cmdq_sync(vdev); + + return 0; + } +@@ -897,38 +938,26 @@ static int ivpu_mmu_disable_events(struc + void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev) + { + struct ivpu_file_priv *file_priv; +- u32 last_ssid = -1; + u32 *event; + u32 ssid; + + ivpu_dbg(vdev, IRQ, "MMU event queue\n"); + + while ((event = ivpu_mmu_get_event(vdev))) { +- ssid = FIELD_GET(IVPU_MMU_EVT_SSID_MASK, event[0]); +- +- if (ssid == last_ssid) +- continue; ++ ssid = FIELD_GET(IVPU_MMU_EVT_SSID_MASK, *event); ++ if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) { ++ ivpu_mmu_dump_event(vdev, event); ++ ivpu_pm_trigger_recovery(vdev, "MMU event"); ++ return; ++ } + +- xa_lock(&vdev->context_xa); + file_priv = xa_load(&vdev->context_xa, ssid); + if (file_priv) { +- if (file_priv->has_mmu_faults) { +- event = NULL; +- } else { +- ivpu_mmu_disable_events(vdev, ssid); +- file_priv->has_mmu_faults = true; ++ if (!READ_ONCE(file_priv->has_mmu_faults)) { ++ ivpu_mmu_dump_event(vdev, event); ++ WRITE_ONCE(file_priv->has_mmu_faults, true); + } + } +- xa_unlock(&vdev->context_xa); +- +- if (event) +- ivpu_mmu_dump_event(vdev, event); +- +- if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) { +- ivpu_pm_trigger_recovery(vdev, "MMU event"); +- return; +- } +- REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, vdev->mmu->evtq.cons); + } + + queue_work(system_wq, &vdev->context_abort_work); +--- a/drivers/accel/ivpu/ivpu_mmu.h ++++ b/drivers/accel/ivpu/ivpu_mmu.h +@@ -47,5 +47,7 @@ int ivpu_mmu_invalidate_tlb(struct ivpu_ + void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev); + void ivpu_mmu_irq_gerr_handler(struct ivpu_device *vdev); + void ivpu_mmu_evtq_dump(struct ivpu_device *vdev); ++void 
ivpu_mmu_discard_events(struct ivpu_device *vdev); ++int ivpu_mmu_disable_ssid_events(struct ivpu_device *vdev, u32 ssid); + + #endif /* __IVPU_MMU_H__ */ diff --git a/queue-6.14/accel-ivpu-use-workqueue-for-irq-handling.patch b/queue-6.14/accel-ivpu-use-workqueue-for-irq-handling.patch new file mode 100644 index 0000000000..1ba99f02bf --- /dev/null +++ b/queue-6.14/accel-ivpu-use-workqueue-for-irq-handling.patch @@ -0,0 +1,277 @@ +From bc3e5f48b7ee021371dc37297678f7089be6ce28 Mon Sep 17 00:00:00 2001 +From: Maciej Falkowski +Date: Tue, 7 Jan 2025 18:32:28 +0100 +Subject: accel/ivpu: Use workqueue for IRQ handling + +From: Maciej Falkowski + +commit bc3e5f48b7ee021371dc37297678f7089be6ce28 upstream. + +Convert IRQ bottom half from the thread handler into workqueue. +This increases a stability in rare scenarios where driver on +debugging/hardening kernels processes IRQ too slow and misses +some interrupts due to it. +Workqueue handler also gives a very minor performance increase. + +Signed-off-by: Maciej Falkowski +Reviewed-by: Jacek Lawrynowicz +Signed-off-by: Jacek Lawrynowicz +Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-6-maciej.falkowski@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_drv.c | 39 +++++++++----------------------------- + drivers/accel/ivpu/ivpu_drv.h | 5 +++- + drivers/accel/ivpu/ivpu_hw.c | 5 ---- + drivers/accel/ivpu/ivpu_hw.h | 9 -------- + drivers/accel/ivpu/ivpu_hw_btrs.c | 3 -- + drivers/accel/ivpu/ivpu_ipc.c | 7 ++---- + drivers/accel/ivpu/ivpu_ipc.h | 2 - + drivers/accel/ivpu/ivpu_job.c | 2 - + drivers/accel/ivpu/ivpu_job.h | 2 - + drivers/accel/ivpu/ivpu_pm.c | 3 +- + drivers/accel/ivpu/ivpu_pm.h | 2 - + 11 files changed, 24 insertions(+), 55 deletions(-) + +--- a/drivers/accel/ivpu/ivpu_drv.c ++++ b/drivers/accel/ivpu/ivpu_drv.c +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -419,6 +420,9 @@ void ivpu_prepare_for_reset(struct ivpu_ + 
{ + ivpu_hw_irq_disable(vdev); + disable_irq(vdev->irq); ++ cancel_work_sync(&vdev->irq_ipc_work); ++ cancel_work_sync(&vdev->irq_dct_work); ++ cancel_work_sync(&vdev->context_abort_work); + ivpu_ipc_disable(vdev); + ivpu_mmu_disable(vdev); + } +@@ -463,31 +467,6 @@ static const struct drm_driver driver = + .major = 1, + }; + +-static irqreturn_t ivpu_irq_thread_handler(int irq, void *arg) +-{ +- struct ivpu_device *vdev = arg; +- u8 irq_src; +- +- if (kfifo_is_empty(&vdev->hw->irq.fifo)) +- return IRQ_NONE; +- +- while (kfifo_get(&vdev->hw->irq.fifo, &irq_src)) { +- switch (irq_src) { +- case IVPU_HW_IRQ_SRC_IPC: +- ivpu_ipc_irq_thread_handler(vdev); +- break; +- case IVPU_HW_IRQ_SRC_DCT: +- ivpu_pm_dct_irq_thread_handler(vdev); +- break; +- default: +- ivpu_err_ratelimited(vdev, "Unknown IRQ source: %u\n", irq_src); +- break; +- } +- } +- +- return IRQ_HANDLED; +-} +- + static int ivpu_irq_init(struct ivpu_device *vdev) + { + struct pci_dev *pdev = to_pci_dev(vdev->drm.dev); +@@ -499,12 +478,16 @@ static int ivpu_irq_init(struct ivpu_dev + return ret; + } + ++ INIT_WORK(&vdev->irq_ipc_work, ivpu_ipc_irq_work_fn); ++ INIT_WORK(&vdev->irq_dct_work, ivpu_pm_irq_dct_work_fn); ++ INIT_WORK(&vdev->context_abort_work, ivpu_context_abort_work_fn); ++ + ivpu_irq_handlers_init(vdev); + + vdev->irq = pci_irq_vector(pdev, 0); + +- ret = devm_request_threaded_irq(vdev->drm.dev, vdev->irq, ivpu_hw_irq_handler, +- ivpu_irq_thread_handler, IRQF_NO_AUTOEN, DRIVER_NAME, vdev); ++ ret = devm_request_irq(vdev->drm.dev, vdev->irq, ivpu_hw_irq_handler, ++ IRQF_NO_AUTOEN, DRIVER_NAME, vdev); + if (ret) + ivpu_err(vdev, "Failed to request an IRQ %d\n", ret); + +@@ -597,8 +580,6 @@ static int ivpu_dev_init(struct ivpu_dev + vdev->db_limit.min = IVPU_MIN_DB; + vdev->db_limit.max = IVPU_MAX_DB; + +- INIT_WORK(&vdev->context_abort_work, ivpu_context_abort_thread_handler); +- + ret = drmm_mutex_init(&vdev->drm, &vdev->context_list_lock); + if (ret) + goto err_xa_destroy; +--- 
a/drivers/accel/ivpu/ivpu_drv.h ++++ b/drivers/accel/ivpu/ivpu_drv.h +@@ -137,12 +137,15 @@ struct ivpu_device { + struct mutex context_list_lock; /* Protects user context addition/removal */ + struct xarray context_xa; + struct xa_limit context_xa_limit; +- struct work_struct context_abort_work; + + struct xarray db_xa; + struct xa_limit db_limit; + u32 db_next; + ++ struct work_struct irq_ipc_work; ++ struct work_struct irq_dct_work; ++ struct work_struct context_abort_work; ++ + struct mutex bo_list_lock; /* Protects bo_list */ + struct list_head bo_list; + +--- a/drivers/accel/ivpu/ivpu_hw.c ++++ b/drivers/accel/ivpu/ivpu_hw.c +@@ -285,8 +285,6 @@ void ivpu_hw_profiling_freq_drive(struct + + void ivpu_irq_handlers_init(struct ivpu_device *vdev) + { +- INIT_KFIFO(vdev->hw->irq.fifo); +- + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + vdev->hw->irq.ip_irq_handler = ivpu_hw_ip_irq_handler_37xx; + else +@@ -300,7 +298,6 @@ void ivpu_irq_handlers_init(struct ivpu_ + + void ivpu_hw_irq_enable(struct ivpu_device *vdev) + { +- kfifo_reset(&vdev->hw->irq.fifo); + ivpu_hw_ip_irq_enable(vdev); + ivpu_hw_btrs_irq_enable(vdev); + } +@@ -327,8 +324,6 @@ irqreturn_t ivpu_hw_irq_handler(int irq, + /* Re-enable global interrupts to re-trigger MSI for pending interrupts */ + ivpu_hw_btrs_global_int_enable(vdev); + +- if (!kfifo_is_empty(&vdev->hw->irq.fifo)) +- return IRQ_WAKE_THREAD; + if (ip_handled || btrs_handled) + return IRQ_HANDLED; + return IRQ_NONE; +--- a/drivers/accel/ivpu/ivpu_hw.h ++++ b/drivers/accel/ivpu/ivpu_hw.h +@@ -6,18 +6,10 @@ + #ifndef __IVPU_HW_H__ + #define __IVPU_HW_H__ + +-#include +- + #include "ivpu_drv.h" + #include "ivpu_hw_btrs.h" + #include "ivpu_hw_ip.h" + +-#define IVPU_HW_IRQ_FIFO_LENGTH 1024 +- +-#define IVPU_HW_IRQ_SRC_IPC 1 +-#define IVPU_HW_IRQ_SRC_MMU_EVTQ 2 +-#define IVPU_HW_IRQ_SRC_DCT 3 +- + struct ivpu_addr_range { + resource_size_t start; + resource_size_t end; +@@ -27,7 +19,6 @@ struct ivpu_hw_info { + struct { + bool 
(*btrs_irq_handler)(struct ivpu_device *vdev, int irq); + bool (*ip_irq_handler)(struct ivpu_device *vdev, int irq); +- DECLARE_KFIFO(fifo, u8, IVPU_HW_IRQ_FIFO_LENGTH); + } irq; + struct { + struct ivpu_addr_range global; +--- a/drivers/accel/ivpu/ivpu_hw_btrs.c ++++ b/drivers/accel/ivpu/ivpu_hw_btrs.c +@@ -666,8 +666,7 @@ bool ivpu_hw_btrs_irq_handler_lnl(struct + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, SURV_ERR, status)) { + ivpu_dbg(vdev, IRQ, "Survivability IRQ\n"); +- if (!kfifo_put(&vdev->hw->irq.fifo, IVPU_HW_IRQ_SRC_DCT)) +- ivpu_err_ratelimited(vdev, "IRQ FIFO full\n"); ++ queue_work(system_wq, &vdev->irq_dct_work); + } + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, FREQ_CHANGE, status)) { +--- a/drivers/accel/ivpu/ivpu_ipc.c ++++ b/drivers/accel/ivpu/ivpu_ipc.c +@@ -460,13 +460,12 @@ void ivpu_ipc_irq_handler(struct ivpu_de + } + } + +- if (!list_empty(&ipc->cb_msg_list)) +- if (!kfifo_put(&vdev->hw->irq.fifo, IVPU_HW_IRQ_SRC_IPC)) +- ivpu_err_ratelimited(vdev, "IRQ FIFO full\n"); ++ queue_work(system_wq, &vdev->irq_ipc_work); + } + +-void ivpu_ipc_irq_thread_handler(struct ivpu_device *vdev) ++void ivpu_ipc_irq_work_fn(struct work_struct *work) + { ++ struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_ipc_work); + struct ivpu_ipc_info *ipc = vdev->ipc; + struct ivpu_ipc_rx_msg *rx_msg, *r; + struct list_head cb_msg_list; +--- a/drivers/accel/ivpu/ivpu_ipc.h ++++ b/drivers/accel/ivpu/ivpu_ipc.h +@@ -90,7 +90,7 @@ void ivpu_ipc_disable(struct ivpu_device + void ivpu_ipc_reset(struct ivpu_device *vdev); + + void ivpu_ipc_irq_handler(struct ivpu_device *vdev); +-void ivpu_ipc_irq_thread_handler(struct ivpu_device *vdev); ++void ivpu_ipc_irq_work_fn(struct work_struct *work); + + void ivpu_ipc_consumer_add(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, + u32 channel, ivpu_ipc_rx_callback_t callback); +--- a/drivers/accel/ivpu/ivpu_job.c ++++ b/drivers/accel/ivpu/ivpu_job.c +@@ -845,7 +845,7 @@ void 
ivpu_job_done_consumer_fini(struct + ivpu_ipc_consumer_del(vdev, &vdev->job_done_consumer); + } + +-void ivpu_context_abort_thread_handler(struct work_struct *work) ++void ivpu_context_abort_work_fn(struct work_struct *work) + { + struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work); + struct ivpu_file_priv *file_priv; +--- a/drivers/accel/ivpu/ivpu_job.h ++++ b/drivers/accel/ivpu/ivpu_job.h +@@ -66,7 +66,7 @@ void ivpu_cmdq_reset_all_contexts(struct + + void ivpu_job_done_consumer_init(struct ivpu_device *vdev); + void ivpu_job_done_consumer_fini(struct ivpu_device *vdev); +-void ivpu_context_abort_thread_handler(struct work_struct *work); ++void ivpu_context_abort_work_fn(struct work_struct *work); + + void ivpu_jobs_abort_all(struct ivpu_device *vdev); + +--- a/drivers/accel/ivpu/ivpu_pm.c ++++ b/drivers/accel/ivpu/ivpu_pm.c +@@ -464,8 +464,9 @@ int ivpu_pm_dct_disable(struct ivpu_devi + return 0; + } + +-void ivpu_pm_dct_irq_thread_handler(struct ivpu_device *vdev) ++void ivpu_pm_irq_dct_work_fn(struct work_struct *work) + { ++ struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work); + bool enable; + int ret; + +--- a/drivers/accel/ivpu/ivpu_pm.h ++++ b/drivers/accel/ivpu/ivpu_pm.h +@@ -45,6 +45,6 @@ void ivpu_stop_job_timeout_detection(str + int ivpu_pm_dct_init(struct ivpu_device *vdev); + int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent); + int ivpu_pm_dct_disable(struct ivpu_device *vdev); +-void ivpu_pm_dct_irq_thread_handler(struct ivpu_device *vdev); ++void ivpu_pm_irq_dct_work_fn(struct work_struct *work); + + #endif /* __IVPU_PM_H__ */ diff --git a/queue-6.14/drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch b/queue-6.14/drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch new file mode 100644 index 0000000000..1163f3fd53 --- /dev/null +++ b/queue-6.14/drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch @@ -0,0 +1,179 @@ +From 
03552d8ac0afcc080c339faa0b726e2c0e9361cb Mon Sep 17 00:00:00 2001 +From: Daniele Ceraolo Spurio +Date: Fri, 2 May 2025 08:51:04 -0700 +Subject: drm/xe/gsc: do not flush the GSC worker from the reset path + +From: Daniele Ceraolo Spurio + +commit 03552d8ac0afcc080c339faa0b726e2c0e9361cb upstream. + +The workqueue used for the reset worker is marked as WQ_MEM_RECLAIM, +while the GSC one isn't (and can't be as we need to do memory +allocations in the gsc worker). Therefore, we can't flush the latter +from the former. + +The reason why we had such a flush was to avoid interrupting either +the GSC FW load or in progress GSC proxy operations. GSC proxy +operations fall into 2 categories: + +1) GSC proxy init: this only happens once immediately after GSC FW load + and does not support being interrupted. The only way to recover from + an interruption of the proxy init is to do an FLR and re-load the GSC. + +2) GSC proxy request: this can happen in response to a request that + the driver sends to the GSC. If this is interrupted, the GSC FW will + timeout and the driver request will be failed, but overall the GSC + will keep working fine. + +Flushing the work allowed us to avoid interruption in both cases (unless +the hang came from the GSC engine itself, in which case we're toast +anyway). However, a failure on a proxy request is tolerable if we're in +a scenario where we're triggering a GT reset (i.e., something is already +gone pretty wrong), so what we really need to avoid is interrupting +the init flow, which we can do by polling on the register that reports +when the proxy init is complete (as that ensure us that all the load and +init operations have been completed). + +Note that during suspend we still want to do a flush of the worker to +make sure it completes any operations involving the HW before the power +is cut. 
+ +v2: fix spelling in commit msg, rename waiter function (Julia) + +Fixes: dd0e89e5edc2 ("drm/xe/gsc: GSC FW load") +Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4830 +Signed-off-by: Daniele Ceraolo Spurio +Cc: John Harrison +Cc: Alan Previn +Cc: # v6.8+ +Reviewed-by: Julia Filipchuk +Link: https://lore.kernel.org/r/20250502155104.2201469-1-daniele.ceraolospurio@intel.com +(cherry picked from commit 12370bfcc4f0bdf70279ec5b570eb298963422b5) +Signed-off-by: Lucas De Marchi +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/xe/xe_gsc.c | 22 ++++++++++++++++++++++ + drivers/gpu/drm/xe/xe_gsc.h | 1 + + drivers/gpu/drm/xe/xe_gsc_proxy.c | 11 +++++++++++ + drivers/gpu/drm/xe/xe_gsc_proxy.h | 1 + + drivers/gpu/drm/xe/xe_gt.c | 2 +- + drivers/gpu/drm/xe/xe_uc.c | 8 +++++++- + drivers/gpu/drm/xe/xe_uc.h | 1 + + 7 files changed, 44 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/xe/xe_gsc.c ++++ b/drivers/gpu/drm/xe/xe_gsc.c +@@ -564,6 +564,28 @@ void xe_gsc_remove(struct xe_gsc *gsc) + xe_gsc_proxy_remove(gsc); + } + ++void xe_gsc_stop_prepare(struct xe_gsc *gsc) ++{ ++ struct xe_gt *gt = gsc_to_gt(gsc); ++ int ret; ++ ++ if (!xe_uc_fw_is_loadable(&gsc->fw) || xe_uc_fw_is_in_error_state(&gsc->fw)) ++ return; ++ ++ xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GSC); ++ ++ /* ++ * If the GSC FW load or the proxy init are interrupted, the only way ++ * to recover it is to do an FLR and reload the GSC from scratch. ++ * Therefore, let's wait for the init to complete before stopping ++ * operations. 
The proxy init is the last step, so we can just wait on ++ * that ++ */ ++ ret = xe_gsc_wait_for_proxy_init_done(gsc); ++ if (ret) ++ xe_gt_err(gt, "failed to wait for GSC init completion before uc stop\n"); ++} ++ + /* + * wa_14015076503: if the GSC FW is loaded, we need to alert it before doing a + * GSC engine reset by writing a notification bit in the GS1 register and then +--- a/drivers/gpu/drm/xe/xe_gsc.h ++++ b/drivers/gpu/drm/xe/xe_gsc.h +@@ -16,6 +16,7 @@ struct xe_hw_engine; + int xe_gsc_init(struct xe_gsc *gsc); + int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc); + void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc); ++void xe_gsc_stop_prepare(struct xe_gsc *gsc); + void xe_gsc_load_start(struct xe_gsc *gsc); + void xe_gsc_remove(struct xe_gsc *gsc); + void xe_gsc_hwe_irq_handler(struct xe_hw_engine *hwe, u16 intr_vec); +--- a/drivers/gpu/drm/xe/xe_gsc_proxy.c ++++ b/drivers/gpu/drm/xe/xe_gsc_proxy.c +@@ -71,6 +71,17 @@ bool xe_gsc_proxy_init_done(struct xe_gs + HECI1_FWSTS1_PROXY_STATE_NORMAL; + } + ++int xe_gsc_wait_for_proxy_init_done(struct xe_gsc *gsc) ++{ ++ struct xe_gt *gt = gsc_to_gt(gsc); ++ ++ /* Proxy init can take up to 500ms, so wait double that for safety */ ++ return xe_mmio_wait32(&gt->mmio, HECI_FWSTS1(MTL_GSC_HECI1_BASE), ++ HECI1_FWSTS1_CURRENT_STATE, ++ HECI1_FWSTS1_PROXY_STATE_NORMAL, ++ USEC_PER_SEC, NULL, false); ++} ++ + static void __gsc_proxy_irq_rmw(struct xe_gsc *gsc, u32 clr, u32 set) + { + struct xe_gt *gt = gsc_to_gt(gsc); +--- a/drivers/gpu/drm/xe/xe_gsc_proxy.h ++++ b/drivers/gpu/drm/xe/xe_gsc_proxy.h +@@ -13,6 +13,7 @@ struct xe_gsc; + int xe_gsc_proxy_init(struct xe_gsc *gsc); + bool xe_gsc_proxy_init_done(struct xe_gsc *gsc); + void xe_gsc_proxy_remove(struct xe_gsc *gsc); ++int xe_gsc_wait_for_proxy_init_done(struct xe_gsc *gsc); + int xe_gsc_proxy_start(struct xe_gsc *gsc); + + int xe_gsc_proxy_request_handler(struct xe_gsc *gsc); +--- a/drivers/gpu/drm/xe/xe_gt.c ++++ b/drivers/gpu/drm/xe/xe_gt.c +@@ -862,7
+862,7 @@ void xe_gt_suspend_prepare(struct xe_gt + + fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + +- xe_uc_stop_prepare(&gt->uc); ++ xe_uc_suspend_prepare(&gt->uc); + + xe_force_wake_put(gt_to_fw(gt), fw_ref); + } +--- a/drivers/gpu/drm/xe/xe_uc.c ++++ b/drivers/gpu/drm/xe/xe_uc.c +@@ -241,7 +241,7 @@ void xe_uc_gucrc_disable(struct xe_uc *u + + void xe_uc_stop_prepare(struct xe_uc *uc) + { +- xe_gsc_wait_for_worker_completion(&uc->gsc); ++ xe_gsc_stop_prepare(&uc->gsc); + xe_guc_stop_prepare(&uc->guc); + } + +@@ -275,6 +275,12 @@ again: + goto again; + } + ++void xe_uc_suspend_prepare(struct xe_uc *uc) ++{ ++ xe_gsc_wait_for_worker_completion(&uc->gsc); ++ xe_guc_stop_prepare(&uc->guc); ++} ++ + int xe_uc_suspend(struct xe_uc *uc) + { + /* GuC submission not enabled, nothing to do */ +--- a/drivers/gpu/drm/xe/xe_uc.h ++++ b/drivers/gpu/drm/xe/xe_uc.h +@@ -18,6 +18,7 @@ int xe_uc_reset_prepare(struct xe_uc *uc + void xe_uc_stop_prepare(struct xe_uc *uc); + void xe_uc_stop(struct xe_uc *uc); + int xe_uc_start(struct xe_uc *uc); ++void xe_uc_suspend_prepare(struct xe_uc *uc); + int xe_uc_suspend(struct xe_uc *uc); + int xe_uc_sanitize_reset(struct xe_uc *uc); + void xe_uc_remove(struct xe_uc *uc); diff --git a/queue-6.14/mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch b/queue-6.14/mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch new file mode 100644 index 0000000000..adbc9bc723 --- /dev/null +++ b/queue-6.14/mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch @@ -0,0 +1,132 @@ +From fefc075182275057ce607effaa3daa9e6e3bdc73 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Tue, 6 May 2025 16:32:07 +0300 +Subject: mm/page_alloc: fix race condition in unaccepted memory handling + +From: Kirill A. Shutemov + +commit fefc075182275057ce607effaa3daa9e6e3bdc73 upstream.
+ +The page allocator tracks the number of zones that have unaccepted memory +using static_branch_inc/dec() and uses that static branch in hot paths to +determine if it needs to deal with unaccepted memory. + +Borislav and Thomas pointed out that the tracking is racy: operations on +static_branch are not serialized against adding/removing unaccepted pages +to/from the zone. + +Sanity checks inside the static_branch machinery detect it: + +WARNING: CPU: 0 PID: 10 at kernel/jump_label.c:276 __static_key_slow_dec_cpuslocked+0x8e/0xa0 + +The comment around the WARN() explains the problem: + + /* + * Warn about the '-1' case though; since that means a + * decrement is concurrent with a first (0->1) increment. IOW + * people are trying to disable something that wasn't yet fully + * enabled. This suggests an ordering problem on the user side. + */ + +The effect of this static_branch optimization is only visible in +microbenchmarks. + +Instead of adding more complexity around it, remove it altogether. + +Link: https://lkml.kernel.org/r/20250506133207.1009676-1-kirill.shutemov@linux.intel.com +Signed-off-by: Kirill A. Shutemov +Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory") +Link: https://lore.kernel.org/all/20250506092445.GBaBnVXXyvnazly6iF@fat_crate.local +Reported-by: Borislav Petkov +Tested-by: Borislav Petkov (AMD) +Reported-by: Thomas Gleixner +Cc: Vlastimil Babka +Cc: Suren Baghdasaryan +Cc: Michal Hocko +Cc: Brendan Jackman +Cc: Johannes Weiner +Cc: [6.5+] +Signed-off-by: Andrew Morton +Signed-off-by: Kirill A. Shutemov +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 23 ----------------------- + 1 file changed, 23 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6951,9 +6951,6 @@ bool has_managed_dma(void) + + #ifdef CONFIG_UNACCEPTED_MEMORY + +-/* Counts number of zones with unaccepted pages.
*/ +-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); +- + static bool lazy_accept = true; + + static int __init accept_memory_parse(char *p) +@@ -6980,11 +6977,7 @@ static bool page_contains_unaccepted(str + static void __accept_page(struct zone *zone, unsigned long *flags, + struct page *page) + { +- bool last; +- + list_del(&page->lru); +- last = list_empty(&zone->unaccepted_pages); +- + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + __ClearPageUnaccepted(page); +@@ -6993,9 +6986,6 @@ static void __accept_page(struct zone *z + accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); + + __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); +- +- if (last) +- static_branch_dec(&zones_with_unaccepted_pages); + } + + void accept_page(struct page *page) +@@ -7032,19 +7022,11 @@ static bool try_to_accept_memory_one(str + return true; + } + +-static inline bool has_unaccepted_memory(void) +-{ +- return static_branch_unlikely(&zones_with_unaccepted_pages); +-} +- + static bool cond_accept_memory(struct zone *zone, unsigned int order) + { + long to_accept, wmark; + bool ret = false; + +- if (!has_unaccepted_memory()) +- return false; +- + if (list_empty(&zone->unaccepted_pages)) + return false; + +@@ -7078,22 +7060,17 @@ static bool __free_unaccepted(struct pag + { + struct zone *zone = page_zone(page); + unsigned long flags; +- bool first = false; + + if (!lazy_accept) + return false; + + spin_lock_irqsave(&zone->lock, flags); +- first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); + account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + __SetPageUnaccepted(page); + spin_unlock_irqrestore(&zone->lock, flags); + +- if (first) +- static_branch_inc(&zones_with_unaccepted_pages); +- + return true; + } + diff --git a/queue-6.14/series b/queue-6.14/series 
index 6407f68f42..c86041c935 100644 --- a/queue-6.14/series +++ b/queue-6.14/series @@ -133,3 +133,11 @@ dmaengine-idxd-add-missing-idxd-cleanup-to-fix-memory-leak-in-remove-call.patch dmaengine-idxd-fix-memory-leak-in-error-handling-path-of-idxd_alloc.patch dmaengine-idxd-fix-memory-leak-in-error-handling-path-of-idxd_pci_probe.patch dmaengine-idxd-refactor-remove-call-with-idxd_cleanup-helper.patch +accel-ivpu-use-workqueue-for-irq-handling.patch +accel-ivpu-dump-only-first-mmu-fault-from-single-context.patch +accel-ivpu-move-parts-of-mmu-event-irq-handling-to-thread-handler.patch +accel-ivpu-fix-missing-mmu-events-from-reserved-ssid.patch +accel-ivpu-fix-missing-mmu-events-if-file_priv-is-unbound.patch +accel-ivpu-flush-pending-jobs-of-device-s-workqueues.patch +drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch +mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch -- 2.47.3