From 7596b4c04e138c55d4284eeded8f1ddeba6b23dd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 20 May 2025 11:10:02 +0200 Subject: [PATCH] 6.12-stable patches added patches: drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch --- ...h-the-gsc-worker-from-the-reset-path.patch | 179 ++++++++++++++++++ ...dition-in-unaccepted-memory-handling.patch | 132 +++++++++++++ queue-6.12/series | 2 + 3 files changed, 313 insertions(+) create mode 100644 queue-6.12/drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch create mode 100644 queue-6.12/mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch diff --git a/queue-6.12/drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch b/queue-6.12/drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch new file mode 100644 index 0000000000..26c09ab076 --- /dev/null +++ b/queue-6.12/drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch @@ -0,0 +1,179 @@ +From 03552d8ac0afcc080c339faa0b726e2c0e9361cb Mon Sep 17 00:00:00 2001 +From: Daniele Ceraolo Spurio +Date: Fri, 2 May 2025 08:51:04 -0700 +Subject: drm/xe/gsc: do not flush the GSC worker from the reset path + +From: Daniele Ceraolo Spurio + +commit 03552d8ac0afcc080c339faa0b726e2c0e9361cb upstream. + +The workqueue used for the reset worker is marked as WQ_MEM_RECLAIM, +while the GSC one isn't (and can't be as we need to do memory +allocations in the gsc worker). Therefore, we can't flush the latter +from the former. + +The reason why we had such a flush was to avoid interrupting either +the GSC FW load or in progress GSC proxy operations. GSC proxy +operations fall into 2 categories: + +1) GSC proxy init: this only happens once immediately after GSC FW load + and does not support being interrupted. The only way to recover from + an interruption of the proxy init is to do an FLR and re-load the GSC. 
+ +2) GSC proxy request: this can happen in response to a request that + the driver sends to the GSC. If this is interrupted, the GSC FW will + timeout and the driver request will be failed, but overall the GSC + will keep working fine. + +Flushing the work allowed us to avoid interruption in both cases (unless +the hang came from the GSC engine itself, in which case we're toast +anyway). However, a failure on a proxy request is tolerable if we're in +a scenario where we're triggering a GT reset (i.e., something is already +gone pretty wrong), so what we really need to avoid is interrupting +the init flow, which we can do by polling on the register that reports +when the proxy init is complete (as that ensures that all the load and +init operations have been completed). + +Note that during suspend we still want to do a flush of the worker to +make sure it completes any operations involving the HW before the power +is cut. + +v2: fix spelling in commit msg, rename waiter function (Julia) + +Fixes: dd0e89e5edc2 ("drm/xe/gsc: GSC FW load") +Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4830 +Signed-off-by: Daniele Ceraolo Spurio +Cc: John Harrison +Cc: Alan Previn +Cc: stable@vger.kernel.org # v6.8+ +Reviewed-by: Julia Filipchuk +Link: https://lore.kernel.org/r/20250502155104.2201469-1-daniele.ceraolospurio@intel.com +(cherry picked from commit 12370bfcc4f0bdf70279ec5b570eb298963422b5) +Signed-off-by: Lucas De Marchi +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/xe/xe_gsc.c | 22 ++++++++++++++++++++++ + drivers/gpu/drm/xe/xe_gsc.h | 1 + + drivers/gpu/drm/xe/xe_gsc_proxy.c | 11 +++++++++++ + drivers/gpu/drm/xe/xe_gsc_proxy.h | 1 + + drivers/gpu/drm/xe/xe_gt.c | 2 +- + drivers/gpu/drm/xe/xe_uc.c | 8 +++++++- + drivers/gpu/drm/xe/xe_uc.h | 1 + + 7 files changed, 44 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/xe/xe_gsc.c ++++ b/drivers/gpu/drm/xe/xe_gsc.c +@@ -564,6 +564,28 @@ void xe_gsc_remove(struct xe_gsc *gsc) + xe_gsc_proxy_remove(gsc); + } + 
++void xe_gsc_stop_prepare(struct xe_gsc *gsc) ++{ ++ struct xe_gt *gt = gsc_to_gt(gsc); ++ int ret; ++ ++ if (!xe_uc_fw_is_loadable(&gsc->fw) || xe_uc_fw_is_in_error_state(&gsc->fw)) ++ return; ++ ++ xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GSC); ++ ++ /* ++ * If the GSC FW load or the proxy init are interrupted, the only way ++ * to recover it is to do an FLR and reload the GSC from scratch. ++ * Therefore, let's wait for the init to complete before stopping ++ * operations. The proxy init is the last step, so we can just wait on ++ * that ++ */ ++ ret = xe_gsc_wait_for_proxy_init_done(gsc); ++ if (ret) ++ xe_gt_err(gt, "failed to wait for GSC init completion before uc stop\n"); ++} ++ + /* + * wa_14015076503: if the GSC FW is loaded, we need to alert it before doing a + * GSC engine reset by writing a notification bit in the GS1 register and then +--- a/drivers/gpu/drm/xe/xe_gsc.h ++++ b/drivers/gpu/drm/xe/xe_gsc.h +@@ -16,6 +16,7 @@ struct xe_hw_engine; + int xe_gsc_init(struct xe_gsc *gsc); + int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc); + void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc); ++void xe_gsc_stop_prepare(struct xe_gsc *gsc); + void xe_gsc_load_start(struct xe_gsc *gsc); + void xe_gsc_remove(struct xe_gsc *gsc); + void xe_gsc_hwe_irq_handler(struct xe_hw_engine *hwe, u16 intr_vec); +--- a/drivers/gpu/drm/xe/xe_gsc_proxy.c ++++ b/drivers/gpu/drm/xe/xe_gsc_proxy.c +@@ -71,6 +71,17 @@ bool xe_gsc_proxy_init_done(struct xe_gs + HECI1_FWSTS1_PROXY_STATE_NORMAL; + } + ++int xe_gsc_wait_for_proxy_init_done(struct xe_gsc *gsc) ++{ ++ struct xe_gt *gt = gsc_to_gt(gsc); ++ ++ /* Proxy init can take up to 500ms, so wait double that for safety */ ++ return xe_mmio_wait32(&gt->mmio, HECI_FWSTS1(MTL_GSC_HECI1_BASE), ++ HECI1_FWSTS1_CURRENT_STATE, ++ HECI1_FWSTS1_PROXY_STATE_NORMAL, ++ USEC_PER_SEC, NULL, false); ++} ++ + static void __gsc_proxy_irq_rmw(struct xe_gsc *gsc, u32 clr, u32 set) + { + struct xe_gt *gt = gsc_to_gt(gsc); +--- 
a/drivers/gpu/drm/xe/xe_gsc_proxy.h ++++ b/drivers/gpu/drm/xe/xe_gsc_proxy.h +@@ -13,6 +13,7 @@ struct xe_gsc; + int xe_gsc_proxy_init(struct xe_gsc *gsc); + bool xe_gsc_proxy_init_done(struct xe_gsc *gsc); + void xe_gsc_proxy_remove(struct xe_gsc *gsc); ++int xe_gsc_wait_for_proxy_init_done(struct xe_gsc *gsc); + int xe_gsc_proxy_start(struct xe_gsc *gsc); + + int xe_gsc_proxy_request_handler(struct xe_gsc *gsc); +--- a/drivers/gpu/drm/xe/xe_gt.c ++++ b/drivers/gpu/drm/xe/xe_gt.c +@@ -828,7 +828,7 @@ void xe_gt_suspend_prepare(struct xe_gt + { + XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + +- xe_uc_stop_prepare(&gt->uc); ++ xe_uc_suspend_prepare(&gt->uc); + + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + } +--- a/drivers/gpu/drm/xe/xe_uc.c ++++ b/drivers/gpu/drm/xe/xe_uc.c +@@ -241,7 +241,7 @@ void xe_uc_gucrc_disable(struct xe_uc *u + + void xe_uc_stop_prepare(struct xe_uc *uc) + { +- xe_gsc_wait_for_worker_completion(&uc->gsc); ++ xe_gsc_stop_prepare(&uc->gsc); + xe_guc_stop_prepare(&uc->guc); + } + +@@ -275,6 +275,12 @@ again: + goto again; + } + ++void xe_uc_suspend_prepare(struct xe_uc *uc) ++{ ++ xe_gsc_wait_for_worker_completion(&uc->gsc); ++ xe_guc_stop_prepare(&uc->guc); ++} ++ + int xe_uc_suspend(struct xe_uc *uc) + { + /* GuC submission not enabled, nothing to do */ +--- a/drivers/gpu/drm/xe/xe_uc.h ++++ b/drivers/gpu/drm/xe/xe_uc.h +@@ -18,6 +18,7 @@ int xe_uc_reset_prepare(struct xe_uc *uc + void xe_uc_stop_prepare(struct xe_uc *uc); + void xe_uc_stop(struct xe_uc *uc); + int xe_uc_start(struct xe_uc *uc); ++void xe_uc_suspend_prepare(struct xe_uc *uc); + int xe_uc_suspend(struct xe_uc *uc); + int xe_uc_sanitize_reset(struct xe_uc *uc); + void xe_uc_remove(struct xe_uc *uc); diff --git a/queue-6.12/mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch b/queue-6.12/mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch new file mode 100644 index 0000000000..80daaeffb8 --- /dev/null +++ 
b/queue-6.12/mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch @@ -0,0 +1,132 @@ +From fefc075182275057ce607effaa3daa9e6e3bdc73 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Tue, 6 May 2025 16:32:07 +0300 +Subject: mm/page_alloc: fix race condition in unaccepted memory handling + +From: Kirill A. Shutemov + +commit fefc075182275057ce607effaa3daa9e6e3bdc73 upstream. + +The page allocator tracks the number of zones that have unaccepted memory +using static_branch_inc/dec() and uses that static branch in hot paths to +determine if it needs to deal with unaccepted memory. + +Borislav and Thomas pointed out that the tracking is racy: operations on +static_branch are not serialized against adding/removing unaccepted pages +to/from the zone. + +Sanity checks inside static_branch machinery detect it: + +WARNING: CPU: 0 PID: 10 at kernel/jump_label.c:276 __static_key_slow_dec_cpuslocked+0x8e/0xa0 + +The comment around the WARN() explains the problem: + + /* + * Warn about the '-1' case though; since that means a + * decrement is concurrent with a first (0->1) increment. IOW + * people are trying to disable something that wasn't yet fully + * enabled. This suggests an ordering problem on the user side. + */ + +The effect of this static_branch optimization is only visible on +microbenchmark. + +Instead of adding more complexity around it, remove it altogether. + +Link: https://lkml.kernel.org/r/20250506133207.1009676-1-kirill.shutemov@linux.intel.com +Signed-off-by: Kirill A. Shutemov +Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory") +Link: https://lore.kernel.org/all/20250506092445.GBaBnVXXyvnazly6iF@fat_crate.local +Reported-by: Borislav Petkov +Tested-by: Borislav Petkov (AMD) +Reported-by: Thomas Gleixner +Cc: Vlastimil Babka +Cc: Suren Baghdasaryan +Cc: Michal Hocko +Cc: Brendan Jackman +Cc: Johannes Weiner +Cc: stable@vger.kernel.org [6.5+] +Signed-off-by: Andrew Morton +Signed-off-by: Kirill A. 
Shutemov +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 23 ----------------------- + 1 file changed, 23 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7041,9 +7041,6 @@ bool has_managed_dma(void) + + #ifdef CONFIG_UNACCEPTED_MEMORY + +-/* Counts number of zones with unaccepted pages. */ +-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); +- + static bool lazy_accept = true; + + static int __init accept_memory_parse(char *p) +@@ -7070,11 +7067,7 @@ static bool page_contains_unaccepted(str + static void __accept_page(struct zone *zone, unsigned long *flags, + struct page *page) + { +- bool last; +- + list_del(&page->lru); +- last = list_empty(&zone->unaccepted_pages); +- + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + __ClearPageUnaccepted(page); +@@ -7083,9 +7076,6 @@ static void __accept_page(struct zone *z + accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); + + __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); +- +- if (last) +- static_branch_dec(&zones_with_unaccepted_pages); + } + + void accept_page(struct page *page) +@@ -7122,19 +7112,11 @@ static bool try_to_accept_memory_one(str + return true; + } + +-static inline bool has_unaccepted_memory(void) +-{ +- return static_branch_unlikely(&zones_with_unaccepted_pages); +-} +- + static bool cond_accept_memory(struct zone *zone, unsigned int order) + { + long to_accept, wmark; + bool ret = false; + +- if (!has_unaccepted_memory()) +- return false; +- + if (list_empty(&zone->unaccepted_pages)) + return false; + +@@ -7168,22 +7150,17 @@ static bool __free_unaccepted(struct pag + { + struct zone *zone = page_zone(page); + unsigned long flags; +- bool first = false; + + if (!lazy_accept) + return false; + + spin_lock_irqsave(&zone->lock, flags); +- first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); + account_freepages(zone, 
MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + __SetPageUnaccepted(page); + spin_unlock_irqrestore(&zone->lock, flags); + +- if (first) +- static_branch_inc(&zones_with_unaccepted_pages); +- + return true; + } + diff --git a/queue-6.12/series b/queue-6.12/series index 44aab8a67d..2a99f82b4f 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -133,3 +133,5 @@ dmaengine-idxd-fix-memory-leak-in-error-handling-path-of-idxd_alloc.patch dmaengine-idxd-fix-memory-leak-in-error-handling-path-of-idxd_pci_probe.patch dmaengine-idxd-refactor-remove-call-with-idxd_cleanup-helper.patch cifs-new-mount-option-for-cifs.upcall-namespace-resolution.patch +drm-xe-gsc-do-not-flush-the-gsc-worker-from-the-reset-path.patch +mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch -- 2.47.3