From: Sasha Levin Date: Tue, 8 Jul 2025 00:37:04 +0000 (-0400) Subject: Fixes for 6.12 X-Git-Tag: v5.15.187~49 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0c49a2484f310f78835666fb3bf324f2094e7c01;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.12 Signed-off-by: Sasha Levin --- diff --git a/queue-6.12/drm-xe-allow-dropping-kunit-dependency-as-built-in.patch b/queue-6.12/drm-xe-allow-dropping-kunit-dependency-as-built-in.patch new file mode 100644 index 0000000000..e2d1af67d9 --- /dev/null +++ b/queue-6.12/drm-xe-allow-dropping-kunit-dependency-as-built-in.patch @@ -0,0 +1,51 @@ +From 75353bdeb7ef9998c389c3cc89ec06a2ba84b5b9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 7 Jul 2025 02:14:17 -0400 +Subject: drm/xe: Allow dropping kunit dependency as built-in +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Harry Austen + +[ Upstream commit aa18d5769fcafe645a3ba01a9a69dde4f8dc8cc3 ] + +Fix Kconfig symbol dependency on KUNIT, which isn't actually required +for XE to be built-in. However, if KUNIT is enabled, it must be built-in +too. + +Fixes: 08987a8b6820 ("drm/xe: Fix build with KUNIT=m") +Cc: Lucas De Marchi +Cc: Thomas Hellström +Cc: Jani Nikula +Cc: Maarten Lankhorst +Signed-off-by: Harry Austen +Reviewed-by: Lucas De Marchi +Acked-by: Randy Dunlap +Tested-by: Randy Dunlap +Link: https://lore.kernel.org/r/20250627-xe-kunit-v2-2-756fe5cd56cf@intel.com +Signed-off-by: Lucas De Marchi +(cherry picked from commit a559434880b320b83733d739733250815aecf1b0) +Signed-off-by: Lucas De Marchi +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/xe/Kconfig | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig +index 7bbe46a98ff1f..93e742c1f21e7 100644 +--- a/drivers/gpu/drm/xe/Kconfig ++++ b/drivers/gpu/drm/xe/Kconfig +@@ -1,7 +1,8 @@ + # SPDX-License-Identifier: GPL-2.0-only + config DRM_XE + tristate "Intel Xe Graphics" +- depends on DRM && PCI && MMU && (m || (y && KUNIT=y)) ++ depends on DRM && PCI && MMU ++ depends on KUNIT || !KUNIT + select INTERVAL_TREE + # we need shmfs for the swappable backing store, and in particular + # the shmem_readpage() which depends upon tmpfs +-- +2.39.5 + diff --git a/queue-6.12/drm-xe-bmg-update-wa_22019338487.patch b/queue-6.12/drm-xe-bmg-update-wa_22019338487.patch new file mode 100644 index 0000000000..2ff6942717 --- /dev/null +++ b/queue-6.12/drm-xe-bmg-update-wa_22019338487.patch @@ -0,0 +1,326 @@ +From d043d53c34c64bb9fac3f2278c081f7b3b83764d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 18 Jun 2025 11:50:01 -0700 +Subject: drm/xe/bmg: Update Wa_22019338487 + +From: Vinay Belgaumkar + +[ Upstream commit 84c0b4a00610afbde650fdb8ad6db0424f7b2cc3 ] + +Limit GT max frequency to 2600MHz and wait for frequency to reduce +before proceeding with a transient flush. This is really only needed for +the transient flush: if L2 flush is needed due to 16023588340 then +there's no need to do this additional wait since we are already using +the bigger hammer. 
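+
+As a rough sketch of the resulting flow (simplified from the
+xe_device_td_flush() hunk below; "root_gt" here stands in for
+xe_root_mmio_gt(xe), and the per-GT loop and error handling are
+omitted):
+
+	if (XE_WA(root_gt, 16023588340)) {
+		/* L2 flush is the bigger hammer; no frequency cap needed */
+		xe_device_l2_flush(xe);
+	} else {
+		/* cap max GT freq to 2600 MHz, wait for actual freq to drop */
+		xe_guc_pc_apply_flush_freq_limit(&root_gt->uc.guc.pc);
+		/* ... issue TRANSIENT_FLUSH_REQUEST on each graphics GT ... */
+		xe_guc_pc_remove_flush_freq_limit(&root_gt->uc.guc.pc);
+	}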
+ +v2: Use generic names, ensure user set max frequency requests wait +for flush to complete (Rodrigo) +v3: + - User requests wait via wait_var_event_timeout (Lucas) + - Close races on flush + user requests (Lucas) + - Fix xe_guc_pc_remove_flush_freq_limit() being called on last gt + rather than root gt (Lucas) +v4: + - Only apply the freq reducing part if a TDF is needed: L2 flush trumps + the need for waiting a lower frequency + +Fixes: aaa08078e725 ("drm/xe/bmg: Apply Wa_22019338487") +Reviewed-by: Rodrigo Vivi +Signed-off-by: Vinay Belgaumkar +Link: https://lore.kernel.org/r/20250618-wa-22019338487-v5-4-b888388477f2@intel.com +Signed-off-by: Lucas De Marchi +(cherry picked from commit deea6a7d6d803d6bb874a3e6f1b312e560e6c6df) +Signed-off-by: Lucas De Marchi +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/xe/xe_device.c | 55 +++++----- + drivers/gpu/drm/xe/xe_guc_pc.c | 144 +++++++++++++++++++++++++++ + drivers/gpu/drm/xe/xe_guc_pc.h | 2 + + drivers/gpu/drm/xe/xe_guc_pc_types.h | 2 + + 4 files changed, 179 insertions(+), 24 deletions(-) + +diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c +index 0c3db53b93d8a..82da51a6616a1 100644 +--- a/drivers/gpu/drm/xe/xe_device.c ++++ b/drivers/gpu/drm/xe/xe_device.c +@@ -37,6 +37,7 @@ + #include "xe_gt_printk.h" + #include "xe_gt_sriov_vf.h" + #include "xe_guc.h" ++#include "xe_guc_pc.h" + #include "xe_hw_engine_group.h" + #include "xe_hwmon.h" + #include "xe_irq.h" +@@ -871,31 +872,37 @@ void xe_device_td_flush(struct xe_device *xe) + if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20) + return; + +- if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) { ++ gt = xe_root_mmio_gt(xe); ++ if (XE_WA(gt, 16023588340)) { ++ /* A transient flush is not sufficient: flush the L2 */ + xe_device_l2_flush(xe); +- return; +- } +- +- for_each_gt(gt, xe, id) { +- if (xe_gt_is_media_type(gt)) +- continue; +- +- if (xe_force_wake_get(gt_to_fw(gt), XE_FW_GT)) +- return; +- +- xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST); +- /* +- * FIXME: We can likely do better here with our choice of +- * timeout. Currently we just assume the worst case, i.e. 150us, +- * which is believed to be sufficient to cover the worst case +- * scenario on current platforms if all cache entries are +- * transient and need to be flushed.. +- */ +- if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0, +- 150, NULL, false)) +- xe_gt_err_once(gt, "TD flush timeout\n"); +- +- xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); ++ } else { ++ xe_guc_pc_apply_flush_freq_limit(>->uc.guc.pc); ++ ++ /* Execute TDF flush on all graphics GTs */ ++ for_each_gt(gt, xe, id) { ++ if (xe_gt_is_media_type(gt)) ++ continue; ++ ++ if (xe_force_wake_get(gt_to_fw(gt), XE_FW_GT)) ++ return; ++ ++ xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST); ++ /* ++ * FIXME: We can likely do better here with our choice of ++ * timeout. Currently we just assume the worst case, i.e. 150us, ++ * which is believed to be sufficient to cover the worst case ++ * scenario on current platforms if all cache entries are ++ * transient and need to be flushed.. 
++ */ ++ if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0, ++ 150, NULL, false)) ++ xe_gt_err_once(gt, "TD flush timeout\n"); ++ ++ xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); ++ } ++ ++ xe_guc_pc_remove_flush_freq_limit(&xe_root_mmio_gt(xe)->uc.guc.pc); + } + } + +diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c +index f978da8be35c2..af02803c145bf 100644 +--- a/drivers/gpu/drm/xe/xe_guc_pc.c ++++ b/drivers/gpu/drm/xe/xe_guc_pc.c +@@ -6,6 +6,9 @@ + #include "xe_guc_pc.h" + + #include ++#include ++#include ++#include + + #include + #include +@@ -47,6 +50,12 @@ + + #define LNL_MERT_FREQ_CAP 800 + #define BMG_MERT_FREQ_CAP 2133 ++#define BMG_MIN_FREQ 1200 ++#define BMG_MERT_FLUSH_FREQ_CAP 2600 ++ ++#define SLPC_RESET_TIMEOUT_MS 5 /* roughly 5ms, but no need for precision */ ++#define SLPC_RESET_EXTENDED_TIMEOUT_MS 1000 /* To be used only at pc_start */ ++#define SLPC_ACT_FREQ_TIMEOUT_MS 100 + + /** + * DOC: GuC Power Conservation (PC) +@@ -133,6 +142,36 @@ static int wait_for_pc_state(struct xe_guc_pc *pc, + return -ETIMEDOUT; + } + ++static int wait_for_flush_complete(struct xe_guc_pc *pc) ++{ ++ const unsigned long timeout = msecs_to_jiffies(30); ++ ++ if (!wait_var_event_timeout(&pc->flush_freq_limit, ++ !atomic_read(&pc->flush_freq_limit), ++ timeout)) ++ return -ETIMEDOUT; ++ ++ return 0; ++} ++ ++static int wait_for_act_freq_limit(struct xe_guc_pc *pc, u32 freq) ++{ ++ int timeout_us = SLPC_ACT_FREQ_TIMEOUT_MS * USEC_PER_MSEC; ++ int slept, wait = 10; ++ ++ for (slept = 0; slept < timeout_us;) { ++ if (xe_guc_pc_get_act_freq(pc) <= freq) ++ return 0; ++ ++ usleep_range(wait, wait << 1); ++ slept += wait; ++ wait <<= 1; ++ if (slept + wait > timeout_us) ++ wait = timeout_us - slept; ++ } ++ ++ return -ETIMEDOUT; ++} + static int pc_action_reset(struct xe_guc_pc *pc) + { + struct xe_guc_ct *ct = pc_to_ct(pc); +@@ -584,6 +623,11 @@ int xe_guc_pc_set_max_freq(struct xe_guc_pc *pc, u32 freq) + { + int ret; + ++ if (XE_WA(pc_to_gt(pc), 22019338487)) { ++ if (wait_for_flush_complete(pc) != 0) ++ return -EAGAIN; ++ } ++ + mutex_lock(&pc->freq_lock); + if (!pc->freq_ready) { + /* Might be in the middle of a gt reset */ +@@ -793,6 +837,106 @@ static int pc_adjust_requested_freq(struct xe_guc_pc *pc) + return ret; + } + ++static bool needs_flush_freq_limit(struct xe_guc_pc *pc) ++{ ++ struct xe_gt *gt = pc_to_gt(pc); ++ ++ return XE_WA(gt, 22019338487) && ++ pc->rp0_freq > BMG_MERT_FLUSH_FREQ_CAP; ++} ++ ++/** ++ * xe_guc_pc_apply_flush_freq_limit() - Limit max GT freq during L2 flush ++ * @pc: the xe_guc_pc object ++ * ++ * As per the WA, reduce max GT frequency during L2 cache flush ++ */ ++void xe_guc_pc_apply_flush_freq_limit(struct xe_guc_pc *pc) ++{ ++ struct xe_gt *gt = pc_to_gt(pc); ++ u32 max_freq; ++ int ret; ++ ++ if (!needs_flush_freq_limit(pc)) ++ return; ++ ++ mutex_lock(&pc->freq_lock); ++ ++ if (!pc->freq_ready) { ++ mutex_unlock(&pc->freq_lock); ++ return; ++ } ++ ++ ret = pc_action_query_task_state(pc); ++ if (ret) { ++ mutex_unlock(&pc->freq_lock); ++ return; ++ } ++ ++ max_freq = pc_get_max_freq(pc); ++ if (max_freq > BMG_MERT_FLUSH_FREQ_CAP) { ++ ret = pc_set_max_freq(pc, BMG_MERT_FLUSH_FREQ_CAP); ++ if (ret) { ++ xe_gt_err_once(gt, "Failed to cap max freq on flush to %u, %pe\n", ++ BMG_MERT_FLUSH_FREQ_CAP, ERR_PTR(ret)); ++ mutex_unlock(&pc->freq_lock); ++ return; ++ } ++ ++ atomic_set(&pc->flush_freq_limit, 1); ++ ++ /* ++ * If user has previously changed max freq, stash that value to ++ * restore later, otherwise use the current 
max. New user ++ * requests wait on flush. ++ */ ++ if (pc->user_requested_max != 0) ++ pc->stashed_max_freq = pc->user_requested_max; ++ else ++ pc->stashed_max_freq = max_freq; ++ } ++ ++ mutex_unlock(&pc->freq_lock); ++ ++ /* ++ * Wait for actual freq to go below the flush cap: even if the previous ++ * max was below cap, the current one might still be above it ++ */ ++ ret = wait_for_act_freq_limit(pc, BMG_MERT_FLUSH_FREQ_CAP); ++ if (ret) ++ xe_gt_err_once(gt, "Actual freq did not reduce to %u, %pe\n", ++ BMG_MERT_FLUSH_FREQ_CAP, ERR_PTR(ret)); ++} ++ ++/** ++ * xe_guc_pc_remove_flush_freq_limit() - Remove max GT freq limit after L2 flush completes. ++ * @pc: the xe_guc_pc object ++ * ++ * Retrieve the previous GT max frequency value. ++ */ ++void xe_guc_pc_remove_flush_freq_limit(struct xe_guc_pc *pc) ++{ ++ struct xe_gt *gt = pc_to_gt(pc); ++ int ret = 0; ++ ++ if (!needs_flush_freq_limit(pc)) ++ return; ++ ++ if (!atomic_read(&pc->flush_freq_limit)) ++ return; ++ ++ mutex_lock(&pc->freq_lock); ++ ++ ret = pc_set_max_freq(>->uc.guc.pc, pc->stashed_max_freq); ++ if (ret) ++ xe_gt_err_once(gt, "Failed to restore max freq %u:%d", ++ pc->stashed_max_freq, ret); ++ ++ atomic_set(&pc->flush_freq_limit, 0); ++ mutex_unlock(&pc->freq_lock); ++ wake_up_var(&pc->flush_freq_limit); ++} ++ + static int pc_set_mert_freq_cap(struct xe_guc_pc *pc) + { + int ret = 0; +diff --git a/drivers/gpu/drm/xe/xe_guc_pc.h b/drivers/gpu/drm/xe/xe_guc_pc.h +index efda432fadfc8..7154b3aab0d84 100644 +--- a/drivers/gpu/drm/xe/xe_guc_pc.h ++++ b/drivers/gpu/drm/xe/xe_guc_pc.h +@@ -34,5 +34,7 @@ u64 xe_guc_pc_mc6_residency(struct xe_guc_pc *pc); + void xe_guc_pc_init_early(struct xe_guc_pc *pc); + int xe_guc_pc_restore_stashed_freq(struct xe_guc_pc *pc); + void xe_guc_pc_raise_unslice(struct xe_guc_pc *pc); ++void xe_guc_pc_apply_flush_freq_limit(struct xe_guc_pc *pc); ++void xe_guc_pc_remove_flush_freq_limit(struct xe_guc_pc *pc); + + #endif /* _XE_GUC_PC_H_ */ +diff --git a/drivers/gpu/drm/xe/xe_guc_pc_types.h b/drivers/gpu/drm/xe/xe_guc_pc_types.h +index 13810be015db5..5b86d91296cb9 100644 +--- a/drivers/gpu/drm/xe/xe_guc_pc_types.h ++++ b/drivers/gpu/drm/xe/xe_guc_pc_types.h +@@ -15,6 +15,8 @@ + struct xe_guc_pc { + /** @bo: GGTT buffer object that is shared with GuC PC */ + struct xe_bo *bo; ++ /** @flush_freq_limit: 1 when max freq changes are limited by driver */ ++ atomic_t flush_freq_limit; + /** @rp0_freq: HW RP0 frequency - The Maximum one */ + u32 rp0_freq; + /** @rpe_freq: HW RPe frequency - The Efficient one */ +-- +2.39.5 + diff --git a/queue-6.12/ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch b/queue-6.12/ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch new file mode 100644 index 0000000000..d86d56f02f --- /dev/null +++ b/queue-6.12/ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch @@ -0,0 +1,268 @@ +From 60a8ab6c686344fa2811dbcd1d1e5d9e7f13ee24 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 16 Jun 2025 11:14:09 +0300 +Subject: IB/mlx5: Fix potential deadlock in MR deregistration +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Or Har-Toov + +[ Upstream commit 2ed25aa7f7711f508b6120e336f05cd9d49943c0 ] + +The issue arises when kzalloc() is invoked while holding umem_mutex or +any other lock acquired under umem_mutex. This is problematic because +kzalloc() can trigger fs_reclaim_aqcuire(), which may, in turn, invoke +mmu_notifier_invalidate_range_start(). 
This function can lead to +mlx5_ib_invalidate_range(), which attempts to acquire umem_mutex again, +resulting in a deadlock. + +The problematic flow: + CPU0 | CPU1 +---------------------------------------|------------------------------------------------ +mlx5_ib_dereg_mr() | + → revoke_mr() | + → mutex_lock(&umem_odp->umem_mutex) | + | mlx5_mkey_cache_init() + | → mutex_lock(&dev->cache.rb_lock) + | → mlx5r_cache_create_ent_locked() + | → kzalloc(GFP_KERNEL) + | → fs_reclaim() + | → mmu_notifier_invalidate_range_start() + | → mlx5_ib_invalidate_range() + | → mutex_lock(&umem_odp->umem_mutex) + → cache_ent_find_and_store() | + → mutex_lock(&dev->cache.rb_lock) | + +Additionally, when kzalloc() is called from within +cache_ent_find_and_store(), we encounter the same deadlock due to +re-acquisition of umem_mutex. + +Solve by releasing umem_mutex in dereg_mr() after umr_revoke_mr() +and before acquiring rb_lock. This ensures that we don't hold +umem_mutex while performing memory allocations that could trigger +the reclaim path. + +This change prevents the deadlock by ensuring proper lock ordering and +avoiding holding locks during memory allocation operations that could +trigger the reclaim path. + +The following lockdep warning demonstrates the deadlock: + + python3/20557 is trying to acquire lock: + ffff888387542128 (&umem_odp->umem_mutex){+.+.}-{4:4}, at: + mlx5_ib_invalidate_range+0x5b/0x550 [mlx5_ib] + + but task is already holding lock: + ffffffff82f6b840 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: + unmap_vmas+0x7b/0x1a0 + + which lock already depends on the new lock. + + the existing dependency chain (in reverse order) is: + + -> #3 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}: + fs_reclaim_acquire+0x60/0xd0 + mem_cgroup_css_alloc+0x6f/0x9b0 + cgroup_init_subsys+0xa4/0x240 + cgroup_init+0x1c8/0x510 + start_kernel+0x747/0x760 + x86_64_start_reservations+0x25/0x30 + x86_64_start_kernel+0x73/0x80 + common_startup_64+0x129/0x138 + + -> #2 (fs_reclaim){+.+.}-{0:0}: + fs_reclaim_acquire+0x91/0xd0 + __kmalloc_cache_noprof+0x4d/0x4c0 + mlx5r_cache_create_ent_locked+0x75/0x620 [mlx5_ib] + mlx5_mkey_cache_init+0x186/0x360 [mlx5_ib] + mlx5_ib_stage_post_ib_reg_umr_init+0x3c/0x60 [mlx5_ib] + __mlx5_ib_add+0x4b/0x190 [mlx5_ib] + mlx5r_probe+0xd9/0x320 [mlx5_ib] + auxiliary_bus_probe+0x42/0x70 + really_probe+0xdb/0x360 + __driver_probe_device+0x8f/0x130 + driver_probe_device+0x1f/0xb0 + __driver_attach+0xd4/0x1f0 + bus_for_each_dev+0x79/0xd0 + bus_add_driver+0xf0/0x200 + driver_register+0x6e/0xc0 + __auxiliary_driver_register+0x6a/0xc0 + do_one_initcall+0x5e/0x390 + do_init_module+0x88/0x240 + init_module_from_file+0x85/0xc0 + idempotent_init_module+0x104/0x300 + __x64_sys_finit_module+0x68/0xc0 + do_syscall_64+0x6d/0x140 + entry_SYSCALL_64_after_hwframe+0x4b/0x53 + + -> #1 (&dev->cache.rb_lock){+.+.}-{4:4}: + __mutex_lock+0x98/0xf10 + __mlx5_ib_dereg_mr+0x6f2/0x890 [mlx5_ib] + mlx5_ib_dereg_mr+0x21/0x110 [mlx5_ib] + ib_dereg_mr_user+0x85/0x1f0 [ib_core] + uverbs_free_mr+0x19/0x30 [ib_uverbs] + destroy_hw_idr_uobject+0x21/0x80 [ib_uverbs] + uverbs_destroy_uobject+0x60/0x3d0 [ib_uverbs] + uobj_destroy+0x57/0xa0 [ib_uverbs] + ib_uverbs_cmd_verbs+0x4d5/0x1210 [ib_uverbs] + ib_uverbs_ioctl+0x129/0x230 [ib_uverbs] + __x64_sys_ioctl+0x596/0xaa0 + do_syscall_64+0x6d/0x140 + entry_SYSCALL_64_after_hwframe+0x4b/0x53 + + -> #0 (&umem_odp->umem_mutex){+.+.}-{4:4}: + __lock_acquire+0x1826/0x2f00 + lock_acquire+0xd3/0x2e0 + __mutex_lock+0x98/0xf10 + mlx5_ib_invalidate_range+0x5b/0x550 [mlx5_ib] + 
__mmu_notifier_invalidate_range_start+0x18e/0x1f0 + unmap_vmas+0x182/0x1a0 + exit_mmap+0xf3/0x4a0 + mmput+0x3a/0x100 + do_exit+0x2b9/0xa90 + do_group_exit+0x32/0xa0 + get_signal+0xc32/0xcb0 + arch_do_signal_or_restart+0x29/0x1d0 + syscall_exit_to_user_mode+0x105/0x1d0 + do_syscall_64+0x79/0x140 + entry_SYSCALL_64_after_hwframe+0x4b/0x53 + + Chain exists of: + &dev->cache.rb_lock --> mmu_notifier_invalidate_range_start --> + &umem_odp->umem_mutex + + Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(&umem_odp->umem_mutex); + lock(mmu_notifier_invalidate_range_start); + lock(&umem_odp->umem_mutex); + lock(&dev->cache.rb_lock); + + *** DEADLOCK *** + +Fixes: abb604a1a9c8 ("RDMA/mlx5: Fix a race for an ODP MR which leads to CQE with error") +Signed-off-by: Or Har-Toov +Reviewed-by: Michael Guralnik +Link: https://patch.msgid.link/3c8f225a8a9fade647d19b014df1172544643e4a.1750061612.git.leon@kernel.org +Signed-off-by: Leon Romanovsky +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mr.c | 61 +++++++++++++++++++++++++-------- + 1 file changed, 47 insertions(+), 14 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c +index 830a15b66c120..726b81b6330c6 100644 +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -2028,23 +2028,50 @@ void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev) + } + } + +-static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) ++static int mlx5_umr_revoke_mr_with_lock(struct mlx5_ib_mr *mr) + { +- struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); +- struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; +- bool is_odp = is_odp_mr(mr); + bool is_odp_dma_buf = is_dmabuf_mr(mr) && +- !to_ib_umem_dmabuf(mr->umem)->pinned; +- bool from_cache = !!ent; +- int ret = 0; ++ !to_ib_umem_dmabuf(mr->umem)->pinned; ++ bool is_odp = is_odp_mr(mr); ++ int ret; + + if (is_odp) + mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); + + if (is_odp_dma_buf) +- dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL); ++ dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, ++ NULL); ++ ++ ret = mlx5r_umr_revoke_mr(mr); ++ ++ if (is_odp) { ++ if (!ret) ++ to_ib_umem_odp(mr->umem)->private = NULL; ++ mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex); ++ } ++ ++ if (is_odp_dma_buf) { ++ if (!ret) ++ to_ib_umem_dmabuf(mr->umem)->private = NULL; ++ dma_resv_unlock( ++ to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv); ++ } + +- if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { ++ return ret; ++} ++ ++static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr) ++{ ++ bool is_odp_dma_buf = is_dmabuf_mr(mr) && ++ !to_ib_umem_dmabuf(mr->umem)->pinned; ++ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); ++ struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; ++ bool is_odp = is_odp_mr(mr); ++ bool from_cache = !!ent; ++ int ret; ++ ++ if (mr->mmkey.cacheable && !mlx5_umr_revoke_mr_with_lock(mr) && ++ !cache_ent_find_and_store(dev, mr)) { + ent = mr->mmkey.cache_ent; + /* upon storing to a clean temp entry - schedule its cleanup */ + spin_lock_irq(&ent->mkeys_queue.lock); +@@ -2056,7 +2083,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) + ent->tmp_cleanup_scheduled = true; + } + spin_unlock_irq(&ent->mkeys_queue.lock); +- goto out; ++ return 0; + } + + if (ent) { +@@ -2065,8 +2092,14 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) + mr->mmkey.cache_ent = NULL; + spin_unlock_irq(&ent->mkeys_queue.lock); + } ++ ++ if (is_odp) ++ 
mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); ++ ++ if (is_odp_dma_buf) ++ dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, ++ NULL); + ret = destroy_mkey(dev, mr); +-out: + if (is_odp) { + if (!ret) + to_ib_umem_odp(mr->umem)->private = NULL; +@@ -2076,9 +2109,9 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) + if (is_odp_dma_buf) { + if (!ret) + to_ib_umem_dmabuf(mr->umem)->private = NULL; +- dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv); ++ dma_resv_unlock( ++ to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv); + } +- + return ret; + } + +@@ -2127,7 +2160,7 @@ static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) + } + + /* Stop DMA */ +- rc = mlx5_revoke_mr(mr); ++ rc = mlx5r_handle_mkey_cleanup(mr); + if (rc) + return rc; + +-- +2.39.5 + diff --git a/queue-6.12/nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch b/queue-6.12/nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch new file mode 100644 index 0000000000..752932aa30 --- /dev/null +++ b/queue-6.12/nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch @@ -0,0 +1,250 @@ +From 44cf8d0200904f04abc0499e99bf8bceee19bcb1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 19 Jun 2025 15:16:11 -0400 +Subject: NFSv4/flexfiles: Fix handling of NFS level errors in I/O + +From: Trond Myklebust + +[ Upstream commit 38074de35b015df5623f524d6f2b49a0cd395c40 ] + +Allow the flexfiles error handling to recognise NFS level errors (as +opposed to RPC level errors) and handle them separately. The main +motivator is the NFSERR_PERM errors that get returned if the NFS client +connects to the data server through a port number that is lower than +1024. In that case, the client should disconnect and retry a READ on a +different data server, or it should retry a WRITE after reconnecting. 
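+
+In outline, the error handlers below now dispatch on the NFS-level
+operation status first and only then fall back to the RPC-level task
+status. A schematic of the v3 variant (simplified: the tk_xprt NULL
+check is dropped and "inode" abbreviates lseg->pls_layout->plh_inode;
+the complete switch statements are in the diff):
+
+	switch (op_status) {		/* NFS status returned by the DS */
+	case NFSERR_PERM:
+		/* e.g. DS rejects I/O from a non-privileged source port */
+		xprt_force_disconnect(task->tk_xprt);
+		goto out_retry;		/* reconnect, then re-drive the I/O */
+	case NFSERR_JUKEBOX:
+		nfs_inc_stats(inode, NFSIOS_DELAY);
+		goto out_retry;
+	case NFSERR_STALE:
+	case NFSERR_BADHANDLE:
+		goto out_reset_to_pnfs;	/* reset and retry through pNFS */
+	default:
+		break;
+	}
+	switch (task->tk_status) {	/* RPC/transport-level errors */
+	case -ECONNREFUSED:
+	case -EHOSTDOWN:
+		/* mark the deviceid unavailable, reset to pNFS or MDS */
+		break;
+	}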
+ +Reviewed-by: Tigran Mkrtchyan +Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") +Signed-off-by: Trond Myklebust +Signed-off-by: Anna Schumaker +Signed-off-by: Sasha Levin +--- + fs/nfs/flexfilelayout/flexfilelayout.c | 121 ++++++++++++++++++------- + 1 file changed, 87 insertions(+), 34 deletions(-) + +diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c +index 8f7ea4076653d..bf96f7a8900c1 100644 +--- a/fs/nfs/flexfilelayout/flexfilelayout.c ++++ b/fs/nfs/flexfilelayout/flexfilelayout.c +@@ -1104,6 +1104,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) + } + + static int ff_layout_async_handle_error_v4(struct rpc_task *task, ++ u32 op_status, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg, +@@ -1114,32 +1115,42 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; + +- switch (task->tk_status) { +- case -NFS4ERR_BADSESSION: +- case -NFS4ERR_BADSLOT: +- case -NFS4ERR_BAD_HIGH_SLOT: +- case -NFS4ERR_DEADSESSION: +- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: +- case -NFS4ERR_SEQ_FALSE_RETRY: +- case -NFS4ERR_SEQ_MISORDERED: ++ switch (op_status) { ++ case NFS4_OK: ++ case NFS4ERR_NXIO: ++ break; ++ case NFSERR_PERM: ++ if (!task->tk_xprt) ++ break; ++ xprt_force_disconnect(task->tk_xprt); ++ goto out_retry; ++ case NFS4ERR_BADSESSION: ++ case NFS4ERR_BADSLOT: ++ case NFS4ERR_BAD_HIGH_SLOT: ++ case NFS4ERR_DEADSESSION: ++ case NFS4ERR_CONN_NOT_BOUND_TO_SESSION: ++ case NFS4ERR_SEQ_FALSE_RETRY: ++ case NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); +- break; +- case -NFS4ERR_DELAY: +- case -NFS4ERR_GRACE: ++ goto out_retry; ++ case NFS4ERR_DELAY: ++ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); ++ fallthrough; ++ case NFS4ERR_GRACE: + rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); +- break; +- case -NFS4ERR_RETRY_UNCACHED_REP: +- break; ++ goto out_retry; ++ case NFS4ERR_RETRY_UNCACHED_REP: ++ goto out_retry; + /* Invalidate Layout errors */ +- case -NFS4ERR_PNFS_NO_LAYOUT: +- case -ESTALE: /* mapped NFS4ERR_STALE */ +- case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ +- case -EISDIR: /* mapped NFS4ERR_ISDIR */ +- case -NFS4ERR_FHEXPIRED: +- case -NFS4ERR_WRONG_TYPE: ++ case NFS4ERR_PNFS_NO_LAYOUT: ++ case NFS4ERR_STALE: ++ case NFS4ERR_BADHANDLE: ++ case NFS4ERR_ISDIR: ++ case NFS4ERR_FHEXPIRED: ++ case NFS4ERR_WRONG_TYPE: + dprintk("%s Invalid layout error %d\n", __func__, + task->tk_status); + /* +@@ -1152,6 +1163,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, + pnfs_destroy_layout(NFS_I(inode)); + rpc_wake_up(&tbl->slot_tbl_waitq); + goto reset; ++ default: ++ break; ++ } ++ ++ switch (task->tk_status) { + /* RPC connection errors */ + case -ECONNREFUSED: + case -EHOSTDOWN: +@@ -1167,26 +1183,56 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, + nfs4_delete_deviceid(devid->ld, devid->nfs_client, + &devid->deviceid); + rpc_wake_up(&tbl->slot_tbl_waitq); +- fallthrough; ++ break; + default: +- if (ff_layout_avoid_mds_available_ds(lseg)) +- return -NFS4ERR_RESET_TO_PNFS; +-reset: +- dprintk("%s Retry through MDS. 
Error %d\n", __func__, +- task->tk_status); +- return -NFS4ERR_RESET_TO_MDS; ++ break; + } ++ ++ if (ff_layout_avoid_mds_available_ds(lseg)) ++ return -NFS4ERR_RESET_TO_PNFS; ++reset: ++ dprintk("%s Retry through MDS. Error %d\n", __func__, ++ task->tk_status); ++ return -NFS4ERR_RESET_TO_MDS; ++ ++out_retry: + task->tk_status = 0; + return -EAGAIN; + } + + /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ + static int ff_layout_async_handle_error_v3(struct rpc_task *task, ++ u32 op_status, ++ struct nfs_client *clp, + struct pnfs_layout_segment *lseg, + u32 idx) + { + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + ++ switch (op_status) { ++ case NFS_OK: ++ case NFSERR_NXIO: ++ break; ++ case NFSERR_PERM: ++ if (!task->tk_xprt) ++ break; ++ xprt_force_disconnect(task->tk_xprt); ++ goto out_retry; ++ case NFSERR_ACCES: ++ case NFSERR_BADHANDLE: ++ case NFSERR_FBIG: ++ case NFSERR_IO: ++ case NFSERR_NOSPC: ++ case NFSERR_ROFS: ++ case NFSERR_STALE: ++ goto out_reset_to_pnfs; ++ case NFSERR_JUKEBOX: ++ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); ++ goto out_retry; ++ default: ++ break; ++ } ++ + switch (task->tk_status) { + /* File access problems. Don't mark the device as unavailable */ + case -EACCES: +@@ -1205,6 +1251,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, + nfs4_delete_deviceid(devid->ld, devid->nfs_client, + &devid->deviceid); + } ++out_reset_to_pnfs: + /* FIXME: Need to prevent infinite looping here. */ + return -NFS4ERR_RESET_TO_PNFS; + out_retry: +@@ -1215,6 +1262,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, + } + + static int ff_layout_async_handle_error(struct rpc_task *task, ++ u32 op_status, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg, +@@ -1233,10 +1281,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task, + + switch (vers) { + case 3: +- return ff_layout_async_handle_error_v3(task, lseg, idx); +- case 4: +- return ff_layout_async_handle_error_v4(task, state, clp, ++ return ff_layout_async_handle_error_v3(task, op_status, clp, + lseg, idx); ++ case 4: ++ return ff_layout_async_handle_error_v4(task, op_status, state, ++ clp, lseg, idx); + default: + /* should never happen */ + WARN_ON_ONCE(1); +@@ -1289,6 +1338,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, + switch (status) { + case NFS4ERR_DELAY: + case NFS4ERR_GRACE: ++ case NFS4ERR_PERM: + break; + case NFS4ERR_NXIO: + ff_layout_mark_ds_unreachable(lseg, idx); +@@ -1321,7 +1371,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, + trace_ff_layout_read_error(hdr); + } + +- err = ff_layout_async_handle_error(task, hdr->args.context->state, ++ err = ff_layout_async_handle_error(task, hdr->res.op_status, ++ hdr->args.context->state, + hdr->ds_clp, hdr->lseg, + hdr->pgio_mirror_idx); + +@@ -1491,7 +1542,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task, + trace_ff_layout_write_error(hdr); + } + +- err = ff_layout_async_handle_error(task, hdr->args.context->state, ++ err = ff_layout_async_handle_error(task, hdr->res.op_status, ++ hdr->args.context->state, + hdr->ds_clp, hdr->lseg, + hdr->pgio_mirror_idx); + +@@ -1537,8 +1589,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, + trace_ff_layout_commit_error(data); + } + +- err = ff_layout_async_handle_error(task, NULL, data->ds_clp, +- data->lseg, data->ds_commit_index); ++ err = ff_layout_async_handle_error(task, data->res.op_status, ++ NULL, 
data->ds_clp, data->lseg, ++ data->ds_commit_index); + + trace_nfs4_pnfs_commit_ds(data, err); + switch (err) { +-- +2.39.5 + diff --git a/queue-6.12/rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch b/queue-6.12/rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch new file mode 100644 index 0000000000..384bdd7557 --- /dev/null +++ b/queue-6.12/rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch @@ -0,0 +1,67 @@ +From bdc3aec857cebfd1c5f39e9e20f57533815bc9b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 13 Mar 2025 16:29:49 +0200 +Subject: RDMA/mlx5: Fix cache entry update on dereg error + +From: Michael Guralnik + +[ Upstream commit 24d693cf6c89d216a68634d44fa93e4400775d94 ] + +Fix double decrement of 'in_use' counter on push_mkey_locked() failure +while deregistering an MR. +If we fail to return an mkey to the cache in cache_ent_find_and_store() +it'll update the 'in_use' counter. Its caller, revoke_mr(), also updates +it, thus having double decrement. + +Wrong value of 'in_use' counter will be exposed through debugfs and can +also cause wrong resizing of the cache when users try to set cache +entry size using the 'size' debugfs. + +To address this issue, the 'in_use' counter is now decremented within +mlx5_revoke_mr() also after a successful call to +cache_ent_find_and_store() and not within cache_ent_find_and_store(). +Other success or failure flows remains unchanged where it was also +decremented. + +Fixes: 8c1185fef68c ("RDMA/mlx5: Change check for cacheable mkeys") +Signed-off-by: Michael Guralnik +Reviewed-by: Yishai Hadas +Link: https://patch.msgid.link/97e979dff636f232ff4c83ce709c17c727da1fdb.1741875692.git.leon@kernel.org +Signed-off-by: Leon Romanovsky +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mr.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c +index 068eac3bdb50b..830a15b66c120 100644 +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -1968,7 +1968,6 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, + + if (mr->mmkey.cache_ent) { + spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); +- mr->mmkey.cache_ent->in_use--; + goto end; + } + +@@ -2036,6 +2035,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) + bool is_odp = is_odp_mr(mr); + bool is_odp_dma_buf = is_dmabuf_mr(mr) && + !to_ib_umem_dmabuf(mr->umem)->pinned; ++ bool from_cache = !!ent; + int ret = 0; + + if (is_odp) +@@ -2048,6 +2048,8 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) + ent = mr->mmkey.cache_ent; + /* upon storing to a clean temp entry - schedule its cleanup */ + spin_lock_irq(&ent->mkeys_queue.lock); ++ if (from_cache) ++ ent->in_use--; + if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, + msecs_to_jiffies(30 * 1000)); +-- +2.39.5 + diff --git a/queue-6.12/series b/queue-6.12/series index 17d05c90c0..937ff7a2e5 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -192,3 +192,8 @@ firmware-arm_ffa-replace-mutex-with-rwlock-to-avoid-.patch add-a-string-to-qstr-constructor.patch module-provide-export_symbol_gpl_for_modules-helper.patch fs-export-anon_inode_make_secure_inode-and-fix-secre.patch +rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch +ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch +drm-xe-bmg-update-wa_22019338487.patch +drm-xe-allow-dropping-kunit-dependency-as-built-in.patch +nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch
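
For the IB/mlx5 MR deregistration fix queued above, the heart of the
change is the lock ordering on the teardown path (schematic only;
"umem_odp", "dev" and "mr" are the usual local variables, and the real
code is mlx5_umr_revoke_mr_with_lock() plus mlx5r_handle_mkey_cleanup()
in that patch):

	/* before: cache store performed while still holding umem_mutex */
	mutex_lock(&umem_odp->umem_mutex);
	mlx5r_umr_revoke_mr(mr);
	cache_ent_find_and_store(dev, mr);	/* takes rb_lock, may kzalloc() */
	mutex_unlock(&umem_odp->umem_mutex);

	/* after: umem_mutex dropped before any allocation can hit reclaim */
	mutex_lock(&umem_odp->umem_mutex);
	mlx5r_umr_revoke_mr(mr);
	mutex_unlock(&umem_odp->umem_mutex);
	cache_ent_find_and_store(dev, mr);	/* rb_lock + kzalloc() without umem_mutex */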