--- /dev/null
+From 75353bdeb7ef9998c389c3cc89ec06a2ba84b5b9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Jul 2025 02:14:17 -0400
+Subject: drm/xe: Allow dropping kunit dependency as built-in
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Harry Austen <hpausten@protonmail.com>
+
+[ Upstream commit aa18d5769fcafe645a3ba01a9a69dde4f8dc8cc3 ]
+
+Fix the Kconfig symbol dependency on KUNIT, which isn't actually
+required for XE to be built-in. However, if XE is built-in and KUNIT is
+enabled, KUNIT must be built-in too.
+
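+In practice, "depends on KUNIT || !KUNIT" evaluates as follows:
+
+  KUNIT=n or KUNIT=y  ->  DRM_XE may be y or m
+  KUNIT=m             ->  DRM_XE is limited to m (or disabled)
+
+so the problematic combination of built-in XE with modular KUNIT stays
+impossible, while KUNIT itself is no longer required for DRM_XE=y.
+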
+Fixes: 08987a8b6820 ("drm/xe: Fix build with KUNIT=m")
+Cc: Lucas De Marchi <lucas.demarchi@intel.com>
+Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
+Cc: Jani Nikula <jani.nikula@linux.intel.com>
+Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+Signed-off-by: Harry Austen <hpausten@protonmail.com>
+Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
+Acked-by: Randy Dunlap <rdunlap@infradead.org>
+Tested-by: Randy Dunlap <rdunlap@infradead.org>
+Link: https://lore.kernel.org/r/20250627-xe-kunit-v2-2-756fe5cd56cf@intel.com
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+(cherry picked from commit a559434880b320b83733d739733250815aecf1b0)
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/xe/Kconfig | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig
+index 7bbe46a98ff1f..93e742c1f21e7 100644
+--- a/drivers/gpu/drm/xe/Kconfig
++++ b/drivers/gpu/drm/xe/Kconfig
+@@ -1,7 +1,8 @@
+ # SPDX-License-Identifier: GPL-2.0-only
+ config DRM_XE
+ tristate "Intel Xe Graphics"
+- depends on DRM && PCI && MMU && (m || (y && KUNIT=y))
++ depends on DRM && PCI && MMU
++ depends on KUNIT || !KUNIT
+ select INTERVAL_TREE
+ # we need shmfs for the swappable backing store, and in particular
+ # the shmem_readpage() which depends upon tmpfs
+--
+2.39.5
+
--- /dev/null
+From d043d53c34c64bb9fac3f2278c081f7b3b83764d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 18 Jun 2025 11:50:01 -0700
+Subject: drm/xe/bmg: Update Wa_22019338487
+
+From: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
+
+[ Upstream commit 84c0b4a00610afbde650fdb8ad6db0424f7b2cc3 ]
+
+Limit GT max frequency to 2600MHz and wait for frequency to reduce
+before proceeding with a transient flush. This is really only needed for
+the transient flush: if L2 flush is needed due to 16023588340 then
+there's no need to do this additional wait since we are already using
+the bigger hammer.
+
+v2: Use generic names, ensure user-set max frequency requests wait
+for flush to complete (Rodrigo)
+v3:
+ - User requests wait via wait_var_event_timeout (Lucas)
+ - Close races on flush + user requests (Lucas)
+ - Fix xe_guc_pc_remove_flush_freq_limit() being called on last gt
+ rather than root gt (Lucas)
+v4:
+ - Only apply the freq-reducing part if a TDF is needed: L2 flush trumps
+   the need to wait for a lower frequency
+
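+In simplified form, the resulting flow in xe_device_td_flush() is:
+
+	gt = xe_root_mmio_gt(xe);
+	if (XE_WA(gt, 16023588340)) {
+		/* L2 flush already covers the transient entries */
+		xe_device_l2_flush(xe);
+	} else {
+		xe_guc_pc_apply_flush_freq_limit(&gt->uc.guc.pc);
+		/* ... TDF flush on all graphics GTs ... */
+		xe_guc_pc_remove_flush_freq_limit(&xe_root_mmio_gt(xe)->uc.guc.pc);
+	}
+
+User requests through xe_guc_pc_set_max_freq() wait for the temporary
+cap to be lifted before applying a new maximum.
+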
+Fixes: aaa08078e725 ("drm/xe/bmg: Apply Wa_22019338487")
+Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
+Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
+Link: https://lore.kernel.org/r/20250618-wa-22019338487-v5-4-b888388477f2@intel.com
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+(cherry picked from commit deea6a7d6d803d6bb874a3e6f1b312e560e6c6df)
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/xe/xe_device.c | 55 +++++-----
+ drivers/gpu/drm/xe/xe_guc_pc.c | 144 +++++++++++++++++++++++++++
+ drivers/gpu/drm/xe/xe_guc_pc.h | 2 +
+ drivers/gpu/drm/xe/xe_guc_pc_types.h | 2 +
+ 4 files changed, 179 insertions(+), 24 deletions(-)
+
+diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
+index 0c3db53b93d8a..82da51a6616a1 100644
+--- a/drivers/gpu/drm/xe/xe_device.c
++++ b/drivers/gpu/drm/xe/xe_device.c
+@@ -37,6 +37,7 @@
+ #include "xe_gt_printk.h"
+ #include "xe_gt_sriov_vf.h"
+ #include "xe_guc.h"
++#include "xe_guc_pc.h"
+ #include "xe_hw_engine_group.h"
+ #include "xe_hwmon.h"
+ #include "xe_irq.h"
+@@ -871,31 +872,37 @@ void xe_device_td_flush(struct xe_device *xe)
+ if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
+ return;
+
+- if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) {
++ gt = xe_root_mmio_gt(xe);
++ if (XE_WA(gt, 16023588340)) {
++ /* A transient flush is not sufficient: flush the L2 */
+ xe_device_l2_flush(xe);
+- return;
+- }
+-
+- for_each_gt(gt, xe, id) {
+- if (xe_gt_is_media_type(gt))
+- continue;
+-
+- if (xe_force_wake_get(gt_to_fw(gt), XE_FW_GT))
+- return;
+-
+- xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
+- /*
+- * FIXME: We can likely do better here with our choice of
+- * timeout. Currently we just assume the worst case, i.e. 150us,
+- * which is believed to be sufficient to cover the worst case
+- * scenario on current platforms if all cache entries are
+- * transient and need to be flushed..
+- */
+- if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0,
+- 150, NULL, false))
+- xe_gt_err_once(gt, "TD flush timeout\n");
+-
+- xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
++ } else {
++		xe_guc_pc_apply_flush_freq_limit(&gt->uc.guc.pc);
++
++ /* Execute TDF flush on all graphics GTs */
++ for_each_gt(gt, xe, id) {
++ if (xe_gt_is_media_type(gt))
++ continue;
++
++ if (xe_force_wake_get(gt_to_fw(gt), XE_FW_GT))
++ return;
++
++ xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
++ /*
++ * FIXME: We can likely do better here with our choice of
++ * timeout. Currently we just assume the worst case, i.e. 150us,
++ * which is believed to be sufficient to cover the worst case
++ * scenario on current platforms if all cache entries are
++ * transient and need to be flushed..
++ */
++ if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0,
++ 150, NULL, false))
++ xe_gt_err_once(gt, "TD flush timeout\n");
++
++ xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
++ }
++
++ xe_guc_pc_remove_flush_freq_limit(&xe_root_mmio_gt(xe)->uc.guc.pc);
+ }
+ }
+
+diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
+index f978da8be35c2..af02803c145bf 100644
+--- a/drivers/gpu/drm/xe/xe_guc_pc.c
++++ b/drivers/gpu/drm/xe/xe_guc_pc.c
+@@ -6,6 +6,9 @@
+ #include "xe_guc_pc.h"
+
+ #include <linux/delay.h>
++#include <linux/jiffies.h>
++#include <linux/ktime.h>
++#include <linux/wait_bit.h>
+
+ #include <drm/drm_managed.h>
+ #include <generated/xe_wa_oob.h>
+@@ -47,6 +50,12 @@
+
+ #define LNL_MERT_FREQ_CAP 800
+ #define BMG_MERT_FREQ_CAP 2133
++#define BMG_MIN_FREQ 1200
++#define BMG_MERT_FLUSH_FREQ_CAP 2600
++
++#define SLPC_RESET_TIMEOUT_MS 5 /* roughly 5ms, but no need for precision */
++#define SLPC_RESET_EXTENDED_TIMEOUT_MS 1000 /* To be used only at pc_start */
++#define SLPC_ACT_FREQ_TIMEOUT_MS 100
+
+ /**
+ * DOC: GuC Power Conservation (PC)
+@@ -133,6 +142,36 @@ static int wait_for_pc_state(struct xe_guc_pc *pc,
+ return -ETIMEDOUT;
+ }
+
++static int wait_for_flush_complete(struct xe_guc_pc *pc)
++{
++ const unsigned long timeout = msecs_to_jiffies(30);
++
++ if (!wait_var_event_timeout(&pc->flush_freq_limit,
++ !atomic_read(&pc->flush_freq_limit),
++ timeout))
++ return -ETIMEDOUT;
++
++ return 0;
++}
++
++static int wait_for_act_freq_limit(struct xe_guc_pc *pc, u32 freq)
++{
++ int timeout_us = SLPC_ACT_FREQ_TIMEOUT_MS * USEC_PER_MSEC;
++ int slept, wait = 10;
++
++ for (slept = 0; slept < timeout_us;) {
++ if (xe_guc_pc_get_act_freq(pc) <= freq)
++ return 0;
++
++ usleep_range(wait, wait << 1);
++ slept += wait;
++ wait <<= 1;
++ if (slept + wait > timeout_us)
++ wait = timeout_us - slept;
++ }
++
++ return -ETIMEDOUT;
++}
+ static int pc_action_reset(struct xe_guc_pc *pc)
+ {
+ struct xe_guc_ct *ct = pc_to_ct(pc);
+@@ -584,6 +623,11 @@ int xe_guc_pc_set_max_freq(struct xe_guc_pc *pc, u32 freq)
+ {
+ int ret;
+
++ if (XE_WA(pc_to_gt(pc), 22019338487)) {
++ if (wait_for_flush_complete(pc) != 0)
++ return -EAGAIN;
++ }
++
+ mutex_lock(&pc->freq_lock);
+ if (!pc->freq_ready) {
+ /* Might be in the middle of a gt reset */
+@@ -793,6 +837,106 @@ static int pc_adjust_requested_freq(struct xe_guc_pc *pc)
+ return ret;
+ }
+
++static bool needs_flush_freq_limit(struct xe_guc_pc *pc)
++{
++ struct xe_gt *gt = pc_to_gt(pc);
++
++ return XE_WA(gt, 22019338487) &&
++ pc->rp0_freq > BMG_MERT_FLUSH_FREQ_CAP;
++}
++
++/**
++ * xe_guc_pc_apply_flush_freq_limit() - Limit max GT freq during L2 flush
++ * @pc: the xe_guc_pc object
++ *
++ * As per the WA, reduce max GT frequency during L2 cache flush
++ */
++void xe_guc_pc_apply_flush_freq_limit(struct xe_guc_pc *pc)
++{
++ struct xe_gt *gt = pc_to_gt(pc);
++ u32 max_freq;
++ int ret;
++
++ if (!needs_flush_freq_limit(pc))
++ return;
++
++ mutex_lock(&pc->freq_lock);
++
++ if (!pc->freq_ready) {
++ mutex_unlock(&pc->freq_lock);
++ return;
++ }
++
++ ret = pc_action_query_task_state(pc);
++ if (ret) {
++ mutex_unlock(&pc->freq_lock);
++ return;
++ }
++
++ max_freq = pc_get_max_freq(pc);
++ if (max_freq > BMG_MERT_FLUSH_FREQ_CAP) {
++ ret = pc_set_max_freq(pc, BMG_MERT_FLUSH_FREQ_CAP);
++ if (ret) {
++ xe_gt_err_once(gt, "Failed to cap max freq on flush to %u, %pe\n",
++ BMG_MERT_FLUSH_FREQ_CAP, ERR_PTR(ret));
++ mutex_unlock(&pc->freq_lock);
++ return;
++ }
++
++ atomic_set(&pc->flush_freq_limit, 1);
++
++ /*
++ * If user has previously changed max freq, stash that value to
++ * restore later, otherwise use the current max. New user
++ * requests wait on flush.
++ */
++ if (pc->user_requested_max != 0)
++ pc->stashed_max_freq = pc->user_requested_max;
++ else
++ pc->stashed_max_freq = max_freq;
++ }
++
++ mutex_unlock(&pc->freq_lock);
++
++ /*
++ * Wait for actual freq to go below the flush cap: even if the previous
++ * max was below cap, the current one might still be above it
++ */
++ ret = wait_for_act_freq_limit(pc, BMG_MERT_FLUSH_FREQ_CAP);
++ if (ret)
++ xe_gt_err_once(gt, "Actual freq did not reduce to %u, %pe\n",
++ BMG_MERT_FLUSH_FREQ_CAP, ERR_PTR(ret));
++}
++
++/**
++ * xe_guc_pc_remove_flush_freq_limit() - Remove max GT freq limit after L2 flush completes.
++ * @pc: the xe_guc_pc object
++ *
++ * Retrieve the previous GT max frequency value.
++ */
++void xe_guc_pc_remove_flush_freq_limit(struct xe_guc_pc *pc)
++{
++ struct xe_gt *gt = pc_to_gt(pc);
++ int ret = 0;
++
++ if (!needs_flush_freq_limit(pc))
++ return;
++
++ if (!atomic_read(&pc->flush_freq_limit))
++ return;
++
++ mutex_lock(&pc->freq_lock);
++
++	ret = pc_set_max_freq(&gt->uc.guc.pc, pc->stashed_max_freq);
++ if (ret)
++ xe_gt_err_once(gt, "Failed to restore max freq %u:%d",
++ pc->stashed_max_freq, ret);
++
++ atomic_set(&pc->flush_freq_limit, 0);
++ mutex_unlock(&pc->freq_lock);
++ wake_up_var(&pc->flush_freq_limit);
++}
++
+ static int pc_set_mert_freq_cap(struct xe_guc_pc *pc)
+ {
+ int ret = 0;
+diff --git a/drivers/gpu/drm/xe/xe_guc_pc.h b/drivers/gpu/drm/xe/xe_guc_pc.h
+index efda432fadfc8..7154b3aab0d84 100644
+--- a/drivers/gpu/drm/xe/xe_guc_pc.h
++++ b/drivers/gpu/drm/xe/xe_guc_pc.h
+@@ -34,5 +34,7 @@ u64 xe_guc_pc_mc6_residency(struct xe_guc_pc *pc);
+ void xe_guc_pc_init_early(struct xe_guc_pc *pc);
+ int xe_guc_pc_restore_stashed_freq(struct xe_guc_pc *pc);
+ void xe_guc_pc_raise_unslice(struct xe_guc_pc *pc);
++void xe_guc_pc_apply_flush_freq_limit(struct xe_guc_pc *pc);
++void xe_guc_pc_remove_flush_freq_limit(struct xe_guc_pc *pc);
+
+ #endif /* _XE_GUC_PC_H_ */
+diff --git a/drivers/gpu/drm/xe/xe_guc_pc_types.h b/drivers/gpu/drm/xe/xe_guc_pc_types.h
+index 13810be015db5..5b86d91296cb9 100644
+--- a/drivers/gpu/drm/xe/xe_guc_pc_types.h
++++ b/drivers/gpu/drm/xe/xe_guc_pc_types.h
+@@ -15,6 +15,8 @@
+ struct xe_guc_pc {
+ /** @bo: GGTT buffer object that is shared with GuC PC */
+ struct xe_bo *bo;
++ /** @flush_freq_limit: 1 when max freq changes are limited by driver */
++ atomic_t flush_freq_limit;
+ /** @rp0_freq: HW RP0 frequency - The Maximum one */
+ u32 rp0_freq;
+ /** @rpe_freq: HW RPe frequency - The Efficient one */
+--
+2.39.5
+
--- /dev/null
+From 60a8ab6c686344fa2811dbcd1d1e5d9e7f13ee24 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 Jun 2025 11:14:09 +0300
+Subject: IB/mlx5: Fix potential deadlock in MR deregistration
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Or Har-Toov <ohartoov@nvidia.com>
+
+[ Upstream commit 2ed25aa7f7711f508b6120e336f05cd9d49943c0 ]
+
+The issue arises when kzalloc() is invoked while holding umem_mutex or
+any other lock acquired under umem_mutex. This is problematic because
+kzalloc() can trigger fs_reclaim_acquire(), which may, in turn, invoke
+mmu_notifier_invalidate_range_start(). This function can lead to
+mlx5_ib_invalidate_range(), which attempts to acquire umem_mutex again,
+resulting in a deadlock.
+
+The problematic flow:
+ CPU0 | CPU1
+---------------------------------------|------------------------------------------------
+mlx5_ib_dereg_mr() |
+ → revoke_mr() |
+ → mutex_lock(&umem_odp->umem_mutex) |
+ | mlx5_mkey_cache_init()
+ | → mutex_lock(&dev->cache.rb_lock)
+ | → mlx5r_cache_create_ent_locked()
+ | → kzalloc(GFP_KERNEL)
+ | → fs_reclaim()
+ | → mmu_notifier_invalidate_range_start()
+ | → mlx5_ib_invalidate_range()
+ | → mutex_lock(&umem_odp->umem_mutex)
+ → cache_ent_find_and_store() |
+ → mutex_lock(&dev->cache.rb_lock) |
+
+Additionally, when kzalloc() is called from within
+cache_ent_find_and_store(), we encounter the same deadlock due to
+re-acquisition of umem_mutex.
+
+Solve by releasing umem_mutex in dereg_mr() after umr_revoke_mr()
+and before acquiring rb_lock. This ensures that we don't hold
+umem_mutex while performing memory allocations that could trigger
+the reclaim path.
+
+This change prevents the deadlock by ensuring proper lock ordering and
+avoiding holding locks during memory allocation operations that could
+trigger the reclaim path.
+
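+Simplified, the deregistration path now looks like:
+
+	mlx5r_handle_mkey_cleanup()
+	  mlx5_umr_revoke_mr_with_lock()  /* umem_mutex taken and dropped here */
+	  cache_ent_find_and_store()      /* rb_lock + allocations, umem_mutex not held */
+	  destroy_mkey()                  /* umem_mutex re-taken only if the mkey
+					     was not returned to the cache */
+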
+The following lockdep warning demonstrates the deadlock:
+
+ python3/20557 is trying to acquire lock:
+ ffff888387542128 (&umem_odp->umem_mutex){+.+.}-{4:4}, at:
+ mlx5_ib_invalidate_range+0x5b/0x550 [mlx5_ib]
+
+ but task is already holding lock:
+ ffffffff82f6b840 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at:
+ unmap_vmas+0x7b/0x1a0
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #3 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}:
+ fs_reclaim_acquire+0x60/0xd0
+ mem_cgroup_css_alloc+0x6f/0x9b0
+ cgroup_init_subsys+0xa4/0x240
+ cgroup_init+0x1c8/0x510
+ start_kernel+0x747/0x760
+ x86_64_start_reservations+0x25/0x30
+ x86_64_start_kernel+0x73/0x80
+ common_startup_64+0x129/0x138
+
+ -> #2 (fs_reclaim){+.+.}-{0:0}:
+ fs_reclaim_acquire+0x91/0xd0
+ __kmalloc_cache_noprof+0x4d/0x4c0
+ mlx5r_cache_create_ent_locked+0x75/0x620 [mlx5_ib]
+ mlx5_mkey_cache_init+0x186/0x360 [mlx5_ib]
+ mlx5_ib_stage_post_ib_reg_umr_init+0x3c/0x60 [mlx5_ib]
+ __mlx5_ib_add+0x4b/0x190 [mlx5_ib]
+ mlx5r_probe+0xd9/0x320 [mlx5_ib]
+ auxiliary_bus_probe+0x42/0x70
+ really_probe+0xdb/0x360
+ __driver_probe_device+0x8f/0x130
+ driver_probe_device+0x1f/0xb0
+ __driver_attach+0xd4/0x1f0
+ bus_for_each_dev+0x79/0xd0
+ bus_add_driver+0xf0/0x200
+ driver_register+0x6e/0xc0
+ __auxiliary_driver_register+0x6a/0xc0
+ do_one_initcall+0x5e/0x390
+ do_init_module+0x88/0x240
+ init_module_from_file+0x85/0xc0
+ idempotent_init_module+0x104/0x300
+ __x64_sys_finit_module+0x68/0xc0
+ do_syscall_64+0x6d/0x140
+ entry_SYSCALL_64_after_hwframe+0x4b/0x53
+
+ -> #1 (&dev->cache.rb_lock){+.+.}-{4:4}:
+ __mutex_lock+0x98/0xf10
+ __mlx5_ib_dereg_mr+0x6f2/0x890 [mlx5_ib]
+ mlx5_ib_dereg_mr+0x21/0x110 [mlx5_ib]
+ ib_dereg_mr_user+0x85/0x1f0 [ib_core]
+ uverbs_free_mr+0x19/0x30 [ib_uverbs]
+ destroy_hw_idr_uobject+0x21/0x80 [ib_uverbs]
+ uverbs_destroy_uobject+0x60/0x3d0 [ib_uverbs]
+ uobj_destroy+0x57/0xa0 [ib_uverbs]
+ ib_uverbs_cmd_verbs+0x4d5/0x1210 [ib_uverbs]
+ ib_uverbs_ioctl+0x129/0x230 [ib_uverbs]
+ __x64_sys_ioctl+0x596/0xaa0
+ do_syscall_64+0x6d/0x140
+ entry_SYSCALL_64_after_hwframe+0x4b/0x53
+
+ -> #0 (&umem_odp->umem_mutex){+.+.}-{4:4}:
+ __lock_acquire+0x1826/0x2f00
+ lock_acquire+0xd3/0x2e0
+ __mutex_lock+0x98/0xf10
+ mlx5_ib_invalidate_range+0x5b/0x550 [mlx5_ib]
+ __mmu_notifier_invalidate_range_start+0x18e/0x1f0
+ unmap_vmas+0x182/0x1a0
+ exit_mmap+0xf3/0x4a0
+ mmput+0x3a/0x100
+ do_exit+0x2b9/0xa90
+ do_group_exit+0x32/0xa0
+ get_signal+0xc32/0xcb0
+ arch_do_signal_or_restart+0x29/0x1d0
+ syscall_exit_to_user_mode+0x105/0x1d0
+ do_syscall_64+0x79/0x140
+ entry_SYSCALL_64_after_hwframe+0x4b/0x53
+
+ Chain exists of:
+ &dev->cache.rb_lock --> mmu_notifier_invalidate_range_start -->
+ &umem_odp->umem_mutex
+
+ Possible unsafe locking scenario:
+
+ CPU0 CPU1
+ ---- ----
+ lock(&umem_odp->umem_mutex);
+ lock(mmu_notifier_invalidate_range_start);
+ lock(&umem_odp->umem_mutex);
+ lock(&dev->cache.rb_lock);
+
+ *** DEADLOCK ***
+
+Fixes: abb604a1a9c8 ("RDMA/mlx5: Fix a race for an ODP MR which leads to CQE with error")
+Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
+Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
+Link: https://patch.msgid.link/3c8f225a8a9fade647d19b014df1172544643e4a.1750061612.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mr.c | 61 +++++++++++++++++++++++++--------
+ 1 file changed, 47 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index 830a15b66c120..726b81b6330c6 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -2028,23 +2028,50 @@ void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
+ }
+ }
+
+-static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
++static int mlx5_umr_revoke_mr_with_lock(struct mlx5_ib_mr *mr)
+ {
+- struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+- struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
+- bool is_odp = is_odp_mr(mr);
+ bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
+- !to_ib_umem_dmabuf(mr->umem)->pinned;
+- bool from_cache = !!ent;
+- int ret = 0;
++ !to_ib_umem_dmabuf(mr->umem)->pinned;
++ bool is_odp = is_odp_mr(mr);
++ int ret;
+
+ if (is_odp)
+ mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
+
+ if (is_odp_dma_buf)
+- dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);
++ dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
++ NULL);
++
++ ret = mlx5r_umr_revoke_mr(mr);
++
++ if (is_odp) {
++ if (!ret)
++ to_ib_umem_odp(mr->umem)->private = NULL;
++ mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
++ }
++
++ if (is_odp_dma_buf) {
++ if (!ret)
++ to_ib_umem_dmabuf(mr->umem)->private = NULL;
++ dma_resv_unlock(
++ to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
++ }
+
+- if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
++ return ret;
++}
++
++static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
++{
++ bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
++ !to_ib_umem_dmabuf(mr->umem)->pinned;
++ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
++ struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
++ bool is_odp = is_odp_mr(mr);
++ bool from_cache = !!ent;
++ int ret;
++
++ if (mr->mmkey.cacheable && !mlx5_umr_revoke_mr_with_lock(mr) &&
++ !cache_ent_find_and_store(dev, mr)) {
+ ent = mr->mmkey.cache_ent;
+ /* upon storing to a clean temp entry - schedule its cleanup */
+ spin_lock_irq(&ent->mkeys_queue.lock);
+@@ -2056,7 +2083,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+ ent->tmp_cleanup_scheduled = true;
+ }
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+- goto out;
++ return 0;
+ }
+
+ if (ent) {
+@@ -2065,8 +2092,14 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+ mr->mmkey.cache_ent = NULL;
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+ }
++
++ if (is_odp)
++ mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
++
++ if (is_odp_dma_buf)
++ dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
++ NULL);
+ ret = destroy_mkey(dev, mr);
+-out:
+ if (is_odp) {
+ if (!ret)
+ to_ib_umem_odp(mr->umem)->private = NULL;
+@@ -2076,9 +2109,9 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+ if (is_odp_dma_buf) {
+ if (!ret)
+ to_ib_umem_dmabuf(mr->umem)->private = NULL;
+- dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
++ dma_resv_unlock(
++ to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
+ }
+-
+ return ret;
+ }
+
+@@ -2127,7 +2160,7 @@ static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+ }
+
+ /* Stop DMA */
+- rc = mlx5_revoke_mr(mr);
++ rc = mlx5r_handle_mkey_cleanup(mr);
+ if (rc)
+ return rc;
+
+--
+2.39.5
+
--- /dev/null
+From 44cf8d0200904f04abc0499e99bf8bceee19bcb1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 19 Jun 2025 15:16:11 -0400
+Subject: NFSv4/flexfiles: Fix handling of NFS level errors in I/O
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit 38074de35b015df5623f524d6f2b49a0cd395c40 ]
+
+Allow the flexfiles error handling to recognise NFS level errors (as
+opposed to RPC level errors) and handle them separately. The main
+motivator is the NFSERR_PERM errors that get returned if the NFS client
+connects to the data server through a port number that is lower than
+1024. In that case, the client should disconnect and retry a READ on a
+different data server, or it should retry a WRITE after reconnecting.
+
+Reviewed-by: Tigran Mkrtchyan <tigran.mkrtchyan@desy.de>
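+The v3/v4 handlers now dispatch on the NFS-level op_status before
+falling back to the existing RPC-level tk_status handling, roughly:
+
+	switch (op_status) {		/* NFS-level status from the DS */
+	case NFSERR_PERM:
+		xprt_force_disconnect(task->tk_xprt);
+		goto out_retry;
+	/* ... other NFS-level errors ... */
+	default:
+		break;
+	}
+
+	switch (task->tk_status) {	/* RPC-level status, as before */
+	/* ... connection errors, etc. ... */
+	default:
+		break;
+	}
+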
+Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver")
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/flexfilelayout/flexfilelayout.c | 121 ++++++++++++++++++-------
+ 1 file changed, 87 insertions(+), 34 deletions(-)
+
+diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
+index 8f7ea4076653d..bf96f7a8900c1 100644
+--- a/fs/nfs/flexfilelayout/flexfilelayout.c
++++ b/fs/nfs/flexfilelayout/flexfilelayout.c
+@@ -1104,6 +1104,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
+ }
+
+ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
++ u32 op_status,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+@@ -1114,32 +1115,42 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+- switch (task->tk_status) {
+- case -NFS4ERR_BADSESSION:
+- case -NFS4ERR_BADSLOT:
+- case -NFS4ERR_BAD_HIGH_SLOT:
+- case -NFS4ERR_DEADSESSION:
+- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+- case -NFS4ERR_SEQ_FALSE_RETRY:
+- case -NFS4ERR_SEQ_MISORDERED:
++ switch (op_status) {
++ case NFS4_OK:
++ case NFS4ERR_NXIO:
++ break;
++ case NFSERR_PERM:
++ if (!task->tk_xprt)
++ break;
++ xprt_force_disconnect(task->tk_xprt);
++ goto out_retry;
++ case NFS4ERR_BADSESSION:
++ case NFS4ERR_BADSLOT:
++ case NFS4ERR_BAD_HIGH_SLOT:
++ case NFS4ERR_DEADSESSION:
++ case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
++ case NFS4ERR_SEQ_FALSE_RETRY:
++ case NFS4ERR_SEQ_MISORDERED:
+ dprintk("%s ERROR %d, Reset session. Exchangeid "
+ "flags 0x%x\n", __func__, task->tk_status,
+ clp->cl_exchange_flags);
+ nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+- break;
+- case -NFS4ERR_DELAY:
+- case -NFS4ERR_GRACE:
++ goto out_retry;
++ case NFS4ERR_DELAY:
++ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
++ fallthrough;
++ case NFS4ERR_GRACE:
+ rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
+- break;
+- case -NFS4ERR_RETRY_UNCACHED_REP:
+- break;
++ goto out_retry;
++ case NFS4ERR_RETRY_UNCACHED_REP:
++ goto out_retry;
+ /* Invalidate Layout errors */
+- case -NFS4ERR_PNFS_NO_LAYOUT:
+- case -ESTALE: /* mapped NFS4ERR_STALE */
+- case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
+- case -EISDIR: /* mapped NFS4ERR_ISDIR */
+- case -NFS4ERR_FHEXPIRED:
+- case -NFS4ERR_WRONG_TYPE:
++ case NFS4ERR_PNFS_NO_LAYOUT:
++ case NFS4ERR_STALE:
++ case NFS4ERR_BADHANDLE:
++ case NFS4ERR_ISDIR:
++ case NFS4ERR_FHEXPIRED:
++ case NFS4ERR_WRONG_TYPE:
+ dprintk("%s Invalid layout error %d\n", __func__,
+ task->tk_status);
+ /*
+@@ -1152,6 +1163,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ pnfs_destroy_layout(NFS_I(inode));
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ goto reset;
++ default:
++ break;
++ }
++
++ switch (task->tk_status) {
+ /* RPC connection errors */
+ case -ECONNREFUSED:
+ case -EHOSTDOWN:
+@@ -1167,26 +1183,56 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+ &devid->deviceid);
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+- fallthrough;
++ break;
+ default:
+- if (ff_layout_avoid_mds_available_ds(lseg))
+- return -NFS4ERR_RESET_TO_PNFS;
+-reset:
+- dprintk("%s Retry through MDS. Error %d\n", __func__,
+- task->tk_status);
+- return -NFS4ERR_RESET_TO_MDS;
++ break;
+ }
++
++ if (ff_layout_avoid_mds_available_ds(lseg))
++ return -NFS4ERR_RESET_TO_PNFS;
++reset:
++ dprintk("%s Retry through MDS. Error %d\n", __func__,
++ task->tk_status);
++ return -NFS4ERR_RESET_TO_MDS;
++
++out_retry:
+ task->tk_status = 0;
+ return -EAGAIN;
+ }
+
+ /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
+ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
++ u32 op_status,
++ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+ u32 idx)
+ {
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
++ switch (op_status) {
++ case NFS_OK:
++ case NFSERR_NXIO:
++ break;
++ case NFSERR_PERM:
++ if (!task->tk_xprt)
++ break;
++ xprt_force_disconnect(task->tk_xprt);
++ goto out_retry;
++ case NFSERR_ACCES:
++ case NFSERR_BADHANDLE:
++ case NFSERR_FBIG:
++ case NFSERR_IO:
++ case NFSERR_NOSPC:
++ case NFSERR_ROFS:
++ case NFSERR_STALE:
++ goto out_reset_to_pnfs;
++ case NFSERR_JUKEBOX:
++ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
++ goto out_retry;
++ default:
++ break;
++ }
++
+ switch (task->tk_status) {
+ /* File access problems. Don't mark the device as unavailable */
+ case -EACCES:
+@@ -1205,6 +1251,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+ &devid->deviceid);
+ }
++out_reset_to_pnfs:
+ /* FIXME: Need to prevent infinite looping here. */
+ return -NFS4ERR_RESET_TO_PNFS;
+ out_retry:
+@@ -1215,6 +1262,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ }
+
+ static int ff_layout_async_handle_error(struct rpc_task *task,
++ u32 op_status,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+@@ -1233,10 +1281,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
+
+ switch (vers) {
+ case 3:
+- return ff_layout_async_handle_error_v3(task, lseg, idx);
+- case 4:
+- return ff_layout_async_handle_error_v4(task, state, clp,
++ return ff_layout_async_handle_error_v3(task, op_status, clp,
+ lseg, idx);
++ case 4:
++ return ff_layout_async_handle_error_v4(task, op_status, state,
++ clp, lseg, idx);
+ default:
+ /* should never happen */
+ WARN_ON_ONCE(1);
+@@ -1289,6 +1338,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
+ switch (status) {
+ case NFS4ERR_DELAY:
+ case NFS4ERR_GRACE:
++ case NFS4ERR_PERM:
+ break;
+ case NFS4ERR_NXIO:
+ ff_layout_mark_ds_unreachable(lseg, idx);
+@@ -1321,7 +1371,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
+ trace_ff_layout_read_error(hdr);
+ }
+
+- err = ff_layout_async_handle_error(task, hdr->args.context->state,
++ err = ff_layout_async_handle_error(task, hdr->res.op_status,
++ hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+@@ -1491,7 +1542,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
+ trace_ff_layout_write_error(hdr);
+ }
+
+- err = ff_layout_async_handle_error(task, hdr->args.context->state,
++ err = ff_layout_async_handle_error(task, hdr->res.op_status,
++ hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+@@ -1537,8 +1589,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
+ trace_ff_layout_commit_error(data);
+ }
+
+- err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
+- data->lseg, data->ds_commit_index);
++ err = ff_layout_async_handle_error(task, data->res.op_status,
++ NULL, data->ds_clp, data->lseg,
++ data->ds_commit_index);
+
+ trace_nfs4_pnfs_commit_ds(data, err);
+ switch (err) {
+--
+2.39.5
+
--- /dev/null
+From bdc3aec857cebfd1c5f39e9e20f57533815bc9b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 13 Mar 2025 16:29:49 +0200
+Subject: RDMA/mlx5: Fix cache entry update on dereg error
+
+From: Michael Guralnik <michaelgur@nvidia.com>
+
+[ Upstream commit 24d693cf6c89d216a68634d44fa93e4400775d94 ]
+
+Fix double decrement of 'in_use' counter on push_mkey_locked() failure
+while deregistering an MR.
+If we fail to return an mkey to the cache in cache_ent_find_and_store()
+it'll update the 'in_use' counter. Its caller, revoke_mr(), also updates
+it, resulting in a double decrement.
+
+A wrong 'in_use' counter value will be exposed through debugfs and can
+also cause incorrect resizing of the cache when users try to set the
+cache entry size through the 'size' debugfs file.
+
+To address this issue, the 'in_use' counter is now decremented within
+mlx5_revoke_mr() after a successful call to cache_ent_find_and_store(),
+and no longer within cache_ent_find_and_store() itself. Other success
+and failure flows, where it was also decremented, remain unchanged.
+
+Fixes: 8c1185fef68c ("RDMA/mlx5: Change check for cacheable mkeys")
+Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
+Reviewed-by: Yishai Hadas <yishaih@nvidia.com>
+Link: https://patch.msgid.link/97e979dff636f232ff4c83ce709c17c727da1fdb.1741875692.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mr.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index 068eac3bdb50b..830a15b66c120 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -1968,7 +1968,6 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
+
+ if (mr->mmkey.cache_ent) {
+ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
+- mr->mmkey.cache_ent->in_use--;
+ goto end;
+ }
+
+@@ -2036,6 +2035,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+ bool is_odp = is_odp_mr(mr);
+ bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
+ !to_ib_umem_dmabuf(mr->umem)->pinned;
++ bool from_cache = !!ent;
+ int ret = 0;
+
+ if (is_odp)
+@@ -2048,6 +2048,8 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+ ent = mr->mmkey.cache_ent;
+ /* upon storing to a clean temp entry - schedule its cleanup */
+ spin_lock_irq(&ent->mkeys_queue.lock);
++ if (from_cache)
++ ent->in_use--;
+ if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
+ mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
+ msecs_to_jiffies(30 * 1000));
+--
+2.39.5
+
add-a-string-to-qstr-constructor.patch
module-provide-export_symbol_gpl_for_modules-helper.patch
fs-export-anon_inode_make_secure_inode-and-fix-secre.patch
+rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch
+ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch
+drm-xe-bmg-update-wa_22019338487.patch
+drm-xe-allow-dropping-kunit-dependency-as-built-in.patch
+nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch