Fixes for 6.12
Author:     Sasha Levin <sashal@kernel.org>
AuthorDate: Tue, 8 Jul 2025 00:37:04 +0000 (20:37 -0400)
Commit:     Sasha Levin <sashal@kernel.org>
CommitDate: Tue, 8 Jul 2025 00:37:04 +0000 (20:37 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-6.12/drm-xe-allow-dropping-kunit-dependency-as-built-in.patch [new file with mode: 0644]
queue-6.12/drm-xe-bmg-update-wa_22019338487.patch [new file with mode: 0644]
queue-6.12/ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch [new file with mode: 0644]
queue-6.12/nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch [new file with mode: 0644]
queue-6.12/rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch [new file with mode: 0644]
queue-6.12/series

diff --git a/queue-6.12/drm-xe-allow-dropping-kunit-dependency-as-built-in.patch b/queue-6.12/drm-xe-allow-dropping-kunit-dependency-as-built-in.patch
new file mode 100644
index 0000000..e2d1af6
--- /dev/null
@@ -0,0 +1,51 @@
+From 75353bdeb7ef9998c389c3cc89ec06a2ba84b5b9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Jul 2025 02:14:17 -0400
+Subject: drm/xe: Allow dropping kunit dependency as built-in
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Harry Austen <hpausten@protonmail.com>
+
+[ Upstream commit aa18d5769fcafe645a3ba01a9a69dde4f8dc8cc3 ]
+
+Fix Kconfig symbol dependency on KUNIT, which isn't actually required
+for XE to be built-in. However, if KUNIT is enabled, it must be built-in
+too.
+
+Fixes: 08987a8b6820 ("drm/xe: Fix build with KUNIT=m")
+Cc: Lucas De Marchi <lucas.demarchi@intel.com>
+Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
+Cc: Jani Nikula <jani.nikula@linux.intel.com>
+Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+Signed-off-by: Harry Austen <hpausten@protonmail.com>
+Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
+Acked-by: Randy Dunlap <rdunlap@infradead.org>
+Tested-by: Randy Dunlap <rdunlap@infradead.org>
+Link: https://lore.kernel.org/r/20250627-xe-kunit-v2-2-756fe5cd56cf@intel.com
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+(cherry picked from commit a559434880b320b83733d739733250815aecf1b0)
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/xe/Kconfig | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig
+index 7bbe46a98ff1f..93e742c1f21e7 100644
+--- a/drivers/gpu/drm/xe/Kconfig
++++ b/drivers/gpu/drm/xe/Kconfig
+@@ -1,7 +1,8 @@
+ # SPDX-License-Identifier: GPL-2.0-only
+ config DRM_XE
+       tristate "Intel Xe Graphics"
+-      depends on DRM && PCI && MMU && (m || (y && KUNIT=y))
++      depends on DRM && PCI && MMU
++      depends on KUNIT || !KUNIT
+       select INTERVAL_TREE
+       # we need shmfs for the swappable backing store, and in particular
+       # the shmem_readpage() which depends upon tmpfs
+-- 
+2.39.5
+
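
Annotation: the "depends on KUNIT || !KUNIT" line looks like a no-op but is not. In Kconfig tristate logic (!n=y, !m=m, !y=n, and || taking the maximum), the expression evaluates to y when KUNIT is y or n, but only to m when KUNIT=m, so DRM_XE is capped at module exactly when KUNIT is modular - the constraint the patch wants. A small standalone C sketch of that tristate evaluation (the names and table are illustrative, not from the kernel build system):

#include <stdio.h>

/* Kconfig tristate values, ordered n < m < y */
enum tri { N, M, Y };

static const char *tname[] = { "n", "m", "y" };

/* !n = y, !m = m, !y = n */
static enum tri tri_not(enum tri a) { return (enum tri)(Y - a); }
/* a || b takes the maximum of the two values */
static enum tri tri_or(enum tri a, enum tri b) { return a > b ? a : b; }

int main(void)
{
	for (int kunit = N; kunit <= Y; kunit++) {
		/* upper bound placed on DRM_XE by: KUNIT || !KUNIT */
		enum tri cap = tri_or((enum tri)kunit,
				      tri_not((enum tri)kunit));
		printf("KUNIT=%s -> DRM_XE can be at most %s\n",
		       tname[kunit], tname[cap]);
	}
	return 0; /* prints y, m, y: only KUNIT=m forbids built-in XE */
}

This is the same effect the old "(m || (y && KUNIT=y))" expression encoded, written in the more idiomatic form.
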
diff --git a/queue-6.12/drm-xe-bmg-update-wa_22019338487.patch b/queue-6.12/drm-xe-bmg-update-wa_22019338487.patch
new file mode 100644
index 0000000..2ff6942
--- /dev/null
@@ -0,0 +1,326 @@
+From d043d53c34c64bb9fac3f2278c081f7b3b83764d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 18 Jun 2025 11:50:01 -0700
+Subject: drm/xe/bmg: Update Wa_22019338487
+
+From: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
+
+[ Upstream commit 84c0b4a00610afbde650fdb8ad6db0424f7b2cc3 ]
+
+Limit GT max frequency to 2600MHz and wait for frequency to reduce
+before proceeding with a transient flush. This is really only needed for
+the transient flush: if L2 flush is needed due to 16023588340 then
+there's no need to do this additional wait since we are already using
+the bigger hammer.
+
+v2: Use generic names, ensure user set max frequency requests wait
+for flush to complete (Rodrigo)
+v3:
+ - User requests wait via wait_var_event_timeout (Lucas)
+ - Close races on flush + user requests (Lucas)
+ - Fix xe_guc_pc_remove_flush_freq_limit() being called on last gt
+   rather than root gt (Lucas)
+v4:
+ - Only apply the freq reducing part if a TDF is needed: L2 flush trumps
+   the need for waiting for a lower frequency
+
+Fixes: aaa08078e725 ("drm/xe/bmg: Apply Wa_22019338487")
+Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
+Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
+Link: https://lore.kernel.org/r/20250618-wa-22019338487-v5-4-b888388477f2@intel.com
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+(cherry picked from commit deea6a7d6d803d6bb874a3e6f1b312e560e6c6df)
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/xe/xe_device.c       |  55 +++++-----
+ drivers/gpu/drm/xe/xe_guc_pc.c       | 144 +++++++++++++++++++++++++++
+ drivers/gpu/drm/xe/xe_guc_pc.h       |   2 +
+ drivers/gpu/drm/xe/xe_guc_pc_types.h |   2 +
+ 4 files changed, 179 insertions(+), 24 deletions(-)
+
+diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
+index 0c3db53b93d8a..82da51a6616a1 100644
+--- a/drivers/gpu/drm/xe/xe_device.c
++++ b/drivers/gpu/drm/xe/xe_device.c
+@@ -37,6 +37,7 @@
+ #include "xe_gt_printk.h"
+ #include "xe_gt_sriov_vf.h"
+ #include "xe_guc.h"
++#include "xe_guc_pc.h"
+ #include "xe_hw_engine_group.h"
+ #include "xe_hwmon.h"
+ #include "xe_irq.h"
+@@ -871,31 +872,37 @@ void xe_device_td_flush(struct xe_device *xe)
+       if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
+               return;
+-      if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) {
++      gt = xe_root_mmio_gt(xe);
++      if (XE_WA(gt, 16023588340)) {
++              /* A transient flush is not sufficient: flush the L2 */
+               xe_device_l2_flush(xe);
+-              return;
+-      }
+-
+-      for_each_gt(gt, xe, id) {
+-              if (xe_gt_is_media_type(gt))
+-                      continue;
+-
+-              if (xe_force_wake_get(gt_to_fw(gt), XE_FW_GT))
+-                      return;
+-
+-              xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
+-              /*
+-               * FIXME: We can likely do better here with our choice of
+-               * timeout. Currently we just assume the worst case, i.e. 150us,
+-               * which is believed to be sufficient to cover the worst case
+-               * scenario on current platforms if all cache entries are
+-               * transient and need to be flushed..
+-               */
+-              if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0,
+-                                 150, NULL, false))
+-                      xe_gt_err_once(gt, "TD flush timeout\n");
+-
+-              xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
++      } else {
++              xe_guc_pc_apply_flush_freq_limit(&gt->uc.guc.pc);
++              
++              /* Execute TDF flush on all graphics GTs */
++              for_each_gt(gt, xe, id) {
++                      if (xe_gt_is_media_type(gt))
++                              continue;
++
++                      if (xe_force_wake_get(gt_to_fw(gt), XE_FW_GT))
++                              return;
++
++                      xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
++                      /*
++                       * FIXME: We can likely do better here with our choice of
++                       * timeout. Currently we just assume the worst case, i.e. 150us,
++                       * which is believed to be sufficient to cover the worst case
++                       * scenario on current platforms if all cache entries are
++                       * transient and need to be flushed..
++                       */
++                      if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0,
++                                         150, NULL, false))
++                              xe_gt_err_once(gt, "TD flush timeout\n");
++
++                      xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
++              }
++              
++              xe_guc_pc_remove_flush_freq_limit(&xe_root_mmio_gt(xe)->uc.guc.pc);
+       }
+ }
+diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
+index f978da8be35c2..af02803c145bf 100644
+--- a/drivers/gpu/drm/xe/xe_guc_pc.c
++++ b/drivers/gpu/drm/xe/xe_guc_pc.c
+@@ -6,6 +6,9 @@
+ #include "xe_guc_pc.h"
+ #include <linux/delay.h>
++#include <linux/jiffies.h>
++#include <linux/ktime.h>
++#include <linux/wait_bit.h>
+ #include <drm/drm_managed.h>
+ #include <generated/xe_wa_oob.h>
+@@ -47,6 +50,12 @@
+ #define LNL_MERT_FREQ_CAP     800
+ #define BMG_MERT_FREQ_CAP     2133
++#define BMG_MIN_FREQ          1200
++#define BMG_MERT_FLUSH_FREQ_CAP       2600
++
++#define SLPC_RESET_TIMEOUT_MS 5 /* roughly 5ms, but no need for precision */
++#define SLPC_RESET_EXTENDED_TIMEOUT_MS 1000 /* To be used only at pc_start */
++#define SLPC_ACT_FREQ_TIMEOUT_MS 100
+ /**
+  * DOC: GuC Power Conservation (PC)
+@@ -133,6 +142,36 @@ static int wait_for_pc_state(struct xe_guc_pc *pc,
+       return -ETIMEDOUT;
+ }
++static int wait_for_flush_complete(struct xe_guc_pc *pc)
++{
++      const unsigned long timeout = msecs_to_jiffies(30);
++
++      if (!wait_var_event_timeout(&pc->flush_freq_limit,
++                                  !atomic_read(&pc->flush_freq_limit),
++                                  timeout))
++              return -ETIMEDOUT;
++
++      return 0;
++}
++
++static int wait_for_act_freq_limit(struct xe_guc_pc *pc, u32 freq)
++{
++      int timeout_us = SLPC_ACT_FREQ_TIMEOUT_MS * USEC_PER_MSEC;
++      int slept, wait = 10;
++
++      for (slept = 0; slept < timeout_us;) {
++              if (xe_guc_pc_get_act_freq(pc) <= freq)
++                      return 0;
++
++              usleep_range(wait, wait << 1);
++              slept += wait;
++              wait <<= 1;
++              if (slept + wait > timeout_us)
++                      wait = timeout_us - slept;
++      }
++
++      return -ETIMEDOUT;
++}
+ static int pc_action_reset(struct xe_guc_pc *pc)
+ {
+       struct xe_guc_ct *ct = pc_to_ct(pc);
+@@ -584,6 +623,11 @@ int xe_guc_pc_set_max_freq(struct xe_guc_pc *pc, u32 freq)
+ {
+       int ret;
++      if (XE_WA(pc_to_gt(pc), 22019338487)) {
++              if (wait_for_flush_complete(pc) != 0)
++                      return -EAGAIN;
++      }
++
+       mutex_lock(&pc->freq_lock);
+       if (!pc->freq_ready) {
+               /* Might be in the middle of a gt reset */
+@@ -793,6 +837,106 @@ static int pc_adjust_requested_freq(struct xe_guc_pc *pc)
+       return ret;
+ }
++static bool needs_flush_freq_limit(struct xe_guc_pc *pc)
++{
++      struct xe_gt *gt = pc_to_gt(pc);
++
++      return  XE_WA(gt, 22019338487) &&
++              pc->rp0_freq > BMG_MERT_FLUSH_FREQ_CAP;
++}
++
++/**
++ * xe_guc_pc_apply_flush_freq_limit() - Limit max GT freq during L2 flush
++ * @pc: the xe_guc_pc object
++ *
++ * As per the WA, reduce max GT frequency during L2 cache flush
++ */
++void xe_guc_pc_apply_flush_freq_limit(struct xe_guc_pc *pc)
++{
++      struct xe_gt *gt = pc_to_gt(pc);
++      u32 max_freq;
++      int ret;
++
++      if (!needs_flush_freq_limit(pc))
++              return;
++
++      mutex_lock(&pc->freq_lock);
++
++      if (!pc->freq_ready) {
++              mutex_unlock(&pc->freq_lock);
++              return;
++      }
++
++      ret = pc_action_query_task_state(pc);
++      if (ret) {
++              mutex_unlock(&pc->freq_lock);
++              return;
++      }
++
++      max_freq = pc_get_max_freq(pc);
++      if (max_freq > BMG_MERT_FLUSH_FREQ_CAP) {
++              ret = pc_set_max_freq(pc, BMG_MERT_FLUSH_FREQ_CAP);
++              if (ret) {
++                      xe_gt_err_once(gt, "Failed to cap max freq on flush to %u, %pe\n",
++                                     BMG_MERT_FLUSH_FREQ_CAP, ERR_PTR(ret));
++                      mutex_unlock(&pc->freq_lock);
++                      return;
++              }
++
++              atomic_set(&pc->flush_freq_limit, 1);
++
++              /*
++               * If user has previously changed max freq, stash that value to
++               * restore later, otherwise use the current max. New user
++               * requests wait on flush.
++               */
++              if (pc->user_requested_max != 0)
++                      pc->stashed_max_freq = pc->user_requested_max;
++              else
++                      pc->stashed_max_freq = max_freq;
++      }
++
++      mutex_unlock(&pc->freq_lock);
++
++      /*
++       * Wait for actual freq to go below the flush cap: even if the previous
++       * max was below cap, the current one might still be above it
++       */
++      ret = wait_for_act_freq_limit(pc, BMG_MERT_FLUSH_FREQ_CAP);
++      if (ret)
++              xe_gt_err_once(gt, "Actual freq did not reduce to %u, %pe\n",
++                             BMG_MERT_FLUSH_FREQ_CAP, ERR_PTR(ret));
++}
++
++/**
++ * xe_guc_pc_remove_flush_freq_limit() - Remove max GT freq limit after L2 flush completes.
++ * @pc: the xe_guc_pc object
++ *
++ * Retrieve the previous GT max frequency value.
++ */
++void xe_guc_pc_remove_flush_freq_limit(struct xe_guc_pc *pc)
++{
++      struct xe_gt *gt = pc_to_gt(pc);
++      int ret = 0;
++
++      if (!needs_flush_freq_limit(pc))
++              return;
++
++      if (!atomic_read(&pc->flush_freq_limit))
++              return;
++
++      mutex_lock(&pc->freq_lock);
++
++      ret = pc_set_max_freq(&gt->uc.guc.pc, pc->stashed_max_freq);
++      if (ret)
++              xe_gt_err_once(gt, "Failed to restore max freq %u:%d",
++                             pc->stashed_max_freq, ret);
++
++      atomic_set(&pc->flush_freq_limit, 0);
++      mutex_unlock(&pc->freq_lock);
++      wake_up_var(&pc->flush_freq_limit);
++}
++
+ static int pc_set_mert_freq_cap(struct xe_guc_pc *pc)
+ {
+       int ret = 0;
+diff --git a/drivers/gpu/drm/xe/xe_guc_pc.h b/drivers/gpu/drm/xe/xe_guc_pc.h
+index efda432fadfc8..7154b3aab0d84 100644
+--- a/drivers/gpu/drm/xe/xe_guc_pc.h
++++ b/drivers/gpu/drm/xe/xe_guc_pc.h
+@@ -34,5 +34,7 @@ u64 xe_guc_pc_mc6_residency(struct xe_guc_pc *pc);
+ void xe_guc_pc_init_early(struct xe_guc_pc *pc);
+ int xe_guc_pc_restore_stashed_freq(struct xe_guc_pc *pc);
+ void xe_guc_pc_raise_unslice(struct xe_guc_pc *pc);
++void xe_guc_pc_apply_flush_freq_limit(struct xe_guc_pc *pc);
++void xe_guc_pc_remove_flush_freq_limit(struct xe_guc_pc *pc);
+ #endif /* _XE_GUC_PC_H_ */
+diff --git a/drivers/gpu/drm/xe/xe_guc_pc_types.h b/drivers/gpu/drm/xe/xe_guc_pc_types.h
+index 13810be015db5..5b86d91296cb9 100644
+--- a/drivers/gpu/drm/xe/xe_guc_pc_types.h
++++ b/drivers/gpu/drm/xe/xe_guc_pc_types.h
+@@ -15,6 +15,8 @@
+ struct xe_guc_pc {
+       /** @bo: GGTT buffer object that is shared with GuC PC */
+       struct xe_bo *bo;
++      /** @flush_freq_limit: 1 when max freq changes are limited by driver */
++      atomic_t flush_freq_limit;
+       /** @rp0_freq: HW RP0 frequency - The Maximum one */
+       u32 rp0_freq;
+       /** @rpe_freq: HW RPe frequency - The Efficient one */
+-- 
+2.39.5
+
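
Annotation: wait_for_act_freq_limit() above is a standard poll-with-exponential-backoff loop: sleep 10us, double the sleep each round, and clamp the final sleep so the total never exceeds SLPC_ACT_FREQ_TIMEOUT_MS. A minimal userspace sketch of the same pattern, with a made-up read_freq() stub standing in for xe_guc_pc_get_act_freq() and placeholder numbers:

#include <stdio.h>
#include <unistd.h>

#define TIMEOUT_US (100 * 1000) /* mirrors SLPC_ACT_FREQ_TIMEOUT_MS */

/* Stub for xe_guc_pc_get_act_freq(); values are made up. */
static unsigned int read_freq(void)
{
	static unsigned int f = 3000;

	if (f > 2600)
		f -= 100; /* pretend the hardware ramps down over time */
	return f;
}

/* Poll until freq <= limit, sleeping 10us, 20us, 40us, ... and
 * clamping the last sleep so the total stays within the timeout. */
static int wait_below(unsigned int limit)
{
	int slept = 0, wait = 10;

	while (slept < TIMEOUT_US) {
		if (read_freq() <= limit)
			return 0;
		usleep(wait);
		slept += wait;
		wait <<= 1;
		if (slept + wait > TIMEOUT_US)
			wait = TIMEOUT_US - slept;
	}
	return -1; /* timed out, like -ETIMEDOUT in the driver */
}

int main(void)
{
	printf("wait_below(2600) = %d\n", wait_below(2600));
	return 0;
}

The backoff keeps register reads cheap early (when the frequency usually drops within microseconds) without spinning hard for the full 100ms worst case.
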
diff --git a/queue-6.12/ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch b/queue-6.12/ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch
new file mode 100644
index 0000000..d86d56f
--- /dev/null
@@ -0,0 +1,268 @@
+From 60a8ab6c686344fa2811dbcd1d1e5d9e7f13ee24 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 Jun 2025 11:14:09 +0300
+Subject: IB/mlx5: Fix potential deadlock in MR deregistration
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Or Har-Toov <ohartoov@nvidia.com>
+
+[ Upstream commit 2ed25aa7f7711f508b6120e336f05cd9d49943c0 ]
+
+The issue arises when kzalloc() is invoked while holding umem_mutex or
+any other lock acquired under umem_mutex. This is problematic because
+kzalloc() can trigger fs_reclaim_acquire(), which may, in turn, invoke
+mmu_notifier_invalidate_range_start(). This function can lead to
+mlx5_ib_invalidate_range(), which attempts to acquire umem_mutex again,
+resulting in a deadlock.
+
+The problematic flow:
+             CPU0                      |              CPU1
+---------------------------------------|------------------------------------------------
+mlx5_ib_dereg_mr()                     |
+ → revoke_mr()                         |
+   → mutex_lock(&umem_odp->umem_mutex) |
+                                       | mlx5_mkey_cache_init()
+                                       |  → mutex_lock(&dev->cache.rb_lock)
+                                       |  → mlx5r_cache_create_ent_locked()
+                                       |    → kzalloc(GFP_KERNEL)
+                                       |      → fs_reclaim()
+                                       |        → mmu_notifier_invalidate_range_start()
+                                       |          → mlx5_ib_invalidate_range()
+                                       |            → mutex_lock(&umem_odp->umem_mutex)
+   → cache_ent_find_and_store()        |
+     → mutex_lock(&dev->cache.rb_lock) |
+
+Additionally, when kzalloc() is called from within
+cache_ent_find_and_store(), we encounter the same deadlock due to
+re-acquisition of umem_mutex.
+
+Solve by releasing umem_mutex in dereg_mr() after umr_revoke_mr()
+and before acquiring rb_lock. This ensures that we don't hold
+umem_mutex while performing memory allocations that could trigger
+the reclaim path.
+
+This change prevents the deadlock by ensuring proper lock ordering and
+avoiding holding locks during memory allocation operations that could
+trigger the reclaim path.
+
+The following lockdep warning demonstrates the deadlock:
+
+ python3/20557 is trying to acquire lock:
+ ffff888387542128 (&umem_odp->umem_mutex){+.+.}-{4:4}, at:
+ mlx5_ib_invalidate_range+0x5b/0x550 [mlx5_ib]
+
+ but task is already holding lock:
+ ffffffff82f6b840 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at:
+ unmap_vmas+0x7b/0x1a0
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #3 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}:
+       fs_reclaim_acquire+0x60/0xd0
+       mem_cgroup_css_alloc+0x6f/0x9b0
+       cgroup_init_subsys+0xa4/0x240
+       cgroup_init+0x1c8/0x510
+       start_kernel+0x747/0x760
+       x86_64_start_reservations+0x25/0x30
+       x86_64_start_kernel+0x73/0x80
+       common_startup_64+0x129/0x138
+
+ -> #2 (fs_reclaim){+.+.}-{0:0}:
+       fs_reclaim_acquire+0x91/0xd0
+       __kmalloc_cache_noprof+0x4d/0x4c0
+       mlx5r_cache_create_ent_locked+0x75/0x620 [mlx5_ib]
+       mlx5_mkey_cache_init+0x186/0x360 [mlx5_ib]
+       mlx5_ib_stage_post_ib_reg_umr_init+0x3c/0x60 [mlx5_ib]
+       __mlx5_ib_add+0x4b/0x190 [mlx5_ib]
+       mlx5r_probe+0xd9/0x320 [mlx5_ib]
+       auxiliary_bus_probe+0x42/0x70
+       really_probe+0xdb/0x360
+       __driver_probe_device+0x8f/0x130
+       driver_probe_device+0x1f/0xb0
+       __driver_attach+0xd4/0x1f0
+       bus_for_each_dev+0x79/0xd0
+       bus_add_driver+0xf0/0x200
+       driver_register+0x6e/0xc0
+       __auxiliary_driver_register+0x6a/0xc0
+       do_one_initcall+0x5e/0x390
+       do_init_module+0x88/0x240
+       init_module_from_file+0x85/0xc0
+       idempotent_init_module+0x104/0x300
+       __x64_sys_finit_module+0x68/0xc0
+       do_syscall_64+0x6d/0x140
+       entry_SYSCALL_64_after_hwframe+0x4b/0x53
+
+ -> #1 (&dev->cache.rb_lock){+.+.}-{4:4}:
+       __mutex_lock+0x98/0xf10
+       __mlx5_ib_dereg_mr+0x6f2/0x890 [mlx5_ib]
+       mlx5_ib_dereg_mr+0x21/0x110 [mlx5_ib]
+       ib_dereg_mr_user+0x85/0x1f0 [ib_core]
+       uverbs_free_mr+0x19/0x30 [ib_uverbs]
+       destroy_hw_idr_uobject+0x21/0x80 [ib_uverbs]
+       uverbs_destroy_uobject+0x60/0x3d0 [ib_uverbs]
+       uobj_destroy+0x57/0xa0 [ib_uverbs]
+       ib_uverbs_cmd_verbs+0x4d5/0x1210 [ib_uverbs]
+       ib_uverbs_ioctl+0x129/0x230 [ib_uverbs]
+       __x64_sys_ioctl+0x596/0xaa0
+       do_syscall_64+0x6d/0x140
+       entry_SYSCALL_64_after_hwframe+0x4b/0x53
+
+ -> #0 (&umem_odp->umem_mutex){+.+.}-{4:4}:
+       __lock_acquire+0x1826/0x2f00
+       lock_acquire+0xd3/0x2e0
+       __mutex_lock+0x98/0xf10
+       mlx5_ib_invalidate_range+0x5b/0x550 [mlx5_ib]
+       __mmu_notifier_invalidate_range_start+0x18e/0x1f0
+       unmap_vmas+0x182/0x1a0
+       exit_mmap+0xf3/0x4a0
+       mmput+0x3a/0x100
+       do_exit+0x2b9/0xa90
+       do_group_exit+0x32/0xa0
+       get_signal+0xc32/0xcb0
+       arch_do_signal_or_restart+0x29/0x1d0
+       syscall_exit_to_user_mode+0x105/0x1d0
+       do_syscall_64+0x79/0x140
+       entry_SYSCALL_64_after_hwframe+0x4b/0x53
+
+ Chain exists of:
+ &dev->cache.rb_lock --> mmu_notifier_invalidate_range_start -->
+ &umem_odp->umem_mutex
+
+ Possible unsafe locking scenario:
+
+       CPU0                        CPU1
+       ----                        ----
+   lock(&umem_odp->umem_mutex);
+                                lock(mmu_notifier_invalidate_range_start);
+                                lock(&umem_odp->umem_mutex);
+   lock(&dev->cache.rb_lock);
+
+ *** DEADLOCK ***
+
+Fixes: abb604a1a9c8 ("RDMA/mlx5: Fix a race for an ODP MR which leads to CQE with error")
+Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
+Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
+Link: https://patch.msgid.link/3c8f225a8a9fade647d19b014df1172544643e4a.1750061612.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mr.c | 61 +++++++++++++++++++++++++--------
+ 1 file changed, 47 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index 830a15b66c120..726b81b6330c6 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -2028,23 +2028,50 @@ void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
+       }
+ }
+-static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
++static int mlx5_umr_revoke_mr_with_lock(struct mlx5_ib_mr *mr)
+ {
+-      struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+-      struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
+-      bool is_odp = is_odp_mr(mr);
+       bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
+-                      !to_ib_umem_dmabuf(mr->umem)->pinned;
+-      bool from_cache = !!ent;
+-      int ret = 0;
++                            !to_ib_umem_dmabuf(mr->umem)->pinned;
++      bool is_odp = is_odp_mr(mr);
++      int ret;
+       if (is_odp)
+               mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
+       if (is_odp_dma_buf)
+-              dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);
++              dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
++                            NULL);
++
++      ret = mlx5r_umr_revoke_mr(mr);
++
++      if (is_odp) {
++              if (!ret)
++                      to_ib_umem_odp(mr->umem)->private = NULL;
++              mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
++      }
++
++      if (is_odp_dma_buf) {
++              if (!ret)
++                      to_ib_umem_dmabuf(mr->umem)->private = NULL;
++              dma_resv_unlock(
++                      to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
++      }
+-      if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
++      return ret;
++}
++
++static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
++{
++      bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
++                            !to_ib_umem_dmabuf(mr->umem)->pinned;
++      struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
++      struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
++      bool is_odp = is_odp_mr(mr);
++      bool from_cache = !!ent;
++      int ret;
++
++      if (mr->mmkey.cacheable && !mlx5_umr_revoke_mr_with_lock(mr) &&
++          !cache_ent_find_and_store(dev, mr)) {
+               ent = mr->mmkey.cache_ent;
+               /* upon storing to a clean temp entry - schedule its cleanup */
+               spin_lock_irq(&ent->mkeys_queue.lock);
+@@ -2056,7 +2083,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+                       ent->tmp_cleanup_scheduled = true;
+               }
+               spin_unlock_irq(&ent->mkeys_queue.lock);
+-              goto out;
++              return 0;
+       }
+       if (ent) {
+@@ -2065,8 +2092,14 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+               mr->mmkey.cache_ent = NULL;
+               spin_unlock_irq(&ent->mkeys_queue.lock);
+       }
++
++      if (is_odp)
++              mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
++
++      if (is_odp_dma_buf)
++              dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
++                            NULL);
+       ret = destroy_mkey(dev, mr);
+-out:
+       if (is_odp) {
+               if (!ret)
+                       to_ib_umem_odp(mr->umem)->private = NULL;
+@@ -2076,9 +2109,9 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+       if (is_odp_dma_buf) {
+               if (!ret)
+                       to_ib_umem_dmabuf(mr->umem)->private = NULL;
+-              dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
++              dma_resv_unlock(
++                      to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
+       }
+-
+       return ret;
+ }
+@@ -2127,7 +2160,7 @@ static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+       }
+       /* Stop DMA */
+-      rc = mlx5_revoke_mr(mr);
++      rc = mlx5r_handle_mkey_cleanup(mr);
+       if (rc)
+               return rc;
+-- 
+2.39.5
+
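
Annotation: the shape of this fix generalizes. A GFP_KERNEL allocation can enter direct reclaim, and reclaim can fire the MMU notifiers, so no lock taken by mlx5_ib_invalidate_range() may be held across such an allocation. A hedged userspace sketch of the before/after locking shape, with pthread mutexes standing in for umem_mutex and rb_lock and malloc() for kzalloc() (single-threaded here, so the buggy variant does not actually deadlock):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t umem_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rb_lock = PTHREAD_MUTEX_INITIALIZER;

/* Deadlock-prone shape: the allocation nests under umem_mutex, and the
 * allocator (via reclaim) may itself need umem_mutex. */
static void dereg_mr_buggy(void)
{
	pthread_mutex_lock(&umem_mutex);
	/* revoke_mr() work ... */
	pthread_mutex_lock(&rb_lock);
	void *ent = malloc(64); /* kzalloc() under both locks */
	pthread_mutex_unlock(&rb_lock);
	pthread_mutex_unlock(&umem_mutex);
	free(ent);
}

/* Fixed shape: umem_mutex is dropped after the revoke step, before any
 * lock or allocation that can re-enter the reclaim path. */
static void dereg_mr_fixed(void)
{
	pthread_mutex_lock(&umem_mutex);
	/* revoke_mr() work ... */
	pthread_mutex_unlock(&umem_mutex);

	pthread_mutex_lock(&rb_lock);
	void *ent = malloc(64); /* no longer nests under umem_mutex */
	pthread_mutex_unlock(&rb_lock);
	free(ent);
}

int main(void)
{
	dereg_mr_buggy(); /* fine single-threaded; unsafe with reclaim */
	dereg_mr_fixed();
	return 0;
}

This is why the patch splits mlx5_revoke_mr() into mlx5_umr_revoke_mr_with_lock() and mlx5r_handle_mkey_cleanup(): the umem_mutex critical section ends before rb_lock and any allocation are reached.
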
diff --git a/queue-6.12/nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch b/queue-6.12/nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch
new file mode 100644
index 0000000..752932a
--- /dev/null
@@ -0,0 +1,250 @@
+From 44cf8d0200904f04abc0499e99bf8bceee19bcb1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 19 Jun 2025 15:16:11 -0400
+Subject: NFSv4/flexfiles: Fix handling of NFS level errors in I/O
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit 38074de35b015df5623f524d6f2b49a0cd395c40 ]
+
+Allow the flexfiles error handling to recognise NFS level errors (as
+opposed to RPC level errors) and handle them separately. The main
+motivator is the NFSERR_PERM errors that get returned if the NFS client
+connects to the data server through a port number that is higher than
+1023. In that case, the client should disconnect and retry a READ on a
+different data server, or it should retry a WRITE after reconnecting.
+
+Reviewed-by: Tigran Mkrtchyan <tigran.mkrtchyan@desy.de>
+Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver")
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/flexfilelayout/flexfilelayout.c | 121 ++++++++++++++++++-------
+ 1 file changed, 87 insertions(+), 34 deletions(-)
+
+diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
+index 8f7ea4076653d..bf96f7a8900c1 100644
+--- a/fs/nfs/flexfilelayout/flexfilelayout.c
++++ b/fs/nfs/flexfilelayout/flexfilelayout.c
+@@ -1104,6 +1104,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
+ }
+ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
++                                         u32 op_status,
+                                          struct nfs4_state *state,
+                                          struct nfs_client *clp,
+                                          struct pnfs_layout_segment *lseg,
+@@ -1114,32 +1115,42 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+       struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+       struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+-      switch (task->tk_status) {
+-      case -NFS4ERR_BADSESSION:
+-      case -NFS4ERR_BADSLOT:
+-      case -NFS4ERR_BAD_HIGH_SLOT:
+-      case -NFS4ERR_DEADSESSION:
+-      case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+-      case -NFS4ERR_SEQ_FALSE_RETRY:
+-      case -NFS4ERR_SEQ_MISORDERED:
++      switch (op_status) {
++      case NFS4_OK:
++      case NFS4ERR_NXIO:
++              break;
++      case NFSERR_PERM:
++              if (!task->tk_xprt)
++                      break;
++              xprt_force_disconnect(task->tk_xprt);
++              goto out_retry;
++      case NFS4ERR_BADSESSION:
++      case NFS4ERR_BADSLOT:
++      case NFS4ERR_BAD_HIGH_SLOT:
++      case NFS4ERR_DEADSESSION:
++      case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
++      case NFS4ERR_SEQ_FALSE_RETRY:
++      case NFS4ERR_SEQ_MISORDERED:
+               dprintk("%s ERROR %d, Reset session. Exchangeid "
+                       "flags 0x%x\n", __func__, task->tk_status,
+                       clp->cl_exchange_flags);
+               nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+-              break;
+-      case -NFS4ERR_DELAY:
+-      case -NFS4ERR_GRACE:
++              goto out_retry;
++      case NFS4ERR_DELAY:
++              nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
++              fallthrough;
++      case NFS4ERR_GRACE:
+               rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
+-              break;
+-      case -NFS4ERR_RETRY_UNCACHED_REP:
+-              break;
++              goto out_retry;
++      case NFS4ERR_RETRY_UNCACHED_REP:
++              goto out_retry;
+       /* Invalidate Layout errors */
+-      case -NFS4ERR_PNFS_NO_LAYOUT:
+-      case -ESTALE:           /* mapped NFS4ERR_STALE */
+-      case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
+-      case -EISDIR:           /* mapped NFS4ERR_ISDIR */
+-      case -NFS4ERR_FHEXPIRED:
+-      case -NFS4ERR_WRONG_TYPE:
++      case NFS4ERR_PNFS_NO_LAYOUT:
++      case NFS4ERR_STALE:
++      case NFS4ERR_BADHANDLE:
++      case NFS4ERR_ISDIR:
++      case NFS4ERR_FHEXPIRED:
++      case NFS4ERR_WRONG_TYPE:
+               dprintk("%s Invalid layout error %d\n", __func__,
+                       task->tk_status);
+               /*
+@@ -1152,6 +1163,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+               pnfs_destroy_layout(NFS_I(inode));
+               rpc_wake_up(&tbl->slot_tbl_waitq);
+               goto reset;
++      default:
++              break;
++      }
++
++      switch (task->tk_status) {
+       /* RPC connection errors */
+       case -ECONNREFUSED:
+       case -EHOSTDOWN:
+@@ -1167,26 +1183,56 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+               nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+                               &devid->deviceid);
+               rpc_wake_up(&tbl->slot_tbl_waitq);
+-              fallthrough;
++              break;
+       default:
+-              if (ff_layout_avoid_mds_available_ds(lseg))
+-                      return -NFS4ERR_RESET_TO_PNFS;
+-reset:
+-              dprintk("%s Retry through MDS. Error %d\n", __func__,
+-                      task->tk_status);
+-              return -NFS4ERR_RESET_TO_MDS;
++              break;
+       }
++
++      if (ff_layout_avoid_mds_available_ds(lseg))
++              return -NFS4ERR_RESET_TO_PNFS;
++reset:
++      dprintk("%s Retry through MDS. Error %d\n", __func__,
++              task->tk_status);
++      return -NFS4ERR_RESET_TO_MDS;
++
++out_retry:
+       task->tk_status = 0;
+       return -EAGAIN;
+ }
+ /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
+ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
++                                         u32 op_status,
++                                         struct nfs_client *clp,
+                                          struct pnfs_layout_segment *lseg,
+                                          u32 idx)
+ {
+       struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
++      switch (op_status) {
++      case NFS_OK:
++      case NFSERR_NXIO:
++              break;
++      case NFSERR_PERM:
++              if (!task->tk_xprt)
++                      break;
++              xprt_force_disconnect(task->tk_xprt);
++              goto out_retry;
++      case NFSERR_ACCES:
++      case NFSERR_BADHANDLE:
++      case NFSERR_FBIG:
++      case NFSERR_IO:
++      case NFSERR_NOSPC:
++      case NFSERR_ROFS:
++      case NFSERR_STALE:
++              goto out_reset_to_pnfs;
++      case NFSERR_JUKEBOX:
++              nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
++              goto out_retry;
++      default:
++              break;
++      }
++
+       switch (task->tk_status) {
+       /* File access problems. Don't mark the device as unavailable */
+       case -EACCES:
+@@ -1205,6 +1251,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+               nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+                               &devid->deviceid);
+       }
++out_reset_to_pnfs:
+       /* FIXME: Need to prevent infinite looping here. */
+       return -NFS4ERR_RESET_TO_PNFS;
+ out_retry:
+@@ -1215,6 +1262,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ }
+ static int ff_layout_async_handle_error(struct rpc_task *task,
++                                      u32 op_status,
+                                       struct nfs4_state *state,
+                                       struct nfs_client *clp,
+                                       struct pnfs_layout_segment *lseg,
+@@ -1233,10 +1281,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
+       switch (vers) {
+       case 3:
+-              return ff_layout_async_handle_error_v3(task, lseg, idx);
+-      case 4:
+-              return ff_layout_async_handle_error_v4(task, state, clp,
++              return ff_layout_async_handle_error_v3(task, op_status, clp,
+                                                      lseg, idx);
++      case 4:
++              return ff_layout_async_handle_error_v4(task, op_status, state,
++                                                     clp, lseg, idx);
+       default:
+               /* should never happen */
+               WARN_ON_ONCE(1);
+@@ -1289,6 +1338,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
+       switch (status) {
+       case NFS4ERR_DELAY:
+       case NFS4ERR_GRACE:
++      case NFS4ERR_PERM:
+               break;
+       case NFS4ERR_NXIO:
+               ff_layout_mark_ds_unreachable(lseg, idx);
+@@ -1321,7 +1371,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
+               trace_ff_layout_read_error(hdr);
+       }
+-      err = ff_layout_async_handle_error(task, hdr->args.context->state,
++      err = ff_layout_async_handle_error(task, hdr->res.op_status,
++                                         hdr->args.context->state,
+                                          hdr->ds_clp, hdr->lseg,
+                                          hdr->pgio_mirror_idx);
+@@ -1491,7 +1542,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
+               trace_ff_layout_write_error(hdr);
+       }
+-      err = ff_layout_async_handle_error(task, hdr->args.context->state,
++      err = ff_layout_async_handle_error(task, hdr->res.op_status,
++                                         hdr->args.context->state,
+                                          hdr->ds_clp, hdr->lseg,
+                                          hdr->pgio_mirror_idx);
+@@ -1537,8 +1589,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
+               trace_ff_layout_commit_error(data);
+       }
+-      err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
+-                                         data->lseg, data->ds_commit_index);
++      err = ff_layout_async_handle_error(task, data->res.op_status,
++                                         NULL, data->ds_clp, data->lseg,
++                                         data->ds_commit_index);
+       trace_nfs4_pnfs_commit_ds(data, err);
+       switch (err) {
+-- 
+2.39.5
+
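
Annotation: the restructured handler above dispatches on two distinct error spaces in sequence - first the NFS operation status (hdr->res.op_status), then the RPC transport status (task->tk_status) - each with its own retry and reset exits. A heavily simplified sketch of that two-level dispatch, with stand-in status codes (the real ones are NFS4ERR_ and -E constants):

#include <stdio.h>

enum action { ACT_RETRY, ACT_RESET_TO_PNFS, ACT_RESET_TO_MDS };

#define OP_OK        0
#define OP_PERM      1      /* like NFSERR_PERM */
#define OP_DELAY     2      /* like NFS4ERR_DELAY */
#define RPC_OK       0
#define RPC_CONNREF  (-111) /* like -ECONNREFUSED */

/* Classify the NFS operation status first, then fall through to the
 * RPC transport status, as ff_layout_async_handle_error_v4() now does. */
static enum action handle_error(int op_status, int rpc_status)
{
	switch (op_status) {
	case OP_OK:
		break;            /* NFS level clean; look at RPC level */
	case OP_PERM:
	case OP_DELAY:
		return ACT_RETRY; /* disconnect/delay, then retry the DS */
	default:
		break;
	}

	switch (rpc_status) {
	case RPC_CONNREF:
		return ACT_RESET_TO_PNFS; /* pick a different data server */
	default:
		break;
	}
	return ACT_RESET_TO_MDS; /* fall back to I/O through the MDS */
}

int main(void)
{
	printf("%d %d %d\n",
	       handle_error(OP_PERM, RPC_OK),
	       handle_error(OP_OK, RPC_CONNREF),
	       handle_error(OP_OK, RPC_OK));
	return 0;
}

Before the patch, NFS level errors were only visible as negated values mixed into task->tk_status; splitting the switches is what lets NFSERR_PERM get its own disconnect-and-retry path.
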
diff --git a/queue-6.12/rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch b/queue-6.12/rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch
new file mode 100644
index 0000000..384bdd7
--- /dev/null
@@ -0,0 +1,67 @@
+From bdc3aec857cebfd1c5f39e9e20f57533815bc9b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 13 Mar 2025 16:29:49 +0200
+Subject: RDMA/mlx5: Fix cache entry update on dereg error
+
+From: Michael Guralnik <michaelgur@nvidia.com>
+
+[ Upstream commit 24d693cf6c89d216a68634d44fa93e4400775d94 ]
+
+Fix double decrement of 'in_use' counter on push_mkey_locked() failure
+while deregistering an MR.
+If we fail to return an mkey to the cache in cache_ent_find_and_store()
+it'll update the 'in_use' counter. Its caller, revoke_mr(), also updates
+it, resulting in a double decrement.
+
+A wrong 'in_use' value will be exposed through debugfs and can also
+cause incorrect resizing of the cache when users set the cache entry
+size through the 'size' debugfs file.
+
+To address this issue, the 'in_use' counter is now decremented within
+mlx5_revoke_mr() also after a successful call to
+cache_ent_find_and_store() and not within cache_ent_find_and_store().
+Other success or failure flows remains unchanged where it was also
+decremented.
+
+Fixes: 8c1185fef68c ("RDMA/mlx5: Change check for cacheable mkeys")
+Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
+Reviewed-by: Yishai Hadas <yishaih@nvidia.com>
+Link: https://patch.msgid.link/97e979dff636f232ff4c83ce709c17c727da1fdb.1741875692.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mr.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index 068eac3bdb50b..830a15b66c120 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -1968,7 +1968,6 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
+       if (mr->mmkey.cache_ent) {
+               spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
+-              mr->mmkey.cache_ent->in_use--;
+               goto end;
+       }
+@@ -2036,6 +2035,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+       bool is_odp = is_odp_mr(mr);
+       bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
+                       !to_ib_umem_dmabuf(mr->umem)->pinned;
++      bool from_cache = !!ent;
+       int ret = 0;
+       if (is_odp)
+@@ -2048,6 +2048,8 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+               ent = mr->mmkey.cache_ent;
+               /* upon storing to a clean temp entry - schedule its cleanup */
+               spin_lock_irq(&ent->mkeys_queue.lock);
++              if (from_cache)
++                      ent->in_use--;
+               if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
+                       mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
+                                        msecs_to_jiffies(30 * 1000));
+-- 
+2.39.5
+
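
Annotation: the underlying bug is a classic double decrement - both cache_ent_find_and_store() and its caller adjusted 'in_use' - and the fix makes the caller the sole owner of the decrement on every path. A minimal sketch of that ownership rule (all names hypothetical):

#include <stdio.h>

struct ent { int in_use; };

/* Callee: only stores the key; it no longer touches in_use. */
static int find_and_store(struct ent *e, int ok)
{
	return ok ? 0 : -1; /* may fail; counter untouched either way */
}

/* Caller: single owner of the in_use decrement, success or failure. */
static void revoke(struct ent *e, int from_cache, int store_ok)
{
	if (find_and_store(e, store_ok) == 0) {
		if (from_cache)
			e->in_use--; /* success path: exactly once, here */
		return;
	}
	if (from_cache)
		e->in_use--; /* failure path: still exactly once */
}

int main(void)
{
	struct ent e = { .in_use = 2 };

	revoke(&e, 1, 1);
	revoke(&e, 1, 0);
	printf("in_use = %d\n", e.in_use); /* 0, never negative */
	return 0;
}

The from_cache flag captured before the store mirrors the patch: whether the mkey came from the cache must be sampled before cache_ent_find_and_store() can reassign mr->mmkey.cache_ent.
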
diff --git a/queue-6.12/series b/queue-6.12/series
index 17d05c90c0b45524ff89544f2c7ec681ff1cec7e..937ff7a2e5d1fcf1d3284b0d0709cb2781a5e1da 100644
@@ -192,3 +192,8 @@ firmware-arm_ffa-replace-mutex-with-rwlock-to-avoid-.patch
 add-a-string-to-qstr-constructor.patch
 module-provide-export_symbol_gpl_for_modules-helper.patch
 fs-export-anon_inode_make_secure_inode-and-fix-secre.patch
+rdma-mlx5-fix-cache-entry-update-on-dereg-error.patch
+ib-mlx5-fix-potential-deadlock-in-mr-deregistration.patch
+drm-xe-bmg-update-wa_22019338487.patch
+drm-xe-allow-dropping-kunit-dependency-as-built-in.patch
+nfsv4-flexfiles-fix-handling-of-nfs-level-errors-in-.patch