--- /dev/null
+From 79cad3705d28ff0c133bcd85a9107d0dbbb27e72 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Sep 2023 13:07:40 +0300
+Subject: IB/core: Add support for XDR link speed
+
+From: Or Har-Toov <ohartoov@nvidia.com>
+
+[ Upstream commit 703289ce43f740b0096724300107df82d008552f ]
+
+Add the new IBTA speed XDR, the rate that was added to the InfiniBand
+spec as part of XDR, supporting a signaling rate of 200Gb/s.
+
+In order to report that value to rdma-core, add a new u32 field to the
+query_port response.
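+
+As context for the uAPI change below, a minimal user-space style sketch of
+how the new field could be consumed (illustrative only; the helper name is
+hypothetical, the struct layout is the one added by this patch):
+
+    #include <stdint.h>
+    #include <rdma/ib_user_verbs.h>
+    #include <rdma/ib_user_ioctl_verbs.h>
+
+    /* Prefer the extended 32-bit speed once the kernel fills it in; it can
+     * carry values beyond the legacy 8-bit field, e.g. XDR.
+     */
+    static uint32_t
+    get_active_speed(const struct ib_uverbs_query_port_resp_ex *resp)
+    {
+        if (resp->active_speed_ex)
+            return resp->active_speed_ex;
+        return resp->legacy_resp.active_speed;
+    }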
+
+Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
+Reviewed-by: Mark Zhang <markzhang@nvidia.com>
+Link: https://lore.kernel.org/r/9d235fc600a999e8274010f0e18b40fa60540e6c.1695204156.git.leon@kernel.org
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/core/sysfs.c | 4 ++++
+ drivers/infiniband/core/uverbs_std_types_device.c | 3 ++-
+ drivers/infiniband/core/verbs.c | 3 +++
+ include/rdma/ib_verbs.h | 2 ++
+ include/uapi/rdma/ib_user_ioctl_verbs.h | 3 ++-
+ 5 files changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
+index ec5efdc166601..9f97bef021497 100644
+--- a/drivers/infiniband/core/sysfs.c
++++ b/drivers/infiniband/core/sysfs.c
+@@ -342,6 +342,10 @@ static ssize_t rate_show(struct ib_device *ibdev, u32 port_num,
+ speed = " NDR";
+ rate = 1000;
+ break;
++ case IB_SPEED_XDR:
++ speed = " XDR";
++ rate = 2000;
++ break;
+ case IB_SPEED_SDR:
+ default: /* default to SDR for invalid rates */
+ speed = " SDR";
+diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
+index 049684880ae03..fb0555647336f 100644
+--- a/drivers/infiniband/core/uverbs_std_types_device.c
++++ b/drivers/infiniband/core/uverbs_std_types_device.c
+@@ -203,6 +203,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)(
+
+ copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num);
+ resp.port_cap_flags2 = attr.port_cap_flags2;
++ resp.active_speed_ex = attr.active_speed;
+
+ return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP,
+ &resp, sizeof(resp));
+@@ -461,7 +462,7 @@ DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_ATTR_PTR_OUT(
+ UVERBS_ATTR_QUERY_PORT_RESP,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex,
+- reserved),
++ active_speed_ex),
+ UA_MANDATORY));
+
+ DECLARE_UVERBS_NAMED_METHOD(
+diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
+index b99b3cc283b65..90848546f1704 100644
+--- a/drivers/infiniband/core/verbs.c
++++ b/drivers/infiniband/core/verbs.c
+@@ -147,6 +147,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
+ case IB_RATE_50_GBPS: return 20;
+ case IB_RATE_400_GBPS: return 160;
+ case IB_RATE_600_GBPS: return 240;
++ case IB_RATE_800_GBPS: return 320;
+ default: return -1;
+ }
+ }
+@@ -176,6 +177,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
+ case 20: return IB_RATE_50_GBPS;
+ case 160: return IB_RATE_400_GBPS;
+ case 240: return IB_RATE_600_GBPS;
++ case 320: return IB_RATE_800_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+ }
+@@ -205,6 +207,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
+ case IB_RATE_50_GBPS: return 53125;
+ case IB_RATE_400_GBPS: return 425000;
+ case IB_RATE_600_GBPS: return 637500;
++ case IB_RATE_800_GBPS: return 850000;
+ default: return -1;
+ }
+ }
+diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
+index 68fd6d22adfd4..750effb875783 100644
+--- a/include/rdma/ib_verbs.h
++++ b/include/rdma/ib_verbs.h
+@@ -557,6 +557,7 @@ enum ib_port_speed {
+ IB_SPEED_EDR = 32,
+ IB_SPEED_HDR = 64,
+ IB_SPEED_NDR = 128,
++ IB_SPEED_XDR = 256,
+ };
+
+ enum ib_stat_flag {
+@@ -836,6 +837,7 @@ enum ib_rate {
+ IB_RATE_50_GBPS = 20,
+ IB_RATE_400_GBPS = 21,
+ IB_RATE_600_GBPS = 22,
++ IB_RATE_800_GBPS = 23,
+ };
+
+ /**
+diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h
+index 7dd56210226f5..125fb9f0ef4ab 100644
+--- a/include/uapi/rdma/ib_user_ioctl_verbs.h
++++ b/include/uapi/rdma/ib_user_ioctl_verbs.h
+@@ -218,7 +218,8 @@ enum ib_uverbs_advise_mr_flag {
+ struct ib_uverbs_query_port_resp_ex {
+ struct ib_uverbs_query_port_resp legacy_resp;
+ __u16 port_cap_flags2;
+- __u8 reserved[6];
++ __u8 reserved[2];
++ __u32 active_speed_ex;
+ };
+
+ struct ib_uverbs_qp_cap {
+--
+2.39.5
+
--- /dev/null
+From 608ee99426cce23b021b11aac5d6732400828ac0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 19 Jan 2025 14:39:46 +0200
+Subject: IB/mlx5: Set and get correct qp_num for a DCT QP
+
+From: Mark Zhang <markzhang@nvidia.com>
+
+[ Upstream commit 12d044770e12c4205fa69535b4fa8a9981fea98f ]
+
+When a DCT QP is created on an active LAG, its dctc.port is assigned
+in a round-robin way, ranging from 1 to dev->lag_port. In this case,
+when querying this QP, we may get qp_attr.port_num > 2.
+Fix this by setting qp->port when modifying a DCT QP, and by reading
+port_num from qp->port instead of dctc.port when querying it.
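+
+In outline, the intended split (a simplified sketch with hypothetical
+names, not the driver structures): the user-visible port is the one set at
+modify time, while the round-robin LAG port stays internal.
+
+    #include <stdint.h>
+
+    struct dct_qp {
+        uint8_t user_port; /* from ib_qp_attr.port_num at modify time */
+        uint8_t lag_port;  /* driver-chosen LAG port, internal only */
+    };
+
+    static uint8_t dct_query_port_num(const struct dct_qp *qp)
+    {
+        return qp->user_port; /* previously the LAG port leaked out here */
+    }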
+
+Fixes: 7c4b1ab9f167 ("IB/mlx5: Add DCT RoCE LAG support")
+Signed-off-by: Mark Zhang <markzhang@nvidia.com>
+Reviewed-by: Maher Sanalla <msanalla@nvidia.com>
+Link: https://patch.msgid.link/94c76bf0adbea997f87ffa27674e0a7118ad92a9.1737290358.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/qp.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
+index 8d132b726c64b..d782a494abcda 100644
+--- a/drivers/infiniband/hw/mlx5/qp.c
++++ b/drivers/infiniband/hw/mlx5/qp.c
+@@ -4466,6 +4466,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+
+ set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1);
+ MLX5_SET(dctc, dctc, counter_set_id, set_id);
++
++ qp->port = attr->port_num;
+ } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+ struct mlx5_ib_modify_qp_resp resp = {};
+ u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {};
+@@ -4955,7 +4957,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp,
+ }
+
+ if (qp_attr_mask & IB_QP_PORT)
+- qp_attr->port_num = MLX5_GET(dctc, dctc, port);
++ qp_attr->port_num = mqp->port;
+ if (qp_attr_mask & IB_QP_MIN_RNR_TIMER)
+ qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak);
+ if (qp_attr_mask & IB_QP_AV) {
+--
+2.39.5
+
--- /dev/null
+From a760d15221ea7d4ac13c92a7e6f47314bd8cb2d3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 15 Feb 2025 00:51:48 +0300
+Subject: ovl: fix UAF in ovl_dentry_update_reval by moving dput() in
+ ovl_link_up
+
+From: Vasiliy Kovalev <kovalev@altlinux.org>
+
+[ Upstream commit c84e125fff2615b4d9c259e762596134eddd2f27 ]
+
+The issue was caused by dput(upper) being called before
+ovl_dentry_update_reval(), while upper->d_flags was still
+accessed in ovl_dentry_remote().
+
+Move dput(upper) after its last use to prevent use-after-free.
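+
+The general rule being restored, as a small self-contained sketch (generic
+names, not overlayfs code): a reference may only be dropped after the last
+access through it.
+
+    #include <stdlib.h>
+
+    struct obj { int flags; int refcount; };
+
+    static void put_obj(struct obj *o)
+    {
+        if (--o->refcount == 0)
+            free(o);           /* object may vanish here */
+    }
+
+    static void update_flags_user(struct obj *o)
+    {
+        (void)o->flags;        /* reads through the pointer */
+    }
+
+    static void fixed_order(struct obj *o)
+    {
+        update_flags_user(o);  /* last use ...             */
+        put_obj(o);            /* ... then drop the ref    */
+    }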
+
+BUG: KASAN: slab-use-after-free in ovl_dentry_remote fs/overlayfs/util.c:162 [inline]
+BUG: KASAN: slab-use-after-free in ovl_dentry_update_reval+0xd2/0xf0 fs/overlayfs/util.c:167
+
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:88 [inline]
+ dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:114
+ print_address_description mm/kasan/report.c:377 [inline]
+ print_report+0xc3/0x620 mm/kasan/report.c:488
+ kasan_report+0xd9/0x110 mm/kasan/report.c:601
+ ovl_dentry_remote fs/overlayfs/util.c:162 [inline]
+ ovl_dentry_update_reval+0xd2/0xf0 fs/overlayfs/util.c:167
+ ovl_link_up fs/overlayfs/copy_up.c:610 [inline]
+ ovl_copy_up_one+0x2105/0x3490 fs/overlayfs/copy_up.c:1170
+ ovl_copy_up_flags+0x18d/0x200 fs/overlayfs/copy_up.c:1223
+ ovl_rename+0x39e/0x18c0 fs/overlayfs/dir.c:1136
+ vfs_rename+0xf84/0x20a0 fs/namei.c:4893
+...
+ </TASK>
+
+Fixes: b07d5cc93e1b ("ovl: update of dentry revalidate flags after copy up")
+Reported-by: syzbot+316db8a1191938280eb6@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=316db8a1191938280eb6
+Signed-off-by: Vasiliy Kovalev <kovalev@altlinux.org>
+Link: https://lore.kernel.org/r/20250214215148.761147-1-kovalev@altlinux.org
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/overlayfs/copy_up.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
+index 86d4b6975dbcb..203b88293f6bb 100644
+--- a/fs/overlayfs/copy_up.c
++++ b/fs/overlayfs/copy_up.c
+@@ -532,7 +532,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
+ err = PTR_ERR(upper);
+ if (!IS_ERR(upper)) {
+ err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper);
+- dput(upper);
+
+ if (!err) {
+ /* Restore timestamps on parent (best effort) */
+@@ -540,6 +539,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
+ ovl_dentry_set_upper_alias(c->dentry);
+ ovl_dentry_update_reval(c->dentry, upper);
+ }
++ dput(upper);
+ }
+ inode_unlock(udir);
+ if (err)
+--
+2.39.5
+
--- /dev/null
+From d5eccf1fd4fbdb90e3f1aba4e5ba5928ea3163c2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 Jan 2023 11:43:34 +0200
+Subject: RDMA/mlx: Calling qp event handler in workqueue context
+
+From: Mark Zhang <markzhang@nvidia.com>
+
+[ Upstream commit 312b8f79eb05479628ee71357749815b2eeeeea8 ]
+
+Move the call of the QP event handler from atomic to workqueue context,
+so that the handler is able to block. This is needed by the following
+patches.
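+
+The deferral pattern introduced here, as a minimal generic sketch
+(hypothetical names; the real handlers also translate the event type and
+manage QP reference counts):
+
+    #include <linux/workqueue.h>
+    #include <linux/slab.h>
+
+    struct qp_event_work {
+        struct work_struct work;
+        int type;
+    };
+
+    static struct workqueue_struct *qp_event_wq;
+
+    static void qp_event_work_fn(struct work_struct *w)
+    {
+        struct qp_event_work *ew =
+            container_of(w, struct qp_event_work, work);
+
+        /* Runs in process context, so it may block. */
+        kfree(ew);
+    }
+
+    /* Called from atomic context: only allocate and queue. */
+    static void qp_event_notify(int type)
+    {
+        struct qp_event_work *ew = kzalloc(sizeof(*ew), GFP_ATOMIC);
+
+        if (!ew)
+            return;
+        ew->type = type;
+        INIT_WORK(&ew->work, qp_event_work_fn);
+        queue_work(qp_event_wq, &ew->work);
+    }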
+
+Signed-off-by: Mark Zhang <markzhang@nvidia.com>
+Reviewed-by: Patrisious Haddad <phaddad@nvidia.com>
+Link: https://lore.kernel.org/r/0cd17b8331e445f03942f4bb28d447f24ac5669d.1672821186.git.leonro@nvidia.com
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx4/main.c | 8 ++
+ drivers/infiniband/hw/mlx4/mlx4_ib.h | 3 +
+ drivers/infiniband/hw/mlx4/qp.c | 121 +++++++++++++++++-------
+ drivers/infiniband/hw/mlx5/main.c | 7 ++
+ drivers/infiniband/hw/mlx5/qp.c | 119 ++++++++++++++++-------
+ drivers/infiniband/hw/mlx5/qp.h | 2 +
+ drivers/infiniband/hw/mlx5/qpc.c | 3 +-
+ drivers/net/ethernet/mellanox/mlx4/qp.c | 14 ++-
+ include/linux/mlx4/qp.h | 1 +
+ include/rdma/ib_verbs.h | 2 +-
+ 10 files changed, 202 insertions(+), 78 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
+index 7c3dc86ab7f04..0f0b130cc8aac 100644
+--- a/drivers/infiniband/hw/mlx4/main.c
++++ b/drivers/infiniband/hw/mlx4/main.c
+@@ -3307,6 +3307,10 @@ static int __init mlx4_ib_init(void)
+ if (!wq)
+ return -ENOMEM;
+
++ err = mlx4_ib_qp_event_init();
++ if (err)
++ goto clean_qp_event;
++
+ err = mlx4_ib_cm_init();
+ if (err)
+ goto clean_wq;
+@@ -3328,6 +3332,9 @@ static int __init mlx4_ib_init(void)
+ mlx4_ib_cm_destroy();
+
+ clean_wq:
++ mlx4_ib_qp_event_cleanup();
++
++clean_qp_event:
+ destroy_workqueue(wq);
+ return err;
+ }
+@@ -3337,6 +3344,7 @@ static void __exit mlx4_ib_cleanup(void)
+ mlx4_unregister_interface(&mlx4_ib_interface);
+ mlx4_ib_mcg_destroy();
+ mlx4_ib_cm_destroy();
++ mlx4_ib_qp_event_cleanup();
+ destroy_workqueue(wq);
+ }
+
+diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
+index 6a3b0f121045e..17fee1e73a45a 100644
+--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
++++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
+@@ -940,4 +940,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
+ int mlx4_ib_cm_init(void);
+ void mlx4_ib_cm_destroy(void);
+
++int mlx4_ib_qp_event_init(void);
++void mlx4_ib_qp_event_cleanup(void);
++
+ #endif /* MLX4_IB_H */
+diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
+index ac479e81ddee8..9d08aa99f3cb0 100644
+--- a/drivers/infiniband/hw/mlx4/qp.c
++++ b/drivers/infiniband/hw/mlx4/qp.c
+@@ -102,6 +102,14 @@ enum mlx4_ib_source_type {
+ MLX4_IB_RWQ_SRC = 1,
+ };
+
++struct mlx4_ib_qp_event_work {
++ struct work_struct work;
++ struct mlx4_qp *qp;
++ enum mlx4_event type;
++};
++
++static struct workqueue_struct *mlx4_ib_qp_event_wq;
++
+ static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+ {
+ if (!mlx4_is_master(dev->dev))
+@@ -200,50 +208,77 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+ }
+ }
+
++static void mlx4_ib_handle_qp_event(struct work_struct *_work)
++{
++ struct mlx4_ib_qp_event_work *qpe_work =
++ container_of(_work, struct mlx4_ib_qp_event_work, work);
++ struct ib_qp *ibqp = &to_mibqp(qpe_work->qp)->ibqp;
++ struct ib_event event = {};
++
++ event.device = ibqp->device;
++ event.element.qp = ibqp;
++
++ switch (qpe_work->type) {
++ case MLX4_EVENT_TYPE_PATH_MIG:
++ event.event = IB_EVENT_PATH_MIG;
++ break;
++ case MLX4_EVENT_TYPE_COMM_EST:
++ event.event = IB_EVENT_COMM_EST;
++ break;
++ case MLX4_EVENT_TYPE_SQ_DRAINED:
++ event.event = IB_EVENT_SQ_DRAINED;
++ break;
++ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
++ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
++ break;
++ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
++ event.event = IB_EVENT_QP_FATAL;
++ break;
++ case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
++ event.event = IB_EVENT_PATH_MIG_ERR;
++ break;
++ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
++ event.event = IB_EVENT_QP_REQ_ERR;
++ break;
++ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
++ event.event = IB_EVENT_QP_ACCESS_ERR;
++ break;
++ default:
++ pr_warn("Unexpected event type %d on QP %06x\n",
++ qpe_work->type, qpe_work->qp->qpn);
++ goto out;
++ }
++
++ ibqp->event_handler(&event, ibqp->qp_context);
++
++out:
++ mlx4_put_qp(qpe_work->qp);
++ kfree(qpe_work);
++}
++
+ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+ {
+- struct ib_event event;
+ struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
++ struct mlx4_ib_qp_event_work *qpe_work;
+
+ if (type == MLX4_EVENT_TYPE_PATH_MIG)
+ to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+- if (ibqp->event_handler) {
+- event.device = ibqp->device;
+- event.element.qp = ibqp;
+- switch (type) {
+- case MLX4_EVENT_TYPE_PATH_MIG:
+- event.event = IB_EVENT_PATH_MIG;
+- break;
+- case MLX4_EVENT_TYPE_COMM_EST:
+- event.event = IB_EVENT_COMM_EST;
+- break;
+- case MLX4_EVENT_TYPE_SQ_DRAINED:
+- event.event = IB_EVENT_SQ_DRAINED;
+- break;
+- case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+- event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+- break;
+- case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+- event.event = IB_EVENT_QP_FATAL;
+- break;
+- case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+- event.event = IB_EVENT_PATH_MIG_ERR;
+- break;
+- case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+- event.event = IB_EVENT_QP_REQ_ERR;
+- break;
+- case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+- event.event = IB_EVENT_QP_ACCESS_ERR;
+- break;
+- default:
+- pr_warn("Unexpected event type %d "
+- "on QP %06x\n", type, qp->qpn);
+- return;
+- }
++ if (!ibqp->event_handler)
++ goto out_no_handler;
+
+- ibqp->event_handler(&event, ibqp->qp_context);
+- }
++ qpe_work = kzalloc(sizeof(*qpe_work), GFP_ATOMIC);
++ if (!qpe_work)
++ goto out_no_handler;
++
++ qpe_work->qp = qp;
++ qpe_work->type = type;
++ INIT_WORK(&qpe_work->work, mlx4_ib_handle_qp_event);
++ queue_work(mlx4_ib_qp_event_wq, &qpe_work->work);
++ return;
++
++out_no_handler:
++ mlx4_put_qp(qp);
+ }
+
+ static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type)
+@@ -4472,3 +4507,17 @@ void mlx4_ib_drain_rq(struct ib_qp *qp)
+
+ handle_drain_completion(cq, &rdrain, dev);
+ }
++
++int mlx4_ib_qp_event_init(void)
++{
++ mlx4_ib_qp_event_wq = alloc_ordered_workqueue("mlx4_ib_qp_event_wq", 0);
++ if (!mlx4_ib_qp_event_wq)
++ return -ENOMEM;
++
++ return 0;
++}
++
++void mlx4_ib_qp_event_cleanup(void)
++{
++ destroy_workqueue(mlx4_ib_qp_event_wq);
++}
+diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
+index 45a414e8d35fa..a22649617e017 100644
+--- a/drivers/infiniband/hw/mlx5/main.c
++++ b/drivers/infiniband/hw/mlx5/main.c
+@@ -4410,6 +4410,10 @@ static int __init mlx5_ib_init(void)
+ return -ENOMEM;
+ }
+
++ ret = mlx5_ib_qp_event_init();
++ if (ret)
++ goto qp_event_err;
++
+ mlx5_ib_odp_init();
+ ret = mlx5r_rep_init();
+ if (ret)
+@@ -4427,6 +4431,8 @@ static int __init mlx5_ib_init(void)
+ mp_err:
+ mlx5r_rep_cleanup();
+ rep_err:
++ mlx5_ib_qp_event_cleanup();
++qp_event_err:
+ destroy_workqueue(mlx5_ib_event_wq);
+ free_page((unsigned long)xlt_emergency_page);
+ return ret;
+@@ -4438,6 +4444,7 @@ static void __exit mlx5_ib_cleanup(void)
+ auxiliary_driver_unregister(&mlx5r_mp_driver);
+ mlx5r_rep_cleanup();
+
++ mlx5_ib_qp_event_cleanup();
+ destroy_workqueue(mlx5_ib_event_wq);
+ free_page((unsigned long)xlt_emergency_page);
+ }
+diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
+index d782a494abcda..43c0123babd10 100644
+--- a/drivers/infiniband/hw/mlx5/qp.c
++++ b/drivers/infiniband/hw/mlx5/qp.c
+@@ -71,6 +71,14 @@ struct mlx5_modify_raw_qp_param {
+ u32 port;
+ };
+
++struct mlx5_ib_qp_event_work {
++ struct work_struct work;
++ struct mlx5_core_qp *qp;
++ int type;
++};
++
++static struct workqueue_struct *mlx5_ib_qp_event_wq;
++
+ static void get_cqs(enum ib_qp_type qp_type,
+ struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
+ struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq);
+@@ -302,51 +310,78 @@ int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
+ return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc);
+ }
+
++static void mlx5_ib_handle_qp_event(struct work_struct *_work)
++{
++ struct mlx5_ib_qp_event_work *qpe_work =
++ container_of(_work, struct mlx5_ib_qp_event_work, work);
++ struct ib_qp *ibqp = &to_mibqp(qpe_work->qp)->ibqp;
++ struct ib_event event = {};
++
++ event.device = ibqp->device;
++ event.element.qp = ibqp;
++ switch (qpe_work->type) {
++ case MLX5_EVENT_TYPE_PATH_MIG:
++ event.event = IB_EVENT_PATH_MIG;
++ break;
++ case MLX5_EVENT_TYPE_COMM_EST:
++ event.event = IB_EVENT_COMM_EST;
++ break;
++ case MLX5_EVENT_TYPE_SQ_DRAINED:
++ event.event = IB_EVENT_SQ_DRAINED;
++ break;
++ case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
++ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
++ break;
++ case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
++ event.event = IB_EVENT_QP_FATAL;
++ break;
++ case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
++ event.event = IB_EVENT_PATH_MIG_ERR;
++ break;
++ case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
++ event.event = IB_EVENT_QP_REQ_ERR;
++ break;
++ case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
++ event.event = IB_EVENT_QP_ACCESS_ERR;
++ break;
++ default:
++ pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n",
++ qpe_work->type, qpe_work->qp->qpn);
++ goto out;
++ }
++
++ ibqp->event_handler(&event, ibqp->qp_context);
++
++out:
++ mlx5_core_res_put(&qpe_work->qp->common);
++ kfree(qpe_work);
++}
++
+ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
+ {
+ struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+- struct ib_event event;
++ struct mlx5_ib_qp_event_work *qpe_work;
+
+ if (type == MLX5_EVENT_TYPE_PATH_MIG) {
+ /* This event is only valid for trans_qps */
+ to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port;
+ }
+
+- if (ibqp->event_handler) {
+- event.device = ibqp->device;
+- event.element.qp = ibqp;
+- switch (type) {
+- case MLX5_EVENT_TYPE_PATH_MIG:
+- event.event = IB_EVENT_PATH_MIG;
+- break;
+- case MLX5_EVENT_TYPE_COMM_EST:
+- event.event = IB_EVENT_COMM_EST;
+- break;
+- case MLX5_EVENT_TYPE_SQ_DRAINED:
+- event.event = IB_EVENT_SQ_DRAINED;
+- break;
+- case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+- event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+- break;
+- case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+- event.event = IB_EVENT_QP_FATAL;
+- break;
+- case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+- event.event = IB_EVENT_PATH_MIG_ERR;
+- break;
+- case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+- event.event = IB_EVENT_QP_REQ_ERR;
+- break;
+- case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+- event.event = IB_EVENT_QP_ACCESS_ERR;
+- break;
+- default:
+- pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn);
+- return;
+- }
++ if (!ibqp->event_handler)
++ goto out_no_handler;
+
+- ibqp->event_handler(&event, ibqp->qp_context);
+- }
++ qpe_work = kzalloc(sizeof(*qpe_work), GFP_ATOMIC);
++ if (!qpe_work)
++ goto out_no_handler;
++
++ qpe_work->qp = qp;
++ qpe_work->type = type;
++ INIT_WORK(&qpe_work->work, mlx5_ib_handle_qp_event);
++ queue_work(mlx5_ib_qp_event_wq, &qpe_work->work);
++ return;
++
++out_no_handler:
++ mlx5_core_res_put(&qp->common);
+ }
+
+ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
+@@ -5752,3 +5787,17 @@ int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
+ mutex_unlock(&mqp->mutex);
+ return err;
+ }
++
++int mlx5_ib_qp_event_init(void)
++{
++ mlx5_ib_qp_event_wq = alloc_ordered_workqueue("mlx5_ib_qp_event_wq", 0);
++ if (!mlx5_ib_qp_event_wq)
++ return -ENOMEM;
++
++ return 0;
++}
++
++void mlx5_ib_qp_event_cleanup(void)
++{
++ destroy_workqueue(mlx5_ib_qp_event_wq);
++}
+diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
+index 5d4e140db99ce..fb2f4e030bb8f 100644
+--- a/drivers/infiniband/hw/mlx5/qp.h
++++ b/drivers/infiniband/hw/mlx5/qp.h
+@@ -44,4 +44,6 @@ void mlx5_core_res_put(struct mlx5_core_rsc_common *res);
+ int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn);
+ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn);
+ int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
++int mlx5_ib_qp_event_init(void);
++void mlx5_ib_qp_event_cleanup(void);
+ #endif /* _MLX5_IB_QP_H */
+diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
+index d4e7864c56f18..a824ff22f4615 100644
+--- a/drivers/infiniband/hw/mlx5/qpc.c
++++ b/drivers/infiniband/hw/mlx5/qpc.c
+@@ -135,7 +135,8 @@ static int rsc_event_notifier(struct notifier_block *nb,
+ case MLX5_RES_SQ:
+ qp = (struct mlx5_core_qp *)common;
+ qp->event(qp, event_type);
+- break;
++ /* Need to put resource in event handler */
++ return NOTIFY_OK;
+ case MLX5_RES_DCT:
+ dct = (struct mlx5_core_dct *)common;
+ if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED)
+diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
+index 48cfaa7eaf50c..913ed255990f4 100644
+--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
++++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
+@@ -46,6 +46,13 @@
+ #define MLX4_BF_QP_SKIP_MASK 0xc0
+ #define MLX4_MAX_BF_QP_RANGE 0x40
+
++void mlx4_put_qp(struct mlx4_qp *qp)
++{
++ if (refcount_dec_and_test(&qp->refcount))
++ complete(&qp->free);
++}
++EXPORT_SYMBOL_GPL(mlx4_put_qp);
++
+ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
+ {
+ struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+@@ -64,10 +71,8 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
+ return;
+ }
+
++ /* Need to call mlx4_put_qp() in event handler */
+ qp->event(qp, event_type);
+-
+- if (refcount_dec_and_test(&qp->refcount))
+- complete(&qp->free);
+ }
+
+ /* used for INIT/CLOSE port logic */
+@@ -523,8 +528,7 @@ EXPORT_SYMBOL_GPL(mlx4_qp_remove);
+
+ void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
+ {
+- if (refcount_dec_and_test(&qp->refcount))
+- complete(&qp->free);
++ mlx4_put_qp(qp);
+ wait_for_completion(&qp->free);
+
+ mlx4_qp_free_icm(dev, qp->qpn);
+diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
+index b6b626157b03a..b9a7b1319f5d3 100644
+--- a/include/linux/mlx4/qp.h
++++ b/include/linux/mlx4/qp.h
+@@ -504,4 +504,5 @@ static inline u16 folded_qp(u32 q)
+
+ u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn);
+
++void mlx4_put_qp(struct mlx4_qp *qp);
+ #endif /* MLX4_QP_H */
+diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
+index 5582509003264..68fd6d22adfd4 100644
+--- a/include/rdma/ib_verbs.h
++++ b/include/rdma/ib_verbs.h
+@@ -1162,7 +1162,7 @@ enum ib_qp_create_flags {
+ */
+
+ struct ib_qp_init_attr {
+- /* Consumer's event_handler callback must not block */
++ /* This callback occurs in workqueue context */
+ void (*event_handler)(struct ib_event *, void *);
+
+ void *qp_context;
+--
+2.39.5
+
--- /dev/null
+From be147ad5b5dbf2b210768ce67d652ae3e1d6ddf1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Jan 2023 00:28:07 +0200
+Subject: RDMA/mlx5: Add work to remove temporary entries from the cache
+
+From: Michael Guralnik <michaelgur@nvidia.com>
+
+[ Upstream commit 627122280c878cf5d3cda2d2c5a0a8f6a7e35cb7 ]
+
+The non-cache mkeys are stored in the cache only to shorten application
+restart time. Don't store them longer than needed.
+
+Configure cache entries that store non-cache MRs as temporary entries. If
+30 seconds have passed and no user has reclaimed the temporarily cached
+mkeys, an asynchronous work will destroy the mkey entries.
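+
+The re-arming of that cleanup, as a minimal sketch (hypothetical names;
+the timeout value matches the one used in the hunks below):
+
+    #include <linux/workqueue.h>
+    #include <linux/jiffies.h>
+
+    static struct workqueue_struct *cache_wq;
+    static struct delayed_work remove_tmp_dwork;
+
+    /* Push the cleanup deadline out every time a temporary entry is
+     * created; the work then frees entries still marked temporary.
+     */
+    static void arm_tmp_cleanup(void)
+    {
+        mod_delayed_work(cache_wq, &remove_tmp_dwork,
+                         msecs_to_jiffies(30 * 1000));
+    }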
+
+Link: https://lore.kernel.org/r/20230125222807.6921-7-michaelgur@nvidia.com
+Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 9 ++-
+ drivers/infiniband/hw/mlx5/mr.c | 94 ++++++++++++++++++++++------
+ drivers/infiniband/hw/mlx5/odp.c | 2 +-
+ 3 files changed, 82 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index f345e2ae394d2..7c72e0e9db54a 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -770,6 +770,7 @@ struct mlx5_cache_ent {
+ struct rb_node node;
+ struct mlx5r_cache_rb_key rb_key;
+
++ u8 is_tmp:1;
+ u8 disabled:1;
+ u8 fill_to_high_water:1;
+
+@@ -803,6 +804,7 @@ struct mlx5_mkey_cache {
+ struct mutex rb_lock;
+ struct dentry *fs_root;
+ unsigned long last_add;
++ struct delayed_work remove_ent_dwork;
+ };
+
+ struct mlx5_ib_port_resources {
+@@ -1346,9 +1348,10 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
+ int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
+ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
+ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
+-struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+- struct mlx5r_cache_rb_key rb_key,
+- bool persistent_entry);
++struct mlx5_cache_ent *
++mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
++ struct mlx5r_cache_rb_key rb_key,
++ bool persistent_entry);
+
+ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+ int access_flags, int access_mode,
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index bf1ca7565be67..2c1a935734273 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -140,19 +140,16 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
+ mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
+ }
+
+-
+-static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
+- void *to_store)
++static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
++ void *to_store)
+ {
+ XA_STATE(xas, &ent->mkeys, 0);
+ void *curr;
+
+- xa_lock_irq(&ent->mkeys);
+ if (limit_pendings &&
+- (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) {
+- xa_unlock_irq(&ent->mkeys);
++ (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
+ return -EAGAIN;
+- }
++
+ while (1) {
+ /*
+ * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
+@@ -191,6 +188,7 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
+ break;
+ xa_lock_irq(&ent->mkeys);
+ }
++ xa_lock_irq(&ent->mkeys);
+ if (xas_error(&xas))
+ return xas_error(&xas);
+ if (WARN_ON(curr))
+@@ -198,6 +196,17 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
+ return 0;
+ }
+
++static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
++ void *to_store)
++{
++ int ret;
++
++ xa_lock_irq(&ent->mkeys);
++ ret = push_mkey_locked(ent, limit_pendings, to_store);
++ xa_unlock_irq(&ent->mkeys);
++ return ret;
++}
++
+ static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
+ {
+ void *old;
+@@ -545,7 +554,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
+ {
+ lockdep_assert_held(&ent->mkeys.xa_lock);
+
+- if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
++ if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
+ return;
+ if (ent->stored < ent->limit) {
+ ent->fill_to_high_water = true;
+@@ -675,7 +684,6 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
+ struct mlx5_cache_ent *cur;
+ int cmp;
+
+- mutex_lock(&cache->rb_lock);
+ /* Figure out where to put new node */
+ while (*new) {
+ cur = rb_entry(*new, struct mlx5_cache_ent, node);
+@@ -695,7 +703,6 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
+ rb_link_node(&ent->node, parent, new);
+ rb_insert_color(&ent->node, &cache->rb_root);
+
+- mutex_unlock(&cache->rb_lock);
+ return 0;
+ }
+
+@@ -867,9 +874,10 @@ static void delay_time_func(struct timer_list *t)
+ WRITE_ONCE(dev->fill_delay, 0);
+ }
+
+-struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+- struct mlx5r_cache_rb_key rb_key,
+- bool persistent_entry)
++struct mlx5_cache_ent *
++mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
++ struct mlx5r_cache_rb_key rb_key,
++ bool persistent_entry)
+ {
+ struct mlx5_cache_ent *ent;
+ int order;
+@@ -882,6 +890,7 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+ xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
+ ent->rb_key = rb_key;
+ ent->dev = dev;
++ ent->is_tmp = !persistent_entry;
+
+ INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+
+@@ -905,11 +914,44 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+ ent->limit = 0;
+
+ mlx5_mkey_cache_debugfs_add_ent(dev, ent);
++ } else {
++ mod_delayed_work(ent->dev->cache.wq,
++ &ent->dev->cache.remove_ent_dwork,
++ msecs_to_jiffies(30 * 1000));
+ }
+
+ return ent;
+ }
+
++static void remove_ent_work_func(struct work_struct *work)
++{
++ struct mlx5_mkey_cache *cache;
++ struct mlx5_cache_ent *ent;
++ struct rb_node *cur;
++
++ cache = container_of(work, struct mlx5_mkey_cache,
++ remove_ent_dwork.work);
++ mutex_lock(&cache->rb_lock);
++ cur = rb_last(&cache->rb_root);
++ while (cur) {
++ ent = rb_entry(cur, struct mlx5_cache_ent, node);
++ cur = rb_prev(cur);
++ mutex_unlock(&cache->rb_lock);
++
++ xa_lock_irq(&ent->mkeys);
++ if (!ent->is_tmp) {
++ xa_unlock_irq(&ent->mkeys);
++ mutex_lock(&cache->rb_lock);
++ continue;
++ }
++ xa_unlock_irq(&ent->mkeys);
++
++ clean_keys(ent->dev, ent);
++ mutex_lock(&cache->rb_lock);
++ }
++ mutex_unlock(&cache->rb_lock);
++}
++
+ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ {
+ struct mlx5_mkey_cache *cache = &dev->cache;
+@@ -925,6 +967,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ mutex_init(&dev->slow_path_mutex);
+ mutex_init(&dev->cache.rb_lock);
+ dev->cache.rb_root = RB_ROOT;
++ INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
+ cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
+ if (!cache->wq) {
+ mlx5_ib_warn(dev, "failed to create work queue\n");
+@@ -934,9 +977,10 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
+ timer_setup(&dev->delay_timer, delay_time_func, 0);
+ mlx5_mkey_cache_debugfs_init(dev);
++ mutex_lock(&cache->rb_lock);
+ for (i = 0; i <= mkey_cache_max_order(dev); i++) {
+ rb_key.ndescs = 1 << (i + 2);
+- ent = mlx5r_cache_create_ent(dev, rb_key, true);
++ ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
+ if (IS_ERR(ent)) {
+ ret = PTR_ERR(ent);
+ goto err;
+@@ -947,6 +991,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ if (ret)
+ goto err;
+
++ mutex_unlock(&cache->rb_lock);
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ ent = rb_entry(node, struct mlx5_cache_ent, node);
+ xa_lock_irq(&ent->mkeys);
+@@ -957,6 +1002,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ return 0;
+
+ err:
++ mutex_unlock(&cache->rb_lock);
+ mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
+ return ret;
+ }
+@@ -970,6 +1016,7 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
+ if (!dev->cache.wq)
+ return 0;
+
++ cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
+ mutex_lock(&dev->cache.rb_lock);
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ ent = rb_entry(node, struct mlx5_cache_ent, node);
+@@ -1752,33 +1799,42 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
+ {
+ struct mlx5_mkey_cache *cache = &dev->cache;
+ struct mlx5_cache_ent *ent;
++ int ret;
+
+ if (mr->mmkey.cache_ent) {
+ xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
+ mr->mmkey.cache_ent->in_use--;
+- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
+ goto end;
+ }
+
+ mutex_lock(&cache->rb_lock);
+ ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
+- mutex_unlock(&cache->rb_lock);
+ if (ent) {
+ if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
++ if (ent->disabled) {
++ mutex_unlock(&cache->rb_lock);
++ return -EOPNOTSUPP;
++ }
+ mr->mmkey.cache_ent = ent;
++ xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
++ mutex_unlock(&cache->rb_lock);
+ goto end;
+ }
+ }
+
+- ent = mlx5r_cache_create_ent(dev, mr->mmkey.rb_key, false);
++ ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
++ mutex_unlock(&cache->rb_lock);
+ if (IS_ERR(ent))
+ return PTR_ERR(ent);
+
+ mr->mmkey.cache_ent = ent;
++ xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
+
+ end:
+- return push_mkey(mr->mmkey.cache_ent, false,
+- xa_mk_value(mr->mmkey.key));
++ ret = push_mkey_locked(mr->mmkey.cache_ent, false,
++ xa_mk_value(mr->mmkey.key));
++ xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
++ return ret;
+ }
+
+ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
+index 96d4faabbff8a..6ba4aa1afdc2d 100644
+--- a/drivers/infiniband/hw/mlx5/odp.c
++++ b/drivers/infiniband/hw/mlx5/odp.c
+@@ -1602,7 +1602,7 @@ int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+ return 0;
+
+- ent = mlx5r_cache_create_ent(dev, rb_key, true);
++ ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
+ if (IS_ERR(ent))
+ return PTR_ERR(ent);
+
+--
+2.39.5
+
--- /dev/null
+From 5a09f0237455bc487c3d8cb78b82b7263d23d8fe Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Jan 2023 00:28:06 +0200
+Subject: RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow
+
+From: Michael Guralnik <michaelgur@nvidia.com>
+
+[ Upstream commit dd1b913fb0d0e3e6d55e92d2319d954474dd66ac ]
+
+Currently, when deregistering an MR, if the mkey doesn't belong to a cache
+entry, it is destroyed. As a result, restarting applications with many
+non-cached mkeys is not efficient, since all the mkeys are destroyed and
+then recreated. This process takes a long time (for 100,000 MRs, it is
+~20 seconds for dereg and ~28 seconds for re-reg).
+
+To shorten the restart runtime, insert all cacheable mkeys into the cache.
+If no existing entry fits the mkey properties, create a temporary entry
+that does.
+
+After a predetermined timeout, the cache entries will shrink to the
+initial high limit.
+
+The mkeys will still be in the cache when consuming them again after an
+application restart. Therefore, the registration will be much faster
+(for 100,000 MRs, it is ~4 seconds for dereg and ~5 seconds for re-reg).
+
+The temporary cache entries created to store the non-cache mkeys are not
+exposed through sysfs like the default cache entries.
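+
+The dereg decision this patch introduces, in outline (a sketch with
+hypothetical helpers standing in for mlx5r_umr_revoke_mr(),
+cache_ent_find_and_store() and destroy_mkey()):
+
+    struct mr;
+
+    int umr_revoke(struct mr *mr);   /* returns 0 on success */
+    int cache_store(struct mr *mr);  /* returns 0 on success */
+    int destroy_mkey(struct mr *mr);
+
+    static int dereg_mkey(struct mr *mr)
+    {
+        /* Park the mkey in the cache when possible ... */
+        if (!umr_revoke(mr) && !cache_store(mr))
+            return 0;
+        /* ... destroy it only as a fallback. */
+        return destroy_mkey(mr);
+    }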
+
+Link: https://lore.kernel.org/r/20230125222807.6921-6-michaelgur@nvidia.com
+Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +
+ drivers/infiniband/hw/mlx5/mr.c | 55 +++++++++++++++++++++-------
+ 2 files changed, 44 insertions(+), 13 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index 7c9d5648947e9..f345e2ae394d2 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -650,6 +650,8 @@ struct mlx5_ib_mkey {
+ unsigned int ndescs;
+ struct wait_queue_head wait;
+ refcount_t usecount;
++ /* User Mkey must hold either a rb_key or a cache_ent. */
++ struct mlx5r_cache_rb_key rb_key;
+ struct mlx5_cache_ent *cache_ent;
+ };
+
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index 1060b30a837a0..bf1ca7565be67 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -1110,15 +1110,14 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
+ rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
+ ent = mkey_cache_ent_from_rb_key(dev, rb_key);
+ /*
+- * Matches access in alloc_cache_mr(). If the MR can't come from the
+- * cache then synchronously create an uncached one.
++ * If the MR can't come from the cache then synchronously create an uncached
++ * one.
+ */
+- if (!ent || ent->limit == 0 ||
+- !mlx5r_umr_can_reconfig(dev, 0, access_flags) ||
+- mlx5_umem_needs_ats(dev, umem, access_flags)) {
++ if (!ent) {
+ mutex_lock(&dev->slow_path_mutex);
+ mr = reg_create(pd, umem, iova, access_flags, page_size, false);
+ mutex_unlock(&dev->slow_path_mutex);
++ mr->mmkey.rb_key = rb_key;
+ return mr;
+ }
+
+@@ -1209,6 +1208,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
+ goto err_2;
+ }
+ mr->mmkey.type = MLX5_MKEY_MR;
++ mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
+ mr->umem = umem;
+ set_mr_fields(dev, mr, umem->length, access_flags, iova);
+ kvfree(in);
+@@ -1747,6 +1747,40 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
+ }
+ }
+
++static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
++ struct mlx5_ib_mr *mr)
++{
++ struct mlx5_mkey_cache *cache = &dev->cache;
++ struct mlx5_cache_ent *ent;
++
++ if (mr->mmkey.cache_ent) {
++ xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
++ mr->mmkey.cache_ent->in_use--;
++ xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
++ goto end;
++ }
++
++ mutex_lock(&cache->rb_lock);
++ ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
++ mutex_unlock(&cache->rb_lock);
++ if (ent) {
++ if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
++ mr->mmkey.cache_ent = ent;
++ goto end;
++ }
++ }
++
++ ent = mlx5r_cache_create_ent(dev, mr->mmkey.rb_key, false);
++ if (IS_ERR(ent))
++ return PTR_ERR(ent);
++
++ mr->mmkey.cache_ent = ent;
++
++end:
++ return push_mkey(mr->mmkey.cache_ent, false,
++ xa_mk_value(mr->mmkey.key));
++}
++
+ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+ {
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+@@ -1792,16 +1826,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+ }
+
+ /* Stop DMA */
+- if (mr->mmkey.cache_ent) {
+- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
+- mr->mmkey.cache_ent->in_use--;
+- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
+-
++ if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
+ if (mlx5r_umr_revoke_mr(mr) ||
+- push_mkey(mr->mmkey.cache_ent, false,
+- xa_mk_value(mr->mmkey.key)))
++ cache_ent_find_and_store(dev, mr))
+ mr->mmkey.cache_ent = NULL;
+- }
++
+ if (!mr->mmkey.cache_ent) {
+ rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
+ if (rc)
+--
+2.39.5
+
--- /dev/null
+From 3a78949c3d99afa32e87cf8cfe46723a057ee4cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Jan 2023 00:28:04 +0200
+Subject: RDMA/mlx5: Change the cache structure to an RB-tree
+
+From: Michael Guralnik <michaelgur@nvidia.com>
+
+[ Upstream commit b9584517832858a0f78d6851d09b697a829514cd ]
+
+Currently, the cache structure is a static linear array. Therefore, its
+size is limited to the number of entries in it and is not expandable. The
+entries are dedicated to mkeys of size 2^x and no access_flags. Mkeys with
+different properties are not cacheable.
+
+In this patch, we change the cache structure to an RB-tree. This will
+allow extending the cache to support more entries with different mkey
+properties.
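+
+For reference, a minimal sketch of the kind of order-keyed RB-tree lookup
+this enables (generic structure, mirroring mkey_cache_ent_from_order() in
+the hunks below):
+
+    #include <linux/rbtree.h>
+
+    struct cache_ent {
+        struct rb_node node;
+        unsigned int order;
+    };
+
+    /* Return the entry with the smallest order >= the requested order. */
+    static struct cache_ent *ent_from_order(struct rb_root *root,
+                                            unsigned int order)
+    {
+        struct rb_node *node = root->rb_node;
+        struct cache_ent *cur, *smallest = NULL;
+
+        while (node) {
+            cur = rb_entry(node, struct cache_ent, node);
+            if (cur->order >= order) {
+                if (cur->order == order)
+                    return cur;
+                smallest = cur;
+                node = node->rb_left;
+            } else {
+                node = node->rb_right;
+            }
+        }
+        return smallest;
+    }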
+
+Link: https://lore.kernel.org/r/20230125222807.6921-4-michaelgur@nvidia.com
+Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 11 +-
+ drivers/infiniband/hw/mlx5/mr.c | 160 ++++++++++++++++++++-------
+ drivers/infiniband/hw/mlx5/odp.c | 8 +-
+ 3 files changed, 132 insertions(+), 47 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index 10c87901da27c..bd998ac8c29c1 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -761,6 +761,8 @@ struct mlx5_cache_ent {
+ u32 access_mode;
+ unsigned int ndescs;
+
++ struct rb_node node;
++
+ u8 disabled:1;
+ u8 fill_to_high_water:1;
+
+@@ -790,8 +792,9 @@ struct mlx5r_async_create_mkey {
+
+ struct mlx5_mkey_cache {
+ struct workqueue_struct *wq;
+- struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES];
+- struct dentry *root;
++ struct rb_root rb_root;
++ struct mutex rb_lock;
++ struct dentry *fs_root;
+ unsigned long last_add;
+ };
+
+@@ -1336,11 +1339,15 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
+ int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
+ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
+ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
++struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
++ int order);
+
+ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+ struct mlx5_cache_ent *ent,
+ int access_flags);
+
++struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, u32 order,
++ int access_flags);
+ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
+ struct ib_mr_status *mr_status);
+ struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index 53fadd6edb68d..b3d83920d3cfb 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -515,18 +515,22 @@ static const struct file_operations limit_fops = {
+
+ static bool someone_adding(struct mlx5_mkey_cache *cache)
+ {
+- unsigned int i;
+-
+- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
+- struct mlx5_cache_ent *ent = &cache->ent[i];
+- bool ret;
++ struct mlx5_cache_ent *ent;
++ struct rb_node *node;
++ bool ret;
+
++ mutex_lock(&cache->rb_lock);
++ for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
++ ent = rb_entry(node, struct mlx5_cache_ent, node);
+ xa_lock_irq(&ent->mkeys);
+ ret = ent->stored < ent->limit;
+ xa_unlock_irq(&ent->mkeys);
+- if (ret)
++ if (ret) {
++ mutex_unlock(&cache->rb_lock);
+ return true;
++ }
+ }
++ mutex_unlock(&cache->rb_lock);
+ return false;
+ }
+
+@@ -637,6 +641,59 @@ static void delayed_cache_work_func(struct work_struct *work)
+ __cache_work_func(ent);
+ }
+
++static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
++ struct mlx5_cache_ent *ent)
++{
++ struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
++ struct mlx5_cache_ent *cur;
++
++ mutex_lock(&cache->rb_lock);
++ /* Figure out where to put new node */
++ while (*new) {
++ cur = rb_entry(*new, struct mlx5_cache_ent, node);
++ parent = *new;
++ if (ent->order < cur->order)
++ new = &((*new)->rb_left);
++ if (ent->order > cur->order)
++ new = &((*new)->rb_right);
++ if (ent->order == cur->order) {
++ mutex_unlock(&cache->rb_lock);
++ return -EEXIST;
++ }
++ }
++
++ /* Add new node and rebalance tree. */
++ rb_link_node(&ent->node, parent, new);
++ rb_insert_color(&ent->node, &cache->rb_root);
++
++ mutex_unlock(&cache->rb_lock);
++ return 0;
++}
++
++static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
++ unsigned int order)
++{
++ struct rb_node *node = dev->cache.rb_root.rb_node;
++ struct mlx5_cache_ent *cur, *smallest = NULL;
++
++ /*
++ * Find the smallest ent with order >= requested_order.
++ */
++ while (node) {
++ cur = rb_entry(node, struct mlx5_cache_ent, node);
++ if (cur->order > order) {
++ smallest = cur;
++ node = node->rb_left;
++ }
++ if (cur->order < order)
++ node = node->rb_right;
++ if (cur->order == order)
++ return cur;
++ }
++
++ return smallest;
++}
++
+ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+ struct mlx5_cache_ent *ent,
+ int access_flags)
+@@ -677,10 +734,16 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+ return mr;
+ }
+
+-static void clean_keys(struct mlx5_ib_dev *dev, int c)
++struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev,
++ u32 order, int access_flags)
++{
++ struct mlx5_cache_ent *ent = mkey_cache_ent_from_order(dev, order);
++
++ return mlx5_mr_cache_alloc(dev, ent, access_flags);
++}
++
++static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
+ {
+- struct mlx5_mkey_cache *cache = &dev->cache;
+- struct mlx5_cache_ent *ent = &cache->ent[c];
+ u32 mkey;
+
+ cancel_delayed_work(&ent->dwork);
+@@ -699,8 +762,8 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+ if (!mlx5_debugfs_root || dev->is_rep)
+ return;
+
+- debugfs_remove_recursive(dev->cache.root);
+- dev->cache.root = NULL;
++ debugfs_remove_recursive(dev->cache.fs_root);
++ dev->cache.fs_root = NULL;
+ }
+
+ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
+@@ -713,12 +776,13 @@ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
+ if (!mlx5_debugfs_root || dev->is_rep)
+ return;
+
+- cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
++ dir = mlx5_debugfs_get_dev_root(dev->mdev);
++ cache->fs_root = debugfs_create_dir("mr_cache", dir);
+
+ for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
+- ent = &cache->ent[i];
++ ent = mkey_cache_ent_from_order(dev, i);
+ sprintf(ent->name, "%d", ent->order);
+- dir = debugfs_create_dir(ent->name, cache->root);
++ dir = debugfs_create_dir(ent->name, cache->fs_root);
+ debugfs_create_file("size", 0600, dir, ent, &size_fops);
+ debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
+ debugfs_create_ulong("cur", 0400, dir, &ent->stored);
+@@ -733,6 +797,30 @@ static void delay_time_func(struct timer_list *t)
+ WRITE_ONCE(dev->fill_delay, 0);
+ }
+
++struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
++ int order)
++{
++ struct mlx5_cache_ent *ent;
++ int ret;
++
++ ent = kzalloc(sizeof(*ent), GFP_KERNEL);
++ if (!ent)
++ return ERR_PTR(-ENOMEM);
++
++ xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
++ ent->order = order;
++ ent->dev = dev;
++
++ INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
++
++ ret = mlx5_cache_ent_insert(&dev->cache, ent);
++ if (ret) {
++ kfree(ent);
++ return ERR_PTR(ret);
++ }
++ return ent;
++}
++
+ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ {
+ struct mlx5_mkey_cache *cache = &dev->cache;
+@@ -740,6 +828,8 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ int i;
+
+ mutex_init(&dev->slow_path_mutex);
++ mutex_init(&dev->cache.rb_lock);
++ dev->cache.rb_root = RB_ROOT;
+ cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
+ if (!cache->wq) {
+ mlx5_ib_warn(dev, "failed to create work queue\n");
+@@ -749,13 +839,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
+ timer_setup(&dev->delay_timer, delay_time_func, 0);
+ for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
+- ent = &cache->ent[i];
+- xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
+- ent->order = i + 2;
+- ent->dev = dev;
+- ent->limit = 0;
+-
+- INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
++ ent = mlx5r_cache_create_ent(dev, i);
+
+ if (i > MKEY_CACHE_LAST_STD_ENTRY) {
+ mlx5_odp_init_mkey_cache_entry(ent);
+@@ -785,14 +869,16 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+
+ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
+ {
+- unsigned int i;
++ struct rb_root *root = &dev->cache.rb_root;
++ struct mlx5_cache_ent *ent;
++ struct rb_node *node;
+
+ if (!dev->cache.wq)
+ return 0;
+
+- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
+- struct mlx5_cache_ent *ent = &dev->cache.ent[i];
+-
++ mutex_lock(&dev->cache.rb_lock);
++ for (node = rb_first(root); node; node = rb_next(node)) {
++ ent = rb_entry(node, struct mlx5_cache_ent, node);
+ xa_lock_irq(&ent->mkeys);
+ ent->disabled = true;
+ xa_unlock_irq(&ent->mkeys);
+@@ -802,8 +888,15 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
+ mlx5_mkey_cache_debugfs_cleanup(dev);
+ mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
+
+- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
+- clean_keys(dev, i);
++ node = rb_first(root);
++ while (node) {
++ ent = rb_entry(node, struct mlx5_cache_ent, node);
++ node = rb_next(node);
++ clean_keys(dev, ent);
++ rb_erase(&ent->node, root);
++ kfree(ent);
++ }
++ mutex_unlock(&dev->cache.rb_lock);
+
+ destroy_workqueue(dev->cache.wq);
+ del_timer_sync(&dev->delay_timer);
+@@ -876,19 +969,6 @@ static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
+ return MLX5_MAX_UMR_SHIFT;
+ }
+
+-static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
+- unsigned int order)
+-{
+- struct mlx5_mkey_cache *cache = &dev->cache;
+-
+- if (order < cache->ent[0].order)
+- return &cache->ent[0];
+- order = order - cache->ent[0].order;
+- if (order > MKEY_CACHE_LAST_STD_ENTRY)
+- return NULL;
+- return &cache->ent[order];
+-}
+-
+ static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
+ u64 length, int access_flags, u64 iova)
+ {
+diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
+index 5f0a17382de73..7f68940ca0d1e 100644
+--- a/drivers/infiniband/hw/mlx5/odp.c
++++ b/drivers/infiniband/hw/mlx5/odp.c
+@@ -420,8 +420,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
+ return ERR_CAST(odp);
+
+ BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);
+- mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order],
+- imr->access_flags);
++ mr = mlx5_mr_cache_alloc_order(dev, order, imr->access_flags);
+ if (IS_ERR(mr)) {
+ ib_umem_odp_release(odp);
+ return mr;
+@@ -495,9 +494,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+ if (IS_ERR(umem_odp))
+ return ERR_CAST(umem_odp);
+
+- imr = mlx5_mr_cache_alloc(dev,
+- &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY],
+- access_flags);
++ imr = mlx5_mr_cache_alloc_order(dev, MLX5_IMR_KSM_CACHE_ENTRY,
++ access_flags);
+ if (IS_ERR(imr)) {
+ ib_umem_odp_release(umem_odp);
+ return imr;
+--
+2.39.5
+
--- /dev/null
+From a85b91bcb6fce39a7511353461ead5a60b13bc69 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Jan 2023 00:28:02 +0200
+Subject: RDMA/mlx5: Don't keep umrable 'page_shift' in cache entries
+
+From: Aharon Landau <aharonl@nvidia.com>
+
+[ Upstream commit a2a88b8e22d1b202225d0e40b02ad068afab2ccb ]
+
+mkc.log_page_size can be changed using UMR. Therefore, don't treat it as a
+cache entry property.
+
+Remove it from struct mlx5_cache_ent.
+
+All cache mkeys will be created with default PAGE_SHIFT, and updated with
+the needed page_shift using UMR when passing them to a user.
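+
+In other words (an illustrative sketch, not driver code): the cache entry
+no longer carries a page shift; every cached mkey starts at PAGE_SHIFT and
+is retargeted through UMR when handed to a user.
+
+    #include <linux/mm.h>  /* PAGE_SHIFT */
+
+    /* Hypothetical helpers showing the two halves of the flow. */
+    static unsigned int cache_mkey_page_shift(void)
+    {
+        return PAGE_SHIFT;        /* default at creation time */
+    }
+
+    static unsigned int user_mkey_page_shift(unsigned int needed_shift)
+    {
+        return needed_shift;      /* applied via UMR at registration */
+    }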
+
+Link: https://lore.kernel.org/r/20230125222807.6921-2-michaelgur@nvidia.com
+Signed-off-by: Aharon Landau <aharonl@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 -
+ drivers/infiniband/hw/mlx5/mr.c | 3 +--
+ drivers/infiniband/hw/mlx5/odp.c | 2 --
+ 3 files changed, 1 insertion(+), 5 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index 0ef347e91ffeb..10c87901da27c 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -759,7 +759,6 @@ struct mlx5_cache_ent {
+ char name[4];
+ u32 order;
+ u32 access_mode;
+- u32 page;
+ unsigned int ndescs;
+
+ u8 disabled:1;
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index b81b03aa2a629..53fadd6edb68d 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -297,7 +297,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
+
+ MLX5_SET(mkc, mkc, translations_octword_size,
+ get_mkc_octo_size(ent->access_mode, ent->ndescs));
+- MLX5_SET(mkc, mkc, log_page_size, ent->page);
++ MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
+ }
+
+ /* Asynchronously schedule new MRs to be populated in the cache. */
+@@ -765,7 +765,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ if (ent->order > mkey_cache_max_order(dev))
+ continue;
+
+- ent->page = PAGE_SHIFT;
+ ent->ndescs = 1 << ent->order;
+ ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+ if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
+diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
+index 87fbee8061003..a5c9baec8be85 100644
+--- a/drivers/infiniband/hw/mlx5/odp.c
++++ b/drivers/infiniband/hw/mlx5/odp.c
+@@ -1598,14 +1598,12 @@ void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
+
+ switch (ent->order - 2) {
+ case MLX5_IMR_MTT_CACHE_ENTRY:
+- ent->page = PAGE_SHIFT;
+ ent->ndescs = MLX5_IMR_MTT_ENTRIES;
+ ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+ ent->limit = 0;
+ break;
+
+ case MLX5_IMR_KSM_CACHE_ENTRY:
+- ent->page = MLX5_KSM_PAGE_SHIFT;
+ ent->ndescs = mlx5_imr_ksm_entries;
+ ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
+ ent->limit = 0;
+--
+2.39.5
+
--- /dev/null
+From b79f406d4cc08e99e836a5e95040672efdba5313 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 10 Feb 2025 13:32:39 +0200
+Subject: RDMA/mlx5: Fix AH static rate parsing
+
+From: Patrisious Haddad <phaddad@nvidia.com>
+
+[ Upstream commit c534ffda781f44a1c6ac25ef6e0e444da38ca8af ]
+
+Previously, the static rate wasn't translated according to our PRM but
+simply used the 4 lower bits.
+
+Correctly translate the static rate value passed in the AH creation
+attribute according to the values expected by our PRM.
+
+In addition, change the 800Gb/s mapping to zero, which is the value
+specified by the PRM.
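+
+For orientation, the packing of the 8-bit stat_rate_sl field that the AH
+code below fills in (a sketch; the bit layout is assumed from the shift in
+the hunk: translated rate in bits 7..4, SL in bits 3..0):
+
+    #include <linux/types.h>
+
+    static u8 pack_stat_rate_sl(u8 prm_rate, u8 sl)
+    {
+        return (prm_rate << 4) | (sl & 0xf);
+    }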
+
+Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
+Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
+Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
+Link: https://patch.msgid.link/18ef4cc5396caf80728341eb74738cd777596f60.1739187089.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/ah.c | 3 ++-
+ drivers/infiniband/hw/mlx5/qp.c | 6 +++---
+ drivers/infiniband/hw/mlx5/qp.h | 1 +
+ 3 files changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
+index 505bc47fd575d..99036afb3aef0 100644
+--- a/drivers/infiniband/hw/mlx5/ah.c
++++ b/drivers/infiniband/hw/mlx5/ah.c
+@@ -67,7 +67,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
+ ah->av.tclass = grh->traffic_class;
+ }
+
+- ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4);
++ ah->av.stat_rate_sl =
++ (mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4);
+
+ if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
+ if (init_attr->xmit_slave)
+diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
+index 43c0123babd10..59dca0cd89052 100644
+--- a/drivers/infiniband/hw/mlx5/qp.c
++++ b/drivers/infiniband/hw/mlx5/qp.c
+@@ -3379,11 +3379,11 @@ static int ib_to_mlx5_rate_map(u8 rate)
+ return 0;
+ }
+
+-static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
++int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate)
+ {
+ u32 stat_rate_support;
+
+- if (rate == IB_RATE_PORT_CURRENT)
++ if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS)
+ return 0;
+
+ if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS)
+@@ -3528,7 +3528,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+ sizeof(grh->dgid.raw));
+ }
+
+- err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah));
++ err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah));
+ if (err < 0)
+ return err;
+ MLX5_SET(ads, path, stat_rate, err);
+diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
+index e677fa0ca4226..4abb77d551670 100644
+--- a/drivers/infiniband/hw/mlx5/qp.h
++++ b/drivers/infiniband/hw/mlx5/qp.h
+@@ -55,4 +55,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn);
+ int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
+ int mlx5_ib_qp_event_init(void);
+ void mlx5_ib_qp_event_cleanup(void);
++int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate);
+ #endif /* _MLX5_IB_QP_H */
+--
+2.39.5
+
--- /dev/null
+From e1c50bbca08c17189cc312b70852128bf1271cfb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 20 Feb 2025 08:47:10 +0200
+Subject: RDMA/mlx5: Fix bind QP error cleanup flow
+
+From: Patrisious Haddad <phaddad@nvidia.com>
+
+[ Upstream commit e1a0bdbdfdf08428f0ede5ae49c7f4139ac73ef5 ]
+
+When bind QP fails, the cleanup flow destroys the counter regardless of
+whether this call created it. That is problematic because, if the counter
+was not created here, it could still be in use elsewhere.
+
+Fix that by destroying the counter only if it was created during this call.
+
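+For illustration, a hedged, self-contained model of the error-unwind
+pattern the fix applies (all names and the simulated failure below are
+made up, not the driver's): remember whether this call created the
+counter, and only destroy it on the failure path if it did.
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  struct counter { unsigned int id; };
+
+  static int alloc_counter(struct counter *c)  { c->id = 42; return 0; }
+  static void free_counter(struct counter *c)  { c->id = 0; }
+  static int attach_counter(struct counter *c) { (void)c; return -1; }
+
+  static int bind_counter(struct counter *c)
+  {
+          bool created_here = false;
+          int err;
+
+          if (!c->id) {
+                  err = alloc_counter(c);
+                  if (err)
+                          return err;
+                  created_here = true;
+          }
+
+          err = attach_counter(c);        /* simulated failure */
+          if (!err)
+                  return 0;
+
+          /* only undo what this call did; a pre-existing counter may be in use */
+          if (created_here)
+                  free_counter(c);
+          return err;
+  }
+
+  int main(void)
+  {
+          struct counter shared = { .id = 7 };    /* pre-existing counter */
+
+          bind_counter(&shared);
+          printf("id after failed bind: %u\n", shared.id);  /* still 7 */
+          return 0;
+  }
+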
+Fixes: 45842fc627c7 ("IB/mlx5: Support statistic q counter configuration")
+Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
+Reviewed-by: Mark Zhang <markzhang@nvidia.com>
+Link: https://patch.msgid.link/25dfefddb0ebefa668c32e06a94d84e3216257cf.1740033937.git.leon@kernel.org
+Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/counters.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c
+index 3e1272695d993..9915504ad1e18 100644
+--- a/drivers/infiniband/hw/mlx5/counters.c
++++ b/drivers/infiniband/hw/mlx5/counters.c
+@@ -444,6 +444,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
+ struct ib_qp *qp)
+ {
+ struct mlx5_ib_dev *dev = to_mdev(qp->device);
++ bool new = false;
+ int err;
+
+ if (!counter->id) {
+@@ -458,6 +459,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
+ return err;
+ counter->id =
+ MLX5_GET(alloc_q_counter_out, out, counter_set_id);
++ new = true;
+ }
+
+ err = mlx5_ib_qp_set_counter(qp, counter);
+@@ -467,8 +469,10 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
+ return 0;
+
+ fail_set_counter:
+- mlx5_ib_counter_dealloc(counter);
+- counter->id = 0;
++ if (new) {
++ mlx5_ib_counter_dealloc(counter);
++ counter->id = 0;
++ }
+
+ return err;
+ }
+--
+2.39.5
+
--- /dev/null
+From 15ed43c7d41f9929ea55919272003c7ba5aec402 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 19 Jan 2025 14:36:13 +0200
+Subject: RDMA/mlx5: Fix the recovery flow of the UMR QP
+
+From: Yishai Hadas <yishaih@nvidia.com>
+
+[ Upstream commit d97505baea64d93538b16baf14ce7b8c1fbad746 ]
+
+This patch addresses an issue in the recovery flow of the UMR QP,
+ensuring tasks do not get stuck, as highlighted by the call trace [1].
+
+During recovery, before transitioning the QP to the RESET state, the
+software must wait for all outstanding WRs to complete.
+
+Failing to do so can cause the firmware to skip sending some flushed
+CQEs with errors and simply discard them upon the RESET, as per the IB
+specification.
+
+This race condition can result in lost CQEs and tasks becoming stuck.
+
+To resolve this, the patch sends a final WR which serves only as a
+barrier before moving the QP state to RESET.
+
+Once a CQE is received for that final WR, it guarantees that no
+outstanding WRs remain, making it safe to transition the QP to RESET and
+subsequently back to RTS, restoring proper functionality.
+
+Note:
+For the barrier WR, we simply reuse the already-prepared failed WR.
+Since the QP is in an error state, it will only receive
+IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier we don't
+care about its status.
+
+[1]
+INFO: task rdma_resource_l:1922 blocked for more than 120 seconds.
+Tainted: G W 6.12.0-rc7+ #1626
+"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+task:rdma_resource_l state:D stack:0 pid:1922 tgid:1922 ppid:1369
+ flags:0x00004004
+Call Trace:
+<TASK>
+__schedule+0x420/0xd30
+schedule+0x47/0x130
+schedule_timeout+0x280/0x300
+? mark_held_locks+0x48/0x80
+? lockdep_hardirqs_on_prepare+0xe5/0x1a0
+wait_for_completion+0x75/0x130
+mlx5r_umr_post_send_wait+0x3c2/0x5b0 [mlx5_ib]
+? __pfx_mlx5r_umr_done+0x10/0x10 [mlx5_ib]
+mlx5r_umr_revoke_mr+0x93/0xc0 [mlx5_ib]
+__mlx5_ib_dereg_mr+0x299/0x520 [mlx5_ib]
+? _raw_spin_unlock_irq+0x24/0x40
+? wait_for_completion+0xfe/0x130
+? rdma_restrack_put+0x63/0xe0 [ib_core]
+ib_dereg_mr_user+0x5f/0x120 [ib_core]
+? lock_release+0xc6/0x280
+destroy_hw_idr_uobject+0x1d/0x60 [ib_uverbs]
+uverbs_destroy_uobject+0x58/0x1d0 [ib_uverbs]
+uobj_destroy+0x3f/0x70 [ib_uverbs]
+ib_uverbs_cmd_verbs+0x3e4/0xbb0 [ib_uverbs]
+? __pfx_uverbs_destroy_def_handler+0x10/0x10 [ib_uverbs]
+? __lock_acquire+0x64e/0x2080
+? mark_held_locks+0x48/0x80
+? find_held_lock+0x2d/0xa0
+? lock_acquire+0xc1/0x2f0
+? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs]
+? __fget_files+0xc3/0x1b0
+ib_uverbs_ioctl+0xe7/0x170 [ib_uverbs]
+? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs]
+__x64_sys_ioctl+0x1b0/0xa70
+do_syscall_64+0x6b/0x140
+entry_SYSCALL_64_after_hwframe+0x76/0x7e
+RIP: 0033:0x7f99c918b17b
+RSP: 002b:00007ffc766d0468 EFLAGS: 00000246 ORIG_RAX:
+ 0000000000000010
+RAX: ffffffffffffffda RBX: 00007ffc766d0578 RCX:
+ 00007f99c918b17b
+RDX: 00007ffc766d0560 RSI: 00000000c0181b01 RDI:
+ 0000000000000003
+RBP: 00007ffc766d0540 R08: 00007f99c8f99010 R09:
+ 000000000000bd7e
+R10: 00007f99c94c1c70 R11: 0000000000000246 R12:
+ 00007ffc766d0530
+R13: 000000000000001c R14: 0000000040246a80 R15:
+ 0000000000000000
+</TASK>
+
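+For readability, a hedged, condensed outline of the new recovery
+ordering implemented by the hunks below (locking and error handling
+trimmed); the bug was doing step 3 without step 2:
+
+  umrc->state = MLX5_UMR_STATE_RECOVER;        /* 1. stop accepting new WRs     */
+
+  mlx5r_umr_post_send(umrc->qp, mkey,          /* 2. repost the failed WR as a  */
+                      &umr_context->cqe, wqe,  /*    barrier ...                */
+                      with_data);
+  wait_for_completion(&umr_context->done);     /*    ... and wait for its CQE   */
+
+  attr.qp_state = IB_QPS_RESET;                /* 3. only now RESET the QP, so  */
+  ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);  /*    no flushed CQE can be lost */
+
+  mlx5r_umr_qp_rst2rts(dev, umrc->qp);         /* 4. back to RTS                */
+  umrc->state = MLX5_UMR_STATE_ACTIVE;
+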
+Fixes: 158e71bb69e3 ("RDMA/mlx5: Add a umr recovery flow")
+Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
+Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
+Link: https://patch.msgid.link/27b51b92ec42dfb09d8096fcbd51878f397ce6ec.1737290141.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/umr.c | 83 +++++++++++++++++++++-----------
+ 1 file changed, 56 insertions(+), 27 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
+index fa000182d0b41..1a39e86178ece 100644
+--- a/drivers/infiniband/hw/mlx5/umr.c
++++ b/drivers/infiniband/hw/mlx5/umr.c
+@@ -199,30 +199,6 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
+ ib_dealloc_pd(dev->umrc.pd);
+ }
+
+-static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
+-{
+- struct umr_common *umrc = &dev->umrc;
+- struct ib_qp_attr attr;
+- int err;
+-
+- attr.qp_state = IB_QPS_RESET;
+- err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
+- if (err) {
+- mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+- goto err;
+- }
+-
+- err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
+- if (err)
+- goto err;
+-
+- umrc->state = MLX5_UMR_STATE_ACTIVE;
+- return 0;
+-
+-err:
+- umrc->state = MLX5_UMR_STATE_ERR;
+- return err;
+-}
+
+ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
+ struct mlx5r_umr_wqe *wqe, bool with_data)
+@@ -270,6 +246,61 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
+ return err;
+ }
+
++static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey,
++ struct mlx5r_umr_context *umr_context,
++ struct mlx5r_umr_wqe *wqe, bool with_data)
++{
++ struct umr_common *umrc = &dev->umrc;
++ struct ib_qp_attr attr;
++ int err;
++
++ mutex_lock(&umrc->lock);
++ /* Preventing any further WRs to be sent now */
++ if (umrc->state != MLX5_UMR_STATE_RECOVER) {
++ mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
++ umrc->state);
++ umrc->state = MLX5_UMR_STATE_RECOVER;
++ }
++ mutex_unlock(&umrc->lock);
++
++ /* Sending a final/barrier WR (the failed one) and wait for its completion.
++ * This will ensure that all the previous WRs got a completion before
++ * we set the QP state to RESET.
++ */
++ err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
++ with_data);
++ if (err) {
++ mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err);
++ goto err;
++ }
++
++ /* Since the QP is in an error state, it will only receive
++ * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier
++ * we don't care about its status.
++ */
++ wait_for_completion(&umr_context->done);
++
++ attr.qp_state = IB_QPS_RESET;
++ err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
++ if (err) {
++ mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err);
++ goto err;
++ }
++
++ err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
++ if (err) {
++ mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err);
++ goto err;
++ }
++
++ umrc->state = MLX5_UMR_STATE_ACTIVE;
++ return 0;
++
++err:
++ umrc->state = MLX5_UMR_STATE_ERR;
++ return err;
++}
++
+ static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
+ {
+ struct mlx5_ib_umr_context *context =
+@@ -334,9 +365,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
+ mlx5_ib_warn(dev,
+ "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
+ umr_context.status, mkey);
+- mutex_lock(&umrc->lock);
+- err = mlx5r_umr_recover(dev);
+- mutex_unlock(&umrc->lock);
++ err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data);
+ if (err)
+ mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
+ err);
+--
+2.39.5
+
--- /dev/null
+From 73daa66bd410fa9662f7e4578ac5b58338c23b31 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 11:07:16 +0300
+Subject: RDMA/mlx5: Implement mkeys management via LIFO queue
+
+From: Shay Drory <shayd@nvidia.com>
+
+[ Upstream commit 57e7071683ef6148c9f5ea0ba84598d2ba681375 ]
+
+Currently, mkeys are managed via an xarray. Due to the xarray's internal
+implementation, this leads to a degradation when many MRs are
+deregistered in parallel; for example, deregistering 1M MRs via 64
+threads takes ~15% more time [1].
+
+Hence, implement mkey management via a LIFO queue, which resolves the
+degradation.
+
+[1]
+2.8us in kernel v5.19 compare to 3.2us in kernel v6.4
+
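+As background, a hedged, self-contained userspace model of the paged
+LIFO queue introduced below (constants and names are illustrative, not
+the driver's): push and pop only touch the tail page, so the hot path
+stays O(1) with no xarray bookkeeping.
+
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  #define MKEYS_PER_PAGE 4   /* kernel: (PAGE_SIZE - list overhead) / sizeof(u32) */
+
+  struct mkey_page {
+          unsigned int mkeys[MKEYS_PER_PAGE];
+          struct mkey_page *prev;         /* simple chain instead of list_head */
+  };
+
+  struct mkey_queue {
+          struct mkey_page *last;         /* tail page, the only one touched */
+          unsigned long ci;               /* number of stored mkeys */
+          unsigned long num_pages;
+  };
+
+  static int push_mkey(struct mkey_queue *q, unsigned int mkey)
+  {
+          if (q->ci >= q->num_pages * MKEYS_PER_PAGE) {   /* all pages full */
+                  struct mkey_page *page = calloc(1, sizeof(*page));
+
+                  if (!page)
+                          return -1;
+                  page->prev = q->last;
+                  q->last = page;
+                  q->num_pages++;
+          }
+          q->last->mkeys[q->ci % MKEYS_PER_PAGE] = mkey;
+          q->ci++;
+          return 0;
+  }
+
+  static int pop_mkey(struct mkey_queue *q, unsigned int *mkey)
+  {
+          unsigned long idx;
+
+          if (!q->ci)
+                  return -1;
+          idx = (q->ci - 1) % MKEYS_PER_PAGE;
+          *mkey = q->last->mkeys[idx];
+          q->ci--;
+          if (idx == 0 && q->num_pages > 1) {     /* tail page now empty */
+                  struct mkey_page *empty = q->last;
+
+                  q->last = empty->prev;
+                  q->num_pages--;
+                  free(empty);
+          }
+          return 0;
+  }
+
+  int main(void)
+  {
+          struct mkey_queue q = { 0 };
+          unsigned int k;
+
+          for (unsigned int i = 1; i <= 10; i++)
+                  push_mkey(&q, i);
+          while (!pop_mkey(&q, &k))
+                  printf("%u ", k);       /* 10 9 8 ... 1, i.e. LIFO order */
+          printf("\n");
+          free(q.last);
+          return 0;
+  }
+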
+Signed-off-by: Shay Drory <shayd@nvidia.com>
+Link: https://lore.kernel.org/r/fde3d4cfab0f32f0ccb231cd113298256e1502c5.1695283384.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 21 +-
+ drivers/infiniband/hw/mlx5/mr.c | 324 ++++++++++++---------------
+ drivers/infiniband/hw/mlx5/umr.c | 4 +-
+ 3 files changed, 169 insertions(+), 180 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index 7c72e0e9db54a..024d2071c6a5d 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -760,10 +760,25 @@ struct umr_common {
+ unsigned int state;
+ };
+
++#define NUM_MKEYS_PER_PAGE \
++ ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(u32))
++
++struct mlx5_mkeys_page {
++ u32 mkeys[NUM_MKEYS_PER_PAGE];
++ struct list_head list;
++};
++static_assert(sizeof(struct mlx5_mkeys_page) == PAGE_SIZE);
++
++struct mlx5_mkeys_queue {
++ struct list_head pages_list;
++ u32 num_pages;
++ unsigned long ci;
++ spinlock_t lock; /* sync list ops */
++};
++
+ struct mlx5_cache_ent {
+- struct xarray mkeys;
+- unsigned long stored;
+- unsigned long reserved;
++ struct mlx5_mkeys_queue mkeys_queue;
++ u32 pending;
+
+ char name[4];
+
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index 2c1a935734273..b66b8346c2dc6 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -140,110 +140,47 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
+ mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
+ }
+
+-static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
+- void *to_store)
++static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
+ {
+- XA_STATE(xas, &ent->mkeys, 0);
+- void *curr;
++ unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
++ struct mlx5_mkeys_page *page;
+
+- if (limit_pendings &&
+- (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
+- return -EAGAIN;
+-
+- while (1) {
+- /*
+- * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
+- * doesn't transparently unlock. Instead we set the xas index to
+- * the current value of reserved every iteration.
+- */
+- xas_set(&xas, ent->reserved);
+- curr = xas_load(&xas);
+- if (!curr) {
+- if (to_store && ent->stored == ent->reserved)
+- xas_store(&xas, to_store);
+- else
+- xas_store(&xas, XA_ZERO_ENTRY);
+- if (xas_valid(&xas)) {
+- ent->reserved++;
+- if (to_store) {
+- if (ent->stored != ent->reserved)
+- __xa_store(&ent->mkeys,
+- ent->stored,
+- to_store,
+- GFP_KERNEL);
+- ent->stored++;
+- queue_adjust_cache_locked(ent);
+- WRITE_ONCE(ent->dev->cache.last_add,
+- jiffies);
+- }
+- }
+- }
+- xa_unlock_irq(&ent->mkeys);
+-
+- /*
+- * Notice xas_nomem() must always be called as it cleans
+- * up any cached allocation.
+- */
+- if (!xas_nomem(&xas, GFP_KERNEL))
+- break;
+- xa_lock_irq(&ent->mkeys);
++ lockdep_assert_held(&ent->mkeys_queue.lock);
++ if (ent->mkeys_queue.ci >=
++ ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
++ page = kzalloc(sizeof(*page), GFP_ATOMIC);
++ if (!page)
++ return -ENOMEM;
++ ent->mkeys_queue.num_pages++;
++ list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
++ } else {
++ page = list_last_entry(&ent->mkeys_queue.pages_list,
++ struct mlx5_mkeys_page, list);
+ }
+- xa_lock_irq(&ent->mkeys);
+- if (xas_error(&xas))
+- return xas_error(&xas);
+- if (WARN_ON(curr))
+- return -EINVAL;
+- return 0;
+-}
+-
+-static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
+- void *to_store)
+-{
+- int ret;
+-
+- xa_lock_irq(&ent->mkeys);
+- ret = push_mkey_locked(ent, limit_pendings, to_store);
+- xa_unlock_irq(&ent->mkeys);
+- return ret;
+-}
+-
+-static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
+-{
+- void *old;
+-
+- ent->reserved--;
+- old = __xa_erase(&ent->mkeys, ent->reserved);
+- WARN_ON(old);
+-}
+-
+-static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
+-{
+- void *old;
+
+- old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
+- WARN_ON(old);
+- ent->stored++;
++ page->mkeys[tmp] = mkey;
++ ent->mkeys_queue.ci++;
++ return 0;
+ }
+
+-static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
++static int pop_mkey_locked(struct mlx5_cache_ent *ent)
+ {
+- void *old, *xa_mkey;
+-
+- ent->stored--;
+- ent->reserved--;
++ unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
++ struct mlx5_mkeys_page *last_page;
++ u32 mkey;
+
+- if (ent->stored == ent->reserved) {
+- xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
+- WARN_ON(!xa_mkey);
+- return (u32)xa_to_value(xa_mkey);
++ lockdep_assert_held(&ent->mkeys_queue.lock);
++ last_page = list_last_entry(&ent->mkeys_queue.pages_list,
++ struct mlx5_mkeys_page, list);
++ mkey = last_page->mkeys[tmp];
++ last_page->mkeys[tmp] = 0;
++ ent->mkeys_queue.ci--;
++ if (ent->mkeys_queue.num_pages > 1 && !tmp) {
++ list_del(&last_page->list);
++ ent->mkeys_queue.num_pages--;
++ kfree(last_page);
+ }
+-
+- xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
+- GFP_KERNEL);
+- WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
+- old = __xa_erase(&ent->mkeys, ent->reserved);
+- WARN_ON(old);
+- return (u32)xa_to_value(xa_mkey);
++ return mkey;
+ }
+
+ static void create_mkey_callback(int status, struct mlx5_async_work *context)
+@@ -257,10 +194,10 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
+ if (status) {
+ create_mkey_warn(dev, status, mkey_out->out);
+ kfree(mkey_out);
+- xa_lock_irqsave(&ent->mkeys, flags);
+- undo_push_reserve_mkey(ent);
++ spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
++ ent->pending--;
+ WRITE_ONCE(dev->fill_delay, 1);
+- xa_unlock_irqrestore(&ent->mkeys, flags);
++ spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
+ mod_timer(&dev->delay_timer, jiffies + HZ);
+ return;
+ }
+@@ -269,11 +206,12 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
+ MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
+ WRITE_ONCE(dev->cache.last_add, jiffies);
+
+- xa_lock_irqsave(&ent->mkeys, flags);
+- push_to_reserved(ent, mkey_out->mkey);
++ spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
++ push_mkey_locked(ent, mkey_out->mkey);
+ /* If we are doing fill_to_high_water then keep going. */
+ queue_adjust_cache_locked(ent);
+- xa_unlock_irqrestore(&ent->mkeys, flags);
++ ent->pending--;
++ spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
+ kfree(mkey_out);
+ }
+
+@@ -329,24 +267,28 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
+ set_cache_mkc(ent, mkc);
+ async_create->ent = ent;
+
+- err = push_mkey(ent, true, NULL);
+- if (err)
++ spin_lock_irq(&ent->mkeys_queue.lock);
++ if (ent->pending >= MAX_PENDING_REG_MR) {
++ err = -EAGAIN;
+ goto free_async_create;
++ }
++ ent->pending++;
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+
+ err = mlx5_ib_create_mkey_cb(async_create);
+ if (err) {
+ mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
+- goto err_undo_reserve;
++ goto err_create_mkey;
+ }
+ }
+
+ return 0;
+
+-err_undo_reserve:
+- xa_lock_irq(&ent->mkeys);
+- undo_push_reserve_mkey(ent);
+- xa_unlock_irq(&ent->mkeys);
++err_create_mkey:
++ spin_lock_irq(&ent->mkeys_queue.lock);
++ ent->pending--;
+ free_async_create:
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ kfree(async_create);
+ return err;
+ }
+@@ -379,36 +321,36 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
+ {
+ u32 mkey;
+
+- lockdep_assert_held(&ent->mkeys.xa_lock);
+- if (!ent->stored)
++ lockdep_assert_held(&ent->mkeys_queue.lock);
++ if (!ent->mkeys_queue.ci)
+ return;
+- mkey = pop_stored_mkey(ent);
+- xa_unlock_irq(&ent->mkeys);
++ mkey = pop_mkey_locked(ent);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ }
+
+ static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
+ bool limit_fill)
+- __acquires(&ent->mkeys) __releases(&ent->mkeys)
++ __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
+ {
+ int err;
+
+- lockdep_assert_held(&ent->mkeys.xa_lock);
++ lockdep_assert_held(&ent->mkeys_queue.lock);
+
+ while (true) {
+ if (limit_fill)
+ target = ent->limit * 2;
+- if (target == ent->reserved)
++ if (target == ent->pending + ent->mkeys_queue.ci)
+ return 0;
+- if (target > ent->reserved) {
+- u32 todo = target - ent->reserved;
++ if (target > ent->pending + ent->mkeys_queue.ci) {
++ u32 todo = target - (ent->pending + ent->mkeys_queue.ci);
+
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ err = add_keys(ent, todo);
+ if (err == -EAGAIN)
+ usleep_range(3000, 5000);
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (err) {
+ if (err != -EAGAIN)
+ return err;
+@@ -436,7 +378,7 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
+ * cannot free MRs that are in use. Compute the target value for stored
+ * mkeys.
+ */
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (target < ent->in_use) {
+ err = -EINVAL;
+ goto err_unlock;
+@@ -449,12 +391,12 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
+ err = resize_available_mrs(ent, target, false);
+ if (err)
+ goto err_unlock;
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+
+ return count;
+
+ err_unlock:
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ return err;
+ }
+
+@@ -465,7 +407,8 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
+ char lbuf[20];
+ int err;
+
+- err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
++ err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
++ ent->mkeys_queue.ci + ent->in_use);
+ if (err < 0)
+ return err;
+
+@@ -494,10 +437,10 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
+ * Upon set we immediately fill the cache to high water mark implied by
+ * the limit.
+ */
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ ent->limit = var;
+ err = resize_available_mrs(ent, 0, true);
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ if (err)
+ return err;
+ return count;
+@@ -533,9 +476,9 @@ static bool someone_adding(struct mlx5_mkey_cache *cache)
+ mutex_lock(&cache->rb_lock);
+ for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
+ ent = rb_entry(node, struct mlx5_cache_ent, node);
+- xa_lock_irq(&ent->mkeys);
+- ret = ent->stored < ent->limit;
+- xa_unlock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
++ ret = ent->mkeys_queue.ci < ent->limit;
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ if (ret) {
+ mutex_unlock(&cache->rb_lock);
+ return true;
+@@ -552,26 +495,26 @@ static bool someone_adding(struct mlx5_mkey_cache *cache)
+ */
+ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
+ {
+- lockdep_assert_held(&ent->mkeys.xa_lock);
++ lockdep_assert_held(&ent->mkeys_queue.lock);
+
+ if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
+ return;
+- if (ent->stored < ent->limit) {
++ if (ent->mkeys_queue.ci < ent->limit) {
+ ent->fill_to_high_water = true;
+ mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
+ } else if (ent->fill_to_high_water &&
+- ent->reserved < 2 * ent->limit) {
++ ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
+ /*
+ * Once we start populating due to hitting a low water mark
+ * continue until we pass the high water mark.
+ */
+ mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
+- } else if (ent->stored == 2 * ent->limit) {
++ } else if (ent->mkeys_queue.ci == 2 * ent->limit) {
+ ent->fill_to_high_water = false;
+- } else if (ent->stored > 2 * ent->limit) {
++ } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
+ /* Queue deletion of excess entries */
+ ent->fill_to_high_water = false;
+- if (ent->stored != ent->reserved)
++ if (ent->pending)
+ queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
+ msecs_to_jiffies(1000));
+ else
+@@ -585,15 +528,16 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
+ struct mlx5_mkey_cache *cache = &dev->cache;
+ int err;
+
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (ent->disabled)
+ goto out;
+
+- if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
++ if (ent->fill_to_high_water &&
++ ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
+ !READ_ONCE(dev->fill_delay)) {
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ err = add_keys(ent, 1);
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (ent->disabled)
+ goto out;
+ if (err) {
+@@ -611,7 +555,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
+ msecs_to_jiffies(1000));
+ }
+ }
+- } else if (ent->stored > 2 * ent->limit) {
++ } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
+ bool need_delay;
+
+ /*
+@@ -626,11 +570,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
+ * the garbage collection work to try to run in next cycle, in
+ * order to free CPU resources to other tasks.
+ */
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ need_delay = need_resched() || someone_adding(cache) ||
+ !time_after(jiffies,
+ READ_ONCE(cache->last_add) + 300 * HZ);
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (ent->disabled)
+ goto out;
+ if (need_delay) {
+@@ -641,7 +585,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
+ queue_adjust_cache_locked(ent);
+ }
+ out:
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ }
+
+ static void delayed_cache_work_func(struct work_struct *work)
+@@ -749,25 +693,25 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ ent->in_use++;
+
+- if (!ent->stored) {
++ if (!ent->mkeys_queue.ci) {
+ queue_adjust_cache_locked(ent);
+ ent->miss++;
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ err = create_cache_mkey(ent, &mr->mmkey.key);
+ if (err) {
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ ent->in_use--;
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+ } else {
+- mr->mmkey.key = pop_stored_mkey(ent);
++ mr->mmkey.key = pop_mkey_locked(ent);
+ queue_adjust_cache_locked(ent);
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ }
+ mr->mmkey.cache_ent = ent;
+ mr->mmkey.type = MLX5_MKEY_MR;
+@@ -820,14 +764,14 @@ static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
+ u32 mkey;
+
+ cancel_delayed_work(&ent->dwork);
+- xa_lock_irq(&ent->mkeys);
+- while (ent->stored) {
+- mkey = pop_stored_mkey(ent);
+- xa_unlock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
++ while (ent->mkeys_queue.ci) {
++ mkey = pop_mkey_locked(ent);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ mlx5_core_destroy_mkey(dev->mdev, mkey);
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ }
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ }
+
+ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+@@ -852,7 +796,7 @@ static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
+ dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
+ debugfs_create_file("size", 0600, dir, ent, &size_fops);
+ debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
+- debugfs_create_ulong("cur", 0400, dir, &ent->stored);
++ debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
+ debugfs_create_u32("miss", 0600, dir, &ent->miss);
+ }
+
+@@ -874,6 +818,31 @@ static void delay_time_func(struct timer_list *t)
+ WRITE_ONCE(dev->fill_delay, 0);
+ }
+
++static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
++{
++ struct mlx5_mkeys_page *page;
++
++ page = kzalloc(sizeof(*page), GFP_KERNEL);
++ if (!page)
++ return -ENOMEM;
++ INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
++ spin_lock_init(&ent->mkeys_queue.lock);
++ list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
++ ent->mkeys_queue.num_pages++;
++ return 0;
++}
++
++static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
++{
++ struct mlx5_mkeys_page *page;
++
++ WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
++ page = list_last_entry(&ent->mkeys_queue.pages_list,
++ struct mlx5_mkeys_page, list);
++ list_del(&page->list);
++ kfree(page);
++}
++
+ struct mlx5_cache_ent *
+ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
+ struct mlx5r_cache_rb_key rb_key,
+@@ -887,7 +856,9 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
+ if (!ent)
+ return ERR_PTR(-ENOMEM);
+
+- xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
++ ret = mlx5r_mkeys_init(ent);
++ if (ret)
++ goto mkeys_err;
+ ent->rb_key = rb_key;
+ ent->dev = dev;
+ ent->is_tmp = !persistent_entry;
+@@ -895,10 +866,8 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
+ INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+
+ ret = mlx5_cache_ent_insert(&dev->cache, ent);
+- if (ret) {
+- kfree(ent);
+- return ERR_PTR(ret);
+- }
++ if (ret)
++ goto ent_insert_err;
+
+ if (persistent_entry) {
+ if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
+@@ -921,6 +890,11 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
+ }
+
+ return ent;
++ent_insert_err:
++ mlx5r_mkeys_uninit(ent);
++mkeys_err:
++ kfree(ent);
++ return ERR_PTR(ret);
+ }
+
+ static void remove_ent_work_func(struct work_struct *work)
+@@ -938,13 +912,13 @@ static void remove_ent_work_func(struct work_struct *work)
+ cur = rb_prev(cur);
+ mutex_unlock(&cache->rb_lock);
+
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (!ent->is_tmp) {
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ mutex_lock(&cache->rb_lock);
+ continue;
+ }
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+
+ clean_keys(ent->dev, ent);
+ mutex_lock(&cache->rb_lock);
+@@ -994,9 +968,9 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ mutex_unlock(&cache->rb_lock);
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ ent = rb_entry(node, struct mlx5_cache_ent, node);
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ queue_adjust_cache_locked(ent);
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ }
+
+ return 0;
+@@ -1020,9 +994,9 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
+ mutex_lock(&dev->cache.rb_lock);
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ ent = rb_entry(node, struct mlx5_cache_ent, node);
+- xa_lock_irq(&ent->mkeys);
++ spin_lock_irq(&ent->mkeys_queue.lock);
+ ent->disabled = true;
+- xa_unlock_irq(&ent->mkeys);
++ spin_unlock_irq(&ent->mkeys_queue.lock);
+ cancel_delayed_work_sync(&ent->dwork);
+ }
+
+@@ -1035,6 +1009,7 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
+ node = rb_next(node);
+ clean_keys(dev, ent);
+ rb_erase(&ent->node, root);
++ mlx5r_mkeys_uninit(ent);
+ kfree(ent);
+ }
+ mutex_unlock(&dev->cache.rb_lock);
+@@ -1802,7 +1777,7 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
+ int ret;
+
+ if (mr->mmkey.cache_ent) {
+- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
++ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
+ mr->mmkey.cache_ent->in_use--;
+ goto end;
+ }
+@@ -1816,7 +1791,7 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
+ return -EOPNOTSUPP;
+ }
+ mr->mmkey.cache_ent = ent;
+- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
++ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
+ mutex_unlock(&cache->rb_lock);
+ goto end;
+ }
+@@ -1828,12 +1803,11 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
+ return PTR_ERR(ent);
+
+ mr->mmkey.cache_ent = ent;
+- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
++ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
+
+ end:
+- ret = push_mkey_locked(mr->mmkey.cache_ent, false,
+- xa_mk_value(mr->mmkey.key));
+- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
++ ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
++ spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
+ return ret;
+ }
+
+diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
+index cb5cee3dee2b6..fa000182d0b41 100644
+--- a/drivers/infiniband/hw/mlx5/umr.c
++++ b/drivers/infiniband/hw/mlx5/umr.c
+@@ -332,8 +332,8 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
+
+ WARN_ON_ONCE(1);
+ mlx5_ib_warn(dev,
+- "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
+- umr_context.status);
++ "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
++ umr_context.status, mkey);
+ mutex_lock(&umrc->lock);
+ err = mlx5r_umr_recover(dev);
+ mutex_unlock(&umrc->lock);
+--
+2.39.5
+
--- /dev/null
+From dee0c2d2ab0dbb79d87e227f8b4136f1764cefb4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Jan 2023 00:28:05 +0200
+Subject: RDMA/mlx5: Introduce mlx5r_cache_rb_key
+
+From: Michael Guralnik <michaelgur@nvidia.com>
+
+[ Upstream commit 73d09b2fe8336f5f37935e46418666ddbcd3c343 ]
+
+Switch from using the mkey order to using the new struct as the key to the
+RB tree of cache entries.
+
+The key consists of all the mkey properties that UMR operations can't
+modify. Use this key to define the cache entries and to search for and
+create cache mkeys.
+
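+For orientation, a hedged, self-contained model of the comparison order
+used by the new key (field names follow the hunk below; the values in
+main() are made up): every UMR-immutable property must match exactly,
+and ndescs is compared last so a tree descent can fall back to the
+closest larger size among otherwise-identical entries.
+
+  #include <stdio.h>
+
+  struct cache_rb_key {
+          unsigned int ats;
+          unsigned int access_mode;
+          unsigned int access_flags;
+          unsigned int ndescs;
+  };
+
+  static int key_cmp(struct cache_rb_key k1, struct cache_rb_key k2)
+  {
+          int res;
+
+          res = k1.ats - k2.ats;
+          if (res)
+                  return res;
+          res = k1.access_mode - k2.access_mode;
+          if (res)
+                  return res;
+          res = k1.access_flags - k2.access_flags;
+          if (res)
+                  return res;
+          return k1.ndescs - k2.ndescs;   /* size kept last as the tie-breaker */
+  }
+
+  int main(void)
+  {
+          struct cache_rb_key want = { .access_mode = 1, .ndescs = 500 };
+          struct cache_rb_key ent  = { .access_mode = 1, .ndescs = 512 };
+
+          /* > 0: 'ent' sorts after 'want', i.e. a usable larger entry */
+          printf("cmp = %d\n", key_cmp(ent, want));
+          return 0;
+  }
+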
+Link: https://lore.kernel.org/r/20230125222807.6921-5-michaelgur@nvidia.com
+Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 27 ++--
+ drivers/infiniband/hw/mlx5/mr.c | 228 +++++++++++++++++++--------
+ drivers/infiniband/hw/mlx5/odp.c | 30 ++--
+ 3 files changed, 201 insertions(+), 84 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index bd998ac8c29c1..7c9d5648947e9 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -637,6 +637,13 @@ enum mlx5_mkey_type {
+ MLX5_MKEY_INDIRECT_DEVX,
+ };
+
++struct mlx5r_cache_rb_key {
++ u8 ats:1;
++ unsigned int access_mode;
++ unsigned int access_flags;
++ unsigned int ndescs;
++};
++
+ struct mlx5_ib_mkey {
+ u32 key;
+ enum mlx5_mkey_type type;
+@@ -757,11 +764,9 @@ struct mlx5_cache_ent {
+ unsigned long reserved;
+
+ char name[4];
+- u32 order;
+- u32 access_mode;
+- unsigned int ndescs;
+
+ struct rb_node node;
++ struct mlx5r_cache_rb_key rb_key;
+
+ u8 disabled:1;
+ u8 fill_to_high_water:1;
+@@ -1340,14 +1345,13 @@ int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
+ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
+ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
+ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+- int order);
++ struct mlx5r_cache_rb_key rb_key,
++ bool persistent_entry);
+
+ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+- struct mlx5_cache_ent *ent,
+- int access_flags);
++ int access_flags, int access_mode,
++ int ndescs);
+
+-struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, u32 order,
+- int access_flags);
+ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
+ struct ib_mr_status *mr_status);
+ struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
+@@ -1370,7 +1374,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
+ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
+ int __init mlx5_ib_odp_init(void);
+ void mlx5_ib_odp_cleanup(void);
+-void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent);
++int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev);
+ void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags);
+
+@@ -1389,7 +1393,10 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
+ static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
+ static inline int mlx5_ib_odp_init(void) { return 0; }
+ static inline void mlx5_ib_odp_cleanup(void) {}
+-static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {}
++static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
++{
++ return 0;
++}
+ static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags) {}
+
+diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
+index b3d83920d3cfb..1060b30a837a0 100644
+--- a/drivers/infiniband/hw/mlx5/mr.c
++++ b/drivers/infiniband/hw/mlx5/mr.c
+@@ -292,11 +292,13 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
+ set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
+ MLX5_SET(mkc, mkc, free, 1);
+ MLX5_SET(mkc, mkc, umr_en, 1);
+- MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
+- MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
++ MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
++ MLX5_SET(mkc, mkc, access_mode_4_2,
++ (ent->rb_key.access_mode >> 2) & 0x7);
+
+ MLX5_SET(mkc, mkc, translations_octword_size,
+- get_mkc_octo_size(ent->access_mode, ent->ndescs));
++ get_mkc_octo_size(ent->rb_key.access_mode,
++ ent->rb_key.ndescs));
+ MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
+ }
+
+@@ -594,8 +596,8 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
+ if (err != -EAGAIN) {
+ mlx5_ib_warn(
+ dev,
+- "command failed order %d, err %d\n",
+- ent->order, err);
++ "add keys command failed, err %d\n",
++ err);
+ queue_delayed_work(cache->wq, &ent->dwork,
+ msecs_to_jiffies(1000));
+ }
+@@ -641,22 +643,49 @@ static void delayed_cache_work_func(struct work_struct *work)
+ __cache_work_func(ent);
+ }
+
++static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
++ struct mlx5r_cache_rb_key key2)
++{
++ int res;
++
++ res = key1.ats - key2.ats;
++ if (res)
++ return res;
++
++ res = key1.access_mode - key2.access_mode;
++ if (res)
++ return res;
++
++ res = key1.access_flags - key2.access_flags;
++ if (res)
++ return res;
++
++ /*
++ * keep ndescs the last in the compare table since the find function
++ * searches for an exact match on all properties and only closest
++ * match in size.
++ */
++ return key1.ndescs - key2.ndescs;
++}
++
+ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
+ struct mlx5_cache_ent *ent)
+ {
+ struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
+ struct mlx5_cache_ent *cur;
++ int cmp;
+
+ mutex_lock(&cache->rb_lock);
+ /* Figure out where to put new node */
+ while (*new) {
+ cur = rb_entry(*new, struct mlx5_cache_ent, node);
+ parent = *new;
+- if (ent->order < cur->order)
++ cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
++ if (cmp > 0)
+ new = &((*new)->rb_left);
+- if (ent->order > cur->order)
++ if (cmp < 0)
+ new = &((*new)->rb_right);
+- if (ent->order == cur->order) {
++ if (cmp == 0) {
+ mutex_unlock(&cache->rb_lock);
+ return -EEXIST;
+ }
+@@ -670,40 +699,45 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
+ return 0;
+ }
+
+-static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
+- unsigned int order)
++static struct mlx5_cache_ent *
++mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
++ struct mlx5r_cache_rb_key rb_key)
+ {
+ struct rb_node *node = dev->cache.rb_root.rb_node;
+ struct mlx5_cache_ent *cur, *smallest = NULL;
++ int cmp;
+
+ /*
+ * Find the smallest ent with order >= requested_order.
+ */
+ while (node) {
+ cur = rb_entry(node, struct mlx5_cache_ent, node);
+- if (cur->order > order) {
++ cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
++ if (cmp > 0) {
+ smallest = cur;
+ node = node->rb_left;
+ }
+- if (cur->order < order)
++ if (cmp < 0)
+ node = node->rb_right;
+- if (cur->order == order)
++ if (cmp == 0)
+ return cur;
+ }
+
+- return smallest;
++ return (smallest &&
++ smallest->rb_key.access_mode == rb_key.access_mode &&
++ smallest->rb_key.access_flags == rb_key.access_flags &&
++ smallest->rb_key.ats == rb_key.ats) ?
++ smallest :
++ NULL;
+ }
+
+-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+- struct mlx5_cache_ent *ent,
+- int access_flags)
++static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
++ struct mlx5_cache_ent *ent,
++ int access_flags)
+ {
+ struct mlx5_ib_mr *mr;
+ int err;
+
+- if (!mlx5r_umr_can_reconfig(dev, 0, access_flags))
+- return ERR_PTR(-EOPNOTSUPP);
+-
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+@@ -734,12 +768,44 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+ return mr;
+ }
+
+-struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev,
+- u32 order, int access_flags)
++static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
++ int access_flags)
++{
++ int ret = 0;
++
++ if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
++ MLX5_CAP_GEN(dev->mdev, atomic) &&
++ MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
++ ret |= IB_ACCESS_REMOTE_ATOMIC;
++
++ if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
++ MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
++ !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
++ ret |= IB_ACCESS_RELAXED_ORDERING;
++
++ if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
++ MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
++ !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
++ ret |= IB_ACCESS_RELAXED_ORDERING;
++
++ return ret;
++}
++
++struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
++ int access_flags, int access_mode,
++ int ndescs)
+ {
+- struct mlx5_cache_ent *ent = mkey_cache_ent_from_order(dev, order);
++ struct mlx5r_cache_rb_key rb_key = {
++ .ndescs = ndescs,
++ .access_mode = access_mode,
++ .access_flags = get_unchangeable_access_flags(dev, access_flags)
++ };
++ struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
+
+- return mlx5_mr_cache_alloc(dev, ent, access_flags);
++ if (!ent)
++ return ERR_PTR(-EOPNOTSUPP);
++
++ return _mlx5_mr_cache_alloc(dev, ent, access_flags);
+ }
+
+ static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
+@@ -766,28 +832,32 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+ dev->cache.fs_root = NULL;
+ }
+
++static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
++ struct mlx5_cache_ent *ent)
++{
++ int order = order_base_2(ent->rb_key.ndescs);
++ struct dentry *dir;
++
++ if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
++ order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
++
++ sprintf(ent->name, "%d", order);
++ dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
++ debugfs_create_file("size", 0600, dir, ent, &size_fops);
++ debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
++ debugfs_create_ulong("cur", 0400, dir, &ent->stored);
++ debugfs_create_u32("miss", 0600, dir, &ent->miss);
++}
++
+ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
+ {
++ struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
+ struct mlx5_mkey_cache *cache = &dev->cache;
+- struct mlx5_cache_ent *ent;
+- struct dentry *dir;
+- int i;
+
+ if (!mlx5_debugfs_root || dev->is_rep)
+ return;
+
+- dir = mlx5_debugfs_get_dev_root(dev->mdev);
+- cache->fs_root = debugfs_create_dir("mr_cache", dir);
+-
+- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
+- ent = mkey_cache_ent_from_order(dev, i);
+- sprintf(ent->name, "%d", ent->order);
+- dir = debugfs_create_dir(ent->name, cache->fs_root);
+- debugfs_create_file("size", 0600, dir, ent, &size_fops);
+- debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
+- debugfs_create_ulong("cur", 0400, dir, &ent->stored);
+- debugfs_create_u32("miss", 0600, dir, &ent->miss);
+- }
++ cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
+ }
+
+ static void delay_time_func(struct timer_list *t)
+@@ -798,9 +868,11 @@ static void delay_time_func(struct timer_list *t)
+ }
+
+ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+- int order)
++ struct mlx5r_cache_rb_key rb_key,
++ bool persistent_entry)
+ {
+ struct mlx5_cache_ent *ent;
++ int order;
+ int ret;
+
+ ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+@@ -808,7 +880,7 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+ return ERR_PTR(-ENOMEM);
+
+ xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
+- ent->order = order;
++ ent->rb_key = rb_key;
+ ent->dev = dev;
+
+ INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+@@ -818,13 +890,36 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+ kfree(ent);
+ return ERR_PTR(ret);
+ }
++
++ if (persistent_entry) {
++ if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
++ order = MLX5_IMR_KSM_CACHE_ENTRY;
++ else
++ order = order_base_2(rb_key.ndescs) - 2;
++
++ if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
++ !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
++ mlx5r_umr_can_load_pas(dev, 0))
++ ent->limit = dev->mdev->profile.mr_cache[order].limit;
++ else
++ ent->limit = 0;
++
++ mlx5_mkey_cache_debugfs_add_ent(dev, ent);
++ }
++
+ return ent;
+ }
+
+ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+ {
+ struct mlx5_mkey_cache *cache = &dev->cache;
++ struct rb_root *root = &dev->cache.rb_root;
++ struct mlx5r_cache_rb_key rb_key = {
++ .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
++ };
+ struct mlx5_cache_ent *ent;
++ struct rb_node *node;
++ int ret;
+ int i;
+
+ mutex_init(&dev->slow_path_mutex);
+@@ -838,33 +933,32 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+
+ mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
+ timer_setup(&dev->delay_timer, delay_time_func, 0);
+- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
+- ent = mlx5r_cache_create_ent(dev, i);
+-
+- if (i > MKEY_CACHE_LAST_STD_ENTRY) {
+- mlx5_odp_init_mkey_cache_entry(ent);
+- continue;
++ mlx5_mkey_cache_debugfs_init(dev);
++ for (i = 0; i <= mkey_cache_max_order(dev); i++) {
++ rb_key.ndescs = 1 << (i + 2);
++ ent = mlx5r_cache_create_ent(dev, rb_key, true);
++ if (IS_ERR(ent)) {
++ ret = PTR_ERR(ent);
++ goto err;
+ }
++ }
+
+- if (ent->order > mkey_cache_max_order(dev))
+- continue;
++ ret = mlx5_odp_init_mkey_cache(dev);
++ if (ret)
++ goto err;
+
+- ent->ndescs = 1 << ent->order;
+- ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+- if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
+- !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
+- mlx5r_umr_can_load_pas(dev, 0))
+- ent->limit = dev->mdev->profile.mr_cache[i].limit;
+- else
+- ent->limit = 0;
++ for (node = rb_first(root); node; node = rb_next(node)) {
++ ent = rb_entry(node, struct mlx5_cache_ent, node);
+ xa_lock_irq(&ent->mkeys);
+ queue_adjust_cache_locked(ent);
+ xa_unlock_irq(&ent->mkeys);
+ }
+
+- mlx5_mkey_cache_debugfs_init(dev);
+-
+ return 0;
++
++err:
++ mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
++ return ret;
+ }
+
+ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
+@@ -965,7 +1059,7 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
+ static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
+ {
+ if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+- return MKEY_CACHE_LAST_STD_ENTRY + 2;
++ return MKEY_CACHE_LAST_STD_ENTRY;
+ return MLX5_MAX_UMR_SHIFT;
+ }
+
+@@ -995,6 +1089,9 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
+ struct ib_umem *umem, u64 iova,
+ int access_flags)
+ {
++ struct mlx5r_cache_rb_key rb_key = {
++ .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
++ };
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_cache_ent *ent;
+ struct mlx5_ib_mr *mr;
+@@ -1007,8 +1104,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
+ 0, iova);
+ if (WARN_ON(!page_size))
+ return ERR_PTR(-EINVAL);
+- ent = mkey_cache_ent_from_order(
+- dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
++
++ rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
++ rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
++ rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
++ ent = mkey_cache_ent_from_rb_key(dev, rb_key);
+ /*
+ * Matches access in alloc_cache_mr(). If the MR can't come from the
+ * cache then synchronously create an uncached one.
+@@ -1022,7 +1122,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
+ return mr;
+ }
+
+- mr = mlx5_mr_cache_alloc(dev, ent, access_flags);
++ mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
+ if (IS_ERR(mr))
+ return mr;
+
+@@ -1452,7 +1552,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
+ mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
+ if (WARN_ON(!*page_size))
+ return false;
+- return (1ULL << mr->mmkey.cache_ent->order) >=
++ return (mr->mmkey.cache_ent->rb_key.ndescs) >=
+ ib_umem_num_dma_blocks(new_umem, *page_size);
+ }
+
+diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
+index 7f68940ca0d1e..96d4faabbff8a 100644
+--- a/drivers/infiniband/hw/mlx5/odp.c
++++ b/drivers/infiniband/hw/mlx5/odp.c
+@@ -406,7 +406,6 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
+ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
+ unsigned long idx)
+ {
+- int order = order_base_2(MLX5_IMR_MTT_ENTRIES);
+ struct mlx5_ib_dev *dev = mr_to_mdev(imr);
+ struct ib_umem_odp *odp;
+ struct mlx5_ib_mr *mr;
+@@ -419,8 +418,9 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
+ if (IS_ERR(odp))
+ return ERR_CAST(odp);
+
+- BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);
+- mr = mlx5_mr_cache_alloc_order(dev, order, imr->access_flags);
++ mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
++ MLX5_MKC_ACCESS_MODE_MTT,
++ MLX5_IMR_MTT_ENTRIES);
+ if (IS_ERR(mr)) {
+ ib_umem_odp_release(odp);
+ return mr;
+@@ -494,8 +494,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+ if (IS_ERR(umem_odp))
+ return ERR_CAST(umem_odp);
+
+- imr = mlx5_mr_cache_alloc_order(dev, MLX5_IMR_KSM_CACHE_ENTRY,
+- access_flags);
++ imr = mlx5_mr_cache_alloc(dev, access_flags, MLX5_MKC_ACCESS_MODE_KSM,
++ mlx5_imr_ksm_entries);
+ if (IS_ERR(imr)) {
+ ib_umem_odp_release(umem_odp);
+ return imr;
+@@ -1591,12 +1591,22 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+ return err;
+ }
+
+-void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
++int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
+ {
+- if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+- return;
+- ent->ndescs = mlx5_imr_ksm_entries;
+- ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
++ struct mlx5r_cache_rb_key rb_key = {
++ .access_mode = MLX5_MKC_ACCESS_MODE_KSM,
++ .ndescs = mlx5_imr_ksm_entries,
++ };
++ struct mlx5_cache_ent *ent;
++
++ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
++ return 0;
++
++ ent = mlx5r_cache_create_ent(dev, rb_key, true);
++ if (IS_ERR(ent))
++ return PTR_ERR(ent);
++
++ return 0;
+ }
+
+ static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
+--
+2.39.5
+
--- /dev/null
+From 31e1b4f44049773843852197aab66262fea5d3ca Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 5 Jun 2023 13:14:05 +0300
+Subject: RDMA/mlx5: Reduce QP table exposure
+
+From: Leon Romanovsky <leonro@nvidia.com>
+
+[ Upstream commit 2ecfd946169e7f56534db2a5f6935858be3005ba ]
+
+driver.h is a common header for the whole mlx5 code base, but struct
+mlx5_qp_table is used only by the mlx5_ib driver. So move that struct
+to be under the sole responsibility of mlx5_ib.
+
+Link: https://lore.kernel.org/r/bec0dc1158e795813b135d1143147977f26bf668.1685953497.git.leon@kernel.org
+Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
+Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 +
+ drivers/infiniband/hw/mlx5/qp.h | 11 ++++++++++-
+ include/linux/mlx5/driver.h | 9 ---------
+ 3 files changed, 11 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index 024d2071c6a5d..5c533023a51a4 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -25,6 +25,7 @@
+ #include <rdma/mlx5_user_ioctl_verbs.h>
+
+ #include "srq.h"
++#include "qp.h"
+
+ #define mlx5_ib_dbg(_dev, format, arg...) \
+ dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \
+diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
+index fb2f4e030bb8f..e677fa0ca4226 100644
+--- a/drivers/infiniband/hw/mlx5/qp.h
++++ b/drivers/infiniband/hw/mlx5/qp.h
+@@ -6,7 +6,16 @@
+ #ifndef _MLX5_IB_QP_H
+ #define _MLX5_IB_QP_H
+
+-#include "mlx5_ib.h"
++struct mlx5_ib_dev;
++
++struct mlx5_qp_table {
++ struct notifier_block nb;
++
++ /* protect radix tree
++ */
++ spinlock_t lock;
++ struct radix_tree_root tree;
++};
+
+ int mlx5_init_qp_table(struct mlx5_ib_dev *dev);
+ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev);
+diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
+index 6cea62ca76d6b..060610183fdf9 100644
+--- a/include/linux/mlx5/driver.h
++++ b/include/linux/mlx5/driver.h
+@@ -440,15 +440,6 @@ struct mlx5_core_health {
+ struct delayed_work update_fw_log_ts_work;
+ };
+
+-struct mlx5_qp_table {
+- struct notifier_block nb;
+-
+- /* protect radix tree
+- */
+- spinlock_t lock;
+- struct radix_tree_root tree;
+-};
+-
+ enum {
+ MLX5_PF_NOTIFY_DISABLE_VF,
+ MLX5_PF_NOTIFY_ENABLE_VF,
+--
+2.39.5
+
--- /dev/null
+From f1cf3c129548533fa9dc9569a22ff1ed3e3c9e02 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Jan 2023 00:28:03 +0200
+Subject: RDMA/mlx5: Remove implicit ODP cache entry
+
+From: Aharon Landau <aharonl@nvidia.com>
+
+[ Upstream commit 18b1746bddf5e7f6b2618966596d9517172a5cd7 ]
+
+The implicit ODP mkey doesn't have unique properties; it shares the same
+properties as the order-18 cache entry. There is no need to devote a
+special entry to it.
+
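+As a hedged worked example (assuming the usual 4K PAGE_SIZE, under which
+MLX5_IMR_MTT_ENTRIES is 2^18), the implicit-ODP child MR now simply lands
+on an existing standard cache entry, as in the hunk below:
+
+  int order = order_base_2(MLX5_IMR_MTT_ENTRIES);    /* 2^18 entries -> order 18 */
+
+  BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);   /* 18 <= 20: a standard entry */
+  mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order], imr->access_flags);
+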
+Link: https://lore.kernel.org/r/20230125222807.6921-3-michaelgur@nvidia.com
+Signed-off-by: Aharon Landau <aharonl@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/odp.c | 20 +++++---------------
+ include/linux/mlx5/driver.h | 1 -
+ 2 files changed, 5 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
+index a5c9baec8be85..5f0a17382de73 100644
+--- a/drivers/infiniband/hw/mlx5/odp.c
++++ b/drivers/infiniband/hw/mlx5/odp.c
+@@ -406,6 +406,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
+ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
+ unsigned long idx)
+ {
++ int order = order_base_2(MLX5_IMR_MTT_ENTRIES);
+ struct mlx5_ib_dev *dev = mr_to_mdev(imr);
+ struct ib_umem_odp *odp;
+ struct mlx5_ib_mr *mr;
+@@ -418,7 +419,8 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
+ if (IS_ERR(odp))
+ return ERR_CAST(odp);
+
+- mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[MLX5_IMR_MTT_CACHE_ENTRY],
++ BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);
++ mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order],
+ imr->access_flags);
+ if (IS_ERR(mr)) {
+ ib_umem_odp_release(odp);
+@@ -1595,20 +1597,8 @@ void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
+ {
+ if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+ return;
+-
+- switch (ent->order - 2) {
+- case MLX5_IMR_MTT_CACHE_ENTRY:
+- ent->ndescs = MLX5_IMR_MTT_ENTRIES;
+- ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+- ent->limit = 0;
+- break;
+-
+- case MLX5_IMR_KSM_CACHE_ENTRY:
+- ent->ndescs = mlx5_imr_ksm_entries;
+- ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
+- ent->limit = 0;
+- break;
+- }
++ ent->ndescs = mlx5_imr_ksm_entries;
++ ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
+ }
+
+ static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
+diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
+index 3c3e0f26c2446..6cea62ca76d6b 100644
+--- a/include/linux/mlx5/driver.h
++++ b/include/linux/mlx5/driver.h
+@@ -744,7 +744,6 @@ enum {
+
+ enum {
+ MKEY_CACHE_LAST_STD_ENTRY = 20,
+- MLX5_IMR_MTT_CACHE_ENTRY,
+ MLX5_IMR_KSM_CACHE_ENTRY,
+ MAX_MKEY_CACHE_ENTRIES
+ };
+--
+2.39.5
+
--- /dev/null
+From 724a16cdc34e854b70a5dda60258077b0d252ad1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 17 Feb 2025 10:16:28 +0800
+Subject: scsi: core: Clear driver private data when retrying request
+
+From: Ye Bin <yebin10@huawei.com>
+
+[ Upstream commit dce5c4afd035e8090a26e5d776b1682c0e649683 ]
+
+After commit 1bad6c4a57ef ("scsi: zero per-cmd private driver data for each
+MQ I/O"), the xen-scsifront/virtio_scsi/snic drivers all removed code that
+explicitly zeroed driver-private command data.
+
+In combination with commit 464a00c9e0ad ("scsi: core: Kill DRIVER_SENSE"),
+after virtio_scsi performs a capacity expansion the first request returns
+a unit attention (UA) to indicate that the capacity has changed, and the
+original command is then retried. Because the driver-private command data
+was not cleared, the retried request would return the UA again and
+eventually time out and fail.
+
+Zero driver-private command data when a request is retried.
+
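+A hedged sketch of the new placement in scsi_queue_rq() (condensed from
+the hunk below; error handling trimmed): the zeroing now runs on every
+dispatch, before the RQF_DONTPREP check, so a retried command that skips
+scsi_prepare_cmd() also starts with clean LLD-private data.
+
+  /* LLD-private data lives directly after struct scsi_cmnd */
+  if (shost->hostt->cmd_size && !shost->hostt->init_cmd_priv)
+          memset(cmd + 1, 0, shost->hostt->cmd_size);
+
+  if (!(req->rq_flags & RQF_DONTPREP))
+          ret = scsi_prepare_cmd(req);    /* runs on the first submission only */
+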
+Fixes: f7de50da1479 ("scsi: xen-scsifront: Remove code that zeroes driver-private command data")
+Fixes: c2bb87318baa ("scsi: virtio_scsi: Remove code that zeroes driver-private command data")
+Fixes: c3006a926468 ("scsi: snic: Remove code that zeroes driver-private command data")
+Signed-off-by: Ye Bin <yebin10@huawei.com>
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Link: https://lore.kernel.org/r/20250217021628.2929248-1-yebin@huaweicloud.com
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/scsi_lib.c | 14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
+index 72d31b2267ef4..8e75eb1b6eab8 100644
+--- a/drivers/scsi/scsi_lib.c
++++ b/drivers/scsi/scsi_lib.c
+@@ -1579,13 +1579,6 @@ static blk_status_t scsi_prepare_cmd(struct request *req)
+ if (in_flight)
+ __set_bit(SCMD_STATE_INFLIGHT, &cmd->state);
+
+- /*
+- * Only clear the driver-private command data if the LLD does not supply
+- * a function to initialize that data.
+- */
+- if (!shost->hostt->init_cmd_priv)
+- memset(cmd + 1, 0, shost->hostt->cmd_size);
+-
+ cmd->prot_op = SCSI_PROT_NORMAL;
+ if (blk_rq_bytes(req))
+ cmd->sc_data_direction = rq_dma_dir(req);
+@@ -1747,6 +1740,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
+ if (!scsi_host_queue_ready(q, shost, sdev, cmd))
+ goto out_dec_target_busy;
+
++ /*
++ * Only clear the driver-private command data if the LLD does not supply
++ * a function to initialize that data.
++ */
++ if (shost->hostt->cmd_size && !shost->hostt->init_cmd_priv)
++ memset(cmd + 1, 0, shost->hostt->cmd_size);
++
+ if (!(req->rq_flags & RQF_DONTPREP)) {
+ ret = scsi_prepare_cmd(req);
+ if (ret != BLK_STS_OK)
+--
+2.39.5
+
spi-atmel-quadspi-avoid-overwriting-delay-register-settings.patch
spi-atmel-quadspi-fix-wrong-register-value-written-to-mr.patch
netfilter-allow-exp-not-to-be-removed-in-nf_ct_find_expectation.patch
+rdma-mlx5-don-t-keep-umrable-page_shift-in-cache-ent.patch
+rdma-mlx5-remove-implicit-odp-cache-entry.patch
+rdma-mlx5-change-the-cache-structure-to-an-rb-tree.patch
+rdma-mlx5-introduce-mlx5r_cache_rb_key.patch
+rdma-mlx5-cache-all-user-cacheable-mkeys-on-dereg-mr.patch
+rdma-mlx5-add-work-to-remove-temporary-entries-from-.patch
+rdma-mlx5-implement-mkeys-management-via-lifo-queue.patch
+rdma-mlx5-fix-the-recovery-flow-of-the-umr-qp.patch
+ib-mlx5-set-and-get-correct-qp_num-for-a-dct-qp.patch
+ovl-fix-uaf-in-ovl_dentry_update_reval-by-moving-dpu.patch
+sunrpc-convert-rpc_task_-constants-to-enum.patch
+sunrpc-prevent-looping-due-to-rpc_signal_task-races.patch
+rdma-mlx-calling-qp-event-handler-in-workqueue-conte.patch
+rdma-mlx5-reduce-qp-table-exposure.patch
+ib-core-add-support-for-xdr-link-speed.patch
+rdma-mlx5-fix-ah-static-rate-parsing.patch
+scsi-core-clear-driver-private-data-when-retrying-re.patch
+rdma-mlx5-fix-bind-qp-error-cleanup-flow.patch
+sunrpc-suppress-warnings-for-unused-procfs-functions.patch
--- /dev/null
+From 2efb3833aaf3c7ef2d5028b56fb87338ed97ed8d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 19 Aug 2024 08:58:59 -0700
+Subject: SUNRPC: convert RPC_TASK_* constants to enum
+
+From: Stephen Brennan <stephen.s.brennan@oracle.com>
+
+[ Upstream commit 0b108e83795c9c23101f584ef7e3ab4f1f120ef0 ]
+
+The RPC_TASK_* constants are defined as macros, which means that most
+kernel builds will not contain their definitions in the debuginfo.
+However, it's quite useful for debuggers to be able to view the task
+state constant and interpret it correctly. Conversion to an enum will
+ensure the constants are present in debuginfo and can be interpreted by
+debuggers without needing to hard-code them and track their changes.
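+
+As a toy illustration (not part of this patch), an object-like macro is gone
+after preprocessing, while an enumerator survives into DWARF where a debugger
+can look it up by name:
+
+    #define EXAMPLE_MACRO_STATE 0     /* invisible in typical debuginfo */
+    enum { EXAMPLE_ENUM_STATE = 0 };  /* emitted as a DWARF enumerator */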
+
+Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
+Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
+Stable-dep-of: 5bbd6e863b15 ("SUNRPC: Prevent looping due to rpc_signal_task() races")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/sunrpc/sched.h | 16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
+index 8f9bee0e21c3b..f80b90aca380a 100644
+--- a/include/linux/sunrpc/sched.h
++++ b/include/linux/sunrpc/sched.h
+@@ -140,13 +140,15 @@ struct rpc_task_setup {
+ #define RPC_WAS_SENT(t) ((t)->tk_flags & RPC_TASK_SENT)
+ #define RPC_IS_MOVEABLE(t) ((t)->tk_flags & RPC_TASK_MOVEABLE)
+
+-#define RPC_TASK_RUNNING 0
+-#define RPC_TASK_QUEUED 1
+-#define RPC_TASK_ACTIVE 2
+-#define RPC_TASK_NEED_XMIT 3
+-#define RPC_TASK_NEED_RECV 4
+-#define RPC_TASK_MSG_PIN_WAIT 5
+-#define RPC_TASK_SIGNALLED 6
++enum {
++ RPC_TASK_RUNNING,
++ RPC_TASK_QUEUED,
++ RPC_TASK_ACTIVE,
++ RPC_TASK_NEED_XMIT,
++ RPC_TASK_NEED_RECV,
++ RPC_TASK_MSG_PIN_WAIT,
++ RPC_TASK_SIGNALLED,
++};
+
+ #define rpc_test_and_set_running(t) \
+ test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
+--
+2.39.5
+
--- /dev/null
+From c19996437d22cce7a5e52b63d46399ef5d7795be Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 1 Feb 2025 15:00:02 -0500
+Subject: SUNRPC: Prevent looping due to rpc_signal_task() races
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit 5bbd6e863b15a85221e49b9bdb2d5d8f0bb91f3d ]
+
+If rpc_signal_task() is called while a task is in an rpc_call_done()
+callback function, and the latter calls rpc_restart_call(), the task can
+end up looping due to the RPC_TASK_SIGNALLED flag being set without the
+tk_rpc_status being set.
+Removing the redundant mechanism for signalling the task fixes the
+looping behaviour.
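+
+A condensed sketch of the check before and after this change (simplified;
+not the exact code in net/sunrpc/sched.c):
+
+    /* before: RPC_SIGNALLED() tested a separate bit in tk_runstate */
+    signalled = test_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
+
+    /*
+     * after: tk_rpc_status is the single source of truth, so a task whose
+     * rpc_call_done() callback restarts the call cannot keep observing a
+     * stale "signalled" state
+     */
+    signalled = READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS;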
+
+Reported-by: Li Lingfeng <lilingfeng3@huawei.com>
+Fixes: 39494194f93b ("SUNRPC: Fix races with rpc_killall_tasks()")
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Reviewed-by: Jeff Layton <jlayton@kernel.org>
+Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/sunrpc/sched.h | 3 +--
+ include/trace/events/sunrpc.h | 3 +--
+ net/sunrpc/sched.c | 2 --
+ 3 files changed, 2 insertions(+), 6 deletions(-)
+
+diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
+index f80b90aca380a..a220b28904ca5 100644
+--- a/include/linux/sunrpc/sched.h
++++ b/include/linux/sunrpc/sched.h
+@@ -147,7 +147,6 @@ enum {
+ RPC_TASK_NEED_XMIT,
+ RPC_TASK_NEED_RECV,
+ RPC_TASK_MSG_PIN_WAIT,
+- RPC_TASK_SIGNALLED,
+ };
+
+ #define rpc_test_and_set_running(t) \
+@@ -160,7 +159,7 @@ enum {
+
+ #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate)
+
+-#define RPC_SIGNALLED(t) test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate)
++#define RPC_SIGNALLED(t) (READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS)
+
+ /*
+ * Task priorities.
+diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
+index ffe2679a13ced..b70f47a57bf6d 100644
+--- a/include/trace/events/sunrpc.h
++++ b/include/trace/events/sunrpc.h
+@@ -328,8 +328,7 @@ TRACE_EVENT(rpc_request,
+ { (1UL << RPC_TASK_ACTIVE), "ACTIVE" }, \
+ { (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" }, \
+ { (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" }, \
+- { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }, \
+- { (1UL << RPC_TASK_SIGNALLED), "SIGNALLED" })
++ { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" })
+
+ DECLARE_EVENT_CLASS(rpc_task_running,
+
+diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
+index cef623ea15060..9b45fbdc90cab 100644
+--- a/net/sunrpc/sched.c
++++ b/net/sunrpc/sched.c
+@@ -864,8 +864,6 @@ void rpc_signal_task(struct rpc_task *task)
+ if (!rpc_task_set_rpc_status(task, -ERESTARTSYS))
+ return;
+ trace_rpc_task_signalled(task, task->tk_action);
+- set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
+- smp_mb__after_atomic();
+ queue = READ_ONCE(task->tk_waitqueue);
+ if (queue)
+ rpc_wake_up_queued_task(queue, task);
+--
+2.39.5
+
--- /dev/null
+From d7fc0a014420a4f96c0e9ad14e2749c17d329b17 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 25 Feb 2025 15:52:21 +0100
+Subject: sunrpc: suppress warnings for unused procfs functions
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+[ Upstream commit 1f7a4f98c11fbeb18ed21f3b3a497e90a50ad2e0 ]
+
+Building with W=1 and procfs disabled produces warnings about unused const variables:
+
+net/sunrpc/cache.c:1660:30: error: 'cache_flush_proc_ops' defined but not used [-Werror=unused-const-variable=]
+ 1660 | static const struct proc_ops cache_flush_proc_ops = {
+ | ^~~~~~~~~~~~~~~~~~~~
+net/sunrpc/cache.c:1622:30: error: 'content_proc_ops' defined but not used [-Werror=unused-const-variable=]
+ 1622 | static const struct proc_ops content_proc_ops = {
+ | ^~~~~~~~~~~~~~~~
+net/sunrpc/cache.c:1598:30: error: 'cache_channel_proc_ops' defined but not used [-Werror=unused-const-variable=]
+ 1598 | static const struct proc_ops cache_channel_proc_ops = {
+ | ^~~~~~~~~~~~~~~~~~~~~~
+
+These proc_ops are only referenced from code inside an #ifdef, so replacing
+that with an IS_ENABLED() check lets the compiler see how they are used while
+still dropping them during dead-code elimination.
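+
+A condensed sketch of the pattern (hypothetical example_* names and config
+symbol, not the sunrpc code itself):
+
+    struct example_ops { int (*show)(void); };
+
+    static int example_show(void) { return 0; }
+
+    static const struct example_ops example_ops_table = {
+        .show = example_show,
+    };
+
+    static int example_register(const struct example_ops *ops)
+    {
+        return ops->show();
+    }
+
+    static int example_init(void)
+    {
+        if (!IS_ENABLED(CONFIG_EXAMPLE_FEATURE))
+            return 0;
+        /*
+         * The reference below is always parsed, so example_ops_table never
+         * triggers -Wunused-const-variable; when the config option is off,
+         * the early return is a compile-time constant and dead-code
+         * elimination discards the rest.
+         */
+        return example_register(&example_ops_table);
+    }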
+
+Fixes: dbf847ecb631 ("knfsd: allow cache_register to return error on failure")
+Reviewed-by: Jeff Layton <jlayton@kernel.org>
+Acked-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sunrpc/cache.c | 10 +++-------
+ 1 file changed, 3 insertions(+), 7 deletions(-)
+
+diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
+index 94889df659f0f..7ac4648c7da7f 100644
+--- a/net/sunrpc/cache.c
++++ b/net/sunrpc/cache.c
+@@ -1675,12 +1675,14 @@ static void remove_cache_proc_entries(struct cache_detail *cd)
+ }
+ }
+
+-#ifdef CONFIG_PROC_FS
+ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
+ {
+ struct proc_dir_entry *p;
+ struct sunrpc_net *sn;
+
++ if (!IS_ENABLED(CONFIG_PROC_FS))
++ return 0;
++
+ sn = net_generic(net, sunrpc_net_id);
+ cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc);
+ if (cd->procfs == NULL)
+@@ -1708,12 +1710,6 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
+ remove_cache_proc_entries(cd);
+ return -ENOMEM;
+ }
+-#else /* CONFIG_PROC_FS */
+-static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
+-{
+- return 0;
+-}
+-#endif
+
+ void __init cache_initialize(void)
+ {
+--
+2.39.5
+