+++ /dev/null
-From 79cad3705d28ff0c133bcd85a9107d0dbbb27e72 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Wed, 20 Sep 2023 13:07:40 +0300
-Subject: IB/core: Add support for XDR link speed
-
-From: Or Har-Toov <ohartoov@nvidia.com>
-
-[ Upstream commit 703289ce43f740b0096724300107df82d008552f ]
-
-Add the new IBTA speed XDR, the rate that was added to the InfiniBand spec
-as part of XDR, supporting a signaling rate of 200Gb.
-
-In order to report that value to rdma-core, add a new u32 field to the
-query_port response.
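
For reference, a minimal sketch (not part of the patch) of how the new enum value is
consumed: the sysfs rate_show() hunk below maps each link speed to a per-lane rate in
units of 0.1 Gb/s, and XDR doubles NDR.

    #include <rdma/ib_verbs.h>

    /* Per-lane rate in tenths of Gb/s, mirroring the rate_show() mapping. */
    static int speed_to_tenths_of_gbps(enum ib_port_speed speed)
    {
            switch (speed) {
            case IB_SPEED_NDR:
                    return 1000;    /* 100 Gb/s per lane */
            case IB_SPEED_XDR:
                    return 2000;    /* 200 Gb/s per lane */
            default:
                    return 25;      /* fall back to SDR, 2.5 Gb/s */
            }
    }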
-
-Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
-Reviewed-by: Mark Zhang <markzhang@nvidia.com>
-Link: https://lore.kernel.org/r/9d235fc600a999e8274010f0e18b40fa60540e6c.1695204156.git.leon@kernel.org
-Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/core/sysfs.c | 4 ++++
- drivers/infiniband/core/uverbs_std_types_device.c | 3 ++-
- drivers/infiniband/core/verbs.c | 3 +++
- include/rdma/ib_verbs.h | 2 ++
- include/uapi/rdma/ib_user_ioctl_verbs.h | 3 ++-
- 5 files changed, 13 insertions(+), 2 deletions(-)
-
-diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
-index ec5efdc166601..9f97bef021497 100644
---- a/drivers/infiniband/core/sysfs.c
-+++ b/drivers/infiniband/core/sysfs.c
-@@ -342,6 +342,10 @@ static ssize_t rate_show(struct ib_device *ibdev, u32 port_num,
- speed = " NDR";
- rate = 1000;
- break;
-+ case IB_SPEED_XDR:
-+ speed = " XDR";
-+ rate = 2000;
-+ break;
- case IB_SPEED_SDR:
- default: /* default to SDR for invalid rates */
- speed = " SDR";
-diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
-index 049684880ae03..fb0555647336f 100644
---- a/drivers/infiniband/core/uverbs_std_types_device.c
-+++ b/drivers/infiniband/core/uverbs_std_types_device.c
-@@ -203,6 +203,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)(
-
- copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num);
- resp.port_cap_flags2 = attr.port_cap_flags2;
-+ resp.active_speed_ex = attr.active_speed;
-
- return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP,
- &resp, sizeof(resp));
-@@ -461,7 +462,7 @@ DECLARE_UVERBS_NAMED_METHOD(
- UVERBS_ATTR_PTR_OUT(
- UVERBS_ATTR_QUERY_PORT_RESP,
- UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex,
-- reserved),
-+ active_speed_ex),
- UA_MANDATORY));
-
- DECLARE_UVERBS_NAMED_METHOD(
-diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
-index b99b3cc283b65..90848546f1704 100644
---- a/drivers/infiniband/core/verbs.c
-+++ b/drivers/infiniband/core/verbs.c
-@@ -147,6 +147,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
- case IB_RATE_50_GBPS: return 20;
- case IB_RATE_400_GBPS: return 160;
- case IB_RATE_600_GBPS: return 240;
-+ case IB_RATE_800_GBPS: return 320;
- default: return -1;
- }
- }
-@@ -176,6 +177,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
- case 20: return IB_RATE_50_GBPS;
- case 160: return IB_RATE_400_GBPS;
- case 240: return IB_RATE_600_GBPS;
-+ case 320: return IB_RATE_800_GBPS;
- default: return IB_RATE_PORT_CURRENT;
- }
- }
-@@ -205,6 +207,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
- case IB_RATE_50_GBPS: return 53125;
- case IB_RATE_400_GBPS: return 425000;
- case IB_RATE_600_GBPS: return 637500;
-+ case IB_RATE_800_GBPS: return 850000;
- default: return -1;
- }
- }
-diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
-index 68fd6d22adfd4..750effb875783 100644
---- a/include/rdma/ib_verbs.h
-+++ b/include/rdma/ib_verbs.h
-@@ -557,6 +557,7 @@ enum ib_port_speed {
- IB_SPEED_EDR = 32,
- IB_SPEED_HDR = 64,
- IB_SPEED_NDR = 128,
-+ IB_SPEED_XDR = 256,
- };
-
- enum ib_stat_flag {
-@@ -836,6 +837,7 @@ enum ib_rate {
- IB_RATE_50_GBPS = 20,
- IB_RATE_400_GBPS = 21,
- IB_RATE_600_GBPS = 22,
-+ IB_RATE_800_GBPS = 23,
- };
-
- /**
-diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h
-index 7dd56210226f5..125fb9f0ef4ab 100644
---- a/include/uapi/rdma/ib_user_ioctl_verbs.h
-+++ b/include/uapi/rdma/ib_user_ioctl_verbs.h
-@@ -218,7 +218,8 @@ enum ib_uverbs_advise_mr_flag {
- struct ib_uverbs_query_port_resp_ex {
- struct ib_uverbs_query_port_resp legacy_resp;
- __u16 port_cap_flags2;
-- __u8 reserved[6];
-+ __u8 reserved[2];
-+ __u32 active_speed_ex;
- };
-
- struct ib_uverbs_qp_cap {
---
-2.39.5
-
+++ /dev/null
-From d5eccf1fd4fbdb90e3f1aba4e5ba5928ea3163c2 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Wed, 4 Jan 2023 11:43:34 +0200
-Subject: RDMA/mlx: Calling qp event handler in workqueue context
-
-From: Mark Zhang <markzhang@nvidia.com>
-
-[ Upstream commit 312b8f79eb05479628ee71357749815b2eeeeea8 ]
-
-Move the call of the QP event handler from atomic to workqueue context,
-so that the handler is able to block. This is needed by the following
-patches.
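
A minimal, self-contained sketch of the deferral pattern this patch introduces; the
struct and function names here are illustrative only, while the real mlx4/mlx5 handlers
below also translate hardware event types to IB events and manage QP reference counting.

    #include <linux/workqueue.h>
    #include <linux/slab.h>

    struct qp_event_work {
            struct work_struct work;
            int type;                       /* hardware event type */
    };

    /* Allocated at module init with alloc_ordered_workqueue(), as in the patch. */
    static struct workqueue_struct *qp_event_wq;

    static void handle_qp_event(struct work_struct *_work)
    {
            struct qp_event_work *w =
                    container_of(_work, struct qp_event_work, work);

            /* Process context: the consumer's event handler may block here. */
            kfree(w);
    }

    /* Called from atomic (EQ interrupt) context, hence GFP_ATOMIC. */
    static void qp_event(int type)
    {
            struct qp_event_work *w = kzalloc(sizeof(*w), GFP_ATOMIC);

            if (!w)
                    return;
            w->type = type;
            INIT_WORK(&w->work, handle_qp_event);
            queue_work(qp_event_wq, &w->work);
    }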
-
-Signed-off-by: Mark Zhang <markzhang@nvidia.com>
-Reviewed-by: Patrisious Haddad <phaddad@nvidia.com>
-Link: https://lore.kernel.org/r/0cd17b8331e445f03942f4bb28d447f24ac5669d.1672821186.git.leonro@nvidia.com
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx4/main.c | 8 ++
- drivers/infiniband/hw/mlx4/mlx4_ib.h | 3 +
- drivers/infiniband/hw/mlx4/qp.c | 121 +++++++++++++++++-------
- drivers/infiniband/hw/mlx5/main.c | 7 ++
- drivers/infiniband/hw/mlx5/qp.c | 119 ++++++++++++++++-------
- drivers/infiniband/hw/mlx5/qp.h | 2 +
- drivers/infiniband/hw/mlx5/qpc.c | 3 +-
- drivers/net/ethernet/mellanox/mlx4/qp.c | 14 ++-
- include/linux/mlx4/qp.h | 1 +
- include/rdma/ib_verbs.h | 2 +-
- 10 files changed, 202 insertions(+), 78 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
-index 7c3dc86ab7f04..0f0b130cc8aac 100644
---- a/drivers/infiniband/hw/mlx4/main.c
-+++ b/drivers/infiniband/hw/mlx4/main.c
-@@ -3307,6 +3307,10 @@ static int __init mlx4_ib_init(void)
- if (!wq)
- return -ENOMEM;
-
-+ err = mlx4_ib_qp_event_init();
-+ if (err)
-+ goto clean_qp_event;
-+
- err = mlx4_ib_cm_init();
- if (err)
- goto clean_wq;
-@@ -3328,6 +3332,9 @@ static int __init mlx4_ib_init(void)
- mlx4_ib_cm_destroy();
-
- clean_wq:
-+ mlx4_ib_qp_event_cleanup();
-+
-+clean_qp_event:
- destroy_workqueue(wq);
- return err;
- }
-@@ -3337,6 +3344,7 @@ static void __exit mlx4_ib_cleanup(void)
- mlx4_unregister_interface(&mlx4_ib_interface);
- mlx4_ib_mcg_destroy();
- mlx4_ib_cm_destroy();
-+ mlx4_ib_qp_event_cleanup();
- destroy_workqueue(wq);
- }
-
-diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
-index 6a3b0f121045e..17fee1e73a45a 100644
---- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
-+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
-@@ -940,4 +940,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
- int mlx4_ib_cm_init(void);
- void mlx4_ib_cm_destroy(void);
-
-+int mlx4_ib_qp_event_init(void);
-+void mlx4_ib_qp_event_cleanup(void);
-+
- #endif /* MLX4_IB_H */
-diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
-index ac479e81ddee8..9d08aa99f3cb0 100644
---- a/drivers/infiniband/hw/mlx4/qp.c
-+++ b/drivers/infiniband/hw/mlx4/qp.c
-@@ -102,6 +102,14 @@ enum mlx4_ib_source_type {
- MLX4_IB_RWQ_SRC = 1,
- };
-
-+struct mlx4_ib_qp_event_work {
-+ struct work_struct work;
-+ struct mlx4_qp *qp;
-+ enum mlx4_event type;
-+};
-+
-+static struct workqueue_struct *mlx4_ib_qp_event_wq;
-+
- static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
- {
- if (!mlx4_is_master(dev->dev))
-@@ -200,50 +208,77 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
- }
- }
-
-+static void mlx4_ib_handle_qp_event(struct work_struct *_work)
-+{
-+ struct mlx4_ib_qp_event_work *qpe_work =
-+ container_of(_work, struct mlx4_ib_qp_event_work, work);
-+ struct ib_qp *ibqp = &to_mibqp(qpe_work->qp)->ibqp;
-+ struct ib_event event = {};
-+
-+ event.device = ibqp->device;
-+ event.element.qp = ibqp;
-+
-+ switch (qpe_work->type) {
-+ case MLX4_EVENT_TYPE_PATH_MIG:
-+ event.event = IB_EVENT_PATH_MIG;
-+ break;
-+ case MLX4_EVENT_TYPE_COMM_EST:
-+ event.event = IB_EVENT_COMM_EST;
-+ break;
-+ case MLX4_EVENT_TYPE_SQ_DRAINED:
-+ event.event = IB_EVENT_SQ_DRAINED;
-+ break;
-+ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
-+ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
-+ break;
-+ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
-+ event.event = IB_EVENT_QP_FATAL;
-+ break;
-+ case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
-+ event.event = IB_EVENT_PATH_MIG_ERR;
-+ break;
-+ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
-+ event.event = IB_EVENT_QP_REQ_ERR;
-+ break;
-+ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
-+ event.event = IB_EVENT_QP_ACCESS_ERR;
-+ break;
-+ default:
-+ pr_warn("Unexpected event type %d on QP %06x\n",
-+ qpe_work->type, qpe_work->qp->qpn);
-+ goto out;
-+ }
-+
-+ ibqp->event_handler(&event, ibqp->qp_context);
-+
-+out:
-+ mlx4_put_qp(qpe_work->qp);
-+ kfree(qpe_work);
-+}
-+
- static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
- {
-- struct ib_event event;
- struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
-+ struct mlx4_ib_qp_event_work *qpe_work;
-
- if (type == MLX4_EVENT_TYPE_PATH_MIG)
- to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
-
-- if (ibqp->event_handler) {
-- event.device = ibqp->device;
-- event.element.qp = ibqp;
-- switch (type) {
-- case MLX4_EVENT_TYPE_PATH_MIG:
-- event.event = IB_EVENT_PATH_MIG;
-- break;
-- case MLX4_EVENT_TYPE_COMM_EST:
-- event.event = IB_EVENT_COMM_EST;
-- break;
-- case MLX4_EVENT_TYPE_SQ_DRAINED:
-- event.event = IB_EVENT_SQ_DRAINED;
-- break;
-- case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
-- event.event = IB_EVENT_QP_LAST_WQE_REACHED;
-- break;
-- case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
-- event.event = IB_EVENT_QP_FATAL;
-- break;
-- case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
-- event.event = IB_EVENT_PATH_MIG_ERR;
-- break;
-- case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
-- event.event = IB_EVENT_QP_REQ_ERR;
-- break;
-- case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
-- event.event = IB_EVENT_QP_ACCESS_ERR;
-- break;
-- default:
-- pr_warn("Unexpected event type %d "
-- "on QP %06x\n", type, qp->qpn);
-- return;
-- }
-+ if (!ibqp->event_handler)
-+ goto out_no_handler;
-
-- ibqp->event_handler(&event, ibqp->qp_context);
-- }
-+ qpe_work = kzalloc(sizeof(*qpe_work), GFP_ATOMIC);
-+ if (!qpe_work)
-+ goto out_no_handler;
-+
-+ qpe_work->qp = qp;
-+ qpe_work->type = type;
-+ INIT_WORK(&qpe_work->work, mlx4_ib_handle_qp_event);
-+ queue_work(mlx4_ib_qp_event_wq, &qpe_work->work);
-+ return;
-+
-+out_no_handler:
-+ mlx4_put_qp(qp);
- }
-
- static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type)
-@@ -4472,3 +4507,17 @@ void mlx4_ib_drain_rq(struct ib_qp *qp)
-
- handle_drain_completion(cq, &rdrain, dev);
- }
-+
-+int mlx4_ib_qp_event_init(void)
-+{
-+ mlx4_ib_qp_event_wq = alloc_ordered_workqueue("mlx4_ib_qp_event_wq", 0);
-+ if (!mlx4_ib_qp_event_wq)
-+ return -ENOMEM;
-+
-+ return 0;
-+}
-+
-+void mlx4_ib_qp_event_cleanup(void)
-+{
-+ destroy_workqueue(mlx4_ib_qp_event_wq);
-+}
-diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
-index 45a414e8d35fa..a22649617e017 100644
---- a/drivers/infiniband/hw/mlx5/main.c
-+++ b/drivers/infiniband/hw/mlx5/main.c
-@@ -4410,6 +4410,10 @@ static int __init mlx5_ib_init(void)
- return -ENOMEM;
- }
-
-+ ret = mlx5_ib_qp_event_init();
-+ if (ret)
-+ goto qp_event_err;
-+
- mlx5_ib_odp_init();
- ret = mlx5r_rep_init();
- if (ret)
-@@ -4427,6 +4431,8 @@ static int __init mlx5_ib_init(void)
- mp_err:
- mlx5r_rep_cleanup();
- rep_err:
-+ mlx5_ib_qp_event_cleanup();
-+qp_event_err:
- destroy_workqueue(mlx5_ib_event_wq);
- free_page((unsigned long)xlt_emergency_page);
- return ret;
-@@ -4438,6 +4444,7 @@ static void __exit mlx5_ib_cleanup(void)
- auxiliary_driver_unregister(&mlx5r_mp_driver);
- mlx5r_rep_cleanup();
-
-+ mlx5_ib_qp_event_cleanup();
- destroy_workqueue(mlx5_ib_event_wq);
- free_page((unsigned long)xlt_emergency_page);
- }
-diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
-index d782a494abcda..43c0123babd10 100644
---- a/drivers/infiniband/hw/mlx5/qp.c
-+++ b/drivers/infiniband/hw/mlx5/qp.c
-@@ -71,6 +71,14 @@ struct mlx5_modify_raw_qp_param {
- u32 port;
- };
-
-+struct mlx5_ib_qp_event_work {
-+ struct work_struct work;
-+ struct mlx5_core_qp *qp;
-+ int type;
-+};
-+
-+static struct workqueue_struct *mlx5_ib_qp_event_wq;
-+
- static void get_cqs(enum ib_qp_type qp_type,
- struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
- struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq);
-@@ -302,51 +310,78 @@ int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
- return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc);
- }
-
-+static void mlx5_ib_handle_qp_event(struct work_struct *_work)
-+{
-+ struct mlx5_ib_qp_event_work *qpe_work =
-+ container_of(_work, struct mlx5_ib_qp_event_work, work);
-+ struct ib_qp *ibqp = &to_mibqp(qpe_work->qp)->ibqp;
-+ struct ib_event event = {};
-+
-+ event.device = ibqp->device;
-+ event.element.qp = ibqp;
-+ switch (qpe_work->type) {
-+ case MLX5_EVENT_TYPE_PATH_MIG:
-+ event.event = IB_EVENT_PATH_MIG;
-+ break;
-+ case MLX5_EVENT_TYPE_COMM_EST:
-+ event.event = IB_EVENT_COMM_EST;
-+ break;
-+ case MLX5_EVENT_TYPE_SQ_DRAINED:
-+ event.event = IB_EVENT_SQ_DRAINED;
-+ break;
-+ case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
-+ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
-+ break;
-+ case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
-+ event.event = IB_EVENT_QP_FATAL;
-+ break;
-+ case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
-+ event.event = IB_EVENT_PATH_MIG_ERR;
-+ break;
-+ case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
-+ event.event = IB_EVENT_QP_REQ_ERR;
-+ break;
-+ case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
-+ event.event = IB_EVENT_QP_ACCESS_ERR;
-+ break;
-+ default:
-+ pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n",
-+ qpe_work->type, qpe_work->qp->qpn);
-+ goto out;
-+ }
-+
-+ ibqp->event_handler(&event, ibqp->qp_context);
-+
-+out:
-+ mlx5_core_res_put(&qpe_work->qp->common);
-+ kfree(qpe_work);
-+}
-+
- static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
- {
- struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
-- struct ib_event event;
-+ struct mlx5_ib_qp_event_work *qpe_work;
-
- if (type == MLX5_EVENT_TYPE_PATH_MIG) {
- /* This event is only valid for trans_qps */
- to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port;
- }
-
-- if (ibqp->event_handler) {
-- event.device = ibqp->device;
-- event.element.qp = ibqp;
-- switch (type) {
-- case MLX5_EVENT_TYPE_PATH_MIG:
-- event.event = IB_EVENT_PATH_MIG;
-- break;
-- case MLX5_EVENT_TYPE_COMM_EST:
-- event.event = IB_EVENT_COMM_EST;
-- break;
-- case MLX5_EVENT_TYPE_SQ_DRAINED:
-- event.event = IB_EVENT_SQ_DRAINED;
-- break;
-- case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
-- event.event = IB_EVENT_QP_LAST_WQE_REACHED;
-- break;
-- case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
-- event.event = IB_EVENT_QP_FATAL;
-- break;
-- case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
-- event.event = IB_EVENT_PATH_MIG_ERR;
-- break;
-- case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
-- event.event = IB_EVENT_QP_REQ_ERR;
-- break;
-- case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
-- event.event = IB_EVENT_QP_ACCESS_ERR;
-- break;
-- default:
-- pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn);
-- return;
-- }
-+ if (!ibqp->event_handler)
-+ goto out_no_handler;
-
-- ibqp->event_handler(&event, ibqp->qp_context);
-- }
-+ qpe_work = kzalloc(sizeof(*qpe_work), GFP_ATOMIC);
-+ if (!qpe_work)
-+ goto out_no_handler;
-+
-+ qpe_work->qp = qp;
-+ qpe_work->type = type;
-+ INIT_WORK(&qpe_work->work, mlx5_ib_handle_qp_event);
-+ queue_work(mlx5_ib_qp_event_wq, &qpe_work->work);
-+ return;
-+
-+out_no_handler:
-+ mlx5_core_res_put(&qp->common);
- }
-
- static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
-@@ -5752,3 +5787,17 @@ int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
- mutex_unlock(&mqp->mutex);
- return err;
- }
-+
-+int mlx5_ib_qp_event_init(void)
-+{
-+ mlx5_ib_qp_event_wq = alloc_ordered_workqueue("mlx5_ib_qp_event_wq", 0);
-+ if (!mlx5_ib_qp_event_wq)
-+ return -ENOMEM;
-+
-+ return 0;
-+}
-+
-+void mlx5_ib_qp_event_cleanup(void)
-+{
-+ destroy_workqueue(mlx5_ib_qp_event_wq);
-+}
-diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
-index 5d4e140db99ce..fb2f4e030bb8f 100644
---- a/drivers/infiniband/hw/mlx5/qp.h
-+++ b/drivers/infiniband/hw/mlx5/qp.h
-@@ -44,4 +44,6 @@ void mlx5_core_res_put(struct mlx5_core_rsc_common *res);
- int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn);
- int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn);
- int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
-+int mlx5_ib_qp_event_init(void);
-+void mlx5_ib_qp_event_cleanup(void);
- #endif /* _MLX5_IB_QP_H */
-diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
-index d4e7864c56f18..a824ff22f4615 100644
---- a/drivers/infiniband/hw/mlx5/qpc.c
-+++ b/drivers/infiniband/hw/mlx5/qpc.c
-@@ -135,7 +135,8 @@ static int rsc_event_notifier(struct notifier_block *nb,
- case MLX5_RES_SQ:
- qp = (struct mlx5_core_qp *)common;
- qp->event(qp, event_type);
-- break;
-+ /* Need to put resource in event handler */
-+ return NOTIFY_OK;
- case MLX5_RES_DCT:
- dct = (struct mlx5_core_dct *)common;
- if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED)
-diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
-index 48cfaa7eaf50c..913ed255990f4 100644
---- a/drivers/net/ethernet/mellanox/mlx4/qp.c
-+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
-@@ -46,6 +46,13 @@
- #define MLX4_BF_QP_SKIP_MASK 0xc0
- #define MLX4_MAX_BF_QP_RANGE 0x40
-
-+void mlx4_put_qp(struct mlx4_qp *qp)
-+{
-+ if (refcount_dec_and_test(&qp->refcount))
-+ complete(&qp->free);
-+}
-+EXPORT_SYMBOL_GPL(mlx4_put_qp);
-+
- void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
- {
- struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
-@@ -64,10 +71,8 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
- return;
- }
-
-+ /* Need to call mlx4_put_qp() in event handler */
- qp->event(qp, event_type);
--
-- if (refcount_dec_and_test(&qp->refcount))
-- complete(&qp->free);
- }
-
- /* used for INIT/CLOSE port logic */
-@@ -523,8 +528,7 @@ EXPORT_SYMBOL_GPL(mlx4_qp_remove);
-
- void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
- {
-- if (refcount_dec_and_test(&qp->refcount))
-- complete(&qp->free);
-+ mlx4_put_qp(qp);
- wait_for_completion(&qp->free);
-
- mlx4_qp_free_icm(dev, qp->qpn);
-diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
-index b6b626157b03a..b9a7b1319f5d3 100644
---- a/include/linux/mlx4/qp.h
-+++ b/include/linux/mlx4/qp.h
-@@ -504,4 +504,5 @@ static inline u16 folded_qp(u32 q)
-
- u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn);
-
-+void mlx4_put_qp(struct mlx4_qp *qp);
- #endif /* MLX4_QP_H */
-diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
-index 5582509003264..68fd6d22adfd4 100644
---- a/include/rdma/ib_verbs.h
-+++ b/include/rdma/ib_verbs.h
-@@ -1162,7 +1162,7 @@ enum ib_qp_create_flags {
- */
-
- struct ib_qp_init_attr {
-- /* Consumer's event_handler callback must not block */
-+ /* This callback occurs in workqueue context */
- void (*event_handler)(struct ib_event *, void *);
-
- void *qp_context;
---
-2.39.5
-
+++ /dev/null
-From be147ad5b5dbf2b210768ce67d652ae3e1d6ddf1 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 26 Jan 2023 00:28:07 +0200
-Subject: RDMA/mlx5: Add work to remove temporary entries from the cache
-
-From: Michael Guralnik <michaelgur@nvidia.com>
-
-[ Upstream commit 627122280c878cf5d3cda2d2c5a0a8f6a7e35cb7 ]
-
-The non-cache mkeys are stored in the cache only to shorten application
-restart time. Don't store them longer than needed.
-
-Configure cache entries that store non-cache MRs as temporary entries. If
-30 seconds have passed and no user has reclaimed the temporarily cached
-mkeys, an asynchronous work item will destroy the mkey entries.
-Link: https://lore.kernel.org/r/20230125222807.6921-7-michaelgur@nvidia.com
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 9 ++-
- drivers/infiniband/hw/mlx5/mr.c | 94 ++++++++++++++++++++++------
- drivers/infiniband/hw/mlx5/odp.c | 2 +-
- 3 files changed, 82 insertions(+), 23 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-index f345e2ae394d2..7c72e0e9db54a 100644
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -770,6 +770,7 @@ struct mlx5_cache_ent {
- struct rb_node node;
- struct mlx5r_cache_rb_key rb_key;
-
-+ u8 is_tmp:1;
- u8 disabled:1;
- u8 fill_to_high_water:1;
-
-@@ -803,6 +804,7 @@ struct mlx5_mkey_cache {
- struct mutex rb_lock;
- struct dentry *fs_root;
- unsigned long last_add;
-+ struct delayed_work remove_ent_dwork;
- };
-
- struct mlx5_ib_port_resources {
-@@ -1346,9 +1348,10 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
- int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
- int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
- int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
--struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
-- struct mlx5r_cache_rb_key rb_key,
-- bool persistent_entry);
-+struct mlx5_cache_ent *
-+mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
-+ struct mlx5r_cache_rb_key rb_key,
-+ bool persistent_entry);
-
- struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
- int access_flags, int access_mode,
-diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
-index bf1ca7565be67..2c1a935734273 100644
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -140,19 +140,16 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
- mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
- }
-
--
--static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
-- void *to_store)
-+static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
-+ void *to_store)
- {
- XA_STATE(xas, &ent->mkeys, 0);
- void *curr;
-
-- xa_lock_irq(&ent->mkeys);
- if (limit_pendings &&
-- (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) {
-- xa_unlock_irq(&ent->mkeys);
-+ (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
- return -EAGAIN;
-- }
-+
- while (1) {
- /*
- * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
-@@ -191,6 +188,7 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
- break;
- xa_lock_irq(&ent->mkeys);
- }
-+ xa_lock_irq(&ent->mkeys);
- if (xas_error(&xas))
- return xas_error(&xas);
- if (WARN_ON(curr))
-@@ -198,6 +196,17 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
- return 0;
- }
-
-+static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
-+ void *to_store)
-+{
-+ int ret;
-+
-+ xa_lock_irq(&ent->mkeys);
-+ ret = push_mkey_locked(ent, limit_pendings, to_store);
-+ xa_unlock_irq(&ent->mkeys);
-+ return ret;
-+}
-+
- static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
- {
- void *old;
-@@ -545,7 +554,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
- {
- lockdep_assert_held(&ent->mkeys.xa_lock);
-
-- if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
-+ if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
- return;
- if (ent->stored < ent->limit) {
- ent->fill_to_high_water = true;
-@@ -675,7 +684,6 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
- struct mlx5_cache_ent *cur;
- int cmp;
-
-- mutex_lock(&cache->rb_lock);
- /* Figure out where to put new node */
- while (*new) {
- cur = rb_entry(*new, struct mlx5_cache_ent, node);
-@@ -695,7 +703,6 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
- rb_link_node(&ent->node, parent, new);
- rb_insert_color(&ent->node, &cache->rb_root);
-
-- mutex_unlock(&cache->rb_lock);
- return 0;
- }
-
-@@ -867,9 +874,10 @@ static void delay_time_func(struct timer_list *t)
- WRITE_ONCE(dev->fill_delay, 0);
- }
-
--struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
-- struct mlx5r_cache_rb_key rb_key,
-- bool persistent_entry)
-+struct mlx5_cache_ent *
-+mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
-+ struct mlx5r_cache_rb_key rb_key,
-+ bool persistent_entry)
- {
- struct mlx5_cache_ent *ent;
- int order;
-@@ -882,6 +890,7 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
- xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
- ent->rb_key = rb_key;
- ent->dev = dev;
-+ ent->is_tmp = !persistent_entry;
-
- INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
-
-@@ -905,11 +914,44 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
- ent->limit = 0;
-
- mlx5_mkey_cache_debugfs_add_ent(dev, ent);
-+ } else {
-+ mod_delayed_work(ent->dev->cache.wq,
-+ &ent->dev->cache.remove_ent_dwork,
-+ msecs_to_jiffies(30 * 1000));
- }
-
- return ent;
- }
-
-+static void remove_ent_work_func(struct work_struct *work)
-+{
-+ struct mlx5_mkey_cache *cache;
-+ struct mlx5_cache_ent *ent;
-+ struct rb_node *cur;
-+
-+ cache = container_of(work, struct mlx5_mkey_cache,
-+ remove_ent_dwork.work);
-+ mutex_lock(&cache->rb_lock);
-+ cur = rb_last(&cache->rb_root);
-+ while (cur) {
-+ ent = rb_entry(cur, struct mlx5_cache_ent, node);
-+ cur = rb_prev(cur);
-+ mutex_unlock(&cache->rb_lock);
-+
-+ xa_lock_irq(&ent->mkeys);
-+ if (!ent->is_tmp) {
-+ xa_unlock_irq(&ent->mkeys);
-+ mutex_lock(&cache->rb_lock);
-+ continue;
-+ }
-+ xa_unlock_irq(&ent->mkeys);
-+
-+ clean_keys(ent->dev, ent);
-+ mutex_lock(&cache->rb_lock);
-+ }
-+ mutex_unlock(&cache->rb_lock);
-+}
-+
- int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- {
- struct mlx5_mkey_cache *cache = &dev->cache;
-@@ -925,6 +967,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- mutex_init(&dev->slow_path_mutex);
- mutex_init(&dev->cache.rb_lock);
- dev->cache.rb_root = RB_ROOT;
-+ INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
- cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
- if (!cache->wq) {
- mlx5_ib_warn(dev, "failed to create work queue\n");
-@@ -934,9 +977,10 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
- timer_setup(&dev->delay_timer, delay_time_func, 0);
- mlx5_mkey_cache_debugfs_init(dev);
-+ mutex_lock(&cache->rb_lock);
- for (i = 0; i <= mkey_cache_max_order(dev); i++) {
- rb_key.ndescs = 1 << (i + 2);
-- ent = mlx5r_cache_create_ent(dev, rb_key, true);
-+ ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
- if (IS_ERR(ent)) {
- ret = PTR_ERR(ent);
- goto err;
-@@ -947,6 +991,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- if (ret)
- goto err;
-
-+ mutex_unlock(&cache->rb_lock);
- for (node = rb_first(root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
- xa_lock_irq(&ent->mkeys);
-@@ -957,6 +1002,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- return 0;
-
- err:
-+ mutex_unlock(&cache->rb_lock);
- mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
- return ret;
- }
-@@ -970,6 +1016,7 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
- if (!dev->cache.wq)
- return 0;
-
-+ cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
- mutex_lock(&dev->cache.rb_lock);
- for (node = rb_first(root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
-@@ -1752,33 +1799,42 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
- {
- struct mlx5_mkey_cache *cache = &dev->cache;
- struct mlx5_cache_ent *ent;
-+ int ret;
-
- if (mr->mmkey.cache_ent) {
- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
- mr->mmkey.cache_ent->in_use--;
-- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
- goto end;
- }
-
- mutex_lock(&cache->rb_lock);
- ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
-- mutex_unlock(&cache->rb_lock);
- if (ent) {
- if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
-+ if (ent->disabled) {
-+ mutex_unlock(&cache->rb_lock);
-+ return -EOPNOTSUPP;
-+ }
- mr->mmkey.cache_ent = ent;
-+ xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
-+ mutex_unlock(&cache->rb_lock);
- goto end;
- }
- }
-
-- ent = mlx5r_cache_create_ent(dev, mr->mmkey.rb_key, false);
-+ ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
-+ mutex_unlock(&cache->rb_lock);
- if (IS_ERR(ent))
- return PTR_ERR(ent);
-
- mr->mmkey.cache_ent = ent;
-+ xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
-
- end:
-- return push_mkey(mr->mmkey.cache_ent, false,
-- xa_mk_value(mr->mmkey.key));
-+ ret = push_mkey_locked(mr->mmkey.cache_ent, false,
-+ xa_mk_value(mr->mmkey.key));
-+ xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
-+ return ret;
- }
-
- int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
-diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
-index 96d4faabbff8a..6ba4aa1afdc2d 100644
---- a/drivers/infiniband/hw/mlx5/odp.c
-+++ b/drivers/infiniband/hw/mlx5/odp.c
-@@ -1602,7 +1602,7 @@ int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
- if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
- return 0;
-
-- ent = mlx5r_cache_create_ent(dev, rb_key, true);
-+ ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
- if (IS_ERR(ent))
- return PTR_ERR(ent);
-
---
-2.39.5
-
+++ /dev/null
-From 5a09f0237455bc487c3d8cb78b82b7263d23d8fe Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 26 Jan 2023 00:28:06 +0200
-Subject: RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow
-
-From: Michael Guralnik <michaelgur@nvidia.com>
-
-[ Upstream commit dd1b913fb0d0e3e6d55e92d2319d954474dd66ac ]
-
-Currently, when deregistering an MR, if the mkey doesn't belong to a cache
-entry, it will be destroyed. As a result, restarting applications with
-many non-cached mkeys is not efficient, since all the mkeys are destroyed
-and then recreated. This process takes a long time (for 100,000 MRs, it is
-~20 seconds for dereg and ~28 seconds for re-reg).
-
-To shorten the restart runtime, insert all cacheable mkeys into the cache.
-If no existing entry fits the mkey's properties, create a temporary entry
-that does.
-
-After a predetermined timeout, the cache entries will shrink to the
-initial high limit.
-
-The mkeys will still be in the cache when consuming them again after an
-application restart. Therefore, the registration will be much faster
-(for 100,000 MRs, it is ~4 seconds for dereg and ~5 seconds for re-reg).
-
-The temporary cache entries created to store the non-cache mkeys are not
-exposed through sysfs like the default cache entries.
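
Condensed from the mlx5_ib_dereg_mr() hunk below, the dereg-time decision reads roughly
as follows (a sketch, not a drop-in replacement): revoke the mkey via UMR and park it in
a possibly temporary cache entry, and destroy it only if either step fails.

    /* Sketch of the new dereg flow: keep the mkey if it can be revoked and
     * stored in the cache, otherwise fall back to destroying it. */
    if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
            if (mlx5r_umr_revoke_mr(mr) || cache_ent_find_and_store(dev, mr))
                    mr->mmkey.cache_ent = NULL;

    if (!mr->mmkey.cache_ent) {
            rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
            if (rc)
                    return rc;
    }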
-
-Link: https://lore.kernel.org/r/20230125222807.6921-6-michaelgur@nvidia.com
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +
- drivers/infiniband/hw/mlx5/mr.c | 55 +++++++++++++++++++++-------
- 2 files changed, 44 insertions(+), 13 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-index 7c9d5648947e9..f345e2ae394d2 100644
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -650,6 +650,8 @@ struct mlx5_ib_mkey {
- unsigned int ndescs;
- struct wait_queue_head wait;
- refcount_t usecount;
-+ /* User Mkey must hold either a rb_key or a cache_ent. */
-+ struct mlx5r_cache_rb_key rb_key;
- struct mlx5_cache_ent *cache_ent;
- };
-
-diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
-index 1060b30a837a0..bf1ca7565be67 100644
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -1110,15 +1110,14 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
- rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
- ent = mkey_cache_ent_from_rb_key(dev, rb_key);
- /*
-- * Matches access in alloc_cache_mr(). If the MR can't come from the
-- * cache then synchronously create an uncached one.
-+ * If the MR can't come from the cache then synchronously create an uncached
-+ * one.
- */
-- if (!ent || ent->limit == 0 ||
-- !mlx5r_umr_can_reconfig(dev, 0, access_flags) ||
-- mlx5_umem_needs_ats(dev, umem, access_flags)) {
-+ if (!ent) {
- mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, false);
- mutex_unlock(&dev->slow_path_mutex);
-+ mr->mmkey.rb_key = rb_key;
- return mr;
- }
-
-@@ -1209,6 +1208,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
- goto err_2;
- }
- mr->mmkey.type = MLX5_MKEY_MR;
-+ mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
- mr->umem = umem;
- set_mr_fields(dev, mr, umem->length, access_flags, iova);
- kvfree(in);
-@@ -1747,6 +1747,40 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
- }
- }
-
-+static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
-+ struct mlx5_ib_mr *mr)
-+{
-+ struct mlx5_mkey_cache *cache = &dev->cache;
-+ struct mlx5_cache_ent *ent;
-+
-+ if (mr->mmkey.cache_ent) {
-+ xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
-+ mr->mmkey.cache_ent->in_use--;
-+ xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
-+ goto end;
-+ }
-+
-+ mutex_lock(&cache->rb_lock);
-+ ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
-+ mutex_unlock(&cache->rb_lock);
-+ if (ent) {
-+ if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
-+ mr->mmkey.cache_ent = ent;
-+ goto end;
-+ }
-+ }
-+
-+ ent = mlx5r_cache_create_ent(dev, mr->mmkey.rb_key, false);
-+ if (IS_ERR(ent))
-+ return PTR_ERR(ent);
-+
-+ mr->mmkey.cache_ent = ent;
-+
-+end:
-+ return push_mkey(mr->mmkey.cache_ent, false,
-+ xa_mk_value(mr->mmkey.key));
-+}
-+
- int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
- {
- struct mlx5_ib_mr *mr = to_mmr(ibmr);
-@@ -1792,16 +1826,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
- }
-
- /* Stop DMA */
-- if (mr->mmkey.cache_ent) {
-- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
-- mr->mmkey.cache_ent->in_use--;
-- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
--
-+ if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
- if (mlx5r_umr_revoke_mr(mr) ||
-- push_mkey(mr->mmkey.cache_ent, false,
-- xa_mk_value(mr->mmkey.key)))
-+ cache_ent_find_and_store(dev, mr))
- mr->mmkey.cache_ent = NULL;
-- }
-+
- if (!mr->mmkey.cache_ent) {
- rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
- if (rc)
---
-2.39.5
-
+++ /dev/null
-From 8c1185fef68cc603b954fece2a434c9f851d6a86 Mon Sep 17 00:00:00 2001
-From: Or Har-Toov <ohartoov@nvidia.com>
-Date: Wed, 3 Apr 2024 13:36:00 +0300
-Subject: RDMA/mlx5: Change check for cacheable mkeys
-
-From: Or Har-Toov <ohartoov@nvidia.com>
-
-commit 8c1185fef68cc603b954fece2a434c9f851d6a86 upstream.
-
-umem can be NULL for user application mkeys in some cases. Therefore
-umem can't be used to check whether the mkey is cacheable, so the check
-is changed to use a flag that indicates it. Also make sure that all
-mkeys which are not returned to the cache are destroyed.
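
The resulting check, condensed from the mlx5_revoke_mr() helper added below (the
cache-entry bookkeeping between the two branches is omitted here): the decision now keys
off the new mmkey.cacheable bit rather than mr->umem.

    /* Return the mkey to the cache only when it was marked cacheable at
     * registration time and both revoke and store succeed. */
    if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) &&
        !cache_ent_find_and_store(dev, mr))
            return 0;               /* parked in the cache */

    /* Not cacheable, or returning it to the cache failed: destroy it. */
    return destroy_mkey(dev, mr);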
-
-Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow")
-Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
-Link: https://lore.kernel.org/r/2690bc5c6896bcb937f89af16a1ff0343a7ab3d0.1712140377.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 +
- drivers/infiniband/hw/mlx5/mr.c | 32 ++++++++++++++++++++++----------
- 2 files changed, 23 insertions(+), 10 deletions(-)
-
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -654,6 +654,7 @@ struct mlx5_ib_mkey {
- /* Cacheable user Mkey must hold either a rb_key or a cache_ent. */
- struct mlx5r_cache_rb_key rb_key;
- struct mlx5_cache_ent *cache_ent;
-+ u8 cacheable : 1;
- };
-
- #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -1155,6 +1155,7 @@ static struct mlx5_ib_mr *alloc_cacheabl
- if (IS_ERR(mr))
- return mr;
- mr->mmkey.rb_key = rb_key;
-+ mr->mmkey.cacheable = true;
- return mr;
- }
-
-@@ -1165,6 +1166,7 @@ static struct mlx5_ib_mr *alloc_cacheabl
- mr->ibmr.pd = pd;
- mr->umem = umem;
- mr->page_shift = order_base_2(page_size);
-+ mr->mmkey.cacheable = true;
- set_mr_fields(dev, mr, umem->length, access_flags, iova);
-
- return mr;
-@@ -1830,6 +1832,23 @@ end:
- return ret;
- }
-
-+static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
-+{
-+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
-+ struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
-+
-+ if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr))
-+ return 0;
-+
-+ if (ent) {
-+ spin_lock_irq(&ent->mkeys_queue.lock);
-+ ent->in_use--;
-+ mr->mmkey.cache_ent = NULL;
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
-+ }
-+ return destroy_mkey(dev, mr);
-+}
-+
- int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
- {
- struct mlx5_ib_mr *mr = to_mmr(ibmr);
-@@ -1875,16 +1894,9 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr,
- }
-
- /* Stop DMA */
-- if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
-- if (mlx5r_umr_revoke_mr(mr) ||
-- cache_ent_find_and_store(dev, mr))
-- mr->mmkey.cache_ent = NULL;
--
-- if (!mr->mmkey.cache_ent) {
-- rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
-- if (rc)
-- return rc;
-- }
-+ rc = mlx5_revoke_mr(mr);
-+ if (rc)
-+ return rc;
-
- if (mr->umem) {
- bool is_odp = is_odp_mr(mr);
+++ /dev/null
-From 3a78949c3d99afa32e87cf8cfe46723a057ee4cb Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 26 Jan 2023 00:28:04 +0200
-Subject: RDMA/mlx5: Change the cache structure to an RB-tree
-
-From: Michael Guralnik <michaelgur@nvidia.com>
-
-[ Upstream commit b9584517832858a0f78d6851d09b697a829514cd ]
-
-Currently, the cache structure is a static linear array. Therefore, its
-size is limited to the number of entries in it and is not expandable. The
-entries are dedicated to mkeys of size 2^x with no access_flags. Mkeys with
-different properties are not cacheable.
-
-In this patch, we change the cache structure to an RB-tree. This will
-allow extending the cache to support more entries with different mkey
-properties.
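
A minimal sketch of the lookup the new structure enables, assuming entries are ordered by
their 'order' field as in mkey_cache_ent_from_order() below: walk the tree and return the
smallest entry whose order is greater than or equal to the requested one.

    #include <linux/rbtree.h>

    /* Illustrative only: find the smallest cache entry with
     * cur->order >= order, or NULL if none exists. */
    static struct mlx5_cache_ent *find_ent_ge(struct rb_root *root,
                                              unsigned int order)
    {
            struct rb_node *node = root->rb_node;
            struct mlx5_cache_ent *cur, *best = NULL;

            while (node) {
                    cur = rb_entry(node, struct mlx5_cache_ent, node);
                    if (cur->order >= order) {
                            best = cur;
                            node = node->rb_left;
                    } else {
                            node = node->rb_right;
                    }
            }
            return best;
    }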
-
-Link: https://lore.kernel.org/r/20230125222807.6921-4-michaelgur@nvidia.com
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 11 +-
- drivers/infiniband/hw/mlx5/mr.c | 160 ++++++++++++++++++++-------
- drivers/infiniband/hw/mlx5/odp.c | 8 +-
- 3 files changed, 132 insertions(+), 47 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-index 10c87901da27c..bd998ac8c29c1 100644
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -761,6 +761,8 @@ struct mlx5_cache_ent {
- u32 access_mode;
- unsigned int ndescs;
-
-+ struct rb_node node;
-+
- u8 disabled:1;
- u8 fill_to_high_water:1;
-
-@@ -790,8 +792,9 @@ struct mlx5r_async_create_mkey {
-
- struct mlx5_mkey_cache {
- struct workqueue_struct *wq;
-- struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES];
-- struct dentry *root;
-+ struct rb_root rb_root;
-+ struct mutex rb_lock;
-+ struct dentry *fs_root;
- unsigned long last_add;
- };
-
-@@ -1336,11 +1339,15 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
- int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
- int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
- int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
-+struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
-+ int order);
-
- struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
- struct mlx5_cache_ent *ent,
- int access_flags);
-
-+struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, u32 order,
-+ int access_flags);
- int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
- struct ib_mr_status *mr_status);
- struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
-diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
-index 53fadd6edb68d..b3d83920d3cfb 100644
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -515,18 +515,22 @@ static const struct file_operations limit_fops = {
-
- static bool someone_adding(struct mlx5_mkey_cache *cache)
- {
-- unsigned int i;
--
-- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-- struct mlx5_cache_ent *ent = &cache->ent[i];
-- bool ret;
-+ struct mlx5_cache_ent *ent;
-+ struct rb_node *node;
-+ bool ret;
-
-+ mutex_lock(&cache->rb_lock);
-+ for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
-+ ent = rb_entry(node, struct mlx5_cache_ent, node);
- xa_lock_irq(&ent->mkeys);
- ret = ent->stored < ent->limit;
- xa_unlock_irq(&ent->mkeys);
-- if (ret)
-+ if (ret) {
-+ mutex_unlock(&cache->rb_lock);
- return true;
-+ }
- }
-+ mutex_unlock(&cache->rb_lock);
- return false;
- }
-
-@@ -637,6 +641,59 @@ static void delayed_cache_work_func(struct work_struct *work)
- __cache_work_func(ent);
- }
-
-+static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
-+ struct mlx5_cache_ent *ent)
-+{
-+ struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
-+ struct mlx5_cache_ent *cur;
-+
-+ mutex_lock(&cache->rb_lock);
-+ /* Figure out where to put new node */
-+ while (*new) {
-+ cur = rb_entry(*new, struct mlx5_cache_ent, node);
-+ parent = *new;
-+ if (ent->order < cur->order)
-+ new = &((*new)->rb_left);
-+ if (ent->order > cur->order)
-+ new = &((*new)->rb_right);
-+ if (ent->order == cur->order) {
-+ mutex_unlock(&cache->rb_lock);
-+ return -EEXIST;
-+ }
-+ }
-+
-+ /* Add new node and rebalance tree. */
-+ rb_link_node(&ent->node, parent, new);
-+ rb_insert_color(&ent->node, &cache->rb_root);
-+
-+ mutex_unlock(&cache->rb_lock);
-+ return 0;
-+}
-+
-+static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
-+ unsigned int order)
-+{
-+ struct rb_node *node = dev->cache.rb_root.rb_node;
-+ struct mlx5_cache_ent *cur, *smallest = NULL;
-+
-+ /*
-+ * Find the smallest ent with order >= requested_order.
-+ */
-+ while (node) {
-+ cur = rb_entry(node, struct mlx5_cache_ent, node);
-+ if (cur->order > order) {
-+ smallest = cur;
-+ node = node->rb_left;
-+ }
-+ if (cur->order < order)
-+ node = node->rb_right;
-+ if (cur->order == order)
-+ return cur;
-+ }
-+
-+ return smallest;
-+}
-+
- struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
- struct mlx5_cache_ent *ent,
- int access_flags)
-@@ -677,10 +734,16 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
- return mr;
- }
-
--static void clean_keys(struct mlx5_ib_dev *dev, int c)
-+struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev,
-+ u32 order, int access_flags)
-+{
-+ struct mlx5_cache_ent *ent = mkey_cache_ent_from_order(dev, order);
-+
-+ return mlx5_mr_cache_alloc(dev, ent, access_flags);
-+}
-+
-+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
- {
-- struct mlx5_mkey_cache *cache = &dev->cache;
-- struct mlx5_cache_ent *ent = &cache->ent[c];
- u32 mkey;
-
- cancel_delayed_work(&ent->dwork);
-@@ -699,8 +762,8 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
- if (!mlx5_debugfs_root || dev->is_rep)
- return;
-
-- debugfs_remove_recursive(dev->cache.root);
-- dev->cache.root = NULL;
-+ debugfs_remove_recursive(dev->cache.fs_root);
-+ dev->cache.fs_root = NULL;
- }
-
- static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
-@@ -713,12 +776,13 @@ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
- if (!mlx5_debugfs_root || dev->is_rep)
- return;
-
-- cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
-+ dir = mlx5_debugfs_get_dev_root(dev->mdev);
-+ cache->fs_root = debugfs_create_dir("mr_cache", dir);
-
- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-- ent = &cache->ent[i];
-+ ent = mkey_cache_ent_from_order(dev, i);
- sprintf(ent->name, "%d", ent->order);
-- dir = debugfs_create_dir(ent->name, cache->root);
-+ dir = debugfs_create_dir(ent->name, cache->fs_root);
- debugfs_create_file("size", 0600, dir, ent, &size_fops);
- debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
- debugfs_create_ulong("cur", 0400, dir, &ent->stored);
-@@ -733,6 +797,30 @@ static void delay_time_func(struct timer_list *t)
- WRITE_ONCE(dev->fill_delay, 0);
- }
-
-+struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
-+ int order)
-+{
-+ struct mlx5_cache_ent *ent;
-+ int ret;
-+
-+ ent = kzalloc(sizeof(*ent), GFP_KERNEL);
-+ if (!ent)
-+ return ERR_PTR(-ENOMEM);
-+
-+ xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
-+ ent->order = order;
-+ ent->dev = dev;
-+
-+ INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
-+
-+ ret = mlx5_cache_ent_insert(&dev->cache, ent);
-+ if (ret) {
-+ kfree(ent);
-+ return ERR_PTR(ret);
-+ }
-+ return ent;
-+}
-+
- int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- {
- struct mlx5_mkey_cache *cache = &dev->cache;
-@@ -740,6 +828,8 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- int i;
-
- mutex_init(&dev->slow_path_mutex);
-+ mutex_init(&dev->cache.rb_lock);
-+ dev->cache.rb_root = RB_ROOT;
- cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
- if (!cache->wq) {
- mlx5_ib_warn(dev, "failed to create work queue\n");
-@@ -749,13 +839,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
- timer_setup(&dev->delay_timer, delay_time_func, 0);
- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-- ent = &cache->ent[i];
-- xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
-- ent->order = i + 2;
-- ent->dev = dev;
-- ent->limit = 0;
--
-- INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
-+ ent = mlx5r_cache_create_ent(dev, i);
-
- if (i > MKEY_CACHE_LAST_STD_ENTRY) {
- mlx5_odp_init_mkey_cache_entry(ent);
-@@ -785,14 +869,16 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
-
- int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
- {
-- unsigned int i;
-+ struct rb_root *root = &dev->cache.rb_root;
-+ struct mlx5_cache_ent *ent;
-+ struct rb_node *node;
-
- if (!dev->cache.wq)
- return 0;
-
-- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-- struct mlx5_cache_ent *ent = &dev->cache.ent[i];
--
-+ mutex_lock(&dev->cache.rb_lock);
-+ for (node = rb_first(root); node; node = rb_next(node)) {
-+ ent = rb_entry(node, struct mlx5_cache_ent, node);
- xa_lock_irq(&ent->mkeys);
- ent->disabled = true;
- xa_unlock_irq(&ent->mkeys);
-@@ -802,8 +888,15 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
- mlx5_mkey_cache_debugfs_cleanup(dev);
- mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
-
-- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
-- clean_keys(dev, i);
-+ node = rb_first(root);
-+ while (node) {
-+ ent = rb_entry(node, struct mlx5_cache_ent, node);
-+ node = rb_next(node);
-+ clean_keys(dev, ent);
-+ rb_erase(&ent->node, root);
-+ kfree(ent);
-+ }
-+ mutex_unlock(&dev->cache.rb_lock);
-
- destroy_workqueue(dev->cache.wq);
- del_timer_sync(&dev->delay_timer);
-@@ -876,19 +969,6 @@ static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
- return MLX5_MAX_UMR_SHIFT;
- }
-
--static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
-- unsigned int order)
--{
-- struct mlx5_mkey_cache *cache = &dev->cache;
--
-- if (order < cache->ent[0].order)
-- return &cache->ent[0];
-- order = order - cache->ent[0].order;
-- if (order > MKEY_CACHE_LAST_STD_ENTRY)
-- return NULL;
-- return &cache->ent[order];
--}
--
- static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
- u64 length, int access_flags, u64 iova)
- {
-diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
-index 5f0a17382de73..7f68940ca0d1e 100644
---- a/drivers/infiniband/hw/mlx5/odp.c
-+++ b/drivers/infiniband/hw/mlx5/odp.c
-@@ -420,8 +420,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
- return ERR_CAST(odp);
-
- BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);
-- mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order],
-- imr->access_flags);
-+ mr = mlx5_mr_cache_alloc_order(dev, order, imr->access_flags);
- if (IS_ERR(mr)) {
- ib_umem_odp_release(odp);
- return mr;
-@@ -495,9 +494,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
- if (IS_ERR(umem_odp))
- return ERR_CAST(umem_odp);
-
-- imr = mlx5_mr_cache_alloc(dev,
-- &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY],
-- access_flags);
-+ imr = mlx5_mr_cache_alloc_order(dev, MLX5_IMR_KSM_CACHE_ENTRY,
-+ access_flags);
- if (IS_ERR(imr)) {
- ib_umem_odp_release(umem_odp);
- return imr;
---
-2.39.5
-
+++ /dev/null
-From 8e6e49ccf1a0f2b3257394dc8610bb6d48859d3f Mon Sep 17 00:00:00 2001
-From: Dan Carpenter <error27@gmail.com>
-Date: Mon, 6 Feb 2023 17:40:35 +0300
-Subject: RDMA/mlx5: Check reg_create() create for errors
-
-From: Dan Carpenter <error27@gmail.com>
-
-commit 8e6e49ccf1a0f2b3257394dc8610bb6d48859d3f upstream.
-
-reg_create() can fail. Check for errors before dereferencing its return value.
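
The fix follows the standard ERR_PTR convention: reg_create() encodes errors in the
returned pointer, so the caller must test it with IS_ERR() before touching any field,
as in the hunk below.

    mr = reg_create(pd, umem, iova, access_flags, page_size, false);
    if (IS_ERR(mr))
            return mr;                  /* propagate the ERR_PTR() */
    mr->mmkey.rb_key = rb_key;          /* only dereference on success */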
-
-Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow")
-Signed-off-by: Dan Carpenter <error27@gmail.com>
-Link: https://lore.kernel.org/r/Y+ERYy4wN0LsKsm+@kili
-Reviewed-by: Devesh Sharma <devesh.s.sharma@oracle.com>
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 2 ++
- 1 file changed, 2 insertions(+)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -1143,6 +1143,8 @@ static struct mlx5_ib_mr *alloc_cacheabl
- mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, false);
- mutex_unlock(&dev->slow_path_mutex);
-+ if (IS_ERR(mr))
-+ return mr;
- mr->mmkey.rb_key = rb_key;
- return mr;
- }
+++ /dev/null
-From a85b91bcb6fce39a7511353461ead5a60b13bc69 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 26 Jan 2023 00:28:02 +0200
-Subject: RDMA/mlx5: Don't keep umrable 'page_shift' in cache entries
-
-From: Aharon Landau <aharonl@nvidia.com>
-
-[ Upstream commit a2a88b8e22d1b202225d0e40b02ad068afab2ccb ]
-
-mkc.log_page_size can be changed using UMR. Therefore, don't treat it as a
-cache entry property.
-
-Remove it from struct mlx5_cache_ent.
-
-All cache mkeys will be created with the default PAGE_SHIFT, and updated
-with the needed page_shift using UMR when passing them to a user.
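
Condensed from the set_cache_mkc() hunk below: cache mkeys are now always created with
the default page shift, and the real page size is applied later through UMR when the
mkey is handed to a user.

    /* Was: MLX5_SET(mkc, mkc, log_page_size, ent->page); the per-entry
     * 'page' field is gone, cache mkeys start at the default PAGE_SHIFT. */
    MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);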
-
-Link: https://lore.kernel.org/r/20230125222807.6921-2-michaelgur@nvidia.com
-Signed-off-by: Aharon Landau <aharonl@nvidia.com>
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 -
- drivers/infiniband/hw/mlx5/mr.c | 3 +--
- drivers/infiniband/hw/mlx5/odp.c | 2 --
- 3 files changed, 1 insertion(+), 5 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-index 0ef347e91ffeb..10c87901da27c 100644
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -759,7 +759,6 @@ struct mlx5_cache_ent {
- char name[4];
- u32 order;
- u32 access_mode;
-- u32 page;
- unsigned int ndescs;
-
- u8 disabled:1;
-diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
-index b81b03aa2a629..53fadd6edb68d 100644
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -297,7 +297,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
-
- MLX5_SET(mkc, mkc, translations_octword_size,
- get_mkc_octo_size(ent->access_mode, ent->ndescs));
-- MLX5_SET(mkc, mkc, log_page_size, ent->page);
-+ MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
- }
-
- /* Asynchronously schedule new MRs to be populated in the cache. */
-@@ -765,7 +765,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- if (ent->order > mkey_cache_max_order(dev))
- continue;
-
-- ent->page = PAGE_SHIFT;
- ent->ndescs = 1 << ent->order;
- ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
- if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
-diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
-index 87fbee8061003..a5c9baec8be85 100644
---- a/drivers/infiniband/hw/mlx5/odp.c
-+++ b/drivers/infiniband/hw/mlx5/odp.c
-@@ -1598,14 +1598,12 @@ void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
-
- switch (ent->order - 2) {
- case MLX5_IMR_MTT_CACHE_ENTRY:
-- ent->page = PAGE_SHIFT;
- ent->ndescs = MLX5_IMR_MTT_ENTRIES;
- ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
- ent->limit = 0;
- break;
-
- case MLX5_IMR_KSM_CACHE_ENTRY:
-- ent->page = MLX5_KSM_PAGE_SHIFT;
- ent->ndescs = mlx5_imr_ksm_entries;
- ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
- ent->limit = 0;
---
-2.39.5
-
+++ /dev/null
-From 2e4c02fdecf2f6f55cefe48cb82d93fa4f8e2204 Mon Sep 17 00:00:00 2001
-From: Jason Gunthorpe <jgg@nvidia.com>
-Date: Tue, 28 May 2024 15:52:54 +0300
-Subject: RDMA/mlx5: Ensure created mkeys always have a populated rb_key
-
-From: Jason Gunthorpe <jgg@nvidia.com>
-
-commit 2e4c02fdecf2f6f55cefe48cb82d93fa4f8e2204 upstream.
-
-cacheable and mmkey.rb_key together are used by mlx5_revoke_mr() to put the
-MR/mkey back into the cache. In all cases they should be set correctly.
-
-alloc_cacheable_mr() was setting cacheable but not filling rb_key,
-resulting in cache_ent_find_and_store() bucketing them all into a
-zero-length entry.
-
-implicit_get_child_mr()/mlx5_ib_alloc_implicit_mr() failed to set cacheable
-or rb_key at all, so the cache was not working at all for implicit ODP.
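
Condensed from the _mlx5_mr_cache_alloc hunk below: every mkey taken from a cache entry
now inherits that entry's rb_key and is flagged cacheable, so mlx5_revoke_mr() can find
the right bucket on dereg.

    mr->mmkey.cache_ent = ent;
    mr->mmkey.type = MLX5_MKEY_MR;
    mr->mmkey.rb_key = ent->rb_key;     /* bucket key used on dereg */
    mr->mmkey.cacheable = true;
    init_waitqueue_head(&mr->mmkey.wait);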
-
-Cc: stable@vger.kernel.org
-Fixes: 8c1185fef68c ("RDMA/mlx5: Change check for cacheable mkeys")
-Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow")
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Link: https://lore.kernel.org/r/7778c02dfa0999a30d6746c79a23dd7140a9c729.1716900410.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -715,6 +715,8 @@ static struct mlx5_ib_mr *_mlx5_mr_cache
- }
- mr->mmkey.cache_ent = ent;
- mr->mmkey.type = MLX5_MKEY_MR;
-+ mr->mmkey.rb_key = ent->rb_key;
-+ mr->mmkey.cacheable = true;
- init_waitqueue_head(&mr->mmkey.wait);
- return mr;
- }
-@@ -1165,7 +1167,6 @@ static struct mlx5_ib_mr *alloc_cacheabl
- mr->ibmr.pd = pd;
- mr->umem = umem;
- mr->page_shift = order_base_2(page_size);
-- mr->mmkey.cacheable = true;
- set_mr_fields(dev, mr, umem->length, access_flags, iova);
-
- return mr;
+++ /dev/null
-From b79f406d4cc08e99e836a5e95040672efdba5313 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Mon, 10 Feb 2025 13:32:39 +0200
-Subject: RDMA/mlx5: Fix AH static rate parsing
-
-From: Patrisious Haddad <phaddad@nvidia.com>
-
-[ Upstream commit c534ffda781f44a1c6ac25ef6e0e444da38ca8af ]
-
-Previously the static rate wasn't translated according to our PRM but
-simply used the 4 lower bits.
-
-Correctly translate the static rate value passed in the AH creation
-attribute according to our PRM expected values.
-
-In addition, change the 800Gb/s mapping to zero, which is the
-PRM-specified value.
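
The resulting packing, condensed from the ah.c hunk below: the IBTA rate enum from the
AH attribute is first translated to the PRM encoding (with IB_RATE_PORT_CURRENT and
IB_RATE_800_GBPS both mapping to zero), and only then shifted into the upper 4 bits of
stat_rate_sl.

    /* stat_rate_sl: PRM-encoded static rate in bits 7..4, SL in bits 3..0. */
    ah->av.stat_rate_sl =
            (mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4);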
-
-Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
-Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
-Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
-Link: https://patch.msgid.link/18ef4cc5396caf80728341eb74738cd777596f60.1739187089.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/ah.c | 3 ++-
- drivers/infiniband/hw/mlx5/qp.c | 6 +++---
- drivers/infiniband/hw/mlx5/qp.h | 1 +
- 3 files changed, 6 insertions(+), 4 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
-index 505bc47fd575d..99036afb3aef0 100644
---- a/drivers/infiniband/hw/mlx5/ah.c
-+++ b/drivers/infiniband/hw/mlx5/ah.c
-@@ -67,7 +67,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
- ah->av.tclass = grh->traffic_class;
- }
-
-- ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4);
-+ ah->av.stat_rate_sl =
-+ (mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4);
-
- if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
- if (init_attr->xmit_slave)
-diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
-index 43c0123babd10..59dca0cd89052 100644
---- a/drivers/infiniband/hw/mlx5/qp.c
-+++ b/drivers/infiniband/hw/mlx5/qp.c
-@@ -3379,11 +3379,11 @@ static int ib_to_mlx5_rate_map(u8 rate)
- return 0;
- }
-
--static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
-+int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate)
- {
- u32 stat_rate_support;
-
-- if (rate == IB_RATE_PORT_CURRENT)
-+ if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS)
- return 0;
-
- if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS)
-@@ -3528,7 +3528,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
- sizeof(grh->dgid.raw));
- }
-
-- err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah));
-+ err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah));
- if (err < 0)
- return err;
- MLX5_SET(ads, path, stat_rate, err);
-diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
-index e677fa0ca4226..4abb77d551670 100644
---- a/drivers/infiniband/hw/mlx5/qp.h
-+++ b/drivers/infiniband/hw/mlx5/qp.h
-@@ -55,4 +55,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn);
- int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
- int mlx5_ib_qp_event_init(void);
- void mlx5_ib_qp_event_cleanup(void);
-+int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate);
- #endif /* _MLX5_IB_QP_H */
---
-2.39.5
-
+++ /dev/null
-From 4f14c6c0213e1def48f0f887d35f44095416c67d Mon Sep 17 00:00:00 2001
-From: Michael Guralnik <michaelgur@nvidia.com>
-Date: Wed, 20 Sep 2023 13:01:54 +0300
-Subject: RDMA/mlx5: Fix assigning access flags to cache mkeys
-
-From: Michael Guralnik <michaelgur@nvidia.com>
-
-commit 4f14c6c0213e1def48f0f887d35f44095416c67d upstream.
-
-After the change to use the dynamic cache structure, new cache entries
-can be added and the mkey allocation can no longer assume that all
-mkeys created for the cache have access_flags equal to zero.
-
-Example of a flow that exposes the issue:
-A user registers an MR with RO on an HCA that cannot UMR RO, so the mkey
-is created outside of the cache. When the user deregisters the MR, a new
-cache entry is created to store mkeys with RO.
-
-Later, the user registers 2 MRs with RO. The first MR is reused from the
-new cache entry. When we try to get the second mkey from the cache we see
-the entry is empty, so we go to the MR cache mkey allocation flow, which
-would have allocated an mkey with no access flags, resulting in the user
-getting an MR without RO.
-
-Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow")
-Reviewed-by: Edward Srouji <edwards@nvidia.com>
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Link: https://lore.kernel.org/r/8a802700b82def3ace3f77cd7a9ad9d734af87e7.1695203958.git.leonro@nvidia.com
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -236,7 +236,8 @@ static int get_mkc_octo_size(unsigned in
-
- static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
- {
-- set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
-+ set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
-+ ent->dev->umrc.pd);
- MLX5_SET(mkc, mkc, free, 1);
- MLX5_SET(mkc, mkc, umr_en, 1);
- MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
+++ /dev/null
-From 6f5cd6ac9a4201e4ba6f10b76a9da8044d6e38b0 Mon Sep 17 00:00:00 2001
-From: Michael Guralnik <michaelgur@nvidia.com>
-Date: Tue, 3 Sep 2024 14:24:48 +0300
-Subject: RDMA/mlx5: Fix counter update on MR cache mkey creation
-
-From: Michael Guralnik <michaelgur@nvidia.com>
-
-commit 6f5cd6ac9a4201e4ba6f10b76a9da8044d6e38b0 upstream.
-
-After an mkey is created, update the counter for pending mkeys before
-rescheduling the work that is filling the cache.
-
-Rescheduling the work with a full MR cache entry and a wrong 'pending'
-counter will cause us to miss disabling the fill_to_high_water flag,
-thus leaving the cache full but with an indication that it still
-needs to be filled up to its full size (2 * limit).
-The next time an mkey is taken from the cache, we'll unnecessarily
-continue the process of filling the cache to its full size.
-
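-A reduced model of the ordering issue is sketched below. It keeps only the
-arithmetic of queue_adjust_cache_locked() as it looks after the LIFO-queue
-rework (no locking, no real work items): with the old order, a completion
-landing at ci == 2 * limit - 1 still sees ci + pending == 2 * limit, skips
-rescheduling, and leaves fill_to_high_water set with no work queued.
-
-    #include <stdbool.h>
-
-    struct ent {
-        unsigned long ci;       /* mkeys currently stored */
-        unsigned int pending;   /* async creations in flight */
-        unsigned int limit;     /* 2 * limit is the high-water mark */
-        bool fill_to_high_water;
-        bool work_queued;
-    };
-
-    void queue_adjust(struct ent *e)
-    {
-        if (e->ci < e->limit) {
-            e->fill_to_high_water = true;
-            e->work_queued = true;
-        } else if (e->fill_to_high_water &&
-                   e->ci + e->pending < 2 * e->limit) {
-            e->work_queued = true;              /* keep filling */
-        } else if (e->ci == 2 * e->limit) {
-            e->fill_to_high_water = false;      /* high water reached */
-        }
-    }
-
-    /* Completion of one async mkey creation, with the fixed ordering. */
-    void create_done(struct ent *e)
-    {
-        e->ci++;            /* push_mkey_locked() */
-        e->pending--;       /* must happen before the re-adjust ... */
-        queue_adjust(e);    /* ... so ci + pending reflects reality */
-    }
-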
-Fixes: 57e7071683ef ("RDMA/mlx5: Implement mkeys management via LIFO queue")
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Link: https://patch.msgid.link/0f44f462ba22e45f72cb3d0ec6a748634086b8d0.1725362530.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -208,9 +208,9 @@ static void create_mkey_callback(int sta
-
- spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
- push_mkey_locked(ent, mkey_out->mkey);
-+ ent->pending--;
- /* If we are doing fill_to_high_water then keep going. */
- queue_adjust_cache_locked(ent);
-- ent->pending--;
- spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
- kfree(mkey_out);
- }
+++ /dev/null
-From 374012b0045780b7ad498be62e85153009bb7fe9 Mon Sep 17 00:00:00 2001
-From: Shay Drory <shayd@nvidia.com>
-Date: Tue, 12 Sep 2023 13:07:45 +0300
-Subject: RDMA/mlx5: Fix mkey cache possible deadlock on cleanup
-
-From: Shay Drory <shayd@nvidia.com>
-
-commit 374012b0045780b7ad498be62e85153009bb7fe9 upstream.
-
-Fix the deadlock by refactoring the MR cache cleanup flow to flush the
-workqueue without holding the rb_lock.
-This adds a race between cache cleanup and creation of new entries, which
-we solve by denying creation of new entries after cache cleanup has
-started (a condensed sketch of the reordered flow follows the lockdep
-report below).
-
-Lockdep:
-WARNING: possible circular locking dependency detected
- [ 2785.326074 ] 6.2.0-rc6_for_upstream_debug_2023_01_31_14_02 #1 Not tainted
- [ 2785.339778 ] ------------------------------------------------------
- [ 2785.340848 ] devlink/53872 is trying to acquire lock:
- [ 2785.341701 ] ffff888124f8c0c8 ((work_completion)(&(&ent->dwork)->work)){+.+.}-{0:0}, at: __flush_work+0xc8/0x900
- [ 2785.343403 ]
- [ 2785.343403 ] but task is already holding lock:
- [ 2785.344464 ] ffff88817e8f1260 (&dev->cache.rb_lock){+.+.}-{3:3}, at: mlx5_mkey_cache_cleanup+0x77/0x250 [mlx5_ib]
- [ 2785.346273 ]
- [ 2785.346273 ] which lock already depends on the new lock.
- [ 2785.346273 ]
- [ 2785.347720 ]
- [ 2785.347720 ] the existing dependency chain (in reverse order) is:
- [ 2785.349003 ]
- [ 2785.349003 ] -> #1 (&dev->cache.rb_lock){+.+.}-{3:3}:
- [ 2785.350160 ] __mutex_lock+0x14c/0x15c0
- [ 2785.350962 ] delayed_cache_work_func+0x2d1/0x610 [mlx5_ib]
- [ 2785.352044 ] process_one_work+0x7c2/0x1310
- [ 2785.352879 ] worker_thread+0x59d/0xec0
- [ 2785.353636 ] kthread+0x28f/0x330
- [ 2785.354370 ] ret_from_fork+0x1f/0x30
- [ 2785.355135 ]
- [ 2785.355135 ] -> #0 ((work_completion)(&(&ent->dwork)->work)){+.+.}-{0:0}:
- [ 2785.356515 ] __lock_acquire+0x2d8a/0x5fe0
- [ 2785.357349 ] lock_acquire+0x1c1/0x540
- [ 2785.358121 ] __flush_work+0xe8/0x900
- [ 2785.358852 ] __cancel_work_timer+0x2c7/0x3f0
- [ 2785.359711 ] mlx5_mkey_cache_cleanup+0xfb/0x250 [mlx5_ib]
- [ 2785.360781 ] mlx5_ib_stage_pre_ib_reg_umr_cleanup+0x16/0x30 [mlx5_ib]
- [ 2785.361969 ] __mlx5_ib_remove+0x68/0x120 [mlx5_ib]
- [ 2785.362960 ] mlx5r_remove+0x63/0x80 [mlx5_ib]
- [ 2785.363870 ] auxiliary_bus_remove+0x52/0x70
- [ 2785.364715 ] device_release_driver_internal+0x3c1/0x600
- [ 2785.365695 ] bus_remove_device+0x2a5/0x560
- [ 2785.366525 ] device_del+0x492/0xb80
- [ 2785.367276 ] mlx5_detach_device+0x1a9/0x360 [mlx5_core]
- [ 2785.368615 ] mlx5_unload_one_devl_locked+0x5a/0x110 [mlx5_core]
- [ 2785.369934 ] mlx5_devlink_reload_down+0x292/0x580 [mlx5_core]
- [ 2785.371292 ] devlink_reload+0x439/0x590
- [ 2785.372075 ] devlink_nl_cmd_reload+0xaef/0xff0
- [ 2785.372973 ] genl_family_rcv_msg_doit.isra.0+0x1bd/0x290
- [ 2785.374011 ] genl_rcv_msg+0x3ca/0x6c0
- [ 2785.374798 ] netlink_rcv_skb+0x12c/0x360
- [ 2785.375612 ] genl_rcv+0x24/0x40
- [ 2785.376295 ] netlink_unicast+0x438/0x710
- [ 2785.377121 ] netlink_sendmsg+0x7a1/0xca0
- [ 2785.377926 ] sock_sendmsg+0xc5/0x190
- [ 2785.378668 ] __sys_sendto+0x1bc/0x290
- [ 2785.379440 ] __x64_sys_sendto+0xdc/0x1b0
- [ 2785.380255 ] do_syscall_64+0x3d/0x90
- [ 2785.381031 ] entry_SYSCALL_64_after_hwframe+0x46/0xb0
- [ 2785.381967 ]
- [ 2785.381967 ] other info that might help us debug this:
- [ 2785.381967 ]
- [ 2785.383448 ] Possible unsafe locking scenario:
- [ 2785.383448 ]
- [ 2785.384544 ] CPU0 CPU1
- [ 2785.385383 ] ---- ----
- [ 2785.386193 ] lock(&dev->cache.rb_lock);
- [ 2785.386940 ] lock((work_completion)(&(&ent->dwork)->work));
- [ 2785.388327 ] lock(&dev->cache.rb_lock);
- [ 2785.389425 ] lock((work_completion)(&(&ent->dwork)->work));
- [ 2785.390414 ]
- [ 2785.390414 ] *** DEADLOCK ***
- [ 2785.390414 ]
- [ 2785.391579 ] 6 locks held by devlink/53872:
- [ 2785.392341 ] #0: ffffffff84c17a50 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40
- [ 2785.393630 ] #1: ffff888142280218 (&devlink->lock_key){+.+.}-{3:3}, at: devlink_get_from_attrs_lock+0x12d/0x2d0
- [ 2785.395324 ] #2: ffff8881422d3c38 (&dev->lock_key){+.+.}-{3:3}, at: mlx5_unload_one_devl_locked+0x4a/0x110 [mlx5_core]
- [ 2785.397322 ] #3: ffffffffa0e59068 (mlx5_intf_mutex){+.+.}-{3:3}, at: mlx5_detach_device+0x60/0x360 [mlx5_core]
- [ 2785.399231 ] #4: ffff88810e3cb0e8 (&dev->mutex){....}-{3:3}, at: device_release_driver_internal+0x8d/0x600
- [ 2785.400864 ] #5: ffff88817e8f1260 (&dev->cache.rb_lock){+.+.}-{3:3}, at: mlx5_mkey_cache_cleanup+0x77/0x250 [mlx5_ib]
-
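-A condensed, stand-alone sketch of the reordered cleanup is shown below;
-the mutex/workqueue helpers are stubs standing in for the kernel API. The
-point is that nothing waits on a work item while rb_lock is held, and the
-new cache.disable flag makes cache_ent_find_and_store() refuse to create
-entries once cleanup has started.
-
-    #include <stdbool.h>
-
-    /* Stand-ins for the kernel primitives used on this path. */
-    struct mutex { int locked; };
-    struct workqueue { int unused; };
-    struct ent { bool disabled; };
-
-    struct cache {
-        struct mutex rb_lock;
-        struct workqueue *wq;
-        bool disable;
-        struct ent *ents;
-        int nents;
-    };
-
-    static void mutex_lock(struct mutex *m)   { m->locked = 1; }
-    static void mutex_unlock(struct mutex *m) { m->locked = 0; }
-    /* Waits for every queued work; must not be called under rb_lock. */
-    static void flush_workqueue(struct workqueue *wq) { (void)wq; }
-
-    void cache_cleanup(struct cache *cache)
-    {
-        int i;
-
-        mutex_lock(&cache->rb_lock);
-        cache->disable = true;               /* refuse new cache entries */
-        for (i = 0; i < cache->nents; i++)
-            cache->ents[i].disabled = true;  /* entries stop rescheduling */
-        mutex_unlock(&cache->rb_lock);
-
-        /*
-         * Only now wait for works: we no longer hold rb_lock, so a work
-         * that needs it cannot deadlock against us.
-         */
-        flush_workqueue(cache->wq);
-
-        /* Safe to walk and destroy: no concurrent work remains. */
-        mutex_lock(&cache->rb_lock);
-        /* ... destroy stored mkeys and free the entries ... */
-        mutex_unlock(&cache->rb_lock);
-    }
-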
-Fixes: b95845178328 ("RDMA/mlx5: Change the cache structure to an RB-tree")
-Signed-off-by: Shay Drory <shayd@nvidia.com>
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 +
- drivers/infiniband/hw/mlx5/mr.c | 16 ++++++++++++++--
- 2 files changed, 15 insertions(+), 2 deletions(-)
-
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -821,6 +821,7 @@ struct mlx5_mkey_cache {
- struct dentry *fs_root;
- unsigned long last_add;
- struct delayed_work remove_ent_dwork;
-+ u8 disable: 1;
- };
-
- struct mlx5_ib_port_resources {
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -994,19 +994,27 @@ int mlx5_mkey_cache_cleanup(struct mlx5_
- if (!dev->cache.wq)
- return 0;
-
-- cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
- mutex_lock(&dev->cache.rb_lock);
-+ dev->cache.disable = true;
- for (node = rb_first(root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
- spin_lock_irq(&ent->mkeys_queue.lock);
- ent->disabled = true;
- spin_unlock_irq(&ent->mkeys_queue.lock);
-- cancel_delayed_work_sync(&ent->dwork);
- }
-+ mutex_unlock(&dev->cache.rb_lock);
-+
-+ /*
-+ * After all entries are disabled and will not reschedule on WQ,
-+ * flush it and all async commands.
-+ */
-+ flush_workqueue(dev->cache.wq);
-
- mlx5_mkey_cache_debugfs_cleanup(dev);
- mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
-
-+ /* At this point all entries are disabled and have no concurrent work. */
-+ mutex_lock(&dev->cache.rb_lock);
- node = rb_first(root);
- while (node) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
-@@ -1789,6 +1797,10 @@ static int cache_ent_find_and_store(stru
- }
-
- mutex_lock(&cache->rb_lock);
-+ if (cache->disable) {
-+ mutex_unlock(&cache->rb_lock);
-+ return 0;
-+ }
- ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
- if (ent) {
- if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
+++ /dev/null
-From a53e215f90079f617360439b1b6284820731e34c Mon Sep 17 00:00:00 2001
-From: Moshe Shemesh <moshe@nvidia.com>
-Date: Wed, 25 Oct 2023 20:49:59 +0300
-Subject: RDMA/mlx5: Fix mkey cache WQ flush
-
-From: Moshe Shemesh <moshe@nvidia.com>
-
-commit a53e215f90079f617360439b1b6284820731e34c upstream.
-
-The cited patch tries to ensure there are no pending works on the mkey
-cache workqueue by disabling the addition of new works and calling
-flush_workqueue(). But this workqueue also has delayed works which might
-still be waiting out their delay time before being queued.
-
-Add cancel_delayed_work() for the delayed works which are still waiting
-to be queued, and then flush_workqueue() will flush all works which are
-already queued and running.
-
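-A toy model of the distinction follows (plain C stand-ins, not the real
-workqueue API): a delayed work whose timer has not fired yet is not
-"queued", so a flush alone does not wait for it; it has to be cancelled
-first, and the flush then covers everything already queued or running.
-
-    #include <stdbool.h>
-
-    struct delayed_work { bool timer_armed; bool queued; };
-
-    void cancel_delayed_work_sketch(struct delayed_work *w)
-    {
-        w->timer_armed = false;     /* it will never be queued now */
-    }
-
-    void flush_queued_work_sketch(struct delayed_work *w)
-    {
-        if (w->queued)
-            w->queued = false;      /* wait for it to finish */
-        /* a work still sitting on its timer is untouched by a flush */
-    }
-
-    /* Cleanup prologue after this fix: disarm first, then drain. */
-    void cleanup_prologue_sketch(struct delayed_work *dwork)
-    {
-        cancel_delayed_work_sketch(dwork);
-        flush_queued_work_sketch(dwork);
-    }
-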
-Fixes: 374012b00457 ("RDMA/mlx5: Fix mkey cache possible deadlock on cleanup")
-Link: https://lore.kernel.org/r/b8722f14e7ed81452f791764a26d2ed4cfa11478.1698256179.git.leon@kernel.org
-Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
-Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 2 ++
- 1 file changed, 2 insertions(+)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -1007,11 +1007,13 @@ int mlx5_mkey_cache_cleanup(struct mlx5_
- return 0;
-
- mutex_lock(&dev->cache.rb_lock);
-+ cancel_delayed_work(&dev->cache.remove_ent_dwork);
- for (node = rb_first(root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
- spin_lock_irq(&ent->mkeys_queue.lock);
- ent->disabled = true;
- spin_unlock_irq(&ent->mkeys_queue.lock);
-+ cancel_delayed_work(&ent->dwork);
- }
- mutex_unlock(&dev->cache.rb_lock);
-
+++ /dev/null
-From 828cf5936bea2438c21a3a6c303b34a2a1f6c3c2 Mon Sep 17 00:00:00 2001
-From: Leon Romanovsky <leonro@nvidia.com>
-Date: Thu, 2 Feb 2023 11:03:06 +0200
-Subject: RDMA/mlx5: Fix MR cache debugfs error in IB representors mode
-
-From: Leon Romanovsky <leonro@nvidia.com>
-
-commit 828cf5936bea2438c21a3a6c303b34a2a1f6c3c2 upstream.
-
-Block MR cache debugfs creation in the IB representor flow, as the MR cache
-shouldn't be used at all in that mode. As part of this change, add the
-missing debugfs cleanup in the error path too.
-
-This change fixes the following debugfs errors:
-
- bond0: (slave enp8s0f1): Enslaving as a backup interface with an up link
- mlx5_core 0000:08:00.0: lag map: port 1:1 port 2:1
- mlx5_core 0000:08:00.0: shared_fdb:1 mode:queue_affinity
- mlx5_core 0000:08:00.0: Operation mode is single FDB
- debugfs: Directory '2' with parent '/' already present!
-...
- debugfs: Directory '22' with parent '/' already present!
-
-Fixes: 73d09b2fe833 ("RDMA/mlx5: Introduce mlx5r_cache_rb_key")
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Link: https://lore.kernel.org/r/482a78c54acbcfa1742a0e06a452546428900ffa.1675328463.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 4 ++++
- 1 file changed, 4 insertions(+)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -789,6 +789,9 @@ static void mlx5_mkey_cache_debugfs_add_
- int order = order_base_2(ent->rb_key.ndescs);
- struct dentry *dir;
-
-+ if (!mlx5_debugfs_root || dev->is_rep)
-+ return;
-+
- if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
- order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
-
-@@ -977,6 +980,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_
-
- err:
- mutex_unlock(&cache->rb_lock);
-+ mlx5_mkey_cache_debugfs_cleanup(dev);
- mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
- return ret;
- }
+++ /dev/null
-From 7ebb00cea49db641b458edef0ede389f7004821d Mon Sep 17 00:00:00 2001
-From: Michael Guralnik <michaelgur@nvidia.com>
-Date: Tue, 3 Sep 2024 14:24:50 +0300
-Subject: RDMA/mlx5: Fix MR cache temp entries cleanup
-
-From: Michael Guralnik <michaelgur@nvidia.com>
-
-commit 7ebb00cea49db641b458edef0ede389f7004821d upstream.
-
-Fix the cleanup of the temp cache entries that are dynamically created
-in the MR cache.
-
-The cleanup of the temp cache entries is currently scheduled only when a
-new entry is created. Since the cleanup only destroys the mkeys and the
-cache entry itself stays in the cache, subsequent registrations might
-reuse the entry and it will eventually be filled with new mkeys without
-cleanup ever getting scheduled again.
-
-On workloads that register and deregister MRs with a wide range of
-properties, we see that the cache ends up holding many cache entries,
-each holding the max number of mkeys that were ever used through it.
-
-Additionally, as the cleanup work is scheduled to run over the whole
-cache, any mkey that is returned to the cache after the cleanup was
-scheduled will be held for less than the intended 30 second timeout.
-
-Solve both issues by dropping the existing remove_ent_work and reusing
-the existing per-entry work to also handle the temp entries cleanup.
-
-Schedule the work to run with a 30 second delay every time we push an
-mkey to a clean temp entry.
-This ensures the cleanup runs on each entry only 30 seconds after the
-first mkey was pushed to an empty entry.
-
-Since we already distinguish between persistent and temp entries when
-scheduling cache_work_func, it is not scheduled from any other flow for
-the temp entries.
-
-Another benefit of moving to a per-entry cleanup is that we are no longer
-required to hold the rb_tree mutex, thus enabling other flows to run
-concurrently.
-
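-A small sketch of the new scheduling rule is given below (simplified
-stand-in types; the real code arms mod_delayed_work() on ent->dwork under
-mkeys_queue.lock): the per-entry work is armed once when an mkey lands in
-a clean temp entry, and the work re-enables arming after it drains the
-entry.
-
-    #include <stdbool.h>
-
-    struct tmp_ent {
-        bool is_tmp;
-        bool tmp_cleanup_scheduled;
-        unsigned long ci;               /* stored mkeys */
-    };
-
-    /* Stand-in for mod_delayed_work(wq, &ent->dwork, msecs_to_jiffies(30 * 1000)). */
-    void schedule_cleanup_in_30s(struct tmp_ent *ent) { (void)ent; }
-
-    /* Release path: an mkey was just pushed back into 'ent'. */
-    void on_mkey_stored(struct tmp_ent *ent)
-    {
-        if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
-            schedule_cleanup_in_30s(ent);   /* armed once per dirty period */
-            ent->tmp_cleanup_scheduled = true;
-        }
-    }
-
-    /* Delayed work for a temp entry: drain it, then allow re-arming. */
-    void tmp_cleanup_work(struct tmp_ent *ent)
-    {
-        while (ent->ci)
-            ent->ci--;                      /* destroy one stored mkey */
-        ent->tmp_cleanup_scheduled = false; /* the next store re-arms */
-    }
-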
-Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow")
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Link: https://patch.msgid.link/e4fa4bb03bebf20dceae320f26816cd2dde23a26.1725362530.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2
- drivers/infiniband/hw/mlx5/mr.c | 85 +++++++++++++----------------------
- 2 files changed, 34 insertions(+), 53 deletions(-)
-
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -790,6 +790,7 @@ struct mlx5_cache_ent {
- u8 is_tmp:1;
- u8 disabled:1;
- u8 fill_to_high_water:1;
-+ u8 tmp_cleanup_scheduled:1;
-
- /*
- * - limit is the low water mark for stored mkeys, 2* limit is the
-@@ -821,7 +822,6 @@ struct mlx5_mkey_cache {
- struct mutex rb_lock;
- struct dentry *fs_root;
- unsigned long last_add;
-- struct delayed_work remove_ent_dwork;
- };
-
- struct mlx5_ib_port_resources {
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -525,6 +525,23 @@ static void queue_adjust_cache_locked(st
- }
- }
-
-+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
-+{
-+ u32 mkey;
-+
-+ cancel_delayed_work(&ent->dwork);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
-+ while (ent->mkeys_queue.ci) {
-+ mkey = pop_mkey_locked(ent);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
-+ mlx5_core_destroy_mkey(dev->mdev, mkey);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
-+ }
-+ ent->tmp_cleanup_scheduled = false;
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
-+}
-+
-+
- static void __cache_work_func(struct mlx5_cache_ent *ent)
- {
- struct mlx5_ib_dev *dev = ent->dev;
-@@ -596,7 +613,11 @@ static void delayed_cache_work_func(stru
- struct mlx5_cache_ent *ent;
-
- ent = container_of(work, struct mlx5_cache_ent, dwork.work);
-- __cache_work_func(ent);
-+ /* temp entries are never filled, only cleaned */
-+ if (ent->is_tmp)
-+ clean_keys(ent->dev, ent);
-+ else
-+ __cache_work_func(ent);
- }
-
- static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
-@@ -771,21 +792,6 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(s
- return _mlx5_mr_cache_alloc(dev, ent, access_flags);
- }
-
--static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
--{
-- u32 mkey;
--
-- cancel_delayed_work(&ent->dwork);
-- spin_lock_irq(&ent->mkeys_queue.lock);
-- while (ent->mkeys_queue.ci) {
-- mkey = pop_mkey_locked(ent);
-- spin_unlock_irq(&ent->mkeys_queue.lock);
-- mlx5_core_destroy_mkey(dev->mdev, mkey);
-- spin_lock_irq(&ent->mkeys_queue.lock);
-- }
-- spin_unlock_irq(&ent->mkeys_queue.lock);
--}
--
- static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
- {
- if (!mlx5_debugfs_root || dev->is_rep)
-@@ -898,10 +904,6 @@ mlx5r_cache_create_ent_locked(struct mlx
- ent->limit = 0;
-
- mlx5_mkey_cache_debugfs_add_ent(dev, ent);
-- } else {
-- mod_delayed_work(ent->dev->cache.wq,
-- &ent->dev->cache.remove_ent_dwork,
-- msecs_to_jiffies(30 * 1000));
- }
-
- return ent;
-@@ -912,35 +914,6 @@ mkeys_err:
- return ERR_PTR(ret);
- }
-
--static void remove_ent_work_func(struct work_struct *work)
--{
-- struct mlx5_mkey_cache *cache;
-- struct mlx5_cache_ent *ent;
-- struct rb_node *cur;
--
-- cache = container_of(work, struct mlx5_mkey_cache,
-- remove_ent_dwork.work);
-- mutex_lock(&cache->rb_lock);
-- cur = rb_last(&cache->rb_root);
-- while (cur) {
-- ent = rb_entry(cur, struct mlx5_cache_ent, node);
-- cur = rb_prev(cur);
-- mutex_unlock(&cache->rb_lock);
--
-- spin_lock_irq(&ent->mkeys_queue.lock);
-- if (!ent->is_tmp) {
-- spin_unlock_irq(&ent->mkeys_queue.lock);
-- mutex_lock(&cache->rb_lock);
-- continue;
-- }
-- spin_unlock_irq(&ent->mkeys_queue.lock);
--
-- clean_keys(ent->dev, ent);
-- mutex_lock(&cache->rb_lock);
-- }
-- mutex_unlock(&cache->rb_lock);
--}
--
- int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- {
- struct mlx5_mkey_cache *cache = &dev->cache;
-@@ -956,7 +929,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_
- mutex_init(&dev->slow_path_mutex);
- mutex_init(&dev->cache.rb_lock);
- dev->cache.rb_root = RB_ROOT;
-- INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
- cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
- if (!cache->wq) {
- mlx5_ib_warn(dev, "failed to create work queue\n");
-@@ -1007,7 +979,6 @@ int mlx5_mkey_cache_cleanup(struct mlx5_
- return 0;
-
- mutex_lock(&dev->cache.rb_lock);
-- cancel_delayed_work(&dev->cache.remove_ent_dwork);
- for (node = rb_first(root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
- spin_lock_irq(&ent->mkeys_queue.lock);
-@@ -1844,8 +1815,18 @@ static int mlx5_revoke_mr(struct mlx5_ib
- struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
- struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
-
-- if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr))
-+ if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
-+ ent = mr->mmkey.cache_ent;
-+ /* upon storing to a clean temp entry - schedule its cleanup */
-+ spin_lock_irq(&ent->mkeys_queue.lock);
-+ if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
-+ mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
-+ msecs_to_jiffies(30 * 1000));
-+ ent->tmp_cleanup_scheduled = true;
-+ }
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- return 0;
-+ }
-
- if (ent) {
- spin_lock_irq(&ent->mkeys_queue.lock);
+++ /dev/null
-From 15ed43c7d41f9929ea55919272003c7ba5aec402 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Sun, 19 Jan 2025 14:36:13 +0200
-Subject: RDMA/mlx5: Fix the recovery flow of the UMR QP
-
-From: Yishai Hadas <yishaih@nvidia.com>
-
-[ Upstream commit d97505baea64d93538b16baf14ce7b8c1fbad746 ]
-
-This patch addresses an issue in the recovery flow of the UMR QP,
-ensuring tasks do not get stuck, as highlighted by the call trace [1].
-
-During recovery, before transitioning the QP to the RESET state, the
-software must wait for all outstanding WRs to complete.
-
-Failing to do so can cause the firmware to skip sending some flushed
-CQEs with errors and simply discard them upon the RESET, as per the IB
-specification.
-
-This race condition can result in lost CQEs and tasks becoming stuck.
-
-To resolve this, the patch sends a final WR which serves only as a
-barrier before moving the QP state to RESET.
-
-Once a CQE is received for that final WR, it guarantees that no
-outstanding WRs remain, making it safe to transition the QP to RESET and
-subsequently back to RTS, restoring proper functionality.
-
-Note:
-For the barrier WR, we simply reuse the failed and ready WR.
-Since the QP is in an error state, it will only receive
-IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier, we don't
-care about its status (a condensed sketch of the recovery sequence
-follows the call trace below).
-
-[1]
-INFO: task rdma_resource_l:1922 blocked for more than 120 seconds.
-Tainted: G W 6.12.0-rc7+ #1626
-"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
-task:rdma_resource_l state:D stack:0 pid:1922 tgid:1922 ppid:1369
- flags:0x00004004
-Call Trace:
-<TASK>
-__schedule+0x420/0xd30
-schedule+0x47/0x130
-schedule_timeout+0x280/0x300
-? mark_held_locks+0x48/0x80
-? lockdep_hardirqs_on_prepare+0xe5/0x1a0
-wait_for_completion+0x75/0x130
-mlx5r_umr_post_send_wait+0x3c2/0x5b0 [mlx5_ib]
-? __pfx_mlx5r_umr_done+0x10/0x10 [mlx5_ib]
-mlx5r_umr_revoke_mr+0x93/0xc0 [mlx5_ib]
-__mlx5_ib_dereg_mr+0x299/0x520 [mlx5_ib]
-? _raw_spin_unlock_irq+0x24/0x40
-? wait_for_completion+0xfe/0x130
-? rdma_restrack_put+0x63/0xe0 [ib_core]
-ib_dereg_mr_user+0x5f/0x120 [ib_core]
-? lock_release+0xc6/0x280
-destroy_hw_idr_uobject+0x1d/0x60 [ib_uverbs]
-uverbs_destroy_uobject+0x58/0x1d0 [ib_uverbs]
-uobj_destroy+0x3f/0x70 [ib_uverbs]
-ib_uverbs_cmd_verbs+0x3e4/0xbb0 [ib_uverbs]
-? __pfx_uverbs_destroy_def_handler+0x10/0x10 [ib_uverbs]
-? __lock_acquire+0x64e/0x2080
-? mark_held_locks+0x48/0x80
-? find_held_lock+0x2d/0xa0
-? lock_acquire+0xc1/0x2f0
-? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs]
-? __fget_files+0xc3/0x1b0
-ib_uverbs_ioctl+0xe7/0x170 [ib_uverbs]
-? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs]
-__x64_sys_ioctl+0x1b0/0xa70
-do_syscall_64+0x6b/0x140
-entry_SYSCALL_64_after_hwframe+0x76/0x7e
-RIP: 0033:0x7f99c918b17b
-RSP: 002b:00007ffc766d0468 EFLAGS: 00000246 ORIG_RAX:
- 0000000000000010
-RAX: ffffffffffffffda RBX: 00007ffc766d0578 RCX:
- 00007f99c918b17b
-RDX: 00007ffc766d0560 RSI: 00000000c0181b01 RDI:
- 0000000000000003
-RBP: 00007ffc766d0540 R08: 00007f99c8f99010 R09:
- 000000000000bd7e
-R10: 00007f99c94c1c70 R11: 0000000000000246 R12:
- 00007ffc766d0530
-R13: 000000000000001c R14: 0000000040246a80 R15:
- 0000000000000000
-</TASK>
-
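-Below is a condensed sketch of the recovery sequence with stand-in helpers
-(the real code lives in mlx5r_umr_recover() and operates on umr_common):
-mark the UMR state as RECOVER so nothing else posts WRs, repost the failed
-WR purely as a barrier, wait for its (flush-error) completion, and only
-then walk the QP through RESET and back to RTS.
-
-    /* Stand-ins for the verbs/driver helpers used on this path. */
-    enum umr_state { UMR_ACTIVE, UMR_RECOVER, UMR_ERR };
-
-    struct umr { enum umr_state state; };
-
-    int post_barrier_wr(struct umr *u)    { (void)u; return 0; }
-    void wait_barrier_cqe(struct umr *u)  { (void)u; } /* status ignored */
-    int qp_to_reset(struct umr *u)        { (void)u; return 0; }
-    int qp_reset_to_rts(struct umr *u)    { (void)u; return 0; }
-
-    int umr_recover_sketch(struct umr *u)
-    {
-        u->state = UMR_RECOVER;   /* block any further WR posting */
-
-        /* Barrier: its CQE means no older WR is still outstanding. */
-        if (post_barrier_wr(u))
-            goto err;
-        wait_barrier_cqe(u);
-
-        /* Only now is it safe to reset without losing flushed CQEs. */
-        if (qp_to_reset(u) || qp_reset_to_rts(u))
-            goto err;
-
-        u->state = UMR_ACTIVE;
-        return 0;
-    err:
-        u->state = UMR_ERR;
-        return -1;
-    }
-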
-Fixes: 158e71bb69e3 ("RDMA/mlx5: Add a umr recovery flow")
-Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
-Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
-Link: https://patch.msgid.link/27b51b92ec42dfb09d8096fcbd51878f397ce6ec.1737290141.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/umr.c | 83 +++++++++++++++++++++-----------
- 1 file changed, 56 insertions(+), 27 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
-index fa000182d0b41..1a39e86178ece 100644
---- a/drivers/infiniband/hw/mlx5/umr.c
-+++ b/drivers/infiniband/hw/mlx5/umr.c
-@@ -199,30 +199,6 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
- ib_dealloc_pd(dev->umrc.pd);
- }
-
--static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
--{
-- struct umr_common *umrc = &dev->umrc;
-- struct ib_qp_attr attr;
-- int err;
--
-- attr.qp_state = IB_QPS_RESET;
-- err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
-- if (err) {
-- mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
-- goto err;
-- }
--
-- err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
-- if (err)
-- goto err;
--
-- umrc->state = MLX5_UMR_STATE_ACTIVE;
-- return 0;
--
--err:
-- umrc->state = MLX5_UMR_STATE_ERR;
-- return err;
--}
-
- static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
- struct mlx5r_umr_wqe *wqe, bool with_data)
-@@ -270,6 +246,61 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
- return err;
- }
-
-+static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey,
-+ struct mlx5r_umr_context *umr_context,
-+ struct mlx5r_umr_wqe *wqe, bool with_data)
-+{
-+ struct umr_common *umrc = &dev->umrc;
-+ struct ib_qp_attr attr;
-+ int err;
-+
-+ mutex_lock(&umrc->lock);
-+ /* Preventing any further WRs to be sent now */
-+ if (umrc->state != MLX5_UMR_STATE_RECOVER) {
-+ mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
-+ umrc->state);
-+ umrc->state = MLX5_UMR_STATE_RECOVER;
-+ }
-+ mutex_unlock(&umrc->lock);
-+
-+ /* Sending a final/barrier WR (the failed one) and wait for its completion.
-+ * This will ensure that all the previous WRs got a completion before
-+ * we set the QP state to RESET.
-+ */
-+ err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
-+ with_data);
-+ if (err) {
-+ mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err);
-+ goto err;
-+ }
-+
-+ /* Since the QP is in an error state, it will only receive
-+ * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier
-+ * we don't care about its status.
-+ */
-+ wait_for_completion(&umr_context->done);
-+
-+ attr.qp_state = IB_QPS_RESET;
-+ err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
-+ if (err) {
-+ mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err);
-+ goto err;
-+ }
-+
-+ err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
-+ if (err) {
-+ mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err);
-+ goto err;
-+ }
-+
-+ umrc->state = MLX5_UMR_STATE_ACTIVE;
-+ return 0;
-+
-+err:
-+ umrc->state = MLX5_UMR_STATE_ERR;
-+ return err;
-+}
-+
- static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
- {
- struct mlx5_ib_umr_context *context =
-@@ -334,9 +365,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
- mlx5_ib_warn(dev,
- "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
- umr_context.status, mkey);
-- mutex_lock(&umrc->lock);
-- err = mlx5r_umr_recover(dev);
-- mutex_unlock(&umrc->lock);
-+ err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data);
- if (err)
- mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
- err);
---
-2.39.5
-
+++ /dev/null
-From f637040c3339a2ed8c12d65ad03f9552386e2fe7 Mon Sep 17 00:00:00 2001
-From: Jason Gunthorpe <jgg@nvidia.com>
-Date: Tue, 28 May 2024 15:52:53 +0300
-Subject: RDMA/mlx5: Follow rb_key.ats when creating new mkeys
-
-From: Jason Gunthorpe <jgg@nvidia.com>
-
-commit f637040c3339a2ed8c12d65ad03f9552386e2fe7 upstream.
-
-When a cache entry already exists but doesn't have any mkeys in it, the
-cache will automatically create a new one based on the specification in
-ent->rb_key.
-
-ent->ats was missed when creating the new mkey, and so ma_translation_mode
-was not being set even though the entry requires it.
-
-Cc: stable@vger.kernel.org
-Fixes: 73d09b2fe833 ("RDMA/mlx5: Introduce mlx5r_cache_rb_key")
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
-Link: https://lore.kernel.org/r/7c5613458ecb89fbe5606b7aa4c8d990bdea5b9a.1716900410.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 1 +
- 1 file changed, 1 insertion(+)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -243,6 +243,7 @@ static void set_cache_mkc(struct mlx5_ca
- MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
- MLX5_SET(mkc, mkc, access_mode_4_2,
- (ent->rb_key.access_mode >> 2) & 0x7);
-+ MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
-
- MLX5_SET(mkc, mkc, translations_octword_size,
- get_mkc_octo_size(ent->rb_key.access_mode,
+++ /dev/null
-From 73daa66bd410fa9662f7e4578ac5b58338c23b31 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 21 Sep 2023 11:07:16 +0300
-Subject: RDMA/mlx5: Implement mkeys management via LIFO queue
-
-From: Shay Drory <shayd@nvidia.com>
-
-[ Upstream commit 57e7071683ef6148c9f5ea0ba84598d2ba681375 ]
-
-Currently, mkeys are managed via an xarray. This implementation leads to
-a degradation in cases where many MRs are deregistered in parallel, due to
-the xarray's internal implementation; for example, deregistering 1M MRs via
-64 threads takes ~15% more time [1].
-
-Hence, implement mkey management via a LIFO queue (sketched below), which
-solves the degradation.
-
-[1]
-2.8us in kernel v5.19 compared to 3.2us in kernel v6.4
-
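-The sketch below is a self-contained userspace rendering of the paged LIFO
-idea (a hand-rolled page stack and malloc in place of list_head and
-GFP_ATOMIC, and no mkeys_queue.lock; the driver also pre-allocates the
-first page at entry creation): fixed-size pages are chained together, a
-single counter ci tracks how many mkeys are stored, and both push and pop
-operate on the tail page.
-
-    #include <stdint.h>
-    #include <stdlib.h>
-
-    #define MKEYS_PER_PAGE 64   /* the kernel sizes this to fill one page */
-
-    struct mkeys_page {
-        uint32_t mkeys[MKEYS_PER_PAGE];
-        struct mkeys_page *prev;    /* simple stack of pages */
-    };
-
-    struct mkeys_queue {
-        struct mkeys_page *tail;    /* page holding the top of the LIFO */
-        unsigned long ci;           /* number of stored mkeys */
-        unsigned long num_pages;
-    };
-
-    int push_mkey(struct mkeys_queue *q, uint32_t mkey)
-    {
-        if (q->ci >= q->num_pages * MKEYS_PER_PAGE) {
-            struct mkeys_page *page = calloc(1, sizeof(*page));
-
-            if (!page)
-                return -1;          /* the driver returns -ENOMEM here */
-            page->prev = q->tail;
-            q->tail = page;
-            q->num_pages++;
-        }
-        q->tail->mkeys[q->ci % MKEYS_PER_PAGE] = mkey;
-        q->ci++;
-        return 0;
-    }
-
-    /* Caller must ensure q->ci != 0, as the driver does. */
-    uint32_t pop_mkey(struct mkeys_queue *q)
-    {
-        unsigned long idx = (q->ci - 1) % MKEYS_PER_PAGE;
-        uint32_t mkey = q->tail->mkeys[idx];
-
-        q->ci--;
-        if (q->num_pages > 1 && idx == 0) { /* tail page just emptied */
-            struct mkeys_page *empty = q->tail;
-
-            q->tail = empty->prev;
-            q->num_pages--;
-            free(empty);
-        }
-        return mkey;
-    }
-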
-Signed-off-by: Shay Drory <shayd@nvidia.com>
-Link: https://lore.kernel.org/r/fde3d4cfab0f32f0ccb231cd113298256e1502c5.1695283384.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 21 +-
- drivers/infiniband/hw/mlx5/mr.c | 324 ++++++++++++---------------
- drivers/infiniband/hw/mlx5/umr.c | 4 +-
- 3 files changed, 169 insertions(+), 180 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-index 7c72e0e9db54a..024d2071c6a5d 100644
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -760,10 +760,25 @@ struct umr_common {
- unsigned int state;
- };
-
-+#define NUM_MKEYS_PER_PAGE \
-+ ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(u32))
-+
-+struct mlx5_mkeys_page {
-+ u32 mkeys[NUM_MKEYS_PER_PAGE];
-+ struct list_head list;
-+};
-+static_assert(sizeof(struct mlx5_mkeys_page) == PAGE_SIZE);
-+
-+struct mlx5_mkeys_queue {
-+ struct list_head pages_list;
-+ u32 num_pages;
-+ unsigned long ci;
-+ spinlock_t lock; /* sync list ops */
-+};
-+
- struct mlx5_cache_ent {
-- struct xarray mkeys;
-- unsigned long stored;
-- unsigned long reserved;
-+ struct mlx5_mkeys_queue mkeys_queue;
-+ u32 pending;
-
- char name[4];
-
-diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
-index 2c1a935734273..b66b8346c2dc6 100644
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -140,110 +140,47 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
- mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
- }
-
--static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
-- void *to_store)
-+static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
- {
-- XA_STATE(xas, &ent->mkeys, 0);
-- void *curr;
-+ unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
-+ struct mlx5_mkeys_page *page;
-
-- if (limit_pendings &&
-- (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
-- return -EAGAIN;
--
-- while (1) {
-- /*
-- * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
-- * doesn't transparently unlock. Instead we set the xas index to
-- * the current value of reserved every iteration.
-- */
-- xas_set(&xas, ent->reserved);
-- curr = xas_load(&xas);
-- if (!curr) {
-- if (to_store && ent->stored == ent->reserved)
-- xas_store(&xas, to_store);
-- else
-- xas_store(&xas, XA_ZERO_ENTRY);
-- if (xas_valid(&xas)) {
-- ent->reserved++;
-- if (to_store) {
-- if (ent->stored != ent->reserved)
-- __xa_store(&ent->mkeys,
-- ent->stored,
-- to_store,
-- GFP_KERNEL);
-- ent->stored++;
-- queue_adjust_cache_locked(ent);
-- WRITE_ONCE(ent->dev->cache.last_add,
-- jiffies);
-- }
-- }
-- }
-- xa_unlock_irq(&ent->mkeys);
--
-- /*
-- * Notice xas_nomem() must always be called as it cleans
-- * up any cached allocation.
-- */
-- if (!xas_nomem(&xas, GFP_KERNEL))
-- break;
-- xa_lock_irq(&ent->mkeys);
-+ lockdep_assert_held(&ent->mkeys_queue.lock);
-+ if (ent->mkeys_queue.ci >=
-+ ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
-+ page = kzalloc(sizeof(*page), GFP_ATOMIC);
-+ if (!page)
-+ return -ENOMEM;
-+ ent->mkeys_queue.num_pages++;
-+ list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
-+ } else {
-+ page = list_last_entry(&ent->mkeys_queue.pages_list,
-+ struct mlx5_mkeys_page, list);
- }
-- xa_lock_irq(&ent->mkeys);
-- if (xas_error(&xas))
-- return xas_error(&xas);
-- if (WARN_ON(curr))
-- return -EINVAL;
-- return 0;
--}
--
--static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
-- void *to_store)
--{
-- int ret;
--
-- xa_lock_irq(&ent->mkeys);
-- ret = push_mkey_locked(ent, limit_pendings, to_store);
-- xa_unlock_irq(&ent->mkeys);
-- return ret;
--}
--
--static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
--{
-- void *old;
--
-- ent->reserved--;
-- old = __xa_erase(&ent->mkeys, ent->reserved);
-- WARN_ON(old);
--}
--
--static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
--{
-- void *old;
-
-- old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
-- WARN_ON(old);
-- ent->stored++;
-+ page->mkeys[tmp] = mkey;
-+ ent->mkeys_queue.ci++;
-+ return 0;
- }
-
--static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
-+static int pop_mkey_locked(struct mlx5_cache_ent *ent)
- {
-- void *old, *xa_mkey;
--
-- ent->stored--;
-- ent->reserved--;
-+ unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
-+ struct mlx5_mkeys_page *last_page;
-+ u32 mkey;
-
-- if (ent->stored == ent->reserved) {
-- xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
-- WARN_ON(!xa_mkey);
-- return (u32)xa_to_value(xa_mkey);
-+ lockdep_assert_held(&ent->mkeys_queue.lock);
-+ last_page = list_last_entry(&ent->mkeys_queue.pages_list,
-+ struct mlx5_mkeys_page, list);
-+ mkey = last_page->mkeys[tmp];
-+ last_page->mkeys[tmp] = 0;
-+ ent->mkeys_queue.ci--;
-+ if (ent->mkeys_queue.num_pages > 1 && !tmp) {
-+ list_del(&last_page->list);
-+ ent->mkeys_queue.num_pages--;
-+ kfree(last_page);
- }
--
-- xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
-- GFP_KERNEL);
-- WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
-- old = __xa_erase(&ent->mkeys, ent->reserved);
-- WARN_ON(old);
-- return (u32)xa_to_value(xa_mkey);
-+ return mkey;
- }
-
- static void create_mkey_callback(int status, struct mlx5_async_work *context)
-@@ -257,10 +194,10 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
- if (status) {
- create_mkey_warn(dev, status, mkey_out->out);
- kfree(mkey_out);
-- xa_lock_irqsave(&ent->mkeys, flags);
-- undo_push_reserve_mkey(ent);
-+ spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
-+ ent->pending--;
- WRITE_ONCE(dev->fill_delay, 1);
-- xa_unlock_irqrestore(&ent->mkeys, flags);
-+ spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
- mod_timer(&dev->delay_timer, jiffies + HZ);
- return;
- }
-@@ -269,11 +206,12 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
- MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
- WRITE_ONCE(dev->cache.last_add, jiffies);
-
-- xa_lock_irqsave(&ent->mkeys, flags);
-- push_to_reserved(ent, mkey_out->mkey);
-+ spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
-+ push_mkey_locked(ent, mkey_out->mkey);
- /* If we are doing fill_to_high_water then keep going. */
- queue_adjust_cache_locked(ent);
-- xa_unlock_irqrestore(&ent->mkeys, flags);
-+ ent->pending--;
-+ spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
- kfree(mkey_out);
- }
-
-@@ -329,24 +267,28 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
- set_cache_mkc(ent, mkc);
- async_create->ent = ent;
-
-- err = push_mkey(ent, true, NULL);
-- if (err)
-+ spin_lock_irq(&ent->mkeys_queue.lock);
-+ if (ent->pending >= MAX_PENDING_REG_MR) {
-+ err = -EAGAIN;
- goto free_async_create;
-+ }
-+ ent->pending++;
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
-
- err = mlx5_ib_create_mkey_cb(async_create);
- if (err) {
- mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
-- goto err_undo_reserve;
-+ goto err_create_mkey;
- }
- }
-
- return 0;
-
--err_undo_reserve:
-- xa_lock_irq(&ent->mkeys);
-- undo_push_reserve_mkey(ent);
-- xa_unlock_irq(&ent->mkeys);
-+err_create_mkey:
-+ spin_lock_irq(&ent->mkeys_queue.lock);
-+ ent->pending--;
- free_async_create:
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- kfree(async_create);
- return err;
- }
-@@ -379,36 +321,36 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
- {
- u32 mkey;
-
-- lockdep_assert_held(&ent->mkeys.xa_lock);
-- if (!ent->stored)
-+ lockdep_assert_held(&ent->mkeys_queue.lock);
-+ if (!ent->mkeys_queue.ci)
- return;
-- mkey = pop_stored_mkey(ent);
-- xa_unlock_irq(&ent->mkeys);
-+ mkey = pop_mkey_locked(ent);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- }
-
- static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
- bool limit_fill)
-- __acquires(&ent->mkeys) __releases(&ent->mkeys)
-+ __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
- {
- int err;
-
-- lockdep_assert_held(&ent->mkeys.xa_lock);
-+ lockdep_assert_held(&ent->mkeys_queue.lock);
-
- while (true) {
- if (limit_fill)
- target = ent->limit * 2;
-- if (target == ent->reserved)
-+ if (target == ent->pending + ent->mkeys_queue.ci)
- return 0;
-- if (target > ent->reserved) {
-- u32 todo = target - ent->reserved;
-+ if (target > ent->pending + ent->mkeys_queue.ci) {
-+ u32 todo = target - (ent->pending + ent->mkeys_queue.ci);
-
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- err = add_keys(ent, todo);
- if (err == -EAGAIN)
- usleep_range(3000, 5000);
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- if (err) {
- if (err != -EAGAIN)
- return err;
-@@ -436,7 +378,7 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
- * cannot free MRs that are in use. Compute the target value for stored
- * mkeys.
- */
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- if (target < ent->in_use) {
- err = -EINVAL;
- goto err_unlock;
-@@ -449,12 +391,12 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
- err = resize_available_mrs(ent, target, false);
- if (err)
- goto err_unlock;
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
-
- return count;
-
- err_unlock:
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- return err;
- }
-
-@@ -465,7 +407,8 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
- char lbuf[20];
- int err;
-
-- err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
-+ err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
-+ ent->mkeys_queue.ci + ent->in_use);
- if (err < 0)
- return err;
-
-@@ -494,10 +437,10 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
- * Upon set we immediately fill the cache to high water mark implied by
- * the limit.
- */
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- ent->limit = var;
- err = resize_available_mrs(ent, 0, true);
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- if (err)
- return err;
- return count;
-@@ -533,9 +476,9 @@ static bool someone_adding(struct mlx5_mkey_cache *cache)
- mutex_lock(&cache->rb_lock);
- for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
-- xa_lock_irq(&ent->mkeys);
-- ret = ent->stored < ent->limit;
-- xa_unlock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
-+ ret = ent->mkeys_queue.ci < ent->limit;
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- if (ret) {
- mutex_unlock(&cache->rb_lock);
- return true;
-@@ -552,26 +495,26 @@ static bool someone_adding(struct mlx5_mkey_cache *cache)
- */
- static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
- {
-- lockdep_assert_held(&ent->mkeys.xa_lock);
-+ lockdep_assert_held(&ent->mkeys_queue.lock);
-
- if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
- return;
-- if (ent->stored < ent->limit) {
-+ if (ent->mkeys_queue.ci < ent->limit) {
- ent->fill_to_high_water = true;
- mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
- } else if (ent->fill_to_high_water &&
-- ent->reserved < 2 * ent->limit) {
-+ ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
- /*
- * Once we start populating due to hitting a low water mark
- * continue until we pass the high water mark.
- */
- mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
-- } else if (ent->stored == 2 * ent->limit) {
-+ } else if (ent->mkeys_queue.ci == 2 * ent->limit) {
- ent->fill_to_high_water = false;
-- } else if (ent->stored > 2 * ent->limit) {
-+ } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
- /* Queue deletion of excess entries */
- ent->fill_to_high_water = false;
-- if (ent->stored != ent->reserved)
-+ if (ent->pending)
- queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
- msecs_to_jiffies(1000));
- else
-@@ -585,15 +528,16 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
- struct mlx5_mkey_cache *cache = &dev->cache;
- int err;
-
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- if (ent->disabled)
- goto out;
-
-- if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
-+ if (ent->fill_to_high_water &&
-+ ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
- !READ_ONCE(dev->fill_delay)) {
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- err = add_keys(ent, 1);
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- if (ent->disabled)
- goto out;
- if (err) {
-@@ -611,7 +555,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
- msecs_to_jiffies(1000));
- }
- }
-- } else if (ent->stored > 2 * ent->limit) {
-+ } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
- bool need_delay;
-
- /*
-@@ -626,11 +570,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
- * the garbage collection work to try to run in next cycle, in
- * order to free CPU resources to other tasks.
- */
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- need_delay = need_resched() || someone_adding(cache) ||
- !time_after(jiffies,
- READ_ONCE(cache->last_add) + 300 * HZ);
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- if (ent->disabled)
- goto out;
- if (need_delay) {
-@@ -641,7 +585,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
- queue_adjust_cache_locked(ent);
- }
- out:
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- }
-
- static void delayed_cache_work_func(struct work_struct *work)
-@@ -749,25 +693,25 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
- if (!mr)
- return ERR_PTR(-ENOMEM);
-
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- ent->in_use++;
-
-- if (!ent->stored) {
-+ if (!ent->mkeys_queue.ci) {
- queue_adjust_cache_locked(ent);
- ent->miss++;
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- err = create_cache_mkey(ent, &mr->mmkey.key);
- if (err) {
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- ent->in_use--;
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- kfree(mr);
- return ERR_PTR(err);
- }
- } else {
-- mr->mmkey.key = pop_stored_mkey(ent);
-+ mr->mmkey.key = pop_mkey_locked(ent);
- queue_adjust_cache_locked(ent);
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- }
- mr->mmkey.cache_ent = ent;
- mr->mmkey.type = MLX5_MKEY_MR;
-@@ -820,14 +764,14 @@ static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
- u32 mkey;
-
- cancel_delayed_work(&ent->dwork);
-- xa_lock_irq(&ent->mkeys);
-- while (ent->stored) {
-- mkey = pop_stored_mkey(ent);
-- xa_unlock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
-+ while (ent->mkeys_queue.ci) {
-+ mkey = pop_mkey_locked(ent);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- mlx5_core_destroy_mkey(dev->mdev, mkey);
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- }
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- }
-
- static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
-@@ -852,7 +796,7 @@ static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
- dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
- debugfs_create_file("size", 0600, dir, ent, &size_fops);
- debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-- debugfs_create_ulong("cur", 0400, dir, &ent->stored);
-+ debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
- debugfs_create_u32("miss", 0600, dir, &ent->miss);
- }
-
-@@ -874,6 +818,31 @@ static void delay_time_func(struct timer_list *t)
- WRITE_ONCE(dev->fill_delay, 0);
- }
-
-+static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
-+{
-+ struct mlx5_mkeys_page *page;
-+
-+ page = kzalloc(sizeof(*page), GFP_KERNEL);
-+ if (!page)
-+ return -ENOMEM;
-+ INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
-+ spin_lock_init(&ent->mkeys_queue.lock);
-+ list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
-+ ent->mkeys_queue.num_pages++;
-+ return 0;
-+}
-+
-+static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
-+{
-+ struct mlx5_mkeys_page *page;
-+
-+ WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
-+ page = list_last_entry(&ent->mkeys_queue.pages_list,
-+ struct mlx5_mkeys_page, list);
-+ list_del(&page->list);
-+ kfree(page);
-+}
-+
- struct mlx5_cache_ent *
- mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
- struct mlx5r_cache_rb_key rb_key,
-@@ -887,7 +856,9 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
- if (!ent)
- return ERR_PTR(-ENOMEM);
-
-- xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
-+ ret = mlx5r_mkeys_init(ent);
-+ if (ret)
-+ goto mkeys_err;
- ent->rb_key = rb_key;
- ent->dev = dev;
- ent->is_tmp = !persistent_entry;
-@@ -895,10 +866,8 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
- INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
-
- ret = mlx5_cache_ent_insert(&dev->cache, ent);
-- if (ret) {
-- kfree(ent);
-- return ERR_PTR(ret);
-- }
-+ if (ret)
-+ goto ent_insert_err;
-
- if (persistent_entry) {
- if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
-@@ -921,6 +890,11 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
- }
-
- return ent;
-+ent_insert_err:
-+ mlx5r_mkeys_uninit(ent);
-+mkeys_err:
-+ kfree(ent);
-+ return ERR_PTR(ret);
- }
-
- static void remove_ent_work_func(struct work_struct *work)
-@@ -938,13 +912,13 @@ static void remove_ent_work_func(struct work_struct *work)
- cur = rb_prev(cur);
- mutex_unlock(&cache->rb_lock);
-
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- if (!ent->is_tmp) {
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- mutex_lock(&cache->rb_lock);
- continue;
- }
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
-
- clean_keys(ent->dev, ent);
- mutex_lock(&cache->rb_lock);
-@@ -994,9 +968,9 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- mutex_unlock(&cache->rb_lock);
- for (node = rb_first(root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- queue_adjust_cache_locked(ent);
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- }
-
- return 0;
-@@ -1020,9 +994,9 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
- mutex_lock(&dev->cache.rb_lock);
- for (node = rb_first(root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
-- xa_lock_irq(&ent->mkeys);
-+ spin_lock_irq(&ent->mkeys_queue.lock);
- ent->disabled = true;
-- xa_unlock_irq(&ent->mkeys);
-+ spin_unlock_irq(&ent->mkeys_queue.lock);
- cancel_delayed_work_sync(&ent->dwork);
- }
-
-@@ -1035,6 +1009,7 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
- node = rb_next(node);
- clean_keys(dev, ent);
- rb_erase(&ent->node, root);
-+ mlx5r_mkeys_uninit(ent);
- kfree(ent);
- }
- mutex_unlock(&dev->cache.rb_lock);
-@@ -1802,7 +1777,7 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
- int ret;
-
- if (mr->mmkey.cache_ent) {
-- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
-+ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
- mr->mmkey.cache_ent->in_use--;
- goto end;
- }
-@@ -1816,7 +1791,7 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
- return -EOPNOTSUPP;
- }
- mr->mmkey.cache_ent = ent;
-- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
-+ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
- mutex_unlock(&cache->rb_lock);
- goto end;
- }
-@@ -1828,12 +1803,11 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
- return PTR_ERR(ent);
-
- mr->mmkey.cache_ent = ent;
-- xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
-+ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
-
- end:
-- ret = push_mkey_locked(mr->mmkey.cache_ent, false,
-- xa_mk_value(mr->mmkey.key));
-- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
-+ ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
-+ spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
- return ret;
- }
-
-diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
-index cb5cee3dee2b6..fa000182d0b41 100644
---- a/drivers/infiniband/hw/mlx5/umr.c
-+++ b/drivers/infiniband/hw/mlx5/umr.c
-@@ -332,8 +332,8 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
-
- WARN_ON_ONCE(1);
- mlx5_ib_warn(dev,
-- "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
-- umr_context.status);
-+ "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
-+ umr_context.status, mkey);
- mutex_lock(&umrc->lock);
- err = mlx5r_umr_recover(dev);
- mutex_unlock(&umrc->lock);
---
-2.39.5
-
+++ /dev/null
-From dee0c2d2ab0dbb79d87e227f8b4136f1764cefb4 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 26 Jan 2023 00:28:05 +0200
-Subject: RDMA/mlx5: Introduce mlx5r_cache_rb_key
-
-From: Michael Guralnik <michaelgur@nvidia.com>
-
-[ Upstream commit 73d09b2fe8336f5f37935e46418666ddbcd3c343 ]
-
-Switch from using the mkey order to using the new struct as the key to the
-RB tree of cache entries.
-
-The key consists of all the mkey properties that UMR operations can't
-modify. This key is used to define the cache entries and to search for and
-create cache mkeys.
-
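-A stand-alone sketch of the composite key and its ordering is shown below
-(plain C, stand-in names): ndescs is compared last so that entries sharing
-the UMR-immutable properties sit next to each other in the tree, ordered
-by size, letting a lookup demand an exact match on everything but ndescs
-while settling for the closest size.
-
-    struct cache_rb_key_sketch {
-        unsigned int ats;           /* 0 or 1 */
-        unsigned int access_mode;
-        unsigned int access_flags;
-        unsigned int ndescs;
-    };
-
-    /* Total order used when inserting cache entries into the RB tree. */
-    int cache_key_cmp(struct cache_rb_key_sketch a,
-                      struct cache_rb_key_sketch b)
-    {
-        if (a.ats != b.ats)
-            return a.ats < b.ats ? -1 : 1;
-        if (a.access_mode != b.access_mode)
-            return a.access_mode < b.access_mode ? -1 : 1;
-        if (a.access_flags != b.access_flags)
-            return a.access_flags < b.access_flags ? -1 : 1;
-        if (a.ndescs != b.ndescs)
-            return a.ndescs < b.ndescs ? -1 : 1;
-        return 0;
-    }
-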
-Link: https://lore.kernel.org/r/20230125222807.6921-5-michaelgur@nvidia.com
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 27 ++--
- drivers/infiniband/hw/mlx5/mr.c | 228 +++++++++++++++++++--------
- drivers/infiniband/hw/mlx5/odp.c | 30 ++--
- 3 files changed, 201 insertions(+), 84 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-index bd998ac8c29c1..7c9d5648947e9 100644
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -637,6 +637,13 @@ enum mlx5_mkey_type {
- MLX5_MKEY_INDIRECT_DEVX,
- };
-
-+struct mlx5r_cache_rb_key {
-+ u8 ats:1;
-+ unsigned int access_mode;
-+ unsigned int access_flags;
-+ unsigned int ndescs;
-+};
-+
- struct mlx5_ib_mkey {
- u32 key;
- enum mlx5_mkey_type type;
-@@ -757,11 +764,9 @@ struct mlx5_cache_ent {
- unsigned long reserved;
-
- char name[4];
-- u32 order;
-- u32 access_mode;
-- unsigned int ndescs;
-
- struct rb_node node;
-+ struct mlx5r_cache_rb_key rb_key;
-
- u8 disabled:1;
- u8 fill_to_high_water:1;
-@@ -1340,14 +1345,13 @@ int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
- int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
- int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
- struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
-- int order);
-+ struct mlx5r_cache_rb_key rb_key,
-+ bool persistent_entry);
-
- struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-- struct mlx5_cache_ent *ent,
-- int access_flags);
-+ int access_flags, int access_mode,
-+ int ndescs);
-
--struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, u32 order,
-- int access_flags);
- int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
- struct ib_mr_status *mr_status);
- struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
-@@ -1370,7 +1374,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
- void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
- int __init mlx5_ib_odp_init(void);
- void mlx5_ib_odp_cleanup(void);
--void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent);
-+int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev);
- void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags);
-
-@@ -1389,7 +1393,10 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
- static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
- static inline int mlx5_ib_odp_init(void) { return 0; }
- static inline void mlx5_ib_odp_cleanup(void) {}
--static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {}
-+static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
-+{
-+ return 0;
-+}
- static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags) {}
-
-diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
-index b3d83920d3cfb..1060b30a837a0 100644
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -292,11 +292,13 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
- set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
- MLX5_SET(mkc, mkc, free, 1);
- MLX5_SET(mkc, mkc, umr_en, 1);
-- MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
-- MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
-+ MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
-+ MLX5_SET(mkc, mkc, access_mode_4_2,
-+ (ent->rb_key.access_mode >> 2) & 0x7);
-
- MLX5_SET(mkc, mkc, translations_octword_size,
-- get_mkc_octo_size(ent->access_mode, ent->ndescs));
-+ get_mkc_octo_size(ent->rb_key.access_mode,
-+ ent->rb_key.ndescs));
- MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
- }
-
-@@ -594,8 +596,8 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
- if (err != -EAGAIN) {
- mlx5_ib_warn(
- dev,
-- "command failed order %d, err %d\n",
-- ent->order, err);
-+ "add keys command failed, err %d\n",
-+ err);
- queue_delayed_work(cache->wq, &ent->dwork,
- msecs_to_jiffies(1000));
- }
-@@ -641,22 +643,49 @@ static void delayed_cache_work_func(struct work_struct *work)
- __cache_work_func(ent);
- }
-
-+static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
-+ struct mlx5r_cache_rb_key key2)
-+{
-+ int res;
-+
-+ res = key1.ats - key2.ats;
-+ if (res)
-+ return res;
-+
-+ res = key1.access_mode - key2.access_mode;
-+ if (res)
-+ return res;
-+
-+ res = key1.access_flags - key2.access_flags;
-+ if (res)
-+ return res;
-+
-+ /*
-+ * keep ndescs the last in the compare table since the find function
-+ * searches for an exact match on all properties and only closest
-+ * match in size.
-+ */
-+ return key1.ndescs - key2.ndescs;
-+}
-+
- static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
- struct mlx5_cache_ent *ent)
- {
- struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
- struct mlx5_cache_ent *cur;
-+ int cmp;
-
- mutex_lock(&cache->rb_lock);
- /* Figure out where to put new node */
- while (*new) {
- cur = rb_entry(*new, struct mlx5_cache_ent, node);
- parent = *new;
-- if (ent->order < cur->order)
-+ cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
-+ if (cmp > 0)
- new = &((*new)->rb_left);
-- if (ent->order > cur->order)
-+ if (cmp < 0)
- new = &((*new)->rb_right);
-- if (ent->order == cur->order) {
-+ if (cmp == 0) {
- mutex_unlock(&cache->rb_lock);
- return -EEXIST;
- }
-@@ -670,40 +699,45 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
- return 0;
- }
-
--static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
-- unsigned int order)
-+static struct mlx5_cache_ent *
-+mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
-+ struct mlx5r_cache_rb_key rb_key)
- {
- struct rb_node *node = dev->cache.rb_root.rb_node;
- struct mlx5_cache_ent *cur, *smallest = NULL;
-+ int cmp;
-
- /*
- * Find the smallest ent with order >= requested_order.
- */
- while (node) {
- cur = rb_entry(node, struct mlx5_cache_ent, node);
-- if (cur->order > order) {
-+ cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
-+ if (cmp > 0) {
- smallest = cur;
- node = node->rb_left;
- }
-- if (cur->order < order)
-+ if (cmp < 0)
- node = node->rb_right;
-- if (cur->order == order)
-+ if (cmp == 0)
- return cur;
- }
-
-- return smallest;
-+ return (smallest &&
-+ smallest->rb_key.access_mode == rb_key.access_mode &&
-+ smallest->rb_key.access_flags == rb_key.access_flags &&
-+ smallest->rb_key.ats == rb_key.ats) ?
-+ smallest :
-+ NULL;
- }
-
--struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-- struct mlx5_cache_ent *ent,
-- int access_flags)
-+static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-+ struct mlx5_cache_ent *ent,
-+ int access_flags)
- {
- struct mlx5_ib_mr *mr;
- int err;
-
-- if (!mlx5r_umr_can_reconfig(dev, 0, access_flags))
-- return ERR_PTR(-EOPNOTSUPP);
--
- mr = kzalloc(sizeof(*mr), GFP_KERNEL);
- if (!mr)
- return ERR_PTR(-ENOMEM);
-@@ -734,12 +768,44 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
- return mr;
- }
-
--struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev,
-- u32 order, int access_flags)
-+static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
-+ int access_flags)
-+{
-+ int ret = 0;
-+
-+ if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-+ MLX5_CAP_GEN(dev->mdev, atomic) &&
-+ MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
-+ ret |= IB_ACCESS_REMOTE_ATOMIC;
-+
-+ if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
-+ MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
-+ !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
-+ ret |= IB_ACCESS_RELAXED_ORDERING;
-+
-+ if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
-+ MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
-+ !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
-+ ret |= IB_ACCESS_RELAXED_ORDERING;
-+
-+ return ret;
-+}
-+
-+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-+ int access_flags, int access_mode,
-+ int ndescs)
- {
-- struct mlx5_cache_ent *ent = mkey_cache_ent_from_order(dev, order);
-+ struct mlx5r_cache_rb_key rb_key = {
-+ .ndescs = ndescs,
-+ .access_mode = access_mode,
-+ .access_flags = get_unchangeable_access_flags(dev, access_flags)
-+ };
-+ struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
-
-- return mlx5_mr_cache_alloc(dev, ent, access_flags);
-+ if (!ent)
-+ return ERR_PTR(-EOPNOTSUPP);
-+
-+ return _mlx5_mr_cache_alloc(dev, ent, access_flags);
- }
-
- static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
-@@ -766,28 +832,32 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
- dev->cache.fs_root = NULL;
- }
-
-+static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
-+ struct mlx5_cache_ent *ent)
-+{
-+ int order = order_base_2(ent->rb_key.ndescs);
-+ struct dentry *dir;
-+
-+ if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
-+ order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
-+
-+ sprintf(ent->name, "%d", order);
-+ dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
-+ debugfs_create_file("size", 0600, dir, ent, &size_fops);
-+ debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-+ debugfs_create_ulong("cur", 0400, dir, &ent->stored);
-+ debugfs_create_u32("miss", 0600, dir, &ent->miss);
-+}
-+
- static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
- {
-+ struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
- struct mlx5_mkey_cache *cache = &dev->cache;
-- struct mlx5_cache_ent *ent;
-- struct dentry *dir;
-- int i;
-
- if (!mlx5_debugfs_root || dev->is_rep)
- return;
-
-- dir = mlx5_debugfs_get_dev_root(dev->mdev);
-- cache->fs_root = debugfs_create_dir("mr_cache", dir);
--
-- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-- ent = mkey_cache_ent_from_order(dev, i);
-- sprintf(ent->name, "%d", ent->order);
-- dir = debugfs_create_dir(ent->name, cache->fs_root);
-- debugfs_create_file("size", 0600, dir, ent, &size_fops);
-- debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-- debugfs_create_ulong("cur", 0400, dir, &ent->stored);
-- debugfs_create_u32("miss", 0600, dir, &ent->miss);
-- }
-+ cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
- }
-
- static void delay_time_func(struct timer_list *t)
-@@ -798,9 +868,11 @@ static void delay_time_func(struct timer_list *t)
- }
-
- struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
-- int order)
-+ struct mlx5r_cache_rb_key rb_key,
-+ bool persistent_entry)
- {
- struct mlx5_cache_ent *ent;
-+ int order;
- int ret;
-
- ent = kzalloc(sizeof(*ent), GFP_KERNEL);
-@@ -808,7 +880,7 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
- return ERR_PTR(-ENOMEM);
-
- xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
-- ent->order = order;
-+ ent->rb_key = rb_key;
- ent->dev = dev;
-
- INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
-@@ -818,13 +890,36 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
- kfree(ent);
- return ERR_PTR(ret);
- }
-+
-+ if (persistent_entry) {
-+ if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
-+ order = MLX5_IMR_KSM_CACHE_ENTRY;
-+ else
-+ order = order_base_2(rb_key.ndescs) - 2;
-+
-+ if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
-+ !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
-+ mlx5r_umr_can_load_pas(dev, 0))
-+ ent->limit = dev->mdev->profile.mr_cache[order].limit;
-+ else
-+ ent->limit = 0;
-+
-+ mlx5_mkey_cache_debugfs_add_ent(dev, ent);
-+ }
-+
- return ent;
- }
-
- int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
- {
- struct mlx5_mkey_cache *cache = &dev->cache;
-+ struct rb_root *root = &dev->cache.rb_root;
-+ struct mlx5r_cache_rb_key rb_key = {
-+ .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
-+ };
- struct mlx5_cache_ent *ent;
-+ struct rb_node *node;
-+ int ret;
- int i;
-
- mutex_init(&dev->slow_path_mutex);
-@@ -838,33 +933,32 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
-
- mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
- timer_setup(&dev->delay_timer, delay_time_func, 0);
-- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-- ent = mlx5r_cache_create_ent(dev, i);
--
-- if (i > MKEY_CACHE_LAST_STD_ENTRY) {
-- mlx5_odp_init_mkey_cache_entry(ent);
-- continue;
-+ mlx5_mkey_cache_debugfs_init(dev);
-+ for (i = 0; i <= mkey_cache_max_order(dev); i++) {
-+ rb_key.ndescs = 1 << (i + 2);
-+ ent = mlx5r_cache_create_ent(dev, rb_key, true);
-+ if (IS_ERR(ent)) {
-+ ret = PTR_ERR(ent);
-+ goto err;
- }
-+ }
-
-- if (ent->order > mkey_cache_max_order(dev))
-- continue;
-+ ret = mlx5_odp_init_mkey_cache(dev);
-+ if (ret)
-+ goto err;
-
-- ent->ndescs = 1 << ent->order;
-- ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
-- if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
-- !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
-- mlx5r_umr_can_load_pas(dev, 0))
-- ent->limit = dev->mdev->profile.mr_cache[i].limit;
-- else
-- ent->limit = 0;
-+ for (node = rb_first(root); node; node = rb_next(node)) {
-+ ent = rb_entry(node, struct mlx5_cache_ent, node);
- xa_lock_irq(&ent->mkeys);
- queue_adjust_cache_locked(ent);
- xa_unlock_irq(&ent->mkeys);
- }
-
-- mlx5_mkey_cache_debugfs_init(dev);
--
- return 0;
-+
-+err:
-+ mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
-+ return ret;
- }
-
- int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
-@@ -965,7 +1059,7 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
- static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
- {
- if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-- return MKEY_CACHE_LAST_STD_ENTRY + 2;
-+ return MKEY_CACHE_LAST_STD_ENTRY;
- return MLX5_MAX_UMR_SHIFT;
- }
-
-@@ -995,6 +1089,9 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
- struct ib_umem *umem, u64 iova,
- int access_flags)
- {
-+ struct mlx5r_cache_rb_key rb_key = {
-+ .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
-+ };
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
- struct mlx5_cache_ent *ent;
- struct mlx5_ib_mr *mr;
-@@ -1007,8 +1104,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
- 0, iova);
- if (WARN_ON(!page_size))
- return ERR_PTR(-EINVAL);
-- ent = mkey_cache_ent_from_order(
-- dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
-+
-+ rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
-+ rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
-+ rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
-+ ent = mkey_cache_ent_from_rb_key(dev, rb_key);
- /*
- * Matches access in alloc_cache_mr(). If the MR can't come from the
- * cache then synchronously create an uncached one.
-@@ -1022,7 +1122,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
- return mr;
- }
-
-- mr = mlx5_mr_cache_alloc(dev, ent, access_flags);
-+ mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
- if (IS_ERR(mr))
- return mr;
-
-@@ -1452,7 +1552,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
- mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
- if (WARN_ON(!*page_size))
- return false;
-- return (1ULL << mr->mmkey.cache_ent->order) >=
-+ return (mr->mmkey.cache_ent->rb_key.ndescs) >=
- ib_umem_num_dma_blocks(new_umem, *page_size);
- }
-
-diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
-index 7f68940ca0d1e..96d4faabbff8a 100644
---- a/drivers/infiniband/hw/mlx5/odp.c
-+++ b/drivers/infiniband/hw/mlx5/odp.c
-@@ -406,7 +406,6 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
- static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
- unsigned long idx)
- {
-- int order = order_base_2(MLX5_IMR_MTT_ENTRIES);
- struct mlx5_ib_dev *dev = mr_to_mdev(imr);
- struct ib_umem_odp *odp;
- struct mlx5_ib_mr *mr;
-@@ -419,8 +418,9 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
- if (IS_ERR(odp))
- return ERR_CAST(odp);
-
-- BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);
-- mr = mlx5_mr_cache_alloc_order(dev, order, imr->access_flags);
-+ mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
-+ MLX5_MKC_ACCESS_MODE_MTT,
-+ MLX5_IMR_MTT_ENTRIES);
- if (IS_ERR(mr)) {
- ib_umem_odp_release(odp);
- return mr;
-@@ -494,8 +494,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
- if (IS_ERR(umem_odp))
- return ERR_CAST(umem_odp);
-
-- imr = mlx5_mr_cache_alloc_order(dev, MLX5_IMR_KSM_CACHE_ENTRY,
-- access_flags);
-+ imr = mlx5_mr_cache_alloc(dev, access_flags, MLX5_MKC_ACCESS_MODE_KSM,
-+ mlx5_imr_ksm_entries);
- if (IS_ERR(imr)) {
- ib_umem_odp_release(umem_odp);
- return imr;
-@@ -1591,12 +1591,22 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
- return err;
- }
-
--void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
-+int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
- {
-- if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
-- return;
-- ent->ndescs = mlx5_imr_ksm_entries;
-- ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
-+ struct mlx5r_cache_rb_key rb_key = {
-+ .access_mode = MLX5_MKC_ACCESS_MODE_KSM,
-+ .ndescs = mlx5_imr_ksm_entries,
-+ };
-+ struct mlx5_cache_ent *ent;
-+
-+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
-+ return 0;
-+
-+ ent = mlx5r_cache_create_ent(dev, rb_key, true);
-+ if (IS_ERR(ent))
-+ return PTR_ERR(ent);
-+
-+ return 0;
- }
-
- static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
---
-2.39.5
-
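The patch above replaces the per-order cache array with an RB tree keyed by
the properties UMR cannot change (ATS, access mode, unchangeable access
flags) plus the descriptor count, which is compared last so the walk can
fall back to the closest larger size. Below is a minimal user-space sketch
of that comparator pattern and of the "exact match or smallest sufficient
entry" lookup; the struct and helpers are illustrative stand-ins, not the
driver's cache_ent_key_cmp()/mkey_cache_ent_from_rb_key(), and a flat array
stands in for the RB tree.

/*
 * Illustrative sketch only: a user-space approximation of the composite
 * cache key introduced above, not the kernel implementation. Field names
 * mirror mlx5r_cache_rb_key, but nothing here is driver code.
 */
#include <stdio.h>

struct cache_key {
	unsigned int ats;		/* a 1-bit flag in the driver */
	unsigned int access_mode;
	unsigned int access_flags;
	unsigned int ndescs;		/* size; must be compared last */
};

/* Compare the fixed properties first, the size last. */
static int cache_key_cmp(struct cache_key a, struct cache_key b)
{
	if (a.ats != b.ats)
		return a.ats < b.ats ? -1 : 1;
	if (a.access_mode != b.access_mode)
		return a.access_mode < b.access_mode ? -1 : 1;
	if (a.access_flags != b.access_flags)
		return a.access_flags < b.access_flags ? -1 : 1;
	if (a.ndescs != b.ndescs)
		return a.ndescs < b.ndescs ? -1 : 1;
	return 0;
}

/*
 * Return an exact match, or the smallest entry that is larger than the
 * request and agrees on every property except size - the "closest match
 * in size" rule the RB-tree descent implements.
 */
static const struct cache_key *
find_entry(const struct cache_key *ents, int n, struct cache_key want)
{
	const struct cache_key *smallest = NULL;

	for (int i = 0; i < n; i++) {
		int cmp = cache_key_cmp(ents[i], want);

		if (cmp == 0)
			return &ents[i];
		if (cmp > 0 && (!smallest ||
				cache_key_cmp(ents[i], *smallest) < 0))
			smallest = &ents[i];
	}

	if (smallest &&
	    smallest->ats == want.ats &&
	    smallest->access_mode == want.access_mode &&
	    smallest->access_flags == want.access_flags)
		return smallest;
	return NULL;
}

int main(void)
{
	struct cache_key ents[] = {
		{ .access_mode = 1, .ndescs = 8 },
		{ .access_mode = 1, .ndescs = 16 },
	};
	struct cache_key want = { .access_mode = 1, .ndescs = 10 };
	const struct cache_key *e = find_entry(ents, 2, want);

	printf("picked ndescs=%u\n", e ? e->ndescs : 0);
	return 0;
}

In the driver the same walk runs over the RB tree under rb_lock; the array
scan here only stands in for the tree descent, and the point demonstrated
is the ordering rule that keeps ndescs last in the comparison.
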
+++ /dev/null
-From ee6d57a2e13d11ce9050cfc3e3b69ef707a44a63 Mon Sep 17 00:00:00 2001
-From: Michael Guralnik <michaelgur@nvidia.com>
-Date: Tue, 3 Sep 2024 14:24:49 +0300
-Subject: RDMA/mlx5: Limit usage of over-sized mkeys from the MR cache
-
-From: Michael Guralnik <michaelgur@nvidia.com>
-
-commit ee6d57a2e13d11ce9050cfc3e3b69ef707a44a63 upstream.
-
-When searching the MR cache for suitable cache entries, don't use mkeys
-larger than twice the size required for the MR.
-This should ensure the usage of mkeys closer to the minimal required size
-and reduce memory waste.
-
-On driver init we create entries for mkeys with clear attributes and
-powers of 2 sizes from 4 to the max supported size.
-This solves the issue for anyone using mkeys that fit these
-requirements.
-
-In the use case where an MR is registered with different attributes,
-like an access flag we can't UMR, we'll create a new cache entry to store
-it upon dereg.
-Without this fix, any later registration with the same attributes and a
-smaller size will use the newly created cache entry and its mkeys,
-disregarding the memory waste of using mkeys larger than required.
-
-For example, one worst-case scenario is registering and deregistering a
-1GB mkey with ATS enabled, which will cause the creation of a new cache
-entry to hold that type of mkey. A user registering a 4k MR with ATS will
-end up using the new cache entry and an mkey that can support a 1GB MR,
-thus wasting roughly 250,000 times more memory in the HW than actually
-needed.
-
-Additionally, allow all small registrations to use the smallest size
-cache entry that is initialized on driver load, even if its size is
-larger than twice the required size.
-
-Fixes: 73d09b2fe833 ("RDMA/mlx5: Introduce mlx5r_cache_rb_key")
-Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
-Link: https://patch.msgid.link/8ba3a6e3748aace2026de8b83da03aba084f78f4.1725362530.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 14 ++++++++++++--
- 1 file changed, 12 insertions(+), 2 deletions(-)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -48,6 +48,7 @@ enum {
- MAX_PENDING_REG_MR = 8,
- };
-
-+#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
- #define MLX5_UMR_ALIGN 2048
-
- static void
-@@ -656,6 +657,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_i
- {
- struct rb_node *node = dev->cache.rb_root.rb_node;
- struct mlx5_cache_ent *cur, *smallest = NULL;
-+ u64 ndescs_limit;
- int cmp;
-
- /*
-@@ -674,10 +676,18 @@ mkey_cache_ent_from_rb_key(struct mlx5_i
- return cur;
- }
-
-+ /*
-+ * Limit the usage of mkeys larger than twice the required size while
-+ * also allowing the usage of smallest cache entry for small MRs.
-+ */
-+ ndescs_limit = max_t(u64, rb_key.ndescs * 2,
-+ MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
-+
- return (smallest &&
- smallest->rb_key.access_mode == rb_key.access_mode &&
- smallest->rb_key.access_flags == rb_key.access_flags &&
-- smallest->rb_key.ats == rb_key.ats) ?
-+ smallest->rb_key.ats == rb_key.ats &&
-+ smallest->rb_key.ndescs <= ndescs_limit) ?
- smallest :
- NULL;
- }
-@@ -958,7 +968,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_
- mlx5_mkey_cache_debugfs_init(dev);
- mutex_lock(&cache->rb_lock);
- for (i = 0; i <= mkey_cache_max_order(dev); i++) {
-- rb_key.ndescs = 1 << (i + 2);
-+ rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
- ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
- if (IS_ERR(ent)) {
- ret = PTR_ERR(ent);
+++ /dev/null
-From 31e1b4f44049773843852197aab66262fea5d3ca Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Mon, 5 Jun 2023 13:14:05 +0300
-Subject: RDMA/mlx5: Reduce QP table exposure
-
-From: Leon Romanovsky <leonro@nvidia.com>
-
-[ Upstream commit 2ecfd946169e7f56534db2a5f6935858be3005ba ]
-
-driver.h is a common header for the whole mlx5 code base, but struct
-mlx5_qp_table is used only in the mlx5_ib driver. So move that struct
-to be under the sole responsibility of mlx5_ib.
-
-Link: https://lore.kernel.org/r/bec0dc1158e795813b135d1143147977f26bf668.1685953497.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
-Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 +
- drivers/infiniband/hw/mlx5/qp.h | 11 ++++++++++-
- include/linux/mlx5/driver.h | 9 ---------
- 3 files changed, 11 insertions(+), 10 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-index 024d2071c6a5d..5c533023a51a4 100644
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -25,6 +25,7 @@
- #include <rdma/mlx5_user_ioctl_verbs.h>
-
- #include "srq.h"
-+#include "qp.h"
-
- #define mlx5_ib_dbg(_dev, format, arg...) \
- dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \
-diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
-index fb2f4e030bb8f..e677fa0ca4226 100644
---- a/drivers/infiniband/hw/mlx5/qp.h
-+++ b/drivers/infiniband/hw/mlx5/qp.h
-@@ -6,7 +6,16 @@
- #ifndef _MLX5_IB_QP_H
- #define _MLX5_IB_QP_H
-
--#include "mlx5_ib.h"
-+struct mlx5_ib_dev;
-+
-+struct mlx5_qp_table {
-+ struct notifier_block nb;
-+
-+ /* protect radix tree
-+ */
-+ spinlock_t lock;
-+ struct radix_tree_root tree;
-+};
-
- int mlx5_init_qp_table(struct mlx5_ib_dev *dev);
- void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev);
-diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
-index 6cea62ca76d6b..060610183fdf9 100644
---- a/include/linux/mlx5/driver.h
-+++ b/include/linux/mlx5/driver.h
-@@ -440,15 +440,6 @@ struct mlx5_core_health {
- struct delayed_work update_fw_log_ts_work;
- };
-
--struct mlx5_qp_table {
-- struct notifier_block nb;
--
-- /* protect radix tree
-- */
-- spinlock_t lock;
-- struct radix_tree_root tree;
--};
--
- enum {
- MLX5_PF_NOTIFY_DISABLE_VF,
- MLX5_PF_NOTIFY_ENABLE_VF,
---
-2.39.5
-
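The move above follows the usual header-slimming pattern: the struct
definition lives next to the only code that dereferences its fields, while
shared code needs no more than a forward declaration to keep passing
pointers around. A generic, self-contained sketch of that pattern (not the
mlx5 headers; all names below are invented):

/*
 * Sketch of the forward-declaration pattern used above. In a real tree
 * the first block would sit in a widely included header and the second
 * in the driver-private header/source.
 */
#include <stdio.h>

/* What a shared "driver.h"-style header would carry. */
struct qp_table;				/* forward declaration only */
void qp_table_report(const struct qp_table *tbl);

/* What the driver-private side would carry. */
struct qp_table {
	int lock_placeholder;			/* stands in for spinlock_t */
	unsigned int nr_qps;
};

void qp_table_report(const struct qp_table *tbl)
{
	printf("%u QPs tracked\n", tbl->nr_qps);
}

int main(void)
{
	struct qp_table tbl = { .nr_qps = 3 };

	qp_table_report(&tbl);
	return 0;
}
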
+++ /dev/null
-From c1eb2512596fb3542357bb6c34c286f5e0374538 Mon Sep 17 00:00:00 2001
-From: Jason Gunthorpe <jgg@nvidia.com>
-Date: Tue, 28 May 2024 15:52:52 +0300
-Subject: RDMA/mlx5: Remove extra unlock on error path
-
-From: Jason Gunthorpe <jgg@nvidia.com>
-
-commit c1eb2512596fb3542357bb6c34c286f5e0374538 upstream.
-
-The commit below lifted the locking out of this function but left this
-error-path unlock behind, resulting in unbalanced locking. Remove the
-missed unlock too.
-
-Cc: stable@vger.kernel.org
-Fixes: 627122280c87 ("RDMA/mlx5: Add work to remove temporary entries from the cache")
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
-Link: https://lore.kernel.org/r/78090c210c750f47219b95248f9f782f34548bb1.1716900410.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mr.c | 4 +---
- 1 file changed, 1 insertion(+), 3 deletions(-)
-
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -638,10 +638,8 @@ static int mlx5_cache_ent_insert(struct
- new = &((*new)->rb_left);
- if (cmp < 0)
- new = &((*new)->rb_right);
-- if (cmp == 0) {
-- mutex_unlock(&cache->rb_lock);
-+ if (cmp == 0)
- return -EEXIST;
-- }
- }
-
- /* Add new node and rebalance tree. */
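
The pattern behind the fix above is the usual hazard after hoisting a lock
into the caller: the -EEXIST path of mlx5_cache_ent_insert() still released
rb_lock even though the caller now owns it and will unlock it again. A
minimal sketch of the rule with hypothetical names, not the driver code:
the callee returns with the lock still held on every path, and only the
caller unlocks.

/*
 * Hypothetical illustration of the locking rule behind the fix above.
 * Build with -pthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* Callee: entered with tree_lock held, returns with it still held. */
static int insert_entry(int key, int duplicate)
{
	if (duplicate)
		return -17;	/* -EEXIST; note: no unlock here */
	printf("inserted %d\n", key);
	return 0;
}

/* Caller owns the lock across the call, as after the lock was lifted out. */
static int caller(int key, int duplicate)
{
	int err;

	pthread_mutex_lock(&tree_lock);
	err = insert_entry(key, duplicate);
	pthread_mutex_unlock(&tree_lock);
	return err;
}

int main(void)
{
	int ok = caller(1, 0);
	int dup = caller(1, 1);

	printf("ok=%d dup=%d\n", ok, dup);
	return 0;
}
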
+++ /dev/null
-From f1cf3c129548533fa9dc9569a22ff1ed3e3c9e02 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 26 Jan 2023 00:28:03 +0200
-Subject: RDMA/mlx5: Remove implicit ODP cache entry
-
-From: Aharon Landau <aharonl@nvidia.com>
-
-[ Upstream commit 18b1746bddf5e7f6b2618966596d9517172a5cd7 ]
-
-An implicit ODP mkey doesn't have unique properties. It shares the same
-properties as the order-18 cache entry, so there is no need to devote a
-special entry to it.
-
-Link: https://lore.kernel.org/r/20230125222807.6921-3-michaelgur@nvidia.com
-Signed-off-by: Aharon Landau <aharonl@nvidia.com>
-Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
-Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/infiniband/hw/mlx5/odp.c | 20 +++++---------------
- include/linux/mlx5/driver.h | 1 -
- 2 files changed, 5 insertions(+), 16 deletions(-)
-
-diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
-index a5c9baec8be85..5f0a17382de73 100644
---- a/drivers/infiniband/hw/mlx5/odp.c
-+++ b/drivers/infiniband/hw/mlx5/odp.c
-@@ -406,6 +406,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
- static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
- unsigned long idx)
- {
-+ int order = order_base_2(MLX5_IMR_MTT_ENTRIES);
- struct mlx5_ib_dev *dev = mr_to_mdev(imr);
- struct ib_umem_odp *odp;
- struct mlx5_ib_mr *mr;
-@@ -418,7 +419,8 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
- if (IS_ERR(odp))
- return ERR_CAST(odp);
-
-- mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[MLX5_IMR_MTT_CACHE_ENTRY],
-+ BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);
-+ mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order],
- imr->access_flags);
- if (IS_ERR(mr)) {
- ib_umem_odp_release(odp);
-@@ -1595,20 +1597,8 @@ void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
- {
- if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
- return;
--
-- switch (ent->order - 2) {
-- case MLX5_IMR_MTT_CACHE_ENTRY:
-- ent->ndescs = MLX5_IMR_MTT_ENTRIES;
-- ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
-- ent->limit = 0;
-- break;
--
-- case MLX5_IMR_KSM_CACHE_ENTRY:
-- ent->ndescs = mlx5_imr_ksm_entries;
-- ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
-- ent->limit = 0;
-- break;
-- }
-+ ent->ndescs = mlx5_imr_ksm_entries;
-+ ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
- }
-
- static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
-diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
-index 3c3e0f26c2446..6cea62ca76d6b 100644
---- a/include/linux/mlx5/driver.h
-+++ b/include/linux/mlx5/driver.h
-@@ -744,7 +744,6 @@ enum {
-
- enum {
- MKEY_CACHE_LAST_STD_ENTRY = 20,
-- MLX5_IMR_MTT_CACHE_ENTRY,
- MLX5_IMR_KSM_CACHE_ENTRY,
- MAX_MKEY_CACHE_ENTRIES
- };
---
-2.39.5
-
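The patch above can drop the dedicated implicit-ODP MTT entry because each
implicit-ODP child mkey covers MLX5_IMR_MTT_ENTRIES translations, which is
already a power-of-two size served by a standard cache order (order 18 with
4k pages, as the commit log notes). A small arithmetic sketch of that
mapping; PAGE_SHIFT and the order_base_2() reimplementation below are
user-space stand-ins, not the kernel definitions.

/*
 * Sketch of the arithmetic behind "implicit ODP shares the order-18
 * entry": with 4k pages each implicit-ODP child MR covers 1GB, i.e.
 * 2^18 page-size translations, which lands on a standard cache order.
 */
#include <stdio.h>

#define PAGE_SHIFT		12
#define IMR_MTT_BITS		(30 - PAGE_SHIFT)	/* 1GB per child MR */
#define IMR_MTT_ENTRIES		(1ULL << IMR_MTT_BITS)

/* Minimal stand-in for the kernel's order_base_2(). */
static unsigned int order_base_2(unsigned long long n)
{
	unsigned int order = 0;

	while ((1ULL << order) < n)
		order++;
	return order;
}

int main(void)
{
	printf("child MR entries: %llu -> cache order %u\n",
	       IMR_MTT_ENTRIES, order_base_2(IMR_MTT_ENTRIES));
	return 0;
}
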
+++ /dev/null
-From c99a7457e5bb873914a74307ba2df85f6799203b Mon Sep 17 00:00:00 2001
-From: Leon Romanovsky <leonro@nvidia.com>
-Date: Thu, 28 Sep 2023 20:20:47 +0300
-Subject: RDMA/mlx5: Remove not-used cache disable flag
-
-From: Leon Romanovsky <leonro@nvidia.com>
-
-commit c99a7457e5bb873914a74307ba2df85f6799203b upstream.
-
-During execution of mlx5_mkey_cache_cleanup(), there is a guarantee
-that MRs are not registered and/or destroyed. This means that we don't
-need the newly introduced cache disable flag.
-
-Fixes: 374012b00457 ("RDMA/mlx5: Fix mkey cache possible deadlock on cleanup")
-Link: https://lore.kernel.org/r/c7e9c9f98c8ae4a7413d97d9349b29f5b0a23dbe.1695921626.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 -
- drivers/infiniband/hw/mlx5/mr.c | 5 -----
- 2 files changed, 6 deletions(-)
-
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -822,7 +822,6 @@ struct mlx5_mkey_cache {
- struct dentry *fs_root;
- unsigned long last_add;
- struct delayed_work remove_ent_dwork;
-- u8 disable: 1;
- };
-
- struct mlx5_ib_port_resources {
---- a/drivers/infiniband/hw/mlx5/mr.c
-+++ b/drivers/infiniband/hw/mlx5/mr.c
-@@ -1007,7 +1007,6 @@ int mlx5_mkey_cache_cleanup(struct mlx5_
- return 0;
-
- mutex_lock(&dev->cache.rb_lock);
-- dev->cache.disable = true;
- for (node = rb_first(root); node; node = rb_next(node)) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
- spin_lock_irq(&ent->mkeys_queue.lock);
-@@ -1810,10 +1809,6 @@ static int cache_ent_find_and_store(stru
- }
-
- mutex_lock(&cache->rb_lock);
-- if (cache->disable) {
-- mutex_unlock(&cache->rb_lock);
-- return 0;
-- }
- ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
- if (ent) {
- if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
+++ /dev/null
-From 0611a8e8b475fc5230b9a24d29c8397aaab20b63 Mon Sep 17 00:00:00 2001
-From: Or Har-Toov <ohartoov@nvidia.com>
-Date: Wed, 3 Apr 2024 13:35:59 +0300
-Subject: RDMA/mlx5: Uncacheable mkey has neither rb_key or cache_ent
-
-From: Or Har-Toov <ohartoov@nvidia.com>
-
-commit 0611a8e8b475fc5230b9a24d29c8397aaab20b63 upstream.
-
-As some mkeys can't be modified with UMR due to UMR limitations, such as
-the size of the translation that can be updated, not all user mkeys can
-be cached.
-
-Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow")
-Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
-Link: https://lore.kernel.org/r/f2742dd934ed73b2d32c66afb8e91b823063880c.1712140377.git.leon@kernel.org
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
-+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
-@@ -651,7 +651,7 @@ struct mlx5_ib_mkey {
- unsigned int ndescs;
- struct wait_queue_head wait;
- refcount_t usecount;
-- /* User Mkey must hold either a rb_key or a cache_ent. */
-+ /* Cacheable user Mkey must hold either a rb_key or a cache_ent. */
- struct mlx5r_cache_rb_key rb_key;
- struct mlx5_cache_ent *cache_ent;
- };
spi-atmel-quadspi-avoid-overwriting-delay-register-settings.patch
spi-atmel-quadspi-fix-wrong-register-value-written-to-mr.patch
netfilter-allow-exp-not-to-be-removed-in-nf_ct_find_expectation.patch
-rdma-mlx5-don-t-keep-umrable-page_shift-in-cache-ent.patch
-rdma-mlx5-remove-implicit-odp-cache-entry.patch
-rdma-mlx5-change-the-cache-structure-to-an-rb-tree.patch
-rdma-mlx5-introduce-mlx5r_cache_rb_key.patch
-rdma-mlx5-cache-all-user-cacheable-mkeys-on-dereg-mr.patch
-rdma-mlx5-add-work-to-remove-temporary-entries-from-.patch
-rdma-mlx5-implement-mkeys-management-via-lifo-queue.patch
-rdma-mlx5-fix-the-recovery-flow-of-the-umr-qp.patch
ib-mlx5-set-and-get-correct-qp_num-for-a-dct-qp.patch
ovl-fix-uaf-in-ovl_dentry_update_reval-by-moving-dpu.patch
sunrpc-convert-rpc_task_-constants-to-enum.patch
sunrpc-prevent-looping-due-to-rpc_signal_task-races.patch
-rdma-mlx-calling-qp-event-handler-in-workqueue-conte.patch
-rdma-mlx5-reduce-qp-table-exposure.patch
-ib-core-add-support-for-xdr-link-speed.patch
-rdma-mlx5-fix-ah-static-rate-parsing.patch
scsi-core-clear-driver-private-data-when-retrying-re.patch
rdma-mlx5-fix-bind-qp-error-cleanup-flow.patch
sunrpc-suppress-warnings-for-unused-procfs-functions.patch
squashfs-check-the-inode-number-is-not-the-invalid-value-of-zero.patch
pfifo_tail_enqueue-drop-new-packet-when-sch-limit-0.patch
media-mtk-vcodec-potential-null-pointer-deference-in-scp.patch
-rdma-mlx5-fix-mr-cache-debugfs-error-in-ib-representors-mode.patch
-rdma-mlx5-check-reg_create-create-for-errors.patch
-rdma-mlx5-fix-mkey-cache-possible-deadlock-on-cleanup.patch
-rdma-mlx5-fix-assigning-access-flags-to-cache-mkeys.patch
-rdma-mlx5-uncacheable-mkey-has-neither-rb_key-or-cache_ent.patch
-rdma-mlx5-change-check-for-cacheable-mkeys.patch
-rdma-mlx5-remove-extra-unlock-on-error-path.patch
-rdma-mlx5-follow-rb_key.ats-when-creating-new-mkeys.patch
-rdma-mlx5-ensure-created-mkeys-always-have-a-populated-rb_key.patch
-rdma-mlx5-fix-counter-update-on-mr-cache-mkey-creation.patch
-rdma-mlx5-limit-usage-of-over-sized-mkeys-from-the-mr-cache.patch
-rdma-mlx5-remove-not-used-cache-disable-flag.patch
-rdma-mlx5-fix-mkey-cache-wq-flush.patch
-rdma-mlx5-fix-mr-cache-temp-entries-cleanup.patch