From: Sasha Levin Date: Fri, 28 Feb 2025 04:41:22 +0000 (-0500) Subject: Fixes for 6.1 X-Git-Tag: v6.6.81~51 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=192b4c9050e377b8d3ca226227560639614ba71f;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.1 Signed-off-by: Sasha Levin --- diff --git a/queue-6.1/ib-core-add-support-for-xdr-link-speed.patch b/queue-6.1/ib-core-add-support-for-xdr-link-speed.patch new file mode 100644 index 0000000000..597def3d98 --- /dev/null +++ b/queue-6.1/ib-core-add-support-for-xdr-link-speed.patch @@ -0,0 +1,131 @@ +From 79cad3705d28ff0c133bcd85a9107d0dbbb27e72 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Sep 2023 13:07:40 +0300 +Subject: IB/core: Add support for XDR link speed + +From: Or Har-Toov + +[ Upstream commit 703289ce43f740b0096724300107df82d008552f ] + +Add new IBTA speed XDR, the new rate that was added to Infiniband spec +as part of XDR and supporting signaling rate of 200Gb. + +In order to report that value to rdma-core, add new u32 field to +query_port response. + +Signed-off-by: Or Har-Toov +Reviewed-by: Mark Zhang +Link: https://lore.kernel.org/r/9d235fc600a999e8274010f0e18b40fa60540e6c.1695204156.git.leon@kernel.org +Reviewed-by: Jacob Keller +Signed-off-by: Leon Romanovsky +Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/core/sysfs.c | 4 ++++ + drivers/infiniband/core/uverbs_std_types_device.c | 3 ++- + drivers/infiniband/core/verbs.c | 3 +++ + include/rdma/ib_verbs.h | 2 ++ + include/uapi/rdma/ib_user_ioctl_verbs.h | 3 ++- + 5 files changed, 13 insertions(+), 2 deletions(-) + +diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c +index ec5efdc166601..9f97bef021497 100644 +--- a/drivers/infiniband/core/sysfs.c ++++ b/drivers/infiniband/core/sysfs.c +@@ -342,6 +342,10 @@ static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, + speed = " NDR"; + rate = 1000; + break; ++ case IB_SPEED_XDR: ++ speed = " XDR"; ++ rate = 2000; ++ break; + case IB_SPEED_SDR: + default: /* default to SDR for invalid rates */ + speed = " SDR"; +diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c +index 049684880ae03..fb0555647336f 100644 +--- a/drivers/infiniband/core/uverbs_std_types_device.c ++++ b/drivers/infiniband/core/uverbs_std_types_device.c +@@ -203,6 +203,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)( + + copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num); + resp.port_cap_flags2 = attr.port_cap_flags2; ++ resp.active_speed_ex = attr.active_speed; + + return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP, + &resp, sizeof(resp)); +@@ -461,7 +462,7 @@ DECLARE_UVERBS_NAMED_METHOD( + UVERBS_ATTR_PTR_OUT( + UVERBS_ATTR_QUERY_PORT_RESP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex, +- reserved), ++ active_speed_ex), + UA_MANDATORY)); + + DECLARE_UVERBS_NAMED_METHOD( +diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c +index b99b3cc283b65..90848546f1704 100644 +--- a/drivers/infiniband/core/verbs.c ++++ b/drivers/infiniband/core/verbs.c +@@ -147,6 +147,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) + case IB_RATE_50_GBPS: return 20; + case IB_RATE_400_GBPS: return 160; + case IB_RATE_600_GBPS: return 240; ++ case IB_RATE_800_GBPS: return 320; + default: return -1; + } + } +@@ -176,6 +177,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) + case 20: return 
IB_RATE_50_GBPS; + case 160: return IB_RATE_400_GBPS; + case 240: return IB_RATE_600_GBPS; ++ case 320: return IB_RATE_800_GBPS; + default: return IB_RATE_PORT_CURRENT; + } + } +@@ -205,6 +207,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) + case IB_RATE_50_GBPS: return 53125; + case IB_RATE_400_GBPS: return 425000; + case IB_RATE_600_GBPS: return 637500; ++ case IB_RATE_800_GBPS: return 850000; + default: return -1; + } + } +diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h +index 68fd6d22adfd4..750effb875783 100644 +--- a/include/rdma/ib_verbs.h ++++ b/include/rdma/ib_verbs.h +@@ -557,6 +557,7 @@ enum ib_port_speed { + IB_SPEED_EDR = 32, + IB_SPEED_HDR = 64, + IB_SPEED_NDR = 128, ++ IB_SPEED_XDR = 256, + }; + + enum ib_stat_flag { +@@ -836,6 +837,7 @@ enum ib_rate { + IB_RATE_50_GBPS = 20, + IB_RATE_400_GBPS = 21, + IB_RATE_600_GBPS = 22, ++ IB_RATE_800_GBPS = 23, + }; + + /** +diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h +index 7dd56210226f5..125fb9f0ef4ab 100644 +--- a/include/uapi/rdma/ib_user_ioctl_verbs.h ++++ b/include/uapi/rdma/ib_user_ioctl_verbs.h +@@ -218,7 +218,8 @@ enum ib_uverbs_advise_mr_flag { + struct ib_uverbs_query_port_resp_ex { + struct ib_uverbs_query_port_resp legacy_resp; + __u16 port_cap_flags2; +- __u8 reserved[6]; ++ __u8 reserved[2]; ++ __u32 active_speed_ex; + }; + + struct ib_uverbs_qp_cap { +-- +2.39.5 + diff --git a/queue-6.1/ib-mlx5-set-and-get-correct-qp_num-for-a-dct-qp.patch b/queue-6.1/ib-mlx5-set-and-get-correct-qp_num-for-a-dct-qp.patch new file mode 100644 index 0000000000..9ac408a679 --- /dev/null +++ b/queue-6.1/ib-mlx5-set-and-get-correct-qp_num-for-a-dct-qp.patch @@ -0,0 +1,50 @@ +From 608ee99426cce23b021b11aac5d6732400828ac0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 19 Jan 2025 14:39:46 +0200 +Subject: IB/mlx5: Set and get correct qp_num for a DCT QP + +From: Mark Zhang + +[ Upstream commit 12d044770e12c4205fa69535b4fa8a9981fea98f ] + +When a DCT QP is created on an active lag, it's dctc.port is assigned +in a round-robin way, which is from 1 to dev->lag_port. In this case +when querying this QP, we may get qp_attr.port_num > 2. +Fix this by setting qp->port when modifying a DCT QP, and read port_num +from qp->port instead of dctc.port when querying it. 
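As a rough illustration of the scheme (a hypothetical, simplified userspace sketch; every name below is invented for the example and none of it is driver code): the modify path records the port the caller requested, and the query path reports that cached value instead of re-reading the round-robin choice the device made internally.

#include <stdio.h>

/* Toy model of a DCT QP: the device's internal lag choice vs. the caller's view. */
struct fake_dct_qp {
	unsigned int hw_port;	/* round-robin value picked by the "device" */
	unsigned int port;	/* port number the caller asked for (1 or 2) */
};

static unsigned int round_robin_port(unsigned int lag_ports)
{
	static unsigned int next;

	return (next++ % lag_ports) + 1;
}

static void modify_dct(struct fake_dct_qp *qp, unsigned int requested_port,
		       unsigned int lag_ports)
{
	qp->hw_port = round_robin_port(lag_ports);	/* internal, may exceed 2 */
	qp->port = requested_port;			/* cache the caller's value */
}

static unsigned int query_dct_port(const struct fake_dct_qp *qp)
{
	return qp->port;	/* report the cached value, never qp->hw_port */
}

int main(void)
{
	struct fake_dct_qp qp = { 0 };

	modify_dct(&qp, 1, 4);
	printf("queried port_num = %u\n", query_dct_port(&qp));
	return 0;
}

The point of the split is that the internally assigned value is still used by the hardware, while user-visible queries only ever see a valid port number.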
+ +Fixes: 7c4b1ab9f167 ("IB/mlx5: Add DCT RoCE LAG support") +Signed-off-by: Mark Zhang +Reviewed-by: Maher Sanalla +Link: https://patch.msgid.link/94c76bf0adbea997f87ffa27674e0a7118ad92a9.1737290358.git.leon@kernel.org +Signed-off-by: Leon Romanovsky +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/qp.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c +index 8d132b726c64b..d782a494abcda 100644 +--- a/drivers/infiniband/hw/mlx5/qp.c ++++ b/drivers/infiniband/hw/mlx5/qp.c +@@ -4466,6 +4466,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, + + set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1); + MLX5_SET(dctc, dctc, counter_set_id, set_id); ++ ++ qp->port = attr->port_num; + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + struct mlx5_ib_modify_qp_resp resp = {}; + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {}; +@@ -4955,7 +4957,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, + } + + if (qp_attr_mask & IB_QP_PORT) +- qp_attr->port_num = MLX5_GET(dctc, dctc, port); ++ qp_attr->port_num = mqp->port; + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) + qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); + if (qp_attr_mask & IB_QP_AV) { +-- +2.39.5 + diff --git a/queue-6.1/ovl-fix-uaf-in-ovl_dentry_update_reval-by-moving-dpu.patch b/queue-6.1/ovl-fix-uaf-in-ovl_dentry_update_reval-by-moving-dpu.patch new file mode 100644 index 0000000000..a36939d22c --- /dev/null +++ b/queue-6.1/ovl-fix-uaf-in-ovl_dentry_update_reval-by-moving-dpu.patch @@ -0,0 +1,71 @@ +From a760d15221ea7d4ac13c92a7e6f47314bd8cb2d3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 15 Feb 2025 00:51:48 +0300 +Subject: ovl: fix UAF in ovl_dentry_update_reval by moving dput() in + ovl_link_up + +From: Vasiliy Kovalev + +[ Upstream commit c84e125fff2615b4d9c259e762596134eddd2f27 ] + +The issue was caused by dput(upper) being called before +ovl_dentry_update_reval(), while upper->d_flags was still +accessed in ovl_dentry_remote(). + +Move dput(upper) after its last use to prevent use-after-free. + +BUG: KASAN: slab-use-after-free in ovl_dentry_remote fs/overlayfs/util.c:162 [inline] +BUG: KASAN: slab-use-after-free in ovl_dentry_update_reval+0xd2/0xf0 fs/overlayfs/util.c:167 + +Call Trace: + + __dump_stack lib/dump_stack.c:88 [inline] + dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:114 + print_address_description mm/kasan/report.c:377 [inline] + print_report+0xc3/0x620 mm/kasan/report.c:488 + kasan_report+0xd9/0x110 mm/kasan/report.c:601 + ovl_dentry_remote fs/overlayfs/util.c:162 [inline] + ovl_dentry_update_reval+0xd2/0xf0 fs/overlayfs/util.c:167 + ovl_link_up fs/overlayfs/copy_up.c:610 [inline] + ovl_copy_up_one+0x2105/0x3490 fs/overlayfs/copy_up.c:1170 + ovl_copy_up_flags+0x18d/0x200 fs/overlayfs/copy_up.c:1223 + ovl_rename+0x39e/0x18c0 fs/overlayfs/dir.c:1136 + vfs_rename+0xf84/0x20a0 fs/namei.c:4893 +... 
+ + +Fixes: b07d5cc93e1b ("ovl: update of dentry revalidate flags after copy up") +Reported-by: syzbot+316db8a1191938280eb6@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=316db8a1191938280eb6 +Signed-off-by: Vasiliy Kovalev +Link: https://lore.kernel.org/r/20250214215148.761147-1-kovalev@altlinux.org +Reviewed-by: Amir Goldstein +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/overlayfs/copy_up.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c +index 86d4b6975dbcb..203b88293f6bb 100644 +--- a/fs/overlayfs/copy_up.c ++++ b/fs/overlayfs/copy_up.c +@@ -532,7 +532,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) + err = PTR_ERR(upper); + if (!IS_ERR(upper)) { + err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); +- dput(upper); + + if (!err) { + /* Restore timestamps on parent (best effort) */ +@@ -540,6 +539,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) + ovl_dentry_set_upper_alias(c->dentry); + ovl_dentry_update_reval(c->dentry, upper); + } ++ dput(upper); + } + inode_unlock(udir); + if (err) +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx-calling-qp-event-handler-in-workqueue-conte.patch b/queue-6.1/rdma-mlx-calling-qp-event-handler-in-workqueue-conte.patch new file mode 100644 index 0000000000..70a6ed3b22 --- /dev/null +++ b/queue-6.1/rdma-mlx-calling-qp-event-handler-in-workqueue-conte.patch @@ -0,0 +1,502 @@ +From d5eccf1fd4fbdb90e3f1aba4e5ba5928ea3163c2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Jan 2023 11:43:34 +0200 +Subject: RDMA/mlx: Calling qp event handler in workqueue context + +From: Mark Zhang + +[ Upstream commit 312b8f79eb05479628ee71357749815b2eeeeea8 ] + +Move the call of qp event handler from atomic to workqueue context, +so that the handler is able to block. This is needed by following +patches. 
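A minimal userspace analogue of that deferral pattern, with a pthread worker standing in for the kernel workqueue (illustrative only; the names and the sleep() are invented for the sketch): the producer, which in the driver runs in atomic context, only allocates and enqueues a small work item, while the handler runs later in a context where blocking is allowed.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct qp_event_work {
	int type;
	struct qp_event_work *next;
};

static struct qp_event_work *pending;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* "Atomic context" side: no blocking, just allocate and enqueue. */
static void qp_event(int type)
{
	struct qp_event_work *w = malloc(sizeof(*w));

	if (!w)
		return;		/* drop the event, as the driver does on allocation failure */
	w->type = type;
	pthread_mutex_lock(&lock);
	w->next = pending;
	pending = w;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

/* "Workqueue" side: the handler may sleep here. */
static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		struct qp_event_work *w;

		pthread_mutex_lock(&lock);
		while (!pending)
			pthread_cond_wait(&cond, &lock);
		w = pending;
		pending = w->next;
		pthread_mutex_unlock(&lock);

		sleep(1);	/* stand-in for a handler that blocks */
		printf("handled QP event type %d\n", w->type);
		free(w);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	qp_event(5);
	qp_event(7);
	sleep(3);	/* give the worker time before exiting */
	return 0;
}

Build with -pthread; the extra reference handling the real patch adds (mlx4_put_qp()/mlx5_core_res_put() in the work function) has no equivalent in this toy.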
+ +Signed-off-by: Mark Zhang +Reviewed-by: Patrisious Haddad +Link: https://lore.kernel.org/r/0cd17b8331e445f03942f4bb28d447f24ac5669d.1672821186.git.leonro@nvidia.com +Signed-off-by: Leon Romanovsky +Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx4/main.c | 8 ++ + drivers/infiniband/hw/mlx4/mlx4_ib.h | 3 + + drivers/infiniband/hw/mlx4/qp.c | 121 +++++++++++++++++------- + drivers/infiniband/hw/mlx5/main.c | 7 ++ + drivers/infiniband/hw/mlx5/qp.c | 119 ++++++++++++++++------- + drivers/infiniband/hw/mlx5/qp.h | 2 + + drivers/infiniband/hw/mlx5/qpc.c | 3 +- + drivers/net/ethernet/mellanox/mlx4/qp.c | 14 ++- + include/linux/mlx4/qp.h | 1 + + include/rdma/ib_verbs.h | 2 +- + 10 files changed, 202 insertions(+), 78 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c +index 7c3dc86ab7f04..0f0b130cc8aac 100644 +--- a/drivers/infiniband/hw/mlx4/main.c ++++ b/drivers/infiniband/hw/mlx4/main.c +@@ -3307,6 +3307,10 @@ static int __init mlx4_ib_init(void) + if (!wq) + return -ENOMEM; + ++ err = mlx4_ib_qp_event_init(); ++ if (err) ++ goto clean_qp_event; ++ + err = mlx4_ib_cm_init(); + if (err) + goto clean_wq; +@@ -3328,6 +3332,9 @@ static int __init mlx4_ib_init(void) + mlx4_ib_cm_destroy(); + + clean_wq: ++ mlx4_ib_qp_event_cleanup(); ++ ++clean_qp_event: + destroy_workqueue(wq); + return err; + } +@@ -3337,6 +3344,7 @@ static void __exit mlx4_ib_cleanup(void) + mlx4_unregister_interface(&mlx4_ib_interface); + mlx4_ib_mcg_destroy(); + mlx4_ib_cm_destroy(); ++ mlx4_ib_qp_event_cleanup(); + destroy_workqueue(wq); + } + +diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h +index 6a3b0f121045e..17fee1e73a45a 100644 +--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h ++++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h +@@ -940,4 +940,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, + int mlx4_ib_cm_init(void); + void mlx4_ib_cm_destroy(void); + ++int mlx4_ib_qp_event_init(void); ++void mlx4_ib_qp_event_cleanup(void); ++ + #endif /* MLX4_IB_H */ +diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c +index ac479e81ddee8..9d08aa99f3cb0 100644 +--- a/drivers/infiniband/hw/mlx4/qp.c ++++ b/drivers/infiniband/hw/mlx4/qp.c +@@ -102,6 +102,14 @@ enum mlx4_ib_source_type { + MLX4_IB_RWQ_SRC = 1, + }; + ++struct mlx4_ib_qp_event_work { ++ struct work_struct work; ++ struct mlx4_qp *qp; ++ enum mlx4_event type; ++}; ++ ++static struct workqueue_struct *mlx4_ib_qp_event_wq; ++ + static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) + { + if (!mlx4_is_master(dev->dev)) +@@ -200,50 +208,77 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n) + } + } + ++static void mlx4_ib_handle_qp_event(struct work_struct *_work) ++{ ++ struct mlx4_ib_qp_event_work *qpe_work = ++ container_of(_work, struct mlx4_ib_qp_event_work, work); ++ struct ib_qp *ibqp = &to_mibqp(qpe_work->qp)->ibqp; ++ struct ib_event event = {}; ++ ++ event.device = ibqp->device; ++ event.element.qp = ibqp; ++ ++ switch (qpe_work->type) { ++ case MLX4_EVENT_TYPE_PATH_MIG: ++ event.event = IB_EVENT_PATH_MIG; ++ break; ++ case MLX4_EVENT_TYPE_COMM_EST: ++ event.event = IB_EVENT_COMM_EST; ++ break; ++ case MLX4_EVENT_TYPE_SQ_DRAINED: ++ event.event = IB_EVENT_SQ_DRAINED; ++ break; ++ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: ++ event.event = IB_EVENT_QP_LAST_WQE_REACHED; ++ break; ++ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: ++ event.event 
= IB_EVENT_QP_FATAL; ++ break; ++ case MLX4_EVENT_TYPE_PATH_MIG_FAILED: ++ event.event = IB_EVENT_PATH_MIG_ERR; ++ break; ++ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: ++ event.event = IB_EVENT_QP_REQ_ERR; ++ break; ++ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: ++ event.event = IB_EVENT_QP_ACCESS_ERR; ++ break; ++ default: ++ pr_warn("Unexpected event type %d on QP %06x\n", ++ qpe_work->type, qpe_work->qp->qpn); ++ goto out; ++ } ++ ++ ibqp->event_handler(&event, ibqp->qp_context); ++ ++out: ++ mlx4_put_qp(qpe_work->qp); ++ kfree(qpe_work); ++} ++ + static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) + { +- struct ib_event event; + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; ++ struct mlx4_ib_qp_event_work *qpe_work; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + +- if (ibqp->event_handler) { +- event.device = ibqp->device; +- event.element.qp = ibqp; +- switch (type) { +- case MLX4_EVENT_TYPE_PATH_MIG: +- event.event = IB_EVENT_PATH_MIG; +- break; +- case MLX4_EVENT_TYPE_COMM_EST: +- event.event = IB_EVENT_COMM_EST; +- break; +- case MLX4_EVENT_TYPE_SQ_DRAINED: +- event.event = IB_EVENT_SQ_DRAINED; +- break; +- case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: +- event.event = IB_EVENT_QP_LAST_WQE_REACHED; +- break; +- case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: +- event.event = IB_EVENT_QP_FATAL; +- break; +- case MLX4_EVENT_TYPE_PATH_MIG_FAILED: +- event.event = IB_EVENT_PATH_MIG_ERR; +- break; +- case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: +- event.event = IB_EVENT_QP_REQ_ERR; +- break; +- case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: +- event.event = IB_EVENT_QP_ACCESS_ERR; +- break; +- default: +- pr_warn("Unexpected event type %d " +- "on QP %06x\n", type, qp->qpn); +- return; +- } ++ if (!ibqp->event_handler) ++ goto out_no_handler; + +- ibqp->event_handler(&event, ibqp->qp_context); +- } ++ qpe_work = kzalloc(sizeof(*qpe_work), GFP_ATOMIC); ++ if (!qpe_work) ++ goto out_no_handler; ++ ++ qpe_work->qp = qp; ++ qpe_work->type = type; ++ INIT_WORK(&qpe_work->work, mlx4_ib_handle_qp_event); ++ queue_work(mlx4_ib_qp_event_wq, &qpe_work->work); ++ return; ++ ++out_no_handler: ++ mlx4_put_qp(qp); + } + + static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type) +@@ -4472,3 +4507,17 @@ void mlx4_ib_drain_rq(struct ib_qp *qp) + + handle_drain_completion(cq, &rdrain, dev); + } ++ ++int mlx4_ib_qp_event_init(void) ++{ ++ mlx4_ib_qp_event_wq = alloc_ordered_workqueue("mlx4_ib_qp_event_wq", 0); ++ if (!mlx4_ib_qp_event_wq) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void mlx4_ib_qp_event_cleanup(void) ++{ ++ destroy_workqueue(mlx4_ib_qp_event_wq); ++} +diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c +index 45a414e8d35fa..a22649617e017 100644 +--- a/drivers/infiniband/hw/mlx5/main.c ++++ b/drivers/infiniband/hw/mlx5/main.c +@@ -4410,6 +4410,10 @@ static int __init mlx5_ib_init(void) + return -ENOMEM; + } + ++ ret = mlx5_ib_qp_event_init(); ++ if (ret) ++ goto qp_event_err; ++ + mlx5_ib_odp_init(); + ret = mlx5r_rep_init(); + if (ret) +@@ -4427,6 +4431,8 @@ static int __init mlx5_ib_init(void) + mp_err: + mlx5r_rep_cleanup(); + rep_err: ++ mlx5_ib_qp_event_cleanup(); ++qp_event_err: + destroy_workqueue(mlx5_ib_event_wq); + free_page((unsigned long)xlt_emergency_page); + return ret; +@@ -4438,6 +4444,7 @@ static void __exit mlx5_ib_cleanup(void) + auxiliary_driver_unregister(&mlx5r_mp_driver); + mlx5r_rep_cleanup(); + ++ mlx5_ib_qp_event_cleanup(); + destroy_workqueue(mlx5_ib_event_wq); + free_page((unsigned 
long)xlt_emergency_page); + } +diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c +index d782a494abcda..43c0123babd10 100644 +--- a/drivers/infiniband/hw/mlx5/qp.c ++++ b/drivers/infiniband/hw/mlx5/qp.c +@@ -71,6 +71,14 @@ struct mlx5_modify_raw_qp_param { + u32 port; + }; + ++struct mlx5_ib_qp_event_work { ++ struct work_struct work; ++ struct mlx5_core_qp *qp; ++ int type; ++}; ++ ++static struct workqueue_struct *mlx5_ib_qp_event_wq; ++ + static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq); +@@ -302,51 +310,78 @@ int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, + return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc); + } + ++static void mlx5_ib_handle_qp_event(struct work_struct *_work) ++{ ++ struct mlx5_ib_qp_event_work *qpe_work = ++ container_of(_work, struct mlx5_ib_qp_event_work, work); ++ struct ib_qp *ibqp = &to_mibqp(qpe_work->qp)->ibqp; ++ struct ib_event event = {}; ++ ++ event.device = ibqp->device; ++ event.element.qp = ibqp; ++ switch (qpe_work->type) { ++ case MLX5_EVENT_TYPE_PATH_MIG: ++ event.event = IB_EVENT_PATH_MIG; ++ break; ++ case MLX5_EVENT_TYPE_COMM_EST: ++ event.event = IB_EVENT_COMM_EST; ++ break; ++ case MLX5_EVENT_TYPE_SQ_DRAINED: ++ event.event = IB_EVENT_SQ_DRAINED; ++ break; ++ case MLX5_EVENT_TYPE_SRQ_LAST_WQE: ++ event.event = IB_EVENT_QP_LAST_WQE_REACHED; ++ break; ++ case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: ++ event.event = IB_EVENT_QP_FATAL; ++ break; ++ case MLX5_EVENT_TYPE_PATH_MIG_FAILED: ++ event.event = IB_EVENT_PATH_MIG_ERR; ++ break; ++ case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: ++ event.event = IB_EVENT_QP_REQ_ERR; ++ break; ++ case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: ++ event.event = IB_EVENT_QP_ACCESS_ERR; ++ break; ++ default: ++ pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", ++ qpe_work->type, qpe_work->qp->qpn); ++ goto out; ++ } ++ ++ ibqp->event_handler(&event, ibqp->qp_context); ++ ++out: ++ mlx5_core_res_put(&qpe_work->qp->common); ++ kfree(qpe_work); ++} ++ + static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) + { + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; +- struct ib_event event; ++ struct mlx5_ib_qp_event_work *qpe_work; + + if (type == MLX5_EVENT_TYPE_PATH_MIG) { + /* This event is only valid for trans_qps */ + to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; + } + +- if (ibqp->event_handler) { +- event.device = ibqp->device; +- event.element.qp = ibqp; +- switch (type) { +- case MLX5_EVENT_TYPE_PATH_MIG: +- event.event = IB_EVENT_PATH_MIG; +- break; +- case MLX5_EVENT_TYPE_COMM_EST: +- event.event = IB_EVENT_COMM_EST; +- break; +- case MLX5_EVENT_TYPE_SQ_DRAINED: +- event.event = IB_EVENT_SQ_DRAINED; +- break; +- case MLX5_EVENT_TYPE_SRQ_LAST_WQE: +- event.event = IB_EVENT_QP_LAST_WQE_REACHED; +- break; +- case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: +- event.event = IB_EVENT_QP_FATAL; +- break; +- case MLX5_EVENT_TYPE_PATH_MIG_FAILED: +- event.event = IB_EVENT_PATH_MIG_ERR; +- break; +- case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: +- event.event = IB_EVENT_QP_REQ_ERR; +- break; +- case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: +- event.event = IB_EVENT_QP_ACCESS_ERR; +- break; +- default: +- pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); +- return; +- } ++ if (!ibqp->event_handler) ++ goto out_no_handler; + +- ibqp->event_handler(&event, ibqp->qp_context); +- } ++ qpe_work = kzalloc(sizeof(*qpe_work), GFP_ATOMIC); ++ if 
(!qpe_work) ++ goto out_no_handler; ++ ++ qpe_work->qp = qp; ++ qpe_work->type = type; ++ INIT_WORK(&qpe_work->work, mlx5_ib_handle_qp_event); ++ queue_work(mlx5_ib_qp_event_wq, &qpe_work->work); ++ return; ++ ++out_no_handler: ++ mlx5_core_res_put(&qp->common); + } + + static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, +@@ -5752,3 +5787,17 @@ int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter) + mutex_unlock(&mqp->mutex); + return err; + } ++ ++int mlx5_ib_qp_event_init(void) ++{ ++ mlx5_ib_qp_event_wq = alloc_ordered_workqueue("mlx5_ib_qp_event_wq", 0); ++ if (!mlx5_ib_qp_event_wq) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void mlx5_ib_qp_event_cleanup(void) ++{ ++ destroy_workqueue(mlx5_ib_qp_event_wq); ++} +diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h +index 5d4e140db99ce..fb2f4e030bb8f 100644 +--- a/drivers/infiniband/hw/mlx5/qp.h ++++ b/drivers/infiniband/hw/mlx5/qp.h +@@ -44,4 +44,6 @@ void mlx5_core_res_put(struct mlx5_core_rsc_common *res); + int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn); + int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); + int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); ++int mlx5_ib_qp_event_init(void); ++void mlx5_ib_qp_event_cleanup(void); + #endif /* _MLX5_IB_QP_H */ +diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c +index d4e7864c56f18..a824ff22f4615 100644 +--- a/drivers/infiniband/hw/mlx5/qpc.c ++++ b/drivers/infiniband/hw/mlx5/qpc.c +@@ -135,7 +135,8 @@ static int rsc_event_notifier(struct notifier_block *nb, + case MLX5_RES_SQ: + qp = (struct mlx5_core_qp *)common; + qp->event(qp, event_type); +- break; ++ /* Need to put resource in event handler */ ++ return NOTIFY_OK; + case MLX5_RES_DCT: + dct = (struct mlx5_core_dct *)common; + if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) +diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c +index 48cfaa7eaf50c..913ed255990f4 100644 +--- a/drivers/net/ethernet/mellanox/mlx4/qp.c ++++ b/drivers/net/ethernet/mellanox/mlx4/qp.c +@@ -46,6 +46,13 @@ + #define MLX4_BF_QP_SKIP_MASK 0xc0 + #define MLX4_MAX_BF_QP_RANGE 0x40 + ++void mlx4_put_qp(struct mlx4_qp *qp) ++{ ++ if (refcount_dec_and_test(&qp->refcount)) ++ complete(&qp->free); ++} ++EXPORT_SYMBOL_GPL(mlx4_put_qp); ++ + void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) + { + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; +@@ -64,10 +71,8 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) + return; + } + ++ /* Need to call mlx4_put_qp() in event handler */ + qp->event(qp, event_type); +- +- if (refcount_dec_and_test(&qp->refcount)) +- complete(&qp->free); + } + + /* used for INIT/CLOSE port logic */ +@@ -523,8 +528,7 @@ EXPORT_SYMBOL_GPL(mlx4_qp_remove); + + void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp) + { +- if (refcount_dec_and_test(&qp->refcount)) +- complete(&qp->free); ++ mlx4_put_qp(qp); + wait_for_completion(&qp->free); + + mlx4_qp_free_icm(dev, qp->qpn); +diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h +index b6b626157b03a..b9a7b1319f5d3 100644 +--- a/include/linux/mlx4/qp.h ++++ b/include/linux/mlx4/qp.h +@@ -504,4 +504,5 @@ static inline u16 folded_qp(u32 q) + + u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn); + ++void mlx4_put_qp(struct mlx4_qp *qp); + #endif /* MLX4_QP_H */ +diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h +index 
5582509003264..68fd6d22adfd4 100644 +--- a/include/rdma/ib_verbs.h ++++ b/include/rdma/ib_verbs.h +@@ -1162,7 +1162,7 @@ enum ib_qp_create_flags { + */ + + struct ib_qp_init_attr { +- /* Consumer's event_handler callback must not block */ ++ /* This callback occurs in workqueue context */ + void (*event_handler)(struct ib_event *, void *); + + void *qp_context; +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-add-work-to-remove-temporary-entries-from-.patch b/queue-6.1/rdma-mlx5-add-work-to-remove-temporary-entries-from-.patch new file mode 100644 index 0000000000..2eba6e0071 --- /dev/null +++ b/queue-6.1/rdma-mlx5-add-work-to-remove-temporary-entries-from-.patch @@ -0,0 +1,315 @@ +From be147ad5b5dbf2b210768ce67d652ae3e1d6ddf1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Jan 2023 00:28:07 +0200 +Subject: RDMA/mlx5: Add work to remove temporary entries from the cache + +From: Michael Guralnik + +[ Upstream commit 627122280c878cf5d3cda2d2c5a0a8f6a7e35cb7 ] + +The non-cache mkeys are stored in the cache only to shorten restarting +application time. Don't store them longer than needed. + +Configure cache entries that store non-cache MRs as temporary entries. If +30 seconds have passed and no user reclaimed the temporarily cached mkeys, +an asynchronous work will destroy the mkeys entries. + +Link: https://lore.kernel.org/r/20230125222807.6921-7-michaelgur@nvidia.com +Signed-off-by: Michael Guralnik +Signed-off-by: Jason Gunthorpe +Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 9 ++- + drivers/infiniband/hw/mlx5/mr.c | 94 ++++++++++++++++++++++------ + drivers/infiniband/hw/mlx5/odp.c | 2 +- + 3 files changed, 82 insertions(+), 23 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h +index f345e2ae394d2..7c72e0e9db54a 100644 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -770,6 +770,7 @@ struct mlx5_cache_ent { + struct rb_node node; + struct mlx5r_cache_rb_key rb_key; + ++ u8 is_tmp:1; + u8 disabled:1; + u8 fill_to_high_water:1; + +@@ -803,6 +804,7 @@ struct mlx5_mkey_cache { + struct mutex rb_lock; + struct dentry *fs_root; + unsigned long last_add; ++ struct delayed_work remove_ent_dwork; + }; + + struct mlx5_ib_port_resources { +@@ -1346,9 +1348,10 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); + int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); + int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); +-struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, +- struct mlx5r_cache_rb_key rb_key, +- bool persistent_entry); ++struct mlx5_cache_ent * ++mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, ++ struct mlx5r_cache_rb_key rb_key, ++ bool persistent_entry); + + struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + int access_flags, int access_mode, +diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c +index bf1ca7565be67..2c1a935734273 100644 +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -140,19 +140,16 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) + mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); + } + +- +-static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, +- void *to_store) ++static int push_mkey_locked(struct mlx5_cache_ent *ent, bool 
limit_pendings, ++ void *to_store) + { + XA_STATE(xas, &ent->mkeys, 0); + void *curr; + +- xa_lock_irq(&ent->mkeys); + if (limit_pendings && +- (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) { +- xa_unlock_irq(&ent->mkeys); ++ (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) + return -EAGAIN; +- } ++ + while (1) { + /* + * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version +@@ -191,6 +188,7 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, + break; + xa_lock_irq(&ent->mkeys); + } ++ xa_lock_irq(&ent->mkeys); + if (xas_error(&xas)) + return xas_error(&xas); + if (WARN_ON(curr)) +@@ -198,6 +196,17 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, + return 0; + } + ++static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, ++ void *to_store) ++{ ++ int ret; ++ ++ xa_lock_irq(&ent->mkeys); ++ ret = push_mkey_locked(ent, limit_pendings, to_store); ++ xa_unlock_irq(&ent->mkeys); ++ return ret; ++} ++ + static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent) + { + void *old; +@@ -545,7 +554,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) + { + lockdep_assert_held(&ent->mkeys.xa_lock); + +- if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) ++ if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp) + return; + if (ent->stored < ent->limit) { + ent->fill_to_high_water = true; +@@ -675,7 +684,6 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, + struct mlx5_cache_ent *cur; + int cmp; + +- mutex_lock(&cache->rb_lock); + /* Figure out where to put new node */ + while (*new) { + cur = rb_entry(*new, struct mlx5_cache_ent, node); +@@ -695,7 +703,6 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, + rb_link_node(&ent->node, parent, new); + rb_insert_color(&ent->node, &cache->rb_root); + +- mutex_unlock(&cache->rb_lock); + return 0; + } + +@@ -867,9 +874,10 @@ static void delay_time_func(struct timer_list *t) + WRITE_ONCE(dev->fill_delay, 0); + } + +-struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, +- struct mlx5r_cache_rb_key rb_key, +- bool persistent_entry) ++struct mlx5_cache_ent * ++mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, ++ struct mlx5r_cache_rb_key rb_key, ++ bool persistent_entry) + { + struct mlx5_cache_ent *ent; + int order; +@@ -882,6 +890,7 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, + xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); + ent->rb_key = rb_key; + ent->dev = dev; ++ ent->is_tmp = !persistent_entry; + + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + +@@ -905,11 +914,44 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, + ent->limit = 0; + + mlx5_mkey_cache_debugfs_add_ent(dev, ent); ++ } else { ++ mod_delayed_work(ent->dev->cache.wq, ++ &ent->dev->cache.remove_ent_dwork, ++ msecs_to_jiffies(30 * 1000)); + } + + return ent; + } + ++static void remove_ent_work_func(struct work_struct *work) ++{ ++ struct mlx5_mkey_cache *cache; ++ struct mlx5_cache_ent *ent; ++ struct rb_node *cur; ++ ++ cache = container_of(work, struct mlx5_mkey_cache, ++ remove_ent_dwork.work); ++ mutex_lock(&cache->rb_lock); ++ cur = rb_last(&cache->rb_root); ++ while (cur) { ++ ent = rb_entry(cur, struct mlx5_cache_ent, node); ++ cur = rb_prev(cur); ++ mutex_unlock(&cache->rb_lock); ++ ++ xa_lock_irq(&ent->mkeys); ++ if (!ent->is_tmp) { ++ xa_unlock_irq(&ent->mkeys); ++ mutex_lock(&cache->rb_lock); ++ continue; ++ } ++ xa_unlock_irq(&ent->mkeys); ++ ++ 
clean_keys(ent->dev, ent); ++ mutex_lock(&cache->rb_lock); ++ } ++ mutex_unlock(&cache->rb_lock); ++} ++ + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + { + struct mlx5_mkey_cache *cache = &dev->cache; +@@ -925,6 +967,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + mutex_init(&dev->slow_path_mutex); + mutex_init(&dev->cache.rb_lock); + dev->cache.rb_root = RB_ROOT; ++ INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func); + cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); +@@ -934,9 +977,10 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); + timer_setup(&dev->delay_timer, delay_time_func, 0); + mlx5_mkey_cache_debugfs_init(dev); ++ mutex_lock(&cache->rb_lock); + for (i = 0; i <= mkey_cache_max_order(dev); i++) { + rb_key.ndescs = 1 << (i + 2); +- ent = mlx5r_cache_create_ent(dev, rb_key, true); ++ ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); + if (IS_ERR(ent)) { + ret = PTR_ERR(ent); + goto err; +@@ -947,6 +991,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + if (ret) + goto err; + ++ mutex_unlock(&cache->rb_lock); + for (node = rb_first(root); node; node = rb_next(node)) { + ent = rb_entry(node, struct mlx5_cache_ent, node); + xa_lock_irq(&ent->mkeys); +@@ -957,6 +1002,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + return 0; + + err: ++ mutex_unlock(&cache->rb_lock); + mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); + return ret; + } +@@ -970,6 +1016,7 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) + if (!dev->cache.wq) + return 0; + ++ cancel_delayed_work_sync(&dev->cache.remove_ent_dwork); + mutex_lock(&dev->cache.rb_lock); + for (node = rb_first(root); node; node = rb_next(node)) { + ent = rb_entry(node, struct mlx5_cache_ent, node); +@@ -1752,33 +1799,42 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, + { + struct mlx5_mkey_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; ++ int ret; + + if (mr->mmkey.cache_ent) { + xa_lock_irq(&mr->mmkey.cache_ent->mkeys); + mr->mmkey.cache_ent->in_use--; +- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); + goto end; + } + + mutex_lock(&cache->rb_lock); + ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key); +- mutex_unlock(&cache->rb_lock); + if (ent) { + if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) { ++ if (ent->disabled) { ++ mutex_unlock(&cache->rb_lock); ++ return -EOPNOTSUPP; ++ } + mr->mmkey.cache_ent = ent; ++ xa_lock_irq(&mr->mmkey.cache_ent->mkeys); ++ mutex_unlock(&cache->rb_lock); + goto end; + } + } + +- ent = mlx5r_cache_create_ent(dev, mr->mmkey.rb_key, false); ++ ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false); ++ mutex_unlock(&cache->rb_lock); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + mr->mmkey.cache_ent = ent; ++ xa_lock_irq(&mr->mmkey.cache_ent->mkeys); + + end: +- return push_mkey(mr->mmkey.cache_ent, false, +- xa_mk_value(mr->mmkey.key)); ++ ret = push_mkey_locked(mr->mmkey.cache_ent, false, ++ xa_mk_value(mr->mmkey.key)); ++ xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); ++ return ret; + } + + int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c +index 96d4faabbff8a..6ba4aa1afdc2d 100644 +--- a/drivers/infiniband/hw/mlx5/odp.c ++++ b/drivers/infiniband/hw/mlx5/odp.c +@@ -1602,7 +1602,7 @@ int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) + if 
(!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return 0; + +- ent = mlx5r_cache_create_ent(dev, rb_key, true); ++ ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); + if (IS_ERR(ent)) + return PTR_ERR(ent); + +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-cache-all-user-cacheable-mkeys-on-dereg-mr.patch b/queue-6.1/rdma-mlx5-cache-all-user-cacheable-mkeys-on-dereg-mr.patch new file mode 100644 index 0000000000..c6c37afc4a --- /dev/null +++ b/queue-6.1/rdma-mlx5-cache-all-user-cacheable-mkeys-on-dereg-mr.patch @@ -0,0 +1,148 @@ +From 5a09f0237455bc487c3d8cb78b82b7263d23d8fe Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Jan 2023 00:28:06 +0200 +Subject: RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow + +From: Michael Guralnik + +[ Upstream commit dd1b913fb0d0e3e6d55e92d2319d954474dd66ac ] + +Currently, when dereging an MR, if the mkey doesn't belong to a cache +entry, it will be destroyed. As a result, the restart of applications +with many non-cached mkeys is not efficient since all the mkeys are +destroyed and then recreated. This process takes a long time (for 100,000 +MRs, it is ~20 seconds for dereg and ~28 seconds for re-reg). + +To shorten the restart runtime, insert all cacheable mkeys to the cache. +If there is no fitting entry to the mkey properties, create a temporary +entry that fits it. + +After a predetermined timeout, the cache entries will shrink to the +initial high limit. + +The mkeys will still be in the cache when consuming them again after an +application restart. Therefore, the registration will be much faster +(for 100,000 MRs, it is ~4 seconds for dereg and ~5 seconds for re-reg). + +The temporary cache entries created to store the non-cache mkeys are not +exposed through sysfs like the default cache entries. + +Link: https://lore.kernel.org/r/20230125222807.6921-6-michaelgur@nvidia.com +Signed-off-by: Michael Guralnik +Signed-off-by: Jason Gunthorpe +Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + + drivers/infiniband/hw/mlx5/mr.c | 55 +++++++++++++++++++++------- + 2 files changed, 44 insertions(+), 13 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h +index 7c9d5648947e9..f345e2ae394d2 100644 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -650,6 +650,8 @@ struct mlx5_ib_mkey { + unsigned int ndescs; + struct wait_queue_head wait; + refcount_t usecount; ++ /* User Mkey must hold either a rb_key or a cache_ent. */ ++ struct mlx5r_cache_rb_key rb_key; + struct mlx5_cache_ent *cache_ent; + }; + +diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c +index 1060b30a837a0..bf1ca7565be67 100644 +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -1110,15 +1110,14 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, + rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); + ent = mkey_cache_ent_from_rb_key(dev, rb_key); + /* +- * Matches access in alloc_cache_mr(). If the MR can't come from the +- * cache then synchronously create an uncached one. ++ * If the MR can't come from the cache then synchronously create an uncached ++ * one. 
+ */ +- if (!ent || ent->limit == 0 || +- !mlx5r_umr_can_reconfig(dev, 0, access_flags) || +- mlx5_umem_needs_ats(dev, umem, access_flags)) { ++ if (!ent) { + mutex_lock(&dev->slow_path_mutex); + mr = reg_create(pd, umem, iova, access_flags, page_size, false); + mutex_unlock(&dev->slow_path_mutex); ++ mr->mmkey.rb_key = rb_key; + return mr; + } + +@@ -1209,6 +1208,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, + goto err_2; + } + mr->mmkey.type = MLX5_MKEY_MR; ++ mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift); + mr->umem = umem; + set_mr_fields(dev, mr, umem->length, access_flags, iova); + kvfree(in); +@@ -1747,6 +1747,40 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) + } + } + ++static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, ++ struct mlx5_ib_mr *mr) ++{ ++ struct mlx5_mkey_cache *cache = &dev->cache; ++ struct mlx5_cache_ent *ent; ++ ++ if (mr->mmkey.cache_ent) { ++ xa_lock_irq(&mr->mmkey.cache_ent->mkeys); ++ mr->mmkey.cache_ent->in_use--; ++ xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); ++ goto end; ++ } ++ ++ mutex_lock(&cache->rb_lock); ++ ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key); ++ mutex_unlock(&cache->rb_lock); ++ if (ent) { ++ if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) { ++ mr->mmkey.cache_ent = ent; ++ goto end; ++ } ++ } ++ ++ ent = mlx5r_cache_create_ent(dev, mr->mmkey.rb_key, false); ++ if (IS_ERR(ent)) ++ return PTR_ERR(ent); ++ ++ mr->mmkey.cache_ent = ent; ++ ++end: ++ return push_mkey(mr->mmkey.cache_ent, false, ++ xa_mk_value(mr->mmkey.key)); ++} ++ + int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) + { + struct mlx5_ib_mr *mr = to_mmr(ibmr); +@@ -1792,16 +1826,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) + } + + /* Stop DMA */ +- if (mr->mmkey.cache_ent) { +- xa_lock_irq(&mr->mmkey.cache_ent->mkeys); +- mr->mmkey.cache_ent->in_use--; +- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); +- ++ if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length)) + if (mlx5r_umr_revoke_mr(mr) || +- push_mkey(mr->mmkey.cache_ent, false, +- xa_mk_value(mr->mmkey.key))) ++ cache_ent_find_and_store(dev, mr)) + mr->mmkey.cache_ent = NULL; +- } ++ + if (!mr->mmkey.cache_ent) { + rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); + if (rc) +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-change-the-cache-structure-to-an-rb-tree.patch b/queue-6.1/rdma-mlx5-change-the-cache-structure-to-an-rb-tree.patch new file mode 100644 index 0000000000..c76604c4fc --- /dev/null +++ b/queue-6.1/rdma-mlx5-change-the-cache-structure-to-an-rb-tree.patch @@ -0,0 +1,354 @@ +From 3a78949c3d99afa32e87cf8cfe46723a057ee4cb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Jan 2023 00:28:04 +0200 +Subject: RDMA/mlx5: Change the cache structure to an RB-tree + +From: Michael Guralnik + +[ Upstream commit b9584517832858a0f78d6851d09b697a829514cd ] + +Currently, the cache structure is a static linear array. Therefore, his +size is limited to the number of entries in it and is not expandable. The +entries are dedicated to mkeys of size 2^x and no access_flags. Mkeys with +different properties are not cacheable. + +In this patch, we change the cache structure to an RB-tree. This will +allow to extend the cache to support more entries with different mkey +properties. 
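The lookup semantics the series relies on can be shown with a toy, self-contained program (a plain unbalanced BST stands in for the kernel rb-tree; all names are invented): entries are keyed by order, duplicates are rejected, and a request is served by the smallest entry whose order is greater than or equal to the requested one.

#include <stdio.h>
#include <stdlib.h>

struct cache_ent {
	unsigned int order;
	struct cache_ent *left, *right;
};

static struct cache_ent *insert(struct cache_ent *root, unsigned int order)
{
	if (!root) {
		struct cache_ent *e = calloc(1, sizeof(*e));

		if (e)
			e->order = order;
		return e;
	}
	if (order < root->order)
		root->left = insert(root->left, order);
	else if (order > root->order)
		root->right = insert(root->right, order);
	/* equal order: already present, mirrors the -EEXIST case */
	return root;
}

/* Smallest entry with ent->order >= order, or NULL if none exists. */
static struct cache_ent *lookup(struct cache_ent *root, unsigned int order)
{
	struct cache_ent *smallest = NULL;

	while (root) {
		if (root->order == order)
			return root;
		if (root->order > order) {
			smallest = root;
			root = root->left;
		} else {
			root = root->right;
		}
	}
	return smallest;
}

int main(void)
{
	static const unsigned int orders[] = { 2, 3, 5, 8 };
	struct cache_ent *root = NULL;
	struct cache_ent *e;
	unsigned int i;

	for (i = 0; i < 4; i++)
		root = insert(root, orders[i]);

	e = lookup(root, 4);
	printf("order 4 served from entry of order %u\n", e ? e->order : 0);
	return 0;
}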
+ +Link: https://lore.kernel.org/r/20230125222807.6921-4-michaelgur@nvidia.com +Signed-off-by: Michael Guralnik +Signed-off-by: Jason Gunthorpe +Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 11 +- + drivers/infiniband/hw/mlx5/mr.c | 160 ++++++++++++++++++++------- + drivers/infiniband/hw/mlx5/odp.c | 8 +- + 3 files changed, 132 insertions(+), 47 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h +index 10c87901da27c..bd998ac8c29c1 100644 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -761,6 +761,8 @@ struct mlx5_cache_ent { + u32 access_mode; + unsigned int ndescs; + ++ struct rb_node node; ++ + u8 disabled:1; + u8 fill_to_high_water:1; + +@@ -790,8 +792,9 @@ struct mlx5r_async_create_mkey { + + struct mlx5_mkey_cache { + struct workqueue_struct *wq; +- struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES]; +- struct dentry *root; ++ struct rb_root rb_root; ++ struct mutex rb_lock; ++ struct dentry *fs_root; + unsigned long last_add; + }; + +@@ -1336,11 +1339,15 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); + int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); + int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); ++struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, ++ int order); + + struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + struct mlx5_cache_ent *ent, + int access_flags); + ++struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, u32 order, ++ int access_flags); + int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); + struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, +diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c +index 53fadd6edb68d..b3d83920d3cfb 100644 +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -515,18 +515,22 @@ static const struct file_operations limit_fops = { + + static bool someone_adding(struct mlx5_mkey_cache *cache) + { +- unsigned int i; +- +- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { +- struct mlx5_cache_ent *ent = &cache->ent[i]; +- bool ret; ++ struct mlx5_cache_ent *ent; ++ struct rb_node *node; ++ bool ret; + ++ mutex_lock(&cache->rb_lock); ++ for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) { ++ ent = rb_entry(node, struct mlx5_cache_ent, node); + xa_lock_irq(&ent->mkeys); + ret = ent->stored < ent->limit; + xa_unlock_irq(&ent->mkeys); +- if (ret) ++ if (ret) { ++ mutex_unlock(&cache->rb_lock); + return true; ++ } + } ++ mutex_unlock(&cache->rb_lock); + return false; + } + +@@ -637,6 +641,59 @@ static void delayed_cache_work_func(struct work_struct *work) + __cache_work_func(ent); + } + ++static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, ++ struct mlx5_cache_ent *ent) ++{ ++ struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL; ++ struct mlx5_cache_ent *cur; ++ ++ mutex_lock(&cache->rb_lock); ++ /* Figure out where to put new node */ ++ while (*new) { ++ cur = rb_entry(*new, struct mlx5_cache_ent, node); ++ parent = *new; ++ if (ent->order < cur->order) ++ new = &((*new)->rb_left); ++ if (ent->order > cur->order) ++ new = &((*new)->rb_right); ++ if (ent->order == cur->order) { ++ mutex_unlock(&cache->rb_lock); ++ return -EEXIST; ++ } ++ } ++ ++ /* Add new node and rebalance tree. 
*/ ++ rb_link_node(&ent->node, parent, new); ++ rb_insert_color(&ent->node, &cache->rb_root); ++ ++ mutex_unlock(&cache->rb_lock); ++ return 0; ++} ++ ++static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev, ++ unsigned int order) ++{ ++ struct rb_node *node = dev->cache.rb_root.rb_node; ++ struct mlx5_cache_ent *cur, *smallest = NULL; ++ ++ /* ++ * Find the smallest ent with order >= requested_order. ++ */ ++ while (node) { ++ cur = rb_entry(node, struct mlx5_cache_ent, node); ++ if (cur->order > order) { ++ smallest = cur; ++ node = node->rb_left; ++ } ++ if (cur->order < order) ++ node = node->rb_right; ++ if (cur->order == order) ++ return cur; ++ } ++ ++ return smallest; ++} ++ + struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + struct mlx5_cache_ent *ent, + int access_flags) +@@ -677,10 +734,16 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + return mr; + } + +-static void clean_keys(struct mlx5_ib_dev *dev, int c) ++struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, ++ u32 order, int access_flags) ++{ ++ struct mlx5_cache_ent *ent = mkey_cache_ent_from_order(dev, order); ++ ++ return mlx5_mr_cache_alloc(dev, ent, access_flags); ++} ++ ++static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) + { +- struct mlx5_mkey_cache *cache = &dev->cache; +- struct mlx5_cache_ent *ent = &cache->ent[c]; + u32 mkey; + + cancel_delayed_work(&ent->dwork); +@@ -699,8 +762,8 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) + if (!mlx5_debugfs_root || dev->is_rep) + return; + +- debugfs_remove_recursive(dev->cache.root); +- dev->cache.root = NULL; ++ debugfs_remove_recursive(dev->cache.fs_root); ++ dev->cache.fs_root = NULL; + } + + static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) +@@ -713,12 +776,13 @@ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) + if (!mlx5_debugfs_root || dev->is_rep) + return; + +- cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); ++ dir = mlx5_debugfs_get_dev_root(dev->mdev); ++ cache->fs_root = debugfs_create_dir("mr_cache", dir); + + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { +- ent = &cache->ent[i]; ++ ent = mkey_cache_ent_from_order(dev, i); + sprintf(ent->name, "%d", ent->order); +- dir = debugfs_create_dir(ent->name, cache->root); ++ dir = debugfs_create_dir(ent->name, cache->fs_root); + debugfs_create_file("size", 0600, dir, ent, &size_fops); + debugfs_create_file("limit", 0600, dir, ent, &limit_fops); + debugfs_create_ulong("cur", 0400, dir, &ent->stored); +@@ -733,6 +797,30 @@ static void delay_time_func(struct timer_list *t) + WRITE_ONCE(dev->fill_delay, 0); + } + ++struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, ++ int order) ++{ ++ struct mlx5_cache_ent *ent; ++ int ret; ++ ++ ent = kzalloc(sizeof(*ent), GFP_KERNEL); ++ if (!ent) ++ return ERR_PTR(-ENOMEM); ++ ++ xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); ++ ent->order = order; ++ ent->dev = dev; ++ ++ INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); ++ ++ ret = mlx5_cache_ent_insert(&dev->cache, ent); ++ if (ret) { ++ kfree(ent); ++ return ERR_PTR(ret); ++ } ++ return ent; ++} ++ + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + { + struct mlx5_mkey_cache *cache = &dev->cache; +@@ -740,6 +828,8 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + int i; + + mutex_init(&dev->slow_path_mutex); ++ mutex_init(&dev->cache.rb_lock); ++ dev->cache.rb_root = RB_ROOT; + cache->wq = 
alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); +@@ -749,13 +839,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); + timer_setup(&dev->delay_timer, delay_time_func, 0); + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { +- ent = &cache->ent[i]; +- xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); +- ent->order = i + 2; +- ent->dev = dev; +- ent->limit = 0; +- +- INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); ++ ent = mlx5r_cache_create_ent(dev, i); + + if (i > MKEY_CACHE_LAST_STD_ENTRY) { + mlx5_odp_init_mkey_cache_entry(ent); +@@ -785,14 +869,16 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + + int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) + { +- unsigned int i; ++ struct rb_root *root = &dev->cache.rb_root; ++ struct mlx5_cache_ent *ent; ++ struct rb_node *node; + + if (!dev->cache.wq) + return 0; + +- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { +- struct mlx5_cache_ent *ent = &dev->cache.ent[i]; +- ++ mutex_lock(&dev->cache.rb_lock); ++ for (node = rb_first(root); node; node = rb_next(node)) { ++ ent = rb_entry(node, struct mlx5_cache_ent, node); + xa_lock_irq(&ent->mkeys); + ent->disabled = true; + xa_unlock_irq(&ent->mkeys); +@@ -802,8 +888,15 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) + mlx5_mkey_cache_debugfs_cleanup(dev); + mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); + +- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) +- clean_keys(dev, i); ++ node = rb_first(root); ++ while (node) { ++ ent = rb_entry(node, struct mlx5_cache_ent, node); ++ node = rb_next(node); ++ clean_keys(dev, ent); ++ rb_erase(&ent->node, root); ++ kfree(ent); ++ } ++ mutex_unlock(&dev->cache.rb_lock); + + destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); +@@ -876,19 +969,6 @@ static int mkey_cache_max_order(struct mlx5_ib_dev *dev) + return MLX5_MAX_UMR_SHIFT; + } + +-static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev, +- unsigned int order) +-{ +- struct mlx5_mkey_cache *cache = &dev->cache; +- +- if (order < cache->ent[0].order) +- return &cache->ent[0]; +- order = order - cache->ent[0].order; +- if (order > MKEY_CACHE_LAST_STD_ENTRY) +- return NULL; +- return &cache->ent[order]; +-} +- + static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + u64 length, int access_flags, u64 iova) + { +diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c +index 5f0a17382de73..7f68940ca0d1e 100644 +--- a/drivers/infiniband/hw/mlx5/odp.c ++++ b/drivers/infiniband/hw/mlx5/odp.c +@@ -420,8 +420,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + return ERR_CAST(odp); + + BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY); +- mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order], +- imr->access_flags); ++ mr = mlx5_mr_cache_alloc_order(dev, order, imr->access_flags); + if (IS_ERR(mr)) { + ib_umem_odp_release(odp); + return mr; +@@ -495,9 +494,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + if (IS_ERR(umem_odp)) + return ERR_CAST(umem_odp); + +- imr = mlx5_mr_cache_alloc(dev, +- &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY], +- access_flags); ++ imr = mlx5_mr_cache_alloc_order(dev, MLX5_IMR_KSM_CACHE_ENTRY, ++ access_flags); + if (IS_ERR(imr)) { + ib_umem_odp_release(umem_odp); + return imr; +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-don-t-keep-umrable-page_shift-in-cache-ent.patch 
b/queue-6.1/rdma-mlx5-don-t-keep-umrable-page_shift-in-cache-ent.patch new file mode 100644 index 0000000000..2caa17ca8b --- /dev/null +++ b/queue-6.1/rdma-mlx5-don-t-keep-umrable-page_shift-in-cache-ent.patch @@ -0,0 +1,83 @@ +From a85b91bcb6fce39a7511353461ead5a60b13bc69 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Jan 2023 00:28:02 +0200 +Subject: RDMA/mlx5: Don't keep umrable 'page_shift' in cache entries + +From: Aharon Landau + +[ Upstream commit a2a88b8e22d1b202225d0e40b02ad068afab2ccb ] + +mkc.log_page_size can be changed using UMR. Therefore, don't treat it as a +cache entry property. + +Removing it from struct mlx5_cache_ent. + +All cache mkeys will be created with default PAGE_SHIFT, and updated with +the needed page_shift using UMR when passing them to a user. + +Link: https://lore.kernel.org/r/20230125222807.6921-2-michaelgur@nvidia.com +Signed-off-by: Aharon Landau +Signed-off-by: Jason Gunthorpe +Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - + drivers/infiniband/hw/mlx5/mr.c | 3 +-- + drivers/infiniband/hw/mlx5/odp.c | 2 -- + 3 files changed, 1 insertion(+), 5 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h +index 0ef347e91ffeb..10c87901da27c 100644 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -759,7 +759,6 @@ struct mlx5_cache_ent { + char name[4]; + u32 order; + u32 access_mode; +- u32 page; + unsigned int ndescs; + + u8 disabled:1; +diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c +index b81b03aa2a629..53fadd6edb68d 100644 +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -297,7 +297,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) + + MLX5_SET(mkc, mkc, translations_octword_size, + get_mkc_octo_size(ent->access_mode, ent->ndescs)); +- MLX5_SET(mkc, mkc, log_page_size, ent->page); ++ MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); + } + + /* Asynchronously schedule new MRs to be populated in the cache. 
*/ +@@ -765,7 +765,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + if (ent->order > mkey_cache_max_order(dev)) + continue; + +- ent->page = PAGE_SHIFT; + ent->ndescs = 1 << ent->order; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && +diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c +index 87fbee8061003..a5c9baec8be85 100644 +--- a/drivers/infiniband/hw/mlx5/odp.c ++++ b/drivers/infiniband/hw/mlx5/odp.c +@@ -1598,14 +1598,12 @@ void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) + + switch (ent->order - 2) { + case MLX5_IMR_MTT_CACHE_ENTRY: +- ent->page = PAGE_SHIFT; + ent->ndescs = MLX5_IMR_MTT_ENTRIES; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + ent->limit = 0; + break; + + case MLX5_IMR_KSM_CACHE_ENTRY: +- ent->page = MLX5_KSM_PAGE_SHIFT; + ent->ndescs = mlx5_imr_ksm_entries; + ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; + ent->limit = 0; +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-fix-ah-static-rate-parsing.patch b/queue-6.1/rdma-mlx5-fix-ah-static-rate-parsing.patch new file mode 100644 index 0000000000..7c8747036d --- /dev/null +++ b/queue-6.1/rdma-mlx5-fix-ah-static-rate-parsing.patch @@ -0,0 +1,84 @@ +From b79f406d4cc08e99e836a5e95040672efdba5313 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 10 Feb 2025 13:32:39 +0200 +Subject: RDMA/mlx5: Fix AH static rate parsing + +From: Patrisious Haddad + +[ Upstream commit c534ffda781f44a1c6ac25ef6e0e444da38ca8af ] + +Previously static rate wasn't translated according to our PRM but simply +used the 4 lower bytes. + +Correctly translate static rate value passed in AH creation attribute +according to our PRM expected values. + +In addition change 800GB mapping to zero, which is the PRM +specified value. 
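A sketch of the packing this implies (standalone and purely illustrative; the enum values below follow the ib_verbs numbering for these particular rates, but the translation function is a made-up stand-in for the capability-checked PRM lookup in the driver): the rate is translated first and only then shifted into the upper four bits of the combined rate/SL byte, with "port current" and 800Gbps both collapsing to zero.

#include <stdint.h>
#include <stdio.h>

enum ib_rate_sketch {
	RATE_PORT_CURRENT = 0,
	RATE_100_GBPS = 16,
	RATE_800_GBPS = 23,
};

/* Stand-in translation: 0 means "no static rate, use the port's current rate". */
static uint8_t rate_to_hw(enum ib_rate_sketch rate)
{
	if (rate == RATE_PORT_CURRENT || rate == RATE_800_GBPS)
		return 0;
	return (uint8_t)(rate - RATE_100_GBPS + 1);	/* fake device encoding */
}

static uint8_t pack_stat_rate_sl(enum ib_rate_sketch rate, uint8_t sl)
{
	return (uint8_t)((rate_to_hw(rate) << 4) | (sl & 0xf));
}

int main(void)
{
	printf("100G, sl 3 -> 0x%02x\n", pack_stat_rate_sl(RATE_100_GBPS, 3));
	printf("800G, sl 3 -> 0x%02x\n", pack_stat_rate_sl(RATE_800_GBPS, 3));
	return 0;
}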
+ +Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") +Signed-off-by: Patrisious Haddad +Reviewed-by: Maor Gottlieb +Link: https://patch.msgid.link/18ef4cc5396caf80728341eb74738cd777596f60.1739187089.git.leon@kernel.org +Signed-off-by: Leon Romanovsky +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/ah.c | 3 ++- + drivers/infiniband/hw/mlx5/qp.c | 6 +++--- + drivers/infiniband/hw/mlx5/qp.h | 1 + + 3 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c +index 505bc47fd575d..99036afb3aef0 100644 +--- a/drivers/infiniband/hw/mlx5/ah.c ++++ b/drivers/infiniband/hw/mlx5/ah.c +@@ -67,7 +67,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, + ah->av.tclass = grh->traffic_class; + } + +- ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); ++ ah->av.stat_rate_sl = ++ (mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (init_attr->xmit_slave) +diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c +index 43c0123babd10..59dca0cd89052 100644 +--- a/drivers/infiniband/hw/mlx5/qp.c ++++ b/drivers/infiniband/hw/mlx5/qp.c +@@ -3379,11 +3379,11 @@ static int ib_to_mlx5_rate_map(u8 rate) + return 0; + } + +-static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) ++int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate) + { + u32 stat_rate_support; + +- if (rate == IB_RATE_PORT_CURRENT) ++ if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS) + return 0; + + if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS) +@@ -3528,7 +3528,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + sizeof(grh->dgid.raw)); + } + +- err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); ++ err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah)); + if (err < 0) + return err; + MLX5_SET(ads, path, stat_rate, err); +diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h +index e677fa0ca4226..4abb77d551670 100644 +--- a/drivers/infiniband/hw/mlx5/qp.h ++++ b/drivers/infiniband/hw/mlx5/qp.h +@@ -55,4 +55,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); + int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); + int mlx5_ib_qp_event_init(void); + void mlx5_ib_qp_event_cleanup(void); ++int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate); + #endif /* _MLX5_IB_QP_H */ +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-fix-bind-qp-error-cleanup-flow.patch b/queue-6.1/rdma-mlx5-fix-bind-qp-error-cleanup-flow.patch new file mode 100644 index 0000000000..d692dc9479 --- /dev/null +++ b/queue-6.1/rdma-mlx5-fix-bind-qp-error-cleanup-flow.patch @@ -0,0 +1,63 @@ +From e1c50bbca08c17189cc312b70852128bf1271cfb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 20 Feb 2025 08:47:10 +0200 +Subject: RDMA/mlx5: Fix bind QP error cleanup flow + +From: Patrisious Haddad + +[ Upstream commit e1a0bdbdfdf08428f0ede5ae49c7f4139ac73ef5 ] + +When there is a failure during bind QP, the cleanup flow destroys the +counter regardless if it is the one that created it or not, which is +problematic since if it isn't the one that created it, that counter could +still be in use. + +Fix that by destroying the counter only if it was created during this call. 
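The shape of the fix is the generic "only undo what this call created" pattern; a self-contained sketch (invented names, failure forced on purpose) for illustration:

#include <stdbool.h>
#include <stdio.h>

struct counter {
	unsigned int id;	/* 0 means "not allocated yet" */
};

static int alloc_counter(struct counter *c)
{
	c->id = 42;		/* pretend the device handed back an id */
	return 0;
}

static void dealloc_counter(struct counter *c)
{
	printf("destroying counter %u\n", c->id);
	c->id = 0;
}

static int set_qp_counter(struct counter *c)
{
	(void)c;
	return -1;		/* force the error path for the demo */
}

static int bind_qp(struct counter *c)
{
	bool new_counter = false;
	int err;

	if (!c->id) {
		err = alloc_counter(c);
		if (err)
			return err;
		new_counter = true;
	}

	err = set_qp_counter(c);
	if (!err)
		return 0;

	/* Error path: release the counter only if this call allocated it. */
	if (new_counter)
		dealloc_counter(c);
	return err;
}

int main(void)
{
	struct counter mine = { .id = 0 };	/* allocated by bind_qp(), freed on error */
	struct counter shared = { .id = 7 };	/* pre-existing, must survive the failure */

	bind_qp(&mine);
	bind_qp(&shared);
	printf("shared counter id after failed bind: %u\n", shared.id);
	return 0;
}

A pre-existing counter may still be bound to other QPs, which is exactly why the unconditional cleanup was a problem.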
+ +Fixes: 45842fc627c7 ("IB/mlx5: Support statistic q counter configuration") +Signed-off-by: Patrisious Haddad +Reviewed-by: Mark Zhang +Link: https://patch.msgid.link/25dfefddb0ebefa668c32e06a94d84e3216257cf.1740033937.git.leon@kernel.org +Reviewed-by: Zhu Yanjun +Signed-off-by: Leon Romanovsky +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/counters.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c +index 3e1272695d993..9915504ad1e18 100644 +--- a/drivers/infiniband/hw/mlx5/counters.c ++++ b/drivers/infiniband/hw/mlx5/counters.c +@@ -444,6 +444,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp) + { + struct mlx5_ib_dev *dev = to_mdev(qp->device); ++ bool new = false; + int err; + + if (!counter->id) { +@@ -458,6 +459,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, + return err; + counter->id = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); ++ new = true; + } + + err = mlx5_ib_qp_set_counter(qp, counter); +@@ -467,8 +469,10 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, + return 0; + + fail_set_counter: +- mlx5_ib_counter_dealloc(counter); +- counter->id = 0; ++ if (new) { ++ mlx5_ib_counter_dealloc(counter); ++ counter->id = 0; ++ } + + return err; + } +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-fix-the-recovery-flow-of-the-umr-qp.patch b/queue-6.1/rdma-mlx5-fix-the-recovery-flow-of-the-umr-qp.patch new file mode 100644 index 0000000000..401d194fd6 --- /dev/null +++ b/queue-6.1/rdma-mlx5-fix-the-recovery-flow-of-the-umr-qp.patch @@ -0,0 +1,209 @@ +From 15ed43c7d41f9929ea55919272003c7ba5aec402 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 19 Jan 2025 14:36:13 +0200 +Subject: RDMA/mlx5: Fix the recovery flow of the UMR QP + +From: Yishai Hadas + +[ Upstream commit d97505baea64d93538b16baf14ce7b8c1fbad746 ] + +This patch addresses an issue in the recovery flow of the UMR QP, +ensuring tasks do not get stuck, as highlighted by the call trace [1]. + +During recovery, before transitioning the QP to the RESET state, the +software must wait for all outstanding WRs to complete. + +Failing to do so can cause the firmware to skip sending some flushed +CQEs with errors and simply discard them upon the RESET, as per the IB +specification. + +This race condition can result in lost CQEs and tasks becoming stuck. + +To resolve this, the patch sends a final WR which serves only as a +barrier before moving the QP state to RESET. + +Once a CQE is received for that final WR, it guarantees that no +outstanding WRs remain, making it safe to transition the QP to RESET and +subsequently back to RTS, restoring proper functionality. + +Note: +For the barrier WR, we simply reuse the failed and ready WR. +Since the QP is in an error state, it will only receive +IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier we don't +care about its status. + +[1] +INFO: task rdma_resource_l:1922 blocked for more than 120 seconds. +Tainted: G W 6.12.0-rc7+ #1626 +"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +task:rdma_resource_l state:D stack:0 pid:1922 tgid:1922 ppid:1369 + flags:0x00004004 +Call Trace: + +__schedule+0x420/0xd30 +schedule+0x47/0x130 +schedule_timeout+0x280/0x300 +? mark_held_locks+0x48/0x80 +? lockdep_hardirqs_on_prepare+0xe5/0x1a0 +wait_for_completion+0x75/0x130 +mlx5r_umr_post_send_wait+0x3c2/0x5b0 [mlx5_ib] +? 
__pfx_mlx5r_umr_done+0x10/0x10 [mlx5_ib] +mlx5r_umr_revoke_mr+0x93/0xc0 [mlx5_ib] +__mlx5_ib_dereg_mr+0x299/0x520 [mlx5_ib] +? _raw_spin_unlock_irq+0x24/0x40 +? wait_for_completion+0xfe/0x130 +? rdma_restrack_put+0x63/0xe0 [ib_core] +ib_dereg_mr_user+0x5f/0x120 [ib_core] +? lock_release+0xc6/0x280 +destroy_hw_idr_uobject+0x1d/0x60 [ib_uverbs] +uverbs_destroy_uobject+0x58/0x1d0 [ib_uverbs] +uobj_destroy+0x3f/0x70 [ib_uverbs] +ib_uverbs_cmd_verbs+0x3e4/0xbb0 [ib_uverbs] +? __pfx_uverbs_destroy_def_handler+0x10/0x10 [ib_uverbs] +? __lock_acquire+0x64e/0x2080 +? mark_held_locks+0x48/0x80 +? find_held_lock+0x2d/0xa0 +? lock_acquire+0xc1/0x2f0 +? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs] +? __fget_files+0xc3/0x1b0 +ib_uverbs_ioctl+0xe7/0x170 [ib_uverbs] +? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs] +__x64_sys_ioctl+0x1b0/0xa70 +do_syscall_64+0x6b/0x140 +entry_SYSCALL_64_after_hwframe+0x76/0x7e +RIP: 0033:0x7f99c918b17b +RSP: 002b:00007ffc766d0468 EFLAGS: 00000246 ORIG_RAX: + 0000000000000010 +RAX: ffffffffffffffda RBX: 00007ffc766d0578 RCX: + 00007f99c918b17b +RDX: 00007ffc766d0560 RSI: 00000000c0181b01 RDI: + 0000000000000003 +RBP: 00007ffc766d0540 R08: 00007f99c8f99010 R09: + 000000000000bd7e +R10: 00007f99c94c1c70 R11: 0000000000000246 R12: + 00007ffc766d0530 +R13: 000000000000001c R14: 0000000040246a80 R15: + 0000000000000000 + + +Fixes: 158e71bb69e3 ("RDMA/mlx5: Add a umr recovery flow") +Signed-off-by: Yishai Hadas +Reviewed-by: Michael Guralnik +Link: https://patch.msgid.link/27b51b92ec42dfb09d8096fcbd51878f397ce6ec.1737290141.git.leon@kernel.org +Signed-off-by: Leon Romanovsky +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/umr.c | 83 +++++++++++++++++++++----------- + 1 file changed, 56 insertions(+), 27 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c +index fa000182d0b41..1a39e86178ece 100644 +--- a/drivers/infiniband/hw/mlx5/umr.c ++++ b/drivers/infiniband/hw/mlx5/umr.c +@@ -199,30 +199,6 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) + ib_dealloc_pd(dev->umrc.pd); + } + +-static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) +-{ +- struct umr_common *umrc = &dev->umrc; +- struct ib_qp_attr attr; +- int err; +- +- attr.qp_state = IB_QPS_RESET; +- err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); +- if (err) { +- mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); +- goto err; +- } +- +- err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); +- if (err) +- goto err; +- +- umrc->state = MLX5_UMR_STATE_ACTIVE; +- return 0; +- +-err: +- umrc->state = MLX5_UMR_STATE_ERR; +- return err; +-} + + static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, + struct mlx5r_umr_wqe *wqe, bool with_data) +@@ -270,6 +246,61 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, + return err; + } + ++static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey, ++ struct mlx5r_umr_context *umr_context, ++ struct mlx5r_umr_wqe *wqe, bool with_data) ++{ ++ struct umr_common *umrc = &dev->umrc; ++ struct ib_qp_attr attr; ++ int err; ++ ++ mutex_lock(&umrc->lock); ++ /* Preventing any further WRs to be sent now */ ++ if (umrc->state != MLX5_UMR_STATE_RECOVER) { ++ mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n", ++ umrc->state); ++ umrc->state = MLX5_UMR_STATE_RECOVER; ++ } ++ mutex_unlock(&umrc->lock); ++ ++ /* Sending a final/barrier WR (the failed one) and wait for its completion. 
++ * This will ensure that all the previous WRs got a completion before ++ * we set the QP state to RESET. ++ */ ++ err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe, ++ with_data); ++ if (err) { ++ mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err); ++ goto err; ++ } ++ ++ /* Since the QP is in an error state, it will only receive ++ * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier ++ * we don't care about its status. ++ */ ++ wait_for_completion(&umr_context->done); ++ ++ attr.qp_state = IB_QPS_RESET; ++ err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); ++ if (err) { ++ mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err); ++ goto err; ++ } ++ ++ err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); ++ if (err) { ++ mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err); ++ goto err; ++ } ++ ++ umrc->state = MLX5_UMR_STATE_ACTIVE; ++ return 0; ++ ++err: ++ umrc->state = MLX5_UMR_STATE_ERR; ++ return err; ++} ++ + static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc) + { + struct mlx5_ib_umr_context *context = +@@ -334,9 +365,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, + mlx5_ib_warn(dev, + "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n", + umr_context.status, mkey); +- mutex_lock(&umrc->lock); +- err = mlx5r_umr_recover(dev); +- mutex_unlock(&umrc->lock); ++ err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data); + if (err) + mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", + err); +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-implement-mkeys-management-via-lifo-queue.patch b/queue-6.1/rdma-mlx5-implement-mkeys-management-via-lifo-queue.patch new file mode 100644 index 0000000000..c89fbf4410 --- /dev/null +++ b/queue-6.1/rdma-mlx5-implement-mkeys-management-via-lifo-queue.patch @@ -0,0 +1,704 @@ +From 73daa66bd410fa9662f7e4578ac5b58338c23b31 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Sep 2023 11:07:16 +0300 +Subject: RDMA/mlx5: Implement mkeys management via LIFO queue + +From: Shay Drory + +[ Upstream commit 57e7071683ef6148c9f5ea0ba84598d2ba681375 ] + +Currently, mkeys are managed via xarray. This implementation leads to +a degradation in cases many MRs are unregistered in parallel, due to xarray +internal implementation, for example: deregistration 1M MRs via 64 threads +is taking ~15% more time[1]. + +Hence, implement mkeys management via LIFO queue, which solved the +degradation. 
+ +[1] +2.8us in kernel v5.19 compare to 3.2us in kernel v6.4 + +Signed-off-by: Shay Drory +Link: https://lore.kernel.org/r/fde3d4cfab0f32f0ccb231cd113298256e1502c5.1695283384.git.leon@kernel.org +Signed-off-by: Leon Romanovsky +Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 21 +- + drivers/infiniband/hw/mlx5/mr.c | 324 ++++++++++++--------------- + drivers/infiniband/hw/mlx5/umr.c | 4 +- + 3 files changed, 169 insertions(+), 180 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h +index 7c72e0e9db54a..024d2071c6a5d 100644 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -760,10 +760,25 @@ struct umr_common { + unsigned int state; + }; + ++#define NUM_MKEYS_PER_PAGE \ ++ ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(u32)) ++ ++struct mlx5_mkeys_page { ++ u32 mkeys[NUM_MKEYS_PER_PAGE]; ++ struct list_head list; ++}; ++static_assert(sizeof(struct mlx5_mkeys_page) == PAGE_SIZE); ++ ++struct mlx5_mkeys_queue { ++ struct list_head pages_list; ++ u32 num_pages; ++ unsigned long ci; ++ spinlock_t lock; /* sync list ops */ ++}; ++ + struct mlx5_cache_ent { +- struct xarray mkeys; +- unsigned long stored; +- unsigned long reserved; ++ struct mlx5_mkeys_queue mkeys_queue; ++ u32 pending; + + char name[4]; + +diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c +index 2c1a935734273..b66b8346c2dc6 100644 +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -140,110 +140,47 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) + mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); + } + +-static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings, +- void *to_store) ++static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey) + { +- XA_STATE(xas, &ent->mkeys, 0); +- void *curr; ++ unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE; ++ struct mlx5_mkeys_page *page; + +- if (limit_pendings && +- (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) +- return -EAGAIN; +- +- while (1) { +- /* +- * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version +- * doesn't transparently unlock. Instead we set the xas index to +- * the current value of reserved every iteration. +- */ +- xas_set(&xas, ent->reserved); +- curr = xas_load(&xas); +- if (!curr) { +- if (to_store && ent->stored == ent->reserved) +- xas_store(&xas, to_store); +- else +- xas_store(&xas, XA_ZERO_ENTRY); +- if (xas_valid(&xas)) { +- ent->reserved++; +- if (to_store) { +- if (ent->stored != ent->reserved) +- __xa_store(&ent->mkeys, +- ent->stored, +- to_store, +- GFP_KERNEL); +- ent->stored++; +- queue_adjust_cache_locked(ent); +- WRITE_ONCE(ent->dev->cache.last_add, +- jiffies); +- } +- } +- } +- xa_unlock_irq(&ent->mkeys); +- +- /* +- * Notice xas_nomem() must always be called as it cleans +- * up any cached allocation. 
+- */ +- if (!xas_nomem(&xas, GFP_KERNEL)) +- break; +- xa_lock_irq(&ent->mkeys); ++ lockdep_assert_held(&ent->mkeys_queue.lock); ++ if (ent->mkeys_queue.ci >= ++ ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) { ++ page = kzalloc(sizeof(*page), GFP_ATOMIC); ++ if (!page) ++ return -ENOMEM; ++ ent->mkeys_queue.num_pages++; ++ list_add_tail(&page->list, &ent->mkeys_queue.pages_list); ++ } else { ++ page = list_last_entry(&ent->mkeys_queue.pages_list, ++ struct mlx5_mkeys_page, list); + } +- xa_lock_irq(&ent->mkeys); +- if (xas_error(&xas)) +- return xas_error(&xas); +- if (WARN_ON(curr)) +- return -EINVAL; +- return 0; +-} +- +-static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, +- void *to_store) +-{ +- int ret; +- +- xa_lock_irq(&ent->mkeys); +- ret = push_mkey_locked(ent, limit_pendings, to_store); +- xa_unlock_irq(&ent->mkeys); +- return ret; +-} +- +-static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent) +-{ +- void *old; +- +- ent->reserved--; +- old = __xa_erase(&ent->mkeys, ent->reserved); +- WARN_ON(old); +-} +- +-static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey) +-{ +- void *old; + +- old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0); +- WARN_ON(old); +- ent->stored++; ++ page->mkeys[tmp] = mkey; ++ ent->mkeys_queue.ci++; ++ return 0; + } + +-static u32 pop_stored_mkey(struct mlx5_cache_ent *ent) ++static int pop_mkey_locked(struct mlx5_cache_ent *ent) + { +- void *old, *xa_mkey; +- +- ent->stored--; +- ent->reserved--; ++ unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE; ++ struct mlx5_mkeys_page *last_page; ++ u32 mkey; + +- if (ent->stored == ent->reserved) { +- xa_mkey = __xa_erase(&ent->mkeys, ent->stored); +- WARN_ON(!xa_mkey); +- return (u32)xa_to_value(xa_mkey); ++ lockdep_assert_held(&ent->mkeys_queue.lock); ++ last_page = list_last_entry(&ent->mkeys_queue.pages_list, ++ struct mlx5_mkeys_page, list); ++ mkey = last_page->mkeys[tmp]; ++ last_page->mkeys[tmp] = 0; ++ ent->mkeys_queue.ci--; ++ if (ent->mkeys_queue.num_pages > 1 && !tmp) { ++ list_del(&last_page->list); ++ ent->mkeys_queue.num_pages--; ++ kfree(last_page); + } +- +- xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY, +- GFP_KERNEL); +- WARN_ON(!xa_mkey || xa_is_err(xa_mkey)); +- old = __xa_erase(&ent->mkeys, ent->reserved); +- WARN_ON(old); +- return (u32)xa_to_value(xa_mkey); ++ return mkey; + } + + static void create_mkey_callback(int status, struct mlx5_async_work *context) +@@ -257,10 +194,10 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) + if (status) { + create_mkey_warn(dev, status, mkey_out->out); + kfree(mkey_out); +- xa_lock_irqsave(&ent->mkeys, flags); +- undo_push_reserve_mkey(ent); ++ spin_lock_irqsave(&ent->mkeys_queue.lock, flags); ++ ent->pending--; + WRITE_ONCE(dev->fill_delay, 1); +- xa_unlock_irqrestore(&ent->mkeys, flags); ++ spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } +@@ -269,11 +206,12 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) + MLX5_GET(create_mkey_out, mkey_out->out, mkey_index)); + WRITE_ONCE(dev->cache.last_add, jiffies); + +- xa_lock_irqsave(&ent->mkeys, flags); +- push_to_reserved(ent, mkey_out->mkey); ++ spin_lock_irqsave(&ent->mkeys_queue.lock, flags); ++ push_mkey_locked(ent, mkey_out->mkey); + /* If we are doing fill_to_high_water then keep going. 
*/ + queue_adjust_cache_locked(ent); +- xa_unlock_irqrestore(&ent->mkeys, flags); ++ ent->pending--; ++ spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); + kfree(mkey_out); + } + +@@ -329,24 +267,28 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) + set_cache_mkc(ent, mkc); + async_create->ent = ent; + +- err = push_mkey(ent, true, NULL); +- if (err) ++ spin_lock_irq(&ent->mkeys_queue.lock); ++ if (ent->pending >= MAX_PENDING_REG_MR) { ++ err = -EAGAIN; + goto free_async_create; ++ } ++ ent->pending++; ++ spin_unlock_irq(&ent->mkeys_queue.lock); + + err = mlx5_ib_create_mkey_cb(async_create); + if (err) { + mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); +- goto err_undo_reserve; ++ goto err_create_mkey; + } + } + + return 0; + +-err_undo_reserve: +- xa_lock_irq(&ent->mkeys); +- undo_push_reserve_mkey(ent); +- xa_unlock_irq(&ent->mkeys); ++err_create_mkey: ++ spin_lock_irq(&ent->mkeys_queue.lock); ++ ent->pending--; + free_async_create: ++ spin_unlock_irq(&ent->mkeys_queue.lock); + kfree(async_create); + return err; + } +@@ -379,36 +321,36 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) + { + u32 mkey; + +- lockdep_assert_held(&ent->mkeys.xa_lock); +- if (!ent->stored) ++ lockdep_assert_held(&ent->mkeys_queue.lock); ++ if (!ent->mkeys_queue.ci) + return; +- mkey = pop_stored_mkey(ent); +- xa_unlock_irq(&ent->mkeys); ++ mkey = pop_mkey_locked(ent); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + mlx5_core_destroy_mkey(ent->dev->mdev, mkey); +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + } + + static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, + bool limit_fill) +- __acquires(&ent->mkeys) __releases(&ent->mkeys) ++ __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock) + { + int err; + +- lockdep_assert_held(&ent->mkeys.xa_lock); ++ lockdep_assert_held(&ent->mkeys_queue.lock); + + while (true) { + if (limit_fill) + target = ent->limit * 2; +- if (target == ent->reserved) ++ if (target == ent->pending + ent->mkeys_queue.ci) + return 0; +- if (target > ent->reserved) { +- u32 todo = target - ent->reserved; ++ if (target > ent->pending + ent->mkeys_queue.ci) { ++ u32 todo = target - (ent->pending + ent->mkeys_queue.ci); + +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + err = add_keys(ent, todo); + if (err == -EAGAIN) + usleep_range(3000, 5000); +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + if (err) { + if (err != -EAGAIN) + return err; +@@ -436,7 +378,7 @@ static ssize_t size_write(struct file *filp, const char __user *buf, + * cannot free MRs that are in use. Compute the target value for stored + * mkeys. 
+ */ +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + if (target < ent->in_use) { + err = -EINVAL; + goto err_unlock; +@@ -449,12 +391,12 @@ static ssize_t size_write(struct file *filp, const char __user *buf, + err = resize_available_mrs(ent, target, false); + if (err) + goto err_unlock; +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + + return count; + + err_unlock: +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + return err; + } + +@@ -465,7 +407,8 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count, + char lbuf[20]; + int err; + +- err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use); ++ err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ++ ent->mkeys_queue.ci + ent->in_use); + if (err < 0) + return err; + +@@ -494,10 +437,10 @@ static ssize_t limit_write(struct file *filp, const char __user *buf, + * Upon set we immediately fill the cache to high water mark implied by + * the limit. + */ +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + ent->limit = var; + err = resize_available_mrs(ent, 0, true); +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + if (err) + return err; + return count; +@@ -533,9 +476,9 @@ static bool someone_adding(struct mlx5_mkey_cache *cache) + mutex_lock(&cache->rb_lock); + for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) { + ent = rb_entry(node, struct mlx5_cache_ent, node); +- xa_lock_irq(&ent->mkeys); +- ret = ent->stored < ent->limit; +- xa_unlock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); ++ ret = ent->mkeys_queue.ci < ent->limit; ++ spin_unlock_irq(&ent->mkeys_queue.lock); + if (ret) { + mutex_unlock(&cache->rb_lock); + return true; +@@ -552,26 +495,26 @@ static bool someone_adding(struct mlx5_mkey_cache *cache) + */ + static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) + { +- lockdep_assert_held(&ent->mkeys.xa_lock); ++ lockdep_assert_held(&ent->mkeys_queue.lock); + + if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp) + return; +- if (ent->stored < ent->limit) { ++ if (ent->mkeys_queue.ci < ent->limit) { + ent->fill_to_high_water = true; + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); + } else if (ent->fill_to_high_water && +- ent->reserved < 2 * ent->limit) { ++ ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) { + /* + * Once we start populating due to hitting a low water mark + * continue until we pass the high water mark. 
+ */ + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); +- } else if (ent->stored == 2 * ent->limit) { ++ } else if (ent->mkeys_queue.ci == 2 * ent->limit) { + ent->fill_to_high_water = false; +- } else if (ent->stored > 2 * ent->limit) { ++ } else if (ent->mkeys_queue.ci > 2 * ent->limit) { + /* Queue deletion of excess entries */ + ent->fill_to_high_water = false; +- if (ent->stored != ent->reserved) ++ if (ent->pending) + queue_delayed_work(ent->dev->cache.wq, &ent->dwork, + msecs_to_jiffies(1000)); + else +@@ -585,15 +528,16 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) + struct mlx5_mkey_cache *cache = &dev->cache; + int err; + +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + if (ent->disabled) + goto out; + +- if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit && ++ if (ent->fill_to_high_water && ++ ent->mkeys_queue.ci + ent->pending < 2 * ent->limit && + !READ_ONCE(dev->fill_delay)) { +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + err = add_keys(ent, 1); +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + if (ent->disabled) + goto out; + if (err) { +@@ -611,7 +555,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) + msecs_to_jiffies(1000)); + } + } +- } else if (ent->stored > 2 * ent->limit) { ++ } else if (ent->mkeys_queue.ci > 2 * ent->limit) { + bool need_delay; + + /* +@@ -626,11 +570,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) + * the garbage collection work to try to run in next cycle, in + * order to free CPU resources to other tasks. + */ +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + need_delay = need_resched() || someone_adding(cache) || + !time_after(jiffies, + READ_ONCE(cache->last_add) + 300 * HZ); +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + if (ent->disabled) + goto out; + if (need_delay) { +@@ -641,7 +585,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) + queue_adjust_cache_locked(ent); + } + out: +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + } + + static void delayed_cache_work_func(struct work_struct *work) +@@ -749,25 +693,25 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + if (!mr) + return ERR_PTR(-ENOMEM); + +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + ent->in_use++; + +- if (!ent->stored) { ++ if (!ent->mkeys_queue.ci) { + queue_adjust_cache_locked(ent); + ent->miss++; +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + err = create_cache_mkey(ent, &mr->mmkey.key); + if (err) { +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + ent->in_use--; +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + kfree(mr); + return ERR_PTR(err); + } + } else { +- mr->mmkey.key = pop_stored_mkey(ent); ++ mr->mmkey.key = pop_mkey_locked(ent); + queue_adjust_cache_locked(ent); +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + } + mr->mmkey.cache_ent = ent; + mr->mmkey.type = MLX5_MKEY_MR; +@@ -820,14 +764,14 @@ static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) + u32 mkey; + + cancel_delayed_work(&ent->dwork); +- xa_lock_irq(&ent->mkeys); +- while (ent->stored) { +- mkey = pop_stored_mkey(ent); +- xa_unlock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); ++ while (ent->mkeys_queue.ci) { ++ mkey = pop_mkey_locked(ent); ++ 
spin_unlock_irq(&ent->mkeys_queue.lock); + mlx5_core_destroy_mkey(dev->mdev, mkey); +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + } +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + } + + static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +@@ -852,7 +796,7 @@ static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev, + dir = debugfs_create_dir(ent->name, dev->cache.fs_root); + debugfs_create_file("size", 0600, dir, ent, &size_fops); + debugfs_create_file("limit", 0600, dir, ent, &limit_fops); +- debugfs_create_ulong("cur", 0400, dir, &ent->stored); ++ debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci); + debugfs_create_u32("miss", 0600, dir, &ent->miss); + } + +@@ -874,6 +818,31 @@ static void delay_time_func(struct timer_list *t) + WRITE_ONCE(dev->fill_delay, 0); + } + ++static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent) ++{ ++ struct mlx5_mkeys_page *page; ++ ++ page = kzalloc(sizeof(*page), GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&ent->mkeys_queue.pages_list); ++ spin_lock_init(&ent->mkeys_queue.lock); ++ list_add_tail(&page->list, &ent->mkeys_queue.pages_list); ++ ent->mkeys_queue.num_pages++; ++ return 0; ++} ++ ++static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent) ++{ ++ struct mlx5_mkeys_page *page; ++ ++ WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1); ++ page = list_last_entry(&ent->mkeys_queue.pages_list, ++ struct mlx5_mkeys_page, list); ++ list_del(&page->list); ++ kfree(page); ++} ++ + struct mlx5_cache_ent * + mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, + struct mlx5r_cache_rb_key rb_key, +@@ -887,7 +856,9 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, + if (!ent) + return ERR_PTR(-ENOMEM); + +- xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); ++ ret = mlx5r_mkeys_init(ent); ++ if (ret) ++ goto mkeys_err; + ent->rb_key = rb_key; + ent->dev = dev; + ent->is_tmp = !persistent_entry; +@@ -895,10 +866,8 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + + ret = mlx5_cache_ent_insert(&dev->cache, ent); +- if (ret) { +- kfree(ent); +- return ERR_PTR(ret); +- } ++ if (ret) ++ goto ent_insert_err; + + if (persistent_entry) { + if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) +@@ -921,6 +890,11 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, + } + + return ent; ++ent_insert_err: ++ mlx5r_mkeys_uninit(ent); ++mkeys_err: ++ kfree(ent); ++ return ERR_PTR(ret); + } + + static void remove_ent_work_func(struct work_struct *work) +@@ -938,13 +912,13 @@ static void remove_ent_work_func(struct work_struct *work) + cur = rb_prev(cur); + mutex_unlock(&cache->rb_lock); + +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + if (!ent->is_tmp) { +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + mutex_lock(&cache->rb_lock); + continue; + } +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + + clean_keys(ent->dev, ent); + mutex_lock(&cache->rb_lock); +@@ -994,9 +968,9 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + mutex_unlock(&cache->rb_lock); + for (node = rb_first(root); node; node = rb_next(node)) { + ent = rb_entry(node, struct mlx5_cache_ent, node); +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + queue_adjust_cache_locked(ent); +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + } + + return 0; +@@ -1020,9 +994,9 @@ int 
mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) + mutex_lock(&dev->cache.rb_lock); + for (node = rb_first(root); node; node = rb_next(node)) { + ent = rb_entry(node, struct mlx5_cache_ent, node); +- xa_lock_irq(&ent->mkeys); ++ spin_lock_irq(&ent->mkeys_queue.lock); + ent->disabled = true; +- xa_unlock_irq(&ent->mkeys); ++ spin_unlock_irq(&ent->mkeys_queue.lock); + cancel_delayed_work_sync(&ent->dwork); + } + +@@ -1035,6 +1009,7 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) + node = rb_next(node); + clean_keys(dev, ent); + rb_erase(&ent->node, root); ++ mlx5r_mkeys_uninit(ent); + kfree(ent); + } + mutex_unlock(&dev->cache.rb_lock); +@@ -1802,7 +1777,7 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, + int ret; + + if (mr->mmkey.cache_ent) { +- xa_lock_irq(&mr->mmkey.cache_ent->mkeys); ++ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); + mr->mmkey.cache_ent->in_use--; + goto end; + } +@@ -1816,7 +1791,7 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, + return -EOPNOTSUPP; + } + mr->mmkey.cache_ent = ent; +- xa_lock_irq(&mr->mmkey.cache_ent->mkeys); ++ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); + mutex_unlock(&cache->rb_lock); + goto end; + } +@@ -1828,12 +1803,11 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, + return PTR_ERR(ent); + + mr->mmkey.cache_ent = ent; +- xa_lock_irq(&mr->mmkey.cache_ent->mkeys); ++ spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); + + end: +- ret = push_mkey_locked(mr->mmkey.cache_ent, false, +- xa_mk_value(mr->mmkey.key)); +- xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); ++ ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key); ++ spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); + return ret; + } + +diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c +index cb5cee3dee2b6..fa000182d0b41 100644 +--- a/drivers/infiniband/hw/mlx5/umr.c ++++ b/drivers/infiniband/hw/mlx5/umr.c +@@ -332,8 +332,8 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, + + WARN_ON_ONCE(1); + mlx5_ib_warn(dev, +- "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n", +- umr_context.status); ++ "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n", ++ umr_context.status, mkey); + mutex_lock(&umrc->lock); + err = mlx5r_umr_recover(dev); + mutex_unlock(&umrc->lock); +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-introduce-mlx5r_cache_rb_key.patch b/queue-6.1/rdma-mlx5-introduce-mlx5r_cache_rb_key.patch new file mode 100644 index 0000000000..21bcc7561d --- /dev/null +++ b/queue-6.1/rdma-mlx5-introduce-mlx5r_cache_rb_key.patch @@ -0,0 +1,565 @@ +From dee0c2d2ab0dbb79d87e227f8b4136f1764cefb4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Jan 2023 00:28:05 +0200 +Subject: RDMA/mlx5: Introduce mlx5r_cache_rb_key + +From: Michael Guralnik + +[ Upstream commit 73d09b2fe8336f5f37935e46418666ddbcd3c343 ] + +Switch from using the mkey order to using the new struct as the key to the +RB tree of cache entries. + +The key is all the mkey properties that UMR operations can't modify. +Using this key to define the cache entries and to search and create cache +mkeys. 
+ +Link: https://lore.kernel.org/r/20230125222807.6921-5-michaelgur@nvidia.com +Signed-off-by: Michael Guralnik +Signed-off-by: Jason Gunthorpe +Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 27 ++-- + drivers/infiniband/hw/mlx5/mr.c | 228 +++++++++++++++++++-------- + drivers/infiniband/hw/mlx5/odp.c | 30 ++-- + 3 files changed, 201 insertions(+), 84 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h +index bd998ac8c29c1..7c9d5648947e9 100644 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -637,6 +637,13 @@ enum mlx5_mkey_type { + MLX5_MKEY_INDIRECT_DEVX, + }; + ++struct mlx5r_cache_rb_key { ++ u8 ats:1; ++ unsigned int access_mode; ++ unsigned int access_flags; ++ unsigned int ndescs; ++}; ++ + struct mlx5_ib_mkey { + u32 key; + enum mlx5_mkey_type type; +@@ -757,11 +764,9 @@ struct mlx5_cache_ent { + unsigned long reserved; + + char name[4]; +- u32 order; +- u32 access_mode; +- unsigned int ndescs; + + struct rb_node node; ++ struct mlx5r_cache_rb_key rb_key; + + u8 disabled:1; + u8 fill_to_high_water:1; +@@ -1340,14 +1345,13 @@ int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); + int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); + struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, +- int order); ++ struct mlx5r_cache_rb_key rb_key, ++ bool persistent_entry); + + struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, +- struct mlx5_cache_ent *ent, +- int access_flags); ++ int access_flags, int access_mode, ++ int ndescs); + +-struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, u32 order, +- int access_flags); + int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); + struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, +@@ -1370,7 +1374,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq); + void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); + int __init mlx5_ib_odp_init(void); + void mlx5_ib_odp_cleanup(void); +-void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent); ++int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev); + void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags); + +@@ -1389,7 +1393,10 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, + static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {} + static inline int mlx5_ib_odp_init(void) { return 0; } + static inline void mlx5_ib_odp_cleanup(void) {} +-static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {} ++static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) ++{ ++ return 0; ++} + static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags) {} + +diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c +index b3d83920d3cfb..1060b30a837a0 100644 +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -292,11 +292,13 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) + set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); +- MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); +- MLX5_SET(mkc, mkc, access_mode_4_2, 
(ent->access_mode >> 2) & 0x7); ++ MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); ++ MLX5_SET(mkc, mkc, access_mode_4_2, ++ (ent->rb_key.access_mode >> 2) & 0x7); + + MLX5_SET(mkc, mkc, translations_octword_size, +- get_mkc_octo_size(ent->access_mode, ent->ndescs)); ++ get_mkc_octo_size(ent->rb_key.access_mode, ++ ent->rb_key.ndescs)); + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); + } + +@@ -594,8 +596,8 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) + if (err != -EAGAIN) { + mlx5_ib_warn( + dev, +- "command failed order %d, err %d\n", +- ent->order, err); ++ "add keys command failed, err %d\n", ++ err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } +@@ -641,22 +643,49 @@ static void delayed_cache_work_func(struct work_struct *work) + __cache_work_func(ent); + } + ++static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, ++ struct mlx5r_cache_rb_key key2) ++{ ++ int res; ++ ++ res = key1.ats - key2.ats; ++ if (res) ++ return res; ++ ++ res = key1.access_mode - key2.access_mode; ++ if (res) ++ return res; ++ ++ res = key1.access_flags - key2.access_flags; ++ if (res) ++ return res; ++ ++ /* ++ * keep ndescs the last in the compare table since the find function ++ * searches for an exact match on all properties and only closest ++ * match in size. ++ */ ++ return key1.ndescs - key2.ndescs; ++} ++ + static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, + struct mlx5_cache_ent *ent) + { + struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL; + struct mlx5_cache_ent *cur; ++ int cmp; + + mutex_lock(&cache->rb_lock); + /* Figure out where to put new node */ + while (*new) { + cur = rb_entry(*new, struct mlx5_cache_ent, node); + parent = *new; +- if (ent->order < cur->order) ++ cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key); ++ if (cmp > 0) + new = &((*new)->rb_left); +- if (ent->order > cur->order) ++ if (cmp < 0) + new = &((*new)->rb_right); +- if (ent->order == cur->order) { ++ if (cmp == 0) { + mutex_unlock(&cache->rb_lock); + return -EEXIST; + } +@@ -670,40 +699,45 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, + return 0; + } + +-static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev, +- unsigned int order) ++static struct mlx5_cache_ent * ++mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, ++ struct mlx5r_cache_rb_key rb_key) + { + struct rb_node *node = dev->cache.rb_root.rb_node; + struct mlx5_cache_ent *cur, *smallest = NULL; ++ int cmp; + + /* + * Find the smallest ent with order >= requested_order. + */ + while (node) { + cur = rb_entry(node, struct mlx5_cache_ent, node); +- if (cur->order > order) { ++ cmp = cache_ent_key_cmp(cur->rb_key, rb_key); ++ if (cmp > 0) { + smallest = cur; + node = node->rb_left; + } +- if (cur->order < order) ++ if (cmp < 0) + node = node->rb_right; +- if (cur->order == order) ++ if (cmp == 0) + return cur; + } + +- return smallest; ++ return (smallest && ++ smallest->rb_key.access_mode == rb_key.access_mode && ++ smallest->rb_key.access_flags == rb_key.access_flags && ++ smallest->rb_key.ats == rb_key.ats) ? 
++ smallest : ++ NULL; + } + +-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, +- struct mlx5_cache_ent *ent, +- int access_flags) ++static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, ++ struct mlx5_cache_ent *ent, ++ int access_flags) + { + struct mlx5_ib_mr *mr; + int err; + +- if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) +- return ERR_PTR(-EOPNOTSUPP); +- + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); +@@ -734,12 +768,44 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + return mr; + } + +-struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, +- u32 order, int access_flags) ++static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev, ++ int access_flags) ++{ ++ int ret = 0; ++ ++ if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) && ++ MLX5_CAP_GEN(dev->mdev, atomic) && ++ MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) ++ ret |= IB_ACCESS_REMOTE_ATOMIC; ++ ++ if ((access_flags & IB_ACCESS_RELAXED_ORDERING) && ++ MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) && ++ !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) ++ ret |= IB_ACCESS_RELAXED_ORDERING; ++ ++ if ((access_flags & IB_ACCESS_RELAXED_ORDERING) && ++ MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) && ++ !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) ++ ret |= IB_ACCESS_RELAXED_ORDERING; ++ ++ return ret; ++} ++ ++struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, ++ int access_flags, int access_mode, ++ int ndescs) + { +- struct mlx5_cache_ent *ent = mkey_cache_ent_from_order(dev, order); ++ struct mlx5r_cache_rb_key rb_key = { ++ .ndescs = ndescs, ++ .access_mode = access_mode, ++ .access_flags = get_unchangeable_access_flags(dev, access_flags) ++ }; ++ struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key); + +- return mlx5_mr_cache_alloc(dev, ent, access_flags); ++ if (!ent) ++ return ERR_PTR(-EOPNOTSUPP); ++ ++ return _mlx5_mr_cache_alloc(dev, ent, access_flags); + } + + static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) +@@ -766,28 +832,32 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) + dev->cache.fs_root = NULL; + } + ++static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev, ++ struct mlx5_cache_ent *ent) ++{ ++ int order = order_base_2(ent->rb_key.ndescs); ++ struct dentry *dir; ++ ++ if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) ++ order = MLX5_IMR_KSM_CACHE_ENTRY + 2; ++ ++ sprintf(ent->name, "%d", order); ++ dir = debugfs_create_dir(ent->name, dev->cache.fs_root); ++ debugfs_create_file("size", 0600, dir, ent, &size_fops); ++ debugfs_create_file("limit", 0600, dir, ent, &limit_fops); ++ debugfs_create_ulong("cur", 0400, dir, &ent->stored); ++ debugfs_create_u32("miss", 0600, dir, &ent->miss); ++} ++ + static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) + { ++ struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev); + struct mlx5_mkey_cache *cache = &dev->cache; +- struct mlx5_cache_ent *ent; +- struct dentry *dir; +- int i; + + if (!mlx5_debugfs_root || dev->is_rep) + return; + +- dir = mlx5_debugfs_get_dev_root(dev->mdev); +- cache->fs_root = debugfs_create_dir("mr_cache", dir); +- +- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { +- ent = mkey_cache_ent_from_order(dev, i); +- sprintf(ent->name, "%d", ent->order); +- dir = debugfs_create_dir(ent->name, cache->fs_root); +- debugfs_create_file("size", 0600, dir, ent, &size_fops); +- debugfs_create_file("limit", 0600, dir, 
ent, &limit_fops); +- debugfs_create_ulong("cur", 0400, dir, &ent->stored); +- debugfs_create_u32("miss", 0600, dir, &ent->miss); +- } ++ cache->fs_root = debugfs_create_dir("mr_cache", dbg_root); + } + + static void delay_time_func(struct timer_list *t) +@@ -798,9 +868,11 @@ static void delay_time_func(struct timer_list *t) + } + + struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, +- int order) ++ struct mlx5r_cache_rb_key rb_key, ++ bool persistent_entry) + { + struct mlx5_cache_ent *ent; ++ int order; + int ret; + + ent = kzalloc(sizeof(*ent), GFP_KERNEL); +@@ -808,7 +880,7 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, + return ERR_PTR(-ENOMEM); + + xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); +- ent->order = order; ++ ent->rb_key = rb_key; + ent->dev = dev; + + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); +@@ -818,13 +890,36 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev, + kfree(ent); + return ERR_PTR(ret); + } ++ ++ if (persistent_entry) { ++ if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) ++ order = MLX5_IMR_KSM_CACHE_ENTRY; ++ else ++ order = order_base_2(rb_key.ndescs) - 2; ++ ++ if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && ++ !dev->is_rep && mlx5_core_is_pf(dev->mdev) && ++ mlx5r_umr_can_load_pas(dev, 0)) ++ ent->limit = dev->mdev->profile.mr_cache[order].limit; ++ else ++ ent->limit = 0; ++ ++ mlx5_mkey_cache_debugfs_add_ent(dev, ent); ++ } ++ + return ent; + } + + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + { + struct mlx5_mkey_cache *cache = &dev->cache; ++ struct rb_root *root = &dev->cache.rb_root; ++ struct mlx5r_cache_rb_key rb_key = { ++ .access_mode = MLX5_MKC_ACCESS_MODE_MTT, ++ }; + struct mlx5_cache_ent *ent; ++ struct rb_node *node; ++ int ret; + int i; + + mutex_init(&dev->slow_path_mutex); +@@ -838,33 +933,32 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) + + mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); + timer_setup(&dev->delay_timer, delay_time_func, 0); +- for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { +- ent = mlx5r_cache_create_ent(dev, i); +- +- if (i > MKEY_CACHE_LAST_STD_ENTRY) { +- mlx5_odp_init_mkey_cache_entry(ent); +- continue; ++ mlx5_mkey_cache_debugfs_init(dev); ++ for (i = 0; i <= mkey_cache_max_order(dev); i++) { ++ rb_key.ndescs = 1 << (i + 2); ++ ent = mlx5r_cache_create_ent(dev, rb_key, true); ++ if (IS_ERR(ent)) { ++ ret = PTR_ERR(ent); ++ goto err; + } ++ } + +- if (ent->order > mkey_cache_max_order(dev)) +- continue; ++ ret = mlx5_odp_init_mkey_cache(dev); ++ if (ret) ++ goto err; + +- ent->ndescs = 1 << ent->order; +- ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; +- if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && +- !dev->is_rep && mlx5_core_is_pf(dev->mdev) && +- mlx5r_umr_can_load_pas(dev, 0)) +- ent->limit = dev->mdev->profile.mr_cache[i].limit; +- else +- ent->limit = 0; ++ for (node = rb_first(root); node; node = rb_next(node)) { ++ ent = rb_entry(node, struct mlx5_cache_ent, node); + xa_lock_irq(&ent->mkeys); + queue_adjust_cache_locked(ent); + xa_unlock_irq(&ent->mkeys); + } + +- mlx5_mkey_cache_debugfs_init(dev); +- + return 0; ++ ++err: ++ mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); ++ return ret; + } + + int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) +@@ -965,7 +1059,7 @@ static int get_octo_len(u64 addr, u64 len, int page_shift) + static int mkey_cache_max_order(struct mlx5_ib_dev *dev) + { + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) +- return 
MKEY_CACHE_LAST_STD_ENTRY + 2; ++ return MKEY_CACHE_LAST_STD_ENTRY; + return MLX5_MAX_UMR_SHIFT; + } + +@@ -995,6 +1089,9 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, + struct ib_umem *umem, u64 iova, + int access_flags) + { ++ struct mlx5r_cache_rb_key rb_key = { ++ .access_mode = MLX5_MKC_ACCESS_MODE_MTT, ++ }; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_cache_ent *ent; + struct mlx5_ib_mr *mr; +@@ -1007,8 +1104,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, + 0, iova); + if (WARN_ON(!page_size)) + return ERR_PTR(-EINVAL); +- ent = mkey_cache_ent_from_order( +- dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); ++ ++ rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); ++ rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); ++ rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); ++ ent = mkey_cache_ent_from_rb_key(dev, rb_key); + /* + * Matches access in alloc_cache_mr(). If the MR can't come from the + * cache then synchronously create an uncached one. +@@ -1022,7 +1122,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, + return mr; + } + +- mr = mlx5_mr_cache_alloc(dev, ent, access_flags); ++ mr = _mlx5_mr_cache_alloc(dev, ent, access_flags); + if (IS_ERR(mr)) + return mr; + +@@ -1452,7 +1552,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, + mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); + if (WARN_ON(!*page_size)) + return false; +- return (1ULL << mr->mmkey.cache_ent->order) >= ++ return (mr->mmkey.cache_ent->rb_key.ndescs) >= + ib_umem_num_dma_blocks(new_umem, *page_size); + } + +diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c +index 7f68940ca0d1e..96d4faabbff8a 100644 +--- a/drivers/infiniband/hw/mlx5/odp.c ++++ b/drivers/infiniband/hw/mlx5/odp.c +@@ -406,7 +406,6 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, + static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + unsigned long idx) + { +- int order = order_base_2(MLX5_IMR_MTT_ENTRIES); + struct mlx5_ib_dev *dev = mr_to_mdev(imr); + struct ib_umem_odp *odp; + struct mlx5_ib_mr *mr; +@@ -419,8 +418,9 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + if (IS_ERR(odp)) + return ERR_CAST(odp); + +- BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY); +- mr = mlx5_mr_cache_alloc_order(dev, order, imr->access_flags); ++ mr = mlx5_mr_cache_alloc(dev, imr->access_flags, ++ MLX5_MKC_ACCESS_MODE_MTT, ++ MLX5_IMR_MTT_ENTRIES); + if (IS_ERR(mr)) { + ib_umem_odp_release(odp); + return mr; +@@ -494,8 +494,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + if (IS_ERR(umem_odp)) + return ERR_CAST(umem_odp); + +- imr = mlx5_mr_cache_alloc_order(dev, MLX5_IMR_KSM_CACHE_ENTRY, +- access_flags); ++ imr = mlx5_mr_cache_alloc(dev, access_flags, MLX5_MKC_ACCESS_MODE_KSM, ++ mlx5_imr_ksm_entries); + if (IS_ERR(imr)) { + ib_umem_odp_release(umem_odp); + return imr; +@@ -1591,12 +1591,22 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) + return err; + } + +-void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) ++int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) + { +- if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) +- return; +- ent->ndescs = mlx5_imr_ksm_entries; +- ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; ++ struct mlx5r_cache_rb_key rb_key = { ++ .access_mode = MLX5_MKC_ACCESS_MODE_KSM, ++ .ndescs = mlx5_imr_ksm_entries, ++ }; 
++ struct mlx5_cache_ent *ent; ++ ++ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) ++ return 0; ++ ++ ent = mlx5r_cache_create_ent(dev, rb_key, true); ++ if (IS_ERR(ent)) ++ return PTR_ERR(ent); ++ ++ return 0; + } + + static const struct ib_device_ops mlx5_ib_dev_odp_ops = { +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-reduce-qp-table-exposure.patch b/queue-6.1/rdma-mlx5-reduce-qp-table-exposure.patch new file mode 100644 index 0000000000..8abe91b017 --- /dev/null +++ b/queue-6.1/rdma-mlx5-reduce-qp-table-exposure.patch @@ -0,0 +1,80 @@ +From 31e1b4f44049773843852197aab66262fea5d3ca Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 5 Jun 2023 13:14:05 +0300 +Subject: RDMA/mlx5: Reduce QP table exposure + +From: Leon Romanovsky + +[ Upstream commit 2ecfd946169e7f56534db2a5f6935858be3005ba ] + +driver.h is common header to whole mlx5 code base, but struct +mlx5_qp_table is used in mlx5_ib driver only. So move that struct +to be under sole responsibility of mlx5_ib. + +Link: https://lore.kernel.org/r/bec0dc1158e795813b135d1143147977f26bf668.1685953497.git.leon@kernel.org +Signed-off-by: Leon Romanovsky +Stable-dep-of: c534ffda781f ("RDMA/mlx5: Fix AH static rate parsing") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + + drivers/infiniband/hw/mlx5/qp.h | 11 ++++++++++- + include/linux/mlx5/driver.h | 9 --------- + 3 files changed, 11 insertions(+), 10 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h +index 024d2071c6a5d..5c533023a51a4 100644 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -25,6 +25,7 @@ + #include + + #include "srq.h" ++#include "qp.h" + + #define mlx5_ib_dbg(_dev, format, arg...) \ + dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ +diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h +index fb2f4e030bb8f..e677fa0ca4226 100644 +--- a/drivers/infiniband/hw/mlx5/qp.h ++++ b/drivers/infiniband/hw/mlx5/qp.h +@@ -6,7 +6,16 @@ + #ifndef _MLX5_IB_QP_H + #define _MLX5_IB_QP_H + +-#include "mlx5_ib.h" ++struct mlx5_ib_dev; ++ ++struct mlx5_qp_table { ++ struct notifier_block nb; ++ ++ /* protect radix tree ++ */ ++ spinlock_t lock; ++ struct radix_tree_root tree; ++}; + + int mlx5_init_qp_table(struct mlx5_ib_dev *dev); + void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev); +diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h +index 6cea62ca76d6b..060610183fdf9 100644 +--- a/include/linux/mlx5/driver.h ++++ b/include/linux/mlx5/driver.h +@@ -440,15 +440,6 @@ struct mlx5_core_health { + struct delayed_work update_fw_log_ts_work; + }; + +-struct mlx5_qp_table { +- struct notifier_block nb; +- +- /* protect radix tree +- */ +- spinlock_t lock; +- struct radix_tree_root tree; +-}; +- + enum { + MLX5_PF_NOTIFY_DISABLE_VF, + MLX5_PF_NOTIFY_ENABLE_VF, +-- +2.39.5 + diff --git a/queue-6.1/rdma-mlx5-remove-implicit-odp-cache-entry.patch b/queue-6.1/rdma-mlx5-remove-implicit-odp-cache-entry.patch new file mode 100644 index 0000000000..9ee1e9efa3 --- /dev/null +++ b/queue-6.1/rdma-mlx5-remove-implicit-odp-cache-entry.patch @@ -0,0 +1,83 @@ +From f1cf3c129548533fa9dc9569a22ff1ed3e3c9e02 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Jan 2023 00:28:03 +0200 +Subject: RDMA/mlx5: Remove implicit ODP cache entry + +From: Aharon Landau + +[ Upstream commit 18b1746bddf5e7f6b2618966596d9517172a5cd7 ] + +Implicit ODP mkey doesn't have unique properties. 
It shares the same +properties as the order 18 cache entry. There is no need to devote a +special entry for that. + +Link: https://lore.kernel.org/r/20230125222807.6921-3-michaelgur@nvidia.com +Signed-off-by: Aharon Landau +Signed-off-by: Jason Gunthorpe +Stable-dep-of: d97505baea64 ("RDMA/mlx5: Fix the recovery flow of the UMR QP") +Signed-off-by: Sasha Levin +--- + drivers/infiniband/hw/mlx5/odp.c | 20 +++++--------------- + include/linux/mlx5/driver.h | 1 - + 2 files changed, 5 insertions(+), 16 deletions(-) + +diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c +index a5c9baec8be85..5f0a17382de73 100644 +--- a/drivers/infiniband/hw/mlx5/odp.c ++++ b/drivers/infiniband/hw/mlx5/odp.c +@@ -406,6 +406,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, + static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + unsigned long idx) + { ++ int order = order_base_2(MLX5_IMR_MTT_ENTRIES); + struct mlx5_ib_dev *dev = mr_to_mdev(imr); + struct ib_umem_odp *odp; + struct mlx5_ib_mr *mr; +@@ -418,7 +419,8 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + if (IS_ERR(odp)) + return ERR_CAST(odp); + +- mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[MLX5_IMR_MTT_CACHE_ENTRY], ++ BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY); ++ mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order], + imr->access_flags); + if (IS_ERR(mr)) { + ib_umem_odp_release(odp); +@@ -1595,20 +1597,8 @@ void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) + { + if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return; +- +- switch (ent->order - 2) { +- case MLX5_IMR_MTT_CACHE_ENTRY: +- ent->ndescs = MLX5_IMR_MTT_ENTRIES; +- ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; +- ent->limit = 0; +- break; +- +- case MLX5_IMR_KSM_CACHE_ENTRY: +- ent->ndescs = mlx5_imr_ksm_entries; +- ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; +- ent->limit = 0; +- break; +- } ++ ent->ndescs = mlx5_imr_ksm_entries; ++ ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; + } + + static const struct ib_device_ops mlx5_ib_dev_odp_ops = { +diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h +index 3c3e0f26c2446..6cea62ca76d6b 100644 +--- a/include/linux/mlx5/driver.h ++++ b/include/linux/mlx5/driver.h +@@ -744,7 +744,6 @@ enum { + + enum { + MKEY_CACHE_LAST_STD_ENTRY = 20, +- MLX5_IMR_MTT_CACHE_ENTRY, + MLX5_IMR_KSM_CACHE_ENTRY, + MAX_MKEY_CACHE_ENTRIES + }; +-- +2.39.5 + diff --git a/queue-6.1/scsi-core-clear-driver-private-data-when-retrying-re.patch b/queue-6.1/scsi-core-clear-driver-private-data-when-retrying-re.patch new file mode 100644 index 0000000000..6c1e2a5a8c --- /dev/null +++ b/queue-6.1/scsi-core-clear-driver-private-data-when-retrying-re.patch @@ -0,0 +1,68 @@ +From 724a16cdc34e854b70a5dda60258077b0d252ad1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 17 Feb 2025 10:16:28 +0800 +Subject: scsi: core: Clear driver private data when retrying request + +From: Ye Bin + +[ Upstream commit dce5c4afd035e8090a26e5d776b1682c0e649683 ] + +After commit 1bad6c4a57ef ("scsi: zero per-cmd private driver data for each +MQ I/O"), the xen-scsifront/virtio_scsi/snic drivers all removed code that +explicitly zeroed driver-private command data. + +In combination with commit 464a00c9e0ad ("scsi: core: Kill DRIVER_SENSE"), +after virtio_scsi performs a capacity expansion, the first request will +return a unit attention to indicate that the capacity has changed. And then +the original command is retried. 
As driver-private command data was not +cleared, the request would return UA again and eventually time out and fail. + +Zero driver-private command data when a request is retried. + +Fixes: f7de50da1479 ("scsi: xen-scsifront: Remove code that zeroes driver-private command data") +Fixes: c2bb87318baa ("scsi: virtio_scsi: Remove code that zeroes driver-private command data") +Fixes: c3006a926468 ("scsi: snic: Remove code that zeroes driver-private command data") +Signed-off-by: Ye Bin +Reviewed-by: Bart Van Assche +Link: https://lore.kernel.org/r/20250217021628.2929248-1-yebin@huaweicloud.com +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/scsi_lib.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c +index 72d31b2267ef4..8e75eb1b6eab8 100644 +--- a/drivers/scsi/scsi_lib.c ++++ b/drivers/scsi/scsi_lib.c +@@ -1579,13 +1579,6 @@ static blk_status_t scsi_prepare_cmd(struct request *req) + if (in_flight) + __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); + +- /* +- * Only clear the driver-private command data if the LLD does not supply +- * a function to initialize that data. +- */ +- if (!shost->hostt->init_cmd_priv) +- memset(cmd + 1, 0, shost->hostt->cmd_size); +- + cmd->prot_op = SCSI_PROT_NORMAL; + if (blk_rq_bytes(req)) + cmd->sc_data_direction = rq_dma_dir(req); +@@ -1747,6 +1740,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, + if (!scsi_host_queue_ready(q, shost, sdev, cmd)) + goto out_dec_target_busy; + ++ /* ++ * Only clear the driver-private command data if the LLD does not supply ++ * a function to initialize that data. ++ */ ++ if (shost->hostt->cmd_size && !shost->hostt->init_cmd_priv) ++ memset(cmd + 1, 0, shost->hostt->cmd_size); ++ + if (!(req->rq_flags & RQF_DONTPREP)) { + ret = scsi_prepare_cmd(req); + if (ret != BLK_STS_OK) +-- +2.39.5 + diff --git a/queue-6.1/series b/queue-6.1/series index e4955ccadd..64540815b6 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -94,3 +94,22 @@ media-mediatek-vcodec-fix-h264-multi-stateless-decoder-smatch-warning.patch spi-atmel-quadspi-avoid-overwriting-delay-register-settings.patch spi-atmel-quadspi-fix-wrong-register-value-written-to-mr.patch netfilter-allow-exp-not-to-be-removed-in-nf_ct_find_expectation.patch +rdma-mlx5-don-t-keep-umrable-page_shift-in-cache-ent.patch +rdma-mlx5-remove-implicit-odp-cache-entry.patch +rdma-mlx5-change-the-cache-structure-to-an-rb-tree.patch +rdma-mlx5-introduce-mlx5r_cache_rb_key.patch +rdma-mlx5-cache-all-user-cacheable-mkeys-on-dereg-mr.patch +rdma-mlx5-add-work-to-remove-temporary-entries-from-.patch +rdma-mlx5-implement-mkeys-management-via-lifo-queue.patch +rdma-mlx5-fix-the-recovery-flow-of-the-umr-qp.patch +ib-mlx5-set-and-get-correct-qp_num-for-a-dct-qp.patch +ovl-fix-uaf-in-ovl_dentry_update_reval-by-moving-dpu.patch +sunrpc-convert-rpc_task_-constants-to-enum.patch +sunrpc-prevent-looping-due-to-rpc_signal_task-races.patch +rdma-mlx-calling-qp-event-handler-in-workqueue-conte.patch +rdma-mlx5-reduce-qp-table-exposure.patch +ib-core-add-support-for-xdr-link-speed.patch +rdma-mlx5-fix-ah-static-rate-parsing.patch +scsi-core-clear-driver-private-data-when-retrying-re.patch +rdma-mlx5-fix-bind-qp-error-cleanup-flow.patch +sunrpc-suppress-warnings-for-unused-procfs-functions.patch diff --git a/queue-6.1/sunrpc-convert-rpc_task_-constants-to-enum.patch b/queue-6.1/sunrpc-convert-rpc_task_-constants-to-enum.patch new file mode 100644 index 
0000000000..2852f70ed6 --- /dev/null +++ b/queue-6.1/sunrpc-convert-rpc_task_-constants-to-enum.patch @@ -0,0 +1,54 @@ +From 2efb3833aaf3c7ef2d5028b56fb87338ed97ed8d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 19 Aug 2024 08:58:59 -0700 +Subject: SUNRPC: convert RPC_TASK_* constants to enum + +From: Stephen Brennan + +[ Upstream commit 0b108e83795c9c23101f584ef7e3ab4f1f120ef0 ] + +The RPC_TASK_* constants are defined as macros, which means that most +kernel builds will not contain their definitions in the debuginfo. +However, it's quite useful for debuggers to be able to view the task +state constant and interpret it correctly. Conversion to an enum will +ensure the constants are present in debuginfo and can be interpreted by +debuggers without needing to hard-code them and track their changes. + +Signed-off-by: Stephen Brennan +Signed-off-by: Anna Schumaker +Stable-dep-of: 5bbd6e863b15 ("SUNRPC: Prevent looping due to rpc_signal_task() races") +Signed-off-by: Sasha Levin +--- + include/linux/sunrpc/sched.h | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h +index 8f9bee0e21c3b..f80b90aca380a 100644 +--- a/include/linux/sunrpc/sched.h ++++ b/include/linux/sunrpc/sched.h +@@ -140,13 +140,15 @@ struct rpc_task_setup { + #define RPC_WAS_SENT(t) ((t)->tk_flags & RPC_TASK_SENT) + #define RPC_IS_MOVEABLE(t) ((t)->tk_flags & RPC_TASK_MOVEABLE) + +-#define RPC_TASK_RUNNING 0 +-#define RPC_TASK_QUEUED 1 +-#define RPC_TASK_ACTIVE 2 +-#define RPC_TASK_NEED_XMIT 3 +-#define RPC_TASK_NEED_RECV 4 +-#define RPC_TASK_MSG_PIN_WAIT 5 +-#define RPC_TASK_SIGNALLED 6 ++enum { ++ RPC_TASK_RUNNING, ++ RPC_TASK_QUEUED, ++ RPC_TASK_ACTIVE, ++ RPC_TASK_NEED_XMIT, ++ RPC_TASK_NEED_RECV, ++ RPC_TASK_MSG_PIN_WAIT, ++ RPC_TASK_SIGNALLED, ++}; + + #define rpc_test_and_set_running(t) \ + test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) +-- +2.39.5 + diff --git a/queue-6.1/sunrpc-prevent-looping-due-to-rpc_signal_task-races.patch b/queue-6.1/sunrpc-prevent-looping-due-to-rpc_signal_task-races.patch new file mode 100644 index 0000000000..a30176179a --- /dev/null +++ b/queue-6.1/sunrpc-prevent-looping-due-to-rpc_signal_task-races.patch @@ -0,0 +1,79 @@ +From c19996437d22cce7a5e52b63d46399ef5d7795be Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 1 Feb 2025 15:00:02 -0500 +Subject: SUNRPC: Prevent looping due to rpc_signal_task() races + +From: Trond Myklebust + +[ Upstream commit 5bbd6e863b15a85221e49b9bdb2d5d8f0bb91f3d ] + +If rpc_signal_task() is called while a task is in an rpc_call_done() +callback function, and the latter calls rpc_restart_call(), the task can +end up looping due to the RPC_TASK_SIGNALLED flag being set without the +tk_rpc_status being set. +Removing the redundant mechanism for signalling the task fixes the +looping behaviour. 
+ +Reported-by: Li Lingfeng +Fixes: 39494194f93b ("SUNRPC: Fix races with rpc_killall_tasks()") +Signed-off-by: Trond Myklebust +Reviewed-by: Jeff Layton +Signed-off-by: Anna Schumaker +Signed-off-by: Sasha Levin +--- + include/linux/sunrpc/sched.h | 3 +-- + include/trace/events/sunrpc.h | 3 +-- + net/sunrpc/sched.c | 2 -- + 3 files changed, 2 insertions(+), 6 deletions(-) + +diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h +index f80b90aca380a..a220b28904ca5 100644 +--- a/include/linux/sunrpc/sched.h ++++ b/include/linux/sunrpc/sched.h +@@ -147,7 +147,6 @@ enum { + RPC_TASK_NEED_XMIT, + RPC_TASK_NEED_RECV, + RPC_TASK_MSG_PIN_WAIT, +- RPC_TASK_SIGNALLED, + }; + + #define rpc_test_and_set_running(t) \ +@@ -160,7 +159,7 @@ enum { + + #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) + +-#define RPC_SIGNALLED(t) test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate) ++#define RPC_SIGNALLED(t) (READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS) + + /* + * Task priorities. +diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h +index ffe2679a13ced..b70f47a57bf6d 100644 +--- a/include/trace/events/sunrpc.h ++++ b/include/trace/events/sunrpc.h +@@ -328,8 +328,7 @@ TRACE_EVENT(rpc_request, + { (1UL << RPC_TASK_ACTIVE), "ACTIVE" }, \ + { (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" }, \ + { (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" }, \ +- { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }, \ +- { (1UL << RPC_TASK_SIGNALLED), "SIGNALLED" }) ++ { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }) + + DECLARE_EVENT_CLASS(rpc_task_running, + +diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c +index cef623ea15060..9b45fbdc90cab 100644 +--- a/net/sunrpc/sched.c ++++ b/net/sunrpc/sched.c +@@ -864,8 +864,6 @@ void rpc_signal_task(struct rpc_task *task) + if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) + return; + trace_rpc_task_signalled(task, task->tk_action); +- set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); +- smp_mb__after_atomic(); + queue = READ_ONCE(task->tk_waitqueue); + if (queue) + rpc_wake_up_queued_task(queue, task); +-- +2.39.5 + diff --git a/queue-6.1/sunrpc-suppress-warnings-for-unused-procfs-functions.patch b/queue-6.1/sunrpc-suppress-warnings-for-unused-procfs-functions.patch new file mode 100644 index 0000000000..6a190fcd06 --- /dev/null +++ b/queue-6.1/sunrpc-suppress-warnings-for-unused-procfs-functions.patch @@ -0,0 +1,71 @@ +From d7fc0a014420a4f96c0e9ad14e2749c17d329b17 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 25 Feb 2025 15:52:21 +0100 +Subject: sunrpc: suppress warnings for unused procfs functions + +From: Arnd Bergmann + +[ Upstream commit 1f7a4f98c11fbeb18ed21f3b3a497e90a50ad2e0 ] + +There is a warning about unused variables when building with W=1 and no procfs: + +net/sunrpc/cache.c:1660:30: error: 'cache_flush_proc_ops' defined but not used [-Werror=unused-const-variable=] + 1660 | static const struct proc_ops cache_flush_proc_ops = { + | ^~~~~~~~~~~~~~~~~~~~ +net/sunrpc/cache.c:1622:30: error: 'content_proc_ops' defined but not used [-Werror=unused-const-variable=] + 1622 | static const struct proc_ops content_proc_ops = { + | ^~~~~~~~~~~~~~~~ +net/sunrpc/cache.c:1598:30: error: 'cache_channel_proc_ops' defined but not used [-Werror=unused-const-variable=] + 1598 | static const struct proc_ops cache_channel_proc_ops = { + | ^~~~~~~~~~~~~~~~~~~~~~ + +These are used inside of an #ifdef, so replacing that with an +IS_ENABLED() check lets the compiler see how they are used while +still dropping them during 
dead code elimination. + +Fixes: dbf847ecb631 ("knfsd: allow cache_register to return error on failure") +Reviewed-by: Jeff Layton +Acked-by: Chuck Lever +Signed-off-by: Arnd Bergmann +Signed-off-by: Anna Schumaker +Signed-off-by: Sasha Levin +--- + net/sunrpc/cache.c | 10 +++------- + 1 file changed, 3 insertions(+), 7 deletions(-) + +diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c +index 94889df659f0f..7ac4648c7da7f 100644 +--- a/net/sunrpc/cache.c ++++ b/net/sunrpc/cache.c +@@ -1675,12 +1675,14 @@ static void remove_cache_proc_entries(struct cache_detail *cd) + } + } + +-#ifdef CONFIG_PROC_FS + static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) + { + struct proc_dir_entry *p; + struct sunrpc_net *sn; + ++ if (!IS_ENABLED(CONFIG_PROC_FS)) ++ return 0; ++ + sn = net_generic(net, sunrpc_net_id); + cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); + if (cd->procfs == NULL) +@@ -1708,12 +1710,6 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) + remove_cache_proc_entries(cd); + return -ENOMEM; + } +-#else /* CONFIG_PROC_FS */ +-static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) +-{ +- return 0; +-} +-#endif + + void __init cache_initialize(void) + { +-- +2.39.5 +
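
A note on the idiom used in the final patch above: converting an #ifdef CONFIG_PROC_FS block into an IS_ENABLED(CONFIG_PROC_FS) check keeps the procfs setup code visible to the compiler (so W=1 no longer reports the proc_ops tables as unused) while still letting dead-code elimination drop it when procfs is compiled out; the proc_fs.h stubs for CONFIG_PROC_FS=n are what keep the build working. The sketch below is a minimal illustration of that idiom only, with invented names (demo_show, demo_create_proc) — it is not part of any queued patch and does not reproduce the sunrpc change.

	/*
	 * Illustrative sketch only -- not a queued patch.  Demonstrates the
	 * "#ifdef CONFIG_PROC_FS" -> "IS_ENABLED(CONFIG_PROC_FS)" conversion
	 * idiom.  All identifiers here (demo_show, demo_create_proc) are
	 * invented for the example.
	 */
	#include <linux/kconfig.h>
	#include <linux/proc_fs.h>
	#include <linux/seq_file.h>

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_puts(m, "hello\n");
		return 0;
	}

	static int demo_create_proc(void)
	{
		/*
		 * Before the conversion this function body would sit under
		 * "#ifdef CONFIG_PROC_FS", hiding it from the compiler and
		 * triggering unused-variable warnings for its helpers.  With
		 * IS_ENABLED() the compiler always parses the code, then
		 * removes the branch as dead code when CONFIG_PROC_FS=n
		 * (proc_fs.h provides no-op stubs so it still compiles).
		 */
		if (!IS_ENABLED(CONFIG_PROC_FS))
			return 0;

		if (!proc_create_single("demo", 0444, NULL, demo_show))
			return -ENOMEM;
		return 0;
	}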