git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blame - queue-6.8/rdma-cm-add-timeout-to-cm_destroy_id-wait.patch
Linux 6.6.27
[thirdparty/kernel/stable-queue.git] / queue-6.8 / rdma-cm-add-timeout-to-cm_destroy_id-wait.patch
CommitLineData
335f7cc0
SL
1From 71adc4b0ca414fddd7a83c47b370a48e1a3e7897 Mon Sep 17 00:00:00 2001
2From: Sasha Levin <sashal@kernel.org>
3Date: Fri, 8 Mar 2024 22:33:23 -0800
4Subject: RDMA/cm: add timeout to cm_destroy_id wait
5
6From: Manjunath Patil <manjunath.b.patil@oracle.com>
7
8[ Upstream commit 96d9cbe2f2ff7abde021bac75eafaceabe9a51fa ]
9
10Add timeout to cm_destroy_id, so that userspace can trigger any data
11collection that would help in analyzing the cause of delay in destroying
12the cm_id.
13
14The new noinline function gives dtrace/eBPF programs a place to hook.
15Existing functionality is unchanged, except that this new probe-able
16function is called at every timeout interval.
17
18We have seen cases where CM messages get stuck in the MAD layer (due to
19either a software bug or a faulty HCA), leaving the cm_id stuck in the
20following call stack. This patch helps resolve such issues faster.
21
22kernel: ... INFO: task XXXX:56778 blocked for more than 120 seconds.
23...
24 Call Trace:
25 __schedule+0x2bc/0x895
26 schedule+0x36/0x7c
27 schedule_timeout+0x1f6/0x31f
28 ? __slab_free+0x19c/0x2ba
29 wait_for_completion+0x12b/0x18a
30 ? wake_up_q+0x80/0x73
31 cm_destroy_id+0x345/0x610 [ib_cm]
32 ib_destroy_cm_id+0x10/0x20 [ib_cm]
33 rdma_destroy_id+0xa8/0x300 [rdma_cm]
34 ucma_destroy_id+0x13e/0x190 [rdma_ucm]
35 ucma_write+0xe0/0x160 [rdma_ucm]
36 __vfs_write+0x3a/0x16d
37 vfs_write+0xb2/0x1a1
38 ? syscall_trace_enter+0x1ce/0x2b8
39 SyS_write+0x5c/0xd3
40 do_syscall_64+0x79/0x1b9
41 entry_SYSCALL_64_after_hwframe+0x16d/0x0
42
43Signed-off-by: Manjunath Patil <manjunath.b.patil@oracle.com>
44Link: https://lore.kernel.org/r/20240309063323.458102-1-manjunath.b.patil@oracle.com
45Signed-off-by: Leon Romanovsky <leon@kernel.org>
46Signed-off-by: Sasha Levin <sashal@kernel.org>
47---
48 drivers/infiniband/core/cm.c | 20 +++++++++++++++++++-
49 1 file changed, 19 insertions(+), 1 deletion(-)
50
51diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
52index ff58058aeadca..bf0df6ee4f785 100644
53--- a/drivers/infiniband/core/cm.c
54+++ b/drivers/infiniband/core/cm.c
55@@ -34,6 +34,7 @@ MODULE_AUTHOR("Sean Hefty");
56 MODULE_DESCRIPTION("InfiniBand CM");
57 MODULE_LICENSE("Dual BSD/GPL");
58
59+#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
60 static const char * const ibcm_rej_reason_strs[] = {
61 [IB_CM_REJ_NO_QP] = "no QP",
62 [IB_CM_REJ_NO_EEC] = "no EEC",
63@@ -1025,10 +1026,20 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
64 }
65 }
66
67+static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id)
68+{
69+ struct cm_id_private *cm_id_priv;
70+
71+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
72+ pr_err("%s: cm_id=%p timed out. state=%d refcnt=%d\n", __func__,
73+ cm_id, cm_id->state, refcount_read(&cm_id_priv->refcount));
74+}
75+
76 static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
77 {
78 struct cm_id_private *cm_id_priv;
79 struct cm_work *work;
80+ int ret;
81
82 cm_id_priv = container_of(cm_id, struct cm_id_private, id);
83 spin_lock_irq(&cm_id_priv->lock);
84@@ -1135,7 +1146,14 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
85
86 xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
87 cm_deref_id(cm_id_priv);
88- wait_for_completion(&cm_id_priv->comp);
89+ do {
90+ ret = wait_for_completion_timeout(&cm_id_priv->comp,
91+ msecs_to_jiffies(
92+ CM_DESTROY_ID_WAIT_TIMEOUT));
93+ if (!ret) /* timeout happened */
94+ cm_destroy_id_wait_timeout(cm_id);
95+ } while (!ret);
96+
97 while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
98 cm_free_work(work);
99
100--
1012.43.0
102