git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob - queue-6.8/rdma-cm-add-timeout-to-cm_destroy_id-wait.patch
Linux 6.6.27
[thirdparty/kernel/stable-queue.git] / queue-6.8 / rdma-cm-add-timeout-to-cm_destroy_id-wait.patch
1 From 71adc4b0ca414fddd7a83c47b370a48e1a3e7897 Mon Sep 17 00:00:00 2001
2 From: Sasha Levin <sashal@kernel.org>
3 Date: Fri, 8 Mar 2024 22:33:23 -0800
4 Subject: RDMA/cm: add timeout to cm_destroy_id wait
5
6 From: Manjunath Patil <manjunath.b.patil@oracle.com>
7
8 [ Upstream commit 96d9cbe2f2ff7abde021bac75eafaceabe9a51fa ]
9
10 Add timeout to cm_destroy_id, so that userspace can trigger any data
11 collection that would help in analyzing the cause of delay in destroying
12 the cm_id.
13
14 A new noinline function lets dtrace/eBPF programs hook on to it.
15 Existing functionality is unchanged, except that a new probe-able
16 function is called at every timeout interval.
17
18 We have seen cases where CM messages get stuck in the MAD layer (either due to
19 a software bug or a faulty HCA), leading to the cm_id getting stuck in the
20 following call stack. This patch helps in resolving such issues faster.
21
22 kernel: ... INFO: task XXXX:56778 blocked for more than 120 seconds.
23 ...
24 Call Trace:
25 __schedule+0x2bc/0x895
26 schedule+0x36/0x7c
27 schedule_timeout+0x1f6/0x31f
28 ? __slab_free+0x19c/0x2ba
29 wait_for_completion+0x12b/0x18a
30 ? wake_up_q+0x80/0x73
31 cm_destroy_id+0x345/0x610 [ib_cm]
32 ib_destroy_cm_id+0x10/0x20 [ib_cm]
33 rdma_destroy_id+0xa8/0x300 [rdma_cm]
34 ucma_destroy_id+0x13e/0x190 [rdma_ucm]
35 ucma_write+0xe0/0x160 [rdma_ucm]
36 __vfs_write+0x3a/0x16d
37 vfs_write+0xb2/0x1a1
38 ? syscall_trace_enter+0x1ce/0x2b8
39 SyS_write+0x5c/0xd3
40 do_syscall_64+0x79/0x1b9
41 entry_SYSCALL_64_after_hwframe+0x16d/0x0
42
43 Signed-off-by: Manjunath Patil <manjunath.b.patil@oracle.com>
44 Link: https://lore.kernel.org/r/20240309063323.458102-1-manjunath.b.patil@oracle.com
45 Signed-off-by: Leon Romanovsky <leon@kernel.org>
46 Signed-off-by: Sasha Levin <sashal@kernel.org>
47 ---
48 drivers/infiniband/core/cm.c | 20 +++++++++++++++++++-
49 1 file changed, 19 insertions(+), 1 deletion(-)
50
51 diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
52 index ff58058aeadca..bf0df6ee4f785 100644
53 --- a/drivers/infiniband/core/cm.c
54 +++ b/drivers/infiniband/core/cm.c
55 @@ -34,6 +34,7 @@ MODULE_AUTHOR("Sean Hefty");
56 MODULE_DESCRIPTION("InfiniBand CM");
57 MODULE_LICENSE("Dual BSD/GPL");
58
59 +#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
60 static const char * const ibcm_rej_reason_strs[] = {
61 [IB_CM_REJ_NO_QP] = "no QP",
62 [IB_CM_REJ_NO_EEC] = "no EEC",
63 @@ -1025,10 +1026,20 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
64 }
65 }
66
67 +static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id)
68 +{
69 + struct cm_id_private *cm_id_priv;
70 +
71 + cm_id_priv = container_of(cm_id, struct cm_id_private, id);
72 + pr_err("%s: cm_id=%p timed out. state=%d refcnt=%d\n", __func__,
73 + cm_id, cm_id->state, refcount_read(&cm_id_priv->refcount));
74 +}
75 +
76 static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
77 {
78 struct cm_id_private *cm_id_priv;
79 struct cm_work *work;
80 + int ret;
81
82 cm_id_priv = container_of(cm_id, struct cm_id_private, id);
83 spin_lock_irq(&cm_id_priv->lock);
84 @@ -1135,7 +1146,14 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
85
86 xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
87 cm_deref_id(cm_id_priv);
88 - wait_for_completion(&cm_id_priv->comp);
89 + do {
90 + ret = wait_for_completion_timeout(&cm_id_priv->comp,
91 + msecs_to_jiffies(
92 + CM_DESTROY_ID_WAIT_TIMEOUT));
93 + if (!ret) /* timeout happened */
94 + cm_destroy_id_wait_timeout(cm_id);
95 + } while (!ret);
96 +
97 while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
98 cm_free_work(work);
99
100 --
101 2.43.0
102