git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob - releases/6.6.26/mlxbf_gige-call-request_irq-after-napi-initialized.patch
Linux 6.6.26
[thirdparty/kernel/stable-queue.git] / releases / 6.6.26 / mlxbf_gige-call-request_irq-after-napi-initialized.patch
1 From d25bd16202368259acc7720a4756c41d2cff1e80 Mon Sep 17 00:00:00 2001
2 From: Sasha Levin <sashal@kernel.org>
3 Date: Mon, 25 Mar 2024 14:36:27 -0400
4 Subject: mlxbf_gige: call request_irq() after NAPI initialized
5
6 From: David Thompson <davthompson@nvidia.com>
7
8 [ Upstream commit f7442a634ac06b953fc1f7418f307b25acd4cfbc ]
9
10 The mlxbf_gige driver encounters a NULL pointer exception in
11 mlxbf_gige_open() when kdump is enabled. The sequence to reproduce
12 the exception is as follows:
13 a) enable kdump
14 b) trigger kdump via "echo c > /proc/sysrq-trigger"
15 c) kdump kernel executes
16 d) kdump kernel loads mlxbf_gige module
17 e) the mlxbf_gige module runs its open() as the
18 "oob_net0" interface is brought up
19 f) mlxbf_gige module will experience an exception
20 during its open(), something like:
21
22 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
23 Mem abort info:
24 ESR = 0x0000000086000004
25 EC = 0x21: IABT (current EL), IL = 32 bits
26 SET = 0, FnV = 0
27 EA = 0, S1PTW = 0
28 FSC = 0x04: level 0 translation fault
29 user pgtable: 4k pages, 48-bit VAs, pgdp=00000000e29a4000
30 [0000000000000000] pgd=0000000000000000, p4d=0000000000000000
31 Internal error: Oops: 0000000086000004 [#1] SMP
32 CPU: 0 PID: 812 Comm: NetworkManager Tainted: G OE 5.15.0-1035-bluefield #37-Ubuntu
33 Hardware name: https://www.mellanox.com BlueField-3 SmartNIC Main Card/BlueField-3 SmartNIC Main Card, BIOS 4.6.0.13024 Jan 19 2024
34 pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
35 pc : 0x0
36 lr : __napi_poll+0x40/0x230
37 sp : ffff800008003e00
38 x29: ffff800008003e00 x28: 0000000000000000 x27: 00000000ffffffff
39 x26: ffff000066027238 x25: ffff00007cedec00 x24: ffff800008003ec8
40 x23: 000000000000012c x22: ffff800008003eb7 x21: 0000000000000000
41 x20: 0000000000000001 x19: ffff000066027238 x18: 0000000000000000
42 x17: ffff578fcb450000 x16: ffffa870b083c7c0 x15: 0000aaab010441d0
43 x14: 0000000000000001 x13: 00726f7272655f65 x12: 6769675f6662786c
44 x11: 0000000000000000 x10: 0000000000000000 x9 : ffffa870b0842398
45 x8 : 0000000000000004 x7 : fe5a48b9069706ea x6 : 17fdb11fc84ae0d2
46 x5 : d94a82549d594f35 x4 : 0000000000000000 x3 : 0000000000400100
47 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff000066027238
48 Call trace:
49 0x0
50 net_rx_action+0x178/0x360
51 __do_softirq+0x15c/0x428
52 __irq_exit_rcu+0xac/0xec
53 irq_exit+0x18/0x2c
54 handle_domain_irq+0x6c/0xa0
55 gic_handle_irq+0xec/0x1b0
56 call_on_irq_stack+0x20/0x2c
57 do_interrupt_handler+0x5c/0x70
58 el1_interrupt+0x30/0x50
59 el1h_64_irq_handler+0x18/0x2c
60 el1h_64_irq+0x7c/0x80
61 __setup_irq+0x4c0/0x950
62 request_threaded_irq+0xf4/0x1bc
63 mlxbf_gige_request_irqs+0x68/0x110 [mlxbf_gige]
64 mlxbf_gige_open+0x5c/0x170 [mlxbf_gige]
65 __dev_open+0x100/0x220
66 __dev_change_flags+0x16c/0x1f0
67 dev_change_flags+0x2c/0x70
68 do_setlink+0x220/0xa40
69 __rtnl_newlink+0x56c/0x8a0
70 rtnl_newlink+0x58/0x84
71 rtnetlink_rcv_msg+0x138/0x3c4
72 netlink_rcv_skb+0x64/0x130
73 rtnetlink_rcv+0x20/0x30
74 netlink_unicast+0x2ec/0x360
75 netlink_sendmsg+0x278/0x490
76 __sock_sendmsg+0x5c/0x6c
77 ____sys_sendmsg+0x290/0x2d4
78 ___sys_sendmsg+0x84/0xd0
79 __sys_sendmsg+0x70/0xd0
80 __arm64_sys_sendmsg+0x2c/0x40
81 invoke_syscall+0x78/0x100
82 el0_svc_common.constprop.0+0x54/0x184
83 do_el0_svc+0x30/0xac
84 el0_svc+0x48/0x160
85 el0t_64_sync_handler+0xa4/0x12c
86 el0t_64_sync+0x1a4/0x1a8
87 Code: bad PC value
88 ---[ end trace 7d1c3f3bf9d81885 ]---
89 Kernel panic - not syncing: Oops: Fatal exception in interrupt
90 Kernel Offset: 0x2870a7a00000 from 0xffff800008000000
91 PHYS_OFFSET: 0x80000000
92 CPU features: 0x0,000005c1,a3332a5a
93 Memory Limit: none
94 ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]---
95
96 The exception happens because there is a pending RX interrupt before the
97 call to request_irq(RX IRQ) executes. Then, the RX IRQ handler fires
98 immediately after this request_irq() completes. The RX IRQ handler runs
99 "napi_schedule()" before NAPI is fully initialized via "netif_napi_add()"
100 and "napi_enable()", both of which happen later in the open() logic.
101
102 The logic in mlxbf_gige_open() must fully initialize NAPI before any calls
103 to request_irq() execute.
104
105 Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver")
106 Signed-off-by: David Thompson <davthompson@nvidia.com>
107 Reviewed-by: Asmaa Mnebhi <asmaa@nvidia.com>
108 Link: https://lore.kernel.org/r/20240325183627.7641-1-davthompson@nvidia.com
109 Signed-off-by: Jakub Kicinski <kuba@kernel.org>
110 Signed-off-by: Sasha Levin <sashal@kernel.org>
111 ---
112 .../mellanox/mlxbf_gige/mlxbf_gige_main.c | 18 +++++++++++-------
113 1 file changed, 11 insertions(+), 7 deletions(-)
114
115 diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
116 index 044ff5f87b5e8..f1fa5f10051f2 100644
117 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
118 +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
119 @@ -139,13 +139,10 @@ static int mlxbf_gige_open(struct net_device *netdev)
120 control |= MLXBF_GIGE_CONTROL_PORT_EN;
121 writeq(control, priv->base + MLXBF_GIGE_CONTROL);
122
123 - err = mlxbf_gige_request_irqs(priv);
124 - if (err)
125 - return err;
126 mlxbf_gige_cache_stats(priv);
127 err = mlxbf_gige_clean_port(priv);
128 if (err)
129 - goto free_irqs;
130 + return err;
131
132 /* Clear driver's valid_polarity to match hardware,
133 * since the above call to clean_port() resets the
134 @@ -166,6 +163,10 @@ static int mlxbf_gige_open(struct net_device *netdev)
135 napi_enable(&priv->napi);
136 netif_start_queue(netdev);
137
138 + err = mlxbf_gige_request_irqs(priv);
139 + if (err)
140 + goto napi_deinit;
141 +
142 /* Set bits in INT_EN that we care about */
143 int_en = MLXBF_GIGE_INT_EN_HW_ACCESS_ERROR |
144 MLXBF_GIGE_INT_EN_TX_CHECKSUM_INPUTS |
145 @@ -182,14 +183,17 @@ static int mlxbf_gige_open(struct net_device *netdev)
146
147 return 0;
148
149 +napi_deinit:
150 + netif_stop_queue(netdev);
151 + napi_disable(&priv->napi);
152 + netif_napi_del(&priv->napi);
153 + mlxbf_gige_rx_deinit(priv);
154 +
155 tx_deinit:
156 mlxbf_gige_tx_deinit(priv);
157
158 phy_deinit:
159 phy_stop(phydev);
160 -
161 -free_irqs:
162 - mlxbf_gige_free_irqs(priv);
163 return err;
164 }
165
166 --
167 2.43.0
168