]>
Commit | Line | Data |
---|---|---|
04fd09d4 SL |
1 | From a30e1f026b58ba33cc8e0984737396c4a0f321c7 Mon Sep 17 00:00:00 2001 |
2 | From: Chieh-Min Wang <chiehminw@synology.com> | |
3 | Date: Tue, 12 Feb 2019 00:59:55 +0100 | |
4 | Subject: netfilter: conntrack: fix cloned unconfirmed skb->_nfct race in | |
5 | __nf_conntrack_confirm | |
6 | ||
7 | [ Upstream commit 13f5251fd17088170c18844534682d9cab5ff5aa ] | |
8 | ||
9 | For bridge(br_flood) or broadcast/multicast packets, they could clone | |
10 | skb with unconfirmed conntrack which break the rule that unconfirmed | |
11 | skb->_nfct is never shared. With nfqueue running on my system, the race | |
12 | can be easily reproduced with following warning calltrace: | |
13 | ||
14 | [13257.707525] CPU: 0 PID: 12132 Comm: main Tainted: P W 4.4.60 #7744 | |
15 | [13257.707568] Hardware name: Qualcomm (Flattened Device Tree) | |
16 | [13257.714700] [<c021f6dc>] (unwind_backtrace) from [<c021bce8>] (show_stack+0x10/0x14) | |
17 | [13257.720253] [<c021bce8>] (show_stack) from [<c0449e10>] (dump_stack+0x94/0xa8) | |
18 | [13257.728240] [<c0449e10>] (dump_stack) from [<c022a7e0>] (warn_slowpath_common+0x94/0xb0) | |
19 | [13257.735268] [<c022a7e0>] (warn_slowpath_common) from [<c022a898>] (warn_slowpath_null+0x1c/0x24) | |
20 | [13257.743519] [<c022a898>] (warn_slowpath_null) from [<c06ee450>] (__nf_conntrack_confirm+0xa8/0x618) | |
21 | [13257.752284] [<c06ee450>] (__nf_conntrack_confirm) from [<c0772670>] (ipv4_confirm+0xb8/0xfc) | |
22 | [13257.761049] [<c0772670>] (ipv4_confirm) from [<c06e7a60>] (nf_iterate+0x48/0xa8) | |
23 | [13257.769725] [<c06e7a60>] (nf_iterate) from [<c06e7af0>] (nf_hook_slow+0x30/0xb0) | |
24 | [13257.777108] [<c06e7af0>] (nf_hook_slow) from [<c07f20b4>] (br_nf_post_routing+0x274/0x31c) | |
25 | [13257.784486] [<c07f20b4>] (br_nf_post_routing) from [<c06e7a60>] (nf_iterate+0x48/0xa8) | |
26 | [13257.792556] [<c06e7a60>] (nf_iterate) from [<c06e7af0>] (nf_hook_slow+0x30/0xb0) | |
27 | [13257.800458] [<c06e7af0>] (nf_hook_slow) from [<c07e5580>] (br_forward_finish+0x94/0xa4) | |
28 | [13257.808010] [<c07e5580>] (br_forward_finish) from [<c07f22ac>] (br_nf_forward_finish+0x150/0x1ac) | |
29 | [13257.815736] [<c07f22ac>] (br_nf_forward_finish) from [<c06e8df0>] (nf_reinject+0x108/0x170) | |
30 | [13257.824762] [<c06e8df0>] (nf_reinject) from [<c06ea854>] (nfqnl_recv_verdict+0x3d8/0x420) | |
31 | [13257.832924] [<c06ea854>] (nfqnl_recv_verdict) from [<c06e940c>] (nfnetlink_rcv_msg+0x158/0x248) | |
32 | [13257.841256] [<c06e940c>] (nfnetlink_rcv_msg) from [<c06e5564>] (netlink_rcv_skb+0x54/0xb0) | |
33 | [13257.849762] [<c06e5564>] (netlink_rcv_skb) from [<c06e4ec8>] (netlink_unicast+0x148/0x23c) | |
34 | [13257.858093] [<c06e4ec8>] (netlink_unicast) from [<c06e5364>] (netlink_sendmsg+0x2ec/0x368) | |
35 | [13257.866348] [<c06e5364>] (netlink_sendmsg) from [<c069fb8c>] (sock_sendmsg+0x34/0x44) | |
36 | [13257.874590] [<c069fb8c>] (sock_sendmsg) from [<c06a03dc>] (___sys_sendmsg+0x1ec/0x200) | |
37 | [13257.882489] [<c06a03dc>] (___sys_sendmsg) from [<c06a11c8>] (__sys_sendmsg+0x3c/0x64) | |
38 | [13257.890300] [<c06a11c8>] (__sys_sendmsg) from [<c0209b40>] (ret_fast_syscall+0x0/0x34) | |
39 | ||
40 | The original code just triggered the warning but do nothing. It will | |
41 | caused the shared conntrack moves to the dying list and the packet be | |
42 | droppped (nf_ct_resolve_clash returns NF_DROP for dying conntrack). | |
43 | ||
44 | - Reproduce steps: | |
45 | ||
46 | +----------------------------+ | |
47 | | br0(bridge) | | |
48 | | | | |
49 | +-+---------+---------+------+ | |
50 | | eth0| | eth1| | eth2| | |
51 | | | | | | | | |
52 | +--+--+ +--+--+ +---+-+ | |
53 | | | | | |
54 | | | | | |
55 | +--+-+ +-+--+ +--+-+ | |
56 | | PC1| | PC2| | PC3| | |
57 | +----+ +----+ +----+ | |
58 | ||
59 | iptables -A FORWARD -m mark --mark 0x1000000/0x1000000 -j NFQUEUE --queue-num 100 --queue-bypass | |
60 | ||
61 | ps: Our nfq userspace program will set mark on packets whose connection | |
62 | has already been processed. | |
63 | ||
64 | PC1 sends broadcast packets simulated by hping3: | |
65 | ||
66 | hping3 --rand-source --udp 192.168.1.255 -i u100 | |
67 | ||
68 | - Broadcast racing flow chart is as follow: | |
69 | ||
70 | br_handle_frame | |
71 | BR_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, br_handle_frame_finish) | |
72 | // skb->_nfct (unconfirmed conntrack) is constructed at PRE_ROUTING stage | |
73 | br_handle_frame_finish | |
74 | // check if this packet is broadcast | |
75 | br_flood_forward | |
76 | br_flood | |
77 | list_for_each_entry_rcu(p, &br->port_list, list) // iterate through each port | |
78 | maybe_deliver | |
79 | deliver_clone | |
80 | skb = skb_clone(skb) | |
81 | __br_forward | |
82 | BR_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD,...) | |
83 | // queue in our nfq and received by our userspace program | |
84 | // goto __nf_conntrack_confirm with process context on CPU 1 | |
85 | br_pass_frame_up | |
86 | BR_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,...) | |
87 | // goto __nf_conntrack_confirm with softirq context on CPU 0 | |
88 | ||
89 | Because conntrack confirm can happen at both INPUT and POSTROUTING | |
90 | stage. So with NFQUEUE running, skb->_nfct with the same unconfirmed | |
91 | conntrack could race on different core. | |
92 | ||
93 | This patch fixes a repeating kernel splat, now it is only displayed | |
94 | once. | |
95 | ||
96 | Signed-off-by: Chieh-Min Wang <chiehminw@synology.com> | |
97 | Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> | |
98 | Signed-off-by: Sasha Levin <sashal@kernel.org> | |
99 | --- | |
100 | net/netfilter/nf_conntrack_core.c | 14 +++++++++++--- | |
101 | 1 file changed, 11 insertions(+), 3 deletions(-) | |
102 | ||
103 | diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c | |
104 | index f07357ba9629..06520bf30f29 100644 | |
105 | --- a/net/netfilter/nf_conntrack_core.c | |
106 | +++ b/net/netfilter/nf_conntrack_core.c | |
107 | @@ -763,10 +763,18 @@ __nf_conntrack_confirm(struct sk_buff *skb) | |
108 | * REJECT will give spurious warnings here. | |
109 | */ | |
110 | ||
111 | - /* No external references means no one else could have | |
112 | - * confirmed us. | |
113 | + /* Another skb with the same unconfirmed conntrack may | |
114 | + * win the race. This may happen for bridge(br_flood) | |
115 | + * or broadcast/multicast packets do skb_clone with | |
116 | + * unconfirmed conntrack. | |
117 | */ | |
118 | - WARN_ON(nf_ct_is_confirmed(ct)); | |
119 | + if (unlikely(nf_ct_is_confirmed(ct))) { | |
120 | + WARN_ON_ONCE(1); | |
121 | + nf_conntrack_double_unlock(hash, reply_hash); | |
122 | + local_bh_enable(); | |
123 | + return NF_DROP; | |
124 | + } | |
125 | + | |
126 | pr_debug("Confirming conntrack %p\n", ct); | |
127 | /* We have to check the DYING flag after unlink to prevent | |
128 | * a race against nf_ct_get_next_corpse() possibly called from | |
129 | -- | |
130 | 2.19.1 | |
131 |