1 From 31116e2cc4380003461e8e9f6703e157b7bc79d4 Mon Sep 17 00:00:00 2001
2 From: Sasha Levin <sashal@kernel.org>
3 Date: Thu, 29 Oct 2020 03:56:06 +0100
4 Subject: netfilter: use actual socket sk rather than skb sk when routing
7 From: Jason A. Donenfeld <Jason@zx2c4.com>
9 [ Upstream commit 46d6c5ae953cc0be38efd0e469284df7c4328cf8 ]
11 If netfilter changes the packet mark when mangling, the packet is
12 rerouted using the route_me_harder set of functions. Prior to this
13 commit, there's one big difference between route_me_harder and the
14 ordinary initial routing functions, described in the comment above __ip_queue_xmit():
17 /* Note: skb->sk can be different from sk, in case of tunnels */
18 int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
20 That function goes on to correctly make use of sk->sk_bound_dev_if,
21 rather than skb->sk->sk_bound_dev_if. And indeed the comment is true: a
22 tunnel will receive a packet in ndo_start_xmit with an initial skb->sk.
23 It will make some transformations to that packet, and then it will send
24 the encapsulated packet out of a *new* socket. That new socket will
25 basically always have a different sk_bound_dev_if (otherwise there'd be
26 a routing loop). So for the purposes of routing the encapsulated packet,
27 the routing information as it pertains to the socket should come from
28 that socket's sk, rather than the packet's original skb->sk. For that
29 reason __ip_queue_xmit() and related functions all do the right thing.
31 One might argue that all tunnels should just call skb_orphan(skb) before
32 transmitting the encapsulated packet into the new socket. But tunnels do
33 *not* do this -- and this is wisely avoided in skb_scrub_packet() too --
34 because features like TSQ rely on skb->destructor() being called when
35 that buffer space is truly available again. Calling skb_orphan(skb) too
36 early would result in buffers filling up unnecessarily and accounting
37 info being all wrong. Instead, additional routing must take into account
38 the new sk, just as __ip_queue_xmit() notes.
40 So, this commit addresses the problem by fishing the correct sk out of
41 state->sk -- it's already set properly in the call to nf_hook() in
42 __ip_local_out(), which receives the sk as part of its normal
43 functionality. So we make sure to plumb state->sk through the various
44 route_me_harder functions, and then make correct use of it following the
45 example of __ip_queue_xmit().
47 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
48 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
49 Reviewed-by: Florian Westphal <fw@strlen.de>
50 Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
51 Signed-off-by: Sasha Levin <sashal@kernel.org>
53 include/linux/netfilter_ipv4.h | 2 +-
54 include/linux/netfilter_ipv6.h | 10 +++++-----
55 net/ipv4/netfilter.c | 8 +++++---
56 net/ipv4/netfilter/iptable_mangle.c | 2 +-
57 net/ipv4/netfilter/nf_reject_ipv4.c | 2 +-
58 net/ipv6/netfilter.c | 6 +++---
59 net/ipv6/netfilter/ip6table_mangle.c | 2 +-
60 net/netfilter/ipvs/ip_vs_core.c | 4 ++--
61 net/netfilter/nf_nat_proto.c | 4 ++--
62 net/netfilter/nf_synproxy_core.c | 2 +-
63 net/netfilter/nft_chain_route.c | 4 ++--
64 net/netfilter/utils.c | 4 ++--
65 12 files changed, 26 insertions(+), 24 deletions(-)
67 diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
68 index 082e2c41b7ff9..5b70ca868bb19 100644
69 --- a/include/linux/netfilter_ipv4.h
70 +++ b/include/linux/netfilter_ipv4.h
71 @@ -16,7 +16,7 @@ struct ip_rt_info {
75 -int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type);
76 +int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned addr_type);
78 struct nf_queue_entry;
80 diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
81 index 9b67394471e1c..48314ade1506f 100644
82 --- a/include/linux/netfilter_ipv6.h
83 +++ b/include/linux/netfilter_ipv6.h
84 @@ -42,7 +42,7 @@ struct nf_ipv6_ops {
85 #if IS_MODULE(CONFIG_IPV6)
86 int (*chk_addr)(struct net *net, const struct in6_addr *addr,
87 const struct net_device *dev, int strict);
88 - int (*route_me_harder)(struct net *net, struct sk_buff *skb);
89 + int (*route_me_harder)(struct net *net, struct sock *sk, struct sk_buff *skb);
90 int (*dev_get_saddr)(struct net *net, const struct net_device *dev,
91 const struct in6_addr *daddr, unsigned int srcprefs,
92 struct in6_addr *saddr);
93 @@ -143,9 +143,9 @@ static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk,
97 -int ip6_route_me_harder(struct net *net, struct sk_buff *skb);
98 +int ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb);
100 -static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
101 +static inline int nf_ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb)
103 #if IS_MODULE(CONFIG_IPV6)
104 const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
105 @@ -153,9 +153,9 @@ static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
107 return -EHOSTUNREACH;
109 - return v6_ops->route_me_harder(net, skb);
110 + return v6_ops->route_me_harder(net, sk, skb);
111 #elif IS_BUILTIN(CONFIG_IPV6)
112 - return ip6_route_me_harder(net, skb);
113 + return ip6_route_me_harder(net, sk, skb);
115 return -EHOSTUNREACH;
117 diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
118 index a058213b77a78..7c841037c5334 100644
119 --- a/net/ipv4/netfilter.c
120 +++ b/net/ipv4/netfilter.c
122 #include <net/netfilter/nf_queue.h>
124 /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
125 -int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type)
126 +int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type)
128 const struct iphdr *iph = ip_hdr(skb);
130 struct flowi4 fl4 = {};
131 __be32 saddr = iph->saddr;
132 - const struct sock *sk = skb_to_full_sk(skb);
133 - __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
135 struct net_device *dev = skb_dst(skb)->dev;
138 + sk = sk_to_full_sk(sk);
139 + flags = sk ? inet_sk_flowi_flags(sk) : 0;
141 if (addr_type == RTN_UNSPEC)
142 addr_type = inet_addr_type_dev_table(net, dev, saddr);
143 if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
144 diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
145 index bb9266ea37858..ae45bcdd335ea 100644
146 --- a/net/ipv4/netfilter/iptable_mangle.c
147 +++ b/net/ipv4/netfilter/iptable_mangle.c
148 @@ -62,7 +62,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
149 iph->daddr != daddr ||
152 - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
153 + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
155 ret = NF_DROP_ERR(err);
157 diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
158 index 2361fdac2c438..57817313a85c1 100644
159 --- a/net/ipv4/netfilter/nf_reject_ipv4.c
160 +++ b/net/ipv4/netfilter/nf_reject_ipv4.c
161 @@ -127,7 +127,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
162 ip4_dst_hoplimit(skb_dst(nskb)));
163 nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
165 - if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
166 + if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
170 diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
171 index 6d0e942d082d4..ab9a279dd6d47 100644
172 --- a/net/ipv6/netfilter.c
173 +++ b/net/ipv6/netfilter.c
175 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
176 #include "../bridge/br_private.h"
178 -int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
179 +int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb)
181 const struct ipv6hdr *iph = ipv6_hdr(skb);
182 - struct sock *sk = sk_to_full_sk(skb->sk);
183 + struct sock *sk = sk_to_full_sk(sk_partial);
185 struct dst_entry *dst;
186 int strict = (ipv6_addr_type(&iph->daddr) &
187 @@ -84,7 +84,7 @@ static int nf_ip6_reroute(struct sk_buff *skb,
188 if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
189 !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
190 skb->mark != rt_info->mark)
191 - return ip6_route_me_harder(entry->state.net, skb);
192 + return ip6_route_me_harder(entry->state.net, entry->state.sk, skb);
196 diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
197 index 070afb97fa2ba..401e8dcb2c84b 100644
198 --- a/net/ipv6/netfilter/ip6table_mangle.c
199 +++ b/net/ipv6/netfilter/ip6table_mangle.c
200 @@ -57,7 +57,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
202 ipv6_hdr(skb)->hop_limit != hop_limit ||
203 flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
204 - err = ip6_route_me_harder(state->net, skb);
205 + err = ip6_route_me_harder(state->net, state->sk, skb);
207 ret = NF_DROP_ERR(err);
209 diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
210 index 64a05906cc0e6..89aa1fc334b19 100644
211 --- a/net/netfilter/ipvs/ip_vs_core.c
212 +++ b/net/netfilter/ipvs/ip_vs_core.c
213 @@ -748,12 +748,12 @@ static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
214 struct dst_entry *dst = skb_dst(skb);
216 if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
217 - ip6_route_me_harder(ipvs->net, skb) != 0)
218 + ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0)
222 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
223 - ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
224 + ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0)
228 diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
229 index 59151dc07fdc1..e87b6bd6b3cdb 100644
230 --- a/net/netfilter/nf_nat_proto.c
231 +++ b/net/netfilter/nf_nat_proto.c
232 @@ -715,7 +715,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
234 if (ct->tuplehash[dir].tuple.dst.u3.ip !=
235 ct->tuplehash[!dir].tuple.src.u3.ip) {
236 - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
237 + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
239 ret = NF_DROP_ERR(err);
241 @@ -953,7 +953,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
243 if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
244 &ct->tuplehash[!dir].tuple.src.u3)) {
245 - err = nf_ip6_route_me_harder(state->net, skb);
246 + err = nf_ip6_route_me_harder(state->net, state->sk, skb);
248 ret = NF_DROP_ERR(err);
250 diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
251 index b9cbe1e2453e8..4bb4cfde28b47 100644
252 --- a/net/netfilter/nf_synproxy_core.c
253 +++ b/net/netfilter/nf_synproxy_core.c
254 @@ -446,7 +446,7 @@ synproxy_send_tcp(struct net *net,
256 skb_dst_set_noref(nskb, skb_dst(skb));
257 nskb->protocol = htons(ETH_P_IP);
258 - if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
259 + if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
263 diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c
264 index 8826bbe71136c..edd02cda57fca 100644
265 --- a/net/netfilter/nft_chain_route.c
266 +++ b/net/netfilter/nft_chain_route.c
267 @@ -42,7 +42,7 @@ static unsigned int nf_route_table_hook4(void *priv,
268 iph->daddr != daddr ||
271 - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
272 + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
274 ret = NF_DROP_ERR(err);
276 @@ -92,7 +92,7 @@ static unsigned int nf_route_table_hook6(void *priv,
278 ipv6_hdr(skb)->hop_limit != hop_limit ||
279 flowlabel != *((u32 *)ipv6_hdr(skb)))) {
280 - err = nf_ip6_route_me_harder(state->net, skb);
281 + err = nf_ip6_route_me_harder(state->net, state->sk, skb);
283 ret = NF_DROP_ERR(err);
285 diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
286 index 51b454d8fa9c9..924195861faf7 100644
287 --- a/net/netfilter/utils.c
288 +++ b/net/netfilter/utils.c
289 @@ -191,8 +191,8 @@ static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry
290 skb->mark == rt_info->mark &&
291 iph->daddr == rt_info->daddr &&
292 iph->saddr == rt_info->saddr))
293 - return ip_route_me_harder(entry->state.net, skb,
295 + return ip_route_me_harder(entry->state.net, entry->state.sk,