]>
Commit | Line | Data |
---|---|---|
2874c5fd | 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
1da177e4 LT |
2 | /* |
3 | * ip_vs_xmit.c: various packet transmitters for IPVS | |
4 | * | |
1da177e4 LT |
5 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> |
6 | * Julian Anastasov <ja@ssi.bg> | |
7 | * | |
1da177e4 LT |
8 | * Changes: |
9 | * | |
cb59155f JA |
10 | * Description of forwarding methods: |
11 | * - all transmitters are called from LOCAL_IN (remote clients) and | |
12 | * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD | |
13 | * - not all connections have destination server, for example, | |
14 | * connections in backup server when fwmark is used | |
15 | * - bypass connections use daddr from packet | |
026ace06 JA |
16 | * - we can use dst without ref while sending in RCU section, we use |
17 | * ref when returning NF_ACCEPT for NAT-ed packet via loopback | |
cb59155f JA |
18 | * LOCAL_OUT rules: |
19 | * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) | |
20 | * - skb->pkt_type is not set yet | |
21 | * - the only place where we can see skb->sk != NULL | |
1da177e4 LT |
22 | */ |
23 | ||
9aada7ac HE |
24 | #define KMSG_COMPONENT "IPVS" |
25 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt | |
26 | ||
1da177e4 | 27 | #include <linux/kernel.h> |
5a0e3ad6 | 28 | #include <linux/slab.h> |
1da177e4 | 29 | #include <linux/tcp.h> /* for tcphdr */ |
c439cb2e | 30 | #include <net/ip.h> |
84c0d5e9 | 31 | #include <net/gue.h> |
6f7b841b | 32 | #include <net/gre.h> |
1da177e4 LT |
33 | #include <net/tcp.h> /* for csum_tcpudp_magic */ |
34 | #include <net/udp.h> | |
35 | #include <net/icmp.h> /* for icmp_send */ | |
36 | #include <net/route.h> /* for ip_route_output */ | |
38cdcc9a JV |
37 | #include <net/ipv6.h> |
38 | #include <net/ip6_route.h> | |
ea1d5d77 | 39 | #include <net/ip_tunnels.h> |
29930e31 | 40 | #include <net/ip6_checksum.h> |
714f095f | 41 | #include <net/addrconf.h> |
38cdcc9a | 42 | #include <linux/icmpv6.h> |
1da177e4 LT |
43 | #include <linux/netfilter.h> |
44 | #include <linux/netfilter_ipv4.h> | |
45 | ||
46 | #include <net/ip_vs.h> | |
47 | ||
/* Route lookup policy bits; OR-ed together to form the rt_mode argument */
enum {
	IP_VS_RT_MODE_LOCAL	= 1 << 0,	/* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 1 << 1,	/* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 1 << 2,	/* Allow redirect from remote
						 * daddr to local
						 */
	IP_VS_RT_MODE_CONNECT	= 1 << 3,	/* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 1 << 4,	/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 1 << 5,	/* Tunnel mode */
};
1da177e4 | 58 | |
026ace06 JA |
59 | static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) |
60 | { | |
61 | return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); | |
62 | } | |
63 | ||
/* Release a route cache entry obtained from ip_vs_dest_dst_alloc().
 * Only the entry itself is freed; the dst it may point to is not touched.
 */
static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}
68 | ||
1da177e4 LT |
69 | /* |
70 | * Destination cache to speed up outgoing route lookup | |
71 | */ | |
72 | static inline void | |
026ace06 JA |
73 | __ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, |
74 | struct dst_entry *dst, u32 dst_cookie) | |
1da177e4 | 75 | { |
026ace06 JA |
76 | struct ip_vs_dest_dst *old; |
77 | ||
78 | old = rcu_dereference_protected(dest->dest_dst, | |
79 | lockdep_is_held(&dest->dst_lock)); | |
80 | ||
81 | if (dest_dst) { | |
82 | dest_dst->dst_cache = dst; | |
83 | dest_dst->dst_cookie = dst_cookie; | |
84 | } | |
85 | rcu_assign_pointer(dest->dest_dst, dest_dst); | |
1da177e4 | 86 | |
026ace06 JA |
87 | if (old) |
88 | call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); | |
1da177e4 LT |
89 | } |
90 | ||
026ace06 | 91 | static inline struct ip_vs_dest_dst * |
c90558da | 92 | __ip_vs_dst_check(struct ip_vs_dest *dest) |
1da177e4 | 93 | { |
026ace06 JA |
94 | struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); |
95 | struct dst_entry *dst; | |
1da177e4 | 96 | |
026ace06 | 97 | if (!dest_dst) |
1da177e4 | 98 | return NULL; |
026ace06 JA |
99 | dst = dest_dst->dst_cache; |
100 | if (dst->obsolete && | |
101 | dst->ops->check(dst, dest_dst->dst_cookie) == NULL) | |
1da177e4 | 102 | return NULL; |
026ace06 | 103 | return dest_dst; |
1da177e4 LT |
104 | } |
105 | ||
590e3f79 JDB |
106 | static inline bool |
107 | __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) | |
108 | { | |
4cdd3408 PM |
109 | if (IP6CB(skb)->frag_max_size) { |
110 | /* frag_max_size tell us that, this packet have been | |
111 | * defragmented by netfilter IPv6 conntrack module. | |
112 | */ | |
113 | if (IP6CB(skb)->frag_max_size > mtu) | |
114 | return true; /* largest fragment violate MTU */ | |
115 | } | |
116 | else if (skb->len > mtu && !skb_is_gso(skb)) { | |
590e3f79 JDB |
117 | return true; /* Packet size violate MTU size */ |
118 | } | |
119 | return false; | |
120 | } | |
121 | ||
f2edb9f7 JA |
122 | /* Get route to daddr, update *saddr, optionally bind route to saddr */ |
123 | static struct rtable *do_output_route4(struct net *net, __be32 daddr, | |
c90558da | 124 | int rt_mode, __be32 *saddr) |
f2edb9f7 JA |
125 | { |
126 | struct flowi4 fl4; | |
127 | struct rtable *rt; | |
f25a9b85 | 128 | bool loop = false; |
f2edb9f7 JA |
129 | |
130 | memset(&fl4, 0, sizeof(fl4)); | |
131 | fl4.daddr = daddr; | |
ad4d3ef8 JA |
132 | fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? |
133 | FLOWI_FLAG_KNOWN_NH : 0; | |
f2edb9f7 JA |
134 | |
135 | retry: | |
136 | rt = ip_route_output_key(net, &fl4); | |
137 | if (IS_ERR(rt)) { | |
138 | /* Invalid saddr ? */ | |
139 | if (PTR_ERR(rt) == -EINVAL && *saddr && | |
140 | rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { | |
141 | *saddr = 0; | |
3f06760c | 142 | flowi4_update_output(&fl4, 0, daddr, 0); |
f2edb9f7 JA |
143 | goto retry; |
144 | } | |
145 | IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); | |
146 | return NULL; | |
147 | } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { | |
148 | ip_rt_put(rt); | |
149 | *saddr = fl4.saddr; | |
3f06760c | 150 | flowi4_update_output(&fl4, 0, daddr, fl4.saddr); |
f25a9b85 | 151 | loop = true; |
f2edb9f7 JA |
152 | goto retry; |
153 | } | |
154 | *saddr = fl4.saddr; | |
155 | return rt; | |
156 | } | |
157 | ||
4a4739d5 AG |
158 | #ifdef CONFIG_IP_VS_IPV6 |
159 | static inline int __ip_vs_is_local_route6(struct rt6_info *rt) | |
160 | { | |
161 | return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; | |
162 | } | |
163 | #endif | |
164 | ||
165 | static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, | |
166 | int rt_mode, | |
167 | bool new_rt_is_local) | |
168 | { | |
169 | bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); | |
6fcc02e3 | 170 | bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL); |
4a4739d5 AG |
171 | bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); |
172 | bool source_is_loopback; | |
173 | bool old_rt_is_local; | |
174 | ||
175 | #ifdef CONFIG_IP_VS_IPV6 | |
176 | if (skb_af == AF_INET6) { | |
177 | int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); | |
178 | ||
179 | source_is_loopback = | |
180 | (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && | |
181 | (addr_type & IPV6_ADDR_LOOPBACK); | |
182 | old_rt_is_local = __ip_vs_is_local_route6( | |
e8dfd42c | 183 | dst_rt6_info(skb_dst(skb))); |
4a4739d5 AG |
184 | } else |
185 | #endif | |
186 | { | |
187 | source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr); | |
188 | old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; | |
189 | } | |
190 | ||
191 | if (unlikely(new_rt_is_local)) { | |
192 | if (!rt_mode_allow_local) | |
193 | return true; | |
194 | if (!rt_mode_allow_redirect && !old_rt_is_local) | |
195 | return true; | |
196 | } else { | |
197 | if (!rt_mode_allow_non_local) | |
198 | return true; | |
199 | if (source_is_loopback) | |
200 | return true; | |
201 | } | |
202 | return false; | |
203 | } | |
204 | ||
919aa0b2 AG |
205 | static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) |
206 | { | |
207 | struct sock *sk = skb->sk; | |
208 | struct rtable *ort = skb_rtable(skb); | |
209 | ||
a8399231 | 210 | if (!skb->dev && sk && sk_fullsock(sk)) |
bd085ef6 | 211 | ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); |
919aa0b2 AG |
212 | } |
213 | ||
20868a40 EB |
214 | static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, |
215 | int rt_mode, | |
c63e4de2 AG |
216 | struct ip_vs_iphdr *ipvsh, |
217 | struct sk_buff *skb, int mtu) | |
218 | { | |
219 | #ifdef CONFIG_IP_VS_IPV6 | |
220 | if (skb_af == AF_INET6) { | |
20868a40 | 221 | struct net *net = ipvs->net; |
c63e4de2 AG |
222 | |
223 | if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { | |
224 | if (!skb->dev) | |
225 | skb->dev = net->loopback_dev; | |
226 | /* only send ICMP too big on first fragment */ | |
89621f31 | 227 | if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh)) |
c63e4de2 AG |
228 | icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); |
229 | IP_VS_DBG(1, "frag needed for %pI6c\n", | |
230 | &ipv6_hdr(skb)->saddr); | |
231 | return false; | |
232 | } | |
233 | } else | |
234 | #endif | |
235 | { | |
c63e4de2 AG |
236 | /* If we're going to tunnel the packet and pmtu discovery |
237 | * is disabled, we'll just fragment it anyway | |
238 | */ | |
239 | if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs)) | |
240 | return true; | |
241 | ||
242 | if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && | |
89621f31 AG |
243 | skb->len > mtu && !skb_is_gso(skb) && |
244 | !ip_vs_iph_icmp(ipvsh))) { | |
c63e4de2 AG |
245 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, |
246 | htonl(mtu)); | |
247 | IP_VS_DBG(1, "frag needed for %pI4\n", | |
248 | &ip_hdr(skb)->saddr); | |
249 | return false; | |
250 | } | |
251 | } | |
252 | ||
253 | return true; | |
254 | } | |
255 | ||
8d8e20e2 DB |
256 | static inline bool decrement_ttl(struct netns_ipvs *ipvs, |
257 | int skb_af, | |
258 | struct sk_buff *skb) | |
259 | { | |
260 | struct net *net = ipvs->net; | |
261 | ||
262 | #ifdef CONFIG_IP_VS_IPV6 | |
263 | if (skb_af == AF_INET6) { | |
264 | struct dst_entry *dst = skb_dst(skb); | |
265 | ||
266 | /* check and decrement ttl */ | |
267 | if (ipv6_hdr(skb)->hop_limit <= 1) { | |
bdb7cc64 SS |
268 | struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); |
269 | ||
8d8e20e2 DB |
270 | /* Force OUTPUT device used as source address */ |
271 | skb->dev = dst->dev; | |
272 | icmpv6_send(skb, ICMPV6_TIME_EXCEED, | |
273 | ICMPV6_EXC_HOPLIMIT, 0); | |
d6938c1c | 274 | IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); |
8d8e20e2 DB |
275 | |
276 | return false; | |
277 | } | |
278 | ||
279 | /* don't propagate ttl change to cloned packets */ | |
ec0974df | 280 | if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) |
8d8e20e2 DB |
281 | return false; |
282 | ||
283 | ipv6_hdr(skb)->hop_limit--; | |
284 | } else | |
285 | #endif | |
286 | { | |
287 | if (ip_hdr(skb)->ttl <= 1) { | |
288 | /* Tell the sender its packet died... */ | |
d6938c1c | 289 | IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); |
8d8e20e2 DB |
290 | icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); |
291 | return false; | |
292 | } | |
293 | ||
294 | /* don't propagate ttl change to cloned packets */ | |
ec0974df | 295 | if (skb_ensure_writable(skb, sizeof(struct iphdr))) |
8d8e20e2 DB |
296 | return false; |
297 | ||
298 | /* Decrease ttl */ | |
299 | ip_decrease_ttl(ip_hdr(skb)); | |
300 | } | |
301 | ||
302 | return true; | |
303 | } | |
304 | ||
17a8f8e3 | 305 | /* Get route to destination or remote server */ |
4115ded1 | 306 | static int |
ecfe87b8 EB |
307 | __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, |
308 | struct ip_vs_dest *dest, | |
c63e4de2 AG |
309 | __be32 daddr, int rt_mode, __be32 *ret_saddr, |
310 | struct ip_vs_iphdr *ipvsh) | |
1da177e4 | 311 | { |
ecfe87b8 | 312 | struct net *net = ipvs->net; |
026ace06 | 313 | struct ip_vs_dest_dst *dest_dst; |
1da177e4 | 314 | struct rtable *rt; /* Route to the other host */ |
4115ded1 | 315 | int mtu; |
026ace06 | 316 | int local, noref = 1; |
1da177e4 LT |
317 | |
318 | if (dest) { | |
026ace06 JA |
319 | dest_dst = __ip_vs_dst_check(dest); |
320 | if (likely(dest_dst)) | |
05d6d492 | 321 | rt = dst_rtable(dest_dst->dst_cache); |
026ace06 JA |
322 | else { |
323 | dest_dst = ip_vs_dest_dst_alloc(); | |
ac69269a | 324 | spin_lock_bh(&dest->dst_lock); |
026ace06 JA |
325 | if (!dest_dst) { |
326 | __ip_vs_dst_set(dest, NULL, NULL, 0); | |
ac69269a | 327 | spin_unlock_bh(&dest->dst_lock); |
026ace06 JA |
328 | goto err_unreach; |
329 | } | |
c90558da | 330 | rt = do_output_route4(net, dest->addr.ip, rt_mode, |
026ace06 | 331 | &dest_dst->dst_saddr.ip); |
f2edb9f7 | 332 | if (!rt) { |
026ace06 | 333 | __ip_vs_dst_set(dest, NULL, NULL, 0); |
ac69269a | 334 | spin_unlock_bh(&dest->dst_lock); |
026ace06 | 335 | ip_vs_dest_dst_free(dest_dst); |
4115ded1 | 336 | goto err_unreach; |
1da177e4 | 337 | } |
026ace06 | 338 | __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); |
ac69269a | 339 | spin_unlock_bh(&dest->dst_lock); |
c90558da | 340 | IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", |
026ace06 | 341 | &dest->addr.ip, &dest_dst->dst_saddr.ip, |
bc9d3a9f | 342 | rcuref_read(&rt->dst.__rcuref)); |
1da177e4 | 343 | } |
c92f5ca2 | 344 | if (ret_saddr) |
026ace06 | 345 | *ret_saddr = dest_dst->dst_saddr.ip; |
1da177e4 | 346 | } else { |
f2edb9f7 | 347 | __be32 saddr = htonl(INADDR_ANY); |
c92f5ca2 | 348 | |
026ace06 JA |
349 | noref = 0; |
350 | ||
f2edb9f7 JA |
351 | /* For such unconfigured boxes avoid many route lookups |
352 | * for performance reasons because we do not remember saddr | |
353 | */ | |
354 | rt_mode &= ~IP_VS_RT_MODE_CONNECT; | |
c90558da | 355 | rt = do_output_route4(net, daddr, rt_mode, &saddr); |
f2edb9f7 | 356 | if (!rt) |
4115ded1 | 357 | goto err_unreach; |
c92f5ca2 | 358 | if (ret_saddr) |
f2edb9f7 | 359 | *ret_saddr = saddr; |
1da177e4 LT |
360 | } |
361 | ||
4115ded1 | 362 | local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; |
4a4739d5 AG |
363 | if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, |
364 | local))) { | |
365 | IP_VS_DBG_RL("We are crossing local and non-local addresses" | |
3d53666b | 366 | " daddr=%pI4\n", &daddr); |
4115ded1 | 367 | goto err_put; |
fc604767 | 368 | } |
4a4739d5 AG |
369 | |
370 | if (unlikely(local)) { | |
4115ded1 | 371 | /* skb to local stack, preserve old route */ |
026ace06 JA |
372 | if (!noref) |
373 | ip_rt_put(rt); | |
4115ded1 | 374 | return local; |
fc604767 | 375 | } |
4115ded1 | 376 | |
8d8e20e2 DB |
377 | if (!decrement_ttl(ipvs, skb_af, skb)) |
378 | goto err_put; | |
379 | ||
4115ded1 JA |
380 | if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { |
381 | mtu = dst_mtu(&rt->dst); | |
4115ded1 | 382 | } else { |
4115ded1 | 383 | mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); |
84c0d5e9 JH |
384 | if (!dest) |
385 | goto err_put; | |
29930e31 | 386 | if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
84c0d5e9 | 387 | mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); |
29930e31 JH |
388 | if ((dest->tun_flags & |
389 | IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && | |
390 | skb->ip_summed == CHECKSUM_PARTIAL) | |
391 | mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; | |
6f7b841b | 392 | } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
5832c4a7 | 393 | IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; |
6f7b841b VF |
394 | |
395 | if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) | |
5832c4a7 | 396 | __set_bit(IP_TUNNEL_CSUM_BIT, tflags); |
6f7b841b | 397 | mtu -= gre_calc_hlen(tflags); |
29930e31 | 398 | } |
4115ded1 JA |
399 | if (mtu < 68) { |
400 | IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); | |
401 | goto err_put; | |
402 | } | |
919aa0b2 | 403 | maybe_update_pmtu(skb_af, skb, mtu); |
fc604767 JA |
404 | } |
405 | ||
20868a40 | 406 | if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) |
4115ded1 | 407 | goto err_put; |
4115ded1 JA |
408 | |
409 | skb_dst_drop(skb); | |
c09b8970 | 410 | if (noref) |
411 | skb_dst_set_noref(skb, &rt->dst); | |
412 | else | |
026ace06 | 413 | skb_dst_set(skb, &rt->dst); |
4115ded1 JA |
414 | |
415 | return local; | |
416 | ||
417 | err_put: | |
026ace06 JA |
418 | if (!noref) |
419 | ip_rt_put(rt); | |
4115ded1 JA |
420 | return -1; |
421 | ||
422 | err_unreach: | |
423 | dst_link_failure(skb); | |
424 | return -1; | |
1da177e4 LT |
425 | } |
426 | ||
38cdcc9a | 427 | #ifdef CONFIG_IP_VS_IPV6 |
714f095f HS |
428 | static struct dst_entry * |
429 | __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, | |
48e8aa6e | 430 | struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) |
714f095f HS |
431 | { |
432 | struct dst_entry *dst; | |
4c9483b2 DM |
433 | struct flowi6 fl6 = { |
434 | .daddr = *daddr, | |
714f095f HS |
435 | }; |
436 | ||
48e8aa6e MKL |
437 | if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) |
438 | fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; | |
439 | ||
4c9483b2 | 440 | dst = ip6_route_output(net, NULL, &fl6); |
714f095f HS |
441 | if (dst->error) |
442 | goto out_err; | |
443 | if (!ret_saddr) | |
444 | return dst; | |
4c9483b2 | 445 | if (ipv6_addr_any(&fl6.saddr) && |
714f095f | 446 | ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, |
4c9483b2 | 447 | &fl6.daddr, 0, &fl6.saddr) < 0) |
714f095f | 448 | goto out_err; |
452edd59 | 449 | if (do_xfrm) { |
4c9483b2 | 450 | dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); |
452edd59 DM |
451 | if (IS_ERR(dst)) { |
452 | dst = NULL; | |
453 | goto out_err; | |
454 | } | |
455 | } | |
4e3fd7a0 | 456 | *ret_saddr = fl6.saddr; |
714f095f HS |
457 | return dst; |
458 | ||
459 | out_err: | |
460 | dst_release(dst); | |
461 | IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); | |
462 | return NULL; | |
463 | } | |
464 | ||
fc604767 JA |
465 | /* |
466 | * Get route to destination or remote server | |
fc604767 | 467 | */ |
4115ded1 | 468 | static int |
f5745f8a EB |
469 | __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, |
470 | struct ip_vs_dest *dest, | |
fc604767 | 471 | struct in6_addr *daddr, struct in6_addr *ret_saddr, |
4115ded1 | 472 | struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) |
38cdcc9a | 473 | { |
f5745f8a | 474 | struct net *net = ipvs->net; |
026ace06 | 475 | struct ip_vs_dest_dst *dest_dst; |
38cdcc9a | 476 | struct rt6_info *rt; /* Route to the other host */ |
714f095f | 477 | struct dst_entry *dst; |
4115ded1 | 478 | int mtu; |
026ace06 | 479 | int local, noref = 1; |
38cdcc9a JV |
480 | |
481 | if (dest) { | |
026ace06 JA |
482 | dest_dst = __ip_vs_dst_check(dest); |
483 | if (likely(dest_dst)) | |
e8dfd42c | 484 | rt = dst_rt6_info(dest_dst->dst_cache); |
026ace06 | 485 | else { |
714f095f | 486 | u32 cookie; |
38cdcc9a | 487 | |
026ace06 | 488 | dest_dst = ip_vs_dest_dst_alloc(); |
ac69269a | 489 | spin_lock_bh(&dest->dst_lock); |
026ace06 JA |
490 | if (!dest_dst) { |
491 | __ip_vs_dst_set(dest, NULL, NULL, 0); | |
ac69269a | 492 | spin_unlock_bh(&dest->dst_lock); |
026ace06 JA |
493 | goto err_unreach; |
494 | } | |
714f095f | 495 | dst = __ip_vs_route_output_v6(net, &dest->addr.in6, |
026ace06 | 496 | &dest_dst->dst_saddr.in6, |
48e8aa6e | 497 | do_xfrm, rt_mode); |
714f095f | 498 | if (!dst) { |
026ace06 | 499 | __ip_vs_dst_set(dest, NULL, NULL, 0); |
ac69269a | 500 | spin_unlock_bh(&dest->dst_lock); |
026ace06 | 501 | ip_vs_dest_dst_free(dest_dst); |
4115ded1 | 502 | goto err_unreach; |
38cdcc9a | 503 | } |
e8dfd42c | 504 | rt = dst_rt6_info(dst); |
b197df4f | 505 | cookie = rt6_get_cookie(rt); |
026ace06 | 506 | __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); |
ac69269a | 507 | spin_unlock_bh(&dest->dst_lock); |
714f095f | 508 | IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", |
026ace06 | 509 | &dest->addr.in6, &dest_dst->dst_saddr.in6, |
bc9d3a9f | 510 | rcuref_read(&rt->dst.__rcuref)); |
38cdcc9a | 511 | } |
714f095f | 512 | if (ret_saddr) |
026ace06 | 513 | *ret_saddr = dest_dst->dst_saddr.in6; |
38cdcc9a | 514 | } else { |
026ace06 | 515 | noref = 0; |
48e8aa6e MKL |
516 | dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, |
517 | rt_mode); | |
714f095f | 518 | if (!dst) |
4115ded1 | 519 | goto err_unreach; |
e8dfd42c | 520 | rt = dst_rt6_info(dst); |
38cdcc9a JV |
521 | } |
522 | ||
fc604767 | 523 | local = __ip_vs_is_local_route6(rt); |
4a4739d5 AG |
524 | |
525 | if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, | |
526 | local))) { | |
527 | IP_VS_DBG_RL("We are crossing local and non-local addresses" | |
3d53666b | 528 | " daddr=%pI6\n", daddr); |
4115ded1 | 529 | goto err_put; |
fc604767 | 530 | } |
4a4739d5 AG |
531 | |
532 | if (unlikely(local)) { | |
4115ded1 | 533 | /* skb to local stack, preserve old route */ |
026ace06 JA |
534 | if (!noref) |
535 | dst_release(&rt->dst); | |
4115ded1 | 536 | return local; |
fc604767 | 537 | } |
4115ded1 | 538 | |
8d8e20e2 DB |
539 | if (!decrement_ttl(ipvs, skb_af, skb)) |
540 | goto err_put; | |
541 | ||
4115ded1 JA |
542 | /* MTU checking */ |
543 | if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) | |
544 | mtu = dst_mtu(&rt->dst); | |
545 | else { | |
4115ded1 | 546 | mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); |
84c0d5e9 JH |
547 | if (!dest) |
548 | goto err_put; | |
29930e31 | 549 | if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
84c0d5e9 | 550 | mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); |
29930e31 JH |
551 | if ((dest->tun_flags & |
552 | IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && | |
553 | skb->ip_summed == CHECKSUM_PARTIAL) | |
554 | mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; | |
6f7b841b | 555 | } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
5832c4a7 | 556 | IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; |
6f7b841b VF |
557 | |
558 | if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) | |
5832c4a7 | 559 | __set_bit(IP_TUNNEL_CSUM_BIT, tflags); |
6f7b841b | 560 | mtu -= gre_calc_hlen(tflags); |
29930e31 | 561 | } |
4115ded1 JA |
562 | if (mtu < IPV6_MIN_MTU) { |
563 | IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, | |
564 | IPV6_MIN_MTU); | |
565 | goto err_put; | |
566 | } | |
919aa0b2 | 567 | maybe_update_pmtu(skb_af, skb, mtu); |
fc604767 JA |
568 | } |
569 | ||
20868a40 | 570 | if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) |
4115ded1 | 571 | goto err_put; |
4115ded1 JA |
572 | |
573 | skb_dst_drop(skb); | |
c09b8970 | 574 | if (noref) |
575 | skb_dst_set_noref(skb, &rt->dst); | |
576 | else | |
026ace06 | 577 | skb_dst_set(skb, &rt->dst); |
4115ded1 JA |
578 | |
579 | return local; | |
580 | ||
581 | err_put: | |
026ace06 JA |
582 | if (!noref) |
583 | dst_release(&rt->dst); | |
4115ded1 JA |
584 | return -1; |
585 | ||
586 | err_unreach: | |
326bf17e AG |
587 | /* The ip6_link_failure function requires the dev field to be set |
588 | * in order to get the net (further for the sake of fwmark | |
589 | * reflection). | |
590 | */ | |
591 | if (!skb->dev) | |
592 | skb->dev = skb_dst(skb)->dev; | |
593 | ||
4115ded1 JA |
594 | dst_link_failure(skb); |
595 | return -1; | |
38cdcc9a JV |
596 | } |
597 | #endif | |
598 | ||
1da177e4 | 599 | |
b8abdf09 JA |
600 | /* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ |
601 | static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, | |
602 | struct ip_vs_conn *cp) | |
603 | { | |
604 | int ret = NF_ACCEPT; | |
605 | ||
606 | skb->ipvs_property = 1; | |
607 | if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) | |
608 | ret = ip_vs_confirm_conntrack(skb); | |
609 | if (ret == NF_ACCEPT) { | |
895b5c9f | 610 | nf_reset_ct(skb); |
b8abdf09 | 611 | skb_forward_csum(skb); |
7980d2ea | 612 | if (skb->dev) |
de799101 | 613 | skb_clear_tstamp(skb); |
b8abdf09 JA |
614 | } |
615 | return ret; | |
616 | } | |
617 | ||
71563f34 AG |
618 | /* In the event of a remote destination, it's possible that we would have |
619 | * matches against an old socket (particularly a TIME-WAIT socket). This | |
620 | * causes havoc down the line (ip_local_out et. al. expect regular sockets | |
621 | * and invalid memory accesses will happen) so simply drop the association | |
622 | * in this case. | |
623 | */ | |
624 | static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) | |
625 | { | |
626 | /* If dev is set, the packet came from the LOCAL_IN callback and | |
627 | * not from a local TCP socket. | |
628 | */ | |
629 | if (skb->dev) | |
630 | skb_orphan(skb); | |
631 | } | |
632 | ||
b8abdf09 JA |
633 | /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ |
634 | static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, | |
635 | struct ip_vs_conn *cp, int local) | |
636 | { | |
637 | int ret = NF_STOLEN; | |
638 | ||
639 | skb->ipvs_property = 1; | |
640 | if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) | |
641 | ip_vs_notrack(skb); | |
642 | else | |
643 | ip_vs_update_conntrack(skb, cp, 1); | |
71563f34 AG |
644 | |
645 | /* Remove the early_demux association unless it's bound for the | |
646 | * exact same port and address on this host after translation. | |
647 | */ | |
648 | if (!local || cp->vport != cp->dport || | |
649 | !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr)) | |
650 | ip_vs_drop_early_demux_sk(skb); | |
651 | ||
b8abdf09 JA |
652 | if (!local) { |
653 | skb_forward_csum(skb); | |
7980d2ea | 654 | if (skb->dev) |
de799101 | 655 | skb_clear_tstamp(skb); |
58dbc6f2 | 656 | NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, |
13206b6b | 657 | NULL, skb_dst(skb)->dev, dst_output); |
b8abdf09 JA |
658 | } else |
659 | ret = NF_ACCEPT; | |
71563f34 | 660 | |
b8abdf09 JA |
661 | return ret; |
662 | } | |
663 | ||
664 | /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ | |
665 | static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, | |
666 | struct ip_vs_conn *cp, int local) | |
667 | { | |
668 | int ret = NF_STOLEN; | |
669 | ||
670 | skb->ipvs_property = 1; | |
671 | if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) | |
672 | ip_vs_notrack(skb); | |
673 | if (!local) { | |
71563f34 | 674 | ip_vs_drop_early_demux_sk(skb); |
b8abdf09 | 675 | skb_forward_csum(skb); |
7980d2ea | 676 | if (skb->dev) |
de799101 | 677 | skb_clear_tstamp(skb); |
58dbc6f2 | 678 | NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, |
13206b6b | 679 | NULL, skb_dst(skb)->dev, dst_output); |
b8abdf09 JA |
680 | } else |
681 | ret = NF_ACCEPT; | |
682 | return ret; | |
683 | } | |
1da177e4 LT |
684 | |
685 | ||
686 | /* | |
687 | * NULL transmitter (do nothing except return NF_ACCEPT) | |
688 | */ | |
689 | int | |
690 | ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |
d4383f04 | 691 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1da177e4 LT |
692 | { |
693 | /* we do not touch skb and do not need pskb ptr */ | |
b8abdf09 | 694 | return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); |
1da177e4 LT |
695 | } |
696 | ||
697 | ||
698 | /* | |
699 | * Bypass transmitter | |
700 | * Let packets bypass the destination when the destination is not | |
701 | * available, it may be only used in transparent cache cluster. | |
702 | */ | |
703 | int | |
704 | ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |
d4383f04 | 705 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1da177e4 | 706 | { |
eddc9ec5 | 707 | struct iphdr *iph = ip_hdr(skb); |
1da177e4 | 708 | |
ecfe87b8 | 709 | if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, |
c63e4de2 | 710 | IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) |
1da177e4 | 711 | goto tx_error; |
1da177e4 | 712 | |
4115ded1 | 713 | ip_send_check(iph); |
1da177e4 LT |
714 | |
715 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 716 | skb->ignore_df = 1; |
1da177e4 | 717 | |
b8abdf09 | 718 | ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); |
1da177e4 | 719 | |
1da177e4 LT |
720 | return NF_STOLEN; |
721 | ||
1da177e4 LT |
722 | tx_error: |
723 | kfree_skb(skb); | |
1da177e4 LT |
724 | return NF_STOLEN; |
725 | } | |
726 | ||
b3cdd2a7 JV |
727 | #ifdef CONFIG_IP_VS_IPV6 |
728 | int | |
729 | ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, | |
4115ded1 | 730 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
b3cdd2a7 | 731 | { |
3481894f AG |
732 | struct ipv6hdr *iph = ipv6_hdr(skb); |
733 | ||
f5745f8a EB |
734 | if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, |
735 | &iph->daddr, NULL, | |
4115ded1 | 736 | ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) |
b3cdd2a7 | 737 | goto tx_error; |
b3cdd2a7 JV |
738 | |
739 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 740 | skb->ignore_df = 1; |
b3cdd2a7 | 741 | |
b8abdf09 | 742 | ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); |
b3cdd2a7 | 743 | |
b3cdd2a7 JV |
744 | return NF_STOLEN; |
745 | ||
b3cdd2a7 JV |
746 | tx_error: |
747 | kfree_skb(skb); | |
b3cdd2a7 JV |
748 | return NF_STOLEN; |
749 | } | |
750 | #endif | |
1da177e4 LT |
751 | |
752 | /* | |
753 | * NAT transmitter (only for outside-to-inside nat forwarding) | |
754 | * Not used for related ICMP | |
755 | */ | |
756 | int | |
757 | ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |
d4383f04 | 758 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1da177e4 LT |
759 | { |
760 | struct rtable *rt; /* Route to the other host */ | |
4115ded1 | 761 | int local, rc, was_input; |
1da177e4 | 762 | |
1da177e4 LT |
763 | /* check if it is a connection of no-client-port */ |
764 | if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { | |
014d730d | 765 | __be16 _pt, *p; |
4115ded1 JA |
766 | |
767 | p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); | |
1da177e4 LT |
768 | if (p == NULL) |
769 | goto tx_error; | |
770 | ip_vs_conn_fill_cport(cp, *p); | |
771 | IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); | |
772 | } | |
773 | ||
4115ded1 | 774 | was_input = rt_is_input_route(skb_rtable(skb)); |
ecfe87b8 | 775 | local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, |
4115ded1 JA |
776 | IP_VS_RT_MODE_LOCAL | |
777 | IP_VS_RT_MODE_NON_LOCAL | | |
c63e4de2 | 778 | IP_VS_RT_MODE_RDR, NULL, ipvsh); |
4115ded1 JA |
779 | if (local < 0) |
780 | goto tx_error; | |
781 | rt = skb_rtable(skb); | |
fc604767 JA |
782 | /* |
783 | * Avoid duplicate tuple in reply direction for NAT traffic | |
784 | * to local address when connection is sync-ed | |
785 | */ | |
c0cd1156 | 786 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
fc604767 JA |
787 | if (cp->flags & IP_VS_CONN_F_SYNC && local) { |
788 | enum ip_conntrack_info ctinfo; | |
05b4b065 | 789 | struct nf_conn *ct = nf_ct_get(skb, &ctinfo); |
fc604767 | 790 | |
ab8bc7ed | 791 | if (ct) { |
b0e010c5 | 792 | IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off, |
0d79641a | 793 | "ip_vs_nat_xmit(): " |
fc604767 | 794 | "stopping DNAT to local address"); |
4115ded1 | 795 | goto tx_error; |
fc604767 JA |
796 | } |
797 | } | |
798 | #endif | |
799 | ||
800 | /* From world but DNAT to loopback address? */ | |
4115ded1 | 801 | if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { |
b0e010c5 AG |
802 | IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off, |
803 | "ip_vs_nat_xmit(): stopping DNAT to loopback " | |
804 | "address"); | |
4115ded1 | 805 | goto tx_error; |
1da177e4 LT |
806 | } |
807 | ||
808 | /* copy-on-write the packet before mangling it */ | |
ec0974df | 809 | if (skb_ensure_writable(skb, sizeof(struct iphdr))) |
4115ded1 | 810 | goto tx_error; |
1da177e4 | 811 | |
d8d1f30b | 812 | if (skb_cow(skb, rt->dst.dev->hard_header_len)) |
4115ded1 | 813 | goto tx_error; |
1da177e4 | 814 | |
1da177e4 | 815 | /* mangle the packet */ |
d4383f04 | 816 | if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) |
4115ded1 | 817 | goto tx_error; |
e7ade46a | 818 | ip_hdr(skb)->daddr = cp->daddr.ip; |
eddc9ec5 | 819 | ip_send_check(ip_hdr(skb)); |
1da177e4 | 820 | |
b0e010c5 | 821 | IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT"); |
1da177e4 LT |
822 | |
823 | /* FIXME: when application helper enlarges the packet and the length | |
824 | is larger than the MTU of outgoing device, there will be still | |
825 | MTU problem. */ | |
826 | ||
827 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 828 | skb->ignore_df = 1; |
1da177e4 | 829 | |
b8abdf09 | 830 | rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); |
1da177e4 | 831 | |
b8abdf09 | 832 | return rc; |
1da177e4 | 833 | |
1da177e4 | 834 | tx_error: |
1da177e4 LT |
835 | kfree_skb(skb); |
836 | return NF_STOLEN; | |
1da177e4 LT |
837 | } |
838 | ||
b3cdd2a7 JV |
839 | #ifdef CONFIG_IP_VS_IPV6 |
840 | int | |
841 | ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, | |
4115ded1 | 842 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
b3cdd2a7 JV |
843 | { |
844 | struct rt6_info *rt; /* Route to the other host */ | |
b8abdf09 | 845 | int local, rc; |
b3cdd2a7 | 846 | |
b3cdd2a7 | 847 | /* check if it is a connection of no-client-port */ |
4115ded1 | 848 | if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { |
b3cdd2a7 | 849 | __be16 _pt, *p; |
4115ded1 | 850 | p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); |
b3cdd2a7 JV |
851 | if (p == NULL) |
852 | goto tx_error; | |
853 | ip_vs_conn_fill_cport(cp, *p); | |
854 | IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); | |
855 | } | |
856 | ||
f5745f8a EB |
857 | local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, |
858 | &cp->daddr.in6, | |
4a4739d5 | 859 | NULL, ipvsh, 0, |
4115ded1 JA |
860 | IP_VS_RT_MODE_LOCAL | |
861 | IP_VS_RT_MODE_NON_LOCAL | | |
862 | IP_VS_RT_MODE_RDR); | |
863 | if (local < 0) | |
864 | goto tx_error; | |
e8dfd42c | 865 | rt = dst_rt6_info(skb_dst(skb)); |
fc604767 JA |
866 | /* |
867 | * Avoid duplicate tuple in reply direction for NAT traffic | |
868 | * to local address when connection is sync-ed | |
869 | */ | |
c0cd1156 | 870 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
fc604767 JA |
871 | if (cp->flags & IP_VS_CONN_F_SYNC && local) { |
872 | enum ip_conntrack_info ctinfo; | |
05b4b065 | 873 | struct nf_conn *ct = nf_ct_get(skb, &ctinfo); |
fc604767 | 874 | |
ab8bc7ed | 875 | if (ct) { |
b0e010c5 | 876 | IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off, |
fc604767 JA |
877 | "ip_vs_nat_xmit_v6(): " |
878 | "stopping DNAT to local address"); | |
4115ded1 | 879 | goto tx_error; |
fc604767 JA |
880 | } |
881 | } | |
882 | #endif | |
883 | ||
884 | /* From world but DNAT to loopback address? */ | |
885 | if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && | |
fd0273d7 | 886 | ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { |
b0e010c5 | 887 | IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off, |
fc604767 JA |
888 | "ip_vs_nat_xmit_v6(): " |
889 | "stopping DNAT to loopback address"); | |
4115ded1 | 890 | goto tx_error; |
b3cdd2a7 JV |
891 | } |
892 | ||
893 | /* copy-on-write the packet before mangling it */ | |
ec0974df | 894 | if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) |
4115ded1 | 895 | goto tx_error; |
b3cdd2a7 | 896 | |
d8d1f30b | 897 | if (skb_cow(skb, rt->dst.dev->hard_header_len)) |
4115ded1 | 898 | goto tx_error; |
b3cdd2a7 | 899 | |
b3cdd2a7 | 900 | /* mangle the packet */ |
4115ded1 | 901 | if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) |
b3cdd2a7 | 902 | goto tx_error; |
4e3fd7a0 | 903 | ipv6_hdr(skb)->daddr = cp->daddr.in6; |
fc604767 | 904 | |
b0e010c5 | 905 | IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT"); |
b3cdd2a7 JV |
906 | |
907 | /* FIXME: when application helper enlarges the packet and the length | |
908 | is larger than the MTU of outgoing device, there will be still | |
909 | MTU problem. */ | |
910 | ||
911 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 912 | skb->ignore_df = 1; |
b3cdd2a7 | 913 | |
b8abdf09 | 914 | rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); |
b3cdd2a7 | 915 | |
b8abdf09 | 916 | return rc; |
b3cdd2a7 | 917 | |
b3cdd2a7 | 918 | tx_error: |
b3cdd2a7 JV |
919 | kfree_skb(skb); |
920 | return NF_STOLEN; | |
b3cdd2a7 JV |
921 | } |
922 | #endif | |
923 | ||
8052ba29 AG |
924 | /* When forwarding a packet, we must ensure that we've got enough headroom |
925 | * for the encapsulation packet in the skb. This also gives us an | |
926 | * opportunity to figure out what the payload_len, dsfield, ttl, and df | |
927 | * values should be, so that we won't need to look at the old ip header | |
928 | * again | |
929 | */ | |
930 | static struct sk_buff * | |
931 | ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, | |
932 | unsigned int max_headroom, __u8 *next_protocol, | |
933 | __u32 *payload_len, __u8 *dsfield, __u8 *ttl, | |
934 | __be16 *df) | |
935 | { | |
936 | struct sk_buff *new_skb = NULL; | |
937 | struct iphdr *old_iph = NULL; | |
b621129f | 938 | __u8 old_dsfield; |
8052ba29 AG |
939 | #ifdef CONFIG_IP_VS_IPV6 |
940 | struct ipv6hdr *old_ipv6h = NULL; | |
941 | #endif | |
942 | ||
71563f34 AG |
943 | ip_vs_drop_early_demux_sk(skb); |
944 | ||
8052ba29 AG |
945 | if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { |
946 | new_skb = skb_realloc_headroom(skb, max_headroom); | |
947 | if (!new_skb) | |
948 | goto error; | |
50656d9d CO |
949 | if (skb->sk) |
950 | skb_set_owner_w(new_skb, skb->sk); | |
8052ba29 AG |
951 | consume_skb(skb); |
952 | skb = new_skb; | |
953 | } | |
954 | ||
955 | #ifdef CONFIG_IP_VS_IPV6 | |
956 | if (skb_af == AF_INET6) { | |
957 | old_ipv6h = ipv6_hdr(skb); | |
958 | *next_protocol = IPPROTO_IPV6; | |
959 | if (payload_len) | |
960 | *payload_len = | |
961 | ntohs(old_ipv6h->payload_len) + | |
962 | sizeof(*old_ipv6h); | |
b621129f | 963 | old_dsfield = ipv6_get_dsfield(old_ipv6h); |
8052ba29 AG |
964 | *ttl = old_ipv6h->hop_limit; |
965 | if (df) | |
966 | *df = 0; | |
967 | } else | |
968 | #endif | |
969 | { | |
970 | old_iph = ip_hdr(skb); | |
971 | /* Copy DF, reset fragment offset and MF */ | |
972 | if (df) | |
973 | *df = (old_iph->frag_off & htons(IP_DF)); | |
974 | *next_protocol = IPPROTO_IPIP; | |
975 | ||
976 | /* fix old IP header checksum */ | |
977 | ip_send_check(old_iph); | |
b621129f | 978 | old_dsfield = ipv4_get_dsfield(old_iph); |
8052ba29 AG |
979 | *ttl = old_iph->ttl; |
980 | if (payload_len) | |
a13fbf5e | 981 | *payload_len = skb_ip_totlen(skb); |
8052ba29 AG |
982 | } |
983 | ||
b621129f VF |
984 | /* Implement full-functionality option for ECN encapsulation */ |
985 | *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield); | |
986 | ||
8052ba29 AG |
987 | return skb; |
988 | error: | |
989 | kfree_skb(skb); | |
990 | return ERR_PTR(-ENOMEM); | |
991 | } | |
992 | ||
993 | static inline int __tun_gso_type_mask(int encaps_af, int orig_af) | |
994 | { | |
7e13318d TH |
995 | switch (encaps_af) { |
996 | case AF_INET: | |
997 | return SKB_GSO_IPXIP4; | |
998 | case AF_INET6: | |
999 | return SKB_GSO_IPXIP6; | |
1000 | default: | |
1001 | return 0; | |
8052ba29 | 1002 | } |
8052ba29 | 1003 | } |
1da177e4 | 1004 | |
84c0d5e9 JH |
1005 | static int |
1006 | ipvs_gue_encap(struct net *net, struct sk_buff *skb, | |
1007 | struct ip_vs_conn *cp, __u8 *next_protocol) | |
1008 | { | |
1009 | __be16 dport; | |
1010 | __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); | |
1011 | struct udphdr *udph; /* Our new UDP header */ | |
1012 | struct guehdr *gueh; /* Our new GUE header */ | |
29930e31 JH |
1013 | size_t hdrlen, optlen = 0; |
1014 | void *data; | |
1015 | bool need_priv = false; | |
1016 | ||
1017 | if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && | |
1018 | skb->ip_summed == CHECKSUM_PARTIAL) { | |
1019 | optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; | |
1020 | need_priv = true; | |
1021 | } | |
84c0d5e9 | 1022 | |
29930e31 JH |
1023 | hdrlen = sizeof(struct guehdr) + optlen; |
1024 | ||
1025 | skb_push(skb, hdrlen); | |
84c0d5e9 JH |
1026 | |
1027 | gueh = (struct guehdr *)skb->data; | |
1028 | ||
1029 | gueh->control = 0; | |
1030 | gueh->version = 0; | |
29930e31 | 1031 | gueh->hlen = optlen >> 2; |
84c0d5e9 JH |
1032 | gueh->flags = 0; |
1033 | gueh->proto_ctype = *next_protocol; | |
1034 | ||
29930e31 JH |
1035 | data = &gueh[1]; |
1036 | ||
1037 | if (need_priv) { | |
1038 | __be32 *flags = data; | |
1039 | u16 csum_start = skb_checksum_start_offset(skb); | |
1040 | __be16 *pd; | |
1041 | ||
1042 | gueh->flags |= GUE_FLAG_PRIV; | |
1043 | *flags = 0; | |
1044 | data += GUE_LEN_PRIV; | |
1045 | ||
1046 | if (csum_start < hdrlen) | |
1047 | return -EINVAL; | |
1048 | ||
1049 | csum_start -= hdrlen; | |
1050 | pd = data; | |
1051 | pd[0] = htons(csum_start); | |
1052 | pd[1] = htons(csum_start + skb->csum_offset); | |
1053 | ||
1054 | if (!skb_is_gso(skb)) { | |
1055 | skb->ip_summed = CHECKSUM_NONE; | |
1056 | skb->encapsulation = 0; | |
1057 | } | |
1058 | ||
1059 | *flags |= GUE_PFLAG_REMCSUM; | |
1060 | data += GUE_PLEN_REMCSUM; | |
1061 | } | |
1062 | ||
84c0d5e9 JH |
1063 | skb_push(skb, sizeof(struct udphdr)); |
1064 | skb_reset_transport_header(skb); | |
1065 | ||
1066 | udph = udp_hdr(skb); | |
1067 | ||
1068 | dport = cp->dest->tun_port; | |
1069 | udph->dest = dport; | |
1070 | udph->source = sport; | |
1071 | udph->len = htons(skb->len); | |
1072 | udph->check = 0; | |
1073 | ||
1074 | *next_protocol = IPPROTO_UDP; | |
1075 | ||
1076 | return 0; | |
1077 | } | |
1078 | ||
6f7b841b VF |
1079 | static void |
1080 | ipvs_gre_encap(struct net *net, struct sk_buff *skb, | |
1081 | struct ip_vs_conn *cp, __u8 *next_protocol) | |
1082 | { | |
1083 | __be16 proto = *next_protocol == IPPROTO_IPIP ? | |
1084 | htons(ETH_P_IP) : htons(ETH_P_IPV6); | |
5832c4a7 | 1085 | IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; |
6f7b841b VF |
1086 | size_t hdrlen; |
1087 | ||
1088 | if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) | |
5832c4a7 | 1089 | __set_bit(IP_TUNNEL_CSUM_BIT, tflags); |
6f7b841b VF |
1090 | |
1091 | hdrlen = gre_calc_hlen(tflags); | |
1092 | gre_build_header(skb, hdrlen, tflags, proto, 0, 0); | |
1093 | ||
1094 | *next_protocol = IPPROTO_GRE; | |
1095 | } | |
1096 | ||
1da177e4 LT |
1097 | /* |
1098 | * IP Tunneling transmitter | |
1099 | * | |
1100 | * This function encapsulates the packet in a new IP packet, its | |
1101 | * destination will be set to cp->daddr. Most code of this function | |
1102 | * is taken from ipip.c. | |
1103 | * | |
1104 | * It is used in VS/TUN cluster. The load balancer selects a real | |
1105 | * server from a cluster based on a scheduling algorithm, | |
1106 | * encapsulates the request packet and forwards it to the selected | |
1107 | * server. For example, all real servers are configured with | |
1108 | * "ifconfig tunl0 <Virtual IP Address> up". When the server receives | |
1109 | * the encapsulated packet, it will decapsulate the packet, processe | |
1110 | * the request and return the response packets directly to the client | |
1111 | * without passing the load balancer. This can greatly increase the | |
1112 | * scalability of virtual server. | |
1113 | * | |
1114 | * Used for ANY protocol | |
1115 | */ | |
1116 | int | |
1117 | ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |
d4383f04 | 1118 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1da177e4 | 1119 | { |
361c3f52 EB |
1120 | struct netns_ipvs *ipvs = cp->ipvs; |
1121 | struct net *net = ipvs->net; | |
1da177e4 | 1122 | struct rtable *rt; /* Route to the other host */ |
c92f5ca2 | 1123 | __be32 saddr; /* Source for tunnel */ |
1da177e4 | 1124 | struct net_device *tdev; /* Device to other host */ |
8052ba29 AG |
1125 | __u8 next_protocol = 0; |
1126 | __u8 dsfield = 0; | |
1127 | __u8 ttl = 0; | |
1128 | __be16 df = 0; | |
1129 | __be16 *dfp = NULL; | |
1da177e4 | 1130 | struct iphdr *iph; /* Our new IP header */ |
c2636b4d | 1131 | unsigned int max_headroom; /* The extra header space needed */ |
4115ded1 | 1132 | int ret, local; |
84c0d5e9 | 1133 | int tun_type, gso_type; |
29930e31 | 1134 | int tun_flags; |
1da177e4 | 1135 | |
ecfe87b8 | 1136 | local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, |
4115ded1 JA |
1137 | IP_VS_RT_MODE_LOCAL | |
1138 | IP_VS_RT_MODE_NON_LOCAL | | |
1139 | IP_VS_RT_MODE_CONNECT | | |
c63e4de2 | 1140 | IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh); |
4115ded1 JA |
1141 | if (local < 0) |
1142 | goto tx_error; | |
0b35f603 | 1143 | if (local) |
b8abdf09 | 1144 | return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); |
1da177e4 | 1145 | |
4115ded1 | 1146 | rt = skb_rtable(skb); |
d8d1f30b | 1147 | tdev = rt->dst.dev; |
1da177e4 | 1148 | |
1da177e4 LT |
1149 | /* |
1150 | * Okay, now see if we can stuff it in the buffer as-is. | |
1151 | */ | |
1152 | max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); | |
1153 | ||
84c0d5e9 | 1154 | tun_type = cp->dest->tun_type; |
29930e31 | 1155 | tun_flags = cp->dest->tun_flags; |
84c0d5e9 | 1156 | |
29930e31 JH |
1157 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1158 | size_t gue_hdrlen, gue_optlen = 0; | |
1159 | ||
1160 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && | |
1161 | skb->ip_summed == CHECKSUM_PARTIAL) { | |
1162 | gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; | |
1163 | } | |
1164 | gue_hdrlen = sizeof(struct guehdr) + gue_optlen; | |
1165 | ||
1166 | max_headroom += sizeof(struct udphdr) + gue_hdrlen; | |
6f7b841b | 1167 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
5832c4a7 | 1168 | IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; |
6f7b841b | 1169 | size_t gre_hdrlen; |
6f7b841b VF |
1170 | |
1171 | if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) | |
5832c4a7 | 1172 | __set_bit(IP_TUNNEL_CSUM_BIT, tflags); |
6f7b841b VF |
1173 | gre_hdrlen = gre_calc_hlen(tflags); |
1174 | ||
1175 | max_headroom += gre_hdrlen; | |
29930e31 | 1176 | } |
84c0d5e9 | 1177 | |
8052ba29 AG |
1178 | /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ |
1179 | dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; | |
1180 | skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, | |
1181 | &next_protocol, NULL, &dsfield, | |
1182 | &ttl, dfp); | |
1183 | if (IS_ERR(skb)) | |
210ffe4a | 1184 | return NF_STOLEN; |
1da177e4 | 1185 | |
84c0d5e9 | 1186 | gso_type = __tun_gso_type_mask(AF_INET, cp->af); |
29930e31 JH |
1187 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1188 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || | |
1189 | (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) | |
1190 | gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; | |
1191 | else | |
1192 | gso_type |= SKB_GSO_UDP_TUNNEL; | |
1193 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && | |
1194 | skb->ip_summed == CHECKSUM_PARTIAL) { | |
1195 | gso_type |= SKB_GSO_TUNNEL_REMCSUM; | |
1196 | } | |
6f7b841b VF |
1197 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
1198 | if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) | |
1199 | gso_type |= SKB_GSO_GRE_CSUM; | |
1200 | else | |
1201 | gso_type |= SKB_GSO_GRE; | |
29930e31 | 1202 | } |
84c0d5e9 JH |
1203 | |
1204 | if (iptunnel_handle_offloads(skb, gso_type)) | |
ea1d5d77 JA |
1205 | goto tx_error; |
1206 | ||
1207 | skb->transport_header = skb->network_header; | |
1208 | ||
84c0d5e9 | 1209 | skb_set_inner_ipproto(skb, next_protocol); |
d7fce52f | 1210 | skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); |
84c0d5e9 | 1211 | |
29930e31 JH |
1212 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1213 | bool check = false; | |
1214 | ||
1215 | if (ipvs_gue_encap(net, skb, cp, &next_protocol)) | |
1216 | goto tx_error; | |
1217 | ||
1218 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || | |
1219 | (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) | |
1220 | check = true; | |
1221 | ||
1222 | udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); | |
6f7b841b VF |
1223 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) |
1224 | ipvs_gre_encap(net, skb, cp, &next_protocol); | |
84c0d5e9 | 1225 | |
e2d1bca7 ACM |
1226 | skb_push(skb, sizeof(struct iphdr)); |
1227 | skb_reset_network_header(skb); | |
1da177e4 LT |
1228 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); |
1229 | ||
1da177e4 LT |
1230 | /* |
1231 | * Push down and install the IPIP header. | |
1232 | */ | |
eddc9ec5 | 1233 | iph = ip_hdr(skb); |
1da177e4 LT |
1234 | iph->version = 4; |
1235 | iph->ihl = sizeof(struct iphdr)>>2; | |
1236 | iph->frag_off = df; | |
8052ba29 AG |
1237 | iph->protocol = next_protocol; |
1238 | iph->tos = dsfield; | |
c92f5ca2 JA |
1239 | iph->daddr = cp->daddr.ip; |
1240 | iph->saddr = saddr; | |
8052ba29 | 1241 | iph->ttl = ttl; |
b6a7719a | 1242 | ip_select_ident(net, skb, NULL); |
1da177e4 LT |
1243 | |
1244 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 1245 | skb->ignore_df = 1; |
1da177e4 | 1246 | |
b8abdf09 | 1247 | ret = ip_vs_tunnel_xmit_prepare(skb, cp); |
f4bc17cd | 1248 | if (ret == NF_ACCEPT) |
33224b16 | 1249 | ip_local_out(net, skb->sk, skb); |
f4bc17cd JA |
1250 | else if (ret == NF_DROP) |
1251 | kfree_skb(skb); | |
1da177e4 | 1252 | |
1da177e4 LT |
1253 | return NF_STOLEN; |
1254 | ||
1da177e4 | 1255 | tx_error: |
210ffe4a | 1256 | kfree_skb(skb); |
1da177e4 LT |
1257 | return NF_STOLEN; |
1258 | } | |
1259 | ||
b3cdd2a7 JV |
1260 | #ifdef CONFIG_IP_VS_IPV6 |
1261 | int | |
1262 | ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, | |
d4383f04 | 1263 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
b3cdd2a7 | 1264 | { |
84c0d5e9 JH |
1265 | struct netns_ipvs *ipvs = cp->ipvs; |
1266 | struct net *net = ipvs->net; | |
b3cdd2a7 | 1267 | struct rt6_info *rt; /* Route to the other host */ |
714f095f | 1268 | struct in6_addr saddr; /* Source for tunnel */ |
b3cdd2a7 | 1269 | struct net_device *tdev; /* Device to other host */ |
8052ba29 AG |
1270 | __u8 next_protocol = 0; |
1271 | __u32 payload_len = 0; | |
1272 | __u8 dsfield = 0; | |
1273 | __u8 ttl = 0; | |
b3cdd2a7 JV |
1274 | struct ipv6hdr *iph; /* Our new IP header */ |
1275 | unsigned int max_headroom; /* The extra header space needed */ | |
4115ded1 | 1276 | int ret, local; |
84c0d5e9 | 1277 | int tun_type, gso_type; |
29930e31 | 1278 | int tun_flags; |
b3cdd2a7 | 1279 | |
84c0d5e9 | 1280 | local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest, |
f5745f8a | 1281 | &cp->daddr.in6, |
4115ded1 JA |
1282 | &saddr, ipvsh, 1, |
1283 | IP_VS_RT_MODE_LOCAL | | |
1284 | IP_VS_RT_MODE_NON_LOCAL | | |
1285 | IP_VS_RT_MODE_TUNNEL); | |
1286 | if (local < 0) | |
1287 | goto tx_error; | |
0b35f603 | 1288 | if (local) |
b8abdf09 | 1289 | return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); |
b3cdd2a7 | 1290 | |
e8dfd42c | 1291 | rt = dst_rt6_info(skb_dst(skb)); |
d8d1f30b | 1292 | tdev = rt->dst.dev; |
b3cdd2a7 | 1293 | |
b3cdd2a7 JV |
1294 | /* |
1295 | * Okay, now see if we can stuff it in the buffer as-is. | |
1296 | */ | |
1297 | max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); | |
1298 | ||
84c0d5e9 | 1299 | tun_type = cp->dest->tun_type; |
29930e31 | 1300 | tun_flags = cp->dest->tun_flags; |
84c0d5e9 | 1301 | |
29930e31 JH |
1302 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1303 | size_t gue_hdrlen, gue_optlen = 0; | |
1304 | ||
1305 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && | |
1306 | skb->ip_summed == CHECKSUM_PARTIAL) { | |
1307 | gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; | |
1308 | } | |
1309 | gue_hdrlen = sizeof(struct guehdr) + gue_optlen; | |
1310 | ||
1311 | max_headroom += sizeof(struct udphdr) + gue_hdrlen; | |
6f7b841b | 1312 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
5832c4a7 | 1313 | IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; |
6f7b841b | 1314 | size_t gre_hdrlen; |
6f7b841b VF |
1315 | |
1316 | if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) | |
5832c4a7 | 1317 | __set_bit(IP_TUNNEL_CSUM_BIT, tflags); |
6f7b841b VF |
1318 | gre_hdrlen = gre_calc_hlen(tflags); |
1319 | ||
1320 | max_headroom += gre_hdrlen; | |
29930e31 | 1321 | } |
84c0d5e9 | 1322 | |
8052ba29 AG |
1323 | skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, |
1324 | &next_protocol, &payload_len, | |
1325 | &dsfield, &ttl, NULL); | |
1326 | if (IS_ERR(skb)) | |
210ffe4a | 1327 | return NF_STOLEN; |
b3cdd2a7 | 1328 | |
84c0d5e9 | 1329 | gso_type = __tun_gso_type_mask(AF_INET6, cp->af); |
29930e31 JH |
1330 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1331 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || | |
1332 | (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) | |
1333 | gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; | |
1334 | else | |
1335 | gso_type |= SKB_GSO_UDP_TUNNEL; | |
1336 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && | |
1337 | skb->ip_summed == CHECKSUM_PARTIAL) { | |
1338 | gso_type |= SKB_GSO_TUNNEL_REMCSUM; | |
1339 | } | |
6f7b841b VF |
1340 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
1341 | if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) | |
1342 | gso_type |= SKB_GSO_GRE_CSUM; | |
1343 | else | |
1344 | gso_type |= SKB_GSO_GRE; | |
29930e31 | 1345 | } |
84c0d5e9 JH |
1346 | |
1347 | if (iptunnel_handle_offloads(skb, gso_type)) | |
ea1d5d77 JA |
1348 | goto tx_error; |
1349 | ||
714f095f | 1350 | skb->transport_header = skb->network_header; |
b3cdd2a7 | 1351 | |
84c0d5e9 | 1352 | skb_set_inner_ipproto(skb, next_protocol); |
d7fce52f | 1353 | skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); |
84c0d5e9 | 1354 | |
29930e31 JH |
1355 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1356 | bool check = false; | |
1357 | ||
1358 | if (ipvs_gue_encap(net, skb, cp, &next_protocol)) | |
1359 | goto tx_error; | |
1360 | ||
1361 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || | |
1362 | (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) | |
1363 | check = true; | |
1364 | ||
1365 | udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); | |
6f7b841b VF |
1366 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) |
1367 | ipvs_gre_encap(net, skb, cp, &next_protocol); | |
84c0d5e9 | 1368 | |
b3cdd2a7 JV |
1369 | skb_push(skb, sizeof(struct ipv6hdr)); |
1370 | skb_reset_network_header(skb); | |
1371 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | |
1372 | ||
b3cdd2a7 JV |
1373 | /* |
1374 | * Push down and install the IPIP header. | |
1375 | */ | |
1376 | iph = ipv6_hdr(skb); | |
1377 | iph->version = 6; | |
8052ba29 AG |
1378 | iph->nexthdr = next_protocol; |
1379 | iph->payload_len = htons(payload_len); | |
b3cdd2a7 | 1380 | memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); |
8052ba29 | 1381 | ipv6_change_dsfield(iph, 0, dsfield); |
4e3fd7a0 AD |
1382 | iph->daddr = cp->daddr.in6; |
1383 | iph->saddr = saddr; | |
8052ba29 | 1384 | iph->hop_limit = ttl; |
b3cdd2a7 JV |
1385 | |
1386 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 1387 | skb->ignore_df = 1; |
b3cdd2a7 | 1388 | |
b8abdf09 | 1389 | ret = ip_vs_tunnel_xmit_prepare(skb, cp); |
f4bc17cd | 1390 | if (ret == NF_ACCEPT) |
84c0d5e9 | 1391 | ip6_local_out(net, skb->sk, skb); |
f4bc17cd JA |
1392 | else if (ret == NF_DROP) |
1393 | kfree_skb(skb); | |
b3cdd2a7 | 1394 | |
b3cdd2a7 JV |
1395 | return NF_STOLEN; |
1396 | ||
b3cdd2a7 | 1397 | tx_error: |
210ffe4a | 1398 | kfree_skb(skb); |
b3cdd2a7 JV |
1399 | return NF_STOLEN; |
1400 | } | |
1401 | #endif | |
1402 | ||
1da177e4 LT |
1403 | |
1404 | /* | |
1405 | * Direct Routing transmitter | |
1406 | * Used for ANY protocol | |
1407 | */ | |
1408 | int | |
1409 | ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |
d4383f04 | 1410 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1da177e4 | 1411 | { |
4115ded1 | 1412 | int local; |
1da177e4 | 1413 | |
ecfe87b8 | 1414 | local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, |
4115ded1 JA |
1415 | IP_VS_RT_MODE_LOCAL | |
1416 | IP_VS_RT_MODE_NON_LOCAL | | |
c63e4de2 | 1417 | IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); |
4115ded1 | 1418 | if (local < 0) |
1da177e4 | 1419 | goto tx_error; |
0b35f603 | 1420 | if (local) |
4115ded1 | 1421 | return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); |
1da177e4 | 1422 | |
eddc9ec5 | 1423 | ip_send_check(ip_hdr(skb)); |
1da177e4 | 1424 | |
1da177e4 | 1425 | /* Another hack: avoid icmp_send in ip_fragment */ |
60ff7467 | 1426 | skb->ignore_df = 1; |
1da177e4 | 1427 | |
b8abdf09 | 1428 | ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); |
1da177e4 | 1429 | |
1da177e4 LT |
1430 | return NF_STOLEN; |
1431 | ||
1da177e4 LT |
1432 | tx_error: |
1433 | kfree_skb(skb); | |
1da177e4 LT |
1434 | return NF_STOLEN; |
1435 | } | |
1436 | ||
b3cdd2a7 JV |
1437 | #ifdef CONFIG_IP_VS_IPV6 |
1438 | int | |
1439 | ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, | |
4115ded1 | 1440 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
b3cdd2a7 | 1441 | { |
4115ded1 | 1442 | int local; |
b3cdd2a7 | 1443 | |
f5745f8a EB |
1444 | local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, |
1445 | &cp->daddr.in6, | |
4a4739d5 | 1446 | NULL, ipvsh, 0, |
4115ded1 | 1447 | IP_VS_RT_MODE_LOCAL | |
48e8aa6e MKL |
1448 | IP_VS_RT_MODE_NON_LOCAL | |
1449 | IP_VS_RT_MODE_KNOWN_NH); | |
4115ded1 | 1450 | if (local < 0) |
b3cdd2a7 | 1451 | goto tx_error; |
0b35f603 | 1452 | if (local) |
4115ded1 | 1453 | return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); |
b3cdd2a7 JV |
1454 | |
1455 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 1456 | skb->ignore_df = 1; |
b3cdd2a7 | 1457 | |
b8abdf09 | 1458 | ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); |
b3cdd2a7 | 1459 | |
b3cdd2a7 JV |
1460 | return NF_STOLEN; |
1461 | ||
b3cdd2a7 JV |
1462 | tx_error: |
1463 | kfree_skb(skb); | |
b3cdd2a7 JV |
1464 | return NF_STOLEN; |
1465 | } | |
1466 | #endif | |
1467 | ||
1da177e4 LT |
1468 | |
1469 | /* | |
1470 | * ICMP packet transmitter | |
1471 | * called by the ip_vs_in_icmp | |
1472 | */ | |
1473 | int | |
1474 | ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |
d4383f04 JDB |
1475 | struct ip_vs_protocol *pp, int offset, unsigned int hooknum, |
1476 | struct ip_vs_iphdr *iph) | |
1da177e4 LT |
1477 | { |
1478 | struct rtable *rt; /* Route to the other host */ | |
1da177e4 | 1479 | int rc; |
fc604767 | 1480 | int local; |
4115ded1 | 1481 | int rt_mode, was_input; |
1da177e4 | 1482 | |
1da177e4 LT |
1483 | /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be |
1484 | forwarded directly here, because there is no need to | |
1485 | translate address/port back */ | |
1486 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { | |
1487 | if (cp->packet_xmit) | |
d4383f04 | 1488 | rc = cp->packet_xmit(skb, cp, pp, iph); |
1da177e4 LT |
1489 | else |
1490 | rc = NF_ACCEPT; | |
1491 | /* do not touch skb anymore */ | |
1492 | atomic_inc(&cp->in_pkts); | |
210ffe4a | 1493 | return rc; |
1da177e4 LT |
1494 | } |
1495 | ||
1496 | /* | |
1497 | * mangle and send the packet here (only for VS/NAT) | |
1498 | */ | |
4115ded1 | 1499 | was_input = rt_is_input_route(skb_rtable(skb)); |
1da177e4 | 1500 | |
c92f5ca2 JA |
1501 | /* LOCALNODE from FORWARD hook is not supported */ |
1502 | rt_mode = (hooknum != NF_INET_FORWARD) ? | |
1503 | IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | | |
1504 | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; | |
ecfe87b8 | 1505 | local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, |
c63e4de2 | 1506 | NULL, iph); |
4115ded1 JA |
1507 | if (local < 0) |
1508 | goto tx_error; | |
1509 | rt = skb_rtable(skb); | |
fc604767 JA |
1510 | |
1511 | /* | |
1512 | * Avoid duplicate tuple in reply direction for NAT traffic | |
1513 | * to local address when connection is sync-ed | |
1514 | */ | |
c0cd1156 | 1515 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
fc604767 JA |
1516 | if (cp->flags & IP_VS_CONN_F_SYNC && local) { |
1517 | enum ip_conntrack_info ctinfo; | |
05b4b065 | 1518 | struct nf_conn *ct = nf_ct_get(skb, &ctinfo); |
fc604767 | 1519 | |
ab8bc7ed | 1520 | if (ct) { |
fc604767 JA |
1521 | IP_VS_DBG(10, "%s(): " |
1522 | "stopping DNAT to local address %pI4\n", | |
1523 | __func__, &cp->daddr.ip); | |
4115ded1 | 1524 | goto tx_error; |
fc604767 JA |
1525 | } |
1526 | } | |
1527 | #endif | |
1528 | ||
1529 | /* From world but DNAT to loopback address? */ | |
4115ded1 | 1530 | if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { |
fc604767 JA |
1531 | IP_VS_DBG(1, "%s(): " |
1532 | "stopping DNAT to loopback %pI4\n", | |
1533 | __func__, &cp->daddr.ip); | |
4115ded1 | 1534 | goto tx_error; |
1da177e4 LT |
1535 | } |
1536 | ||
1537 | /* copy-on-write the packet before mangling it */ | |
ec0974df | 1538 | if (skb_ensure_writable(skb, offset)) |
4115ded1 | 1539 | goto tx_error; |
1da177e4 | 1540 | |
d8d1f30b | 1541 | if (skb_cow(skb, rt->dst.dev->hard_header_len)) |
4115ded1 | 1542 | goto tx_error; |
1da177e4 | 1543 | |
1da177e4 LT |
1544 | ip_vs_nat_icmp(skb, pp, cp, 0); |
1545 | ||
1546 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 1547 | skb->ignore_df = 1; |
1da177e4 | 1548 | |
210ffe4a | 1549 | return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); |
1da177e4 | 1550 | |
1da177e4 | 1551 | tx_error: |
026ace06 | 1552 | kfree_skb(skb); |
1da177e4 | 1553 | rc = NF_STOLEN; |
1da177e4 | 1554 | return rc; |
1da177e4 | 1555 | } |
b3cdd2a7 JV |
1556 | |
1557 | #ifdef CONFIG_IP_VS_IPV6 | |
1558 | int | |
1559 | ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, | |
d4383f04 | 1560 | struct ip_vs_protocol *pp, int offset, unsigned int hooknum, |
4115ded1 | 1561 | struct ip_vs_iphdr *ipvsh) |
b3cdd2a7 JV |
1562 | { |
1563 | struct rt6_info *rt; /* Route to the other host */ | |
b3cdd2a7 | 1564 | int rc; |
fc604767 | 1565 | int local; |
c92f5ca2 | 1566 | int rt_mode; |
b3cdd2a7 | 1567 | |
b3cdd2a7 JV |
1568 | /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be |
1569 | forwarded directly here, because there is no need to | |
1570 | translate address/port back */ | |
1571 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { | |
1572 | if (cp->packet_xmit) | |
4115ded1 | 1573 | rc = cp->packet_xmit(skb, cp, pp, ipvsh); |
b3cdd2a7 JV |
1574 | else |
1575 | rc = NF_ACCEPT; | |
1576 | /* do not touch skb anymore */ | |
1577 | atomic_inc(&cp->in_pkts); | |
210ffe4a | 1578 | return rc; |
b3cdd2a7 JV |
1579 | } |
1580 | ||
1581 | /* | |
1582 | * mangle and send the packet here (only for VS/NAT) | |
1583 | */ | |
1584 | ||
c92f5ca2 JA |
1585 | /* LOCALNODE from FORWARD hook is not supported */ |
1586 | rt_mode = (hooknum != NF_INET_FORWARD) ? | |
1587 | IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | | |
1588 | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; | |
f5745f8a EB |
1589 | local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, |
1590 | &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); | |
4115ded1 JA |
1591 | if (local < 0) |
1592 | goto tx_error; | |
e8dfd42c | 1593 | rt = dst_rt6_info(skb_dst(skb)); |
fc604767 JA |
1594 | /* |
1595 | * Avoid duplicate tuple in reply direction for NAT traffic | |
1596 | * to local address when connection is sync-ed | |
1597 | */ | |
c0cd1156 | 1598 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
fc604767 JA |
1599 | if (cp->flags & IP_VS_CONN_F_SYNC && local) { |
1600 | enum ip_conntrack_info ctinfo; | |
05b4b065 | 1601 | struct nf_conn *ct = nf_ct_get(skb, &ctinfo); |
fc604767 | 1602 | |
ab8bc7ed | 1603 | if (ct) { |
fc604767 JA |
1604 | IP_VS_DBG(10, "%s(): " |
1605 | "stopping DNAT to local address %pI6\n", | |
1606 | __func__, &cp->daddr.in6); | |
4115ded1 | 1607 | goto tx_error; |
fc604767 JA |
1608 | } |
1609 | } | |
1610 | #endif | |
1611 | ||
1612 | /* From world but DNAT to loopback address? */ | |
1613 | if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && | |
fd0273d7 | 1614 | ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { |
fc604767 JA |
1615 | IP_VS_DBG(1, "%s(): " |
1616 | "stopping DNAT to loopback %pI6\n", | |
1617 | __func__, &cp->daddr.in6); | |
4115ded1 | 1618 | goto tx_error; |
b3cdd2a7 JV |
1619 | } |
1620 | ||
1621 | /* copy-on-write the packet before mangling it */ | |
ec0974df | 1622 | if (skb_ensure_writable(skb, offset)) |
4115ded1 | 1623 | goto tx_error; |
b3cdd2a7 | 1624 | |
d8d1f30b | 1625 | if (skb_cow(skb, rt->dst.dev->hard_header_len)) |
4115ded1 | 1626 | goto tx_error; |
b3cdd2a7 | 1627 | |
b3cdd2a7 JV |
1628 | ip_vs_nat_icmp_v6(skb, pp, cp, 0); |
1629 | ||
1630 | /* Another hack: avoid icmp_send in ip_fragment */ | |
60ff7467 | 1631 | skb->ignore_df = 1; |
b3cdd2a7 | 1632 | |
210ffe4a | 1633 | return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); |
b3cdd2a7 | 1634 | |
b3cdd2a7 | 1635 | tx_error: |
026ace06 | 1636 | kfree_skb(skb); |
b3cdd2a7 | 1637 | rc = NF_STOLEN; |
b3cdd2a7 | 1638 | return rc; |
b3cdd2a7 JV |
1639 | } |
1640 | #endif |