/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

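/* Final L2 transmission step: after handling multicast loopback and
 * scope checks and a possible lightweight-tunnel redirection, resolve
 * the IPv6 next hop to a neighbour entry and hand the skb to the
 * neighbour layer for transmission.
 */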
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

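/* IPv6 output entry point (installed as dst->output): set skb metadata,
 * drop the packet if IPv6 is disabled on the egress device, then run the
 * NF_INET_POST_ROUTING hook before ip6_finish_output(); the hook is
 * bypassed for packets already rerouted (IP6SKB_REROUTED).
 */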
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

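/* Deliver a packet carrying a Router Alert option to every raw socket
 * that registered interest in this alert value (IPV6_ROUTER_ALERT),
 * cloning the skb for all but the last matching socket. Returns 1 if
 * the skb was consumed by at least one socket, 0 otherwise.
 */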
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reactions involving unicast neighbor discovery
			 * messages destined to the proxied address, pass them
			 * to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

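/* Decide whether a forwarded packet exceeds the path MTU, honouring the
 * conntrack defrag state (frag_max_size), ignore_df and GSO geometry.
 */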
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

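/* Forwarding path: validate the packet (hop limit, source address
 * class, xfrm policy), emit ICMPv6 errors where required, handle
 * Router Alert delivery, NDP proxying and redirects, then pass the
 * packet to the NF_INET_FORWARD hook.
 */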
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 * We do NOT do any processing on Router Alert (RA)
	 * packets; they are pushed to user level AS IS,
	 * without any warranty that the application will be
	 * able to interpret them. The reason is that we
	 * cannot do anything clever here.
	 *
	 * We are not an end node, so if the packet contains
	 * AH/ESP we cannot do anything with it.
	 * Defragmentation would also be a mistake: RA packets
	 * cannot be fragmented, because there is no warranty
	 * that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

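/* Fragment an oversized IPv6 packet. The fast path reuses an existing
 * frag_list if its geometry already matches the MTU; otherwise the
 * slow path copies the payload into freshly allocated fragments, each
 * carrying a copy of the unfragmentable header chain plus a fragment
 * header, and hands every fragment to @output.
 */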
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one is sent down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS, or
	 * MSG_DONTROUTE --ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we may still check
	 *    its validity using a saved pointer to the last
	 *    used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service is
	 *    TCP, which does not have this problem), so the
	 *    last trick works only on connected sockets.
	 * 2. oif should also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour
	 * entry that is in the INCOMPLETE state and the src
	 * address from the flow is marked as OPTIMISTIC, we
	 * release the found dst entry and replace it with
	 * the dst entry of the nexthop router instead.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

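/* Initialise the cork for a corked send: duplicate the supplied
 * extension headers, pin the route, and precompute the fragmentation
 * MTU, respecting IPV6_PMTUDISC_PROBE and the per-socket frag_size.
 */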
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

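/* Core of the corking engine shared by ip6_append_data() and
 * ip6_make_skb(): append @length bytes from @from (copied via @getfrag)
 * to @queue, growing the tail skb or allocating new fragment-sized skbs
 * as dictated by the cork MTU, and charging the memory to the socket.
 */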
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

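/* Queue data for transmission on a corked socket. On the first call
 * (empty write queue) the cork is set up from @ipc6, @rt and @fl6;
 * subsequent calls reuse the corked flow. A rough usage sketch in the
 * style of a datagram sender such as UDPv6 (names are illustrative;
 * getfrag is typically ip_generic_getfrag):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, sizeof(struct udphdr),
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corkreq)
 *		err = ip6_push_pending_frames(sk);
 */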
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

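/* Transmit a skb built by __ip6_make_skb() via ip6_local_out(),
 * translating positive qdisc return codes with net_xmit_errno() and
 * accounting discards on failure.
 */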
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

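/* Single-shot variant of the append/push sequence: cork onto a private
 * queue, append the data, and return the finished skb (or an ERR_PTR)
 * without touching the socket write queue.
 */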
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}