net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/gso.h>
  46 #include <net/ipv6.h>
  47 #include <net/ndisc.h>
  48 #include <net/protocol.h>
  49 #include <net/ip6_route.h>
  50 #include <net/addrconf.h>
  51 #include <net/rawv6.h>
  52 #include <net/icmp.h>
  53 #include <net/xfrm.h>
  54 #include <net/checksum.h>
  55 #include <linux/mroute6.h>
  56 #include <net/l3mdev.h>
  57 #include <net/lwtunnel.h>
  58 #include <net/ip_tunnels.h>
  59
  60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  61 {
  62         struct dst_entry *dst = skb_dst(skb);
  63         struct net_device *dev = dst->dev;
  64         struct inet6_dev *idev = ip6_dst_idev(dst);
  65         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  66         const struct in6_addr *daddr, *nexthop;
  67         struct ipv6hdr *hdr;
  68         struct neighbour *neigh;
  69         int ret;
  70
  71         /* Be paranoid, rather than too clever. */
  72         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
  73                 skb = skb_expand_head(skb, hh_len);
  74                 if (!skb) {
  75                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
  76                         return -ENOMEM;
  77                 }
  78         }
  79
  80         hdr = ipv6_hdr(skb);
  81         daddr = &hdr->daddr;
  82         if (ipv6_addr_is_multicast(daddr)) {
  83                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  84                     ((mroute6_is_socket(net, skb) &&
  85                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  86                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
  87                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  88
  89                         /* Do not check for IFF_ALLMULTI; multicast routing
  90                            is not supported in any case.
  91                          */
  92                         if (newskb)
  93                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  94                                         net, sk, newskb, NULL, newskb->dev,
  95                                         dev_loopback_xmit);
  96
  97                         if (hdr->hop_limit == 0) {
  98                                 IP6_INC_STATS(net, idev,
  99                                               IPSTATS_MIB_OUTDISCARDS);
 100                                 kfree_skb(skb);
 101                                 return 0;
 102                         }
 103                 }
 104
 105                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 106                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 107                     !(dev->flags & IFF_LOOPBACK)) {
 108                         kfree_skb(skb);
 109                         return 0;
 110                 }
 111         }
 112
 113         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 114                 int res = lwtunnel_xmit(skb);
 115
 116                 if (res != LWTUNNEL_XMIT_CONTINUE)
 117                         return res;
 118         }
 119
 120         IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 121
 122         rcu_read_lock();
 123         nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
 124         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 125
 126         if (unlikely(IS_ERR_OR_NULL(neigh))) {
 127                 if (unlikely(!neigh))
 128                         neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 129                 if (IS_ERR(neigh)) {
 130                         rcu_read_unlock();
 131                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 132                         kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
 133                         return -EINVAL;
 134                 }
 135         }
 136         sock_confirm_neigh(skb, neigh);
 137         ret = neigh_output(neigh, skb, false);
 138         rcu_read_unlock();
 139         return ret;
 140 }
 141
 142 static int
 143 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 144                                     struct sk_buff *skb, unsigned int mtu)
 145 {
 146         struct sk_buff *segs, *nskb;
 147         netdev_features_t features;
 148         int ret = 0;
 149
 150         /* Please see corresponding comment in ip_finish_output_gso
 151          * describing the cases where GSO segment length exceeds the
 152          * egress MTU.
 153          */
 154         features = netif_skb_features(skb);
 155         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 156         if (IS_ERR_OR_NULL(segs)) {
 157                 kfree_skb(skb);
 158                 return -ENOMEM;
 159         }
 160
 161         consume_skb(skb);
 162
 163         skb_list_walk_safe(segs, segs, nskb) {
 164                 int err;
 165
 166                 skb_mark_not_on_list(segs);
 167                 /* Last GSO segment can be smaller than gso_size (and MTU).
 168                  * Adding a fragment header would produce an "atomic fragment",
 169                  * which is considered harmful (RFC-8021). Avoid that.
 170                  */
 171                 err = segs->len > mtu ?
 172                         ip6_fragment(net, sk, segs, ip6_finish_output2) :
 173                         ip6_finish_output2(net, sk, segs);
 174                 if (err && ret == 0)
 175                         ret = err;
 176         }
 177
 178         return ret;
 179 }
 180
 181 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
 182                                  struct sk_buff *skb, unsigned int mtu)
 183 {
 184         if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
 185             !skb_gso_validate_network_len(skb, mtu))
 186                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 187
 188         return ip6_finish_output2(net, sk, skb);
 189 }
 190
 191 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 192 {
 193         unsigned int mtu;
 194
 195 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 196         /* Policy lookup after SNAT yielded a new policy */
 197         if (skb_dst(skb)->xfrm) {
 198                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 199                 return dst_output(net, sk, skb);
 200         }
 201 #endif
 202
 203         mtu = ip6_skb_dst_mtu(skb);
 204         if (skb_is_gso(skb))
 205                 return ip6_finish_output_gso(net, sk, skb, mtu);
 206
 207         if (skb->len > mtu ||
 208             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 209                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 210
 211         return ip6_finish_output2(net, sk, skb);
 212 }
 213
 214 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 215 {
 216         int ret;
 217
 218         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 219         switch (ret) {
 220         case NET_XMIT_SUCCESS:
 221         case NET_XMIT_CN:
 222                 return __ip6_finish_output(net, sk, skb) ? : ret;
 223         default:
 224                 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
 225                 return ret;
 226         }
 227 }
 228
 229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 230 {
 231         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 232         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 233
 234         skb->protocol = htons(ETH_P_IPV6);
 235         skb->dev = dev;
 236
 237         if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
 238                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 239                 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
 240                 return 0;
 241         }
 242
 243         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 244                             net, sk, skb, indev, dev,
 245                             ip6_finish_output,
 246                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 247 }
 248 EXPORT_SYMBOL(ip6_output);
 249
 250 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
 251 {
 252         if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
 253                 return ip6_default_np_autolabel(net);
 254         return inet6_test_bit(AUTOFLOWLABEL, sk);
 255 }
 256
 257 /*
 258  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 259  * Note : socket lock is not held for SYNACK packets, but might be modified
 260  * by calls to skb_set_owner_w() and ipv6_local_error(),
 261  * which are using proper atomic operations or spinlocks.
 262  */
 263 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 264              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 265 {
 266         struct net *net = sock_net(sk);
 267         const struct ipv6_pinfo *np = inet6_sk(sk);
 268         struct in6_addr *first_hop = &fl6->daddr;
 269         struct dst_entry *dst = skb_dst(skb);
 270         struct net_device *dev = dst->dev;
 271         struct inet6_dev *idev = ip6_dst_idev(dst);
 272         struct hop_jumbo_hdr *hop_jumbo;
 273         int hoplen = sizeof(*hop_jumbo);
 274         unsigned int head_room;
 275         struct ipv6hdr *hdr;
 276         u8  proto = fl6->flowi6_proto;
 277         int seg_len = skb->len;
 278         int hlimit = -1;
 279         u32 mtu;
 280
 281         head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
 282         if (opt)
 283                 head_room += opt->opt_nflen + opt->opt_flen;
 284
 285         if (unlikely(head_room > skb_headroom(skb))) {
 286                 skb = skb_expand_head(skb, head_room);
 287                 if (!skb) {
 288                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 289                         return -ENOBUFS;
 290                 }
 291         }
 292
 293         if (opt) {
 294                 seg_len += opt->opt_nflen + opt->opt_flen;
 295
 296                 if (opt->opt_flen)
 297                         ipv6_push_frag_opts(skb, opt, &proto);
 298
 299                 if (opt->opt_nflen)
 300                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 301                                              &fl6->saddr);
 302         }
 303
 304         if (unlikely(seg_len > IPV6_MAXPLEN)) {
 305                 hop_jumbo = skb_push(skb, hoplen);
 306
 307                 hop_jumbo->nexthdr = proto;
 308                 hop_jumbo->hdrlen = 0;
 309                 hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
 310                 hop_jumbo->tlv_len = 4;
 311                 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
 312
 313                 proto = IPPROTO_HOPOPTS;
 314                 seg_len = 0;
 315                 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
 316         }
 317
 318         skb_push(skb, sizeof(struct ipv6hdr));
 319         skb_reset_network_header(skb);
 320         hdr = ipv6_hdr(skb);
 321
 322         /*
 323          *      Fill in the IPv6 header
 324          */
 325         if (np)
 326                 hlimit = READ_ONCE(np->hop_limit);
 327         if (hlimit < 0)
 328                 hlimit = ip6_dst_hoplimit(dst);
 329
 330         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 331                                 ip6_autoflowlabel(net, sk), fl6));
 332
 333         hdr->payload_len = htons(seg_len);
 334         hdr->nexthdr = proto;
 335         hdr->hop_limit = hlimit;
 336
 337         hdr->saddr = fl6->saddr;
 338         hdr->daddr = *first_hop;
 339
 340         skb->protocol = htons(ETH_P_IPV6);
 341         skb->priority = priority;
 342         skb->mark = mark;
 343
 344         mtu = dst_mtu(dst);
 345         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 346                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
 347
 348                 /* if egress device is enslaved to an L3 master device pass the
 349                  * skb to its handler for processing
 350                  */
 351                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 352                 if (unlikely(!skb))
 353                         return 0;
 354
 355                 /* hooks should never assume socket lock is held.
 356                  * we promote our socket to non const
 357                  */
 358                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 359                                net, (struct sock *)sk, skb, NULL, dev,
 360                                dst_output);
 361         }
 362
 363         skb->dev = dev;
 364         /* ipv6_local_error() does not require socket lock,
 365          * we promote our socket to non const
 366          */
 367         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 368
 369         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 370         kfree_skb(skb);
 371         return -EMSGSIZE;
 372 }
 373 EXPORT_SYMBOL(ip6_xmit);
 374
 375 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 376 {
 377         struct ip6_ra_chain *ra;
 378         struct sock *last = NULL;
 379
 380         read_lock(&ip6_ra_lock);
 381         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 382                 struct sock *sk = ra->sk;
 383                 if (sk && ra->sel == sel &&
 384                     (!sk->sk_bound_dev_if ||
 385                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 386
 387                         if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
 388                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 389                                 continue;
 390                         }
 391                         if (last) {
 392                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 393                                 if (skb2)
 394                                         rawv6_rcv(last, skb2);
 395                         }
 396                         last = sk;
 397                 }
 398         }
 399
 400         if (last) {
 401                 rawv6_rcv(last, skb);
 402                 read_unlock(&ip6_ra_lock);
 403                 return 1;
 404         }
 405         read_unlock(&ip6_ra_lock);
 406         return 0;
 407 }
 408
 409 static int ip6_forward_proxy_check(struct sk_buff *skb)
 410 {
 411         struct ipv6hdr *hdr = ipv6_hdr(skb);
 412         u8 nexthdr = hdr->nexthdr;
 413         __be16 frag_off;
 414         int offset;
 415
 416         if (ipv6_ext_hdr(nexthdr)) {
 417                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 418                 if (offset < 0)
 419                         return 0;
 420         } else
 421                 offset = sizeof(struct ipv6hdr);
 422
 423         if (nexthdr == IPPROTO_ICMPV6) {
 424                 struct icmp6hdr *icmp6;
 425
 426                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 427                                          offset + 1 - skb->data)))
 428                         return 0;
 429
 430                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 431
 432                 switch (icmp6->icmp6_type) {
 433                 case NDISC_ROUTER_SOLICITATION:
 434                 case NDISC_ROUTER_ADVERTISEMENT:
 435                 case NDISC_NEIGHBOUR_SOLICITATION:
 436                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 437                 case NDISC_REDIRECT:
 438                         /* For reaction involving unicast neighbor discovery
 439                          * message destined to the proxied address, pass it to
 440                          * input function.
 441                          */
 442                         return 1;
 443                 default:
 444                         break;
 445                 }
 446         }
 447
 448         /*
 449          * The proxying router can't forward traffic sent to a link-local
 450          * address, so signal the sender and discard the packet. This
 451          * behavior is clarified by the MIPv6 specification.
 452          */
 453         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 454                 dst_link_failure(skb);
 455                 return -1;
 456         }
 457
 458         return 0;
 459 }
 460
 461 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 462                                      struct sk_buff *skb)
 463 {
 464 #ifdef CONFIG_NET_SWITCHDEV
 465         if (skb->offload_l3_fwd_mark) {
 466                 consume_skb(skb);
 467                 return 0;
 468         }
 469 #endif
 470
 471         skb_clear_tstamp(skb);
 472         return dst_output(net, sk, skb);
 473 }
 474
 475 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 476 {
 477         if (skb->len <= mtu)
 478                 return false;
 479
 480         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 481         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 482                 return true;
 483
 484         if (skb->ignore_df)
 485                 return false;
 486
 487         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 488                 return false;
 489
 490         return true;
 491 }
 492
  493 int ip6_forward(struct sk_buff *skb)
  494 {
              /*
               * Forward a received IPv6 packet along the route attached to it
               * (skb_dst()).
               *
               * In order: sanity and policy checks, Router Alert delivery,
               * hop limit check, proxy NDP handling, xfrm route lookup,
               * redirect generation or source address validation, MTU check,
               * and finally the hop limit decrement and the NF_INET_FORWARD
               * hook, which continues in ip6_forward_finish().
               *
               * Returns 0 when a Router Alert listener took the packet, the
               * result of ip6_input() for proxied neighbour discovery
               * messages, the result of the forward hook on success,
               * -ETIMEDOUT when the hop limit is exhausted, -EMSGSIZE when
               * the packet is too big, and -EINVAL for every path that ends
               * at the error/drop labels.
               */
  495         struct dst_entry *dst = skb_dst(skb);
  496         struct ipv6hdr *hdr = ipv6_hdr(skb);
  497         struct inet6_skb_parm *opt = IP6CB(skb);
  498         struct net *net = dev_net(dst->dev);
  499         struct inet6_dev *idev;
  500         SKB_DR(reason);
  501         u32 mtu;
  502
              /* inet6_dev of the interface recorded in IP6CB(skb)->iif; it is
               * the one charged for the In* counters below and may be NULL.
               */
  503         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
              /* Forwarding switched off for the namespace: address error. */
  504         if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
  505                 goto error;
  506
              /* Only packets addressed to this host at link level are forwarded. */
  507         if (skb->pkt_type != PACKET_HOST)
  508                 goto drop;
  509
              /* A packet that is attached to a socket is not forwarded. */
  510         if (unlikely(skb->sk))
  511                 goto drop;
  512
  513         if (skb_warn_if_lro(skb))
  514                 goto drop;
  515
              /* The xfrm forward policy check is skipped when disable_policy
               * is set for the namespace or for the input device.
               */
  516         if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
  517             (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
  518             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
  519                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
  520                 goto drop;
  521         }
  522
  523         skb_forward_csum(skb);
  524
  525         /*
  526          *      We DO NOT make any processing on
  527          *      RA packets, pushing them to user level AS IS
  528          *      without any WARRANTY that application will be able
  529          *      to interpret them. The reason is that we
  530          *      cannot make anything clever here.
  531          *
  532          *      We are not end-node, so that if packet contains
  533          *      AH/ESP, we cannot make anything.
  534          *      Defragmentation also would be mistake, RA packets
  535          *      cannot be fragmented, because there is no warranty
  536          *      that different fragments will go along one path. --ANK
  537          */
  538         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                      /* Non-zero: a listening socket now owns the skb. */
  539                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
  540                         return 0;
  541         }
  542
  543         /*
  544          *      check the hop limit; the decrement itself is done
  545          *      further down, after skb_cow()
  546          */
  546         if (hdr->hop_limit <= 1) {
  547                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
  548                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
  549
  550                 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
  551                 return -ETIMEDOUT;
  552         }
  553
  554         /* XXX: idev->cnf.proxy_ndp? */
  555         if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
  556             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                      /* > 0: hand to local input, < 0: discard, 0: forward. */
  557                 int proxied = ip6_forward_proxy_check(skb);
  558                 if (proxied > 0) {
  559                         /* It's tempting to decrease the hop limit
  560                          * here by 1, as we do at the end of the
  561                          * function too.
  562                          *
  563                          * But that would be incorrect, as proxying is
  564                          * not forwarding.  The ip6_input function
  565                          * will handle this packet locally, and it
  566                          * depends on the hop limit being unchanged.
  567                          *
  568                          * One example is the NDP hop limit, that
  569                          * always has to stay 255, but other would be
  570                          * similar checks around RA packets, where the
  571                          * user can even change the desired limit.
  572                          */
  573                         return ip6_input(skb);
  574                 } else if (proxied < 0) {
  575                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
  576                         goto drop;
  577                 }
  578         }
  579
  580         if (!xfrm6_route_forward(skb)) {
  581                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
  582                 SKB_DR_SET(reason, XFRM_POLICY);
  583                 goto drop;
  584         }
              /* Reload the dst; NOTE(review): presumably because
               * xfrm6_route_forward() may have attached a different one.
               */
  585         dst = skb_dst(skb);
  586
  587         /* IPv6 specs say nothing about it, but it is clear that we cannot
  588            send redirects to source routed frames.
  589            We don't send redirects to frames decapsulated from IPsec.
  590          */
  591         if (IP6CB(skb)->iif == dst->dev->ifindex &&
  592             opt->srcrt == 0 && !skb_sec_path(skb)) {
  593                 struct in6_addr *target = NULL;
  594                 struct inet_peer *peer;
  595                 struct rt6_info *rt;
  596
  597                 /*
  598                  *      incoming and outgoing devices are the same
  599                  *      send a redirect.
  600                  */
  601
                      /* Redirect target: the gateway for gateway routes,
                       * the destination itself otherwise.
                       */
  602                 rt = dst_rt6_info(dst);
  603                 if (rt->rt6i_flags & RTF_GATEWAY)
  604                         target = &rt->rt6i_gateway;
  605                 else
  606                         target = &hdr->daddr;
  607
  608                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
  609
  610                 /* Limit redirects both by destination (here)
  611                    and by source (inside ndisc_send_redirect)
  612                  */
  613                 if (inet_peer_xrlim_allow(peer, 1*HZ))
  614                         ndisc_send_redirect(skb, target);
  615                 if (peer)
  616                         inet_putpeer(peer);
  617         } else {
  618                 int addrtype = ipv6_addr_type(&hdr->saddr);
  619
  620                 /* This check is security critical. */
  621                 if (addrtype == IPV6_ADDR_ANY ||
  622                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
  623                         goto error;
                      /* Link-local sources are refused with an ICMPv6
                       * "not a neighbour" error.
                       */
  624                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
  625                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
  626                                     ICMPV6_NOT_NEIGHBOUR, 0);
  627                         goto error;
  628                 }
  629         }
  630
  631         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
  632
              /* Never work with less than the IPv6 minimum MTU. */
  633         mtu = ip6_dst_mtu_maybe_forward(dst, true);
  634         if (mtu < IPV6_MIN_MTU)
  635                 mtu = IPV6_MIN_MTU;
  636
  637         if (ip6_pkt_too_big(skb, mtu)) {
  638                 /* Again, force OUTPUT device used as source address */
  639                 skb->dev = dst->dev;
  640                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
  641                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
  642                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
  643                                 IPSTATS_MIB_FRAGFAILS);
  644                 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
  645                 return -EMSGSIZE;
  646         }
  647
  648         if (skb_cow(skb, dst->dev->hard_header_len)) {
  649                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
  650                                 IPSTATS_MIB_OUTDISCARDS);
  651                 goto drop;
  652         }
  653
              /* Re-read the header pointer after skb_cow(). */
  654         hdr = ipv6_hdr(skb);
  655
  656         /* Mangling hops number delayed to point after skb COW */
  657
  658         hdr->hop_limit--;
  659
  660         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
  661                        net, NULL, skb, skb->dev, dst->dev,
  662                        ip6_forward_finish);
  663
              /* error: counts an input address error and sets the matching
               * drop reason, then falls through to drop.  drop: frees the
               * skb with whatever reason is current.
               */
  664 error:
  665         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
  666         SKB_DR_SET(reason, IP_INADDRERRORS);
  667 drop:
  668         kfree_skb_reason(skb, reason);
  669         return -EINVAL;
  670 }
 671
 672 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 673 {
 674         to->pkt_type = from->pkt_type;
 675         to->priority = from->priority;
 676         to->protocol = from->protocol;
 677         skb_dst_drop(to);
 678         skb_dst_set(to, dst_clone(skb_dst(from)));
 679         to->dev = from->dev;
 680         to->mark = from->mark;
 681
 682         skb_copy_hash(to, from);
 683
 684 #ifdef CONFIG_NET_SCHED
 685         to->tc_index = from->tc_index;
 686 #endif
 687         nf_copy(to, from);
 688         skb_ext_copy(to, from);
 689         skb_copy_secmark(to, from);
 690 }
 691
  692 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
  693                       u8 nexthdr, __be32 frag_id,
  694                       struct ip6_fraglist_iter *iter)
  695 {
              /*
               * Turn @skb into the first fragment of a frag_list based
               * fragmentation and initialise @iter for the remaining ones.
               *
               * @hlen:    number of header bytes, counted from the network
               *           header, that are replicated in front of every
               *           fragment
               * @prevhdr: location of the "next header" byte that is
               *           rewritten to NEXTHDR_FRAGMENT
               * @nexthdr: value for the fragment header's nexthdr field
               * @frag_id: identification written into every fragment header
               *
               * Returns 0 on success, or -ENOMEM when the header copy cannot
               * be allocated (*prevhdr has already been rewritten by then).
               * On success iter->tmp_hdr holds a kmemdup()ed copy of the
               * headers.  NOTE(review): nothing in this function frees it,
               * so releasing it is presumably up to the caller.
               */
  696         unsigned int first_len;
  697         struct frag_hdr *fh;
  698
  699         /* BUILD HEADER */
  700         *prevhdr = NEXTHDR_FRAGMENT;
              /* Taken after the rewrite above; NOTE(review): assuming
               * @prevhdr points into these @hlen bytes, the saved copy
               * already announces a fragment header.
               */
  701         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
  702         if (!iter->tmp_hdr)
  703                 return -ENOMEM;
  704
              /* Detach the frag_list: its members become the later fragments. */
  705         iter->frag = skb_shinfo(skb)->frag_list;
  706         skb_frag_list_init(skb);
  707
  708         iter->offset = 0;
  709         iter->hlen = hlen;
  710         iter->frag_id = frag_id;
  711         iter->nexthdr = nexthdr;
  712
              /* Make room for the fragment header between the @hlen header
               * bytes and the payload: drop the headers, push the fragment
               * header, push @hlen bytes again and restore the headers from
               * the saved copy.
               */
  713         __skb_pull(skb, hlen);
  714         fh = __skb_push(skb, sizeof(struct frag_hdr));
  715         __skb_push(skb, hlen);
  716         skb_reset_network_header(skb);
  717         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
  718
              /* First fragment: offset 0 with "more fragments" set. */
  719         fh->nexthdr = nexthdr;
  720         fh->reserved = 0;
  721         fh->frag_off = htons(IP6_MF);
  722         fh->identification = frag_id;
  723
              /* Recompute the lengths now that the frag_list is detached. */
  724         first_len = skb_pagelen(skb);
  725         skb->data_len = first_len - skb_headlen(skb);
  726         skb->len = first_len;
  727         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
  728
  729         return 0;
  730 }
  731 EXPORT_SYMBOL(ip6_fraglist_init);
 732
  733 void ip6_fraglist_prepare(struct sk_buff *skb,
  734                           struct ip6_fraglist_iter *iter)
  735 {
              /*
               * Prepare iter->frag, the next member of the detached
               * frag_list, for transmission as a fragment.
               *
               * @skb is the fragment that precedes it: its payload length
               * advances the running offset and its metadata is copied onto
               * the new fragment.  The headers saved in iter->tmp_hdr and a
               * fragment header are pushed in front of the data of
               * iter->frag.
               */
  736         struct sk_buff *frag = iter->frag;
  737         unsigned int hlen = iter->hlen;
  738         struct frag_hdr *fh;
  739
  740         frag->ip_summed = CHECKSUM_NONE;
              /* The current data start becomes the transport header. */
  741         skb_reset_transport_header(frag);
  742         fh = __skb_push(frag, sizeof(struct frag_hdr));
  743         __skb_push(frag, hlen);
  744         skb_reset_network_header(frag);
  745         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
              /* Advance by the payload carried by the previous fragment
               * (its length minus the replicated headers and the fragment
               * header).
               */
  746         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
  747         fh->nexthdr = iter->nexthdr;
  748         fh->reserved = 0;
  749         fh->frag_off = htons(iter->offset);
              /* More list members follow, so this is not the last fragment. */
  750         if (frag->next)
  751                 fh->frag_off |= htons(IP6_MF);
  752         fh->identification = iter->frag_id;
  753         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
  754         ip6_copy_metadata(frag, skb);
  755 }
  756 EXPORT_SYMBOL(ip6_fraglist_prepare);
 757
 758 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 759                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 760                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 761 {
 762         state->prevhdr = prevhdr;
 763         state->nexthdr = nexthdr;
 764         state->frag_id = frag_id;
 765
 766         state->hlen = hlen;
 767         state->mtu = mtu;
 768
 769         state->left = skb->len - hlen;  /* Space per frame */
 770         state->ptr = hlen;              /* Where to start from */
 771
 772         state->hroom = hdr_room;
 773         state->troom = needed_tailroom;
 774
 775         state->offset = 0;
 776 }
 777 EXPORT_SYMBOL(ip6_frag_init);
 778
  779 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
  780 {
              /*
               * Build the next fragment of @skb in a newly allocated buffer.
               *
               * Copies the first state->hlen bytes of @skb as headers, adds
               * a fragment header and copies up to state->mtu bytes of data
               * starting at state->ptr, then advances @state.  Returns the
               * new fragment, or ERR_PTR(-ENOMEM) when allocation fails
               * (@state is left untouched in that case).
               *
               * NOTE(review): state->mtu is used here as the limit for the
               * data bytes per fragment; the caller has presumably already
               * subtracted the header sizes from the link MTU.
               */
  781         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
  782         struct sk_buff *frag;
  783         struct frag_hdr *fh;
  784         unsigned int len;
  785
  786         len = state->left;
  787         /* IF: it doesn't fit, use 'mtu' - the data space left */
  788         if (len > state->mtu)
  789                 len = state->mtu;
  790         /* IF: we are not sending up to and including the packet end
  791            then align the next start on an eight byte boundary */
  792         if (len < state->left)
  793                 len &= ~7;
  794
  795         /* Allocate buffer */
  796         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
  797                          state->hroom + state->troom, GFP_ATOMIC);
  798         if (!frag)
  799                 return ERR_PTR(-ENOMEM);
  800
  801         /*
  802          *      Set up data on packet
  803          */
  804
  805         ip6_copy_metadata(frag, skb);
  806         skb_reserve(frag, state->hroom);
  807         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
  808         skb_reset_network_header(frag);
              /* Layout: [hlen bytes of headers][fragment header][len bytes] */
  809         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
  810         frag->transport_header = (frag->network_header + state->hlen +
  811                                   sizeof(struct frag_hdr));
  812
  813         /*
  814          *      Charge the memory for the fragment to any owner
  815          *      it might possess
  816          */
  817         if (skb->sk)
  818                 skb_set_owner_w(frag, skb->sk);
  819
  820         /*
  821          *      Copy the packet header into the new buffer.
  822          */
  823         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
  824
              /* Rewrite, in the copy, the "next header" byte found at the
               * same offset as @prevhdr has in the original headers.
               */
  825         fragnexthdr_offset = skb_network_header(frag);
  826         fragnexthdr_offset += prevhdr - skb_network_header(skb);
  827         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
  828
  829         /*
  830          *      Build fragment header.
  831          */
  832         fh->nexthdr = state->nexthdr;
  833         fh->reserved = 0;
  834         fh->identification = state->frag_id;
  835
  836         /*
  837          *      Copy a block of the IP datagram.
  838          */
              /* The copy is done inside BUG_ON(); a failing copy is fatal. */
  839         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
  840                              len));
  841         state->left -= len;
  842
              /* "More fragments" is set for all but the last fragment. */
  843         fh->frag_off = htons(state->offset);
  844         if (state->left > 0)
  845                 fh->frag_off |= htons(IP6_MF);
  846         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
  847
  848         state->ptr += len;
  849         state->offset += len;
  850
  851         return frag;
  852 }
  853 EXPORT_SYMBOL(ip6_frag_next);
 854
/**
 *      ip6_fragment - split an IPv6 packet into fragments and send them
 *      @net: network namespace; handed to @output and used for MIB counters
 *      @sk: socket handed to @output
 *      @skb: packet to fragment
 *      @output: callback invoked once for every fragment
 *
 *      The MTU is taken from ip6_skb_dst_mtu() and further limited by
 *      IP6CB(skb)->frag_max_size and by the owning socket's frag_size
 *      when those are set. If @skb carries a frag list whose geometry is
 *      suitable, the list members are sent as fragments without copying
 *      (fast path); otherwise every fragment is built by ip6_frag_next()
 *      in a newly allocated skb (slow path).
 *
 *      It returns 0 when every fragment was handed to @output
 *      successfully, -EMSGSIZE (after sending an ICMPv6 Packet Too Big)
 *      when the packet may not be fragmented or the MTU is too small,
 *      or the error reported by the failing step.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
        /* Socket settings are consulted only when the skb has a socket and
         * dev_recursion_level() is zero.
         */
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        bool mono_delivery_time = skb->mono_delivery_time;
        struct ip6_frag_state state;
        unsigned int mtu, hlen, nexthdr_offset;
        ktime_t tstamp = skb->tstamp;
        int hroom, err = 0;
        __be32 frag_id;
        u8 *prevhdr, nexthdr = 0;

        /* On success the return value is used as hlen, the number of header
         * bytes replicated in front of every fragment; prevhdr points at
         * the next-header byte that will be rewritten.
         */
        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        /* Remember prevhdr as an offset so it can be recomputed below. */
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np) {
                u32 frag_size = READ_ONCE(np->frag_size);

                if (frag_size && frag_size < mtu)
                        mtu = frag_size;
        }
        /* Each fragment needs the replicated headers, a fragment header and
         * at least 8 bytes of data.
         */
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        /* From here on mtu is the data budget of a single fragment. */
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        /* Recompute prevhdr from the saved offset.
         * NOTE(review): presumably needed because skb_checksum_help() may
         * move the header - confirm.
         */
        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct ip6_fraglist_iter iter;
                struct sk_buff *frag2;

                /* The head skb must fit the budget, end on an 8-byte
                 * boundary, be unshared and have head room for the
                 * fragment header; otherwise copy.
                 */
                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        /* Attribute the list member to the head's socket and
                         * take its truesize out of the head skb.
                         */
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
                                        &iter);
                if (err < 0)
                        goto fail;

                /* We prevent @rt from being freed. */
                rcu_read_lock();

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (iter.frag)
                                ip6_fraglist_prepare(skb, &iter);

                        skb_set_delivery_time(skb, tstamp, mono_delivery_time);
                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !iter.frag)
                                break;

                        skb = ip6_fraglist_next(&iter);
                }

                kfree(iter.tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        rcu_read_unlock();
                        return 0;
                }

                /* Drop the list members that were never handed to output. */
                kfree_skb_list(iter.frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                rcu_read_unlock();
                return err;

slow_path_clean:
                /* Undo the socket/truesize changes made to the list members
                 * visited before the one that failed the checks.
                 */
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         *      Fragment the datagram.
         */

        ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
                      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
                      &state);

        /*
         *      Keep copying data until we run out.
         */

        while (state.left > 0) {
                frag = ip6_frag_next(skb, &state);
                if (IS_ERR(frag)) {
                        err = PTR_ERR(frag);
                        goto fail;
                }

                /*
                 *      Put this fragment into the sending queue.
                 */
                skb_set_delivery_time(frag, tstamp, mono_delivery_time);
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        /* All data was copied into the fragments; release the original. */
        consume_skb(skb);
        return err;

fail_toobig:
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
1042
1043 static inline int ip6_rt_check(const struct rt6key *rt_key,
1044                                const struct in6_addr *fl_addr,
1045                                const struct in6_addr *addr_cache)
1046 {
1047         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1048                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1049 }
1050
1051 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1052                                           struct dst_entry *dst,
1053                                           const struct flowi6 *fl6)
1054 {
1055         struct ipv6_pinfo *np = inet6_sk(sk);
1056         struct rt6_info *rt;
1057
1058         if (!dst)
1059                 goto out;
1060
1061         if (dst->ops->family != AF_INET6) {
1062                 dst_release(dst);
1063                 return NULL;
1064         }
1065
1066         rt = dst_rt6_info(dst);
1067         /* Yes, checking route validity in not connected
1068          * case is not very simple. Take into account,
1069          * that we do not support routing by source, TOS,
1070          * and MSG_DONTROUTE            --ANK (980726)
1071          *
1072          * 1. ip6_rt_check(): If route was host route,
1073          *    check that cached destination is current.
1074          *    If it is network route, we still may
1075          *    check its validity using saved pointer
1076          *    to the last used address: daddr_cache.
1077          *    We do not want to save whole address now,
1078          *    (because main consumer of this service
1079          *    is tcp, which has not this problem),
1080          *    so that the last trick works only on connected
1081          *    sockets.
1082          * 2. oif also should be the same.
1083          */
1084         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1085 #ifdef CONFIG_IPV6_SUBTREES
1086             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1087 #endif
1088            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1089                 dst_release(dst);
1090                 dst = NULL;
1091         }
1092
1093 out:
1094         return dst;
1095 }
1096
/*
 * Common tail of the ip6_dst_lookup*() helpers: resolve a route for @fl6,
 * choosing a source address first when the flow does not carry one.
 *
 * @net: network namespace used for the lookups
 * @sk:  socket passed to the route lookups; may be NULL (its srcprefs are
 *       read only when it is set)
 * @dst: in/out slot; a lookup is performed only while *@dst is NULL
 * @fl6: flow to look up; fl6->saddr may be filled in here
 *
 * Returns 0 with *@dst set on success. On failure the dst is released,
 * *@dst is set to NULL and the error code is returned.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                /* Only a dst without an error is used to pick the source. */
                rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        /* No dst yet: none was supplied, or the first result above was
         * discarded. Look one up, now with the source address known.
         */
        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         *
         * Note: the test below is "neighbour exists and its state is
         * not NUD_VALID", which is what stands in for INCOMPLETE here.
         */
        rt = dst_rt6_info(*dst);
        rcu_read_lock();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock();

        /* err is only a local marker at this point; it is overwritten or
         * left unused unless the gateway lookup below fails.
         */
        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        /* Same flow, but with the destination zeroed. */
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        /* A v4-mapped source is accepted only together with a v4-mapped
         * or unspecified destination.
         */
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
1212
1213 /**
1214  *      ip6_dst_lookup - perform route lookup on flow
1215  *      @net: Network namespace to perform lookup in
1216  *      @sk: socket which provides route info
1217  *      @dst: pointer to dst_entry * for result
1218  *      @fl6: flow to lookup
1219  *
1220  *      This function performs a route lookup on the given flow.
1221  *
1222  *      It returns zero on success, or a standard errno code on error.
1223  */
1224 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1225                    struct flowi6 *fl6)
1226 {
1227         *dst = NULL;
1228         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1229 }
1230 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1231
1232 /**
1233  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1234  *      @net: Network namespace to perform lookup in
1235  *      @sk: socket which provides route info
1236  *      @fl6: flow to lookup
1237  *      @final_dst: final destination address for ipsec lookup
1238  *
1239  *      This function performs a route lookup on the given flow.
1240  *
1241  *      It returns a valid dst pointer on success, or a pointer encoded
1242  *      error code.
1243  */
1244 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1245                                       const struct in6_addr *final_dst)
1246 {
1247         struct dst_entry *dst = NULL;
1248         int err;
1249
1250         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1251         if (err)
1252                 return ERR_PTR(err);
1253         if (final_dst)
1254                 fl6->daddr = *final_dst;
1255
1256         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1257 }
1258 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1259
1260 /**
1261  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1262  *      @sk: socket which provides the dst cache and route info
1263  *      @fl6: flow to lookup
1264  *      @final_dst: final destination address for ipsec lookup
1265  *      @connected: whether @sk is connected or not
1266  *
1267  *      This function performs a route lookup on the given flow with the
1268  *      possibility of using the cached route in the socket if it is valid.
1269  *      It will take the socket dst lock when operating on the dst cache.
1270  *      As a result, this function can only be used in process context.
1271  *
1272  *      In addition, for a connected socket, cache the dst in the socket
1273  *      if the current cache is not valid.
1274  *
1275  *      It returns a valid dst pointer on success, or a pointer encoded
1276  *      error code.
1277  */
1278 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1279                                          const struct in6_addr *final_dst,
1280                                          bool connected)
1281 {
1282         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1283
1284         dst = ip6_sk_dst_check(sk, dst, fl6);
1285         if (dst)
1286                 return dst;
1287
1288         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1289         if (connected && !IS_ERR(dst))
1290                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1291
1292         return dst;
1293 }
1294 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1295
1296 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1297                                                gfp_t gfp)
1298 {
1299         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1300 }
1301
1302 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1303                                                 gfp_t gfp)
1304 {
1305         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1306 }
1307
1308 static void ip6_append_data_mtu(unsigned int *mtu,
1309                                 int *maxfraglen,
1310                                 unsigned int fragheaderlen,
1311                                 struct sk_buff *skb,
1312                                 struct rt6_info *rt,
1313                                 unsigned int orig_mtu)
1314 {
1315         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1316                 if (!skb) {
1317                         /* first fragment, reserve header_len */
1318                         *mtu = orig_mtu - rt->dst.header_len;
1319
1320                 } else {
1321                         /*
1322                          * this fragment is not first, the headers
1323                          * space is regarded as data space.
1324                          */
1325                         *mtu = orig_mtu;
1326                 }
1327                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1328                               + fragheaderlen - sizeof(struct frag_hdr);
1329         }
1330 }
1331
1332 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1333                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1334                           struct rt6_info *rt)
1335 {
1336         struct ipv6_pinfo *np = inet6_sk(sk);
1337         unsigned int mtu, frag_size;
1338         struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1339
1340         /* callers pass dst together with a reference, set it first so
1341          * ip6_cork_release() can put it down even in case of an error.
1342          */
1343         cork->base.dst = &rt->dst;
1344
1345         /*
1346          * setup for corking
1347          */
1348         if (opt) {
1349                 if (WARN_ON(v6_cork->opt))
1350                         return -EINVAL;
1351
1352                 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1353                 if (unlikely(!nopt))
1354                         return -ENOBUFS;
1355
1356                 nopt->tot_len = sizeof(*opt);
1357                 nopt->opt_flen = opt->opt_flen;
1358                 nopt->opt_nflen = opt->opt_nflen;
1359
1360                 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1361                 if (opt->dst0opt && !nopt->dst0opt)
1362                         return -ENOBUFS;
1363
1364                 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1365                 if (opt->dst1opt && !nopt->dst1opt)
1366                         return -ENOBUFS;
1367
1368                 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1369                 if (opt->hopopt && !nopt->hopopt)
1370                         return -ENOBUFS;
1371
1372                 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1373                 if (opt->srcrt && !nopt->srcrt)
1374                         return -ENOBUFS;
1375
1376                 /* need source address above miyazawa*/
1377         }
1378         v6_cork->hop_limit = ipc6->hlimit;
1379         v6_cork->tclass = ipc6->tclass;
1380         if (rt->dst.flags & DST_XFRM_TUNNEL)
1381                 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1382                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1383         else
1384                 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1385                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1386
1387         frag_size = READ_ONCE(np->frag_size);
1388         if (frag_size && frag_size < mtu)
1389                 mtu = frag_size;
1390
1391         cork->base.fragsize = mtu;
1392         cork->base.gso_size = ipc6->gso_size;
1393         cork->base.tx_flags = 0;
1394         cork->base.mark = ipc6->sockc.mark;
1395         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1396
1397         cork->base.length = 0;
1398         cork->base.transmit_time = ipc6->sockc.transmit_time;
1399
1400         return 0;
1401 }
1402
1403 static int __ip6_append_data(struct sock *sk,
1404                              struct sk_buff_head *queue,
1405                              struct inet_cork_full *cork_full,
1406                              struct inet6_cork *v6_cork,
1407                              struct page_frag *pfrag,
1408                              int getfrag(void *from, char *to, int offset,
1409                                          int len, int odd, struct sk_buff *skb),
1410                              void *from, size_t length, int transhdrlen,
1411                              unsigned int flags, struct ipcm6_cookie *ipc6)
1412 {
1413         struct sk_buff *skb, *skb_prev = NULL;
1414         struct inet_cork *cork = &cork_full->base;
1415         struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1416         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1417         struct ubuf_info *uarg = NULL;
1418         int exthdrlen = 0;
1419         int dst_exthdrlen = 0;
1420         int hh_len;
1421         int copy;
1422         int err;
1423         int offset = 0;
1424         bool zc = false;
1425         u32 tskey = 0;
1426         struct rt6_info *rt = dst_rt6_info(cork->dst);
1427         bool paged, hold_tskey, extra_uref = false;
1428         struct ipv6_txoptions *opt = v6_cork->opt;
1429         int csummode = CHECKSUM_NONE;
1430         unsigned int maxnonfragsize, headersize;
1431         unsigned int wmem_alloc_delta = 0;
1432
1433         skb = skb_peek_tail(queue);
1434         if (!skb) {
1435                 exthdrlen = opt ? opt->opt_flen : 0;
1436                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1437         }
1438
1439         paged = !!cork->gso_size;
1440         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1441         orig_mtu = mtu;
1442
1443         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1444
1445         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1446                         (opt ? opt->opt_nflen : 0);
1447
1448         headersize = sizeof(struct ipv6hdr) +
1449                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1450                      rt->rt6i_nfheader_len;
1451
1452         if (mtu <= fragheaderlen ||
1453             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1454                 goto emsgsize;
1455
1456         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1457                      sizeof(struct frag_hdr);
1458
1459         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1460          * the first fragment
1461          */
1462         if (headersize + transhdrlen > mtu)
1463                 goto emsgsize;
1464
1465         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1466             (sk->sk_protocol == IPPROTO_UDP ||
1467              sk->sk_protocol == IPPROTO_ICMPV6 ||
1468              sk->sk_protocol == IPPROTO_RAW)) {
1469                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1470                                 sizeof(struct ipv6hdr));
1471                 goto emsgsize;
1472         }
1473
1474         if (ip6_sk_ignore_df(sk))
1475                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1476         else
1477                 maxnonfragsize = mtu;
1478
1479         if (cork->length + length > maxnonfragsize - headersize) {
1480 emsgsize:
1481                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1482                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1483                 return -EMSGSIZE;
1484         }
1485
1486         /* CHECKSUM_PARTIAL only with no extension headers and when
1487          * we are not going to fragment
1488          */
1489         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1490             headersize == sizeof(struct ipv6hdr) &&
1491             length <= mtu - headersize &&
1492             (!(flags & MSG_MORE) || cork->gso_size) &&
1493             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1494                 csummode = CHECKSUM_PARTIAL;
1495
1496         if ((flags & MSG_ZEROCOPY) && length) {
1497                 struct msghdr *msg = from;
1498
1499                 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1500                         if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1501                                 return -EINVAL;
1502
1503                         /* Leave uarg NULL if can't zerocopy, callers should
1504                          * be able to handle it.
1505                          */
1506                         if ((rt->dst.dev->features & NETIF_F_SG) &&
1507                             csummode == CHECKSUM_PARTIAL) {
1508                                 paged = true;
1509                                 zc = true;
1510                                 uarg = msg->msg_ubuf;
1511                         }
1512                 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1513                         uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1514                         if (!uarg)
1515                                 return -ENOBUFS;
1516                         extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1517                         if (rt->dst.dev->features & NETIF_F_SG &&
1518                             csummode == CHECKSUM_PARTIAL) {
1519                                 paged = true;
1520                                 zc = true;
1521                         } else {
1522                                 uarg_to_msgzc(uarg)->zerocopy = 0;
1523                                 skb_zcopy_set(skb, uarg, &extra_uref);
1524                         }
1525                 }
1526         } else if ((flags & MSG_SPLICE_PAGES) && length) {
1527                 if (inet_test_bit(HDRINCL, sk))
1528                         return -EPERM;
1529                 if (rt->dst.dev->features & NETIF_F_SG &&
1530                     getfrag == ip_generic_getfrag)
1531                         /* We need an empty buffer to attach stuff to */
1532                         paged = true;
1533                 else
1534                         flags &= ~MSG_SPLICE_PAGES;
1535         }
1536
1537         hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
1538                      READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
1539         if (hold_tskey)
1540                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1541
1542         /*
1543          * Let's try using as much space as possible.
1544          * Use MTU if total length of the message fits into the MTU.
1545          * Otherwise, we need to reserve fragment header and
1546          * fragment alignment (= 8-15 octects, in total).
1547          *
1548          * Note that we may need to "move" the data from the tail
1549          * of the buffer to the new fragment when we split
1550          * the message.
1551          *
1552          * FIXME: It may be fragmented into multiple chunks
1553          *        at once if non-fragmentable extension headers
1554          *        are too large.
1555          * --yoshfuji
1556          */
1557
1558         cork->length += length;
1559         if (!skb)
1560                 goto alloc_new_skb;
1561
1562         while (length > 0) {
1563                 /* Check if the remaining data fits into current packet. */
1564                 copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1565                 if (copy < length)
1566                         copy = maxfraglen - skb->len;
1567
1568                 if (copy <= 0) {
1569                         char *data;
1570                         unsigned int datalen;
1571                         unsigned int fraglen;
1572                         unsigned int fraggap;
1573                         unsigned int alloclen, alloc_extra;
1574                         unsigned int pagedlen;
1575 alloc_new_skb:
1576                         /* There's no room in the current skb */
1577                         if (skb)
1578                                 fraggap = skb->len - maxfraglen;
1579                         else
1580                                 fraggap = 0;
1581                         /* update mtu and maxfraglen if necessary */
1582                         if (!skb || !skb_prev)
1583                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1584                                                     fragheaderlen, skb, rt,
1585                                                     orig_mtu);
1586
1587                         skb_prev = skb;
1588
1589                         /*
1590                          * If remaining data exceeds the mtu,
1591                          * we know we need more fragment(s).
1592                          */
1593                         datalen = length + fraggap;
1594
1595                         if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1596                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1597                         fraglen = datalen + fragheaderlen;
1598                         pagedlen = 0;
1599
1600                         alloc_extra = hh_len;
1601                         alloc_extra += dst_exthdrlen;
1602                         alloc_extra += rt->dst.trailer_len;
1603
1604                         /* We just reserve space for fragment header.
1605                          * Note: this may be overallocation if the message
1606                          * (without MSG_MORE) fits into the MTU.
1607                          */
1608                         alloc_extra += sizeof(struct frag_hdr);
1609
1610                         if ((flags & MSG_MORE) &&
1611                             !(rt->dst.dev->features&NETIF_F_SG))
1612                                 alloclen = mtu;
1613                         else if (!paged &&
1614                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1615                                   !(rt->dst.dev->features & NETIF_F_SG)))
1616                                 alloclen = fraglen;
1617                         else {
1618                                 alloclen = fragheaderlen + transhdrlen;
1619                                 pagedlen = datalen - transhdrlen;
1620                         }
1621                         alloclen += alloc_extra;
1622
1623                         if (datalen != length + fraggap) {
1624                                 /*
1625                                  * this is not the last fragment, the trailer
1626                                  * space is regarded as data space.
1627                                  */
1628                                 datalen += rt->dst.trailer_len;
1629                         }
1630
1631                         fraglen = datalen + fragheaderlen;
1632
1633                         copy = datalen - transhdrlen - fraggap - pagedlen;
1634                         /* [!] NOTE: copy may be negative if pagedlen>0
1635                          * because then the equation may reduces to -fraggap.
1636                          */
1637                         if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1638                                 err = -EINVAL;
1639                                 goto error;
1640                         }
1641                         if (transhdrlen) {
1642                                 skb = sock_alloc_send_skb(sk, alloclen,
1643                                                 (flags & MSG_DONTWAIT), &err);
1644                         } else {
1645                                 skb = NULL;
1646                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1647                                     2 * sk->sk_sndbuf)
1648                                         skb = alloc_skb(alloclen,
1649                                                         sk->sk_allocation);
1650                                 if (unlikely(!skb))
1651                                         err = -ENOBUFS;
1652                         }
1653                         if (!skb)
1654                                 goto error;
1655                         /*
1656                          *      Fill in the control structures
1657                          */
1658                         skb->protocol = htons(ETH_P_IPV6);
1659                         skb->ip_summed = csummode;
1660                         skb->csum = 0;
1661                         /* reserve for fragmentation and ipsec header */
1662                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1663                                     dst_exthdrlen);
1664
1665                         /*
1666                          *      Find where to start putting bytes
1667                          */
1668                         data = skb_put(skb, fraglen - pagedlen);
1669                         skb_set_network_header(skb, exthdrlen);
1670                         data += fragheaderlen;
1671                         skb->transport_header = (skb->network_header +
1672                                                  fragheaderlen);
1673                         if (fraggap) {
1674                                 skb->csum = skb_copy_and_csum_bits(
1675                                         skb_prev, maxfraglen,
1676                                         data + transhdrlen, fraggap);
1677                                 skb_prev->csum = csum_sub(skb_prev->csum,
1678                                                           skb->csum);
1679                                 data += fraggap;
1680                                 pskb_trim_unique(skb_prev, maxfraglen);
1681                         }
1682                         if (copy > 0 &&
1683                             getfrag(from, data + transhdrlen, offset,
1684                                     copy, fraggap, skb) < 0) {
1685                                 err = -EFAULT;
1686                                 kfree_skb(skb);
1687                                 goto error;
1688                         } else if (flags & MSG_SPLICE_PAGES) {
1689                                 copy = 0;
1690                         }
1691
1692                         offset += copy;
1693                         length -= copy + transhdrlen;
1694                         transhdrlen = 0;
1695                         exthdrlen = 0;
1696                         dst_exthdrlen = 0;
1697
1698                         /* Only the initial fragment is time stamped */
1699                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1700                         cork->tx_flags = 0;
1701                         skb_shinfo(skb)->tskey = tskey;
1702                         tskey = 0;
1703                         skb_zcopy_set(skb, uarg, &extra_uref);
1704
1705                         if ((flags & MSG_CONFIRM) && !skb_prev)
1706                                 skb_set_dst_pending_confirm(skb, 1);
1707
1708                         /*
1709                          * Put the packet on the pending queue
1710                          */
1711                         if (!skb->destructor) {
1712                                 skb->destructor = sock_wfree;
1713                                 skb->sk = sk;
1714                                 wmem_alloc_delta += skb->truesize;
1715                         }
1716                         __skb_queue_tail(queue, skb);
1717                         continue;
1718                 }
1719
1720                 if (copy > length)
1721                         copy = length;
1722
1723                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1724                     skb_tailroom(skb) >= copy) {
1725                         unsigned int off;
1726
1727                         off = skb->len;
1728                         if (getfrag(from, skb_put(skb, copy),
1729                                                 offset, copy, off, skb) < 0) {
1730                                 __skb_trim(skb, off);
1731                                 err = -EFAULT;
1732                                 goto error;
1733                         }
1734                 } else if (flags & MSG_SPLICE_PAGES) {
1735                         struct msghdr *msg = from;
1736
1737                         err = -EIO;
1738                         if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1739                                 goto error;
1740
1741                         err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1742                                                    sk->sk_allocation);
1743                         if (err < 0)
1744                                 goto error;
1745                         copy = err;
1746                         wmem_alloc_delta += copy;
1747                 } else if (!zc) {
1748                         int i = skb_shinfo(skb)->nr_frags;
1749
1750                         err = -ENOMEM;
1751                         if (!sk_page_frag_refill(sk, pfrag))
1752                                 goto error;
1753
1754                         skb_zcopy_downgrade_managed(skb);
1755                         if (!skb_can_coalesce(skb, i, pfrag->page,
1756                                               pfrag->offset)) {
1757                                 err = -EMSGSIZE;
1758                                 if (i == MAX_SKB_FRAGS)
1759                                         goto error;
1760
1761                                 __skb_fill_page_desc(skb, i, pfrag->page,
1762                                                      pfrag->offset, 0);
1763                                 skb_shinfo(skb)->nr_frags = ++i;
1764                                 get_page(pfrag->page);
1765                         }
1766                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1767                         if (getfrag(from,
1768                                     page_address(pfrag->page) + pfrag->offset,
1769                                     offset, copy, skb->len, skb) < 0)
1770                                 goto error_efault;
1771
1772                         pfrag->offset += copy;
1773                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1774                         skb->len += copy;
1775                         skb->data_len += copy;
1776                         skb->truesize += copy;
1777                         wmem_alloc_delta += copy;
1778                 } else {
1779                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1780                         if (err < 0)
1781                                 goto error;
1782                 }
1783                 offset += copy;
1784                 length -= copy;
1785         }
1786
1787         if (wmem_alloc_delta)
1788                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1789         return 0;
1790
1791 error_efault:
1792         err = -EFAULT;
1793 error:
1794         net_zcopy_put_abort(uarg, extra_uref);
1795         cork->length -= length;
1796         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1797         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1798         if (hold_tskey)
1799                 atomic_dec(&sk->sk_tskey);
1800         return err;
1801 }
1802
1803 int ip6_append_data(struct sock *sk,
1804                     int getfrag(void *from, char *to, int offset, int len,
1805                                 int odd, struct sk_buff *skb),
1806                     void *from, size_t length, int transhdrlen,
1807                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1808                     struct rt6_info *rt, unsigned int flags)
1809 {
1810         struct inet_sock *inet = inet_sk(sk);
1811         struct ipv6_pinfo *np = inet6_sk(sk);
1812         int exthdrlen;
1813         int err;
1814
1815         if (flags&MSG_PROBE)
1816                 return 0;
1817         if (skb_queue_empty(&sk->sk_write_queue)) {
1818                 /*
1819                  * setup for corking
1820                  */
1821                 dst_hold(&rt->dst);
1822                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1823                                      ipc6, rt);
1824                 if (err)
1825                         return err;
1826
1827                 inet->cork.fl.u.ip6 = *fl6;
1828                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1829                 length += exthdrlen;
1830                 transhdrlen += exthdrlen;
1831         } else {
1832                 transhdrlen = 0;
1833         }
1834
1835         return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1836                                  &np->cork, sk_page_frag(sk), getfrag,
1837                                  from, length, transhdrlen, flags, ipc6);
1838 }
1839 EXPORT_SYMBOL_GPL(ip6_append_data);
1840
1841 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1842 {
1843         struct dst_entry *dst = cork->base.dst;
1844
1845         cork->base.dst = NULL;
1846         skb_dst_set(skb, dst);
1847 }
1848
1849 static void ip6_cork_release(struct inet_cork_full *cork,
1850                              struct inet6_cork *v6_cork)
1851 {
1852         if (v6_cork->opt) {
1853                 struct ipv6_txoptions *opt = v6_cork->opt;
1854
1855                 kfree(opt->dst0opt);
1856                 kfree(opt->dst1opt);
1857                 kfree(opt->hopopt);
1858                 kfree(opt->srcrt);
1859                 kfree(opt);
1860                 v6_cork->opt = NULL;
1861         }
1862
1863         if (cork->base.dst) {
1864                 dst_release(cork->base.dst);
1865                 cork->base.dst = NULL;
1866         }
1867 }
1868
1869 struct sk_buff *__ip6_make_skb(struct sock *sk,
1870                                struct sk_buff_head *queue,
1871                                struct inet_cork_full *cork,
1872                                struct inet6_cork *v6_cork)
1873 {
1874         struct sk_buff *skb, *tmp_skb;
1875         struct sk_buff **tail_skb;
1876         struct in6_addr *final_dst;
1877         struct net *net = sock_net(sk);
1878         struct ipv6hdr *hdr;
1879         struct ipv6_txoptions *opt = v6_cork->opt;
1880         struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1881         struct flowi6 *fl6 = &cork->fl.u.ip6;
1882         unsigned char proto = fl6->flowi6_proto;
1883
1884         skb = __skb_dequeue(queue);
1885         if (!skb)
1886                 goto out;
1887         tail_skb = &(skb_shinfo(skb)->frag_list);
1888
1889         /* move skb->data to ip header from ext header */
1890         if (skb->data < skb_network_header(skb))
1891                 __skb_pull(skb, skb_network_offset(skb));
1892         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1893                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1894                 *tail_skb = tmp_skb;
1895                 tail_skb = &(tmp_skb->next);
1896                 skb->len += tmp_skb->len;
1897                 skb->data_len += tmp_skb->len;
1898                 skb->truesize += tmp_skb->truesize;
1899                 tmp_skb->destructor = NULL;
1900                 tmp_skb->sk = NULL;
1901         }
1902
1903         /* Allow local fragmentation. */
1904         skb->ignore_df = ip6_sk_ignore_df(sk);
1905         __skb_pull(skb, skb_network_header_len(skb));
1906
1907         final_dst = &fl6->daddr;
1908         if (opt && opt->opt_flen)
1909                 ipv6_push_frag_opts(skb, opt, &proto);
1910         if (opt && opt->opt_nflen)
1911                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1912
1913         skb_push(skb, sizeof(struct ipv6hdr));
1914         skb_reset_network_header(skb);
1915         hdr = ipv6_hdr(skb);
1916
1917         ip6_flow_hdr(hdr, v6_cork->tclass,
1918                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1919                                         ip6_autoflowlabel(net, sk), fl6));
1920         hdr->hop_limit = v6_cork->hop_limit;
1921         hdr->nexthdr = proto;
1922         hdr->saddr = fl6->saddr;
1923         hdr->daddr = *final_dst;
1924
1925         skb->priority = READ_ONCE(sk->sk_priority);
1926         skb->mark = cork->base.mark;
1927         skb->tstamp = cork->base.transmit_time;
1928
1929         ip6_cork_steal_dst(skb, cork);
1930         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1931         if (proto == IPPROTO_ICMPV6) {
1932                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1933                 u8 icmp6_type;
1934
1935                 if (sk->sk_socket->type == SOCK_RAW &&
1936                    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1937                         icmp6_type = fl6->fl6_icmp_type;
1938                 else
1939                         icmp6_type = icmp6_hdr(skb)->icmp6_type;
1940                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1941                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1942         }
1943
1944         ip6_cork_release(cork, v6_cork);
1945 out:
1946         return skb;
1947 }
1948
1949 int ip6_send_skb(struct sk_buff *skb)
1950 {
1951         struct net *net = sock_net(skb->sk);
1952         struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1953         int err;
1954
1955         err = ip6_local_out(net, skb->sk, skb);
1956         if (err) {
1957                 if (err > 0)
1958                         err = net_xmit_errno(err);
1959                 if (err)
1960                         IP6_INC_STATS(net, rt->rt6i_idev,
1961                                       IPSTATS_MIB_OUTDISCARDS);
1962         }
1963
1964         return err;
1965 }
1966
/*
 * ip6_push_pending_frames - build and send whatever is corked on @sk.
 *
 * Returns 0 when ip6_finish_skb() produced no packet, otherwise the
 * result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1978
1979 static void __ip6_flush_pending_frames(struct sock *sk,
1980                                        struct sk_buff_head *queue,
1981                                        struct inet_cork_full *cork,
1982                                        struct inet6_cork *v6_cork)
1983 {
1984         struct sk_buff *skb;
1985
1986         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1987                 if (skb_dst(skb))
1988                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1989                                       IPSTATS_MIB_OUTDISCARDS);
1990                 kfree_skb(skb);
1991         }
1992
1993         ip6_cork_release(cork, v6_cork);
1994 }
1995
1996 void ip6_flush_pending_frames(struct sock *sk)
1997 {
1998         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1999                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2000 }
2001 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2002
2003 struct sk_buff *ip6_make_skb(struct sock *sk,
2004                              int getfrag(void *from, char *to, int offset,
2005                                          int len, int odd, struct sk_buff *skb),
2006                              void *from, size_t length, int transhdrlen,
2007                              struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2008                              unsigned int flags, struct inet_cork_full *cork)
2009 {
2010         struct inet6_cork v6_cork;
2011         struct sk_buff_head queue;
2012         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2013         int err;
2014
2015         if (flags & MSG_PROBE) {
2016                 dst_release(&rt->dst);
2017                 return NULL;
2018         }
2019
2020         __skb_queue_head_init(&queue);
2021
2022         cork->base.flags = 0;
2023         cork->base.addr = 0;
2024         cork->base.opt = NULL;
2025         v6_cork.opt = NULL;
2026         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2027         if (err) {
2028                 ip6_cork_release(cork, &v6_cork);
2029                 return ERR_PTR(err);
2030         }
2031         if (ipc6->dontfrag < 0)
2032                 ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
2033
2034         err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2035                                 &current->task_frag, getfrag, from,
2036                                 length + exthdrlen, transhdrlen + exthdrlen,
2037                                 flags, ipc6);
2038         if (err) {
2039                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2040                 return ERR_PTR(err);
2041         }
2042
2043         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2044 }