1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82
83 #include <trace/events/tcp.h>
84
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
96 ip_hdr(skb)->saddr,
97 tcp_hdr(skb)->dest,
98 tcp_hdr(skb)->source);
99 }
100
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 struct tcp_sock *tp = tcp_sk(sk);
111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112
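	/* Illustrative note (tcp_tw_reuse entry in the ip-sysctl documentation):
	 *   0 - do not reuse TIME-WAIT sockets for new outgoing connections
	 *   1 - reuse when it is safe from the protocol viewpoint (see the
	 *       PAWS discussion below)
	 *   2 - reuse only for connections over loopback, which is what the
	 *       reuse == 2 branch below checks
	 */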
113 if (reuse == 2) {
114 /* Still does not detect *everything* that goes through
115 * lo, since we require a loopback src or dst address
116 * or direct binding to 'lo' interface.
117 */
118 bool loopback = false;
119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 if (tw->tw_family == AF_INET6) {
123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
127 loopback = true;
128 } else
129 #endif
130 {
131 if (ipv4_is_loopback(tw->tw_daddr) ||
132 ipv4_is_loopback(tw->tw_rcv_saddr))
133 loopback = true;
134 }
135 if (!loopback)
136 reuse = 0;
137 }
138
139 /* With PAWS, it is safe from the viewpoint
140 of data integrity. Even without PAWS it is safe provided sequence
141 spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
142 
143 Actually, the idea is close to VJ's: only the timestamp cache is
144 held not per host but per port pair, and the TW bucket is used as
145 the state holder.
146 
147 If the TW bucket has already been destroyed we fall back to VJ's
148 scheme and use the initial timestamp retrieved from the peer table.
149 */
150 if (tcptw->tw_ts_recent_stamp &&
151 (!twp || (reuse && time_after32(ktime_get_seconds(),
152 tcptw->tw_ts_recent_stamp)))) {
153 /* In case of repair and re-using TIME-WAIT sockets we still
154 * want to be sure that it is safe as above but honor the
155 * sequence numbers and time stamps set as part of the repair
156 * process.
157 *
158 * Without this check re-using a TIME-WAIT socket with TCP
159 * repair would accumulate a -1 on the repair assigned
160 * sequence number. The first time it is reused the sequence
161 * is -1, the second time -2, etc. This fixes that issue
162 * without appearing to create any others.
163 */
164 if (likely(!tp->repair)) {
165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
166
167 if (!seq)
168 seq = 1;
169 WRITE_ONCE(tp->write_seq, seq);
170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 }
173 sock_hold(sktw);
174 return 1;
175 }
176
177 return 0;
178 }
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 int addr_len)
183 {
184 /* This check is replicated from tcp_v4_connect() and intended to
185 * prevent the BPF program called below from accessing bytes that are
186 * outside the bound specified by the user in addr_len.
187 */
188 if (addr_len < sizeof(struct sockaddr_in))
189 return -EINVAL;
190
191 sock_owned_by_me(sk);
192
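	/* Descriptive note: this runs any BPF programs attached to the
	 * cgroup at the BPF_CGROUP_INET4_CONNECT hook; such programs may
	 * inspect or rewrite the destination in uaddr before
	 * tcp_v4_connect() itself runs.
	 */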
193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194 }
195
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 {
199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 struct inet_sock *inet = inet_sk(sk);
201 struct tcp_sock *tp = tcp_sk(sk);
202 __be16 orig_sport, orig_dport;
203 __be32 daddr, nexthop;
204 struct flowi4 *fl4;
205 struct rtable *rt;
206 int err;
207 struct ip_options_rcu *inet_opt;
208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
209
210 if (addr_len < sizeof(struct sockaddr_in))
211 return -EINVAL;
212
213 if (usin->sin_family != AF_INET)
214 return -EAFNOSUPPORT;
215
216 nexthop = daddr = usin->sin_addr.s_addr;
217 inet_opt = rcu_dereference_protected(inet->inet_opt,
218 lockdep_sock_is_held(sk));
219 if (inet_opt && inet_opt->opt.srr) {
220 if (!daddr)
221 return -EINVAL;
222 nexthop = inet_opt->opt.faddr;
223 }
224
225 orig_sport = inet->inet_sport;
226 orig_dport = usin->sin_port;
227 fl4 = &inet->cork.fl.u.ip4;
228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 IPPROTO_TCP,
231 orig_sport, orig_dport, sk);
232 if (IS_ERR(rt)) {
233 err = PTR_ERR(rt);
234 if (err == -ENETUNREACH)
235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 return err;
237 }
238
239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 ip_rt_put(rt);
241 return -ENETUNREACH;
242 }
243
244 if (!inet_opt || !inet_opt->opt.srr)
245 daddr = fl4->daddr;
246
247 if (!inet->inet_saddr)
248 inet->inet_saddr = fl4->saddr;
249 sk_rcv_saddr_set(sk, inet->inet_saddr);
250
251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 /* Reset inherited state */
253 tp->rx_opt.ts_recent = 0;
254 tp->rx_opt.ts_recent_stamp = 0;
255 if (likely(!tp->repair))
256 WRITE_ONCE(tp->write_seq, 0);
257 }
258
259 inet->inet_dport = usin->sin_port;
260 sk_daddr_set(sk, daddr);
261
262 inet_csk(sk)->icsk_ext_hdr_len = 0;
263 if (inet_opt)
264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
265
266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
267
268 /* Socket identity is still unknown (sport may be zero).
269 * However we set the state to SYN-SENT and, without releasing the
270 * socket lock, select a source port, enter ourselves into the hash
271 * tables and complete initialization after this.
272 */
273 tcp_set_state(sk, TCP_SYN_SENT);
274 err = inet_hash_connect(tcp_death_row, sk);
275 if (err)
276 goto failure;
277
278 sk_set_txhash(sk);
279
280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 inet->inet_sport, inet->inet_dport, sk);
282 if (IS_ERR(rt)) {
283 err = PTR_ERR(rt);
284 rt = NULL;
285 goto failure;
286 }
287 /* OK, now commit destination to socket. */
288 sk->sk_gso_type = SKB_GSO_TCPV4;
289 sk_setup_caps(sk, &rt->dst);
290 rt = NULL;
291
292 if (likely(!tp->repair)) {
293 if (!tp->write_seq)
294 WRITE_ONCE(tp->write_seq,
295 secure_tcp_seq(inet->inet_saddr,
296 inet->inet_daddr,
297 inet->inet_sport,
298 usin->sin_port));
299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
300 inet->inet_saddr,
301 inet->inet_daddr);
302 }
303
304 inet->inet_id = prandom_u32();
305
306 if (tcp_fastopen_defer_connect(sk, &err))
307 return err;
308 if (err)
309 goto failure;
310
311 err = tcp_connect(sk);
312
313 if (err)
314 goto failure;
315
316 return 0;
317
318 failure:
319 /*
320 * This unhashes the socket and releases the local port,
321 * if necessary.
322 */
323 tcp_set_state(sk, TCP_CLOSE);
324 ip_rt_put(rt);
325 sk->sk_route_caps = 0;
326 inet->inet_dport = 0;
327 return err;
328 }
329 EXPORT_SYMBOL(tcp_v4_connect);
330
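/* Illustrative userspace sketch (not part of this file; assumes the plain
 * BSD socket API and an already-created fd): a connect() on an AF_INET
 * stream socket reaches tcp_v4_connect() via inet_stream_connect():
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { .s_addr = inet_addr("192.0.2.1") },
 *	};
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */
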
331 /*
332 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333 * It can be called through tcp_release_cb() if socket was owned by user
334 * at the time tcp_v4_err() was called to handle ICMP message.
335 */
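/* Descriptive note: when the ICMP arrives while the socket is owned by the
 * user, tcp_v4_err() only records tp->mtu_info and sets the
 * TCP_MTU_REDUCED_DEFERRED flag (see the ICMP_FRAG_NEEDED branch below);
 * tcp_release_cb() then invokes this handler once the lock is released.
 */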
336 void tcp_v4_mtu_reduced(struct sock *sk)
337 {
338 struct inet_sock *inet = inet_sk(sk);
339 struct dst_entry *dst;
340 u32 mtu;
341
342 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 return;
344 mtu = tcp_sk(sk)->mtu_info;
345 dst = inet_csk_update_pmtu(sk, mtu);
346 if (!dst)
347 return;
348
349 /* Something is about to go wrong... Remember the soft error
350 * in case this connection will not be able to recover.
351 */
352 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 sk->sk_err_soft = EMSGSIZE;
354
355 mtu = dst_mtu(dst);
356
357 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 ip_sk_accept_pmtu(sk) &&
359 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 tcp_sync_mss(sk, mtu);
361
362 /* Resend the TCP packet because it's
363 * clear that the old packet has been
364 * dropped. This is the new "fast" path mtu
365 * discovery.
366 */
367 tcp_simple_retransmit(sk);
368 } /* else let the usual retransmit timer handle it */
369 }
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 {
374 struct dst_entry *dst = __sk_dst_check(sk, 0);
375
376 if (dst)
377 dst->ops->redirect(dst, sk, skb);
378 }
379
380
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 {
384 struct request_sock *req = inet_reqsk(sk);
385 struct net *net = sock_net(sk);
386
387 /* ICMPs are not backlogged, hence we cannot get
388 * an established socket here.
389 */
390 if (seq != tcp_rsk(req)->snt_isn) {
391 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
392 } else if (abort) {
393 /*
394 * Still in SYN_RECV, just remove it silently.
395 * There is no good way to pass the error to the newly
396 * created socket, and POSIX does not want network
397 * errors returned from accept().
398 */
399 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 tcp_listendrop(req->rsk_listener);
401 }
402 reqsk_put(req);
403 }
404 EXPORT_SYMBOL(tcp_req_err);
405
406 /*
407 * This routine is called by the ICMP module when it gets some
408 * sort of error condition. If err < 0 then the socket should
409 * be closed and the error returned to the user. If err > 0
410 * it's just the icmp type << 8 | icmp code. After adjustment
411 * header points to the first 8 bytes of the tcp header. We need
412 * to find the appropriate port.
413 *
414 * The locking strategy used here is very "optimistic". When
415 * someone else accesses the socket the ICMP is just dropped
416 * and for some paths there is no check at all.
417 * A more general error queue to queue errors for later handling
418 * is probably better.
419 *
420 */
421
422 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
423 {
424 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
425 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
426 struct inet_connection_sock *icsk;
427 struct tcp_sock *tp;
428 struct inet_sock *inet;
429 const int type = icmp_hdr(icmp_skb)->type;
430 const int code = icmp_hdr(icmp_skb)->code;
431 struct sock *sk;
432 struct sk_buff *skb;
433 struct request_sock *fastopen;
434 u32 seq, snd_una;
435 s32 remaining;
436 u32 delta_us;
437 int err;
438 struct net *net = dev_net(icmp_skb->dev);
439
440 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
441 th->dest, iph->saddr, ntohs(th->source),
442 inet_iif(icmp_skb), 0);
443 if (!sk) {
444 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
445 return -ENOENT;
446 }
447 if (sk->sk_state == TCP_TIME_WAIT) {
448 inet_twsk_put(inet_twsk(sk));
449 return 0;
450 }
451 seq = ntohl(th->seq);
452 if (sk->sk_state == TCP_NEW_SYN_RECV) {
453 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
454 type == ICMP_TIME_EXCEEDED ||
455 (type == ICMP_DEST_UNREACH &&
456 (code == ICMP_NET_UNREACH ||
457 code == ICMP_HOST_UNREACH)));
458 return 0;
459 }
460
461 bh_lock_sock(sk);
462 /* If too many ICMPs get dropped on busy
463 * servers this needs to be solved differently.
464 * We do take care of the PMTU discovery (RFC1191) special case:
465 * we can receive locally generated ICMP messages while the socket is held.
466 */
467 if (sock_owned_by_user(sk)) {
468 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
469 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
470 }
471 if (sk->sk_state == TCP_CLOSE)
472 goto out;
473
474 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
475 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
476 goto out;
477 }
478
479 icsk = inet_csk(sk);
480 tp = tcp_sk(sk);
481 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
482 fastopen = rcu_dereference(tp->fastopen_rsk);
483 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
484 if (sk->sk_state != TCP_LISTEN &&
485 !between(seq, snd_una, tp->snd_nxt)) {
486 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
487 goto out;
488 }
489
490 switch (type) {
491 case ICMP_REDIRECT:
492 if (!sock_owned_by_user(sk))
493 do_redirect(icmp_skb, sk);
494 goto out;
495 case ICMP_SOURCE_QUENCH:
496 /* Just silently ignore these. */
497 goto out;
498 case ICMP_PARAMETERPROB:
499 err = EPROTO;
500 break;
501 case ICMP_DEST_UNREACH:
502 if (code > NR_ICMP_UNREACH)
503 goto out;
504
505 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
506 /* We are not interested in TCP_LISTEN and open_requests
507 * (SYN-ACKs sent out by Linux are always < 576 bytes so
508 * they should go through unfragmented).
509 */
510 if (sk->sk_state == TCP_LISTEN)
511 goto out;
512
513 tp->mtu_info = info;
514 if (!sock_owned_by_user(sk)) {
515 tcp_v4_mtu_reduced(sk);
516 } else {
517 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
518 sock_hold(sk);
519 }
520 goto out;
521 }
522
523 err = icmp_err_convert[code].errno;
524 /* check if icmp_skb allows revert of backoff
525 * (see draft-zimmermann-tcp-lcd) */
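		/* Descriptive note: a net/host unreachable ICMP for the
		 * segment at snd_una suggests the retransmission was lost to
		 * a connectivity disruption rather than congestion, so one
		 * step of exponential backoff is reverted below and the
		 * retransmit timer is re-armed with the shorter RTO.
		 */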
526 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
527 break;
528 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
529 !icsk->icsk_backoff || fastopen)
530 break;
531
532 if (sock_owned_by_user(sk))
533 break;
534
535 skb = tcp_rtx_queue_head(sk);
536 if (WARN_ON_ONCE(!skb))
537 break;
538
539 icsk->icsk_backoff--;
540 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
541 TCP_TIMEOUT_INIT;
542 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
543
544
545 tcp_mstamp_refresh(tp);
546 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
547 remaining = icsk->icsk_rto -
548 usecs_to_jiffies(delta_us);
549
550 if (remaining > 0) {
551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 remaining, TCP_RTO_MAX);
553 } else {
554 /* RTO revert clocked out retransmission.
555 * Will retransmit now */
556 tcp_retransmit_timer(sk);
557 }
558
559 break;
560 case ICMP_TIME_EXCEEDED:
561 err = EHOSTUNREACH;
562 break;
563 default:
564 goto out;
565 }
566
567 switch (sk->sk_state) {
568 case TCP_SYN_SENT:
569 case TCP_SYN_RECV:
570 /* Only in fast or simultaneous open. If a fast open socket is
571 * already accepted it is treated as a connected one below.
572 */
573 if (fastopen && !fastopen->sk)
574 break;
575
576 if (!sock_owned_by_user(sk)) {
577 sk->sk_err = err;
578
579 sk->sk_error_report(sk);
580
581 tcp_done(sk);
582 } else {
583 sk->sk_err_soft = err;
584 }
585 goto out;
586 }
587
588 /* If we've already connected we will keep trying
589 * until we time out, or the user gives up.
590 *
591 * RFC 1122 4.2.3.9 allows us to consider as hard errors
592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 * but it is obsoleted by pmtu discovery).
594 *
595 * Note that in the modern internet, where routing is unreliable
596 * and broken firewalls sit in every dark corner sending random
597 * errors ordered by their masters, even these two messages finally
598 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
599 *
600 * Now we are in compliance with RFCs.
601 * --ANK (980905)
602 */
603
604 inet = inet_sk(sk);
605 if (!sock_owned_by_user(sk) && inet->recverr) {
606 sk->sk_err = err;
607 sk->sk_error_report(sk);
608 } else { /* Only an error on timeout */
609 sk->sk_err_soft = err;
610 }
611
612 out:
613 bh_unlock_sock(sk);
614 sock_put(sk);
615 return 0;
616 }
617
618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 {
620 struct tcphdr *th = tcp_hdr(skb);
621
622 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 skb->csum_start = skb_transport_header(skb) - skb->head;
624 skb->csum_offset = offsetof(struct tcphdr, check);
625 }
626
627 /* This routine computes an IPv4 TCP checksum. */
628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 {
630 const struct inet_sock *inet = inet_sk(sk);
631
632 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 }
634 EXPORT_SYMBOL(tcp_v4_send_check);
635
636 /*
637 * This routine will send an RST to the other tcp.
638 *
639 * Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
640 * for the reset?
641 * Answer: if a packet caused an RST, it was not meant for a socket
642 * existing in our system; if it did match a socket, it is just a
643 * duplicate segment or a bug in the other side's TCP.
644 * So we build the reply based only on the parameters that
645 * arrived with the segment.
646 * Exception: precedence violation. We do not implement it in any case.
647 */
648
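/* Descriptive note (RFC 793 reset generation, as implemented below): if the
 * incoming segment has the ACK bit set, the reset takes its sequence number
 * from the segment's ACK field and carries no ACK; otherwise the reset has
 * the ACK bit set with ack_seq = SEG.SEQ + SEG.LEN (counting SYN and FIN).
 */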
649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 {
651 const struct tcphdr *th = tcp_hdr(skb);
652 struct {
653 struct tcphdr th;
654 #ifdef CONFIG_TCP_MD5SIG
655 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
656 #endif
657 } rep;
658 struct ip_reply_arg arg;
659 #ifdef CONFIG_TCP_MD5SIG
660 struct tcp_md5sig_key *key = NULL;
661 const __u8 *hash_location = NULL;
662 unsigned char newhash[16];
663 int genhash;
664 struct sock *sk1 = NULL;
665 #endif
666 u64 transmit_time = 0;
667 struct sock *ctl_sk;
668 struct net *net;
669
670 /* Never send a reset in response to a reset. */
671 if (th->rst)
672 return;
673
674 /* If sk is not NULL, it means we did a successful lookup and the
675 * incoming route had to be correct. prequeue might have dropped our dst.
676 */
677 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678 return;
679
680 /* Swap the send and the receive. */
681 memset(&rep, 0, sizeof(rep));
682 rep.th.dest = th->source;
683 rep.th.source = th->dest;
684 rep.th.doff = sizeof(struct tcphdr) / 4;
685 rep.th.rst = 1;
686
687 if (th->ack) {
688 rep.th.seq = th->ack_seq;
689 } else {
690 rep.th.ack = 1;
691 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 skb->len - (th->doff << 2));
693 }
694
695 memset(&arg, 0, sizeof(arg));
696 arg.iov[0].iov_base = (unsigned char *)&rep;
697 arg.iov[0].iov_len = sizeof(rep.th);
698
699 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
700 #ifdef CONFIG_TCP_MD5SIG
701 rcu_read_lock();
702 hash_location = tcp_parse_md5sig_option(th);
703 if (sk && sk_fullsock(sk)) {
704 const union tcp_md5_addr *addr;
705
706 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
707 key = tcp_md5_do_lookup(sk, addr, AF_INET);
708 } else if (hash_location) {
709 const union tcp_md5_addr *addr;
710 int sdif = tcp_v4_sdif(skb);
711 int dif = inet_iif(skb);
712
713 /*
714 * The active side is lost. Try to find the listening socket through
715 * the source port, and then find the md5 key through that listening
716 * socket. We are not losing security here:
717 * the incoming packet is checked against the md5 hash of the key we
718 * find, and no RST is generated if the md5 hash doesn't match.
719 */
720 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
721 ip_hdr(skb)->saddr,
722 th->source, ip_hdr(skb)->daddr,
723 ntohs(th->source), dif, sdif);
724 /* don't send rst if it can't find key */
725 if (!sk1)
726 goto out;
727
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk1, addr, AF_INET);
730 if (!key)
731 goto out;
732
733
734 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
735 if (genhash || memcmp(hash_location, newhash, 16) != 0)
736 goto out;
737
738 }
739
740 if (key) {
741 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
742 (TCPOPT_NOP << 16) |
743 (TCPOPT_MD5SIG << 8) |
744 TCPOLEN_MD5SIG);
745 /* Update length and the length the header thinks exists */
746 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
747 rep.th.doff = arg.iov[0].iov_len / 4;
748
749 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
750 key, ip_hdr(skb)->saddr,
751 ip_hdr(skb)->daddr, &rep.th);
752 }
753 #endif
754 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
755 ip_hdr(skb)->saddr, /* XXX */
756 arg.iov[0].iov_len, IPPROTO_TCP, 0);
757 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
758 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
759
760 /* When the socket is gone, all binding information is lost and
761 * routing might fail in this case. No choice here: if we choose to force
762 * the input interface, we will misroute in case of an asymmetric route.
763 */
764 if (sk) {
765 arg.bound_dev_if = sk->sk_bound_dev_if;
766 if (sk_fullsock(sk))
767 trace_tcp_send_reset(sk, skb);
768 }
769
770 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
771 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
772
773 arg.tos = ip_hdr(skb)->tos;
774 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
775 local_bh_disable();
776 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
777 if (sk) {
778 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
779 inet_twsk(sk)->tw_mark : sk->sk_mark;
780 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
781 inet_twsk(sk)->tw_priority : sk->sk_priority;
782 transmit_time = tcp_transmit_time(sk);
783 }
784 ip_send_unicast_reply(ctl_sk,
785 skb, &TCP_SKB_CB(skb)->header.h4.opt,
786 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
787 &arg, arg.iov[0].iov_len,
788 transmit_time);
789
790 ctl_sk->sk_mark = 0;
791 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
792 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
793 local_bh_enable();
794
795 #ifdef CONFIG_TCP_MD5SIG
796 out:
797 rcu_read_unlock();
798 #endif
799 }
800
801 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
802 outside socket context, is certainly ugly. What can I do?
803 */
804
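/* Descriptive note: build and send a bare ACK (optionally carrying the
 * timestamp and MD5 options) on behalf of a socket we do not fully own;
 * used for TIME-WAIT ACKs and for ACKs answering a SYN-RECV request socket
 * (see the two callers below).
 */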
805 static void tcp_v4_send_ack(const struct sock *sk,
806 struct sk_buff *skb, u32 seq, u32 ack,
807 u32 win, u32 tsval, u32 tsecr, int oif,
808 struct tcp_md5sig_key *key,
809 int reply_flags, u8 tos)
810 {
811 const struct tcphdr *th = tcp_hdr(skb);
812 struct {
813 struct tcphdr th;
814 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
815 #ifdef CONFIG_TCP_MD5SIG
816 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
817 #endif
818 ];
819 } rep;
820 struct net *net = sock_net(sk);
821 struct ip_reply_arg arg;
822 struct sock *ctl_sk;
823 u64 transmit_time;
824
825 memset(&rep.th, 0, sizeof(struct tcphdr));
826 memset(&arg, 0, sizeof(arg));
827
828 arg.iov[0].iov_base = (unsigned char *)&rep;
829 arg.iov[0].iov_len = sizeof(rep.th);
830 if (tsecr) {
831 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
832 (TCPOPT_TIMESTAMP << 8) |
833 TCPOLEN_TIMESTAMP);
834 rep.opt[1] = htonl(tsval);
835 rep.opt[2] = htonl(tsecr);
836 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
837 }
838
839 /* Swap the send and the receive. */
840 rep.th.dest = th->source;
841 rep.th.source = th->dest;
842 rep.th.doff = arg.iov[0].iov_len / 4;
843 rep.th.seq = htonl(seq);
844 rep.th.ack_seq = htonl(ack);
845 rep.th.ack = 1;
846 rep.th.window = htons(win);
847
848 #ifdef CONFIG_TCP_MD5SIG
849 if (key) {
850 int offset = (tsecr) ? 3 : 0;
851
852 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
853 (TCPOPT_NOP << 16) |
854 (TCPOPT_MD5SIG << 8) |
855 TCPOLEN_MD5SIG);
856 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
857 rep.th.doff = arg.iov[0].iov_len/4;
858
859 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
860 key, ip_hdr(skb)->saddr,
861 ip_hdr(skb)->daddr, &rep.th);
862 }
863 #endif
864 arg.flags = reply_flags;
865 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
866 ip_hdr(skb)->saddr, /* XXX */
867 arg.iov[0].iov_len, IPPROTO_TCP, 0);
868 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
869 if (oif)
870 arg.bound_dev_if = oif;
871 arg.tos = tos;
872 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
873 local_bh_disable();
874 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
875 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
876 inet_twsk(sk)->tw_mark : sk->sk_mark;
877 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
878 inet_twsk(sk)->tw_priority : sk->sk_priority;
879 transmit_time = tcp_transmit_time(sk);
880 ip_send_unicast_reply(ctl_sk,
881 skb, &TCP_SKB_CB(skb)->header.h4.opt,
882 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
883 &arg, arg.iov[0].iov_len,
884 transmit_time);
885
886 ctl_sk->sk_mark = 0;
887 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
888 local_bh_enable();
889 }
890
891 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
892 {
893 struct inet_timewait_sock *tw = inet_twsk(sk);
894 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
895
896 tcp_v4_send_ack(sk, skb,
897 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
898 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
899 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
900 tcptw->tw_ts_recent,
901 tw->tw_bound_dev_if,
902 tcp_twsk_md5_key(tcptw),
903 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
904 tw->tw_tos
905 );
906
907 inet_twsk_put(tw);
908 }
909
910 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
911 struct request_sock *req)
912 {
913 const union tcp_md5_addr *addr;
914
915 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
916 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
917 */
918 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
919 tcp_sk(sk)->snd_nxt;
920
921 /* RFC 7323 2.3
922 * The window field (SEG.WND) of every outgoing segment, with the
923 * exception of <SYN> segments, MUST be right-shifted by
924 * Rcv.Wind.Shift bits:
925 */
926 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
927 tcp_v4_send_ack(sk, skb, seq,
928 tcp_rsk(req)->rcv_nxt,
929 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
930 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
931 req->ts_recent,
932 0,
933 tcp_md5_do_lookup(sk, addr, AF_INET),
934 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
935 ip_hdr(skb)->tos);
936 }
937
938 /*
939 * Send a SYN-ACK after having received a SYN.
940 * This still operates on a request_sock only, not on a big
941 * socket.
942 */
943 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
944 struct flowi *fl,
945 struct request_sock *req,
946 struct tcp_fastopen_cookie *foc,
947 enum tcp_synack_type synack_type)
948 {
949 const struct inet_request_sock *ireq = inet_rsk(req);
950 struct flowi4 fl4;
951 int err = -1;
952 struct sk_buff *skb;
953
954 /* First, grab a route. */
955 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
956 return -1;
957
958 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
959
960 if (skb) {
961 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
962
963 rcu_read_lock();
964 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
965 ireq->ir_rmt_addr,
966 rcu_dereference(ireq->ireq_opt));
967 rcu_read_unlock();
968 err = net_xmit_eval(err);
969 }
970
971 return err;
972 }
973
974 /*
975 * IPv4 request_sock destructor.
976 */
977 static void tcp_v4_reqsk_destructor(struct request_sock *req)
978 {
979 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
980 }
981
982 #ifdef CONFIG_TCP_MD5SIG
983 /*
984 * RFC2385 MD5 checksumming requires a mapping of
985 * IP address->MD5 Key.
986 * We need to maintain these in the sk structure.
987 */
988
989 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
990 EXPORT_SYMBOL(tcp_md5_needed);
991
992 /* Find the Key structure for an address. */
993 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
994 const union tcp_md5_addr *addr,
995 int family)
996 {
997 const struct tcp_sock *tp = tcp_sk(sk);
998 struct tcp_md5sig_key *key;
999 const struct tcp_md5sig_info *md5sig;
1000 __be32 mask;
1001 struct tcp_md5sig_key *best_match = NULL;
1002 bool match;
1003
1004 /* caller either holds rcu_read_lock() or socket lock */
1005 md5sig = rcu_dereference_check(tp->md5sig_info,
1006 lockdep_sock_is_held(sk));
1007 if (!md5sig)
1008 return NULL;
1009
1010 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1011 if (key->family != family)
1012 continue;
1013
1014 if (family == AF_INET) {
1015 mask = inet_make_mask(key->prefixlen);
1016 match = (key->addr.a4.s_addr & mask) ==
1017 (addr->a4.s_addr & mask);
1018 #if IS_ENABLED(CONFIG_IPV6)
1019 } else if (family == AF_INET6) {
1020 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1021 key->prefixlen);
1022 #endif
1023 } else {
1024 match = false;
1025 }
1026
1027 if (match && (!best_match ||
1028 key->prefixlen > best_match->prefixlen))
1029 best_match = key;
1030 }
1031 return best_match;
1032 }
1033 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1034
1035 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1036 const union tcp_md5_addr *addr,
1037 int family, u8 prefixlen)
1038 {
1039 const struct tcp_sock *tp = tcp_sk(sk);
1040 struct tcp_md5sig_key *key;
1041 unsigned int size = sizeof(struct in_addr);
1042 const struct tcp_md5sig_info *md5sig;
1043
1044 /* caller either holds rcu_read_lock() or socket lock */
1045 md5sig = rcu_dereference_check(tp->md5sig_info,
1046 lockdep_sock_is_held(sk));
1047 if (!md5sig)
1048 return NULL;
1049 #if IS_ENABLED(CONFIG_IPV6)
1050 if (family == AF_INET6)
1051 size = sizeof(struct in6_addr);
1052 #endif
1053 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1054 if (key->family != family)
1055 continue;
1056 if (!memcmp(&key->addr, addr, size) &&
1057 key->prefixlen == prefixlen)
1058 return key;
1059 }
1060 return NULL;
1061 }
1062
1063 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1064 const struct sock *addr_sk)
1065 {
1066 const union tcp_md5_addr *addr;
1067
1068 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1069 return tcp_md5_do_lookup(sk, addr, AF_INET);
1070 }
1071 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1072
1073 /* This can be called on a newly created socket, from other files */
1074 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1075 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1076 gfp_t gfp)
1077 {
1078 /* Add Key to the list */
1079 struct tcp_md5sig_key *key;
1080 struct tcp_sock *tp = tcp_sk(sk);
1081 struct tcp_md5sig_info *md5sig;
1082
1083 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1084 if (key) {
1085 /* Pre-existing entry - just update that one. */
1086 memcpy(key->key, newkey, newkeylen);
1087 key->keylen = newkeylen;
1088 return 0;
1089 }
1090
1091 md5sig = rcu_dereference_protected(tp->md5sig_info,
1092 lockdep_sock_is_held(sk));
1093 if (!md5sig) {
1094 md5sig = kmalloc(sizeof(*md5sig), gfp);
1095 if (!md5sig)
1096 return -ENOMEM;
1097
1098 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1099 INIT_HLIST_HEAD(&md5sig->head);
1100 rcu_assign_pointer(tp->md5sig_info, md5sig);
1101 }
1102
1103 key = sock_kmalloc(sk, sizeof(*key), gfp);
1104 if (!key)
1105 return -ENOMEM;
1106 if (!tcp_alloc_md5sig_pool()) {
1107 sock_kfree_s(sk, key, sizeof(*key));
1108 return -ENOMEM;
1109 }
1110
1111 memcpy(key->key, newkey, newkeylen);
1112 key->keylen = newkeylen;
1113 key->family = family;
1114 key->prefixlen = prefixlen;
1115 memcpy(&key->addr, addr,
1116 (family == AF_INET6) ? sizeof(struct in6_addr) :
1117 sizeof(struct in_addr));
1118 hlist_add_head_rcu(&key->node, &md5sig->head);
1119 return 0;
1120 }
1121 EXPORT_SYMBOL(tcp_md5_do_add);
1122
1123 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1124 u8 prefixlen)
1125 {
1126 struct tcp_md5sig_key *key;
1127
1128 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1129 if (!key)
1130 return -ENOENT;
1131 hlist_del_rcu(&key->node);
1132 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1133 kfree_rcu(key, rcu);
1134 return 0;
1135 }
1136 EXPORT_SYMBOL(tcp_md5_do_del);
1137
1138 static void tcp_clear_md5_list(struct sock *sk)
1139 {
1140 struct tcp_sock *tp = tcp_sk(sk);
1141 struct tcp_md5sig_key *key;
1142 struct hlist_node *n;
1143 struct tcp_md5sig_info *md5sig;
1144
1145 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1146
1147 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1148 hlist_del_rcu(&key->node);
1149 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1150 kfree_rcu(key, rcu);
1151 }
1152 }
1153
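/* Illustrative userspace sketch (not part of this file; assumes the uapi
 * definitions from <linux/tcp.h> and that fd is a TCP socket): installing
 * an MD5 key for a peer with the TCP_MD5SIG socket option, which is parsed
 * by the function below:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */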
1154 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1155 char __user *optval, int optlen)
1156 {
1157 struct tcp_md5sig cmd;
1158 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1159 const union tcp_md5_addr *addr;
1160 u8 prefixlen = 32;
1161
1162 if (optlen < sizeof(cmd))
1163 return -EINVAL;
1164
1165 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1166 return -EFAULT;
1167
1168 if (sin->sin_family != AF_INET)
1169 return -EINVAL;
1170
1171 if (optname == TCP_MD5SIG_EXT &&
1172 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1173 prefixlen = cmd.tcpm_prefixlen;
1174 if (prefixlen > 32)
1175 return -EINVAL;
1176 }
1177
1178 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1179
1180 if (!cmd.tcpm_keylen)
1181 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen);
1182
1183 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1184 return -EINVAL;
1185
1186 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen,
1187 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1188 }
1189
1190 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1191 __be32 daddr, __be32 saddr,
1192 const struct tcphdr *th, int nbytes)
1193 {
1194 struct tcp4_pseudohdr *bp;
1195 struct scatterlist sg;
1196 struct tcphdr *_th;
1197
1198 bp = hp->scratch;
1199 bp->saddr = saddr;
1200 bp->daddr = daddr;
1201 bp->pad = 0;
1202 bp->protocol = IPPROTO_TCP;
1203 bp->len = cpu_to_be16(nbytes);
1204
1205 _th = (struct tcphdr *)(bp + 1);
1206 memcpy(_th, th, sizeof(*th));
1207 _th->check = 0;
1208
1209 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1210 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1211 sizeof(*bp) + sizeof(*th));
1212 return crypto_ahash_update(hp->md5_req);
1213 }
1214
1215 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1216 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1217 {
1218 struct tcp_md5sig_pool *hp;
1219 struct ahash_request *req;
1220
1221 hp = tcp_get_md5sig_pool();
1222 if (!hp)
1223 goto clear_hash_noput;
1224 req = hp->md5_req;
1225
1226 if (crypto_ahash_init(req))
1227 goto clear_hash;
1228 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1229 goto clear_hash;
1230 if (tcp_md5_hash_key(hp, key))
1231 goto clear_hash;
1232 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1233 if (crypto_ahash_final(req))
1234 goto clear_hash;
1235
1236 tcp_put_md5sig_pool();
1237 return 0;
1238
1239 clear_hash:
1240 tcp_put_md5sig_pool();
1241 clear_hash_noput:
1242 memset(md5_hash, 0, 16);
1243 return 1;
1244 }
1245
1246 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1247 const struct sock *sk,
1248 const struct sk_buff *skb)
1249 {
1250 struct tcp_md5sig_pool *hp;
1251 struct ahash_request *req;
1252 const struct tcphdr *th = tcp_hdr(skb);
1253 __be32 saddr, daddr;
1254
1255 if (sk) { /* valid for establish/request sockets */
1256 saddr = sk->sk_rcv_saddr;
1257 daddr = sk->sk_daddr;
1258 } else {
1259 const struct iphdr *iph = ip_hdr(skb);
1260 saddr = iph->saddr;
1261 daddr = iph->daddr;
1262 }
1263
1264 hp = tcp_get_md5sig_pool();
1265 if (!hp)
1266 goto clear_hash_noput;
1267 req = hp->md5_req;
1268
1269 if (crypto_ahash_init(req))
1270 goto clear_hash;
1271
1272 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1273 goto clear_hash;
1274 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1275 goto clear_hash;
1276 if (tcp_md5_hash_key(hp, key))
1277 goto clear_hash;
1278 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1279 if (crypto_ahash_final(req))
1280 goto clear_hash;
1281
1282 tcp_put_md5sig_pool();
1283 return 0;
1284
1285 clear_hash:
1286 tcp_put_md5sig_pool();
1287 clear_hash_noput:
1288 memset(md5_hash, 0, 16);
1289 return 1;
1290 }
1291 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1292
1293 #endif
1294
1295 /* Called with rcu_read_lock() */
1296 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1297 const struct sk_buff *skb,
1298 int dif, int sdif)
1299 {
1300 #ifdef CONFIG_TCP_MD5SIG
1301 /*
1302 * This gets called for each TCP segment that arrives
1303 * so we want to be efficient.
1304 * We have 3 drop cases:
1305 * o No MD5 hash and one expected.
1306 * o MD5 hash and we're not expecting one.
1307 * o MD5 hash and it's wrong.
1308 */
1309 const __u8 *hash_location = NULL;
1310 struct tcp_md5sig_key *hash_expected;
1311 const struct iphdr *iph = ip_hdr(skb);
1312 const struct tcphdr *th = tcp_hdr(skb);
1313 const union tcp_md5_addr *addr;
1314 int genhash;
1315 unsigned char newhash[16];
1316
1317 addr = (union tcp_md5_addr *)&iph->saddr;
1318 hash_expected = tcp_md5_do_lookup(sk, addr, AF_INET);
1319 hash_location = tcp_parse_md5sig_option(th);
1320
1321 /* We've parsed the options - do we have a hash? */
1322 if (!hash_expected && !hash_location)
1323 return false;
1324
1325 if (hash_expected && !hash_location) {
1326 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1327 return true;
1328 }
1329
1330 if (!hash_expected && hash_location) {
1331 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1332 return true;
1333 }
1334
1335 /* Okay, so this is hash_expected and hash_location -
1336 * so we need to calculate the checksum.
1337 */
1338 genhash = tcp_v4_md5_hash_skb(newhash,
1339 hash_expected,
1340 NULL, skb);
1341
1342 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1343 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1344 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1345 &iph->saddr, ntohs(th->source),
1346 &iph->daddr, ntohs(th->dest),
1347 genhash ? " tcp_v4_calc_md5_hash failed"
1348 : "");
1349 return true;
1350 }
1351 return false;
1352 #endif
1353 return false;
1354 }
1355
1356 static void tcp_v4_init_req(struct request_sock *req,
1357 const struct sock *sk_listener,
1358 struct sk_buff *skb)
1359 {
1360 struct inet_request_sock *ireq = inet_rsk(req);
1361 struct net *net = sock_net(sk_listener);
1362
1363 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1364 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1365 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1366 }
1367
1368 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1369 struct flowi *fl,
1370 const struct request_sock *req)
1371 {
1372 return inet_csk_route_req(sk, &fl->u.ip4, req);
1373 }
1374
1375 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1376 .family = PF_INET,
1377 .obj_size = sizeof(struct tcp_request_sock),
1378 .rtx_syn_ack = tcp_rtx_synack,
1379 .send_ack = tcp_v4_reqsk_send_ack,
1380 .destructor = tcp_v4_reqsk_destructor,
1381 .send_reset = tcp_v4_send_reset,
1382 .syn_ack_timeout = tcp_syn_ack_timeout,
1383 };
1384
1385 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1386 .mss_clamp = TCP_MSS_DEFAULT,
1387 #ifdef CONFIG_TCP_MD5SIG
1388 .req_md5_lookup = tcp_v4_md5_lookup,
1389 .calc_md5_hash = tcp_v4_md5_hash_skb,
1390 #endif
1391 .init_req = tcp_v4_init_req,
1392 #ifdef CONFIG_SYN_COOKIES
1393 .cookie_init_seq = cookie_v4_init_sequence,
1394 #endif
1395 .route_req = tcp_v4_route_req,
1396 .init_seq = tcp_v4_init_seq,
1397 .init_ts_off = tcp_v4_init_ts_off,
1398 .send_synack = tcp_v4_send_synack,
1399 };
1400
1401 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1402 {
1403 /* Never answer SYNs sent to broadcast or multicast */
1404 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1405 goto drop;
1406
1407 return tcp_conn_request(&tcp_request_sock_ops,
1408 &tcp_request_sock_ipv4_ops, sk, skb);
1409
1410 drop:
1411 tcp_listendrop(sk);
1412 return 0;
1413 }
1414 EXPORT_SYMBOL(tcp_v4_conn_request);
1415
1416
1417 /*
1418 * The three way handshake has completed - we got a valid ACK -
1419 * now create the new socket.
1420 */
1421 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1422 struct request_sock *req,
1423 struct dst_entry *dst,
1424 struct request_sock *req_unhash,
1425 bool *own_req)
1426 {
1427 struct inet_request_sock *ireq;
1428 struct inet_sock *newinet;
1429 struct tcp_sock *newtp;
1430 struct sock *newsk;
1431 #ifdef CONFIG_TCP_MD5SIG
1432 const union tcp_md5_addr *addr;
1433 struct tcp_md5sig_key *key;
1434 #endif
1435 struct ip_options_rcu *inet_opt;
1436
1437 if (sk_acceptq_is_full(sk))
1438 goto exit_overflow;
1439
1440 newsk = tcp_create_openreq_child(sk, req, skb);
1441 if (!newsk)
1442 goto exit_nonewsk;
1443
1444 newsk->sk_gso_type = SKB_GSO_TCPV4;
1445 inet_sk_rx_dst_set(newsk, skb);
1446
1447 newtp = tcp_sk(newsk);
1448 newinet = inet_sk(newsk);
1449 ireq = inet_rsk(req);
1450 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1451 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1452 newsk->sk_bound_dev_if = ireq->ir_iif;
1453 newinet->inet_saddr = ireq->ir_loc_addr;
1454 inet_opt = rcu_dereference(ireq->ireq_opt);
1455 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1456 newinet->mc_index = inet_iif(skb);
1457 newinet->mc_ttl = ip_hdr(skb)->ttl;
1458 newinet->rcv_tos = ip_hdr(skb)->tos;
1459 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1460 if (inet_opt)
1461 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1462 newinet->inet_id = prandom_u32();
1463
1464 if (!dst) {
1465 dst = inet_csk_route_child_sock(sk, newsk, req);
1466 if (!dst)
1467 goto put_and_exit;
1468 } else {
1469 /* syncookie case : see end of cookie_v4_check() */
1470 }
1471 sk_setup_caps(newsk, dst);
1472
1473 tcp_ca_openreq_child(newsk, dst);
1474
1475 tcp_sync_mss(newsk, dst_mtu(dst));
1476 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1477
1478 tcp_initialize_rcv_mss(newsk);
1479
1480 #ifdef CONFIG_TCP_MD5SIG
1481 /* Copy over the MD5 key from the original socket */
1482 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1483 key = tcp_md5_do_lookup(sk, addr, AF_INET);
1484 if (key) {
1485 /*
1486 * We're using one, so create a matching key
1487 * on the newsk structure. If we fail to get
1488 * memory, then we end up not copying the key
1489 * across. Shucks.
1490 */
1491 tcp_md5_do_add(newsk, addr, AF_INET, 32,
1492 key->key, key->keylen, GFP_ATOMIC);
1493 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1494 }
1495 #endif
1496
1497 if (__inet_inherit_port(sk, newsk) < 0)
1498 goto put_and_exit;
1499 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1500 if (likely(*own_req)) {
1501 tcp_move_syn(newtp, req);
1502 ireq->ireq_opt = NULL;
1503 } else {
1504 newinet->inet_opt = NULL;
1505 }
1506 return newsk;
1507
1508 exit_overflow:
1509 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1510 exit_nonewsk:
1511 dst_release(dst);
1512 exit:
1513 tcp_listendrop(sk);
1514 return NULL;
1515 put_and_exit:
1516 newinet->inet_opt = NULL;
1517 inet_csk_prepare_forced_close(newsk);
1518 tcp_done(newsk);
1519 goto exit;
1520 }
1521 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1522
1523 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1524 {
1525 #ifdef CONFIG_SYN_COOKIES
1526 const struct tcphdr *th = tcp_hdr(skb);
1527
1528 if (!th->syn)
1529 sk = cookie_v4_check(sk, skb);
1530 #endif
1531 return sk;
1532 }
1533
1534 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1535 struct tcphdr *th, u32 *cookie)
1536 {
1537 u16 mss = 0;
1538 #ifdef CONFIG_SYN_COOKIES
1539 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1540 &tcp_request_sock_ipv4_ops, sk, th);
1541 if (mss) {
1542 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1543 tcp_synq_overflow(sk);
1544 }
1545 #endif
1546 return mss;
1547 }
1548
1549 /* The socket must have its spinlock held when we get
1550 * here, unless it is a TCP_LISTEN socket.
1551 *
1552 * We have a potential double-lock case here, so even when
1553 * doing backlog processing we use the BH locking scheme.
1554 * This is because we cannot sleep with the original spinlock
1555 * held.
1556 */
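/* Descriptive note: for non-listening sockets, tcp_v4_rcv() calls this
 * directly under bh_lock_sock() when the socket is not owned by the user;
 * otherwise the skb is queued to the backlog and this function runs later
 * from release_sock() as the socket's backlog receive handler.
 */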
1557 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1558 {
1559 struct sock *rsk;
1560
1561 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1562 struct dst_entry *dst = sk->sk_rx_dst;
1563
1564 sock_rps_save_rxhash(sk, skb);
1565 sk_mark_napi_id(sk, skb);
1566 if (dst) {
1567 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1568 !dst->ops->check(dst, 0)) {
1569 dst_release(dst);
1570 sk->sk_rx_dst = NULL;
1571 }
1572 }
1573 tcp_rcv_established(sk, skb);
1574 return 0;
1575 }
1576
1577 if (tcp_checksum_complete(skb))
1578 goto csum_err;
1579
1580 if (sk->sk_state == TCP_LISTEN) {
1581 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1582
1583 if (!nsk)
1584 goto discard;
1585 if (nsk != sk) {
1586 if (tcp_child_process(sk, nsk, skb)) {
1587 rsk = nsk;
1588 goto reset;
1589 }
1590 return 0;
1591 }
1592 } else
1593 sock_rps_save_rxhash(sk, skb);
1594
1595 if (tcp_rcv_state_process(sk, skb)) {
1596 rsk = sk;
1597 goto reset;
1598 }
1599 return 0;
1600
1601 reset:
1602 tcp_v4_send_reset(rsk, skb);
1603 discard:
1604 kfree_skb(skb);
1605 /* Be careful here. If this function gets more complicated and
1606 * gcc suffers from register pressure on the x86, sk (in %ebx)
1607 * might be destroyed here. This current version compiles correctly,
1608 * but you have been warned.
1609 */
1610 return 0;
1611
1612 csum_err:
1613 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1614 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1615 goto discard;
1616 }
1617 EXPORT_SYMBOL(tcp_v4_do_rcv);
1618
1619 int tcp_v4_early_demux(struct sk_buff *skb)
1620 {
1621 const struct iphdr *iph;
1622 const struct tcphdr *th;
1623 struct sock *sk;
1624
1625 if (skb->pkt_type != PACKET_HOST)
1626 return 0;
1627
1628 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1629 return 0;
1630
1631 iph = ip_hdr(skb);
1632 th = tcp_hdr(skb);
1633
1634 if (th->doff < sizeof(struct tcphdr) / 4)
1635 return 0;
1636
1637 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1638 iph->saddr, th->source,
1639 iph->daddr, ntohs(th->dest),
1640 skb->skb_iif, inet_sdif(skb));
1641 if (sk) {
1642 skb->sk = sk;
1643 skb->destructor = sock_edemux;
1644 if (sk_fullsock(sk)) {
1645 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1646
1647 if (dst)
1648 dst = dst_check(dst, 0);
1649 if (dst &&
1650 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1651 skb_dst_set_noref(skb, dst);
1652 }
1653 }
1654 return 0;
1655 }
1656
1657 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1658 {
1659 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1660 struct skb_shared_info *shinfo;
1661 const struct tcphdr *th;
1662 struct tcphdr *thtail;
1663 struct sk_buff *tail;
1664 unsigned int hdrlen;
1665 bool fragstolen;
1666 u32 gso_segs;
1667 int delta;
1668
1669 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1670 * we can fix skb->truesize to its real value to avoid future drops.
1671 * This is valid because skb is not yet charged to the socket.
1672 * It has been noticed that pure SACK packets were sometimes dropped
1673 * (if cooked by drivers without the copybreak feature).
1674 */
1675 skb_condense(skb);
1676
1677 skb_dst_drop(skb);
1678
1679 if (unlikely(tcp_checksum_complete(skb))) {
1680 bh_unlock_sock(sk);
1681 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1682 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1683 return true;
1684 }
1685
1686 /* Attempt coalescing to last skb in backlog, even if we are
1687 * above the limits.
1688 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1689 */
1690 th = (const struct tcphdr *)skb->data;
1691 hdrlen = th->doff * 4;
1692 shinfo = skb_shinfo(skb);
1693
1694 if (!shinfo->gso_size)
1695 shinfo->gso_size = skb->len - hdrlen;
1696
1697 if (!shinfo->gso_segs)
1698 shinfo->gso_segs = 1;
1699
1700 tail = sk->sk_backlog.tail;
1701 if (!tail)
1702 goto no_coalesce;
1703 thtail = (struct tcphdr *)tail->data;
1704
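	/* Descriptive note: only coalesce when the new segment directly
	 * follows the tail in sequence space, carries the same IP DS field,
	 * has no SYN/RST/URG on either skb, has ACK set on both, agrees on
	 * ECE/CWR and (with TLS_DEVICE) on the decrypted state, and has an
	 * identical TCP header length and identical TCP options.
	 */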
1705 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1706 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1707 ((TCP_SKB_CB(tail)->tcp_flags |
1708 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1709 !((TCP_SKB_CB(tail)->tcp_flags &
1710 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1711 ((TCP_SKB_CB(tail)->tcp_flags ^
1712 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1713 #ifdef CONFIG_TLS_DEVICE
1714 tail->decrypted != skb->decrypted ||
1715 #endif
1716 thtail->doff != th->doff ||
1717 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1718 goto no_coalesce;
1719
1720 __skb_pull(skb, hdrlen);
1721 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1722 thtail->window = th->window;
1723
1724 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1725
1726 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1727 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1728
1729 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1730 * thtail->fin, so that the fast path in tcp_rcv_established()
1731 * is not entered if we append a packet with a FIN.
1732 * SYN, RST, URG are not present.
1733 * ACK is set on both packets.
1734 * PSH: we do not really care in the TCP stack,
1735 * at least for 'GRO' packets.
1736 */
1737 thtail->fin |= th->fin;
1738 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1739
1740 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1741 TCP_SKB_CB(tail)->has_rxtstamp = true;
1742 tail->tstamp = skb->tstamp;
1743 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1744 }
1745
1746 /* Not as strict as GRO. We only need to carry mss max value */
1747 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1748 skb_shinfo(tail)->gso_size);
1749
1750 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1751 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1752
1753 sk->sk_backlog.len += delta;
1754 __NET_INC_STATS(sock_net(sk),
1755 LINUX_MIB_TCPBACKLOGCOALESCE);
1756 kfree_skb_partial(skb, fragstolen);
1757 return false;
1758 }
1759 __skb_push(skb, hdrlen);
1760
1761 no_coalesce:
1762 /* Only the socket owner can try to collapse/prune rx queues
1763 * to reduce memory overhead, so add a little headroom here.
1764 * Only a few socket backlogs are likely to be non-empty concurrently.
1765 */
1766 limit += 64*1024;
1767
1768 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1769 bh_unlock_sock(sk);
1770 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1771 return true;
1772 }
1773 return false;
1774 }
1775 EXPORT_SYMBOL(tcp_add_backlog);
1776
1777 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1778 {
1779 struct tcphdr *th = (struct tcphdr *)skb->data;
1780
1781 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1782 }
1783 EXPORT_SYMBOL(tcp_filter);
1784
1785 static void tcp_v4_restore_cb(struct sk_buff *skb)
1786 {
1787 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1788 sizeof(struct inet_skb_parm));
1789 }
1790
1791 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1792 const struct tcphdr *th)
1793 {
1794 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1795 * barrier() makes sure the compiler won't play fool^Waliasing games.
1796 */
1797 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1798 sizeof(struct inet_skb_parm));
1799 barrier();
1800
1801 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1802 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1803 skb->len - th->doff * 4);
1804 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1805 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1806 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1807 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1808 TCP_SKB_CB(skb)->sacked = 0;
1809 TCP_SKB_CB(skb)->has_rxtstamp =
1810 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1811 }
1812
1813 /*
1814 * From tcp_input.c
1815 */
1816
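/* Descriptive note: main IPv4 TCP receive routine. In outline it validates
 * the header and checksum, looks the segment up in the hash tables, handles
 * TIME-WAIT and NEW_SYN_RECV pseudo-sockets, runs the MD5 and socket filter
 * checks, and then either processes the segment directly via
 * tcp_v4_do_rcv() or queues it to the owner's backlog.
 */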
1817 int tcp_v4_rcv(struct sk_buff *skb)
1818 {
1819 struct net *net = dev_net(skb->dev);
1820 struct sk_buff *skb_to_free;
1821 int sdif = inet_sdif(skb);
1822 int dif = inet_iif(skb);
1823 const struct iphdr *iph;
1824 const struct tcphdr *th;
1825 bool refcounted;
1826 struct sock *sk;
1827 int ret;
1828
1829 if (skb->pkt_type != PACKET_HOST)
1830 goto discard_it;
1831
1832 /* Count it even if it's bad */
1833 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1834
1835 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1836 goto discard_it;
1837
1838 th = (const struct tcphdr *)skb->data;
1839
1840 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1841 goto bad_packet;
1842 if (!pskb_may_pull(skb, th->doff * 4))
1843 goto discard_it;
1844
1845 /* An explanation is required here, I think.
1846 * Packet length and doff are validated by header prediction,
1847 * provided the case of th->doff == 0 is eliminated.
1848 * So, we defer the checks. */
1849
1850 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1851 goto csum_error;
1852
1853 th = (const struct tcphdr *)skb->data;
1854 iph = ip_hdr(skb);
1855 lookup:
1856 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1857 th->dest, sdif, &refcounted);
1858 if (!sk)
1859 goto no_tcp_socket;
1860
1861 process:
1862 if (sk->sk_state == TCP_TIME_WAIT)
1863 goto do_time_wait;
1864
1865 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1866 struct request_sock *req = inet_reqsk(sk);
1867 bool req_stolen = false;
1868 struct sock *nsk;
1869
1870 sk = req->rsk_listener;
1871 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1872 sk_drops_add(sk, skb);
1873 reqsk_put(req);
1874 goto discard_it;
1875 }
1876 if (tcp_checksum_complete(skb)) {
1877 reqsk_put(req);
1878 goto csum_error;
1879 }
1880 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1881 inet_csk_reqsk_queue_drop_and_put(sk, req);
1882 goto lookup;
1883 }
1884 /* We own a reference on the listener, increase it again
1885 * as we might lose it too soon.
1886 */
1887 sock_hold(sk);
1888 refcounted = true;
1889 nsk = NULL;
1890 if (!tcp_filter(sk, skb)) {
1891 th = (const struct tcphdr *)skb->data;
1892 iph = ip_hdr(skb);
1893 tcp_v4_fill_cb(skb, iph, th);
1894 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1895 }
1896 if (!nsk) {
1897 reqsk_put(req);
1898 if (req_stolen) {
1899 /* Another cpu got exclusive access to req
1900 * and created a full blown socket.
1901 * Try to feed this packet to this socket
1902 * instead of discarding it.
1903 */
1904 tcp_v4_restore_cb(skb);
1905 sock_put(sk);
1906 goto lookup;
1907 }
1908 goto discard_and_relse;
1909 }
1910 if (nsk == sk) {
1911 reqsk_put(req);
1912 tcp_v4_restore_cb(skb);
1913 } else if (tcp_child_process(sk, nsk, skb)) {
1914 tcp_v4_send_reset(nsk, skb);
1915 goto discard_and_relse;
1916 } else {
1917 sock_put(sk);
1918 return 0;
1919 }
1920 }
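	/* Only the nsk == sk case falls through to here: the packet is then
	 * processed against the listener itself, exactly as if it had been
	 * looked up directly.
	 */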
1921 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1922 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1923 goto discard_and_relse;
1924 }
1925
1926 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1927 goto discard_and_relse;
1928
1929 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1930 goto discard_and_relse;
1931
1932 nf_reset_ct(skb);
1933
1934 if (tcp_filter(sk, skb))
1935 goto discard_and_relse;
1936 th = (const struct tcphdr *)skb->data;
1937 iph = ip_hdr(skb);
1938 tcp_v4_fill_cb(skb, iph, th);
1939
1940 skb->dev = NULL;
1941
1942 if (sk->sk_state == TCP_LISTEN) {
1943 ret = tcp_v4_do_rcv(sk, skb);
1944 goto put_and_return;
1945 }
1946
1947 sk_incoming_cpu_update(sk);
1948
1949 bh_lock_sock_nested(sk);
1950 tcp_segs_in(tcp_sk(sk), skb);
1951 ret = 0;
1952 if (!sock_owned_by_user(sk)) {
1953 skb_to_free = sk->sk_rx_skb_cache;
1954 sk->sk_rx_skb_cache = NULL;
1955 ret = tcp_v4_do_rcv(sk, skb);
1956 } else {
1957 if (tcp_add_backlog(sk, skb))
1958 goto discard_and_relse;
1959 skb_to_free = NULL;
1960 }
1961 bh_unlock_sock(sk);
1962 if (skb_to_free)
1963 __kfree_skb(skb_to_free);
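	/* If the socket was owned by a user-space caller, the segment is now
	 * sitting on the backlog and will be processed from release_sock();
	 * otherwise it was handled directly above. The cached rx skb was
	 * detached under the socket lock and freed only after bh_unlock_sock().
	 */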
1964
1965 put_and_return:
1966 if (refcounted)
1967 sock_put(sk);
1968
1969 return ret;
1970
1971 no_tcp_socket:
1972 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1973 goto discard_it;
1974
1975 tcp_v4_fill_cb(skb, iph, th);
1976
1977 if (tcp_checksum_complete(skb)) {
1978 csum_error:
1979 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1980 bad_packet:
1981 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1982 } else {
1983 tcp_v4_send_reset(NULL, skb);
1984 }
1985
1986 discard_it:
1987 /* Discard frame. */
1988 kfree_skb(skb);
1989 return 0;
1990
1991 discard_and_relse:
1992 sk_drops_add(sk, skb);
1993 if (refcounted)
1994 sock_put(sk);
1995 goto discard_it;
1996
1997 do_time_wait:
1998 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1999 inet_twsk_put(inet_twsk(sk));
2000 goto discard_it;
2001 }
2002
2003 tcp_v4_fill_cb(skb, iph, th);
2004
2005 if (tcp_checksum_complete(skb)) {
2006 inet_twsk_put(inet_twsk(sk));
2007 goto csum_error;
2008 }
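	/* tcp_timewait_state_process() decides how to dispose of a segment that
	 * hit a TIME-WAIT socket: hand a new SYN to a listener (TCP_TW_SYN),
	 * answer with an ACK (TCP_TW_ACK), reply with a reset (TCP_TW_RST),
	 * or silently drop it (TCP_TW_SUCCESS).
	 */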
2009 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2010 case TCP_TW_SYN: {
2011 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2012 &tcp_hashinfo, skb,
2013 __tcp_hdrlen(th),
2014 iph->saddr, th->source,
2015 iph->daddr, th->dest,
2016 inet_iif(skb),
2017 sdif);
2018 if (sk2) {
2019 inet_twsk_deschedule_put(inet_twsk(sk));
2020 sk = sk2;
2021 tcp_v4_restore_cb(skb);
2022 refcounted = false;
2023 goto process;
2024 }
2025 }
2026 /* to ACK */
2027 /* fall through */
2028 case TCP_TW_ACK:
2029 tcp_v4_timewait_ack(sk, skb);
2030 break;
2031 case TCP_TW_RST:
2032 tcp_v4_send_reset(sk, skb);
2033 inet_twsk_deschedule_put(inet_twsk(sk));
2034 goto discard_it;
2035 case TCP_TW_SUCCESS:;
2036 }
2037 goto discard_it;
2038 }
2039
2040 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2041 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2042 .twsk_unique = tcp_twsk_unique,
2043 .twsk_destructor = tcp_twsk_destructor,
2044 };
2045
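/* Cache the input route on the socket; the early-demux path can then attach
 * it to subsequent packets of this flow (arriving on rx_dst_ifindex) without
 * a fresh routing lookup.
 */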
2046 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2047 {
2048 struct dst_entry *dst = skb_dst(skb);
2049
2050 if (dst && dst_hold_safe(dst)) {
2051 sk->sk_rx_dst = dst;
2052 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2053 }
2054 }
2055 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2056
2057 const struct inet_connection_sock_af_ops ipv4_specific = {
2058 .queue_xmit = ip_queue_xmit,
2059 .send_check = tcp_v4_send_check,
2060 .rebuild_header = inet_sk_rebuild_header,
2061 .sk_rx_dst_set = inet_sk_rx_dst_set,
2062 .conn_request = tcp_v4_conn_request,
2063 .syn_recv_sock = tcp_v4_syn_recv_sock,
2064 .net_header_len = sizeof(struct iphdr),
2065 .setsockopt = ip_setsockopt,
2066 .getsockopt = ip_getsockopt,
2067 .addr2sockaddr = inet_csk_addr2sockaddr,
2068 .sockaddr_len = sizeof(struct sockaddr_in),
2069 #ifdef CONFIG_COMPAT
2070 .compat_setsockopt = compat_ip_setsockopt,
2071 .compat_getsockopt = compat_ip_getsockopt,
2072 #endif
2073 .mtu_reduced = tcp_v4_mtu_reduced,
2074 };
2075 EXPORT_SYMBOL(ipv4_specific);
2076
2077 #ifdef CONFIG_TCP_MD5SIG
2078 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2079 .md5_lookup = tcp_v4_md5_lookup,
2080 .calc_md5_hash = tcp_v4_md5_hash_skb,
2081 .md5_parse = tcp_v4_parse_md5_keys,
2082 };
2083 #endif
2084
2085 /* NOTE: many fields are already zeroed by the call to
2086 * sk_alloc(), so they need not be initialized here.
2087 */
2088 static int tcp_v4_init_sock(struct sock *sk)
2089 {
2090 struct inet_connection_sock *icsk = inet_csk(sk);
2091
2092 tcp_init_sock(sk);
2093
2094 icsk->icsk_af_ops = &ipv4_specific;
2095
2096 #ifdef CONFIG_TCP_MD5SIG
2097 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2098 #endif
2099
2100 return 0;
2101 }
2102
2103 void tcp_v4_destroy_sock(struct sock *sk)
2104 {
2105 struct tcp_sock *tp = tcp_sk(sk);
2106
2107 trace_tcp_destroy_sock(sk);
2108
2109 tcp_clear_xmit_timers(sk);
2110
2111 tcp_cleanup_congestion_control(sk);
2112
2113 tcp_cleanup_ulp(sk);
2114
2115 /* Clean up the write buffer. */
2116 tcp_write_queue_purge(sk);
2117
2118 /* Check if we want to disable active TFO */
2119 tcp_fastopen_active_disable_ofo_check(sk);
2120
2121 /* Cleans up our, hopefully empty, out_of_order_queue. */
2122 skb_rbtree_purge(&tp->out_of_order_queue);
2123
2124 #ifdef CONFIG_TCP_MD5SIG
2125 /* Clean up the MD5 key list, if any */
2126 if (tp->md5sig_info) {
2127 tcp_clear_md5_list(sk);
2128 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2129 tp->md5sig_info = NULL;
2130 }
2131 #endif
2132
2133 /* Clean up a referenced TCP bind bucket. */
2134 if (inet_csk(sk)->icsk_bind_hash)
2135 inet_put_port(sk);
2136
2137 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2138
2139 /* If the socket was aborted during the connect operation */
2140 tcp_free_fastopen_req(tp);
2141 tcp_fastopen_destroy_cipher(sk);
2142 tcp_saved_syn_free(tp);
2143
2144 sk_sockets_allocated_dec(sk);
2145 }
2146 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2147
2148 #ifdef CONFIG_PROC_FS
2149 /* Proc filesystem TCP sock list dumping. */
2150
2151 /*
2152 * Get the next listening socket following cur. If cur is NULL, get the
2153 * first socket, starting from the bucket given in st->bucket; when
2154 * st->bucket is zero the very first socket in the hash table is returned.
2155 */
2156 static void *listening_get_next(struct seq_file *seq, void *cur)
2157 {
2158 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2159 struct tcp_iter_state *st = seq->private;
2160 struct net *net = seq_file_net(seq);
2161 struct inet_listen_hashbucket *ilb;
2162 struct hlist_nulls_node *node;
2163 struct sock *sk = cur;
2164
2165 if (!sk) {
2166 get_head:
2167 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2168 spin_lock(&ilb->lock);
2169 sk = sk_nulls_head(&ilb->nulls_head);
2170 st->offset = 0;
2171 goto get_sk;
2172 }
2173 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2174 ++st->num;
2175 ++st->offset;
2176
2177 sk = sk_nulls_next(sk);
2178 get_sk:
2179 sk_nulls_for_each_from(sk, node) {
2180 if (!net_eq(sock_net(sk), net))
2181 continue;
2182 if (sk->sk_family == afinfo->family)
2183 return sk;
2184 }
2185 spin_unlock(&ilb->lock);
2186 st->offset = 0;
2187 if (++st->bucket < INET_LHTABLE_SIZE)
2188 goto get_head;
2189 return NULL;
2190 }
2191
2192 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2193 {
2194 struct tcp_iter_state *st = seq->private;
2195 void *rc;
2196
2197 st->bucket = 0;
2198 st->offset = 0;
2199 rc = listening_get_next(seq, NULL);
2200
2201 while (rc && *pos) {
2202 rc = listening_get_next(seq, rc);
2203 --*pos;
2204 }
2205 return rc;
2206 }
2207
2208 static inline bool empty_bucket(const struct tcp_iter_state *st)
2209 {
2210 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2211 }
2212
2213 /*
2214 * Get the first established socket, starting from the bucket given in st->bucket.
2215 * If st->bucket is zero, the very first socket in the hash is returned.
2216 */
2217 static void *established_get_first(struct seq_file *seq)
2218 {
2219 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2220 struct tcp_iter_state *st = seq->private;
2221 struct net *net = seq_file_net(seq);
2222 void *rc = NULL;
2223
2224 st->offset = 0;
2225 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2226 struct sock *sk;
2227 struct hlist_nulls_node *node;
2228 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2229
2230 /* Lockless fast path for the common case of empty buckets */
2231 if (empty_bucket(st))
2232 continue;
2233
2234 spin_lock_bh(lock);
2235 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2236 if (sk->sk_family != afinfo->family ||
2237 !net_eq(sock_net(sk), net)) {
2238 continue;
2239 }
2240 rc = sk;
2241 goto out;
2242 }
2243 spin_unlock_bh(lock);
2244 }
2245 out:
2246 return rc;
2247 }
2248
2249 static void *established_get_next(struct seq_file *seq, void *cur)
2250 {
2251 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2252 struct sock *sk = cur;
2253 struct hlist_nulls_node *node;
2254 struct tcp_iter_state *st = seq->private;
2255 struct net *net = seq_file_net(seq);
2256
2257 ++st->num;
2258 ++st->offset;
2259
2260 sk = sk_nulls_next(sk);
2261
2262 sk_nulls_for_each_from(sk, node) {
2263 if (sk->sk_family == afinfo->family &&
2264 net_eq(sock_net(sk), net))
2265 return sk;
2266 }
2267
2268 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2269 ++st->bucket;
2270 return established_get_first(seq);
2271 }
2272
2273 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2274 {
2275 struct tcp_iter_state *st = seq->private;
2276 void *rc;
2277
2278 st->bucket = 0;
2279 rc = established_get_first(seq);
2280
2281 while (rc && pos) {
2282 rc = established_get_next(seq, rc);
2283 --pos;
2284 }
2285 return rc;
2286 }
2287
2288 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2289 {
2290 void *rc;
2291 struct tcp_iter_state *st = seq->private;
2292
2293 st->state = TCP_SEQ_STATE_LISTENING;
2294 rc = listening_get_idx(seq, &pos);
2295
2296 if (!rc) {
2297 st->state = TCP_SEQ_STATE_ESTABLISHED;
2298 rc = established_get_idx(seq, pos);
2299 }
2300
2301 return rc;
2302 }
2303
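/* Resume the walk at the bucket/offset recorded by the previous read so that
 * sequential reads of /proc/net/tcp do not rescan the hash tables from the
 * beginning for every chunk.
 */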
2304 static void *tcp_seek_last_pos(struct seq_file *seq)
2305 {
2306 struct tcp_iter_state *st = seq->private;
2307 int offset = st->offset;
2308 int orig_num = st->num;
2309 void *rc = NULL;
2310
2311 switch (st->state) {
2312 case TCP_SEQ_STATE_LISTENING:
2313 if (st->bucket >= INET_LHTABLE_SIZE)
2314 break;
2315 st->state = TCP_SEQ_STATE_LISTENING;
2316 rc = listening_get_next(seq, NULL);
2317 while (offset-- && rc)
2318 rc = listening_get_next(seq, rc);
2319 if (rc)
2320 break;
2321 st->bucket = 0;
2322 st->state = TCP_SEQ_STATE_ESTABLISHED;
2323 /* Fallthrough */
2324 case TCP_SEQ_STATE_ESTABLISHED:
2325 if (st->bucket > tcp_hashinfo.ehash_mask)
2326 break;
2327 rc = established_get_first(seq);
2328 while (offset-- && rc)
2329 rc = established_get_next(seq, rc);
2330 }
2331
2332 st->num = orig_num;
2333
2334 return rc;
2335 }
2336
2337 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2338 {
2339 struct tcp_iter_state *st = seq->private;
2340 void *rc;
2341
2342 if (*pos && *pos == st->last_pos) {
2343 rc = tcp_seek_last_pos(seq);
2344 if (rc)
2345 goto out;
2346 }
2347
2348 st->state = TCP_SEQ_STATE_LISTENING;
2349 st->num = 0;
2350 st->bucket = 0;
2351 st->offset = 0;
2352 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2353
2354 out:
2355 st->last_pos = *pos;
2356 return rc;
2357 }
2358 EXPORT_SYMBOL(tcp_seq_start);
2359
2360 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2361 {
2362 struct tcp_iter_state *st = seq->private;
2363 void *rc = NULL;
2364
2365 if (v == SEQ_START_TOKEN) {
2366 rc = tcp_get_idx(seq, 0);
2367 goto out;
2368 }
2369
2370 switch (st->state) {
2371 case TCP_SEQ_STATE_LISTENING:
2372 rc = listening_get_next(seq, v);
2373 if (!rc) {
2374 st->state = TCP_SEQ_STATE_ESTABLISHED;
2375 st->bucket = 0;
2376 st->offset = 0;
2377 rc = established_get_first(seq);
2378 }
2379 break;
2380 case TCP_SEQ_STATE_ESTABLISHED:
2381 rc = established_get_next(seq, v);
2382 break;
2383 }
2384 out:
2385 ++*pos;
2386 st->last_pos = *pos;
2387 return rc;
2388 }
2389 EXPORT_SYMBOL(tcp_seq_next);
2390
2391 void tcp_seq_stop(struct seq_file *seq, void *v)
2392 {
2393 struct tcp_iter_state *st = seq->private;
2394
2395 switch (st->state) {
2396 case TCP_SEQ_STATE_LISTENING:
2397 if (v != SEQ_START_TOKEN)
2398 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2399 break;
2400 case TCP_SEQ_STATE_ESTABLISHED:
2401 if (v)
2402 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403 break;
2404 }
2405 }
2406 EXPORT_SYMBOL(tcp_seq_stop);
2407
2408 static void get_openreq4(const struct request_sock *req,
2409 struct seq_file *f, int i)
2410 {
2411 const struct inet_request_sock *ireq = inet_rsk(req);
2412 long delta = req->rsk_timer.expires - jiffies;
2413
2414 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2415 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2416 i,
2417 ireq->ir_loc_addr,
2418 ireq->ir_num,
2419 ireq->ir_rmt_addr,
2420 ntohs(ireq->ir_rmt_port),
2421 TCP_SYN_RECV,
2422 0, 0, /* could print option size, but that is af dependent. */
2423 1, /* timers active (only the expire timer) */
2424 jiffies_delta_to_clock_t(delta),
2425 req->num_timeout,
2426 from_kuid_munged(seq_user_ns(f),
2427 sock_i_uid(req->rsk_listener)),
2428 0, /* non-standard timer */
2429 0, /* open_requests have no inode */
2430 0,
2431 req);
2432 }
2433
2434 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2435 {
2436 int timer_active;
2437 unsigned long timer_expires;
2438 const struct tcp_sock *tp = tcp_sk(sk);
2439 const struct inet_connection_sock *icsk = inet_csk(sk);
2440 const struct inet_sock *inet = inet_sk(sk);
2441 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2442 __be32 dest = inet->inet_daddr;
2443 __be32 src = inet->inet_rcv_saddr;
2444 __u16 destp = ntohs(inet->inet_dport);
2445 __u16 srcp = ntohs(inet->inet_sport);
2446 int rx_queue;
2447 int state;
2448
2449 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2450 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2451 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2452 timer_active = 1;
2453 timer_expires = icsk->icsk_timeout;
2454 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2455 timer_active = 4;
2456 timer_expires = icsk->icsk_timeout;
2457 } else if (timer_pending(&sk->sk_timer)) {
2458 timer_active = 2;
2459 timer_expires = sk->sk_timer.expires;
2460 } else {
2461 timer_active = 0;
2462 timer_expires = jiffies;
2463 }
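	/* timer_active matches the "tr" column of /proc/net/tcp: 1 means
	 * retransmit/loss-probe, 2 keepalive (sk_timer), 4 zero-window probe,
	 * 0 no pending timer; TIME-WAIT sockets report 3 (see
	 * get_timewait4_sock()).
	 */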
2464
2465 state = inet_sk_state_load(sk);
2466 if (state == TCP_LISTEN)
2467 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2468 else
2469 /* Because we don't lock the socket,
2470 * we might find a transient negative value.
2471 */
2472 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2473 READ_ONCE(tp->copied_seq), 0);
2474
2475 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2476 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2477 i, src, srcp, dest, destp, state,
2478 READ_ONCE(tp->write_seq) - tp->snd_una,
2479 rx_queue,
2480 timer_active,
2481 jiffies_delta_to_clock_t(timer_expires - jiffies),
2482 icsk->icsk_retransmits,
2483 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2484 icsk->icsk_probes_out,
2485 sock_i_ino(sk),
2486 refcount_read(&sk->sk_refcnt), sk,
2487 jiffies_to_clock_t(icsk->icsk_rto),
2488 jiffies_to_clock_t(icsk->icsk_ack.ato),
2489 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2490 tp->snd_cwnd,
2491 state == TCP_LISTEN ?
2492 fastopenq->max_qlen :
2493 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2494 }
2495
2496 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2497 struct seq_file *f, int i)
2498 {
2499 long delta = tw->tw_timer.expires - jiffies;
2500 __be32 dest, src;
2501 __u16 destp, srcp;
2502
2503 dest = tw->tw_daddr;
2504 src = tw->tw_rcv_saddr;
2505 destp = ntohs(tw->tw_dport);
2506 srcp = ntohs(tw->tw_sport);
2507
2508 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2509 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2510 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2511 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2512 refcount_read(&tw->tw_refcnt), tw);
2513 }
2514
2515 #define TMPSZ 150
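/* Every record is padded with spaces to TMPSZ - 1 characters (plus the
 * trailing newline) via seq_setwidth()/seq_pad(), giving the fixed-width
 * lines that legacy /proc/net/tcp parsers expect.
 */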
2516
2517 static int tcp4_seq_show(struct seq_file *seq, void *v)
2518 {
2519 struct tcp_iter_state *st;
2520 struct sock *sk = v;
2521
2522 seq_setwidth(seq, TMPSZ - 1);
2523 if (v == SEQ_START_TOKEN) {
2524 seq_puts(seq, " sl local_address rem_address st tx_queue "
2525 "rx_queue tr tm->when retrnsmt uid timeout "
2526 "inode");
2527 goto out;
2528 }
2529 st = seq->private;
2530
2531 if (sk->sk_state == TCP_TIME_WAIT)
2532 get_timewait4_sock(v, seq, st->num);
2533 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2534 get_openreq4(v, seq, st->num);
2535 else
2536 get_tcp4_sock(v, seq, st->num);
2537 out:
2538 seq_pad(seq, '\n');
2539 return 0;
2540 }
2541
2542 static const struct seq_operations tcp4_seq_ops = {
2543 .show = tcp4_seq_show,
2544 .start = tcp_seq_start,
2545 .next = tcp_seq_next,
2546 .stop = tcp_seq_stop,
2547 };
2548
2549 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2550 .family = AF_INET,
2551 };
2552
2553 static int __net_init tcp4_proc_init_net(struct net *net)
2554 {
2555 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2556 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2557 return -ENOMEM;
2558 return 0;
2559 }
2560
2561 static void __net_exit tcp4_proc_exit_net(struct net *net)
2562 {
2563 remove_proc_entry("tcp", net->proc_net);
2564 }
2565
2566 static struct pernet_operations tcp4_net_ops = {
2567 .init = tcp4_proc_init_net,
2568 .exit = tcp4_proc_exit_net,
2569 };
2570
2571 int __init tcp4_proc_init(void)
2572 {
2573 return register_pernet_subsys(&tcp4_net_ops);
2574 }
2575
2576 void tcp4_proc_exit(void)
2577 {
2578 unregister_pernet_subsys(&tcp4_net_ops);
2579 }
2580 #endif /* CONFIG_PROC_FS */
2581
2582 struct proto tcp_prot = {
2583 .name = "TCP",
2584 .owner = THIS_MODULE,
2585 .close = tcp_close,
2586 .pre_connect = tcp_v4_pre_connect,
2587 .connect = tcp_v4_connect,
2588 .disconnect = tcp_disconnect,
2589 .accept = inet_csk_accept,
2590 .ioctl = tcp_ioctl,
2591 .init = tcp_v4_init_sock,
2592 .destroy = tcp_v4_destroy_sock,
2593 .shutdown = tcp_shutdown,
2594 .setsockopt = tcp_setsockopt,
2595 .getsockopt = tcp_getsockopt,
2596 .keepalive = tcp_set_keepalive,
2597 .recvmsg = tcp_recvmsg,
2598 .sendmsg = tcp_sendmsg,
2599 .sendpage = tcp_sendpage,
2600 .backlog_rcv = tcp_v4_do_rcv,
2601 .release_cb = tcp_release_cb,
2602 .hash = inet_hash,
2603 .unhash = inet_unhash,
2604 .get_port = inet_csk_get_port,
2605 .enter_memory_pressure = tcp_enter_memory_pressure,
2606 .leave_memory_pressure = tcp_leave_memory_pressure,
2607 .stream_memory_free = tcp_stream_memory_free,
2608 .sockets_allocated = &tcp_sockets_allocated,
2609 .orphan_count = &tcp_orphan_count,
2610 .memory_allocated = &tcp_memory_allocated,
2611 .memory_pressure = &tcp_memory_pressure,
2612 .sysctl_mem = sysctl_tcp_mem,
2613 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2614 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2615 .max_header = MAX_TCP_HEADER,
2616 .obj_size = sizeof(struct tcp_sock),
2617 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2618 .twsk_prot = &tcp_timewait_sock_ops,
2619 .rsk_prot = &tcp_request_sock_ops,
2620 .h.hashinfo = &tcp_hashinfo,
2621 .no_autobind = true,
2622 #ifdef CONFIG_COMPAT
2623 .compat_setsockopt = compat_tcp_setsockopt,
2624 .compat_getsockopt = compat_tcp_getsockopt,
2625 #endif
2626 .diag_destroy = tcp_abort,
2627 };
2628 EXPORT_SYMBOL(tcp_prot);
2629
2630 static void __net_exit tcp_sk_exit(struct net *net)
2631 {
2632 int cpu;
2633
2634 if (net->ipv4.tcp_congestion_control)
2635 module_put(net->ipv4.tcp_congestion_control->owner);
2636
2637 for_each_possible_cpu(cpu)
2638 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2639 free_percpu(net->ipv4.tcp_sk);
2640 }
2641
2642 static int __net_init tcp_sk_init(struct net *net)
2643 {
2644 int res, cpu, cnt;
2645
2646 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2647 if (!net->ipv4.tcp_sk)
2648 return -ENOMEM;
2649
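	/* One kernel control socket per possible CPU: these are what the
	 * stateless reply paths (RSTs for unknown connections, TIME-WAIT
	 * ACKs) transmit through.
	 */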
2650 for_each_possible_cpu(cpu) {
2651 struct sock *sk;
2652
2653 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2654 IPPROTO_TCP, net);
2655 if (res)
2656 goto fail;
2657 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2658
2659 /* Enforce IP_DF and IPID == 0 for the RSTs and
2660 * ACKs sent in SYN-RECV and TIME-WAIT state.
2661 */
2662 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2663
2664 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2665 }
2666
2667 net->ipv4.sysctl_tcp_ecn = 2;
2668 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2669
2670 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2671 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2672 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2673 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2674 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2675
2676 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2677 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2678 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2679
2680 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2681 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2682 net->ipv4.sysctl_tcp_syncookies = 1;
2683 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2684 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2685 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2686 net->ipv4.sysctl_tcp_orphan_retries = 0;
2687 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2688 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
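	/* tcp_tw_reuse: 0 = disabled, 1 = reuse TIME-WAIT sockets for new
	 * outgoing connections, 2 = allow that for loopback traffic only.
	 */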
2689 net->ipv4.sysctl_tcp_tw_reuse = 2;
2690 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2691
2692 cnt = tcp_hashinfo.ehash_mask + 1;
2693 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2694 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2695
2696 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2697 net->ipv4.sysctl_tcp_sack = 1;
2698 net->ipv4.sysctl_tcp_window_scaling = 1;
2699 net->ipv4.sysctl_tcp_timestamps = 1;
2700 net->ipv4.sysctl_tcp_early_retrans = 3;
2701 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2702 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2703 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2704 net->ipv4.sysctl_tcp_max_reordering = 300;
2705 net->ipv4.sysctl_tcp_dsack = 1;
2706 net->ipv4.sysctl_tcp_app_win = 31;
2707 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2708 net->ipv4.sysctl_tcp_frto = 2;
2709 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2710 /* This limits the percentage of the congestion window which we
2711 * will allow a single TSO frame to consume. Building TSO frames
2712 * which are too large can cause TCP streams to be bursty.
2713 */
2714 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2715 /* Default TSQ limit of 16 TSO segments */
2716 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2717 /* RFC 5961 challenge ACK rate limiting */
2718 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2719 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2720 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2721 net->ipv4.sysctl_tcp_autocorking = 1;
2722 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2723 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2724 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2725 if (net != &init_net) {
2726 memcpy(net->ipv4.sysctl_tcp_rmem,
2727 init_net.ipv4.sysctl_tcp_rmem,
2728 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2729 memcpy(net->ipv4.sysctl_tcp_wmem,
2730 init_net.ipv4.sysctl_tcp_wmem,
2731 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2732 }
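	/* Child namespaces start out with whatever tcp_{r,w}mem limits the
	 * initial namespace has at creation time; init_net itself keeps the
	 * defaults established at boot.
	 */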
2733 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2734 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2735 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2736 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2737 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2738 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2739
2740 /* Reno is always built in */
2741 if (!net_eq(net, &init_net) &&
2742 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2743 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2744 else
2745 net->ipv4.tcp_congestion_control = &tcp_reno;
2746
2747 return 0;
2748 fail:
2749 tcp_sk_exit(net);
2750
2751 return res;
2752 }
2753
2754 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2755 {
2756 struct net *net;
2757
2758 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2759
2760 list_for_each_entry(net, net_exit_list, exit_list)
2761 tcp_fastopen_ctx_destroy(net);
2762 }
2763
2764 static struct pernet_operations __net_initdata tcp_sk_ops = {
2765 .init = tcp_sk_init,
2766 .exit = tcp_sk_exit,
2767 .exit_batch = tcp_sk_exit_batch,
2768 };
2769
2770 void __init tcp_v4_init(void)
2771 {
2772 if (register_pernet_subsys(&tcp_sk_ops))
2773 panic("Failed to create the TCP control socket.\n");
2774 }