1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82
83 #include <trace/events/tcp.h>
84
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
96 ip_hdr(skb)->saddr,
97 tcp_hdr(skb)->dest,
98 tcp_hdr(skb)->source);
99 }
100
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 struct tcp_sock *tp = tcp_sk(sk);
111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112
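	/* Illustrative note (tcp_tw_reuse entry in the ip-sysctl documentation):
	 *   0 - do not reuse TIME-WAIT sockets for new outgoing connections
	 *   1 - reuse when it is safe from the protocol viewpoint (see the
	 *       PAWS discussion below)
	 *   2 - reuse only for connections over loopback, which is what the
	 *       reuse == 2 branch below checks
	 */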
113 if (reuse == 2) {
114 /* Still does not detect *everything* that goes through
115 * lo, since we require a loopback src or dst address
116 * or direct binding to 'lo' interface.
117 */
118 bool loopback = false;
119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 if (tw->tw_family == AF_INET6) {
123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
127 loopback = true;
128 } else
129 #endif
130 {
131 if (ipv4_is_loopback(tw->tw_daddr) ||
132 ipv4_is_loopback(tw->tw_rcv_saddr))
133 loopback = true;
134 }
135 if (!loopback)
136 reuse = 0;
137 }
138
139 /* With PAWS, it is safe from the viewpoint
140 of data integrity. Even without PAWS it is safe provided sequence
141 spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
142 
143 Actually, the idea is close to VJ's: only the timestamp cache is
144 held not per host but per port pair, and the TW bucket is used as
145 the state holder.
146 
147 If the TW bucket has already been destroyed we fall back to VJ's
148 scheme and use the initial timestamp retrieved from the peer table.
149 */
150 if (tcptw->tw_ts_recent_stamp &&
151 (!twp || (reuse && time_after32(ktime_get_seconds(),
152 tcptw->tw_ts_recent_stamp)))) {
153 /* In case of repair and re-using TIME-WAIT sockets we still
154 * want to be sure that it is safe as above but honor the
155 * sequence numbers and time stamps set as part of the repair
156 * process.
157 *
158 * Without this check re-using a TIME-WAIT socket with TCP
159 * repair would accumulate a -1 on the repair assigned
160 * sequence number. The first time it is reused the sequence
161 * is -1, the second time -2, etc. This fixes that issue
162 * without appearing to create any others.
163 */
164 if (likely(!tp->repair)) {
165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
166
167 if (!seq)
168 seq = 1;
169 WRITE_ONCE(tp->write_seq, seq);
170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 }
173 sock_hold(sktw);
174 return 1;
175 }
176
177 return 0;
178 }
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 int addr_len)
183 {
184 /* This check is replicated from tcp_v4_connect() and intended to
185 * prevent the BPF program called below from accessing bytes that are
186 * outside the bound specified by the user in addr_len.
187 */
188 if (addr_len < sizeof(struct sockaddr_in))
189 return -EINVAL;
190
191 sock_owned_by_me(sk);
192
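	/* Descriptive note: this runs any BPF programs attached to the
	 * cgroup at the BPF_CGROUP_INET4_CONNECT hook; such programs may
	 * inspect or rewrite the destination in uaddr before
	 * tcp_v4_connect() itself runs.
	 */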
193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194 }
195
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 {
199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 struct inet_sock *inet = inet_sk(sk);
201 struct tcp_sock *tp = tcp_sk(sk);
202 __be16 orig_sport, orig_dport;
203 __be32 daddr, nexthop;
204 struct flowi4 *fl4;
205 struct rtable *rt;
206 int err;
207 struct ip_options_rcu *inet_opt;
208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
209
210 if (addr_len < sizeof(struct sockaddr_in))
211 return -EINVAL;
212
213 if (usin->sin_family != AF_INET)
214 return -EAFNOSUPPORT;
215
216 nexthop = daddr = usin->sin_addr.s_addr;
217 inet_opt = rcu_dereference_protected(inet->inet_opt,
218 lockdep_sock_is_held(sk));
219 if (inet_opt && inet_opt->opt.srr) {
220 if (!daddr)
221 return -EINVAL;
222 nexthop = inet_opt->opt.faddr;
223 }
224
225 orig_sport = inet->inet_sport;
226 orig_dport = usin->sin_port;
227 fl4 = &inet->cork.fl.u.ip4;
228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 IPPROTO_TCP,
231 orig_sport, orig_dport, sk);
232 if (IS_ERR(rt)) {
233 err = PTR_ERR(rt);
234 if (err == -ENETUNREACH)
235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 return err;
237 }
238
239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 ip_rt_put(rt);
241 return -ENETUNREACH;
242 }
243
244 if (!inet_opt || !inet_opt->opt.srr)
245 daddr = fl4->daddr;
246
247 if (!inet->inet_saddr)
248 inet->inet_saddr = fl4->saddr;
249 sk_rcv_saddr_set(sk, inet->inet_saddr);
250
251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 /* Reset inherited state */
253 tp->rx_opt.ts_recent = 0;
254 tp->rx_opt.ts_recent_stamp = 0;
255 if (likely(!tp->repair))
256 WRITE_ONCE(tp->write_seq, 0);
257 }
258
259 inet->inet_dport = usin->sin_port;
260 sk_daddr_set(sk, daddr);
261
262 inet_csk(sk)->icsk_ext_hdr_len = 0;
263 if (inet_opt)
264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
265
266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
267
268 /* Socket identity is still unknown (sport may be zero).
269 * However we set the state to SYN-SENT and, without releasing the
270 * socket lock, select a source port, enter ourselves into the hash
271 * tables and complete initialization after this.
272 */
273 tcp_set_state(sk, TCP_SYN_SENT);
274 err = inet_hash_connect(tcp_death_row, sk);
275 if (err)
276 goto failure;
277
278 sk_set_txhash(sk);
279
280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 inet->inet_sport, inet->inet_dport, sk);
282 if (IS_ERR(rt)) {
283 err = PTR_ERR(rt);
284 rt = NULL;
285 goto failure;
286 }
287 /* OK, now commit destination to socket. */
288 sk->sk_gso_type = SKB_GSO_TCPV4;
289 sk_setup_caps(sk, &rt->dst);
290 rt = NULL;
291
292 if (likely(!tp->repair)) {
293 if (!tp->write_seq)
294 WRITE_ONCE(tp->write_seq,
295 secure_tcp_seq(inet->inet_saddr,
296 inet->inet_daddr,
297 inet->inet_sport,
298 usin->sin_port));
299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
300 inet->inet_saddr,
301 inet->inet_daddr);
302 }
303
304 inet->inet_id = prandom_u32();
305
306 if (tcp_fastopen_defer_connect(sk, &err))
307 return err;
308 if (err)
309 goto failure;
310
311 err = tcp_connect(sk);
312
313 if (err)
314 goto failure;
315
316 return 0;
317
318 failure:
319 /*
320 * This unhashes the socket and releases the local port,
321 * if necessary.
322 */
323 tcp_set_state(sk, TCP_CLOSE);
324 ip_rt_put(rt);
325 sk->sk_route_caps = 0;
326 inet->inet_dport = 0;
327 return err;
328 }
329 EXPORT_SYMBOL(tcp_v4_connect);
330
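/* Illustrative userspace sketch (not part of this file; assumes the plain
 * BSD socket API and an already-created fd): a connect() on an AF_INET
 * stream socket reaches tcp_v4_connect() via inet_stream_connect():
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { .s_addr = inet_addr("192.0.2.1") },
 *	};
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */
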
331 /*
332 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333 * It can be called through tcp_release_cb() if socket was owned by user
334 * at the time tcp_v4_err() was called to handle ICMP message.
335 */
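/* Descriptive note: when the ICMP arrives while the socket is owned by the
 * user, tcp_v4_err() only records tp->mtu_info and sets the
 * TCP_MTU_REDUCED_DEFERRED flag (see the ICMP_FRAG_NEEDED branch below);
 * tcp_release_cb() then invokes this handler once the lock is released.
 */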
336 void tcp_v4_mtu_reduced(struct sock *sk)
337 {
338 struct inet_sock *inet = inet_sk(sk);
339 struct dst_entry *dst;
340 u32 mtu;
341
342 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 return;
344 mtu = tcp_sk(sk)->mtu_info;
345 dst = inet_csk_update_pmtu(sk, mtu);
346 if (!dst)
347 return;
348
349 /* Something is about to go wrong... Remember the soft error
350 * in case this connection will not be able to recover.
351 */
352 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 sk->sk_err_soft = EMSGSIZE;
354
355 mtu = dst_mtu(dst);
356
357 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 ip_sk_accept_pmtu(sk) &&
359 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 tcp_sync_mss(sk, mtu);
361
362 /* Resend the TCP packet because it's
363 * clear that the old packet has been
364 * dropped. This is the new "fast" path mtu
365 * discovery.
366 */
367 tcp_simple_retransmit(sk);
368 } /* else let the usual retransmit timer handle it */
369 }
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 {
374 struct dst_entry *dst = __sk_dst_check(sk, 0);
375
376 if (dst)
377 dst->ops->redirect(dst, sk, skb);
378 }
379
380
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 {
384 struct request_sock *req = inet_reqsk(sk);
385 struct net *net = sock_net(sk);
386
387 /* ICMPs are not backlogged, hence we cannot get
388 * an established socket here.
389 */
390 if (seq != tcp_rsk(req)->snt_isn) {
391 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
392 } else if (abort) {
393 /*
394 * Still in SYN_RECV, just remove it silently.
395 * There is no good way to pass the error to the newly
396 * created socket, and POSIX does not want network
397 * errors returned from accept().
398 */
399 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 tcp_listendrop(req->rsk_listener);
401 }
402 reqsk_put(req);
403 }
404 EXPORT_SYMBOL(tcp_req_err);
405
406 /*
407 * This routine is called by the ICMP module when it gets some
408 * sort of error condition. If err < 0 then the socket should
409 * be closed and the error returned to the user. If err > 0
410 * it's just the icmp type << 8 | icmp code. After adjustment
411 * header points to the first 8 bytes of the tcp header. We need
412 * to find the appropriate port.
413 *
414 * The locking strategy used here is very "optimistic". When
415 * someone else accesses the socket the ICMP is just dropped
416 * and for some paths there is no check at all.
417 * A more general error queue to queue errors for later handling
418 * is probably better.
419 *
420 */
421
422 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
423 {
424 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
425 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
426 struct inet_connection_sock *icsk;
427 struct tcp_sock *tp;
428 struct inet_sock *inet;
429 const int type = icmp_hdr(icmp_skb)->type;
430 const int code = icmp_hdr(icmp_skb)->code;
431 struct sock *sk;
432 struct sk_buff *skb;
433 struct request_sock *fastopen;
434 u32 seq, snd_una;
435 s32 remaining;
436 u32 delta_us;
437 int err;
438 struct net *net = dev_net(icmp_skb->dev);
439
440 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
441 th->dest, iph->saddr, ntohs(th->source),
442 inet_iif(icmp_skb), 0);
443 if (!sk) {
444 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
445 return -ENOENT;
446 }
447 if (sk->sk_state == TCP_TIME_WAIT) {
448 inet_twsk_put(inet_twsk(sk));
449 return 0;
450 }
451 seq = ntohl(th->seq);
452 if (sk->sk_state == TCP_NEW_SYN_RECV) {
453 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
454 type == ICMP_TIME_EXCEEDED ||
455 (type == ICMP_DEST_UNREACH &&
456 (code == ICMP_NET_UNREACH ||
457 code == ICMP_HOST_UNREACH)));
458 return 0;
459 }
460
461 bh_lock_sock(sk);
462 /* If too many ICMPs get dropped on busy
463 * servers this needs to be solved differently.
464 * We do take care of the PMTU discovery (RFC1191) special case:
465 * we can receive locally generated ICMP messages while the socket is held.
466 */
467 if (sock_owned_by_user(sk)) {
468 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
469 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
470 }
471 if (sk->sk_state == TCP_CLOSE)
472 goto out;
473
474 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
475 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
476 goto out;
477 }
478
479 icsk = inet_csk(sk);
480 tp = tcp_sk(sk);
481 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
482 fastopen = rcu_dereference(tp->fastopen_rsk);
483 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
484 if (sk->sk_state != TCP_LISTEN &&
485 !between(seq, snd_una, tp->snd_nxt)) {
486 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
487 goto out;
488 }
489
490 switch (type) {
491 case ICMP_REDIRECT:
492 if (!sock_owned_by_user(sk))
493 do_redirect(icmp_skb, sk);
494 goto out;
495 case ICMP_SOURCE_QUENCH:
496 /* Just silently ignore these. */
497 goto out;
498 case ICMP_PARAMETERPROB:
499 err = EPROTO;
500 break;
501 case ICMP_DEST_UNREACH:
502 if (code > NR_ICMP_UNREACH)
503 goto out;
504
505 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
506 /* We are not interested in TCP_LISTEN and open_requests
507 * (SYN-ACKs sent out by Linux are always < 576 bytes so
508 * they should go through unfragmented).
509 */
510 if (sk->sk_state == TCP_LISTEN)
511 goto out;
512
513 tp->mtu_info = info;
514 if (!sock_owned_by_user(sk)) {
515 tcp_v4_mtu_reduced(sk);
516 } else {
517 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
518 sock_hold(sk);
519 }
520 goto out;
521 }
522
523 err = icmp_err_convert[code].errno;
524 /* check if icmp_skb allows revert of backoff
525 * (see draft-zimmermann-tcp-lcd) */
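		/* Descriptive note: a net/host unreachable ICMP for the
		 * segment at snd_una suggests the retransmission was lost to
		 * a connectivity disruption rather than congestion, so one
		 * step of exponential backoff is reverted below and the
		 * retransmit timer is re-armed with the shorter RTO.
		 */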
526 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
527 break;
528 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
529 !icsk->icsk_backoff || fastopen)
530 break;
531
532 if (sock_owned_by_user(sk))
533 break;
534
535 skb = tcp_rtx_queue_head(sk);
536 if (WARN_ON_ONCE(!skb))
537 break;
538
539 icsk->icsk_backoff--;
540 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
541 TCP_TIMEOUT_INIT;
542 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
543
544
545 tcp_mstamp_refresh(tp);
546 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
547 remaining = icsk->icsk_rto -
548 usecs_to_jiffies(delta_us);
549
550 if (remaining > 0) {
551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 remaining, TCP_RTO_MAX);
553 } else {
554 /* RTO revert clocked out retransmission.
555 * Will retransmit now */
556 tcp_retransmit_timer(sk);
557 }
558
559 break;
560 case ICMP_TIME_EXCEEDED:
561 err = EHOSTUNREACH;
562 break;
563 default:
564 goto out;
565 }
566
567 switch (sk->sk_state) {
568 case TCP_SYN_SENT:
569 case TCP_SYN_RECV:
570 /* Only in fast or simultaneous open. If a fast open socket is
571 * already accepted it is treated as a connected one below.
572 */
573 if (fastopen && !fastopen->sk)
574 break;
575
576 if (!sock_owned_by_user(sk)) {
577 sk->sk_err = err;
578
579 sk->sk_error_report(sk);
580
581 tcp_done(sk);
582 } else {
583 sk->sk_err_soft = err;
584 }
585 goto out;
586 }
587
588 /* If we've already connected we will keep trying
589 * until we time out, or the user gives up.
590 *
591 * RFC 1122 4.2.3.9 allows us to consider as hard errors
592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 * but it is obsoleted by pmtu discovery).
594 *
595 * Note that in the modern internet, where routing is unreliable
596 * and broken firewalls sit in every dark corner sending random
597 * errors ordered by their masters, even these two messages finally
598 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
599 *
600 * Now we are in compliance with RFCs.
601 * --ANK (980905)
602 */
603
604 inet = inet_sk(sk);
605 if (!sock_owned_by_user(sk) && inet->recverr) {
606 sk->sk_err = err;
607 sk->sk_error_report(sk);
608 } else { /* Only an error on timeout */
609 sk->sk_err_soft = err;
610 }
611
612 out:
613 bh_unlock_sock(sk);
614 sock_put(sk);
615 return 0;
616 }
617
618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 {
620 struct tcphdr *th = tcp_hdr(skb);
621
622 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 skb->csum_start = skb_transport_header(skb) - skb->head;
624 skb->csum_offset = offsetof(struct tcphdr, check);
625 }
626
627 /* This routine computes an IPv4 TCP checksum. */
628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 {
630 const struct inet_sock *inet = inet_sk(sk);
631
632 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 }
634 EXPORT_SYMBOL(tcp_v4_send_check);
635
636 /*
637 * This routine will send an RST to the other tcp.
638 *
639 * Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
640 * for the reset?
641 * Answer: if a packet caused an RST, it was not meant for a socket
642 * existing in our system; if it did match a socket, it is just a
643 * duplicate segment or a bug in the other side's TCP.
644 * So we build the reply based only on the parameters that
645 * arrived with the segment.
646 * Exception: precedence violation. We do not implement it in any case.
647 */
648
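/* Descriptive note (RFC 793 reset generation, as implemented below): if the
 * incoming segment has the ACK bit set, the reset takes its sequence number
 * from the segment's ACK field and carries no ACK; otherwise the reset has
 * the ACK bit set with ack_seq = SEG.SEQ + SEG.LEN (counting SYN and FIN).
 */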
649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 {
651 const struct tcphdr *th = tcp_hdr(skb);
652 struct {
653 struct tcphdr th;
654 #ifdef CONFIG_TCP_MD5SIG
655 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
656 #endif
657 } rep;
658 struct ip_reply_arg arg;
659 #ifdef CONFIG_TCP_MD5SIG
660 struct tcp_md5sig_key *key = NULL;
661 const __u8 *hash_location = NULL;
662 unsigned char newhash[16];
663 int genhash;
664 struct sock *sk1 = NULL;
665 #endif
666 u64 transmit_time = 0;
667 struct sock *ctl_sk;
668 struct net *net;
669
670 /* Never send a reset in response to a reset. */
671 if (th->rst)
672 return;
673
674 /* If sk is not NULL, it means we did a successful lookup and the
675 * incoming route had to be correct. prequeue might have dropped our dst.
676 */
677 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678 return;
679
680 /* Swap the send and the receive. */
681 memset(&rep, 0, sizeof(rep));
682 rep.th.dest = th->source;
683 rep.th.source = th->dest;
684 rep.th.doff = sizeof(struct tcphdr) / 4;
685 rep.th.rst = 1;
686
687 if (th->ack) {
688 rep.th.seq = th->ack_seq;
689 } else {
690 rep.th.ack = 1;
691 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 skb->len - (th->doff << 2));
693 }
694
695 memset(&arg, 0, sizeof(arg));
696 arg.iov[0].iov_base = (unsigned char *)&rep;
697 arg.iov[0].iov_len = sizeof(rep.th);
698
699 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
700 #ifdef CONFIG_TCP_MD5SIG
701 rcu_read_lock();
702 hash_location = tcp_parse_md5sig_option(th);
703 if (sk && sk_fullsock(sk)) {
704 const union tcp_md5_addr *addr;
705
706 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
707 key = tcp_md5_do_lookup(sk, addr, AF_INET);
708 } else if (hash_location) {
709 const union tcp_md5_addr *addr;
710 int sdif = tcp_v4_sdif(skb);
711 int dif = inet_iif(skb);
712
713 /*
714 * The active side is lost. Try to find the listening socket through
715 * the source port, and then find the md5 key through that listening
716 * socket. We are not losing security here:
717 * the incoming packet is checked against the md5 hash of the key we
718 * find, and no RST is generated if the md5 hash doesn't match.
719 */
720 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
721 ip_hdr(skb)->saddr,
722 th->source, ip_hdr(skb)->daddr,
723 ntohs(th->source), dif, sdif);
724 /* don't send rst if it can't find key */
725 if (!sk1)
726 goto out;
727
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk1, addr, AF_INET);
730 if (!key)
731 goto out;
732
733
734 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
735 if (genhash || memcmp(hash_location, newhash, 16) != 0)
736 goto out;
737
738 }
739
740 if (key) {
741 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
742 (TCPOPT_NOP << 16) |
743 (TCPOPT_MD5SIG << 8) |
744 TCPOLEN_MD5SIG);
745 /* Update length and the length the header thinks exists */
746 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
747 rep.th.doff = arg.iov[0].iov_len / 4;
748
749 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
750 key, ip_hdr(skb)->saddr,
751 ip_hdr(skb)->daddr, &rep.th);
752 }
753 #endif
754 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
755 ip_hdr(skb)->saddr, /* XXX */
756 arg.iov[0].iov_len, IPPROTO_TCP, 0);
757 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
758 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
759
760 /* When the socket is gone, all binding information is lost and
761 * routing might fail in this case. No choice here: if we choose to force
762 * the input interface, we will misroute in case of an asymmetric route.
763 */
764 if (sk) {
765 arg.bound_dev_if = sk->sk_bound_dev_if;
766 if (sk_fullsock(sk))
767 trace_tcp_send_reset(sk, skb);
768 }
769
770 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
771 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
772
773 arg.tos = ip_hdr(skb)->tos;
774 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
775 local_bh_disable();
776 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
777 if (sk) {
778 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
779 inet_twsk(sk)->tw_mark : sk->sk_mark;
780 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
781 inet_twsk(sk)->tw_priority : sk->sk_priority;
782 transmit_time = tcp_transmit_time(sk);
783 }
784 ip_send_unicast_reply(ctl_sk,
785 skb, &TCP_SKB_CB(skb)->header.h4.opt,
786 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
787 &arg, arg.iov[0].iov_len,
788 transmit_time);
789
790 ctl_sk->sk_mark = 0;
791 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
792 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
793 local_bh_enable();
794
795 #ifdef CONFIG_TCP_MD5SIG
796 out:
797 rcu_read_unlock();
798 #endif
799 }
800
801 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
802 outside socket context, is certainly ugly. What can I do?
803 */
804
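/* Descriptive note: build and send a bare ACK (optionally carrying the
 * timestamp and MD5 options) on behalf of a socket we do not fully own;
 * used for TIME-WAIT ACKs and for ACKs answering a SYN-RECV request socket
 * (see the two callers below).
 */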
805 static void tcp_v4_send_ack(const struct sock *sk,
806 struct sk_buff *skb, u32 seq, u32 ack,
807 u32 win, u32 tsval, u32 tsecr, int oif,
808 struct tcp_md5sig_key *key,
809 int reply_flags, u8 tos)
810 {
811 const struct tcphdr *th = tcp_hdr(skb);
812 struct {
813 struct tcphdr th;
814 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
815 #ifdef CONFIG_TCP_MD5SIG
816 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
817 #endif
818 ];
819 } rep;
820 struct net *net = sock_net(sk);
821 struct ip_reply_arg arg;
822 struct sock *ctl_sk;
823 u64 transmit_time;
824
825 memset(&rep.th, 0, sizeof(struct tcphdr));
826 memset(&arg, 0, sizeof(arg));
827
828 arg.iov[0].iov_base = (unsigned char *)&rep;
829 arg.iov[0].iov_len = sizeof(rep.th);
830 if (tsecr) {
831 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
832 (TCPOPT_TIMESTAMP << 8) |
833 TCPOLEN_TIMESTAMP);
834 rep.opt[1] = htonl(tsval);
835 rep.opt[2] = htonl(tsecr);
836 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
837 }
838
839 /* Swap the send and the receive. */
840 rep.th.dest = th->source;
841 rep.th.source = th->dest;
842 rep.th.doff = arg.iov[0].iov_len / 4;
843 rep.th.seq = htonl(seq);
844 rep.th.ack_seq = htonl(ack);
845 rep.th.ack = 1;
846 rep.th.window = htons(win);
847
848 #ifdef CONFIG_TCP_MD5SIG
849 if (key) {
850 int offset = (tsecr) ? 3 : 0;
851
852 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
853 (TCPOPT_NOP << 16) |
854 (TCPOPT_MD5SIG << 8) |
855 TCPOLEN_MD5SIG);
856 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
857 rep.th.doff = arg.iov[0].iov_len/4;
858
859 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
860 key, ip_hdr(skb)->saddr,
861 ip_hdr(skb)->daddr, &rep.th);
862 }
863 #endif
864 arg.flags = reply_flags;
865 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
866 ip_hdr(skb)->saddr, /* XXX */
867 arg.iov[0].iov_len, IPPROTO_TCP, 0);
868 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
869 if (oif)
870 arg.bound_dev_if = oif;
871 arg.tos = tos;
872 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
873 local_bh_disable();
874 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
875 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
876 inet_twsk(sk)->tw_mark : sk->sk_mark;
877 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
878 inet_twsk(sk)->tw_priority : sk->sk_priority;
879 transmit_time = tcp_transmit_time(sk);
880 ip_send_unicast_reply(ctl_sk,
881 skb, &TCP_SKB_CB(skb)->header.h4.opt,
882 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
883 &arg, arg.iov[0].iov_len,
884 transmit_time);
885
886 ctl_sk->sk_mark = 0;
887 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
888 local_bh_enable();
889 }
890
891 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
892 {
893 struct inet_timewait_sock *tw = inet_twsk(sk);
894 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
895
896 tcp_v4_send_ack(sk, skb,
897 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
898 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
899 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
900 tcptw->tw_ts_recent,
901 tw->tw_bound_dev_if,
902 tcp_twsk_md5_key(tcptw),
903 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
904 tw->tw_tos
905 );
906
907 inet_twsk_put(tw);
908 }
909
910 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
911 struct request_sock *req)
912 {
913 const union tcp_md5_addr *addr;
914
915 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
916 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
917 */
918 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
919 tcp_sk(sk)->snd_nxt;
920
921 /* RFC 7323 2.3
922 * The window field (SEG.WND) of every outgoing segment, with the
923 * exception of <SYN> segments, MUST be right-shifted by
924 * Rcv.Wind.Shift bits:
925 */
926 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
927 tcp_v4_send_ack(sk, skb, seq,
928 tcp_rsk(req)->rcv_nxt,
929 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
930 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
931 req->ts_recent,
932 0,
933 tcp_md5_do_lookup(sk, addr, AF_INET),
934 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
935 ip_hdr(skb)->tos);
936 }
937
938 /*
939 * Send a SYN-ACK after having received a SYN.
940 * This still operates on a request_sock only, not on a big
941 * socket.
942 */
943 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
944 struct flowi *fl,
945 struct request_sock *req,
946 struct tcp_fastopen_cookie *foc,
947 enum tcp_synack_type synack_type)
948 {
949 const struct inet_request_sock *ireq = inet_rsk(req);
950 struct flowi4 fl4;
951 int err = -1;
952 struct sk_buff *skb;
953
954 /* First, grab a route. */
955 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
956 return -1;
957
958 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
959
960 if (skb) {
961 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
962
963 rcu_read_lock();
964 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
965 ireq->ir_rmt_addr,
966 rcu_dereference(ireq->ireq_opt));
967 rcu_read_unlock();
968 err = net_xmit_eval(err);
969 }
970
971 return err;
972 }
973
974 /*
975 * IPv4 request_sock destructor.
976 */
977 static void tcp_v4_reqsk_destructor(struct request_sock *req)
978 {
979 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
980 }
981
982 #ifdef CONFIG_TCP_MD5SIG
983 /*
984 * RFC2385 MD5 checksumming requires a mapping of
985 * IP address->MD5 Key.
986 * We need to maintain these in the sk structure.
987 */
988
989 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
990 EXPORT_SYMBOL(tcp_md5_needed);
991
992 /* Find the Key structure for an address. */
993 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
994 const union tcp_md5_addr *addr,
995 int family)
996 {
997 const struct tcp_sock *tp = tcp_sk(sk);
998 struct tcp_md5sig_key *key;
999 const struct tcp_md5sig_info *md5sig;
1000 __be32 mask;
1001 struct tcp_md5sig_key *best_match = NULL;
1002 bool match;
1003
1004 /* caller either holds rcu_read_lock() or socket lock */
1005 md5sig = rcu_dereference_check(tp->md5sig_info,
1006 lockdep_sock_is_held(sk));
1007 if (!md5sig)
1008 return NULL;
1009
1010 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1011 if (key->family != family)
1012 continue;
1013
1014 if (family == AF_INET) {
1015 mask = inet_make_mask(key->prefixlen);
1016 match = (key->addr.a4.s_addr & mask) ==
1017 (addr->a4.s_addr & mask);
1018 #if IS_ENABLED(CONFIG_IPV6)
1019 } else if (family == AF_INET6) {
1020 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1021 key->prefixlen);
1022 #endif
1023 } else {
1024 match = false;
1025 }
1026
1027 if (match && (!best_match ||
1028 key->prefixlen > best_match->prefixlen))
1029 best_match = key;
1030 }
1031 return best_match;
1032 }
1033 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1034
1035 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1036 const union tcp_md5_addr *addr,
1037 int family, u8 prefixlen)
1038 {
1039 const struct tcp_sock *tp = tcp_sk(sk);
1040 struct tcp_md5sig_key *key;
1041 unsigned int size = sizeof(struct in_addr);
1042 const struct tcp_md5sig_info *md5sig;
1043
1044 /* caller either holds rcu_read_lock() or socket lock */
1045 md5sig = rcu_dereference_check(tp->md5sig_info,
1046 lockdep_sock_is_held(sk));
1047 if (!md5sig)
1048 return NULL;
1049 #if IS_ENABLED(CONFIG_IPV6)
1050 if (family == AF_INET6)
1051 size = sizeof(struct in6_addr);
1052 #endif
1053 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1054 if (key->family != family)
1055 continue;
1056 if (!memcmp(&key->addr, addr, size) &&
1057 key->prefixlen == prefixlen)
1058 return key;
1059 }
1060 return NULL;
1061 }
1062
1063 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1064 const struct sock *addr_sk)
1065 {
1066 const union tcp_md5_addr *addr;
1067
1068 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1069 return tcp_md5_do_lookup(sk, addr, AF_INET);
1070 }
1071 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1072
1073 /* This can be called on a newly created socket, from other files */
1074 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1075 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1076 gfp_t gfp)
1077 {
1078 /* Add Key to the list */
1079 struct tcp_md5sig_key *key;
1080 struct tcp_sock *tp = tcp_sk(sk);
1081 struct tcp_md5sig_info *md5sig;
1082
1083 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1084 if (key) {
1085 /* Pre-existing entry - just update that one. */
1086 memcpy(key->key, newkey, newkeylen);
1087 key->keylen = newkeylen;
1088 return 0;
1089 }
1090
1091 md5sig = rcu_dereference_protected(tp->md5sig_info,
1092 lockdep_sock_is_held(sk));
1093 if (!md5sig) {
1094 md5sig = kmalloc(sizeof(*md5sig), gfp);
1095 if (!md5sig)
1096 return -ENOMEM;
1097
1098 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1099 INIT_HLIST_HEAD(&md5sig->head);
1100 rcu_assign_pointer(tp->md5sig_info, md5sig);
1101 }
1102
1103 key = sock_kmalloc(sk, sizeof(*key), gfp);
1104 if (!key)
1105 return -ENOMEM;
1106 if (!tcp_alloc_md5sig_pool()) {
1107 sock_kfree_s(sk, key, sizeof(*key));
1108 return -ENOMEM;
1109 }
1110
1111 memcpy(key->key, newkey, newkeylen);
1112 key->keylen = newkeylen;
1113 key->family = family;
1114 key->prefixlen = prefixlen;
1115 memcpy(&key->addr, addr,
1116 (family == AF_INET6) ? sizeof(struct in6_addr) :
1117 sizeof(struct in_addr));
1118 hlist_add_head_rcu(&key->node, &md5sig->head);
1119 return 0;
1120 }
1121 EXPORT_SYMBOL(tcp_md5_do_add);
1122
1123 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1124 u8 prefixlen)
1125 {
1126 struct tcp_md5sig_key *key;
1127
1128 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1129 if (!key)
1130 return -ENOENT;
1131 hlist_del_rcu(&key->node);
1132 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1133 kfree_rcu(key, rcu);
1134 return 0;
1135 }
1136 EXPORT_SYMBOL(tcp_md5_do_del);
1137
1138 static void tcp_clear_md5_list(struct sock *sk)
1139 {
1140 struct tcp_sock *tp = tcp_sk(sk);
1141 struct tcp_md5sig_key *key;
1142 struct hlist_node *n;
1143 struct tcp_md5sig_info *md5sig;
1144
1145 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1146
1147 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1148 hlist_del_rcu(&key->node);
1149 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1150 kfree_rcu(key, rcu);
1151 }
1152 }
1153
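/* Illustrative userspace sketch (not part of this file; assumes the uapi
 * definitions from <linux/tcp.h> and that fd is a TCP socket): installing
 * an MD5 key for a peer with the TCP_MD5SIG socket option, which is parsed
 * by the function below:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */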
1154 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1155 char __user *optval, int optlen)
1156 {
1157 struct tcp_md5sig cmd;
1158 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1159 const union tcp_md5_addr *addr;
1160 u8 prefixlen = 32;
1161
1162 if (optlen < sizeof(cmd))
1163 return -EINVAL;
1164
1165 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1166 return -EFAULT;
1167
1168 if (sin->sin_family != AF_INET)
1169 return -EINVAL;
1170
1171 if (optname == TCP_MD5SIG_EXT &&
1172 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1173 prefixlen = cmd.tcpm_prefixlen;
1174 if (prefixlen > 32)
1175 return -EINVAL;
1176 }
1177
1178 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1179
1180 if (!cmd.tcpm_keylen)
1181 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen);
1182
1183 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1184 return -EINVAL;
1185
1186 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen,
1187 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1188 }
1189
1190 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1191 __be32 daddr, __be32 saddr,
1192 const struct tcphdr *th, int nbytes)
1193 {
1194 struct tcp4_pseudohdr *bp;
1195 struct scatterlist sg;
1196 struct tcphdr *_th;
1197
1198 bp = hp->scratch;
1199 bp->saddr = saddr;
1200 bp->daddr = daddr;
1201 bp->pad = 0;
1202 bp->protocol = IPPROTO_TCP;
1203 bp->len = cpu_to_be16(nbytes);
1204
1205 _th = (struct tcphdr *)(bp + 1);
1206 memcpy(_th, th, sizeof(*th));
1207 _th->check = 0;
1208
1209 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1210 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1211 sizeof(*bp) + sizeof(*th));
1212 return crypto_ahash_update(hp->md5_req);
1213 }
1214
1215 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1216 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1217 {
1218 struct tcp_md5sig_pool *hp;
1219 struct ahash_request *req;
1220
1221 hp = tcp_get_md5sig_pool();
1222 if (!hp)
1223 goto clear_hash_noput;
1224 req = hp->md5_req;
1225
1226 if (crypto_ahash_init(req))
1227 goto clear_hash;
1228 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1229 goto clear_hash;
1230 if (tcp_md5_hash_key(hp, key))
1231 goto clear_hash;
1232 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1233 if (crypto_ahash_final(req))
1234 goto clear_hash;
1235
1236 tcp_put_md5sig_pool();
1237 return 0;
1238
1239 clear_hash:
1240 tcp_put_md5sig_pool();
1241 clear_hash_noput:
1242 memset(md5_hash, 0, 16);
1243 return 1;
1244 }
1245
1246 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1247 const struct sock *sk,
1248 const struct sk_buff *skb)
1249 {
1250 struct tcp_md5sig_pool *hp;
1251 struct ahash_request *req;
1252 const struct tcphdr *th = tcp_hdr(skb);
1253 __be32 saddr, daddr;
1254
1255 if (sk) { /* valid for establish/request sockets */
1256 saddr = sk->sk_rcv_saddr;
1257 daddr = sk->sk_daddr;
1258 } else {
1259 const struct iphdr *iph = ip_hdr(skb);
1260 saddr = iph->saddr;
1261 daddr = iph->daddr;
1262 }
1263
1264 hp = tcp_get_md5sig_pool();
1265 if (!hp)
1266 goto clear_hash_noput;
1267 req = hp->md5_req;
1268
1269 if (crypto_ahash_init(req))
1270 goto clear_hash;
1271
1272 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1273 goto clear_hash;
1274 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1275 goto clear_hash;
1276 if (tcp_md5_hash_key(hp, key))
1277 goto clear_hash;
1278 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1279 if (crypto_ahash_final(req))
1280 goto clear_hash;
1281
1282 tcp_put_md5sig_pool();
1283 return 0;
1284
1285 clear_hash:
1286 tcp_put_md5sig_pool();
1287 clear_hash_noput:
1288 memset(md5_hash, 0, 16);
1289 return 1;
1290 }
1291 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1292
1293 #endif
1294
1295 /* Called with rcu_read_lock() */
1296 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1297 const struct sk_buff *skb,
1298 int dif, int sdif)
1299 {
1300 #ifdef CONFIG_TCP_MD5SIG
1301 /*
1302 * This gets called for each TCP segment that arrives
1303 * so we want to be efficient.
1304 * We have 3 drop cases:
1305 * o No MD5 hash and one expected.
1306 * o MD5 hash and we're not expecting one.
1307 * o MD5 hash and it's wrong.
1308 */
1309 const __u8 *hash_location = NULL;
1310 struct tcp_md5sig_key *hash_expected;
1311 const struct iphdr *iph = ip_hdr(skb);
1312 const struct tcphdr *th = tcp_hdr(skb);
1313 const union tcp_md5_addr *addr;
1314 int genhash;
1315 unsigned char newhash[16];
1316
1317 addr = (union tcp_md5_addr *)&iph->saddr;
1318 hash_expected = tcp_md5_do_lookup(sk, addr, AF_INET);
1319 hash_location = tcp_parse_md5sig_option(th);
1320
1321 /* We've parsed the options - do we have a hash? */
1322 if (!hash_expected && !hash_location)
1323 return false;
1324
1325 if (hash_expected && !hash_location) {
1326 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1327 return true;
1328 }
1329
1330 if (!hash_expected && hash_location) {
1331 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1332 return true;
1333 }
1334
1335 /* Okay, so this is hash_expected and hash_location -
1336 * so we need to calculate the checksum.
1337 */
1338 genhash = tcp_v4_md5_hash_skb(newhash,
1339 hash_expected,
1340 NULL, skb);
1341
1342 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1343 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1344 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1345 &iph->saddr, ntohs(th->source),
1346 &iph->daddr, ntohs(th->dest),
1347 genhash ? " tcp_v4_calc_md5_hash failed"
1348 : "");
1349 return true;
1350 }
1351 return false;
1352 #endif
1353 return false;
1354 }
1355
1356 static void tcp_v4_init_req(struct request_sock *req,
1357 const struct sock *sk_listener,
1358 struct sk_buff *skb)
1359 {
1360 struct inet_request_sock *ireq = inet_rsk(req);
1361 struct net *net = sock_net(sk_listener);
1362
1363 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1364 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1365 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1366 }
1367
1368 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1369 struct flowi *fl,
1370 const struct request_sock *req)
1371 {
1372 return inet_csk_route_req(sk, &fl->u.ip4, req);
1373 }
1374
1375 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1376 .family = PF_INET,
1377 .obj_size = sizeof(struct tcp_request_sock),
1378 .rtx_syn_ack = tcp_rtx_synack,
1379 .send_ack = tcp_v4_reqsk_send_ack,
1380 .destructor = tcp_v4_reqsk_destructor,
1381 .send_reset = tcp_v4_send_reset,
1382 .syn_ack_timeout = tcp_syn_ack_timeout,
1383 };
1384
1385 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1386 .mss_clamp = TCP_MSS_DEFAULT,
1387 #ifdef CONFIG_TCP_MD5SIG
1388 .req_md5_lookup = tcp_v4_md5_lookup,
1389 .calc_md5_hash = tcp_v4_md5_hash_skb,
1390 #endif
1391 .init_req = tcp_v4_init_req,
1392 #ifdef CONFIG_SYN_COOKIES
1393 .cookie_init_seq = cookie_v4_init_sequence,
1394 #endif
1395 .route_req = tcp_v4_route_req,
1396 .init_seq = tcp_v4_init_seq,
1397 .init_ts_off = tcp_v4_init_ts_off,
1398 .send_synack = tcp_v4_send_synack,
1399 };
1400
1401 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1402 {
1403 /* Never answer SYNs sent to broadcast or multicast */
1404 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1405 goto drop;
1406
1407 return tcp_conn_request(&tcp_request_sock_ops,
1408 &tcp_request_sock_ipv4_ops, sk, skb);
1409
1410 drop:
1411 tcp_listendrop(sk);
1412 return 0;
1413 }
1414 EXPORT_SYMBOL(tcp_v4_conn_request);
1415
1416
1417 /*
1418 * The three way handshake has completed - we got a valid ACK -
1419 * now create the new socket.
1420 */
1421 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1422 struct request_sock *req,
1423 struct dst_entry *dst,
1424 struct request_sock *req_unhash,
1425 bool *own_req)
1426 {
1427 struct inet_request_sock *ireq;
1428 struct inet_sock *newinet;
1429 struct tcp_sock *newtp;
1430 struct sock *newsk;
1431 #ifdef CONFIG_TCP_MD5SIG
1432 const union tcp_md5_addr *addr;
1433 struct tcp_md5sig_key *key;
1434 #endif
1435 struct ip_options_rcu *inet_opt;
1436
1437 if (sk_acceptq_is_full(sk))
1438 goto exit_overflow;
1439
1440 newsk = tcp_create_openreq_child(sk, req, skb);
1441 if (!newsk)
1442 goto exit_nonewsk;
1443
1444 newsk->sk_gso_type = SKB_GSO_TCPV4;
1445 inet_sk_rx_dst_set(newsk, skb);
1446
1447 newtp = tcp_sk(newsk);
1448 newinet = inet_sk(newsk);
1449 ireq = inet_rsk(req);
1450 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1451 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1452 newsk->sk_bound_dev_if = ireq->ir_iif;
1453 newinet->inet_saddr = ireq->ir_loc_addr;
1454 inet_opt = rcu_dereference(ireq->ireq_opt);
1455 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1456 newinet->mc_index = inet_iif(skb);
1457 newinet->mc_ttl = ip_hdr(skb)->ttl;
1458 newinet->rcv_tos = ip_hdr(skb)->tos;
1459 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1460 if (inet_opt)
1461 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1462 newinet->inet_id = prandom_u32();
1463
1464 if (!dst) {
1465 dst = inet_csk_route_child_sock(sk, newsk, req);
1466 if (!dst)
1467 goto put_and_exit;
1468 } else {
1469 /* syncookie case : see end of cookie_v4_check() */
1470 }
1471 sk_setup_caps(newsk, dst);
1472
1473 tcp_ca_openreq_child(newsk, dst);
1474
1475 tcp_sync_mss(newsk, dst_mtu(dst));
1476 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1477
1478 tcp_initialize_rcv_mss(newsk);
1479
1480 #ifdef CONFIG_TCP_MD5SIG
1481 /* Copy over the MD5 key from the original socket */
1482 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1483 key = tcp_md5_do_lookup(sk, addr, AF_INET);
1484 if (key) {
1485 /*
1486 * We're using one, so create a matching key
1487 * on the newsk structure. If we fail to get
1488 * memory, then we end up not copying the key
1489 * across. Shucks.
1490 */
1491 tcp_md5_do_add(newsk, addr, AF_INET, 32,
1492 key->key, key->keylen, GFP_ATOMIC);
1493 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1494 }
1495 #endif
1496
1497 if (__inet_inherit_port(sk, newsk) < 0)
1498 goto put_and_exit;
1499 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1500 if (likely(*own_req)) {
1501 tcp_move_syn(newtp, req);
1502 ireq->ireq_opt = NULL;
1503 } else {
1504 newinet->inet_opt = NULL;
1505 }
1506 return newsk;
1507
1508 exit_overflow:
1509 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1510 exit_nonewsk:
1511 dst_release(dst);
1512 exit:
1513 tcp_listendrop(sk);
1514 return NULL;
1515 put_and_exit:
1516 newinet->inet_opt = NULL;
1517 inet_csk_prepare_forced_close(newsk);
1518 tcp_done(newsk);
1519 goto exit;
1520 }
1521 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1522
1523 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1524 {
1525 #ifdef CONFIG_SYN_COOKIES
1526 const struct tcphdr *th = tcp_hdr(skb);
1527
1528 if (!th->syn)
1529 sk = cookie_v4_check(sk, skb);
1530 #endif
1531 return sk;
1532 }
1533
1534 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1535 struct tcphdr *th, u32 *cookie)
1536 {
1537 u16 mss = 0;
1538 #ifdef CONFIG_SYN_COOKIES
1539 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1540 &tcp_request_sock_ipv4_ops, sk, th);
1541 if (mss) {
1542 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1543 tcp_synq_overflow(sk);
1544 }
1545 #endif
1546 return mss;
1547 }
1548
1549 /* The socket must have its spinlock held when we get
1550 * here, unless it is a TCP_LISTEN socket.
1551 *
1552 * We have a potential double-lock case here, so even when
1553 * doing backlog processing we use the BH locking scheme.
1554 * This is because we cannot sleep with the original spinlock
1555 * held.
1556 */
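/* Descriptive note: for non-listening sockets, tcp_v4_rcv() calls this
 * directly under bh_lock_sock() when the socket is not owned by the user;
 * otherwise the skb is queued to the backlog and this function runs later
 * from release_sock() as the socket's backlog receive handler.
 */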
1557 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1558 {
1559 struct sock *rsk;
1560
1561 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1562 struct dst_entry *dst = sk->sk_rx_dst;
1563
1564 sock_rps_save_rxhash(sk, skb);
1565 sk_mark_napi_id(sk, skb);
1566 if (dst) {
1567 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1568 !dst->ops->check(dst, 0)) {
1569 dst_release(dst);
1570 sk->sk_rx_dst = NULL;
1571 }
1572 }
1573 tcp_rcv_established(sk, skb);
1574 return 0;
1575 }
1576
1577 if (tcp_checksum_complete(skb))
1578 goto csum_err;
1579
1580 if (sk->sk_state == TCP_LISTEN) {
1581 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1582
1583 if (!nsk)
1584 goto discard;
1585 if (nsk != sk) {
1586 if (tcp_child_process(sk, nsk, skb)) {
1587 rsk = nsk;
1588 goto reset;
1589 }
1590 return 0;
1591 }
1592 } else
1593 sock_rps_save_rxhash(sk, skb);
1594
1595 if (tcp_rcv_state_process(sk, skb)) {
1596 rsk = sk;
1597 goto reset;
1598 }
1599 return 0;
1600
1601 reset:
1602 tcp_v4_send_reset(rsk, skb);
1603 discard:
1604 kfree_skb(skb);
1605 /* Be careful here. If this function gets more complicated and
1606 * gcc suffers from register pressure on the x86, sk (in %ebx)
1607 * might be destroyed here. This current version compiles correctly,
1608 * but you have been warned.
1609 */
1610 return 0;
1611
1612 csum_err:
1613 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1614 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1615 goto discard;
1616 }
1617 EXPORT_SYMBOL(tcp_v4_do_rcv);
1618
1619 int tcp_v4_early_demux(struct sk_buff *skb)
1620 {
1621 const struct iphdr *iph;
1622 const struct tcphdr *th;
1623 struct sock *sk;
1624
1625 if (skb->pkt_type != PACKET_HOST)
1626 return 0;
1627
1628 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1629 return 0;
1630
1631 iph = ip_hdr(skb);
1632 th = tcp_hdr(skb);
1633
1634 if (th->doff < sizeof(struct tcphdr) / 4)
1635 return 0;
1636
1637 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1638 iph->saddr, th->source,
1639 iph->daddr, ntohs(th->dest),
1640 skb->skb_iif, inet_sdif(skb));
1641 if (sk) {
1642 skb->sk = sk;
1643 skb->destructor = sock_edemux;
1644 if (sk_fullsock(sk)) {
1645 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1646
1647 if (dst)
1648 dst = dst_check(dst, 0);
1649 if (dst &&
1650 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1651 skb_dst_set_noref(skb, dst);
1652 }
1653 }
1654 return 0;
1655 }
1656
1657 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1658 {
1659 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1660 struct skb_shared_info *shinfo;
1661 const struct tcphdr *th;
1662 struct tcphdr *thtail;
1663 struct sk_buff *tail;
1664 unsigned int hdrlen;
1665 bool fragstolen;
1666 u32 gso_segs;
1667 int delta;
1668
1669 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1670 * we can fix skb->truesize to its real value to avoid future drops.
1671 * This is valid because skb is not yet charged to the socket.
1672 * It has been noticed that pure SACK packets were sometimes dropped
1673 * (if cooked by drivers without the copybreak feature).
1674 */
1675 skb_condense(skb);
1676
1677 skb_dst_drop(skb);
1678
1679 if (unlikely(tcp_checksum_complete(skb))) {
1680 bh_unlock_sock(sk);
1681 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1682 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1683 return true;
1684 }
1685
1686 /* Attempt coalescing to last skb in backlog, even if we are
1687 * above the limits.
1688 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1689 */
1690 th = (const struct tcphdr *)skb->data;
1691 hdrlen = th->doff * 4;
1692 shinfo = skb_shinfo(skb);
1693
1694 if (!shinfo->gso_size)
1695 shinfo->gso_size = skb->len - hdrlen;
1696
1697 if (!shinfo->gso_segs)
1698 shinfo->gso_segs = 1;
1699
1700 tail = sk->sk_backlog.tail;
1701 if (!tail)
1702 goto no_coalesce;
1703 thtail = (struct tcphdr *)tail->data;
1704
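	/* Descriptive note: only coalesce when the new segment directly
	 * follows the tail in sequence space, carries the same IP DS field,
	 * has no SYN/RST/URG on either skb, has ACK set on both, agrees on
	 * ECE/CWR and (with TLS_DEVICE) on the decrypted state, and has an
	 * identical TCP header length and identical TCP options.
	 */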
1705 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1706 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1707 ((TCP_SKB_CB(tail)->tcp_flags |
1708 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1709 !((TCP_SKB_CB(tail)->tcp_flags &
1710 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1711 ((TCP_SKB_CB(tail)->tcp_flags ^
1712 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1713 #ifdef CONFIG_TLS_DEVICE
1714 tail->decrypted != skb->decrypted ||
1715 #endif
1716 thtail->doff != th->doff ||
1717 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1718 goto no_coalesce;
1719
1720 __skb_pull(skb, hdrlen);
1721 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1722 thtail->window = th->window;
1723
1724 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1725
1726 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1727 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1728
1729 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1730 * thtail->fin, so that the fast path in tcp_rcv_established()
1731 * is not entered if we append a packet with a FIN.
1732 * SYN, RST, URG are not present.
1733 * ACK is set on both packets.
1734 * PSH: we do not really care in the TCP stack,
1735 * at least for 'GRO' packets.
1736 */
1737 thtail->fin |= th->fin;
1738 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1739
1740 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1741 TCP_SKB_CB(tail)->has_rxtstamp = true;
1742 tail->tstamp = skb->tstamp;
1743 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1744 }
1745
1746 /* Not as strict as GRO. We only need to carry mss max value */
1747 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1748 skb_shinfo(tail)->gso_size);
1749
1750 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1751 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1752
1753 sk->sk_backlog.len += delta;
1754 __NET_INC_STATS(sock_net(sk),
1755 LINUX_MIB_TCPBACKLOGCOALESCE);
1756 kfree_skb_partial(skb, fragstolen);
1757 return false;
1758 }
1759 __skb_push(skb, hdrlen);
1760
1761 no_coalesce:
1762 /* Only the socket owner can try to collapse/prune rx queues
1763 * to reduce memory overhead, so add a little headroom here.
1764 * Only a few socket backlogs are likely to be non-empty concurrently.
1765 */
1766 limit += 64*1024;
1767
1768 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1769 bh_unlock_sock(sk);
1770 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1771 return true;
1772 }
1773 return false;
1774 }
1775 EXPORT_SYMBOL(tcp_add_backlog);
1776
1777 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1778 {
1779 struct tcphdr *th = (struct tcphdr *)skb->data;
1780
1781 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1782 }
1783 EXPORT_SYMBOL(tcp_filter);
1784
1785 static void tcp_v4_restore_cb(struct sk_buff *skb)
1786 {
1787 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1788 sizeof(struct inet_skb_parm));
1789 }
1790
1791 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1792 const struct tcphdr *th)
1793 {
1794 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1795 * barrier() makes sure the compiler won't play fool^Waliasing games.
1796 */
1797 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1798 sizeof(struct inet_skb_parm));
1799 barrier();
1800
1801 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1802 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1803 skb->len - th->doff * 4);
1804 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1805 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1806 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1807 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1808 TCP_SKB_CB(skb)->sacked = 0;
1809 TCP_SKB_CB(skb)->has_rxtstamp =
1810 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1811 }
1812
1813 /*
1814 * From tcp_input.c
1815 */
1816
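/* Descriptive note: main IPv4 TCP receive routine. In outline it validates
 * the header and checksum, looks the segment up in the hash tables, handles
 * TIME-WAIT and NEW_SYN_RECV pseudo-sockets, runs the MD5 and socket filter
 * checks, and then either processes the segment directly via
 * tcp_v4_do_rcv() or queues it to the owner's backlog.
 */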
1817 int tcp_v4_rcv(struct sk_buff *skb)
1818 {
1819 struct net *net = dev_net(skb->dev);
1820 struct sk_buff *skb_to_free;
1821 int sdif = inet_sdif(skb);
1822 int dif = inet_iif(skb);
1823 const struct iphdr *iph;
1824 const struct tcphdr *th;
1825 bool refcounted;
1826 struct sock *sk;
1827 int ret;
1828
1829 if (skb->pkt_type != PACKET_HOST)
1830 goto discard_it;
1831
1832 /* Count it even if it's bad */
1833 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1834
1835 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1836 goto discard_it;
1837
1838 th = (const struct tcphdr *)skb->data;
1839
1840 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1841 goto bad_packet;
1842 if (!pskb_may_pull(skb, th->doff * 4))
1843 goto discard_it;
1844
1845 /* An explanation is required here, I think.
1846 * Packet length and doff are validated by header prediction,
1847 * provided the case of th->doff == 0 is eliminated.
1848 * So, we defer the checks. */
1849
1850 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1851 goto csum_error;
1852
1853 th = (const struct tcphdr *)skb->data;
1854 iph = ip_hdr(skb);
1855 lookup:
1856 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1857 th->dest, sdif, &refcounted);
1858 if (!sk)
1859 goto no_tcp_socket;
1860
1861 process:
1862 if (sk->sk_state == TCP_TIME_WAIT)
1863 goto do_time_wait;
1864
1865 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1866 struct request_sock *req = inet_reqsk(sk);
1867 bool req_stolen = false;
1868 struct sock *nsk;
1869
1870 sk = req->rsk_listener;
1871 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1872 sk_drops_add(sk, skb);
1873 reqsk_put(req);
1874 goto discard_it;
1875 }
1876 if (tcp_checksum_complete(skb)) {
1877 reqsk_put(req);
1878 goto csum_error;
1879 }
1880 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1881 inet_csk_reqsk_queue_drop_and_put(sk, req);
1882 goto lookup;
1883 }
1884 /* We own a reference on the listener, increase it again
1885 * as we might lose it too soon.
1886 */
1887 sock_hold(sk);
1888 refcounted = true;
1889 nsk = NULL;
1890 if (!tcp_filter(sk, skb)) {
1891 th = (const struct tcphdr *)skb->data;
1892 iph = ip_hdr(skb);
1893 tcp_v4_fill_cb(skb, iph, th);
1894 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1895 }
1896 if (!nsk) {
1897 reqsk_put(req);
1898 if (req_stolen) {
1899 /* Another cpu got exclusive access to req
1900 * and created a full blown socket.
1901 * Try to feed this packet to this socket
1902 * instead of discarding it.
1903 */
1904 tcp_v4_restore_cb(skb);
1905 sock_put(sk);
1906 goto lookup;
1907 }
1908 goto discard_and_relse;
1909 }
1910 if (nsk == sk) {
1911 reqsk_put(req);
1912 tcp_v4_restore_cb(skb);
1913 } else if (tcp_child_process(sk, nsk, skb)) {
1914 tcp_v4_send_reset(nsk, skb);
1915 goto discard_and_relse;
1916 } else {
1917 sock_put(sk);
1918 return 0;
1919 }
1920 }
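	/* Only the nsk == sk case falls through to here: the packet is then
	 * processed against the listener itself, exactly as if it had been
	 * looked up directly.
	 */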
1921 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1922 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1923 goto discard_and_relse;
1924 }
1925
1926 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1927 goto discard_and_relse;
1928
1929 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1930 goto discard_and_relse;
1931
1932 nf_reset_ct(skb);
1933
1934 if (tcp_filter(sk, skb))
1935 goto discard_and_relse;
1936 th = (const struct tcphdr *)skb->data;
1937 iph = ip_hdr(skb);
1938 tcp_v4_fill_cb(skb, iph, th);
1939
1940 skb->dev = NULL;
1941
1942 if (sk->sk_state == TCP_LISTEN) {
1943 ret = tcp_v4_do_rcv(sk, skb);
1944 goto put_and_return;
1945 }
1946
1947 sk_incoming_cpu_update(sk);
1948
1949 bh_lock_sock_nested(sk);
1950 tcp_segs_in(tcp_sk(sk), skb);
1951 ret = 0;
1952 if (!sock_owned_by_user(sk)) {
1953 skb_to_free = sk->sk_rx_skb_cache;
1954 sk->sk_rx_skb_cache = NULL;
1955 ret = tcp_v4_do_rcv(sk, skb);
1956 } else {
1957 if (tcp_add_backlog(sk, skb))
1958 goto discard_and_relse;
1959 skb_to_free = NULL;
1960 }
1961 bh_unlock_sock(sk);
1962 if (skb_to_free)
1963 __kfree_skb(skb_to_free);
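	/* If the socket was owned by a user-space caller, the segment is now
	 * sitting on the backlog and will be processed from release_sock();
	 * otherwise it was handled directly above. The cached rx skb was
	 * detached under the socket lock and freed only after bh_unlock_sock().
	 */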
1964
1965 put_and_return:
1966 if (refcounted)
1967 sock_put(sk);
1968
1969 return ret;
1970
1971 no_tcp_socket:
1972 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1973 goto discard_it;
1974
1975 tcp_v4_fill_cb(skb, iph, th);
1976
1977 if (tcp_checksum_complete(skb)) {
1978 csum_error:
1979 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1980 bad_packet:
1981 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1982 } else {
1983 tcp_v4_send_reset(NULL, skb);
1984 }
1985
1986 discard_it:
1987 /* Discard frame. */
1988 kfree_skb(skb);
1989 return 0;
1990
1991 discard_and_relse:
1992 sk_drops_add(sk, skb);
1993 if (refcounted)
1994 sock_put(sk);
1995 goto discard_it;
1996
1997 do_time_wait:
1998 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1999 inet_twsk_put(inet_twsk(sk));
2000 goto discard_it;
2001 }
2002
2003 tcp_v4_fill_cb(skb, iph, th);
2004
2005 if (tcp_checksum_complete(skb)) {
2006 inet_twsk_put(inet_twsk(sk));
2007 goto csum_error;
2008 }
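	/* tcp_timewait_state_process() decides how to dispose of a segment that
	 * hit a TIME-WAIT socket: hand a new SYN to a listener (TCP_TW_SYN),
	 * answer with an ACK (TCP_TW_ACK), reply with a reset (TCP_TW_RST),
	 * or silently drop it (TCP_TW_SUCCESS).
	 */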
2009 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2010 case TCP_TW_SYN: {
2011 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2012 &tcp_hashinfo, skb,
2013 __tcp_hdrlen(th),
2014 iph->saddr, th->source,
2015 iph->daddr, th->dest,
2016 inet_iif(skb),
2017 sdif);
2018 if (sk2) {
2019 inet_twsk_deschedule_put(inet_twsk(sk));
2020 sk = sk2;
2021 tcp_v4_restore_cb(skb);
2022 refcounted = false;
2023 goto process;
2024 }
2025 }
2026 /* to ACK */
2027 /* fall through */
2028 case TCP_TW_ACK:
2029 tcp_v4_timewait_ack(sk, skb);
2030 break;
2031 case TCP_TW_RST:
2032 tcp_v4_send_reset(sk, skb);
2033 inet_twsk_deschedule_put(inet_twsk(sk));
2034 goto discard_it;
2035 case TCP_TW_SUCCESS:;
2036 }
2037 goto discard_it;
2038 }
2039
2040 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2041 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2042 .twsk_unique = tcp_twsk_unique,
2043 .twsk_destructor = tcp_twsk_destructor,
2044 };
2045
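/* Cache the input route on the socket; the early-demux path can then attach
 * it to subsequent packets of this flow (arriving on rx_dst_ifindex) without
 * a fresh routing lookup.
 */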
2046 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2047 {
2048 struct dst_entry *dst = skb_dst(skb);
2049
2050 if (dst && dst_hold_safe(dst)) {
2051 sk->sk_rx_dst = dst;
2052 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2053 }
2054 }
2055 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2056
2057 const struct inet_connection_sock_af_ops ipv4_specific = {
2058 .queue_xmit = ip_queue_xmit,
2059 .send_check = tcp_v4_send_check,
2060 .rebuild_header = inet_sk_rebuild_header,
2061 .sk_rx_dst_set = inet_sk_rx_dst_set,
2062 .conn_request = tcp_v4_conn_request,
2063 .syn_recv_sock = tcp_v4_syn_recv_sock,
2064 .net_header_len = sizeof(struct iphdr),
2065 .setsockopt = ip_setsockopt,
2066 .getsockopt = ip_getsockopt,
2067 .addr2sockaddr = inet_csk_addr2sockaddr,
2068 .sockaddr_len = sizeof(struct sockaddr_in),
2069 #ifdef CONFIG_COMPAT
2070 .compat_setsockopt = compat_ip_setsockopt,
2071 .compat_getsockopt = compat_ip_getsockopt,
2072 #endif
2073 .mtu_reduced = tcp_v4_mtu_reduced,
2074 };
2075 EXPORT_SYMBOL(ipv4_specific);
2076
2077 #ifdef CONFIG_TCP_MD5SIG
2078 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2079 .md5_lookup = tcp_v4_md5_lookup,
2080 .calc_md5_hash = tcp_v4_md5_hash_skb,
2081 .md5_parse = tcp_v4_parse_md5_keys,
2082 };
2083 #endif
2084
2085 /* NOTE: many fields are already zeroed by the call to
2086 * sk_alloc(), so they need not be initialized here.
2087 */
2088 static int tcp_v4_init_sock(struct sock *sk)
2089 {
2090 struct inet_connection_sock *icsk = inet_csk(sk);
2091
2092 tcp_init_sock(sk);
2093
2094 icsk->icsk_af_ops = &ipv4_specific;
2095
2096 #ifdef CONFIG_TCP_MD5SIG
2097 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2098 #endif
2099
2100 return 0;
2101 }
2102
2103 void tcp_v4_destroy_sock(struct sock *sk)
2104 {
2105 struct tcp_sock *tp = tcp_sk(sk);
2106
2107 trace_tcp_destroy_sock(sk);
2108
2109 tcp_clear_xmit_timers(sk);
2110
2111 tcp_cleanup_congestion_control(sk);
2112
2113 tcp_cleanup_ulp(sk);
2114
2115 /* Clean up the write buffer. */
2116 tcp_write_queue_purge(sk);
2117
2118 /* Check if we want to disable active TFO */
2119 tcp_fastopen_active_disable_ofo_check(sk);
2120
2121 /* Cleans up our, hopefully empty, out_of_order_queue. */
2122 skb_rbtree_purge(&tp->out_of_order_queue);
2123
2124 #ifdef CONFIG_TCP_MD5SIG
2125 /* Clean up the MD5 key list, if any */
2126 if (tp->md5sig_info) {
2127 tcp_clear_md5_list(sk);
2128 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2129 tp->md5sig_info = NULL;
2130 }
2131 #endif
2132
2133 /* Clean up a referenced TCP bind bucket. */
2134 if (inet_csk(sk)->icsk_bind_hash)
2135 inet_put_port(sk);
2136
2137 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2138
2139 /* If the socket was aborted during the connect operation */
2140 tcp_free_fastopen_req(tp);
2141 tcp_fastopen_destroy_cipher(sk);
2142 tcp_saved_syn_free(tp);
2143
2144 sk_sockets_allocated_dec(sk);
2145 }
2146 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2147
2148 #ifdef CONFIG_PROC_FS
2149 /* Proc filesystem TCP sock list dumping. */
2150
2151 /*
2152 * Get the next listening socket following cur. If cur is NULL, get the
2153 * first socket, starting from the bucket given in st->bucket; when
2154 * st->bucket is zero the very first socket in the hash table is returned.
2155 */
2156 static void *listening_get_next(struct seq_file *seq, void *cur)
2157 {
2158 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2159 struct tcp_iter_state *st = seq->private;
2160 struct net *net = seq_file_net(seq);
2161 struct inet_listen_hashbucket *ilb;
2162 struct hlist_nulls_node *node;
2163 struct sock *sk = cur;
2164
2165 if (!sk) {
2166 get_head:
2167 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2168 spin_lock(&ilb->lock);
2169 sk = sk_nulls_head(&ilb->nulls_head);
2170 st->offset = 0;
2171 goto get_sk;
2172 }
2173 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2174 ++st->num;
2175 ++st->offset;
2176
2177 sk = sk_nulls_next(sk);
2178 get_sk:
2179 sk_nulls_for_each_from(sk, node) {
2180 if (!net_eq(sock_net(sk), net))
2181 continue;
2182 if (sk->sk_family == afinfo->family)
2183 return sk;
2184 }
2185 spin_unlock(&ilb->lock);
2186 st->offset = 0;
2187 if (++st->bucket < INET_LHTABLE_SIZE)
2188 goto get_head;
2189 return NULL;
2190 }
2191
2192 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2193 {
2194 struct tcp_iter_state *st = seq->private;
2195 void *rc;
2196
2197 st->bucket = 0;
2198 st->offset = 0;
2199 rc = listening_get_next(seq, NULL);
2200
2201 while (rc && *pos) {
2202 rc = listening_get_next(seq, rc);
2203 --*pos;
2204 }
2205 return rc;
2206 }
2207
2208 static inline bool empty_bucket(const struct tcp_iter_state *st)
2209 {
2210 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2211 }
2212
2213 /*
2214 * Get the first established socket, starting from the bucket given in st->bucket.
2215 * If st->bucket is zero, the very first socket in the hash is returned.
2216 */
2217 static void *established_get_first(struct seq_file *seq)
2218 {
2219 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2220 struct tcp_iter_state *st = seq->private;
2221 struct net *net = seq_file_net(seq);
2222 void *rc = NULL;
2223
2224 st->offset = 0;
2225 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2226 struct sock *sk;
2227 struct hlist_nulls_node *node;
2228 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2229
2230 /* Lockless fast path for the common case of empty buckets */
2231 if (empty_bucket(st))
2232 continue;
2233
2234 spin_lock_bh(lock);
2235 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2236 if (sk->sk_family != afinfo->family ||
2237 !net_eq(sock_net(sk), net)) {
2238 continue;
2239 }
2240 rc = sk;
2241 goto out;
2242 }
2243 spin_unlock_bh(lock);
2244 }
2245 out:
2246 return rc;
2247 }
2248
2249 static void *established_get_next(struct seq_file *seq, void *cur)
2250 {
2251 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2252 struct sock *sk = cur;
2253 struct hlist_nulls_node *node;
2254 struct tcp_iter_state *st = seq->private;
2255 struct net *net = seq_file_net(seq);
2256
2257 ++st->num;
2258 ++st->offset;
2259
2260 sk = sk_nulls_next(sk);
2261
2262 sk_nulls_for_each_from(sk, node) {
2263 if (sk->sk_family == afinfo->family &&
2264 net_eq(sock_net(sk), net))
2265 return sk;
2266 }
2267
2268 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2269 ++st->bucket;
2270 return established_get_first(seq);
2271 }
2272
2273 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2274 {
2275 struct tcp_iter_state *st = seq->private;
2276 void *rc;
2277
2278 st->bucket = 0;
2279 rc = established_get_first(seq);
2280
2281 while (rc && pos) {
2282 rc = established_get_next(seq, rc);
2283 --pos;
2284 }
2285 return rc;
2286 }
2287
2288 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2289 {
2290 void *rc;
2291 struct tcp_iter_state *st = seq->private;
2292
2293 st->state = TCP_SEQ_STATE_LISTENING;
2294 rc = listening_get_idx(seq, &pos);
2295
2296 if (!rc) {
2297 st->state = TCP_SEQ_STATE_ESTABLISHED;
2298 rc = established_get_idx(seq, pos);
2299 }
2300
2301 return rc;
2302 }
2303
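/* Resume the walk at the bucket/offset recorded by the previous read so that
 * sequential reads of /proc/net/tcp do not rescan the hash tables from the
 * beginning for every chunk.
 */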
2304 static void *tcp_seek_last_pos(struct seq_file *seq)
2305 {
2306 struct tcp_iter_state *st = seq->private;
2307 int offset = st->offset;
2308 int orig_num = st->num;
2309 void *rc = NULL;
2310
2311 switch (st->state) {
2312 case TCP_SEQ_STATE_LISTENING:
2313 if (st->bucket >= INET_LHTABLE_SIZE)
2314 break;
2315 st->state = TCP_SEQ_STATE_LISTENING;
2316 rc = listening_get_next(seq, NULL);
2317 while (offset-- && rc)
2318 rc = listening_get_next(seq, rc);
2319 if (rc)
2320 break;
2321 st->bucket = 0;
2322 st->state = TCP_SEQ_STATE_ESTABLISHED;
2323 /* Fallthrough */
2324 case TCP_SEQ_STATE_ESTABLISHED:
2325 if (st->bucket > tcp_hashinfo.ehash_mask)
2326 break;
2327 rc = established_get_first(seq);
2328 while (offset-- && rc)
2329 rc = established_get_next(seq, rc);
2330 }
2331
2332 st->num = orig_num;
2333
2334 return rc;
2335 }
2336
2337 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2338 {
2339 struct tcp_iter_state *st = seq->private;
2340 void *rc;
2341
2342 if (*pos && *pos == st->last_pos) {
2343 rc = tcp_seek_last_pos(seq);
2344 if (rc)
2345 goto out;
2346 }
2347
2348 st->state = TCP_SEQ_STATE_LISTENING;
2349 st->num = 0;
2350 st->bucket = 0;
2351 st->offset = 0;
2352 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2353
2354 out:
2355 st->last_pos = *pos;
2356 return rc;
2357 }
2358 EXPORT_SYMBOL(tcp_seq_start);
2359
2360 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2361 {
2362 struct tcp_iter_state *st = seq->private;
2363 void *rc = NULL;
2364
2365 if (v == SEQ_START_TOKEN) {
2366 rc = tcp_get_idx(seq, 0);
2367 goto out;
2368 }
2369
2370 switch (st->state) {
2371 case TCP_SEQ_STATE_LISTENING:
2372 rc = listening_get_next(seq, v);
2373 if (!rc) {
2374 st->state = TCP_SEQ_STATE_ESTABLISHED;
2375 st->bucket = 0;
2376 st->offset = 0;
2377 rc = established_get_first(seq);
2378 }
2379 break;
2380 case TCP_SEQ_STATE_ESTABLISHED:
2381 rc = established_get_next(seq, v);
2382 break;
2383 }
2384 out:
2385 ++*pos;
2386 st->last_pos = *pos;
2387 return rc;
2388 }
2389 EXPORT_SYMBOL(tcp_seq_next);
2390
2391 void tcp_seq_stop(struct seq_file *seq, void *v)
2392 {
2393 struct tcp_iter_state *st = seq->private;
2394
2395 switch (st->state) {
2396 case TCP_SEQ_STATE_LISTENING:
2397 if (v != SEQ_START_TOKEN)
2398 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2399 break;
2400 case TCP_SEQ_STATE_ESTABLISHED:
2401 if (v)
2402 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403 break;
2404 }
2405 }
2406 EXPORT_SYMBOL(tcp_seq_stop);
2407
2408 static void get_openreq4(const struct request_sock *req,
2409 struct seq_file *f, int i)
2410 {
2411 const struct inet_request_sock *ireq = inet_rsk(req);
2412 long delta = req->rsk_timer.expires - jiffies;
2413
2414 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2415 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2416 i,
2417 ireq->ir_loc_addr,
2418 ireq->ir_num,
2419 ireq->ir_rmt_addr,
2420 ntohs(ireq->ir_rmt_port),
2421 TCP_SYN_RECV,
2422 0, 0, /* could print option size, but that is af dependent. */
2423 1, /* timers active (only the expire timer) */
2424 jiffies_delta_to_clock_t(delta),
2425 req->num_timeout,
2426 from_kuid_munged(seq_user_ns(f),
2427 sock_i_uid(req->rsk_listener)),
2428 0, /* non-standard timer */
2429 0, /* open_requests have no inode */
2430 0,
2431 req);
2432 }
2433
2434 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2435 {
2436 int timer_active;
2437 unsigned long timer_expires;
2438 const struct tcp_sock *tp = tcp_sk(sk);
2439 const struct inet_connection_sock *icsk = inet_csk(sk);
2440 const struct inet_sock *inet = inet_sk(sk);
2441 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2442 __be32 dest = inet->inet_daddr;
2443 __be32 src = inet->inet_rcv_saddr;
2444 __u16 destp = ntohs(inet->inet_dport);
2445 __u16 srcp = ntohs(inet->inet_sport);
2446 int rx_queue;
2447 int state;
2448
2449 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2450 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2451 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2452 timer_active = 1;
2453 timer_expires = icsk->icsk_timeout;
2454 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2455 timer_active = 4;
2456 timer_expires = icsk->icsk_timeout;
2457 } else if (timer_pending(&sk->sk_timer)) {
2458 timer_active = 2;
2459 timer_expires = sk->sk_timer.expires;
2460 } else {
2461 timer_active = 0;
2462 timer_expires = jiffies;
2463 }
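	/* timer_active matches the "tr" column of /proc/net/tcp: 1 means
	 * retransmit/loss-probe, 2 keepalive (sk_timer), 4 zero-window probe,
	 * 0 no pending timer; TIME-WAIT sockets report 3 (see
	 * get_timewait4_sock()).
	 */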
2464
2465 state = inet_sk_state_load(sk);
2466 if (state == TCP_LISTEN)
2467 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2468 else
2469 /* Because we don't lock the socket,
2470 * we might find a transient negative value.
2471 */
2472 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2473 READ_ONCE(tp->copied_seq), 0);
2474
2475 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2476 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2477 i, src, srcp, dest, destp, state,
2478 READ_ONCE(tp->write_seq) - tp->snd_una,
2479 rx_queue,
2480 timer_active,
2481 jiffies_delta_to_clock_t(timer_expires - jiffies),
2482 icsk->icsk_retransmits,
2483 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2484 icsk->icsk_probes_out,
2485 sock_i_ino(sk),
2486 refcount_read(&sk->sk_refcnt), sk,
2487 jiffies_to_clock_t(icsk->icsk_rto),
2488 jiffies_to_clock_t(icsk->icsk_ack.ato),
2489 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2490 tp->snd_cwnd,
2491 state == TCP_LISTEN ?
2492 fastopenq->max_qlen :
2493 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2494 }
2495
2496 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2497 struct seq_file *f, int i)
2498 {
2499 long delta = tw->tw_timer.expires - jiffies;
2500 __be32 dest, src;
2501 __u16 destp, srcp;
2502
2503 dest = tw->tw_daddr;
2504 src = tw->tw_rcv_saddr;
2505 destp = ntohs(tw->tw_dport);
2506 srcp = ntohs(tw->tw_sport);
2507
2508 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2509 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2510 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2511 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2512 refcount_read(&tw->tw_refcnt), tw);
2513 }
2514
2515 #define TMPSZ 150
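/* Every record is padded with spaces to TMPSZ - 1 characters (plus the
 * trailing newline) via seq_setwidth()/seq_pad(), giving the fixed-width
 * lines that legacy /proc/net/tcp parsers expect.
 */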
2516
2517 static int tcp4_seq_show(struct seq_file *seq, void *v)
2518 {
2519 struct tcp_iter_state *st;
2520 struct sock *sk = v;
2521
2522 seq_setwidth(seq, TMPSZ - 1);
2523 if (v == SEQ_START_TOKEN) {
2524 seq_puts(seq, " sl local_address rem_address st tx_queue "
2525 "rx_queue tr tm->when retrnsmt uid timeout "
2526 "inode");
2527 goto out;
2528 }
2529 st = seq->private;
2530
2531 if (sk->sk_state == TCP_TIME_WAIT)
2532 get_timewait4_sock(v, seq, st->num);
2533 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2534 get_openreq4(v, seq, st->num);
2535 else
2536 get_tcp4_sock(v, seq, st->num);
2537 out:
2538 seq_pad(seq, '\n');
2539 return 0;
2540 }
2541
2542 static const struct seq_operations tcp4_seq_ops = {
2543 .show = tcp4_seq_show,
2544 .start = tcp_seq_start,
2545 .next = tcp_seq_next,
2546 .stop = tcp_seq_stop,
2547 };
2548
2549 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2550 .family = AF_INET,
2551 };
2552
2553 static int __net_init tcp4_proc_init_net(struct net *net)
2554 {
2555 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2556 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2557 return -ENOMEM;
2558 return 0;
2559 }
2560
2561 static void __net_exit tcp4_proc_exit_net(struct net *net)
2562 {
2563 remove_proc_entry("tcp", net->proc_net);
2564 }
2565
2566 static struct pernet_operations tcp4_net_ops = {
2567 .init = tcp4_proc_init_net,
2568 .exit = tcp4_proc_exit_net,
2569 };
2570
2571 int __init tcp4_proc_init(void)
2572 {
2573 return register_pernet_subsys(&tcp4_net_ops);
2574 }
2575
2576 void tcp4_proc_exit(void)
2577 {
2578 unregister_pernet_subsys(&tcp4_net_ops);
2579 }
2580 #endif /* CONFIG_PROC_FS */
2581
2582 struct proto tcp_prot = {
2583 .name = "TCP",
2584 .owner = THIS_MODULE,
2585 .close = tcp_close,
2586 .pre_connect = tcp_v4_pre_connect,
2587 .connect = tcp_v4_connect,
2588 .disconnect = tcp_disconnect,
2589 .accept = inet_csk_accept,
2590 .ioctl = tcp_ioctl,
2591 .init = tcp_v4_init_sock,
2592 .destroy = tcp_v4_destroy_sock,
2593 .shutdown = tcp_shutdown,
2594 .setsockopt = tcp_setsockopt,
2595 .getsockopt = tcp_getsockopt,
2596 .keepalive = tcp_set_keepalive,
2597 .recvmsg = tcp_recvmsg,
2598 .sendmsg = tcp_sendmsg,
2599 .sendpage = tcp_sendpage,
2600 .backlog_rcv = tcp_v4_do_rcv,
2601 .release_cb = tcp_release_cb,
2602 .hash = inet_hash,
2603 .unhash = inet_unhash,
2604 .get_port = inet_csk_get_port,
2605 .enter_memory_pressure = tcp_enter_memory_pressure,
2606 .leave_memory_pressure = tcp_leave_memory_pressure,
2607 .stream_memory_free = tcp_stream_memory_free,
2608 .sockets_allocated = &tcp_sockets_allocated,
2609 .orphan_count = &tcp_orphan_count,
2610 .memory_allocated = &tcp_memory_allocated,
2611 .memory_pressure = &tcp_memory_pressure,
2612 .sysctl_mem = sysctl_tcp_mem,
2613 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2614 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2615 .max_header = MAX_TCP_HEADER,
2616 .obj_size = sizeof(struct tcp_sock),
2617 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2618 .twsk_prot = &tcp_timewait_sock_ops,
2619 .rsk_prot = &tcp_request_sock_ops,
2620 .h.hashinfo = &tcp_hashinfo,
2621 .no_autobind = true,
2622 #ifdef CONFIG_COMPAT
2623 .compat_setsockopt = compat_tcp_setsockopt,
2624 .compat_getsockopt = compat_tcp_getsockopt,
2625 #endif
2626 .diag_destroy = tcp_abort,
2627 };
2628 EXPORT_SYMBOL(tcp_prot);
2629
2630 static void __net_exit tcp_sk_exit(struct net *net)
2631 {
2632 int cpu;
2633
2634 if (net->ipv4.tcp_congestion_control)
2635 module_put(net->ipv4.tcp_congestion_control->owner);
2636
2637 for_each_possible_cpu(cpu)
2638 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2639 free_percpu(net->ipv4.tcp_sk);
2640 }
2641
2642 static int __net_init tcp_sk_init(struct net *net)
2643 {
2644 int res, cpu, cnt;
2645
2646 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2647 if (!net->ipv4.tcp_sk)
2648 return -ENOMEM;
2649
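	/* One kernel control socket per possible CPU: these are what the
	 * stateless reply paths (RSTs for unknown connections, TIME-WAIT
	 * ACKs) transmit through.
	 */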
2650 for_each_possible_cpu(cpu) {
2651 struct sock *sk;
2652
2653 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2654 IPPROTO_TCP, net);
2655 if (res)
2656 goto fail;
2657 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2658
2659 /* Enforce IP_DF and IPID == 0 for the RSTs and
2660 * ACKs sent in SYN-RECV and TIME-WAIT state.
2661 */
2662 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2663
2664 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2665 }
2666
2667 net->ipv4.sysctl_tcp_ecn = 2;
2668 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2669
2670 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2671 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2672 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2673 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2674 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2675
2676 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2677 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2678 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2679
2680 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2681 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2682 net->ipv4.sysctl_tcp_syncookies = 1;
2683 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2684 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2685 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2686 net->ipv4.sysctl_tcp_orphan_retries = 0;
2687 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2688 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
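	/* tcp_tw_reuse: 0 = disabled, 1 = reuse TIME-WAIT sockets for new
	 * outgoing connections, 2 = allow that for loopback traffic only.
	 */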
2689 net->ipv4.sysctl_tcp_tw_reuse = 2;
2690 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2691
2692 cnt = tcp_hashinfo.ehash_mask + 1;
2693 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2694 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2695
2696 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2697 net->ipv4.sysctl_tcp_sack = 1;
2698 net->ipv4.sysctl_tcp_window_scaling = 1;
2699 net->ipv4.sysctl_tcp_timestamps = 1;
2700 net->ipv4.sysctl_tcp_early_retrans = 3;
2701 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2702 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2703 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2704 net->ipv4.sysctl_tcp_max_reordering = 300;
2705 net->ipv4.sysctl_tcp_dsack = 1;
2706 net->ipv4.sysctl_tcp_app_win = 31;
2707 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2708 net->ipv4.sysctl_tcp_frto = 2;
2709 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2710 /* This limits the percentage of the congestion window which we
2711 * will allow a single TSO frame to consume. Building TSO frames
2712 * which are too large can cause TCP streams to be bursty.
2713 */
2714 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2715 /* Default TSQ limit of 16 TSO segments */
2716 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2717 /* RFC 5961 challenge ACK rate limiting */
2718 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2719 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2720 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2721 net->ipv4.sysctl_tcp_autocorking = 1;
2722 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2723 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2724 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2725 if (net != &init_net) {
2726 memcpy(net->ipv4.sysctl_tcp_rmem,
2727 init_net.ipv4.sysctl_tcp_rmem,
2728 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2729 memcpy(net->ipv4.sysctl_tcp_wmem,
2730 init_net.ipv4.sysctl_tcp_wmem,
2731 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2732 }
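	/* Child namespaces start out with whatever tcp_{r,w}mem limits the
	 * initial namespace has at creation time; init_net itself keeps the
	 * defaults established at boot.
	 */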
2733 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2734 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2735 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2736 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2737 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2738 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2739
2740 /* Reno is always built in */
2741 if (!net_eq(net, &init_net) &&
2742 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2743 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2744 else
2745 net->ipv4.tcp_congestion_control = &tcp_reno;
2746
2747 return 0;
2748 fail:
2749 tcp_sk_exit(net);
2750
2751 return res;
2752 }
2753
2754 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2755 {
2756 struct net *net;
2757
2758 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2759
2760 list_for_each_entry(net, net_exit_list, exit_list)
2761 tcp_fastopen_ctx_destroy(net);
2762 }
2763
2764 static struct pernet_operations __net_initdata tcp_sk_ops = {
2765 .init = tcp_sk_init,
2766 .exit = tcp_sk_exit,
2767 .exit_batch = tcp_sk_exit_batch,
2768 };
2769
2770 void __init tcp_v4_init(void)
2771 {
2772 if (register_pernet_subsys(&tcp_sk_ops))
2773 panic("Failed to create the TCP control socket.\n");
2774 }