/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

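/* Derive the initial sequence number for a new connection from the
 * source/destination addresses and ports of the incoming SYN.
 */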
static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	inet_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

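/* Hand an ICMP redirect to the routing layer so it can update the
 * cached destination for this socket, if one is still attached.
 */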
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

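/* Compute the TCP checksum of an outgoing segment.  With hardware
 * checksum offload (CHECKSUM_PARTIAL) only the pseudo-header sum is
 * filled in and the device completes the rest; otherwise the full
 * checksum is computed in software.
 */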
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So that we build reply only basing on parameters
 *	arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not loosen security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

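/* Build and send a bare ACK (optionally carrying timestamps and an MD5
 * signature) as a reply to the incoming skb, without any full socket.
 */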
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

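/* ACK a segment that arrived for a connection now in TIME-WAIT, using
 * the state saved in the timewait bucket, then drop our reference to it.
 */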
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

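/* Remove and free every MD5 key attached to the socket (final-teardown
 * path, hence the unconditional rcu_dereference_protected below).
 */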
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

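/* setsockopt(TCP_MD5SIG) handler: copy the key description from user
 * space, then add or (for a zero-length key) delete the matching entry.
 */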
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
				      const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	bool ret;

	rcu_read_lock();
	ret = __tcp_v4_inbound_md5_hash(sk, skb);
	rcu_read_unlock();

	return ret;
}

#endif

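/* Fill in the IPv4-specific fields of a new request sock from the
 * incoming SYN: addresses, transparent-proxy flag and saved IP options.
 */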
static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}

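/* Route the SYN-ACK for a request sock; *strict is set when the routed
 * destination matches the request's remote address exactly.
 */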
static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family = PF_INET,
	.obj_size = sizeof(struct tcp_request_sock),
	.rtx_syn_ack = tcp_rtx_synack,
	.send_ack = tcp_v4_reqsk_send_ack,
	.destructor = tcp_v4_reqsk_destructor,
	.send_reset = tcp_v4_send_reset,
	.syn_ack_timeout = tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup = tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
	.init_req = tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq = cookie_v4_init_sequence,
#endif
	.route_req = tcp_v4_route_req,
	.init_seq = tcp_v4_init_sequence,
	.send_synack = tcp_v4_send_synack,
	.queue_hash_add = inet_csk_reqsk_queue_hash_add,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	newinet->inet_daddr = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt = NULL;
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

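/* Work out what the incoming segment belongs to on a listening socket:
 * a pending request sock, an already established child, or (with
 * syncookies) a connection reconstructed from a valid cookie ACK.
 */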
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

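/* Early demux: look up an established socket for the incoming segment
 * straight from the IP layer, so its cached rx dst can be reused and a
 * second hash lookup avoided later in tcp_v4_rcv().
 */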
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

1da177e4
LT
1578/*
1579 * From tcp_input.c
1580 */
1581
1582int tcp_v4_rcv(struct sk_buff *skb)
1583{
eddc9ec5 1584 const struct iphdr *iph;
cf533ea5 1585 const struct tcphdr *th;
1da177e4
LT
1586 struct sock *sk;
1587 int ret;
a86b1e30 1588 struct net *net = dev_net(skb->dev);
1da177e4
LT
1589
1590 if (skb->pkt_type != PACKET_HOST)
1591 goto discard_it;
1592
1593 /* Count it even if it's bad */
63231bdd 1594 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1595
1596 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1597 goto discard_it;
1598
aa8223c7 1599 th = tcp_hdr(skb);
1da177e4
LT
1600
1601 if (th->doff < sizeof(struct tcphdr) / 4)
1602 goto bad_packet;
1603 if (!pskb_may_pull(skb, th->doff * 4))
1604 goto discard_it;
1605
1606 /* An explanation is required here, I think.
1607 * Packet length and doff are validated by header prediction,
caa20d9a 1608 * provided case of th->doff==0 is eliminated.
1da177e4 1609 * So, we defer the checks. */
ed70fcfc
TH
1610
1611 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1612 goto csum_error;
1da177e4 1613
aa8223c7 1614 th = tcp_hdr(skb);
eddc9ec5 1615 iph = ip_hdr(skb);
971f10ec
ED
1616 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1617 * barrier() makes sure compiler wont play fool^Waliasing games.
1618 */
1619 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1620 sizeof(struct inet_skb_parm));
1621 barrier();
1622
1da177e4
LT
1623 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1624 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1625 skb->len - th->doff * 4);
1626 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
e11ecddf 1627 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
04317daf 1628 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
b82d1bb4 1629 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1630 TCP_SKB_CB(skb)->sacked = 0;
1631
9a1f27c4 1632 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1633 if (!sk)
1634 goto no_tcp_socket;
1635
bb134d5d
ED
1636process:
1637 if (sk->sk_state == TCP_TIME_WAIT)
1638 goto do_time_wait;
1639
6cce09f8
ED
1640 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1641 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1642 goto discard_and_relse;
6cce09f8 1643 }
d218d111 1644
1da177e4
LT
1645 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1646 goto discard_and_relse;
9ea88a15
DP
1647
1648#ifdef CONFIG_TCP_MD5SIG
1649 /*
1650 * We really want to reject the packet as early as possible
1651 * if:
1652 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1653 * o There is an MD5 option and we're not expecting one
1654 */
1655 if (tcp_v4_inbound_md5_hash(sk, skb))
1656 goto discard_and_relse;
1657#endif
1658
b59c2701 1659 nf_reset(skb);
1da177e4 1660
fda9ef5d 1661 if (sk_filter(sk, skb))
1da177e4
LT
1662 goto discard_and_relse;
1663
8b80cda5 1664 sk_mark_napi_id(sk, skb);
1da177e4
LT
1665 skb->dev = NULL;
1666
c6366184 1667 bh_lock_sock_nested(sk);
1da177e4
LT
1668 ret = 0;
1669 if (!sock_owned_by_user(sk)) {
7bced397 1670 if (!tcp_prequeue(sk, skb))
1da177e4 1671 ret = tcp_v4_do_rcv(sk, skb);
da882c1f
ED
1672 } else if (unlikely(sk_add_backlog(sk, skb,
1673 sk->sk_rcvbuf + sk->sk_sndbuf))) {
6b03a53a 1674 bh_unlock_sock(sk);
6cce09f8 1675 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
1676 goto discard_and_relse;
1677 }
1da177e4
LT
1678 bh_unlock_sock(sk);
1679
1680 sock_put(sk);
1681
1682 return ret;
1683
no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

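/*
 * Illustrative sketch (standalone userspace C, not part of this file):
 * the csum_error path above fires when the RFC 1071 one's-complement
 * sum over the IPv4 pseudo header and the TCP segment does not verify.
 * The helper names below (csum_add, csum_fold, tcp_checksum) are
 * hypothetical.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>

static uint32_t csum_add(uint32_t sum, const uint8_t *buf, size_t len)
{
	while (len > 1) {
		sum += (uint32_t)((buf[0] << 8) | buf[1]);
		buf += 2;
		len -= 2;
	}
	if (len)			/* pad an odd trailing byte with zero */
		sum += (uint32_t)buf[0] << 8;
	return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)		/* fold carries back into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Pseudo header: saddr, daddr, zero, protocol, TCP length (RFC 793). */
static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
			     const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += IPPROTO_TCP + (uint32_t)len;
	return csum_fold(csum_add(sum, seg, len));
}

int main(void)
{
	uint8_t seg[20] = { 0 };	/* bare TCP header, csum at bytes 16-17 */
	uint16_t csum;

	seg[12] = 5 << 4;		/* data offset: 5 words */
	csum = tcp_checksum(0x7f000001, 0x7f000001, seg, sizeof(seg));
	seg[16] = csum >> 8;
	seg[17] = csum & 0xff;
	/* A segment carrying a valid checksum re-sums to zero. */
	printf("verify: 0x%04x\n",
	       (unsigned)tcp_checksum(0x7f000001, 0x7f000001, seg, sizeof(seg)));
	return 0;
}
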
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	 = sizeof(struct tcp_timewait_sock),
	.twsk_unique	 = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

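/* Cache the incoming route on the socket so later packets for this flow
 * (e.g. via early demux) can skip a full route lookup.
 */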
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst) {
		dst_hold(dst);
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: many fields are set to zero explicitly by sk_alloc(),
 * so they need not be initialized here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up the prequeue; it should already be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk != NULL);

	/* If the socket was aborted during a connect operation,
	 * free any pending Fast Open request. */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur. If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero, the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
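
/* The iterator above walks each listening bucket and, for every listener,
 * descends into its SYN table (TCP_SEQ_STATE_OPENREQ) before moving on,
 * so embryonic connections are reported alongside their listeners.
 */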

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

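/* When a reader comes back at the position it stopped at (st->last_pos),
 * the function above resumes from the saved bucket and replays st->offset
 * entries within it, instead of rescanning both hash tables from scratch.
 */
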
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
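		/* Fall through: in OPENREQ state the listening bucket
		 * lock is held as well and must also be released.
		 */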
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	s->last_pos = 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start	= tcp_seq_start;
	afinfo->seq_ops.next	= tcp_seq_next;
	afinfo->seq_ops.stop	= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	__be32 dest, src;
	__u16 destp, srcp;
	s32 delta = tw->tw_ttd - inet_tw_time_stamp();

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (sk->sk_state == TCP_TIME_WAIT)
			get_timewait4_sock(v, seq, st->num);
		else
			get_tcp4_sock(v, seq, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
		break;
	}
out:
	seq_pad(seq, '\n');
	return 0;
}

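/*
 * Illustrative sketch (standalone userspace C, not part of this file):
 * reading back the /proc/net/tcp format emitted by tcp4_seq_show() above.
 * Addresses and ports are printed in hexadecimal, and the state column
 * matches the kernel's numeric TCP states.
 */
#include <stdio.h>

int main(void)
{
	FILE *fp = fopen("/proc/net/tcp", "r");
	char line[256];
	unsigned int laddr, lport, raddr, rport, state;

	if (!fp)
		return 1;
	if (!fgets(line, sizeof(line), fp)) {	/* skip the header line */
		fclose(fp);
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) == 5)
			printf("%08X:%04X -> %08X:%04X state %02X\n",
			       laddr, lport, raddr, rport, state);
	}
	fclose(fp);
	return 0;
}
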
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = tcp_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);

static int __net_init tcp_sk_init(struct net *net)
{
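	/* 2 = accept ECN when requested by incoming connections, but do
	 * not request ECN on outgoing ones (the tcp_ecn sysctl default).
	 */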
	net->ipv4.sysctl_tcp_ecn = 2;
	return 0;
}

static void __net_exit tcp_sk_exit(struct net *net)
{
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}