--- /dev/null
+From 36b9639fc96df9a60cc08401cdeddb5360141b63 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 20 Nov 2020 11:11:33 +0000
+Subject: tcp: fix race condition when creating child sockets from syncookies
+
+From: Ricardo Dias <rdias@singlestore.com>
+
+[ Upstream commit 01770a166165738a6e05c3d911fb4609cc4eb416 ]
+
+When the TCP stack is in SYN flood mode, the server child socket is
+created from the SYN cookie received in a TCP packet with the ACK flag
+set.
+
+The child socket is created when the server receives the first TCP
+packet with a valid SYN cookie from the client. Usually, this packet
+corresponds to the final step of the TCP 3-way handshake, the ACK
+packet. But it is also possible to receive a valid SYN cookie from the
+first TCP data packet sent by the client, and thus create a child socket
+from that SYN cookie.
+
+Since a client socket is ready to send data as soon as it receives the
+SYN+ACK packet from the server, the client can send the ACK packet (sent
+by the TCP stack code), and the first data packet (sent by the userspace
+program) almost at the same time, and thus the server will equally
+receive the two TCP packets with valid SYN cookies almost at the same
+instant.
+
+When such an event happens, the TCP stack code has a race condition that
+occurs between the moment a lookup is done to the established
+connections hashtable to check for the existence of a connection for the
+same client, and the moment that the child socket is added to the
+established connections hashtable. As a consequence, this race condition
+can lead to a situation where we add two child sockets to the
+established connections hashtable and deliver two sockets to the
+userspace program to the same client.
+
+This patch fixes the race condition by checking if an existing child
+socket exists for the same client when we are adding the second child
+socket to the established connections hashtable. If an existing child
+socket exists, we drop the packet and discard the second child socket
+to the same client.
+
+Signed-off-by: Ricardo Dias <rdias@singlestore.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/inet_hashtables.h | 5 ++-
+ net/dccp/ipv4.c | 2 +-
+ net/dccp/ipv6.c | 2 +-
+ net/ipv4/inet_connection_sock.c | 2 +-
+ net/ipv4/inet_hashtables.c | 68 +++++++++++++++++++++++++++++----
+ net/ipv4/tcp_ipv4.c | 15 +++++++-
+ net/ipv6/tcp_ipv6.c | 13 ++++++-
+ 7 files changed, 91 insertions(+), 16 deletions(-)
+
+diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
+index b875dcef173c..fa5fe23ca6aa 100644
+--- a/include/net/inet_hashtables.h
++++ b/include/net/inet_hashtables.h
+@@ -232,8 +232,9 @@ void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+ unsigned long low_limit,
+ unsigned long high_limit);
+
+-bool inet_ehash_insert(struct sock *sk, struct sock *osk);
+-bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
++bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk);
++bool inet_ehash_nolisten(struct sock *sk, struct sock *osk,
++ bool *found_dup_sk);
+ int __inet_hash(struct sock *sk, struct sock *osk);
+ int inet_hash(struct sock *sk);
+ void inet_unhash(struct sock *sk);
+diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
+index 176bddacc16e..7e93087d1366 100644
+--- a/net/dccp/ipv4.c
++++ b/net/dccp/ipv4.c
+@@ -428,7 +428,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
+
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
++ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL);
+ if (*own_req)
+ ireq->ireq_opt = NULL;
+ else
+diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
+index 2cd3508a3786..ae4851fdbe9e 100644
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -538,7 +538,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
+ dccp_done(newsk);
+ goto out;
+ }
+- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
++ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL);
+ /* Clone pktoptions received with SYN, if we own the req */
+ if (*own_req && ireq->pktopts) {
+ newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
+diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
+index 439a55d1aa99..0a69f92da71b 100644
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -793,7 +793,7 @@ static void reqsk_queue_hash_req(struct request_sock *req,
+ timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+ mod_timer(&req->rsk_timer, jiffies + timeout);
+
+- inet_ehash_insert(req_to_sk(req), NULL);
++ inet_ehash_insert(req_to_sk(req), NULL, NULL);
+ /* before letting lookups find us, make sure all req fields
+ * are committed to memory and refcnt initialized.
+ */
+diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
+index 3a5f12f011cb..c96a5871b49d 100644
+--- a/net/ipv4/inet_hashtables.c
++++ b/net/ipv4/inet_hashtables.c
+@@ -24,6 +24,9 @@
+ #include <net/addrconf.h>
+ #include <net/inet_connection_sock.h>
+ #include <net/inet_hashtables.h>
++#if IS_ENABLED(CONFIG_IPV6)
++#include <net/inet6_hashtables.h>
++#endif
+ #include <net/secure_seq.h>
+ #include <net/ip.h>
+ #include <net/tcp.h>
+@@ -513,10 +516,52 @@ static u32 inet_sk_port_offset(const struct sock *sk)
+ inet->inet_dport);
+ }
+
+-/* insert a socket into ehash, and eventually remove another one
+- * (The another one can be a SYN_RECV or TIMEWAIT
++/* Searches for an existing socket in the ehash bucket list.
++ * Returns true if found, false otherwise.
+ */
+-bool inet_ehash_insert(struct sock *sk, struct sock *osk)
++static bool inet_ehash_lookup_by_sk(struct sock *sk,
++ struct hlist_nulls_head *list)
++{
++ const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
++ const int sdif = sk->sk_bound_dev_if;
++ const int dif = sk->sk_bound_dev_if;
++ const struct hlist_nulls_node *node;
++ struct net *net = sock_net(sk);
++ struct sock *esk;
++
++ INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
++
++ sk_nulls_for_each_rcu(esk, node, list) {
++ if (esk->sk_hash != sk->sk_hash)
++ continue;
++ if (sk->sk_family == AF_INET) {
++ if (unlikely(INET_MATCH(esk, net, acookie,
++ sk->sk_daddr,
++ sk->sk_rcv_saddr,
++ ports, dif, sdif))) {
++ return true;
++ }
++ }
++#if IS_ENABLED(CONFIG_IPV6)
++ else if (sk->sk_family == AF_INET6) {
++ if (unlikely(INET6_MATCH(esk, net,
++ &sk->sk_v6_daddr,
++ &sk->sk_v6_rcv_saddr,
++ ports, dif, sdif))) {
++ return true;
++ }
++ }
++#endif
++ }
++ return false;
++}
++
++/* Insert a socket into ehash, and eventually remove another one
++ * (The another one can be a SYN_RECV or TIMEWAIT)
++ * If an existing socket already exists, socket sk is not inserted,
++ * and sets found_dup_sk parameter to true.
++ */
++bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
+ {
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct hlist_nulls_head *list;
+@@ -535,16 +580,23 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
+ if (osk) {
+ WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+ ret = sk_nulls_del_node_init_rcu(osk);
++ } else if (found_dup_sk) {
++ *found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
++ if (*found_dup_sk)
++ ret = false;
+ }
++
+ if (ret)
+ __sk_nulls_add_node_rcu(sk, list);
++
+ spin_unlock(lock);
++
+ return ret;
+ }
+
+-bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
++bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
+ {
+- bool ok = inet_ehash_insert(sk, osk);
++ bool ok = inet_ehash_insert(sk, osk, found_dup_sk);
+
+ if (ok) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+@@ -588,7 +640,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
+ int err = 0;
+
+ if (sk->sk_state != TCP_LISTEN) {
+- inet_ehash_nolisten(sk, osk);
++ inet_ehash_nolisten(sk, osk, NULL);
+ return 0;
+ }
+ WARN_ON(!sk_unhashed(sk));
+@@ -683,7 +735,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+- inet_ehash_nolisten(sk, NULL);
++ inet_ehash_nolisten(sk, NULL, NULL);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ }
+@@ -759,7 +811,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->inet_sport = htons(port);
+- inet_ehash_nolisten(sk, (struct sock *)tw);
++ inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
+ }
+ if (tw)
+ inet_twsk_bind_unhash(tw, hinfo);
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index de4edfbc9e46..7ebdfab5b24a 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -1415,6 +1415,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
+ bool *own_req)
+ {
+ struct inet_request_sock *ireq;
++ bool found_dup_sk = false;
+ struct inet_sock *newinet;
+ struct tcp_sock *newtp;
+ struct sock *newsk;
+@@ -1485,12 +1486,22 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
+
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
++ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
++ &found_dup_sk);
+ 	if (likely(*own_req)) {
+ 		tcp_move_syn(newtp, req);
+ 		ireq->ireq_opt = NULL;
+ 	} else {
+ 		newinet->inet_opt = NULL;
++
++		if (!req_unhash && found_dup_sk) {
++			/* Syncookie-only path. inet_opt was cleared above, so
++			 * sock_put() won't free options the req still owns.
++			 */
++			bh_unlock_sock(newsk);
++			sock_put(newsk);
++			newsk = NULL;
++		}
+ 	}
+ return newsk;
+
+diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
+index e8d206725cb7..c332f75f4e9a 100644
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -1090,6 +1090,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
+ struct ipv6_txoptions *opt;
+ struct tcp6_sock *newtcp6sk;
+ struct inet_sock *newinet;
++ bool found_dup_sk = false;
+ struct tcp_sock *newtp;
+ struct sock *newsk;
+ #ifdef CONFIG_TCP_MD5SIG
+@@ -1258,7 +1259,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
+ tcp_done(newsk);
+ goto out;
+ }
+- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
++ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
++ &found_dup_sk);
+ if (*own_req) {
+ tcp_move_syn(newtp, req);
+
+@@ -1273,6 +1275,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
+ skb_set_owner_r(newnp->pktoptions, newsk);
+ }
+ }
++ } else {
++ if (!req_unhash && found_dup_sk) {
++ /* This code path should only be executed in the
++ * syncookie case only
++ */
++ bh_unlock_sock(newsk);
++ sock_put(newsk);
++ newsk = NULL;
++ }
+ }
+
+ return newsk;
+--
+2.35.1
+