From 055cbdd693042d5600e6025cd368302fba6ab814 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 11 Oct 2018 16:07:14 +0200 Subject: [PATCH] 4.4-stable patches added patches: tcp-add-tcp_ooo_try_coalesce-helper.patch tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch tcp-increment-sk_drops-for-dropped-rx-packets.patch tcp-use-an-rb-tree-for-ooo-receive-queue.patch --- queue-4.4/series | 6 + .../tcp-add-tcp_ooo_try_coalesce-helper.patch | 75 ++ ...all-tcp_drop-from-tcp_data_queue_ofo.patch | 45 ++ ...a-stale-ooo_last_skb-after-a-replace.patch | 76 ++ ...es-of-packets-in-tcp_prune_ofo_queue.patch | 79 ++ ...ment-sk_drops-for-dropped-rx-packets.patch | 178 ++++ ...use-an-rb-tree-for-ooo-receive-queue.patch | 757 ++++++++++++++++++ 7 files changed, 1216 insertions(+) create mode 100644 queue-4.4/tcp-add-tcp_ooo_try_coalesce-helper.patch create mode 100644 queue-4.4/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch create mode 100644 queue-4.4/tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch create mode 100644 queue-4.4/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch create mode 100644 queue-4.4/tcp-increment-sk_drops-for-dropped-rx-packets.patch create mode 100644 queue-4.4/tcp-use-an-rb-tree-for-ooo-receive-queue.patch diff --git a/queue-4.4/series b/queue-4.4/series index 960f761dd15..20189986b39 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -17,3 +17,9 @@ powerpc-fadump-return-error-when-fadump-registration-fails.patch arc-clone-syscall-to-setp-r25-as-thread-pointer.patch ucma-fix-a-use-after-free-in-ucma_resolve_ip.patch ubifs-check-for-name-being-null-while-mounting.patch +tcp-increment-sk_drops-for-dropped-rx-packets.patch +tcp-use-an-rb-tree-for-ooo-receive-queue.patch +tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch +tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch +tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch +tcp-add-tcp_ooo_try_coalesce-helper.patch diff --git a/queue-4.4/tcp-add-tcp_ooo_try_coalesce-helper.patch b/queue-4.4/tcp-add-tcp_ooo_try_coalesce-helper.patch new file mode 100644 index 00000000000..6105e769cc0 --- /dev/null +++ b/queue-4.4/tcp-add-tcp_ooo_try_coalesce-helper.patch @@ -0,0 +1,75 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:10 +0800 +Subject: tcp: add tcp_ooo_try_coalesce() helper +To: , , , , , , , +Message-ID: <1536913450-12380-7-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c ] + +In case skb in out_or_order_queue is the result of +multiple skbs coalescing, we would like to get a proper gso_segs +counter tracking, so that future tcp_drop() can report an accurate +number. + +I chose to not implement this tracking for skbs in receive queue, +since they are not dropped, unless socket is disconnected. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Signed-off-by: David S. 
Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 23 +++++++++++++++++++++-- + 1 file changed, 21 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4296,6 +4296,23 @@ static bool tcp_try_coalesce(struct sock + return true; + } + ++static bool tcp_ooo_try_coalesce(struct sock *sk, ++ struct sk_buff *to, ++ struct sk_buff *from, ++ bool *fragstolen) ++{ ++ bool res = tcp_try_coalesce(sk, to, from, fragstolen); ++ ++ /* In case tcp_drop() is called later, update to->gso_segs */ ++ if (res) { ++ u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + ++ max_t(u16, 1, skb_shinfo(from)->gso_segs); ++ ++ skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); ++ } ++ return res; ++} ++ + static void tcp_drop(struct sock *sk, struct sk_buff *skb) + { + sk_drops_add(sk, skb); +@@ -4422,7 +4439,8 @@ static void tcp_data_queue_ofo(struct so + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ +- if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { ++ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, ++ skb, &fragstolen)) { + coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); +@@ -4467,7 +4485,8 @@ coalesce_done: + tcp_drop(sk, skb1); + goto merge_right; + } +- } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { ++ } else if (tcp_ooo_try_coalesce(sk, skb1, ++ skb, &fragstolen)) { + goto coalesce_done; + } + p = &parent->rb_right; diff --git a/queue-4.4/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch b/queue-4.4/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch new file mode 100644 index 00000000000..13427c8cc8a --- /dev/null +++ b/queue-4.4/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch @@ -0,0 +1,45 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:09 +0800 +Subject: tcp: call tcp_drop() from tcp_data_queue_ofo() +To: , , , , , , , +Message-ID: <1536913450-12380-6-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 8541b21e781a22dce52a74fef0b9bed00404a1cd ] + +In order to be able to give better diagnostics and detect +malicious traffic, we need to have better sk->sk_drops tracking. + +Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4445,7 +4445,7 @@ coalesce_done: + /* All the bits are present. Drop. 
*/ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; +@@ -4464,7 +4464,7 @@ coalesce_done: + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb1); ++ tcp_drop(sk, skb1); + goto merge_right; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { diff --git a/queue-4.4/tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch b/queue-4.4/tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch new file mode 100644 index 00000000000..164e4593062 --- /dev/null +++ b/queue-4.4/tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch @@ -0,0 +1,76 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:07 +0800 +Subject: tcp: fix a stale ooo_last_skb after a replace +To: , , , , , , , +Message-ID: <1536913450-12380-4-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 76f0dcbb5ae1a7c3dbeec13dd98233b8e6b0b32a ] + +When skb replaces another one in ooo queue, I forgot to also +update tp->ooo_last_skb as well, if the replaced skb was the last one +in the queue. + +To fix this, we simply can re-use the code that runs after an insertion, +trying to merge skbs at the right of current skb. + +This not only fixes the bug, but also remove all small skbs that might +be a subset of the new one. + +Example: + +We receive segments 2001:3001, 4001:5001 + +Then we receive 2001:8001 : We should replace 2001:3001 with the big +skb, but also remove 4001:50001 from the queue to save space. + +packetdrill test demonstrating the bug + +0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 1) = 0 + ++0 < S 0:0(0) win 32792 ++0 > S. 0:0(0) ack 1 ++0.100 < . 1:1(0) ack 1 win 1024 ++0 accept(3, ..., ...) = 4 + ++0.01 < . 1001:2001(1000) ack 1 win 1024 ++0 > . 1:1(0) ack 1 + ++0.01 < . 1001:3001(2000) ack 1 win 1024 ++0 > . 1:1(0) ack 1 + +Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") +Signed-off-by: Eric Dumazet +Reported-by: Yuchung Cheng +Cc: Yaogong Wang +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4465,7 +4465,7 @@ coalesce_done: + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); +- goto add_sack; ++ goto merge_right; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; +@@ -4477,6 +4477,7 @@ coalesce_done: + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + ++merge_right: + /* Remove other segments covered by skb. 
*/ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); diff --git a/queue-4.4/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch b/queue-4.4/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch new file mode 100644 index 00000000000..e6c44a5f172 --- /dev/null +++ b/queue-4.4/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch @@ -0,0 +1,79 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:08 +0800 +Subject: tcp: free batches of packets in tcp_prune_ofo_queue() +To: , , , , , , , +Message-ID: <1536913450-12380-5-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 72cd43ba64fc172a443410ce01645895850844c8 ] + +Juha-Matti Tilli reported that malicious peers could inject tiny +packets in out_of_order_queue, forcing very expensive calls +to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for +every incoming packet. out_of_order_queue rb-tree can contain +thousands of nodes, iterating over all of them is not nice. + +Before linux-4.9, we would have pruned all packets in ofo_queue +in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs +truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB. + +Since we plan to increase tcp_rmem[2] in the future to cope with +modern BDP, can not revert to the old behavior, without great pain. + +Strategy taken in this patch is to purge ~12.5 % of the queue capacity. + +Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets") +Signed-off-by: Eric Dumazet +Reported-by: Juha-Matti Tilli +Acked-by: Yuchung Cheng +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4899,27 +4899,33 @@ new_range: + + /* + * Purge the out-of-order queue. ++ * Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. + * Return true if queue was pruned. 
+ */ + static bool tcp_prune_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node *node, *prev; ++ int goal; + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) + return false; + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); +- ++ goal = sk->sk_rcvbuf >> 3; + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); ++ goal -= rb_to_skb(node)->truesize; + __kfree_skb(rb_to_skb(node)); +- sk_mem_reclaim(sk); +- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && +- !tcp_under_memory_pressure(sk)) +- break; ++ if (!prev || goal <= 0) { ++ sk_mem_reclaim(sk); ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && ++ !tcp_under_memory_pressure(sk)) ++ break; ++ goal = sk->sk_rcvbuf >> 3; ++ } + + node = prev; + } while (node); diff --git a/queue-4.4/tcp-increment-sk_drops-for-dropped-rx-packets.patch b/queue-4.4/tcp-increment-sk_drops-for-dropped-rx-packets.patch new file mode 100644 index 00000000000..faf9a3f98e3 --- /dev/null +++ b/queue-4.4/tcp-increment-sk_drops-for-dropped-rx-packets.patch @@ -0,0 +1,178 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:05 +0800 +Subject: tcp: increment sk_drops for dropped rx packets +To: , , , , , , , +Message-ID: <1536913450-12380-2-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 532182cd610782db8c18230c2747626562032205 ] + +Now ss can report sk_drops, we can instruct TCP to increment +this per socket counter when it drops an incoming frame, to refine +monitoring and debugging. + +Following patch takes care of listeners drops. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + include/net/sock.h | 7 +++++++ + net/ipv4/tcp_input.c | 33 ++++++++++++++++++++------------- + net/ipv4/tcp_ipv4.c | 1 + + net/ipv6/tcp_ipv6.c | 1 + + 4 files changed, 29 insertions(+), 13 deletions(-) + +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -2139,6 +2139,13 @@ sock_skb_set_dropcount(const struct sock + SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops); + } + ++static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) ++{ ++ int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); ++ ++ atomic_add(segs, &sk->sk_drops); ++} ++ + void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb); + void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4296,6 +4296,12 @@ static bool tcp_try_coalesce(struct sock + return true; + } + ++static void tcp_drop(struct sock *sk, struct sk_buff *skb) ++{ ++ sk_drops_add(sk, skb); ++ __kfree_skb(skb); ++} ++ + /* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. 
+ */ +@@ -4320,7 +4326,7 @@ static void tcp_ofo_queue(struct sock *s + __skb_unlink(skb, &tp->out_of_order_queue); + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "ofo packet was already received\n"); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + continue; + } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", +@@ -4372,7 +4378,7 @@ static void tcp_data_queue_ofo(struct so + + if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + return; + } + +@@ -4436,7 +4442,7 @@ static void tcp_data_queue_ofo(struct so + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; +@@ -4475,7 +4481,7 @@ static void tcp_data_queue_ofo(struct so + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb1); ++ tcp_drop(sk, skb1); + } + + add_sack: +@@ -4558,12 +4564,13 @@ err: + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); +- int eaten = -1; + bool fragstolen = false; ++ int eaten = -1; + +- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) +- goto drop; +- ++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { ++ __kfree_skb(skb); ++ return; ++ } + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + +@@ -4645,7 +4652,7 @@ out_of_window: + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + inet_csk_schedule_ack(sk); + drop: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + return; + } + +@@ -5236,7 +5243,7 @@ syn_challenge: + return true; + + discard: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + return false; + } + +@@ -5454,7 +5461,7 @@ csum_error: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + + discard: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + } + EXPORT_SYMBOL(tcp_rcv_established); + +@@ -5684,7 +5691,7 @@ static int tcp_rcv_synsent_state_process + TCP_DELACK_MAX, TCP_RTO_MAX); + + discard: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + return 0; + } else { + tcp_send_ack(sk); +@@ -6041,7 +6048,7 @@ int tcp_rcv_state_process(struct sock *s + + if (!queued) { + discard: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + } + return 0; + } +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -1716,6 +1716,7 @@ discard_it: + return 0; + + discard_and_relse: ++ sk_drops_add(sk, skb); + sock_put(sk); + goto discard_it; + +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1505,6 +1505,7 @@ discard_it: + return 0; + + discard_and_relse: ++ sk_drops_add(sk, skb); + sock_put(sk); + goto discard_it; + diff --git a/queue-4.4/tcp-use-an-rb-tree-for-ooo-receive-queue.patch b/queue-4.4/tcp-use-an-rb-tree-for-ooo-receive-queue.patch new file mode 100644 index 00000000000..74ec66f2897 --- /dev/null +++ b/queue-4.4/tcp-use-an-rb-tree-for-ooo-receive-queue.patch @@ -0,0 +1,757 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:06 +0800 +Subject: tcp: use an RB tree for ooo receive queue +To: , , , , , , , +Message-ID: <1536913450-12380-3-git-send-email-maowenan@huawei.com> + +From: Yaogong Wang + +[ Upstream commit 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a ] + +Over the years, TCP BDP has increased by several orders of magnitude, +and some people are considering to 
reach the 2 Gbytes limit. + +Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 +MSS. + +In presence of packet losses (or reorders), TCP stores incoming packets +into an out of order queue, and number of skbs sitting there waiting for +the missing packets to be received can be in the 10^5 range. + +Most packets are appended to the tail of this queue, and when +packets can finally be transferred to receive queue, we scan the queue +from its head. + +However, in presence of heavy losses, we might have to find an arbitrary +point in this queue, involving a linear scan for every incoming packet, +throwing away cpu caches. + +This patch converts it to a RB tree, to get bounded latencies. + +Yaogong wrote a preliminary patch about 2 years ago. +Eric did the rebase, added ofo_last_skb cache, polishing and tests. + +Tested with network dropping between 1 and 10 % packets, with good +success (about 30 % increase of throughput in stress tests) + +Next step would be to also use an RB tree for the write queue at sender +side ;) + +Signed-off-by: Yaogong Wang +Signed-off-by: Eric Dumazet +Cc: Yuchung Cheng +Cc: Neal Cardwell +Cc: Ilpo Järvinen +Acked-By: Ilpo Järvinen +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/skbuff.h | 8 + + include/linux/tcp.h | 7 + include/net/tcp.h | 2 + net/core/skbuff.c | 19 ++ + net/ipv4/tcp.c | 4 + net/ipv4/tcp_input.c | 356 +++++++++++++++++++++++++++-------------------- + net/ipv4/tcp_ipv4.c | 2 + net/ipv4/tcp_minisocks.c | 1 + 8 files changed, 241 insertions(+), 158 deletions(-) + +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -2273,6 +2273,8 @@ static inline void __skb_queue_purge(str + kfree_skb(skb); + } + ++void skb_rbtree_purge(struct rb_root *root); ++ + void *netdev_alloc_frag(unsigned int fragsz); + + struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, +@@ -2807,6 +2809,12 @@ static inline int pskb_trim_rcsum(struct + return __pskb_trim(skb, len); + } + ++#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) ++#define skb_rb_first(root) rb_to_skb(rb_first(root)) ++#define skb_rb_last(root) rb_to_skb(rb_last(root)) ++#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) ++#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) ++ + #define skb_queue_walk(queue, skb) \ + for (skb = (queue)->next; \ + skb != (struct sk_buff *)(queue); \ +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -279,10 +279,9 @@ struct tcp_sock { + struct sk_buff* lost_skb_hint; + struct sk_buff *retransmit_skb_hint; + +- /* OOO segments go in this list. Note that socket lock must be held, +- * as we do not use sk_buff_head lock. +- */ +- struct sk_buff_head out_of_order_queue; ++ /* OOO segments go in this rbtree. Socket lock must be held. 
*/ ++ struct rb_root out_of_order_queue; ++ struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ + + /* SACKs data, these 2 need to be together (see tcp_options_write) */ + struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -649,7 +649,7 @@ static inline void tcp_fast_path_check(s + { + struct tcp_sock *tp = tcp_sk(sk); + +- if (skb_queue_empty(&tp->out_of_order_queue) && ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && + tp->rcv_wnd && + atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + !tp->urg_data) +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -2378,6 +2378,25 @@ void skb_queue_purge(struct sk_buff_head + EXPORT_SYMBOL(skb_queue_purge); + + /** ++ * skb_rbtree_purge - empty a skb rbtree ++ * @root: root of the rbtree to empty ++ * ++ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from ++ * the list and one reference dropped. This function does not take ++ * any lock. Synchronization should be handled by the caller (e.g., TCP ++ * out-of-order queue is protected by the socket lock). ++ */ ++void skb_rbtree_purge(struct rb_root *root) ++{ ++ struct sk_buff *skb, *next; ++ ++ rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) ++ kfree_skb(skb); ++ ++ *root = RB_ROOT; ++} ++ ++/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -382,7 +382,7 @@ void tcp_init_sock(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + +- __skb_queue_head_init(&tp->out_of_order_queue); ++ tp->out_of_order_queue = RB_ROOT; + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + INIT_LIST_HEAD(&tp->tsq_node); +@@ -2240,7 +2240,7 @@ int tcp_disconnect(struct sock *sk, int + tcp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + tcp_write_queue_purge(sk); +- __skb_queue_purge(&tp->out_of_order_queue); ++ skb_rbtree_purge(&tp->out_of_order_queue); + + inet->inet_dport = 0; + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4073,7 +4073,7 @@ static void tcp_fin(struct sock *sk) + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ +- __skb_queue_purge(&tp->out_of_order_queue); ++ skb_rbtree_purge(&tp->out_of_order_queue); + if (tcp_is_sack(tp)) + tcp_sack_reset(&tp->rx_opt); + sk_mem_reclaim(sk); +@@ -4233,7 +4233,7 @@ static void tcp_sack_remove(struct tcp_s + int this_sack; + + /* Empty ofo queue, hence, all the SACKs are eaten. Clear. 
*/ +- if (skb_queue_empty(&tp->out_of_order_queue)) { ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { + tp->rx_opt.num_sacks = 0; + return; + } +@@ -4309,10 +4309,13 @@ static void tcp_ofo_queue(struct sock *s + { + struct tcp_sock *tp = tcp_sk(sk); + __u32 dsack_high = tp->rcv_nxt; ++ bool fin, fragstolen, eaten; + struct sk_buff *skb, *tail; +- bool fragstolen, eaten; ++ struct rb_node *p; + +- while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { ++ p = rb_first(&tp->out_of_order_queue); ++ while (p) { ++ skb = rb_entry(p, struct sk_buff, rbnode); + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + +@@ -4322,9 +4325,10 @@ static void tcp_ofo_queue(struct sock *s + dsack_high = TCP_SKB_CB(skb)->end_seq; + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); + } ++ p = rb_next(p); ++ rb_erase(&skb->rbnode, &tp->out_of_order_queue); + +- __skb_unlink(skb, &tp->out_of_order_queue); +- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { ++ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { + SOCK_DEBUG(sk, "ofo packet was already received\n"); + tcp_drop(sk, skb); + continue; +@@ -4336,12 +4340,19 @@ static void tcp_ofo_queue(struct sock *s + tail = skb_peek_tail(&sk->sk_receive_queue); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); ++ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; + if (!eaten) + __skb_queue_tail(&sk->sk_receive_queue, skb); +- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) +- tcp_fin(sk); +- if (eaten) ++ else + kfree_skb_partial(skb, fragstolen); ++ ++ if (unlikely(fin)) { ++ tcp_fin(sk); ++ /* tcp_fin() purges tp->out_of_order_queue, ++ * so we must end this loop right now. ++ */ ++ break; ++ } + } + } + +@@ -4371,8 +4382,10 @@ static int tcp_try_rmem_schedule(struct + static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct rb_node **p, *q, *parent; + struct sk_buff *skb1; + u32 seq, end_seq; ++ bool fragstolen; + + tcp_ecn_check_ce(sk, skb); + +@@ -4387,89 +4400,86 @@ static void tcp_data_queue_ofo(struct so + inet_csk_schedule_ack(sk); + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); ++ seq = TCP_SKB_CB(skb)->seq; ++ end_seq = TCP_SKB_CB(skb)->end_seq; + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", +- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); ++ tp->rcv_nxt, seq, end_seq); + +- skb1 = skb_peek_tail(&tp->out_of_order_queue); +- if (!skb1) { ++ p = &tp->out_of_order_queue.rb_node; ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { + /* Initial out of order segment, build 1 SACK. 
*/ + if (tcp_is_sack(tp)) { + tp->rx_opt.num_sacks = 1; +- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; +- tp->selective_acks[0].end_seq = +- TCP_SKB_CB(skb)->end_seq; ++ tp->selective_acks[0].start_seq = seq; ++ tp->selective_acks[0].end_seq = end_seq; + } +- __skb_queue_head(&tp->out_of_order_queue, skb); ++ rb_link_node(&skb->rbnode, NULL, p); ++ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); ++ tp->ooo_last_skb = skb; + goto end; + } + +- seq = TCP_SKB_CB(skb)->seq; +- end_seq = TCP_SKB_CB(skb)->end_seq; +- +- if (seq == TCP_SKB_CB(skb1)->end_seq) { +- bool fragstolen; +- +- if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { +- __skb_queue_after(&tp->out_of_order_queue, skb1, skb); +- } else { +- tcp_grow_window(sk, skb); +- kfree_skb_partial(skb, fragstolen); +- skb = NULL; +- } +- +- if (!tp->rx_opt.num_sacks || +- tp->selective_acks[0].end_seq != seq) +- goto add_sack; +- +- /* Common case: data arrive in order after hole. */ +- tp->selective_acks[0].end_seq = end_seq; +- goto end; ++ /* In the typical case, we are adding an skb to the end of the list. ++ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. ++ */ ++ if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { ++coalesce_done: ++ tcp_grow_window(sk, skb); ++ kfree_skb_partial(skb, fragstolen); ++ skb = NULL; ++ goto add_sack; + } + +- /* Find place to insert this segment. */ +- while (1) { +- if (!after(TCP_SKB_CB(skb1)->seq, seq)) +- break; +- if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { +- skb1 = NULL; +- break; ++ /* Find place to insert this segment. Handle overlaps on the way. */ ++ parent = NULL; ++ while (*p) { ++ parent = *p; ++ skb1 = rb_entry(parent, struct sk_buff, rbnode); ++ if (before(seq, TCP_SKB_CB(skb1)->seq)) { ++ p = &parent->rb_left; ++ continue; + } +- skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); +- } + +- /* Do skb overlap to previous one? */ +- if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { +- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { +- /* All the bits are present. Drop. */ +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); +- tcp_drop(sk, skb); +- skb = NULL; +- tcp_dsack_set(sk, seq, end_seq); +- goto add_sack; +- } +- if (after(seq, TCP_SKB_CB(skb1)->seq)) { +- /* Partial overlap. */ +- tcp_dsack_set(sk, seq, +- TCP_SKB_CB(skb1)->end_seq); +- } else { +- if (skb_queue_is_first(&tp->out_of_order_queue, +- skb1)) +- skb1 = NULL; +- else +- skb1 = skb_queue_prev( +- &tp->out_of_order_queue, +- skb1); ++ if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { ++ /* All the bits are present. Drop. */ ++ NET_INC_STATS(sock_net(sk), ++ LINUX_MIB_TCPOFOMERGE); ++ __kfree_skb(skb); ++ skb = NULL; ++ tcp_dsack_set(sk, seq, end_seq); ++ goto add_sack; ++ } ++ if (after(seq, TCP_SKB_CB(skb1)->seq)) { ++ /* Partial overlap. */ ++ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); ++ } else { ++ /* skb's seq == skb1's seq and skb covers skb1. ++ * Replace skb1 with skb. 
++ */ ++ rb_replace_node(&skb1->rbnode, &skb->rbnode, ++ &tp->out_of_order_queue); ++ tcp_dsack_extend(sk, ++ TCP_SKB_CB(skb1)->seq, ++ TCP_SKB_CB(skb1)->end_seq); ++ NET_INC_STATS(sock_net(sk), ++ LINUX_MIB_TCPOFOMERGE); ++ __kfree_skb(skb1); ++ goto add_sack; ++ } ++ } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { ++ goto coalesce_done; + } ++ p = &parent->rb_right; + } +- if (!skb1) +- __skb_queue_head(&tp->out_of_order_queue, skb); +- else +- __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + +- /* And clean segments covered by new one as whole. */ +- while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { +- skb1 = skb_queue_next(&tp->out_of_order_queue, skb); ++ /* Insert segment into RB tree. */ ++ rb_link_node(&skb->rbnode, parent, p); ++ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + ++ /* Remove other segments covered by skb. */ ++ while ((q = rb_next(&skb->rbnode)) != NULL) { ++ skb1 = rb_entry(q, struct sk_buff, rbnode); + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { +@@ -4477,12 +4487,15 @@ static void tcp_data_queue_ofo(struct so + end_seq); + break; + } +- __skb_unlink(skb1, &tp->out_of_order_queue); ++ rb_erase(&skb1->rbnode, &tp->out_of_order_queue); + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); + tcp_drop(sk, skb1); + } ++ /* If there is no skb after us, we are the last_skb ! */ ++ if (!q) ++ tp->ooo_last_skb = skb; + + add_sack: + if (tcp_is_sack(tp)) +@@ -4621,13 +4634,13 @@ queue_and_out: + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_fin(sk); + +- if (!skb_queue_empty(&tp->out_of_order_queue)) { ++ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { + tcp_ofo_queue(sk); + + /* RFC2581. 4.2. SHOULD send immediate ACK, when + * gap in queue is filled. + */ +- if (skb_queue_empty(&tp->out_of_order_queue)) ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) + inet_csk(sk)->icsk_ack.pingpong = 0; + } + +@@ -4679,48 +4692,76 @@ drop: + tcp_data_queue_ofo(sk, skb); + } + ++static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) ++{ ++ if (list) ++ return !skb_queue_is_last(list, skb) ? skb->next : NULL; ++ ++ return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); ++} ++ + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, +- struct sk_buff_head *list) ++ struct sk_buff_head *list, ++ struct rb_root *root) + { +- struct sk_buff *next = NULL; ++ struct sk_buff *next = tcp_skb_next(skb, list); + +- if (!skb_queue_is_last(list, skb)) +- next = skb_queue_next(list, skb); ++ if (list) ++ __skb_unlink(skb, list); ++ else ++ rb_erase(&skb->rbnode, root); + +- __skb_unlink(skb, list); + __kfree_skb(skb); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); + + return next; + } + ++/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ ++static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) ++{ ++ struct rb_node **p = &root->rb_node; ++ struct rb_node *parent = NULL; ++ struct sk_buff *skb1; ++ ++ while (*p) { ++ parent = *p; ++ skb1 = rb_entry(parent, struct sk_buff, rbnode); ++ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) ++ p = &parent->rb_left; ++ else ++ p = &parent->rb_right; ++ } ++ rb_link_node(&skb->rbnode, parent, p); ++ rb_insert_color(&skb->rbnode, root); ++} ++ + /* Collapse contiguous sequence of skbs head..tail with + * sequence numbers start..end. 
+ * +- * If tail is NULL, this means until the end of the list. ++ * If tail is NULL, this means until the end of the queue. + * + * Segments with FIN/SYN are not collapsed (only because this + * simplifies code) + */ + static void +-tcp_collapse(struct sock *sk, struct sk_buff_head *list, +- struct sk_buff *head, struct sk_buff *tail, +- u32 start, u32 end) ++tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, ++ struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) + { +- struct sk_buff *skb, *n; ++ struct sk_buff *skb = head, *n; ++ struct sk_buff_head tmp; + bool end_of_skbs; + + /* First, check that queue is collapsible and find +- * the point where collapsing can be useful. */ +- skb = head; ++ * the point where collapsing can be useful. ++ */ + restart: +- end_of_skbs = true; +- skb_queue_walk_from_safe(list, skb, n) { +- if (skb == tail) +- break; ++ for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { ++ n = tcp_skb_next(skb, list); ++ + /* No new bits? It is possible on ofo queue. */ + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { +- skb = tcp_collapse_one(sk, skb, list); ++ skb = tcp_collapse_one(sk, skb, list, root); + if (!skb) + break; + goto restart; +@@ -4738,13 +4779,10 @@ restart: + break; + } + +- if (!skb_queue_is_last(list, skb)) { +- struct sk_buff *next = skb_queue_next(list, skb); +- if (next != tail && +- TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { +- end_of_skbs = false; +- break; +- } ++ if (n && n != tail && ++ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { ++ end_of_skbs = false; ++ break; + } + + /* Decided to skip this, advance start seq. */ +@@ -4754,17 +4792,22 @@ restart: + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) + return; + ++ __skb_queue_head_init(&tmp); ++ + while (before(start, end)) { + int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); + struct sk_buff *nskb; + + nskb = alloc_skb(copy, GFP_ATOMIC); + if (!nskb) +- return; ++ break; + + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; +- __skb_queue_before(list, skb, nskb); ++ if (list) ++ __skb_queue_before(list, skb, nskb); ++ else ++ __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ + skb_set_owner_r(nskb, sk); + + /* Copy data, releasing collapsed skbs. */ +@@ -4782,14 +4825,17 @@ restart: + start += size; + } + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { +- skb = tcp_collapse_one(sk, skb, list); ++ skb = tcp_collapse_one(sk, skb, list, root); + if (!skb || + skb == tail || + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) +- return; ++ goto end; + } + } + } ++end: ++ skb_queue_walk_safe(&tmp, skb, n) ++ tcp_rbtree_insert(root, skb); + } + + /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs +@@ -4799,34 +4845,39 @@ static void tcp_collapse_ofo_queue(struc + { + struct tcp_sock *tp = tcp_sk(sk); + u32 range_truesize, sum_tiny = 0; +- struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); +- struct sk_buff *head; ++ struct sk_buff *skb, *head; ++ struct rb_node *p; + u32 start, end; + +- if (!skb) ++ p = rb_first(&tp->out_of_order_queue); ++ skb = rb_entry_safe(p, struct sk_buff, rbnode); ++new_range: ++ if (!skb) { ++ p = rb_last(&tp->out_of_order_queue); ++ /* Note: This is possible p is NULL here. We do not ++ * use rb_entry_safe(), as ooo_last_skb is valid only ++ * if rbtree is not empty. 
++ */ ++ tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + return; +- ++ } + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + range_truesize = skb->truesize; +- head = skb; +- +- for (;;) { +- struct sk_buff *next = NULL; + +- if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) +- next = skb_queue_next(&tp->out_of_order_queue, skb); +- skb = next; ++ for (head = skb;;) { ++ skb = tcp_skb_next(skb, NULL); + +- /* Segment is terminated when we see gap or when +- * we are at the end of all the queue. */ ++ /* Range is terminated when we see a gap or when ++ * we are at the queue end. ++ */ + if (!skb || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { + /* Do not attempt collapsing tiny skbs */ + if (range_truesize != head->truesize || + end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { +- tcp_collapse(sk, &tp->out_of_order_queue, ++ tcp_collapse(sk, NULL, &tp->out_of_order_queue, + head, skb, start, end); + } else { + sum_tiny += range_truesize; +@@ -4834,20 +4885,14 @@ static void tcp_collapse_ofo_queue(struc + return; + } + +- head = skb; +- if (!skb) +- break; +- /* Start new segment */ ++ goto new_range; ++ } ++ ++ range_truesize += skb->truesize; ++ if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) + start = TCP_SKB_CB(skb)->seq; ++ if (after(TCP_SKB_CB(skb)->end_seq, end)) + end = TCP_SKB_CB(skb)->end_seq; +- range_truesize = skb->truesize; +- } else { +- range_truesize += skb->truesize; +- if (before(TCP_SKB_CB(skb)->seq, start)) +- start = TCP_SKB_CB(skb)->seq; +- if (after(TCP_SKB_CB(skb)->end_seq, end)) +- end = TCP_SKB_CB(skb)->end_seq; +- } + } + } + +@@ -4858,23 +4903,36 @@ static void tcp_collapse_ofo_queue(struc + static bool tcp_prune_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- bool res = false; ++ struct rb_node *node, *prev; + +- if (!skb_queue_empty(&tp->out_of_order_queue)) { +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); +- __skb_queue_purge(&tp->out_of_order_queue); +- +- /* Reset SACK state. A conforming SACK implementation will +- * do the same at a timeout based retransmit. When a connection +- * is in a sad state like this, we care only about integrity +- * of the connection not performance. +- */ +- if (tp->rx_opt.sack_ok) +- tcp_sack_reset(&tp->rx_opt); ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) ++ return false; ++ ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); ++ ++ node = &tp->ooo_last_skb->rbnode; ++ do { ++ prev = rb_prev(node); ++ rb_erase(node, &tp->out_of_order_queue); ++ __kfree_skb(rb_to_skb(node)); + sk_mem_reclaim(sk); +- res = true; +- } +- return res; ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && ++ !tcp_under_memory_pressure(sk)) ++ break; ++ ++ node = prev; ++ } while (node); ++ tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); ++ ++ /* Reset SACK state. A conforming SACK implementation will ++ * do the same at a timeout based retransmit. When a connection ++ * is in a sad state like this, we care only about integrity ++ * of the connection not performance. 
++ */ ++ if (tp->rx_opt.sack_ok) ++ tcp_sack_reset(&tp->rx_opt); ++ ++ return true; + } + + /* Reduce allocated memory if we can, trying to get +@@ -4902,7 +4960,7 @@ static int tcp_prune_queue(struct sock * + + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) +- tcp_collapse(sk, &sk->sk_receive_queue, ++ tcp_collapse(sk, &sk->sk_receive_queue, NULL, + skb_peek(&sk->sk_receive_queue), + NULL, + tp->copied_seq, tp->rcv_nxt); +@@ -5007,7 +5065,7 @@ static void __tcp_ack_snd_check(struct s + /* We ACK each frame or... */ + tcp_in_quickack_mode(sk) || + /* We have out of order data. */ +- (ofo_possible && skb_peek(&tp->out_of_order_queue))) { ++ (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { + /* Then ack it now */ + tcp_send_ack(sk); + } else { +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -1830,7 +1830,7 @@ void tcp_v4_destroy_sock(struct sock *sk + tcp_write_queue_purge(sk); + + /* Cleans up our, hopefully empty, out_of_order_queue. */ +- __skb_queue_purge(&tp->out_of_order_queue); ++ skb_rbtree_purge(&tp->out_of_order_queue); + + #ifdef CONFIG_TCP_MD5SIG + /* Clean up the MD5 key list, if any */ +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -496,7 +496,6 @@ struct sock *tcp_create_openreq_child(co + newtp->snd_cwnd_cnt = 0; + + tcp_init_xmit_timers(newsk); +- __skb_queue_head_init(&newtp->out_of_order_queue); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; -- 2.47.2
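
As a rough, self-contained illustration of the core idea in the "tcp: use an RB tree for ooo receive queue" patch above, the sketch below models an out-of-order queue ordered by starting sequence number, with a cached right-most entry so the common in-order append skips the tree walk. This is a hypothetical userspace model, not kernel code: a plain unbalanced binary search tree stands in for the kernel rb-tree, a small struct seg stands in for sk_buff, the names seg, ooo_insert and ooo_last are invented for the example, and plain integer comparisons stand in for the kernel's wraparound-safe before()/after() helpers.

/*
 * Minimal userspace sketch of an out-of-order queue keyed by sequence
 * number.  An unbalanced BST stands in for the kernel rb-tree; the
 * cached right-most node plays the role of tp->ooo_last_skb.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct seg {
	uint32_t seq;		/* start sequence number */
	uint32_t end_seq;	/* one past the last byte */
	struct seg *left, *right;
};

static struct seg *ooo_root;	/* models tp->out_of_order_queue */
static struct seg *ooo_last;	/* models tp->ooo_last_skb (right-most node) */

static struct seg *seg_new(uint32_t seq, uint32_t end_seq)
{
	struct seg *s = calloc(1, sizeof(*s));

	s->seq = seq;
	s->end_seq = end_seq;
	return s;
}

/* Insert ordered by seq; loosely mirrors the shape of tcp_rbtree_insert(). */
static void ooo_insert(struct seg *s)
{
	struct seg **p = &ooo_root, *parent = NULL;

	/* Fast path: segment lands after the cached right-most node,
	 * the typical case when only a single hole is outstanding.
	 */
	if (ooo_last && s->seq >= ooo_last->end_seq) {
		ooo_last->right = s;
		ooo_last = s;
		return;
	}
	/* Slow path: O(log N) descent (O(N) worst case in this toy BST). */
	while (*p) {
		parent = *p;
		if (s->seq < parent->seq)
			p = &parent->left;
		else
			p = &parent->right;
	}
	*p = s;
	if (!ooo_last || s->seq >= ooo_last->seq)
		ooo_last = s;	/* new right-most entry */
}

static void ooo_walk(struct seg *s)	/* in-order dump, lowest seq first */
{
	if (!s)
		return;
	ooo_walk(s->left);
	printf("  %u:%u\n", (unsigned)s->seq, (unsigned)s->end_seq);
	ooo_walk(s->right);
}

int main(void)
{
	/* Out-of-order arrival pattern similar to the packetdrill example. */
	ooo_insert(seg_new(4001, 5001));
	ooo_insert(seg_new(1001, 2001));
	ooo_insert(seg_new(6001, 7001));	/* hits the ooo_last fast path */

	printf("out_of_order_queue (ordered by seq):\n");
	ooo_walk(ooo_root);
	printf("ooo_last = %u:%u\n",
	       (unsigned)ooo_last->seq, (unsigned)ooo_last->end_seq);
	return 0;
}

The cached right-most pointer is also what the later patches in the series lean on: tcp_prune_ofo_queue() starts at tp->ooo_last_skb and walks backwards with rb_prev(), and the batching patch frees roughly sk_rcvbuf >> 3 bytes worth of segments between sk_mem_reclaim() calls instead of reclaiming after every skb.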