--- /dev/null
+From foo@baz Thu Oct 11 16:06:02 CEST 2018
+From: Mao Wenan <maowenan@huawei.com>
+Date: Fri, 14 Sep 2018 16:24:06 +0800
+Subject: tcp: use an RB tree for ooo receive queue
+To: <netdev@vger.kernel.org>, <gregkh@linux-foundation.org>, <dwmw2@infradead.org>, <eric.dumazet@gmail.com>, <davem@davemloft.net>, <stable@vger.kernel.org>, <linux-kernel@vger.kernel.org>, <maowenan@huawei.com>
+Message-ID: <1536913450-12380-3-git-send-email-maowenan@huawei.com>
+
+From: Yaogong Wang <wygivan@google.com>
+
+[ Upstream commit 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a ]
+
+Over the years, TCP BDP has increased by several orders of magnitude,
+and some people are considering reaching the 2 GB limit.
+
+Even with the current window scale limit of 14, ~1 GB maps to ~740,000
+MSS.
+
+In the presence of packet losses (or reorders), TCP stores incoming
+packets in an out-of-order queue, and the number of skbs sitting there
+waiting for the missing packets to be received can be in the 10^5 range.
+
+Most packets are appended to the tail of this queue, and when packets
+can finally be transferred to the receive queue, we scan the queue from
+its head.
+
+However, in the presence of heavy losses, we might have to find an
+arbitrary point in this queue, involving a linear scan for every
+incoming packet and thrashing CPU caches.
+
+This patch converts it to an RB tree, to get bounded latencies.
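+
+As a quick illustration of the data-structure change, here is a minimal
+user-space sketch (hypothetical names; a plain unbalanced BST standing in
+for the kernel's rb_root/rb_node, so no rebalancing) of the invariant the
+new queue relies on: segments are keyed by their start sequence, so
+insertion costs O(log N) and an in-order walk yields them in sequence
+order, which is what tcp_ofo_queue() needs once the hole is filled:
+
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  struct ooo_seg {                  /* hypothetical stand-in for sk_buff */
+          uint32_t seq, end_seq;
+          struct ooo_seg *left, *right;
+  };
+
+  /* Serial-number compare, same idea as the kernel's before(). */
+  static int seq_before(uint32_t a, uint32_t b)
+  {
+          return (int32_t)(a - b) < 0;
+  }
+
+  /* Insert keyed by start sequence, as tcp_rbtree_insert() does
+   * (minus the rebalancing the real rb-tree provides).
+   */
+  static struct ooo_seg *ooo_insert(struct ooo_seg *root, struct ooo_seg *s)
+  {
+          if (!root)
+                  return s;
+          if (seq_before(s->seq, root->seq))
+                  root->left = ooo_insert(root->left, s);
+          else
+                  root->right = ooo_insert(root->right, s);
+          return root;
+  }
+
+  /* In-order walk == segments in ascending sequence order. */
+  static void ooo_walk(const struct ooo_seg *s)
+  {
+          if (!s)
+                  return;
+          ooo_walk(s->left);
+          printf("seg %u-%u\n", (unsigned)s->seq, (unsigned)s->end_seq);
+          ooo_walk(s->right);
+  }
+
+  int main(void)
+  {
+          const uint32_t segs[][2] = {
+                  { 3000, 4000 }, { 1000, 2000 }, { 2000, 3000 },
+          };
+          struct ooo_seg *root = NULL;
+          int i;
+
+          for (i = 0; i < 3; i++) {
+                  struct ooo_seg *s = calloc(1, sizeof(*s));
+
+                  if (!s)
+                          return 1;
+                  s->seq = segs[i][0];
+                  s->end_seq = segs[i][1];
+                  root = ooo_insert(root, s);
+          }
+          ooo_walk(root);  /* prints 1000-2000, 2000-3000, 3000-4000 */
+          return 0;
+  }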
+
+Yaogong wrote a preliminary patch about 2 years ago.
+Eric did the rebase, added the ooo_last_skb cache, and did the polishing
+and tests.
+
+Tested with the network dropping between 1 and 10 % of packets, with
+good success (about a 30 % increase in throughput in stress tests).
+
+The next step would be to also use an RB tree for the write queue on the
+sender side ;)
+
+Signed-off-by: Yaogong Wang <wygivan@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Yuchung Cheng <ycheng@google.com>
+Cc: Neal Cardwell <ncardwell@google.com>
+Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
+Acked-By: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/skbuff.h | 8 +
+ include/linux/tcp.h | 7
+ include/net/tcp.h | 2
+ net/core/skbuff.c | 19 ++
+ net/ipv4/tcp.c | 4
+ net/ipv4/tcp_input.c | 356 +++++++++++++++++++++++++++--------------------
+ net/ipv4/tcp_ipv4.c | 2
+ net/ipv4/tcp_minisocks.c | 1
+ 8 files changed, 241 insertions(+), 158 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2273,6 +2273,8 @@ static inline void __skb_queue_purge(str
+ kfree_skb(skb);
+ }
+
++void skb_rbtree_purge(struct rb_root *root);
++
+ void *netdev_alloc_frag(unsigned int fragsz);
+
+ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
+@@ -2807,6 +2809,12 @@ static inline int pskb_trim_rcsum(struct
+ return __pskb_trim(skb, len);
+ }
+
++#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
++#define skb_rb_first(root) rb_to_skb(rb_first(root))
++#define skb_rb_last(root) rb_to_skb(rb_last(root))
++#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode))
++#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode))
++
+ #define skb_queue_walk(queue, skb) \
+ for (skb = (queue)->next; \
+ skb != (struct sk_buff *)(queue); \
+--- a/include/linux/tcp.h
++++ b/include/linux/tcp.h
+@@ -279,10 +279,9 @@ struct tcp_sock {
+ struct sk_buff* lost_skb_hint;
+ struct sk_buff *retransmit_skb_hint;
+
+- /* OOO segments go in this list. Note that socket lock must be held,
+- * as we do not use sk_buff_head lock.
+- */
+- struct sk_buff_head out_of_order_queue;
++ /* OOO segments go in this rbtree. Socket lock must be held. */
++ struct rb_root out_of_order_queue;
++ struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */
+
+ /* SACKs data, these 2 need to be together (see tcp_options_write) */
+ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -649,7 +649,7 @@ static inline void tcp_fast_path_check(s
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+- if (skb_queue_empty(&tp->out_of_order_queue) &&
++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
+ tp->rcv_wnd &&
+ atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+ !tp->urg_data)
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -2378,6 +2378,25 @@ void skb_queue_purge(struct sk_buff_head
+ EXPORT_SYMBOL(skb_queue_purge);
+
+ /**
++ * skb_rbtree_purge - empty a skb rbtree
++ * @root: root of the rbtree to empty
++ *
++ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
++ * the list and one reference dropped. This function does not take
++ * any lock. Synchronization should be handled by the caller (e.g., TCP
++ * out-of-order queue is protected by the socket lock).
++ */
++void skb_rbtree_purge(struct rb_root *root)
++{
++ struct sk_buff *skb, *next;
++
++ rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
++ kfree_skb(skb);
++
++ *root = RB_ROOT;
++}
++
++/**
+ * skb_queue_head - queue a buffer at the list head
+ * @list: list to use
+ * @newsk: buffer to queue
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -382,7 +382,7 @@ void tcp_init_sock(struct sock *sk)
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+- __skb_queue_head_init(&tp->out_of_order_queue);
++ tp->out_of_order_queue = RB_ROOT;
+ tcp_init_xmit_timers(sk);
+ tcp_prequeue_init(tp);
+ INIT_LIST_HEAD(&tp->tsq_node);
+@@ -2240,7 +2240,7 @@ int tcp_disconnect(struct sock *sk, int
+ tcp_clear_xmit_timers(sk);
+ __skb_queue_purge(&sk->sk_receive_queue);
+ tcp_write_queue_purge(sk);
+- __skb_queue_purge(&tp->out_of_order_queue);
++ skb_rbtree_purge(&tp->out_of_order_queue);
+
+ inet->inet_dport = 0;
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4073,7 +4073,7 @@ static void tcp_fin(struct sock *sk)
+ /* It _is_ possible, that we have something out-of-order _after_ FIN.
+ * Probably, we should reset in this case. For now drop them.
+ */
+- __skb_queue_purge(&tp->out_of_order_queue);
++ skb_rbtree_purge(&tp->out_of_order_queue);
+ if (tcp_is_sack(tp))
+ tcp_sack_reset(&tp->rx_opt);
+ sk_mem_reclaim(sk);
+@@ -4233,7 +4233,7 @@ static void tcp_sack_remove(struct tcp_s
+ int this_sack;
+
+ /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
+- if (skb_queue_empty(&tp->out_of_order_queue)) {
++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+ tp->rx_opt.num_sacks = 0;
+ return;
+ }
+@@ -4309,10 +4309,13 @@ static void tcp_ofo_queue(struct sock *s
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 dsack_high = tp->rcv_nxt;
++ bool fin, fragstolen, eaten;
+ struct sk_buff *skb, *tail;
+- bool fragstolen, eaten;
++ struct rb_node *p;
+
+- while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
++ p = rb_first(&tp->out_of_order_queue);
++ while (p) {
++ skb = rb_entry(p, struct sk_buff, rbnode);
+ if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ break;
+
+@@ -4322,9 +4325,10 @@ static void tcp_ofo_queue(struct sock *s
+ dsack_high = TCP_SKB_CB(skb)->end_seq;
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
+ }
++ p = rb_next(p);
++ rb_erase(&skb->rbnode, &tp->out_of_order_queue);
+
+- __skb_unlink(skb, &tp->out_of_order_queue);
+- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
++ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
+ SOCK_DEBUG(sk, "ofo packet was already received\n");
+ tcp_drop(sk, skb);
+ continue;
+@@ -4336,12 +4340,19 @@ static void tcp_ofo_queue(struct sock *s
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
++ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ if (!eaten)
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+- tcp_fin(sk);
+- if (eaten)
++ else
+ kfree_skb_partial(skb, fragstolen);
++
++ if (unlikely(fin)) {
++ tcp_fin(sk);
++ /* tcp_fin() purges tp->out_of_order_queue,
++ * so we must end this loop right now.
++ */
++ break;
++ }
+ }
+ }
+
+@@ -4371,8 +4382,10 @@ static int tcp_try_rmem_schedule(struct
+ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
++ struct rb_node **p, *q, *parent;
+ struct sk_buff *skb1;
+ u32 seq, end_seq;
++ bool fragstolen;
+
+ tcp_ecn_check_ce(sk, skb);
+
+@@ -4387,89 +4400,86 @@ static void tcp_data_queue_ofo(struct so
+ inet_csk_schedule_ack(sk);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
++ seq = TCP_SKB_CB(skb)->seq;
++ end_seq = TCP_SKB_CB(skb)->end_seq;
+ SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
++ tp->rcv_nxt, seq, end_seq);
+
+- skb1 = skb_peek_tail(&tp->out_of_order_queue);
+- if (!skb1) {
++ p = &tp->out_of_order_queue.rb_node;
++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+ /* Initial out of order segment, build 1 SACK. */
+ if (tcp_is_sack(tp)) {
+ tp->rx_opt.num_sacks = 1;
+- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+- tp->selective_acks[0].end_seq =
+- TCP_SKB_CB(skb)->end_seq;
++ tp->selective_acks[0].start_seq = seq;
++ tp->selective_acks[0].end_seq = end_seq;
+ }
+- __skb_queue_head(&tp->out_of_order_queue, skb);
++ rb_link_node(&skb->rbnode, NULL, p);
++ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
++ tp->ooo_last_skb = skb;
+ goto end;
+ }
+
+- seq = TCP_SKB_CB(skb)->seq;
+- end_seq = TCP_SKB_CB(skb)->end_seq;
+-
+- if (seq == TCP_SKB_CB(skb1)->end_seq) {
+- bool fragstolen;
+-
+- if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+- } else {
+- tcp_grow_window(sk, skb);
+- kfree_skb_partial(skb, fragstolen);
+- skb = NULL;
+- }
+-
+- if (!tp->rx_opt.num_sacks ||
+- tp->selective_acks[0].end_seq != seq)
+- goto add_sack;
+-
+- /* Common case: data arrive in order after hole. */
+- tp->selective_acks[0].end_seq = end_seq;
+- goto end;
++ /* In the typical case, we are adding an skb to the end of the list.
++ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
++ */
++ if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
++coalesce_done:
++ tcp_grow_window(sk, skb);
++ kfree_skb_partial(skb, fragstolen);
++ skb = NULL;
++ goto add_sack;
+ }
+
+- /* Find place to insert this segment. */
+- while (1) {
+- if (!after(TCP_SKB_CB(skb1)->seq, seq))
+- break;
+- if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+- skb1 = NULL;
+- break;
++ /* Find place to insert this segment. Handle overlaps on the way. */
++ parent = NULL;
++ while (*p) {
++ parent = *p;
++ skb1 = rb_entry(parent, struct sk_buff, rbnode);
++ if (before(seq, TCP_SKB_CB(skb1)->seq)) {
++ p = &parent->rb_left;
++ continue;
+ }
+- skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+- }
+
+- /* Do skb overlap to previous one? */
+- if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+- /* All the bits are present. Drop. */
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+- tcp_drop(sk, skb);
+- skb = NULL;
+- tcp_dsack_set(sk, seq, end_seq);
+- goto add_sack;
+- }
+- if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+- /* Partial overlap. */
+- tcp_dsack_set(sk, seq,
+- TCP_SKB_CB(skb1)->end_seq);
+- } else {
+- if (skb_queue_is_first(&tp->out_of_order_queue,
+- skb1))
+- skb1 = NULL;
+- else
+- skb1 = skb_queue_prev(
+- &tp->out_of_order_queue,
+- skb1);
++ if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
++ /* All the bits are present. Drop. */
++ NET_INC_STATS(sock_net(sk),
++ LINUX_MIB_TCPOFOMERGE);
++ __kfree_skb(skb);
++ skb = NULL;
++ tcp_dsack_set(sk, seq, end_seq);
++ goto add_sack;
++ }
++ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
++ /* Partial overlap. */
++ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
++ } else {
++ /* skb's seq == skb1's seq and skb covers skb1.
++ * Replace skb1 with skb.
++ */
++ rb_replace_node(&skb1->rbnode, &skb->rbnode,
++ &tp->out_of_order_queue);
++ tcp_dsack_extend(sk,
++ TCP_SKB_CB(skb1)->seq,
++ TCP_SKB_CB(skb1)->end_seq);
++ NET_INC_STATS(sock_net(sk),
++ LINUX_MIB_TCPOFOMERGE);
++ __kfree_skb(skb1);
++ goto add_sack;
++ }
++ } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
++ goto coalesce_done;
+ }
++ p = &parent->rb_right;
+ }
+- if (!skb1)
+- __skb_queue_head(&tp->out_of_order_queue, skb);
+- else
+- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+
+- /* And clean segments covered by new one as whole. */
+- while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
+- skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
++ /* Insert segment into RB tree. */
++ rb_link_node(&skb->rbnode, parent, p);
++ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+
++ /* Remove other segments covered by skb. */
++ while ((q = rb_next(&skb->rbnode)) != NULL) {
++ skb1 = rb_entry(q, struct sk_buff, rbnode);
+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+ break;
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+@@ -4477,12 +4487,15 @@ static void tcp_data_queue_ofo(struct so
+ end_seq);
+ break;
+ }
+- __skb_unlink(skb1, &tp->out_of_order_queue);
++ rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ tcp_drop(sk, skb1);
+ }
++ /* If there is no skb after us, we are the last_skb ! */
++ if (!q)
++ tp->ooo_last_skb = skb;
+
+ add_sack:
+ if (tcp_is_sack(tp))
+@@ -4621,13 +4634,13 @@ queue_and_out:
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
+
+- if (!skb_queue_empty(&tp->out_of_order_queue)) {
++ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+ tcp_ofo_queue(sk);
+
+ /* RFC2581. 4.2. SHOULD send immediate ACK, when
+ * gap in queue is filled.
+ */
+- if (skb_queue_empty(&tp->out_of_order_queue))
++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ inet_csk(sk)->icsk_ack.pingpong = 0;
+ }
+
+@@ -4679,48 +4692,76 @@ drop:
+ tcp_data_queue_ofo(sk, skb);
+ }
+
++static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
++{
++ if (list)
++ return !skb_queue_is_last(list, skb) ? skb->next : NULL;
++
++ return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
++}
++
+ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+- struct sk_buff_head *list)
++ struct sk_buff_head *list,
++ struct rb_root *root)
+ {
+- struct sk_buff *next = NULL;
++ struct sk_buff *next = tcp_skb_next(skb, list);
+
+- if (!skb_queue_is_last(list, skb))
+- next = skb_queue_next(list, skb);
++ if (list)
++ __skb_unlink(skb, list);
++ else
++ rb_erase(&skb->rbnode, root);
+
+- __skb_unlink(skb, list);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+
+ return next;
+ }
+
++/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
++static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
++{
++ struct rb_node **p = &root->rb_node;
++ struct rb_node *parent = NULL;
++ struct sk_buff *skb1;
++
++ while (*p) {
++ parent = *p;
++ skb1 = rb_entry(parent, struct sk_buff, rbnode);
++ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
++ p = &parent->rb_left;
++ else
++ p = &parent->rb_right;
++ }
++ rb_link_node(&skb->rbnode, parent, p);
++ rb_insert_color(&skb->rbnode, root);
++}
++
+ /* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.
+ *
+- * If tail is NULL, this means until the end of the list.
++ * If tail is NULL, this means until the end of the queue.
+ *
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies code)
+ */
+ static void
+-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+- struct sk_buff *head, struct sk_buff *tail,
+- u32 start, u32 end)
++tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
++ struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
+ {
+- struct sk_buff *skb, *n;
++ struct sk_buff *skb = head, *n;
++ struct sk_buff_head tmp;
+ bool end_of_skbs;
+
+ /* First, check that queue is collapsible and find
+- * the point where collapsing can be useful. */
+- skb = head;
++ * the point where collapsing can be useful.
++ */
+ restart:
+- end_of_skbs = true;
+- skb_queue_walk_from_safe(list, skb, n) {
+- if (skb == tail)
+- break;
++ for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
++ n = tcp_skb_next(skb, list);
++
+ /* No new bits? It is possible on ofo queue. */
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+- skb = tcp_collapse_one(sk, skb, list);
++ skb = tcp_collapse_one(sk, skb, list, root);
+ if (!skb)
+ break;
+ goto restart;
+@@ -4738,13 +4779,10 @@ restart:
+ break;
+ }
+
+- if (!skb_queue_is_last(list, skb)) {
+- struct sk_buff *next = skb_queue_next(list, skb);
+- if (next != tail &&
+- TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
+- end_of_skbs = false;
+- break;
+- }
++ if (n && n != tail &&
++ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
++ end_of_skbs = false;
++ break;
+ }
+
+ /* Decided to skip this, advance start seq. */
+@@ -4754,17 +4792,22 @@ restart:
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+ return;
+
++ __skb_queue_head_init(&tmp);
++
+ while (before(start, end)) {
+ int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
+ struct sk_buff *nskb;
+
+ nskb = alloc_skb(copy, GFP_ATOMIC);
+ if (!nskb)
+- return;
++ break;
+
+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
+- __skb_queue_before(list, skb, nskb);
++ if (list)
++ __skb_queue_before(list, skb, nskb);
++ else
++ __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
+ skb_set_owner_r(nskb, sk);
+
+ /* Copy data, releasing collapsed skbs. */
+@@ -4782,14 +4825,17 @@ restart:
+ start += size;
+ }
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+- skb = tcp_collapse_one(sk, skb, list);
++ skb = tcp_collapse_one(sk, skb, list, root);
+ if (!skb ||
+ skb == tail ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+- return;
++ goto end;
+ }
+ }
+ }
++end:
++ skb_queue_walk_safe(&tmp, skb, n)
++ tcp_rbtree_insert(root, skb);
+ }
+
+ /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
+@@ -4799,34 +4845,39 @@ static void tcp_collapse_ofo_queue(struc
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 range_truesize, sum_tiny = 0;
+- struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
+- struct sk_buff *head;
++ struct sk_buff *skb, *head;
++ struct rb_node *p;
+ u32 start, end;
+
+- if (!skb)
++ p = rb_first(&tp->out_of_order_queue);
++ skb = rb_entry_safe(p, struct sk_buff, rbnode);
++new_range:
++ if (!skb) {
++ p = rb_last(&tp->out_of_order_queue);
++ /* Note: It is possible that p is NULL here. We do not
++ * use rb_entry_safe(), as ooo_last_skb is valid only
++ * if rbtree is not empty.
++ */
++ tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+ return;
+-
++ }
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
+ range_truesize = skb->truesize;
+- head = skb;
+-
+- for (;;) {
+- struct sk_buff *next = NULL;
+
+- if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
+- next = skb_queue_next(&tp->out_of_order_queue, skb);
+- skb = next;
++ for (head = skb;;) {
++ skb = tcp_skb_next(skb, NULL);
+
+- /* Segment is terminated when we see gap or when
+- * we are at the end of all the queue. */
++ /* Range is terminated when we see a gap or when
++ * we are at the queue end.
++ */
+ if (!skb ||
+ after(TCP_SKB_CB(skb)->seq, end) ||
+ before(TCP_SKB_CB(skb)->end_seq, start)) {
+ /* Do not attempt collapsing tiny skbs */
+ if (range_truesize != head->truesize ||
+ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+- tcp_collapse(sk, &tp->out_of_order_queue,
++ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+ head, skb, start, end);
+ } else {
+ sum_tiny += range_truesize;
+@@ -4834,20 +4885,14 @@ static void tcp_collapse_ofo_queue(struc
+ return;
+ }
+
+- head = skb;
+- if (!skb)
+- break;
+- /* Start new segment */
++ goto new_range;
++ }
++
++ range_truesize += skb->truesize;
++ if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
+ start = TCP_SKB_CB(skb)->seq;
++ if (after(TCP_SKB_CB(skb)->end_seq, end))
+ end = TCP_SKB_CB(skb)->end_seq;
+- range_truesize = skb->truesize;
+- } else {
+- range_truesize += skb->truesize;
+- if (before(TCP_SKB_CB(skb)->seq, start))
+- start = TCP_SKB_CB(skb)->seq;
+- if (after(TCP_SKB_CB(skb)->end_seq, end))
+- end = TCP_SKB_CB(skb)->end_seq;
+- }
+ }
+ }
+
+@@ -4858,23 +4903,36 @@ static void tcp_collapse_ofo_queue(struc
+ static bool tcp_prune_ofo_queue(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+- bool res = false;
++ struct rb_node *node, *prev;
+
+- if (!skb_queue_empty(&tp->out_of_order_queue)) {
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
+- __skb_queue_purge(&tp->out_of_order_queue);
+-
+- /* Reset SACK state. A conforming SACK implementation will
+- * do the same at a timeout based retransmit. When a connection
+- * is in a sad state like this, we care only about integrity
+- * of the connection not performance.
+- */
+- if (tp->rx_opt.sack_ok)
+- tcp_sack_reset(&tp->rx_opt);
++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
++ return false;
++
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
++
++ node = &tp->ooo_last_skb->rbnode;
++ do {
++ prev = rb_prev(node);
++ rb_erase(node, &tp->out_of_order_queue);
++ __kfree_skb(rb_to_skb(node));
+ sk_mem_reclaim(sk);
+- res = true;
+- }
+- return res;
++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
++ !tcp_under_memory_pressure(sk))
++ break;
++
++ node = prev;
++ } while (node);
++ tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
++
++ /* Reset SACK state. A conforming SACK implementation will
++ * do the same at a timeout based retransmit. When a connection
++ * is in a sad state like this, we care only about integrity
++ * of the connection not performance.
++ */
++ if (tp->rx_opt.sack_ok)
++ tcp_sack_reset(&tp->rx_opt);
++
++ return true;
+ }
+
+ /* Reduce allocated memory if we can, trying to get
+@@ -4902,7 +4960,7 @@ static int tcp_prune_queue(struct sock *
+
+ tcp_collapse_ofo_queue(sk);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+- tcp_collapse(sk, &sk->sk_receive_queue,
++ tcp_collapse(sk, &sk->sk_receive_queue, NULL,
+ skb_peek(&sk->sk_receive_queue),
+ NULL,
+ tp->copied_seq, tp->rcv_nxt);
+@@ -5007,7 +5065,7 @@ static void __tcp_ack_snd_check(struct s
+ /* We ACK each frame or... */
+ tcp_in_quickack_mode(sk) ||
+ /* We have out of order data. */
+- (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
++ (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
+ /* Then ack it now */
+ tcp_send_ack(sk);
+ } else {
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -1830,7 +1830,7 @@ void tcp_v4_destroy_sock(struct sock *sk
+ tcp_write_queue_purge(sk);
+
+ /* Cleans up our, hopefully empty, out_of_order_queue. */
+- __skb_queue_purge(&tp->out_of_order_queue);
++ skb_rbtree_purge(&tp->out_of_order_queue);
+
+ #ifdef CONFIG_TCP_MD5SIG
+ /* Clean up the MD5 key list, if any */
+--- a/net/ipv4/tcp_minisocks.c
++++ b/net/ipv4/tcp_minisocks.c
+@@ -496,7 +496,6 @@ struct sock *tcp_create_openreq_child(co
+ newtp->snd_cwnd_cnt = 0;
+
+ tcp_init_xmit_timers(newsk);
+- __skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
+
+ newtp->rx_opt.saw_tstamp = 0;