From 055cbdd693042d5600e6025cd368302fba6ab814 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 11 Oct 2018 16:07:14 +0200 Subject: [PATCH] 4.4-stable patches added patches: tcp-add-tcp_ooo_try_coalesce-helper.patch tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch tcp-increment-sk_drops-for-dropped-rx-packets.patch tcp-use-an-rb-tree-for-ooo-receive-queue.patch --- queue-4.4/series | 6 + .../tcp-add-tcp_ooo_try_coalesce-helper.patch | 75 ++ ...all-tcp_drop-from-tcp_data_queue_ofo.patch | 45 ++ ...a-stale-ooo_last_skb-after-a-replace.patch | 76 ++ ...es-of-packets-in-tcp_prune_ofo_queue.patch | 79 ++ ...ment-sk_drops-for-dropped-rx-packets.patch | 178 ++++ ...use-an-rb-tree-for-ooo-receive-queue.patch | 757 ++++++++++++++++++ 7 files changed, 1216 insertions(+) create mode 100644 queue-4.4/tcp-add-tcp_ooo_try_coalesce-helper.patch create mode 100644 queue-4.4/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch create mode 100644 queue-4.4/tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch create mode 100644 queue-4.4/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch create mode 100644 queue-4.4/tcp-increment-sk_drops-for-dropped-rx-packets.patch create mode 100644 queue-4.4/tcp-use-an-rb-tree-for-ooo-receive-queue.patch diff --git a/queue-4.4/series b/queue-4.4/series index 960f761dd15..20189986b39 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -17,3 +17,9 @@ powerpc-fadump-return-error-when-fadump-registration-fails.patch arc-clone-syscall-to-setp-r25-as-thread-pointer.patch ucma-fix-a-use-after-free-in-ucma_resolve_ip.patch ubifs-check-for-name-being-null-while-mounting.patch +tcp-increment-sk_drops-for-dropped-rx-packets.patch +tcp-use-an-rb-tree-for-ooo-receive-queue.patch +tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch +tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch +tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch +tcp-add-tcp_ooo_try_coalesce-helper.patch diff --git a/queue-4.4/tcp-add-tcp_ooo_try_coalesce-helper.patch b/queue-4.4/tcp-add-tcp_ooo_try_coalesce-helper.patch new file mode 100644 index 00000000000..6105e769cc0 --- /dev/null +++ b/queue-4.4/tcp-add-tcp_ooo_try_coalesce-helper.patch @@ -0,0 +1,75 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:10 +0800 +Subject: tcp: add tcp_ooo_try_coalesce() helper +To: , , , , , , , +Message-ID: <1536913450-12380-7-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c ] + +In case skb in out_or_order_queue is the result of +multiple skbs coalescing, we would like to get a proper gso_segs +counter tracking, so that future tcp_drop() can report an accurate +number. + +I chose to not implement this tracking for skbs in receive queue, +since they are not dropped, unless socket is disconnected. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Signed-off-by: David S. 
Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 23 +++++++++++++++++++++-- + 1 file changed, 21 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4296,6 +4296,23 @@ static bool tcp_try_coalesce(struct sock + return true; + } + ++static bool tcp_ooo_try_coalesce(struct sock *sk, ++ struct sk_buff *to, ++ struct sk_buff *from, ++ bool *fragstolen) ++{ ++ bool res = tcp_try_coalesce(sk, to, from, fragstolen); ++ ++ /* In case tcp_drop() is called later, update to->gso_segs */ ++ if (res) { ++ u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + ++ max_t(u16, 1, skb_shinfo(from)->gso_segs); ++ ++ skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); ++ } ++ return res; ++} ++ + static void tcp_drop(struct sock *sk, struct sk_buff *skb) + { + sk_drops_add(sk, skb); +@@ -4422,7 +4439,8 @@ static void tcp_data_queue_ofo(struct so + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ +- if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { ++ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, ++ skb, &fragstolen)) { + coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); +@@ -4467,7 +4485,8 @@ coalesce_done: + tcp_drop(sk, skb1); + goto merge_right; + } +- } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { ++ } else if (tcp_ooo_try_coalesce(sk, skb1, ++ skb, &fragstolen)) { + goto coalesce_done; + } + p = &parent->rb_right; diff --git a/queue-4.4/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch b/queue-4.4/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch new file mode 100644 index 00000000000..13427c8cc8a --- /dev/null +++ b/queue-4.4/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch @@ -0,0 +1,45 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:09 +0800 +Subject: tcp: call tcp_drop() from tcp_data_queue_ofo() +To: , , , , , , , +Message-ID: <1536913450-12380-6-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 8541b21e781a22dce52a74fef0b9bed00404a1cd ] + +In order to be able to give better diagnostics and detect +malicious traffic, we need to have better sk->sk_drops tracking. + +Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4445,7 +4445,7 @@ coalesce_done: + /* All the bits are present. Drop. 
*/ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; +@@ -4464,7 +4464,7 @@ coalesce_done: + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb1); ++ tcp_drop(sk, skb1); + goto merge_right; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { diff --git a/queue-4.4/tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch b/queue-4.4/tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch new file mode 100644 index 00000000000..164e4593062 --- /dev/null +++ b/queue-4.4/tcp-fix-a-stale-ooo_last_skb-after-a-replace.patch @@ -0,0 +1,76 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:07 +0800 +Subject: tcp: fix a stale ooo_last_skb after a replace +To: , , , , , , , +Message-ID: <1536913450-12380-4-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 76f0dcbb5ae1a7c3dbeec13dd98233b8e6b0b32a ] + +When skb replaces another one in ooo queue, I forgot to also +update tp->ooo_last_skb as well, if the replaced skb was the last one +in the queue. + +To fix this, we simply can re-use the code that runs after an insertion, +trying to merge skbs at the right of current skb. + +This not only fixes the bug, but also remove all small skbs that might +be a subset of the new one. + +Example: + +We receive segments 2001:3001, 4001:5001 + +Then we receive 2001:8001 : We should replace 2001:3001 with the big +skb, but also remove 4001:50001 from the queue to save space. + +packetdrill test demonstrating the bug + +0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 1) = 0 + ++0 < S 0:0(0) win 32792 ++0 > S. 0:0(0) ack 1 ++0.100 < . 1:1(0) ack 1 win 1024 ++0 accept(3, ..., ...) = 4 + ++0.01 < . 1001:2001(1000) ack 1 win 1024 ++0 > . 1:1(0) ack 1 + ++0.01 < . 1001:3001(2000) ack 1 win 1024 ++0 > . 1:1(0) ack 1 + +Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") +Signed-off-by: Eric Dumazet +Reported-by: Yuchung Cheng +Cc: Yaogong Wang +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4465,7 +4465,7 @@ coalesce_done: + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); +- goto add_sack; ++ goto merge_right; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; +@@ -4477,6 +4477,7 @@ coalesce_done: + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + ++merge_right: + /* Remove other segments covered by skb. 
*/ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); diff --git a/queue-4.4/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch b/queue-4.4/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch new file mode 100644 index 00000000000..e6c44a5f172 --- /dev/null +++ b/queue-4.4/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch @@ -0,0 +1,79 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:08 +0800 +Subject: tcp: free batches of packets in tcp_prune_ofo_queue() +To: , , , , , , , +Message-ID: <1536913450-12380-5-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 72cd43ba64fc172a443410ce01645895850844c8 ] + +Juha-Matti Tilli reported that malicious peers could inject tiny +packets in out_of_order_queue, forcing very expensive calls +to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for +every incoming packet. out_of_order_queue rb-tree can contain +thousands of nodes, iterating over all of them is not nice. + +Before linux-4.9, we would have pruned all packets in ofo_queue +in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs +truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB. + +Since we plan to increase tcp_rmem[2] in the future to cope with +modern BDP, can not revert to the old behavior, without great pain. + +Strategy taken in this patch is to purge ~12.5 % of the queue capacity. + +Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets") +Signed-off-by: Eric Dumazet +Reported-by: Juha-Matti Tilli +Acked-by: Yuchung Cheng +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4899,27 +4899,33 @@ new_range: + + /* + * Purge the out-of-order queue. ++ * Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. + * Return true if queue was pruned. 
+ */ + static bool tcp_prune_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node *node, *prev; ++ int goal; + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) + return false; + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); +- ++ goal = sk->sk_rcvbuf >> 3; + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); ++ goal -= rb_to_skb(node)->truesize; + __kfree_skb(rb_to_skb(node)); +- sk_mem_reclaim(sk); +- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && +- !tcp_under_memory_pressure(sk)) +- break; ++ if (!prev || goal <= 0) { ++ sk_mem_reclaim(sk); ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && ++ !tcp_under_memory_pressure(sk)) ++ break; ++ goal = sk->sk_rcvbuf >> 3; ++ } + + node = prev; + } while (node); diff --git a/queue-4.4/tcp-increment-sk_drops-for-dropped-rx-packets.patch b/queue-4.4/tcp-increment-sk_drops-for-dropped-rx-packets.patch new file mode 100644 index 00000000000..faf9a3f98e3 --- /dev/null +++ b/queue-4.4/tcp-increment-sk_drops-for-dropped-rx-packets.patch @@ -0,0 +1,178 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:05 +0800 +Subject: tcp: increment sk_drops for dropped rx packets +To: , , , , , , , +Message-ID: <1536913450-12380-2-git-send-email-maowenan@huawei.com> + +From: Eric Dumazet + +[ Upstream commit 532182cd610782db8c18230c2747626562032205 ] + +Now ss can report sk_drops, we can instruct TCP to increment +this per socket counter when it drops an incoming frame, to refine +monitoring and debugging. + +Following patch takes care of listeners drops. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + include/net/sock.h | 7 +++++++ + net/ipv4/tcp_input.c | 33 ++++++++++++++++++++------------- + net/ipv4/tcp_ipv4.c | 1 + + net/ipv6/tcp_ipv6.c | 1 + + 4 files changed, 29 insertions(+), 13 deletions(-) + +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -2139,6 +2139,13 @@ sock_skb_set_dropcount(const struct sock + SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops); + } + ++static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) ++{ ++ int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); ++ ++ atomic_add(segs, &sk->sk_drops); ++} ++ + void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb); + void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4296,6 +4296,12 @@ static bool tcp_try_coalesce(struct sock + return true; + } + ++static void tcp_drop(struct sock *sk, struct sk_buff *skb) ++{ ++ sk_drops_add(sk, skb); ++ __kfree_skb(skb); ++} ++ + /* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. 
+ */ +@@ -4320,7 +4326,7 @@ static void tcp_ofo_queue(struct sock *s + __skb_unlink(skb, &tp->out_of_order_queue); + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "ofo packet was already received\n"); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + continue; + } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", +@@ -4372,7 +4378,7 @@ static void tcp_data_queue_ofo(struct so + + if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + return; + } + +@@ -4436,7 +4442,7 @@ static void tcp_data_queue_ofo(struct so + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; +@@ -4475,7 +4481,7 @@ static void tcp_data_queue_ofo(struct so + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb1); ++ tcp_drop(sk, skb1); + } + + add_sack: +@@ -4558,12 +4564,13 @@ err: + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); +- int eaten = -1; + bool fragstolen = false; ++ int eaten = -1; + +- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) +- goto drop; +- ++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { ++ __kfree_skb(skb); ++ return; ++ } + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + +@@ -4645,7 +4652,7 @@ out_of_window: + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + inet_csk_schedule_ack(sk); + drop: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + return; + } + +@@ -5236,7 +5243,7 @@ syn_challenge: + return true; + + discard: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + return false; + } + +@@ -5454,7 +5461,7 @@ csum_error: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + + discard: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + } + EXPORT_SYMBOL(tcp_rcv_established); + +@@ -5684,7 +5691,7 @@ static int tcp_rcv_synsent_state_process + TCP_DELACK_MAX, TCP_RTO_MAX); + + discard: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + return 0; + } else { + tcp_send_ack(sk); +@@ -6041,7 +6048,7 @@ int tcp_rcv_state_process(struct sock *s + + if (!queued) { + discard: +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + } + return 0; + } +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -1716,6 +1716,7 @@ discard_it: + return 0; + + discard_and_relse: ++ sk_drops_add(sk, skb); + sock_put(sk); + goto discard_it; + +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1505,6 +1505,7 @@ discard_it: + return 0; + + discard_and_relse: ++ sk_drops_add(sk, skb); + sock_put(sk); + goto discard_it; + diff --git a/queue-4.4/tcp-use-an-rb-tree-for-ooo-receive-queue.patch b/queue-4.4/tcp-use-an-rb-tree-for-ooo-receive-queue.patch new file mode 100644 index 00000000000..74ec66f2897 --- /dev/null +++ b/queue-4.4/tcp-use-an-rb-tree-for-ooo-receive-queue.patch @@ -0,0 +1,757 @@ +From foo@baz Thu Oct 11 16:06:02 CEST 2018 +From: Mao Wenan +Date: Fri, 14 Sep 2018 16:24:06 +0800 +Subject: tcp: use an RB tree for ooo receive queue +To: , , , , , , , +Message-ID: <1536913450-12380-3-git-send-email-maowenan@huawei.com> + +From: Yaogong Wang + +[ Upstream commit 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a ] + +Over the years, TCP BDP has increased by several orders of magnitude, +and some people are considering to 
reach the 2 Gbytes limit. + +Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 +MSS. + +In presence of packet losses (or reorders), TCP stores incoming packets +into an out of order queue, and number of skbs sitting there waiting for +the missing packets to be received can be in the 10^5 range. + +Most packets are appended to the tail of this queue, and when +packets can finally be transferred to receive queue, we scan the queue +from its head. + +However, in presence of heavy losses, we might have to find an arbitrary +point in this queue, involving a linear scan for every incoming packet, +throwing away cpu caches. + +This patch converts it to a RB tree, to get bounded latencies. + +Yaogong wrote a preliminary patch about 2 years ago. +Eric did the rebase, added ofo_last_skb cache, polishing and tests. + +Tested with network dropping between 1 and 10 % packets, with good +success (about 30 % increase of throughput in stress tests) + +Next step would be to also use an RB tree for the write queue at sender +side ;) + +Signed-off-by: Yaogong Wang +Signed-off-by: Eric Dumazet +Cc: Yuchung Cheng +Cc: Neal Cardwell +Cc: Ilpo Järvinen +Acked-By: Ilpo Järvinen +Signed-off-by: David S. Miller +Signed-off-by: Mao Wenan +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/skbuff.h | 8 + + include/linux/tcp.h | 7 + include/net/tcp.h | 2 + net/core/skbuff.c | 19 ++ + net/ipv4/tcp.c | 4 + net/ipv4/tcp_input.c | 356 +++++++++++++++++++++++++++-------------------- + net/ipv4/tcp_ipv4.c | 2 + net/ipv4/tcp_minisocks.c | 1 + 8 files changed, 241 insertions(+), 158 deletions(-) + +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -2273,6 +2273,8 @@ static inline void __skb_queue_purge(str + kfree_skb(skb); + } + ++void skb_rbtree_purge(struct rb_root *root); ++ + void *netdev_alloc_frag(unsigned int fragsz); + + struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, +@@ -2807,6 +2809,12 @@ static inline int pskb_trim_rcsum(struct + return __pskb_trim(skb, len); + } + ++#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) ++#define skb_rb_first(root) rb_to_skb(rb_first(root)) ++#define skb_rb_last(root) rb_to_skb(rb_last(root)) ++#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) ++#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) ++ + #define skb_queue_walk(queue, skb) \ + for (skb = (queue)->next; \ + skb != (struct sk_buff *)(queue); \ +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -279,10 +279,9 @@ struct tcp_sock { + struct sk_buff* lost_skb_hint; + struct sk_buff *retransmit_skb_hint; + +- /* OOO segments go in this list. Note that socket lock must be held, +- * as we do not use sk_buff_head lock. +- */ +- struct sk_buff_head out_of_order_queue; ++ /* OOO segments go in this rbtree. Socket lock must be held. 
*/ ++ struct rb_root out_of_order_queue; ++ struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ + + /* SACKs data, these 2 need to be together (see tcp_options_write) */ + struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -649,7 +649,7 @@ static inline void tcp_fast_path_check(s + { + struct tcp_sock *tp = tcp_sk(sk); + +- if (skb_queue_empty(&tp->out_of_order_queue) && ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && + tp->rcv_wnd && + atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + !tp->urg_data) +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -2378,6 +2378,25 @@ void skb_queue_purge(struct sk_buff_head + EXPORT_SYMBOL(skb_queue_purge); + + /** ++ * skb_rbtree_purge - empty a skb rbtree ++ * @root: root of the rbtree to empty ++ * ++ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from ++ * the list and one reference dropped. This function does not take ++ * any lock. Synchronization should be handled by the caller (e.g., TCP ++ * out-of-order queue is protected by the socket lock). ++ */ ++void skb_rbtree_purge(struct rb_root *root) ++{ ++ struct sk_buff *skb, *next; ++ ++ rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) ++ kfree_skb(skb); ++ ++ *root = RB_ROOT; ++} ++ ++/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -382,7 +382,7 @@ void tcp_init_sock(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + +- __skb_queue_head_init(&tp->out_of_order_queue); ++ tp->out_of_order_queue = RB_ROOT; + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + INIT_LIST_HEAD(&tp->tsq_node); +@@ -2240,7 +2240,7 @@ int tcp_disconnect(struct sock *sk, int + tcp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + tcp_write_queue_purge(sk); +- __skb_queue_purge(&tp->out_of_order_queue); ++ skb_rbtree_purge(&tp->out_of_order_queue); + + inet->inet_dport = 0; + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4073,7 +4073,7 @@ static void tcp_fin(struct sock *sk) + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ +- __skb_queue_purge(&tp->out_of_order_queue); ++ skb_rbtree_purge(&tp->out_of_order_queue); + if (tcp_is_sack(tp)) + tcp_sack_reset(&tp->rx_opt); + sk_mem_reclaim(sk); +@@ -4233,7 +4233,7 @@ static void tcp_sack_remove(struct tcp_s + int this_sack; + + /* Empty ofo queue, hence, all the SACKs are eaten. Clear. 
*/ +- if (skb_queue_empty(&tp->out_of_order_queue)) { ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { + tp->rx_opt.num_sacks = 0; + return; + } +@@ -4309,10 +4309,13 @@ static void tcp_ofo_queue(struct sock *s + { + struct tcp_sock *tp = tcp_sk(sk); + __u32 dsack_high = tp->rcv_nxt; ++ bool fin, fragstolen, eaten; + struct sk_buff *skb, *tail; +- bool fragstolen, eaten; ++ struct rb_node *p; + +- while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { ++ p = rb_first(&tp->out_of_order_queue); ++ while (p) { ++ skb = rb_entry(p, struct sk_buff, rbnode); + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + +@@ -4322,9 +4325,10 @@ static void tcp_ofo_queue(struct sock *s + dsack_high = TCP_SKB_CB(skb)->end_seq; + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); + } ++ p = rb_next(p); ++ rb_erase(&skb->rbnode, &tp->out_of_order_queue); + +- __skb_unlink(skb, &tp->out_of_order_queue); +- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { ++ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { + SOCK_DEBUG(sk, "ofo packet was already received\n"); + tcp_drop(sk, skb); + continue; +@@ -4336,12 +4340,19 @@ static void tcp_ofo_queue(struct sock *s + tail = skb_peek_tail(&sk->sk_receive_queue); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); ++ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; + if (!eaten) + __skb_queue_tail(&sk->sk_receive_queue, skb); +- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) +- tcp_fin(sk); +- if (eaten) ++ else + kfree_skb_partial(skb, fragstolen); ++ ++ if (unlikely(fin)) { ++ tcp_fin(sk); ++ /* tcp_fin() purges tp->out_of_order_queue, ++ * so we must end this loop right now. ++ */ ++ break; ++ } + } + } + +@@ -4371,8 +4382,10 @@ static int tcp_try_rmem_schedule(struct + static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct rb_node **p, *q, *parent; + struct sk_buff *skb1; + u32 seq, end_seq; ++ bool fragstolen; + + tcp_ecn_check_ce(sk, skb); + +@@ -4387,89 +4400,86 @@ static void tcp_data_queue_ofo(struct so + inet_csk_schedule_ack(sk); + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); ++ seq = TCP_SKB_CB(skb)->seq; ++ end_seq = TCP_SKB_CB(skb)->end_seq; + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", +- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); ++ tp->rcv_nxt, seq, end_seq); + +- skb1 = skb_peek_tail(&tp->out_of_order_queue); +- if (!skb1) { ++ p = &tp->out_of_order_queue.rb_node; ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { + /* Initial out of order segment, build 1 SACK. 
*/ + if (tcp_is_sack(tp)) { + tp->rx_opt.num_sacks = 1; +- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; +- tp->selective_acks[0].end_seq = +- TCP_SKB_CB(skb)->end_seq; ++ tp->selective_acks[0].start_seq = seq; ++ tp->selective_acks[0].end_seq = end_seq; + } +- __skb_queue_head(&tp->out_of_order_queue, skb); ++ rb_link_node(&skb->rbnode, NULL, p); ++ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); ++ tp->ooo_last_skb = skb; + goto end; + } + +- seq = TCP_SKB_CB(skb)->seq; +- end_seq = TCP_SKB_CB(skb)->end_seq; +- +- if (seq == TCP_SKB_CB(skb1)->end_seq) { +- bool fragstolen; +- +- if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { +- __skb_queue_after(&tp->out_of_order_queue, skb1, skb); +- } else { +- tcp_grow_window(sk, skb); +- kfree_skb_partial(skb, fragstolen); +- skb = NULL; +- } +- +- if (!tp->rx_opt.num_sacks || +- tp->selective_acks[0].end_seq != seq) +- goto add_sack; +- +- /* Common case: data arrive in order after hole. */ +- tp->selective_acks[0].end_seq = end_seq; +- goto end; ++ /* In the typical case, we are adding an skb to the end of the list. ++ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. ++ */ ++ if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { ++coalesce_done: ++ tcp_grow_window(sk, skb); ++ kfree_skb_partial(skb, fragstolen); ++ skb = NULL; ++ goto add_sack; + } + +- /* Find place to insert this segment. */ +- while (1) { +- if (!after(TCP_SKB_CB(skb1)->seq, seq)) +- break; +- if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { +- skb1 = NULL; +- break; ++ /* Find place to insert this segment. Handle overlaps on the way. */ ++ parent = NULL; ++ while (*p) { ++ parent = *p; ++ skb1 = rb_entry(parent, struct sk_buff, rbnode); ++ if (before(seq, TCP_SKB_CB(skb1)->seq)) { ++ p = &parent->rb_left; ++ continue; + } +- skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); +- } + +- /* Do skb overlap to previous one? */ +- if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { +- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { +- /* All the bits are present. Drop. */ +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); +- tcp_drop(sk, skb); +- skb = NULL; +- tcp_dsack_set(sk, seq, end_seq); +- goto add_sack; +- } +- if (after(seq, TCP_SKB_CB(skb1)->seq)) { +- /* Partial overlap. */ +- tcp_dsack_set(sk, seq, +- TCP_SKB_CB(skb1)->end_seq); +- } else { +- if (skb_queue_is_first(&tp->out_of_order_queue, +- skb1)) +- skb1 = NULL; +- else +- skb1 = skb_queue_prev( +- &tp->out_of_order_queue, +- skb1); ++ if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { ++ /* All the bits are present. Drop. */ ++ NET_INC_STATS(sock_net(sk), ++ LINUX_MIB_TCPOFOMERGE); ++ __kfree_skb(skb); ++ skb = NULL; ++ tcp_dsack_set(sk, seq, end_seq); ++ goto add_sack; ++ } ++ if (after(seq, TCP_SKB_CB(skb1)->seq)) { ++ /* Partial overlap. */ ++ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); ++ } else { ++ /* skb's seq == skb1's seq and skb covers skb1. ++ * Replace skb1 with skb. 
++ */ ++ rb_replace_node(&skb1->rbnode, &skb->rbnode, ++ &tp->out_of_order_queue); ++ tcp_dsack_extend(sk, ++ TCP_SKB_CB(skb1)->seq, ++ TCP_SKB_CB(skb1)->end_seq); ++ NET_INC_STATS(sock_net(sk), ++ LINUX_MIB_TCPOFOMERGE); ++ __kfree_skb(skb1); ++ goto add_sack; ++ } ++ } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { ++ goto coalesce_done; + } ++ p = &parent->rb_right; + } +- if (!skb1) +- __skb_queue_head(&tp->out_of_order_queue, skb); +- else +- __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + +- /* And clean segments covered by new one as whole. */ +- while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { +- skb1 = skb_queue_next(&tp->out_of_order_queue, skb); ++ /* Insert segment into RB tree. */ ++ rb_link_node(&skb->rbnode, parent, p); ++ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + ++ /* Remove other segments covered by skb. */ ++ while ((q = rb_next(&skb->rbnode)) != NULL) { ++ skb1 = rb_entry(q, struct sk_buff, rbnode); + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { +@@ -4477,12 +4487,15 @@ static void tcp_data_queue_ofo(struct so + end_seq); + break; + } +- __skb_unlink(skb1, &tp->out_of_order_queue); ++ rb_erase(&skb1->rbnode, &tp->out_of_order_queue); + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); + tcp_drop(sk, skb1); + } ++ /* If there is no skb after us, we are the last_skb ! */ ++ if (!q) ++ tp->ooo_last_skb = skb; + + add_sack: + if (tcp_is_sack(tp)) +@@ -4621,13 +4634,13 @@ queue_and_out: + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_fin(sk); + +- if (!skb_queue_empty(&tp->out_of_order_queue)) { ++ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { + tcp_ofo_queue(sk); + + /* RFC2581. 4.2. SHOULD send immediate ACK, when + * gap in queue is filled. + */ +- if (skb_queue_empty(&tp->out_of_order_queue)) ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) + inet_csk(sk)->icsk_ack.pingpong = 0; + } + +@@ -4679,48 +4692,76 @@ drop: + tcp_data_queue_ofo(sk, skb); + } + ++static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) ++{ ++ if (list) ++ return !skb_queue_is_last(list, skb) ? skb->next : NULL; ++ ++ return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); ++} ++ + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, +- struct sk_buff_head *list) ++ struct sk_buff_head *list, ++ struct rb_root *root) + { +- struct sk_buff *next = NULL; ++ struct sk_buff *next = tcp_skb_next(skb, list); + +- if (!skb_queue_is_last(list, skb)) +- next = skb_queue_next(list, skb); ++ if (list) ++ __skb_unlink(skb, list); ++ else ++ rb_erase(&skb->rbnode, root); + +- __skb_unlink(skb, list); + __kfree_skb(skb); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); + + return next; + } + ++/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ ++static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) ++{ ++ struct rb_node **p = &root->rb_node; ++ struct rb_node *parent = NULL; ++ struct sk_buff *skb1; ++ ++ while (*p) { ++ parent = *p; ++ skb1 = rb_entry(parent, struct sk_buff, rbnode); ++ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) ++ p = &parent->rb_left; ++ else ++ p = &parent->rb_right; ++ } ++ rb_link_node(&skb->rbnode, parent, p); ++ rb_insert_color(&skb->rbnode, root); ++} ++ + /* Collapse contiguous sequence of skbs head..tail with + * sequence numbers start..end. 
+ * +- * If tail is NULL, this means until the end of the list. ++ * If tail is NULL, this means until the end of the queue. + * + * Segments with FIN/SYN are not collapsed (only because this + * simplifies code) + */ + static void +-tcp_collapse(struct sock *sk, struct sk_buff_head *list, +- struct sk_buff *head, struct sk_buff *tail, +- u32 start, u32 end) ++tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, ++ struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) + { +- struct sk_buff *skb, *n; ++ struct sk_buff *skb = head, *n; ++ struct sk_buff_head tmp; + bool end_of_skbs; + + /* First, check that queue is collapsible and find +- * the point where collapsing can be useful. */ +- skb = head; ++ * the point where collapsing can be useful. ++ */ + restart: +- end_of_skbs = true; +- skb_queue_walk_from_safe(list, skb, n) { +- if (skb == tail) +- break; ++ for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { ++ n = tcp_skb_next(skb, list); ++ + /* No new bits? It is possible on ofo queue. */ + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { +- skb = tcp_collapse_one(sk, skb, list); ++ skb = tcp_collapse_one(sk, skb, list, root); + if (!skb) + break; + goto restart; +@@ -4738,13 +4779,10 @@ restart: + break; + } + +- if (!skb_queue_is_last(list, skb)) { +- struct sk_buff *next = skb_queue_next(list, skb); +- if (next != tail && +- TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { +- end_of_skbs = false; +- break; +- } ++ if (n && n != tail && ++ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { ++ end_of_skbs = false; ++ break; + } + + /* Decided to skip this, advance start seq. */ +@@ -4754,17 +4792,22 @@ restart: + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) + return; + ++ __skb_queue_head_init(&tmp); ++ + while (before(start, end)) { + int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); + struct sk_buff *nskb; + + nskb = alloc_skb(copy, GFP_ATOMIC); + if (!nskb) +- return; ++ break; + + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; +- __skb_queue_before(list, skb, nskb); ++ if (list) ++ __skb_queue_before(list, skb, nskb); ++ else ++ __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ + skb_set_owner_r(nskb, sk); + + /* Copy data, releasing collapsed skbs. */ +@@ -4782,14 +4825,17 @@ restart: + start += size; + } + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { +- skb = tcp_collapse_one(sk, skb, list); ++ skb = tcp_collapse_one(sk, skb, list, root); + if (!skb || + skb == tail || + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) +- return; ++ goto end; + } + } + } ++end: ++ skb_queue_walk_safe(&tmp, skb, n) ++ tcp_rbtree_insert(root, skb); + } + + /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs +@@ -4799,34 +4845,39 @@ static void tcp_collapse_ofo_queue(struc + { + struct tcp_sock *tp = tcp_sk(sk); + u32 range_truesize, sum_tiny = 0; +- struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); +- struct sk_buff *head; ++ struct sk_buff *skb, *head; ++ struct rb_node *p; + u32 start, end; + +- if (!skb) ++ p = rb_first(&tp->out_of_order_queue); ++ skb = rb_entry_safe(p, struct sk_buff, rbnode); ++new_range: ++ if (!skb) { ++ p = rb_last(&tp->out_of_order_queue); ++ /* Note: This is possible p is NULL here. We do not ++ * use rb_entry_safe(), as ooo_last_skb is valid only ++ * if rbtree is not empty. 
++ */ ++ tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + return; +- ++ } + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + range_truesize = skb->truesize; +- head = skb; +- +- for (;;) { +- struct sk_buff *next = NULL; + +- if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) +- next = skb_queue_next(&tp->out_of_order_queue, skb); +- skb = next; ++ for (head = skb;;) { ++ skb = tcp_skb_next(skb, NULL); + +- /* Segment is terminated when we see gap or when +- * we are at the end of all the queue. */ ++ /* Range is terminated when we see a gap or when ++ * we are at the queue end. ++ */ + if (!skb || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { + /* Do not attempt collapsing tiny skbs */ + if (range_truesize != head->truesize || + end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { +- tcp_collapse(sk, &tp->out_of_order_queue, ++ tcp_collapse(sk, NULL, &tp->out_of_order_queue, + head, skb, start, end); + } else { + sum_tiny += range_truesize; +@@ -4834,20 +4885,14 @@ static void tcp_collapse_ofo_queue(struc + return; + } + +- head = skb; +- if (!skb) +- break; +- /* Start new segment */ ++ goto new_range; ++ } ++ ++ range_truesize += skb->truesize; ++ if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) + start = TCP_SKB_CB(skb)->seq; ++ if (after(TCP_SKB_CB(skb)->end_seq, end)) + end = TCP_SKB_CB(skb)->end_seq; +- range_truesize = skb->truesize; +- } else { +- range_truesize += skb->truesize; +- if (before(TCP_SKB_CB(skb)->seq, start)) +- start = TCP_SKB_CB(skb)->seq; +- if (after(TCP_SKB_CB(skb)->end_seq, end)) +- end = TCP_SKB_CB(skb)->end_seq; +- } + } + } + +@@ -4858,23 +4903,36 @@ static void tcp_collapse_ofo_queue(struc + static bool tcp_prune_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- bool res = false; ++ struct rb_node *node, *prev; + +- if (!skb_queue_empty(&tp->out_of_order_queue)) { +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); +- __skb_queue_purge(&tp->out_of_order_queue); +- +- /* Reset SACK state. A conforming SACK implementation will +- * do the same at a timeout based retransmit. When a connection +- * is in a sad state like this, we care only about integrity +- * of the connection not performance. +- */ +- if (tp->rx_opt.sack_ok) +- tcp_sack_reset(&tp->rx_opt); ++ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) ++ return false; ++ ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); ++ ++ node = &tp->ooo_last_skb->rbnode; ++ do { ++ prev = rb_prev(node); ++ rb_erase(node, &tp->out_of_order_queue); ++ __kfree_skb(rb_to_skb(node)); + sk_mem_reclaim(sk); +- res = true; +- } +- return res; ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && ++ !tcp_under_memory_pressure(sk)) ++ break; ++ ++ node = prev; ++ } while (node); ++ tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); ++ ++ /* Reset SACK state. A conforming SACK implementation will ++ * do the same at a timeout based retransmit. When a connection ++ * is in a sad state like this, we care only about integrity ++ * of the connection not performance. 
++ */ ++ if (tp->rx_opt.sack_ok) ++ tcp_sack_reset(&tp->rx_opt); ++ ++ return true; + } + + /* Reduce allocated memory if we can, trying to get +@@ -4902,7 +4960,7 @@ static int tcp_prune_queue(struct sock * + + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) +- tcp_collapse(sk, &sk->sk_receive_queue, ++ tcp_collapse(sk, &sk->sk_receive_queue, NULL, + skb_peek(&sk->sk_receive_queue), + NULL, + tp->copied_seq, tp->rcv_nxt); +@@ -5007,7 +5065,7 @@ static void __tcp_ack_snd_check(struct s + /* We ACK each frame or... */ + tcp_in_quickack_mode(sk) || + /* We have out of order data. */ +- (ofo_possible && skb_peek(&tp->out_of_order_queue))) { ++ (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { + /* Then ack it now */ + tcp_send_ack(sk); + } else { +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -1830,7 +1830,7 @@ void tcp_v4_destroy_sock(struct sock *sk + tcp_write_queue_purge(sk); + + /* Cleans up our, hopefully empty, out_of_order_queue. */ +- __skb_queue_purge(&tp->out_of_order_queue); ++ skb_rbtree_purge(&tp->out_of_order_queue); + + #ifdef CONFIG_TCP_MD5SIG + /* Clean up the MD5 key list, if any */ +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -496,7 +496,6 @@ struct sock *tcp_create_openreq_child(co + newtp->snd_cwnd_cnt = 0; + + tcp_init_xmit_timers(newsk); +- __skb_queue_head_init(&newtp->out_of_order_queue); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; -- 2.47.2
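
As a rough, self-contained illustration of the core idea in the "tcp: use an RB tree for ooo receive queue" patch above, the sketch below models an out-of-order queue ordered by starting sequence number, with a cached right-most entry so the common in-order append skips the tree walk. This is a hypothetical userspace model, not kernel code: a plain unbalanced binary search tree stands in for the kernel rb-tree, a small struct seg stands in for sk_buff, the names seg, ooo_insert and ooo_last are invented for the example, and plain integer comparisons stand in for the kernel's wraparound-safe before()/after() helpers.

/*
 * Minimal userspace sketch of an out-of-order queue keyed by sequence
 * number.  An unbalanced BST stands in for the kernel rb-tree; the
 * cached right-most node plays the role of tp->ooo_last_skb.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct seg {
	uint32_t seq;		/* start sequence number */
	uint32_t end_seq;	/* one past the last byte */
	struct seg *left, *right;
};

static struct seg *ooo_root;	/* models tp->out_of_order_queue */
static struct seg *ooo_last;	/* models tp->ooo_last_skb (right-most node) */

static struct seg *seg_new(uint32_t seq, uint32_t end_seq)
{
	struct seg *s = calloc(1, sizeof(*s));

	s->seq = seq;
	s->end_seq = end_seq;
	return s;
}

/* Insert ordered by seq; loosely mirrors the shape of tcp_rbtree_insert(). */
static void ooo_insert(struct seg *s)
{
	struct seg **p = &ooo_root, *parent = NULL;

	/* Fast path: segment lands after the cached right-most node,
	 * the typical case when only a single hole is outstanding.
	 */
	if (ooo_last && s->seq >= ooo_last->end_seq) {
		ooo_last->right = s;
		ooo_last = s;
		return;
	}
	/* Slow path: O(log N) descent (O(N) worst case in this toy BST). */
	while (*p) {
		parent = *p;
		if (s->seq < parent->seq)
			p = &parent->left;
		else
			p = &parent->right;
	}
	*p = s;
	if (!ooo_last || s->seq >= ooo_last->seq)
		ooo_last = s;	/* new right-most entry */
}

static void ooo_walk(struct seg *s)	/* in-order dump, lowest seq first */
{
	if (!s)
		return;
	ooo_walk(s->left);
	printf("  %u:%u\n", (unsigned)s->seq, (unsigned)s->end_seq);
	ooo_walk(s->right);
}

int main(void)
{
	/* Out-of-order arrival pattern similar to the packetdrill example. */
	ooo_insert(seg_new(4001, 5001));
	ooo_insert(seg_new(1001, 2001));
	ooo_insert(seg_new(6001, 7001));	/* hits the ooo_last fast path */

	printf("out_of_order_queue (ordered by seq):\n");
	ooo_walk(ooo_root);
	printf("ooo_last = %u:%u\n",
	       (unsigned)ooo_last->seq, (unsigned)ooo_last->end_seq);
	return 0;
}

The cached right-most pointer is also what the later patches in the series lean on: tcp_prune_ofo_queue() starts at tp->ooo_last_skb and walks backwards with rb_prev(), and the batching patch frees roughly sk_rcvbuf >> 3 bytes worth of segments between sk_mem_reclaim() calls instead of reclaiming after every skb.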