From: Greg Kroah-Hartman Date: Wed, 6 Feb 2019 18:40:48 +0000 (+0100) Subject: drop ip defrag patches from 4.4.y queue X-Git-Tag: v4.4.173~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7d68e9c027b62b2ec343f52f0fcae0194e7a8dee;p=thirdparty%2Fkernel%2Fstable-queue.git drop ip defrag patches from 4.4.y queue --- diff --git a/queue-4.4/fix-net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch b/queue-4.4/fix-net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch deleted file mode 100644 index afcc7fdaf7c..00000000000 --- a/queue-4.4/fix-net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch +++ /dev/null @@ -1,43 +0,0 @@ -From foo@baz Thu Jan 31 15:59:51 CET 2019 -Date: Thu, 31 Jan 2019 15:59:51 +0100 -To: Greg KH -From: Greg Kroah-Hartman -Subject: Fix "net: ipv4: do not handle duplicate fragments as overlapping" - -From: Greg Kroah-Hartman - -ade446403bfb ("net: ipv4: do not handle duplicate fragments as -overlapping") was backported to many stable trees, but it had a problem -that was "accidentally" fixed by the upstream commit 0ff89efb5246 ("ip: -fail fast on IP defrag errors") - -This is the fixup for that problem as we do not want the larger patch in -the older stable trees. - -Fixes: ade446403bfb ("net: ipv4: do not handle duplicate fragments as overlapping") -Reported-by: Ivan Babrou -Reported-by: Eric Dumazet -Signed-off-by: Greg Kroah-Hartman - ---- - net/ipv4/ip_fragment.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -478,6 +478,7 @@ static int ip_frag_queue(struct ipq *qp, - * fragment. - */ - -+ err = -EINVAL; - /* Find out where to put this fragment. */ - prev_tail = qp->q.fragments_tail; - if (!prev_tail) -@@ -556,7 +557,6 @@ static int ip_frag_queue(struct ipq *qp, - - discard_qp: - ipq_kill(qp); -- err = -EINVAL; - IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); - err: - kfree_skb(skb); diff --git a/queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch b/queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch deleted file mode 100644 index 99306ea3a1e..00000000000 --- a/queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch +++ /dev/null @@ -1,147 +0,0 @@ -From 399d1404be660d355192ff4df5ccc3f4159ec1e4 Mon Sep 17 00:00:00 2001 -From: Eric Dumazet -Date: Sat, 31 Mar 2018 12:58:51 -0700 -Subject: inet: frags: get rif of inet_frag_evicting() - -From: Eric Dumazet - -commit 399d1404be660d355192ff4df5ccc3f4159ec1e4 upstream. - -This refactors ip_expire() since one indentation level is removed. - -Note: in the future, we should try hard to avoid the skb_clone() -since this is a serious performance cost. -Under DDOS, the ICMP message wont be sent because of rate limits. - -Fact that ip6_expire_frag_queue() does not use skb_clone() is -disturbing too. Presumably IPv6 should have the same -issue than the one we fixed in commit ec4fbd64751d -("inet: frag: release spinlock before calling icmp_send()") - -Signed-off-by: Eric Dumazet -Signed-off-by: David S. 
Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman - ---- - include/net/inet_frag.h | 5 --- - net/ipv4/ip_fragment.c | 66 +++++++++++++++++++++++------------------------- - net/ipv6/reassembly.c | 4 -- - 3 files changed, 32 insertions(+), 43 deletions(-) - ---- a/include/net/inet_frag.h -+++ b/include/net/inet_frag.h -@@ -123,11 +123,6 @@ static inline void inet_frag_put(struct - inet_frag_destroy(q, f); - } - --static inline bool inet_frag_evicting(struct inet_frag_queue *q) --{ -- return !hlist_unhashed(&q->list_evictor); --} -- - /* Memory Tracking Functions. */ - - static inline int frag_mem_limit(struct netns_frags *nf) ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -194,8 +194,11 @@ static bool frag_expire_skip_icmp(u32 us - */ - static void ip_expire(unsigned long arg) - { -- struct ipq *qp; -+ struct sk_buff *clone, *head; -+ const struct iphdr *iph; - struct net *net; -+ struct ipq *qp; -+ int err; - - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); - net = container_of(qp->q.net, struct net, ipv4.frags); -@@ -209,45 +212,40 @@ static void ip_expire(unsigned long arg) - ipq_kill(qp); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - -- if (!inet_frag_evicting(&qp->q)) { -- struct sk_buff *clone, *head = qp->q.fragments; -- const struct iphdr *iph; -- int err; -- -- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); -+ head = qp->q.fragments; - -- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) -- goto out; -+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); - -- head->dev = dev_get_by_index_rcu(net, qp->iif); -- if (!head->dev) -- goto out; -+ if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) -+ goto out; - -+ head->dev = dev_get_by_index_rcu(net, qp->iif); -+ if (!head->dev) -+ goto out; - -- /* skb has no dst, perform route lookup again */ -- iph = ip_hdr(head); -- err = ip_route_input_noref(head, iph->daddr, iph->saddr, -+ /* skb has no dst, perform route lookup again */ -+ iph = ip_hdr(head); -+ err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev); -- if (err) -- goto out; -+ if (err) -+ goto out; -+ -+ /* Only an end host needs to send an ICMP -+ * "Fragment Reassembly Timeout" message, per RFC792. -+ */ -+ if (frag_expire_skip_icmp(qp->user) && -+ (skb_rtable(head)->rt_type != RTN_LOCAL)) -+ goto out; -+ -+ clone = skb_clone(head, GFP_ATOMIC); - -- /* Only an end host needs to send an ICMP -- * "Fragment Reassembly Timeout" message, per RFC792. -- */ -- if (frag_expire_skip_icmp(qp->user) && -- (skb_rtable(head)->rt_type != RTN_LOCAL)) -- goto out; -- -- clone = skb_clone(head, GFP_ATOMIC); -- -- /* Send an ICMP "Fragment Reassembly Timeout" message. */ -- if (clone) { -- spin_unlock(&qp->q.lock); -- icmp_send(clone, ICMP_TIME_EXCEEDED, -- ICMP_EXC_FRAGTIME, 0); -- consume_skb(clone); -- goto out_rcu_unlock; -- } -+ /* Send an ICMP "Fragment Reassembly Timeout" message. */ -+ if (clone) { -+ spin_unlock(&qp->q.lock); -+ icmp_send(clone, ICMP_TIME_EXCEEDED, -+ ICMP_EXC_FRAGTIME, 0); -+ consume_skb(clone); -+ goto out_rcu_unlock; - } - out: - spin_unlock(&qp->q.lock); ---- a/net/ipv6/reassembly.c -+++ b/net/ipv6/reassembly.c -@@ -146,10 +146,6 @@ void ip6_expire_frag_queue(struct net *n - goto out_rcu_unlock; - - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); -- -- if (inet_frag_evicting(&fq->q)) -- goto out_rcu_unlock; -- - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); - - /* Don't send error if the first segment did not arrive. 
*/ diff --git a/queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch b/queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch deleted file mode 100644 index 7ac5d7b3f67..00000000000 --- a/queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch +++ /dev/null @@ -1,157 +0,0 @@ -From 353c9cb360874e737fb000545f783df756c06f9a Mon Sep 17 00:00:00 2001 -From: Peter Oskolkov -Date: Sat, 11 Aug 2018 20:27:24 +0000 -Subject: ip: add helpers to process in-order fragments faster. - -From: Peter Oskolkov - -commit 353c9cb360874e737fb000545f783df756c06f9a upstream. - -This patch introduces several helper functions/macros that will be -used in the follow-up patch. No runtime changes yet. - -The new logic (fully implemented in the second patch) is as follows: - -* Nodes in the rb-tree will now contain not single fragments, but lists - of consecutive fragments ("runs"). - -* At each point in time, the current "active" run at the tail is - maintained/tracked. Fragments that arrive in-order, adjacent - to the previous tail fragment, are added to this tail run without - triggering the re-balancing of the rb-tree. - -* If a fragment arrives out of order with the offset _before_ the tail run, - it is inserted into the rb-tree as a single fragment. - -* If a fragment arrives after the current tail fragment (with a gap), - it starts a new "tail" run, as is inserted into the rb-tree - at the end as the head of the new run. - -skb->cb is used to store additional information -needed here (suggested by Eric Dumazet). - -Reported-by: Willem de Bruijn -Signed-off-by: Peter Oskolkov -Cc: Eric Dumazet -Cc: Florian Westphal -Signed-off-by: David S. Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman ---- - include/net/inet_frag.h | 4 ++ - net/ipv4/ip_fragment.c | 74 +++++++++++++++++++++++++++++++++++++++++++++--- - 2 files changed, 74 insertions(+), 4 deletions(-) - ---- a/include/net/inet_frag.h -+++ b/include/net/inet_frag.h -@@ -48,6 +48,7 @@ struct inet_frag_queue { - struct sk_buff *fragments; /* Used in IPv6. */ - struct rb_root rb_fragments; /* Used in IPv4. */ - struct sk_buff *fragments_tail; -+ struct sk_buff *last_run_head; - ktime_t stamp; - int len; - int meat; -@@ -118,6 +119,9 @@ struct inet_frag_queue *inet_frag_find(s - void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix); - -+/* Free all skbs in the queue; return the sum of their truesizes. */ -+unsigned int inet_frag_rbtree_purge(struct rb_root *root); -+ - static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) - { - if (atomic_dec_and_test(&q->refcnt)) ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -58,13 +58,57 @@ - static int sysctl_ipfrag_max_dist __read_mostly = 64; - static const char ip_frag_cache_name[] = "ip4-frags"; - --struct ipfrag_skb_cb --{ -+/* Use skb->cb to track consecutive/adjacent fragments coming at -+ * the end of the queue. Nodes in the rb-tree queue will -+ * contain "runs" of one or more adjacent fragments. -+ * -+ * Invariants: -+ * - next_frag is NULL at the tail of a "run"; -+ * - the head of a "run" has the sum of all fragment lengths in frag_run_len. 
-+ */ -+struct ipfrag_skb_cb { - struct inet_skb_parm h; -- int offset; -+ int offset; -+ struct sk_buff *next_frag; -+ int frag_run_len; - }; - --#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) -+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) -+ -+static void ip4_frag_init_run(struct sk_buff *skb) -+{ -+ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); -+ -+ FRAG_CB(skb)->next_frag = NULL; -+ FRAG_CB(skb)->frag_run_len = skb->len; -+} -+ -+/* Append skb to the last "run". */ -+static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, -+ struct sk_buff *skb) -+{ -+ RB_CLEAR_NODE(&skb->rbnode); -+ FRAG_CB(skb)->next_frag = NULL; -+ -+ FRAG_CB(q->last_run_head)->frag_run_len += skb->len; -+ FRAG_CB(q->fragments_tail)->next_frag = skb; -+ q->fragments_tail = skb; -+} -+ -+/* Create a new "run" with the skb. */ -+static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) -+{ -+ if (q->last_run_head) -+ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, -+ &q->last_run_head->rbnode.rb_right); -+ else -+ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); -+ rb_insert_color(&skb->rbnode, &q->rb_fragments); -+ -+ ip4_frag_init_run(skb); -+ q->fragments_tail = skb; -+ q->last_run_head = skb; -+} - - /* Describe an entry in the "incomplete datagrams" queue. */ - struct ipq { -@@ -721,6 +765,28 @@ struct sk_buff *ip_check_defrag(struct n - } - EXPORT_SYMBOL(ip_check_defrag); - -+unsigned int inet_frag_rbtree_purge(struct rb_root *root) -+{ -+ struct rb_node *p = rb_first(root); -+ unsigned int sum = 0; -+ -+ while (p) { -+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); -+ -+ p = rb_next(p); -+ rb_erase(&skb->rbnode, root); -+ while (skb) { -+ struct sk_buff *next = FRAG_CB(skb)->next_frag; -+ -+ sum += skb->truesize; -+ kfree_skb(skb); -+ skb = next; -+ } -+ } -+ return sum; -+} -+EXPORT_SYMBOL(inet_frag_rbtree_purge); -+ - #ifdef CONFIG_SYSCTL - static int zero; - diff --git a/queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch b/queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch deleted file mode 100644 index 09d82210368..00000000000 --- a/queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch +++ /dev/null @@ -1,146 +0,0 @@ -From 7969e5c40dfd04799d4341f1b7cd266b6e47f227 Mon Sep 17 00:00:00 2001 -From: Peter Oskolkov -Date: Thu, 2 Aug 2018 23:34:37 +0000 -Subject: ip: discard IPv4 datagrams with overlapping segments. - -From: Peter Oskolkov - -commit 7969e5c40dfd04799d4341f1b7cd266b6e47f227 upstream. - -This behavior is required in IPv6, and there is little need -to tolerate overlapping fragments in IPv4. This change -simplifies the code and eliminates potential DDoS attack vectors. - -Tested: ran ip_defrag selftest (not yet available uptream). - -Suggested-by: David S. Miller -Signed-off-by: Peter Oskolkov -Signed-off-by: Eric Dumazet -Cc: Florian Westphal -Acked-by: Stephen Hemminger -Signed-off-by: David S. 
Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman ---- - include/uapi/linux/snmp.h | 1 - net/ipv4/ip_fragment.c | 73 ++++++++++++---------------------------------- - net/ipv4/proc.c | 1 - 3 files changed, 22 insertions(+), 53 deletions(-) - ---- a/include/uapi/linux/snmp.h -+++ b/include/uapi/linux/snmp.h -@@ -55,6 +55,7 @@ enum - IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */ - IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */ - IPSTATS_MIB_CEPKTS, /* InCEPkts */ -+ IPSTATS_MIB_REASM_OVERLAPS, /* ReasmOverlaps */ - __IPSTATS_MIB_MAX - }; - ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -342,6 +342,7 @@ static int ip_frag_reinit(struct ipq *qp - /* Add new segment to existing queue. */ - static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) - { -+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct sk_buff *prev, *next; - struct net_device *dev; - unsigned int fragsize; -@@ -422,60 +423,22 @@ static int ip_frag_queue(struct ipq *qp, - } - - found: -- /* We found where to put this one. Check for overlap with -- * preceding fragment, and, if needed, align things so that -- * any overlaps are eliminated. -+ /* RFC5722, Section 4, amended by Errata ID : 3089 -+ * When reassembling an IPv6 datagram, if -+ * one or more its constituent fragments is determined to be an -+ * overlapping fragment, the entire datagram (and any constituent -+ * fragments) MUST be silently discarded. -+ * -+ * We do the same here for IPv4. - */ -- if (prev) { -- int i = (FRAG_CB(prev)->offset + prev->len) - offset; -- -- if (i > 0) { -- offset += i; -- err = -EINVAL; -- if (end <= offset) -- goto err; -- err = -ENOMEM; -- if (!pskb_pull(skb, i)) -- goto err; -- if (skb->ip_summed != CHECKSUM_UNNECESSARY) -- skb->ip_summed = CHECKSUM_NONE; -- } -- } -- -- err = -ENOMEM; -- -- while (next && FRAG_CB(next)->offset < end) { -- int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ -- -- if (i < next->len) { -- /* Eat head of the next overlapped fragment -- * and leave the loop. The next ones cannot overlap. -- */ -- if (!pskb_pull(next, i)) -- goto err; -- FRAG_CB(next)->offset += i; -- qp->q.meat -= i; -- if (next->ip_summed != CHECKSUM_UNNECESSARY) -- next->ip_summed = CHECKSUM_NONE; -- break; -- } else { -- struct sk_buff *free_it = next; -- -- /* Old fragment is completely overridden with -- * new one drop it. -- */ -- next = next->next; -- -- if (prev) -- prev->next = next; -- else -- qp->q.fragments = next; -- -- qp->q.meat -= free_it->len; -- sub_frag_mem_limit(qp->q.net, free_it->truesize); -- kfree_skb(free_it); -- } -- } -+ /* Is there an overlap with the previous fragment? */ -+ if (prev && -+ (FRAG_CB(prev)->offset + prev->len) > offset) -+ goto discard_qp; -+ -+ /* Is there an overlap with the next fragment? 
*/ -+ if (next && FRAG_CB(next)->offset < end) -+ goto discard_qp; - - FRAG_CB(skb)->offset = offset; - -@@ -522,6 +485,10 @@ found: - skb_dst_drop(skb); - return -EINPROGRESS; - -+discard_qp: -+ ipq_kill(qp); -+ err = -EINVAL; -+ IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); - err: - kfree_skb(skb); - return err; ---- a/net/ipv4/proc.c -+++ b/net/ipv4/proc.c -@@ -132,6 +132,7 @@ static const struct snmp_mib snmp4_ipext - SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), - SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), - SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), -+ SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), - SNMP_MIB_SENTINEL - }; - diff --git a/queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch b/queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch deleted file mode 100644 index f4c8b521f84..00000000000 --- a/queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch +++ /dev/null @@ -1,110 +0,0 @@ -From 5d407b071dc369c26a38398326ee2be53651cfe4 Mon Sep 17 00:00:00 2001 -From: Taehee Yoo -Date: Mon, 10 Sep 2018 02:47:05 +0900 -Subject: ip: frags: fix crash in ip_do_fragment() - -From: Taehee Yoo - -commit 5d407b071dc369c26a38398326ee2be53651cfe4 upstream. - -A kernel crash occurrs when defragmented packet is fragmented -in ip_do_fragment(). -In defragment routine, skb_orphan() is called and -skb->ip_defrag_offset is set. but skb->sk and -skb->ip_defrag_offset are same union member. so that -frag->sk is not NULL. -Hence crash occurrs in skb->sk check routine in ip_do_fragment() when -defragmented packet is fragmented. - -test commands: - %iptables -t nat -I POSTROUTING -j MASQUERADE - %hping3 192.168.4.2 -s 1000 -p 2000 -d 60000 - -splat looks like: -[ 261.069429] kernel BUG at net/ipv4/ip_output.c:636! -[ 261.075753] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI -[ 261.083854] CPU: 1 PID: 1349 Comm: hping3 Not tainted 4.19.0-rc2+ #3 -[ 261.100977] RIP: 0010:ip_do_fragment+0x1613/0x2600 -[ 261.106945] Code: e8 e2 38 e3 fe 4c 8b 44 24 18 48 8b 74 24 08 e9 92 f6 ff ff 80 3c 02 00 0f 85 da 07 00 00 48 8b b5 d0 00 00 00 e9 25 f6 ff ff <0f> 0b 0f 0b 44 8b 54 24 58 4c 8b 4c 24 18 4c 8b 5c 24 60 4c 8b 6c -[ 261.127015] RSP: 0018:ffff8801031cf2c0 EFLAGS: 00010202 -[ 261.134156] RAX: 1ffff1002297537b RBX: ffffed0020639e6e RCX: 0000000000000004 -[ 261.142156] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880114ba9bd8 -[ 261.150157] RBP: ffff880114ba8a40 R08: ffffed0022975395 R09: ffffed0022975395 -[ 261.158157] R10: 0000000000000001 R11: ffffed0022975394 R12: ffff880114ba9ca4 -[ 261.166159] R13: 0000000000000010 R14: ffff880114ba9bc0 R15: dffffc0000000000 -[ 261.174169] FS: 00007fbae2199700(0000) GS:ffff88011b400000(0000) knlGS:0000000000000000 -[ 261.183012] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 -[ 261.189013] CR2: 00005579244fe000 CR3: 0000000119bf4000 CR4: 00000000001006e0 -[ 261.198158] Call Trace: -[ 261.199018] ? dst_output+0x180/0x180 -[ 261.205011] ? save_trace+0x300/0x300 -[ 261.209018] ? ip_copy_metadata+0xb00/0xb00 -[ 261.213034] ? sched_clock_local+0xd4/0x140 -[ 261.218158] ? kill_l4proto+0x120/0x120 [nf_conntrack] -[ 261.223014] ? rt_cpu_seq_stop+0x10/0x10 -[ 261.227014] ? find_held_lock+0x39/0x1c0 -[ 261.233008] ip_finish_output+0x51d/0xb50 -[ 261.237006] ? ip_fragment.constprop.56+0x220/0x220 -[ 261.243011] ? nf_ct_l4proto_register_one+0x5b0/0x5b0 [nf_conntrack] -[ 261.250152] ? rcu_is_watching+0x77/0x120 -[ 261.255010] ? nf_nat_ipv4_out+0x1e/0x2b0 [nf_nat_ipv4] -[ 261.261033] ? 
nf_hook_slow+0xb1/0x160 -[ 261.265007] ip_output+0x1c7/0x710 -[ 261.269005] ? ip_mc_output+0x13f0/0x13f0 -[ 261.273002] ? __local_bh_enable_ip+0xe9/0x1b0 -[ 261.278152] ? ip_fragment.constprop.56+0x220/0x220 -[ 261.282996] ? nf_hook_slow+0xb1/0x160 -[ 261.287007] raw_sendmsg+0x21f9/0x4420 -[ 261.291008] ? dst_output+0x180/0x180 -[ 261.297003] ? sched_clock_cpu+0x126/0x170 -[ 261.301003] ? find_held_lock+0x39/0x1c0 -[ 261.306155] ? stop_critical_timings+0x420/0x420 -[ 261.311004] ? check_flags.part.36+0x450/0x450 -[ 261.315005] ? _raw_spin_unlock_irq+0x29/0x40 -[ 261.320995] ? _raw_spin_unlock_irq+0x29/0x40 -[ 261.326142] ? cyc2ns_read_end+0x10/0x10 -[ 261.330139] ? raw_bind+0x280/0x280 -[ 261.334138] ? sched_clock_cpu+0x126/0x170 -[ 261.338995] ? check_flags.part.36+0x450/0x450 -[ 261.342991] ? __lock_acquire+0x4500/0x4500 -[ 261.348994] ? inet_sendmsg+0x11c/0x500 -[ 261.352989] ? dst_output+0x180/0x180 -[ 261.357012] inet_sendmsg+0x11c/0x500 -[ ... ] - -v2: - - clear skb->sk at reassembly routine.(Eric Dumarzet) - -Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.") -Suggested-by: Eric Dumazet -Signed-off-by: Taehee Yoo -Reviewed-by: Eric Dumazet -Signed-off-by: David S. Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman - ---- - net/ipv4/ip_fragment.c | 1 + - net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + - 2 files changed, 2 insertions(+) - ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -661,6 +661,7 @@ static int ip_frag_reasm(struct ipq *qp, - nextp = &fp->next; - fp->prev = NULL; - memset(&fp->rbnode, 0, sizeof(fp->rbnode)); -+ fp->sk = NULL; - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) ---- a/net/ipv6/netfilter/nf_conntrack_reasm.c -+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c -@@ -454,6 +454,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; -+ fp->sk = NULL; - } - sub_frag_mem_limit(fq->q.net, head->truesize); - diff --git a/queue-4.4/ip-process-in-order-fragments-efficiently.patch b/queue-4.4/ip-process-in-order-fragments-efficiently.patch deleted file mode 100644 index 33b5def8169..00000000000 --- a/queue-4.4/ip-process-in-order-fragments-efficiently.patch +++ /dev/null @@ -1,268 +0,0 @@ -From a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c Mon Sep 17 00:00:00 2001 -From: Peter Oskolkov -Date: Sat, 11 Aug 2018 20:27:25 +0000 -Subject: ip: process in-order fragments efficiently - -From: Peter Oskolkov - -commit a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c upstream. - -This patch changes the runtime behavior of IP defrag queue: -incoming in-order fragments are added to the end of the current -list/"run" of in-order fragments at the tail. - -On some workloads, UDP stream performance is substantially improved: - -RX: ./udp_stream -F 10 -T 2 -l 60 -TX: ./udp_stream -c -H -F 10 -T 5 -l 60 - -with this patchset applied on a 10Gbps receiver: - - throughput=9524.18 - throughput_units=Mbit/s - -upstream (net-next): - - throughput=4608.93 - throughput_units=Mbit/s - -Reported-by: Willem de Bruijn -Signed-off-by: Peter Oskolkov -Cc: Eric Dumazet -Cc: Florian Westphal -Signed-off-by: David S. 
Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman ---- - net/ipv4/inet_fragment.c | 2 - net/ipv4/ip_fragment.c | 110 +++++++++++++++++++++++++++++------------------ - 2 files changed, 70 insertions(+), 42 deletions(-) - ---- a/net/ipv4/inet_fragment.c -+++ b/net/ipv4/inet_fragment.c -@@ -315,7 +315,7 @@ void inet_frag_destroy(struct inet_frag_ - fp = xp; - } while (fp); - } else { -- sum_truesize = skb_rbtree_purge(&q->rb_fragments); -+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); - } - sum = sum_truesize + f->qsize; - ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -139,8 +139,8 @@ int ip_frag_mem(struct net *net) - return sum_frag_mem_limit(&net->ipv4.frags); - } - --static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, -- struct net_device *dev); -+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, -+ struct sk_buff *prev_tail, struct net_device *dev); - - struct ip4_create_arg { - struct iphdr *iph; -@@ -271,7 +271,12 @@ static void ip_expire(unsigned long arg) - head = skb_rb_first(&qp->q.rb_fragments); - if (!head) - goto out; -- rb_erase(&head->rbnode, &qp->q.rb_fragments); -+ if (FRAG_CB(head)->next_frag) -+ rb_replace_node(&head->rbnode, -+ &FRAG_CB(head)->next_frag->rbnode, -+ &qp->q.rb_fragments); -+ else -+ rb_erase(&head->rbnode, &qp->q.rb_fragments); - memset(&head->rbnode, 0, sizeof(head->rbnode)); - barrier(); - } -@@ -373,7 +378,7 @@ static int ip_frag_reinit(struct ipq *qp - return -ETIMEDOUT; - } - -- sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments); -+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); - sub_frag_mem_limit(qp->q.net, sum_truesize); - - qp->q.flags = 0; -@@ -382,6 +387,7 @@ static int ip_frag_reinit(struct ipq *qp - qp->q.fragments = NULL; - qp->q.rb_fragments = RB_ROOT; - qp->q.fragments_tail = NULL; -+ qp->q.last_run_head = NULL; - qp->iif = 0; - qp->ecn = 0; - -@@ -393,7 +399,7 @@ static int ip_frag_queue(struct ipq *qp, - { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct rb_node **rbn, *parent; -- struct sk_buff *skb1; -+ struct sk_buff *skb1, *prev_tail; - struct net_device *dev; - unsigned int fragsize; - int flags, offset; -@@ -471,38 +477,41 @@ static int ip_frag_queue(struct ipq *qp, - */ - - /* Find out where to put this fragment. */ -- skb1 = qp->q.fragments_tail; -- if (!skb1) { -- /* This is the first fragment we've received. */ -- rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node); -- qp->q.fragments_tail = skb; -- } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) { -- /* This is the common/special case: skb goes to the end. */ -+ prev_tail = qp->q.fragments_tail; -+ if (!prev_tail) -+ ip4_frag_create_run(&qp->q, skb); /* First fragment. */ -+ else if (FRAG_CB(prev_tail)->offset + prev_tail->len < end) { -+ /* This is the common case: skb goes to the end. */ - /* Detect and discard overlaps. */ -- if (offset < (FRAG_CB(skb1)->offset + skb1->len)) -+ if (offset < FRAG_CB(prev_tail)->offset + prev_tail->len) - goto discard_qp; -- /* Insert after skb1. */ -- rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right); -- qp->q.fragments_tail = skb; -+ if (offset == FRAG_CB(prev_tail)->offset + prev_tail->len) -+ ip4_frag_append_to_last_run(&qp->q, skb); -+ else -+ ip4_frag_create_run(&qp->q, skb); - } else { -- /* Binary search. Note that skb can become the first fragment, but -- * not the last (covered above). */ -+ /* Binary search. 
Note that skb can become the first fragment, -+ * but not the last (covered above). -+ */ - rbn = &qp->q.rb_fragments.rb_node; - do { - parent = *rbn; - skb1 = rb_to_skb(parent); - if (end <= FRAG_CB(skb1)->offset) - rbn = &parent->rb_left; -- else if (offset >= FRAG_CB(skb1)->offset + skb1->len) -+ else if (offset >= FRAG_CB(skb1)->offset + -+ FRAG_CB(skb1)->frag_run_len) - rbn = &parent->rb_right; - else /* Found an overlap with skb1. */ - goto discard_qp; - } while (*rbn); - /* Here we have parent properly set, and rbn pointing to -- * one of its NULL left/right children. Insert skb. */ -+ * one of its NULL left/right children. Insert skb. -+ */ -+ ip4_frag_init_run(skb); - rb_link_node(&skb->rbnode, parent, rbn); -+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - } -- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - - if (dev) { - qp->iif = dev->ifindex; -@@ -531,7 +540,7 @@ static int ip_frag_queue(struct ipq *qp, - unsigned long orefdst = skb->_skb_refdst; - - skb->_skb_refdst = 0UL; -- err = ip_frag_reasm(qp, skb, dev); -+ err = ip_frag_reasm(qp, skb, prev_tail, dev); - skb->_skb_refdst = orefdst; - return err; - } -@@ -550,7 +559,7 @@ err: - - /* Build a new IP datagram from all its fragments. */ - static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, -- struct net_device *dev) -+ struct sk_buff *prev_tail, struct net_device *dev) - { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct iphdr *iph; -@@ -575,10 +584,16 @@ static int ip_frag_reasm(struct ipq *qp, - if (!fp) - goto out_nomem; - -- rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments); -+ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; -+ if (RB_EMPTY_NODE(&skb->rbnode)) -+ FRAG_CB(prev_tail)->next_frag = fp; -+ else -+ rb_replace_node(&skb->rbnode, &fp->rbnode, -+ &qp->q.rb_fragments); - if (qp->q.fragments_tail == skb) - qp->q.fragments_tail = fp; - skb_morph(skb, head); -+ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; - rb_replace_node(&head->rbnode, &skb->rbnode, - &qp->q.rb_fragments); - consume_skb(head); -@@ -614,7 +629,7 @@ static int ip_frag_reasm(struct ipq *qp, - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; -- skb->truesize += clone->truesize; -+ head->truesize += clone->truesize; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(qp->q.net, clone->truesize); -@@ -627,24 +642,36 @@ static int ip_frag_reasm(struct ipq *qp, - skb_push(head, head->data - skb_network_header(head)); - - /* Traverse the tree in order, to build frag_list. */ -+ fp = FRAG_CB(head)->next_frag; - rbn = rb_next(&head->rbnode); - rb_erase(&head->rbnode, &qp->q.rb_fragments); -- while (rbn) { -- struct rb_node *rbnext = rb_next(rbn); -- fp = rb_to_skb(rbn); -- rb_erase(rbn, &qp->q.rb_fragments); -- rbn = rbnext; -- *nextp = fp; -- nextp = &fp->next; -- fp->prev = NULL; -- memset(&fp->rbnode, 0, sizeof(fp->rbnode)); -- head->data_len += fp->len; -- head->len += fp->len; -- if (head->ip_summed != fp->ip_summed) -- head->ip_summed = CHECKSUM_NONE; -- else if (head->ip_summed == CHECKSUM_COMPLETE) -- head->csum = csum_add(head->csum, fp->csum); -- head->truesize += fp->truesize; -+ while (rbn || fp) { -+ /* fp points to the next sk_buff in the current run; -+ * rbn points to the next run. -+ */ -+ /* Go through the current run. 
*/ -+ while (fp) { -+ *nextp = fp; -+ nextp = &fp->next; -+ fp->prev = NULL; -+ memset(&fp->rbnode, 0, sizeof(fp->rbnode)); -+ head->data_len += fp->len; -+ head->len += fp->len; -+ if (head->ip_summed != fp->ip_summed) -+ head->ip_summed = CHECKSUM_NONE; -+ else if (head->ip_summed == CHECKSUM_COMPLETE) -+ head->csum = csum_add(head->csum, fp->csum); -+ head->truesize += fp->truesize; -+ fp = FRAG_CB(fp)->next_frag; -+ } -+ /* Move to the next run. */ -+ if (rbn) { -+ struct rb_node *rbnext = rb_next(rbn); -+ -+ fp = rb_to_skb(rbn); -+ rb_erase(rbn, &qp->q.rb_fragments); -+ rbn = rbnext; -+ } - } - sub_frag_mem_limit(qp->q.net, head->truesize); - -@@ -680,6 +707,7 @@ static int ip_frag_reasm(struct ipq *qp, - qp->q.fragments = NULL; - qp->q.rb_fragments = RB_ROOT; - qp->q.fragments_tail = NULL; -+ qp->q.last_run_head = NULL; - return 0; - - out_nomem: diff --git a/queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch b/queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch deleted file mode 100644 index cd5b3a0d01b..00000000000 --- a/queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch +++ /dev/null @@ -1,462 +0,0 @@ -From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001 -From: Peter Oskolkov -Date: Thu, 2 Aug 2018 23:34:39 +0000 -Subject: ip: use rb trees for IP frag queue. - -From: Peter Oskolkov - -commit fa0f527358bd900ef92f925878ed6bfbd51305cc upstream. - -Similar to TCP OOO RX queue, it makes sense to use rb trees to store -IP fragments, so that OOO fragments are inserted faster. - -Tested: - -- a follow-up patch contains a rather comprehensive ip defrag - self-test (functional) -- ran neper `udp_stream -c -H -F 100 -l 300 -T 20`: - netstat --statistics - Ip: - 282078937 total packets received - 0 forwarded - 0 incoming packets discarded - 946760 incoming packets delivered - 18743456 requests sent out - 101 fragments dropped after timeout - 282077129 reassemblies required - 944952 packets reassembled ok - 262734239 packet reassembles failed - (The numbers/stats above are somewhat better re: - reassemblies vs a kernel without this patchset. More - comprehensive performance testing TBD). - -Reported-by: Jann Horn -Reported-by: Juha-Matti Tilli -Suggested-by: Eric Dumazet -Signed-off-by: Peter Oskolkov -Signed-off-by: Eric Dumazet -Cc: Florian Westphal -Signed-off-by: David S. Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman - ---- - include/linux/skbuff.h | 2 - include/net/inet_frag.h | 3 - net/ipv4/inet_fragment.c | 14 +- - net/ipv4/ip_fragment.c | 190 +++++++++++++++++--------------- - net/ipv6/netfilter/nf_conntrack_reasm.c | 1 - net/ipv6/reassembly.c | 1 - 6 files changed, 120 insertions(+), 91 deletions(-) - ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -556,7 +556,7 @@ struct sk_buff { - struct skb_mstamp skb_mstamp; - }; - }; -- struct rb_node rbnode; /* used in netem & tcp stack */ -+ struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ - }; - struct sock *sk; - struct net_device *dev; ---- a/include/net/inet_frag.h -+++ b/include/net/inet_frag.h -@@ -45,7 +45,8 @@ struct inet_frag_queue { - struct timer_list timer; - struct hlist_node list; - atomic_t refcnt; -- struct sk_buff *fragments; -+ struct sk_buff *fragments; /* Used in IPv6. */ -+ struct rb_root rb_fragments; /* Used in IPv4. */ - struct sk_buff *fragments_tail; - ktime_t stamp; - int len; ---- a/net/ipv4/inet_fragment.c -+++ b/net/ipv4/inet_fragment.c -@@ -306,12 +306,16 @@ void inet_frag_destroy(struct inet_frag_ - /* Release all fragment data. 
*/ - fp = q->fragments; - nf = q->net; -- while (fp) { -- struct sk_buff *xp = fp->next; -+ if (fp) { -+ do { -+ struct sk_buff *xp = fp->next; - -- sum_truesize += fp->truesize; -- frag_kfree_skb(nf, f, fp); -- fp = xp; -+ sum_truesize += fp->truesize; -+ kfree_skb(fp); -+ fp = xp; -+ } while (fp); -+ } else { -+ sum_truesize = skb_rbtree_purge(&q->rb_fragments); - } - sum = sum_truesize + f->qsize; - ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -194,7 +194,7 @@ static bool frag_expire_skip_icmp(u32 us - */ - static void ip_expire(unsigned long arg) - { -- struct sk_buff *clone, *head; -+ struct sk_buff *head = NULL; - const struct iphdr *iph; - struct net *net; - struct ipq *qp; -@@ -211,14 +211,31 @@ static void ip_expire(unsigned long arg) - - ipq_kill(qp); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); -- -- head = qp->q.fragments; -- - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); - -- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) -+ if (!qp->q.flags & INET_FRAG_FIRST_IN) - goto out; - -+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we -+ * pull the head out of the tree in order to be able to -+ * deal with head->dev. -+ */ -+ if (qp->q.fragments) { -+ head = qp->q.fragments; -+ qp->q.fragments = head->next; -+ } else { -+ head = skb_rb_first(&qp->q.rb_fragments); -+ if (!head) -+ goto out; -+ rb_erase(&head->rbnode, &qp->q.rb_fragments); -+ memset(&head->rbnode, 0, sizeof(head->rbnode)); -+ barrier(); -+ } -+ if (head == qp->q.fragments_tail) -+ qp->q.fragments_tail = NULL; -+ -+ sub_frag_mem_limit(qp->q.net, head->truesize); -+ - head->dev = dev_get_by_index_rcu(net, qp->iif); - if (!head->dev) - goto out; -@@ -237,20 +254,17 @@ static void ip_expire(unsigned long arg) - (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out; - -- clone = skb_clone(head, GFP_ATOMIC); -- - /* Send an ICMP "Fragment Reassembly Timeout" message. 
*/ -- if (clone) { -- spin_unlock(&qp->q.lock); -- icmp_send(clone, ICMP_TIME_EXCEEDED, -- ICMP_EXC_FRAGTIME, 0); -- consume_skb(clone); -- goto out_rcu_unlock; -- } -+ spin_unlock(&qp->q.lock); -+ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); -+ goto out_rcu_unlock; -+ - out: - spin_unlock(&qp->q.lock); - out_rcu_unlock: - rcu_read_unlock(); -+ if (head) -+ kfree_skb(head); - ipq_put(qp); - } - -@@ -294,7 +308,7 @@ static int ip_frag_too_far(struct ipq *q - end = atomic_inc_return(&peer->rid); - qp->rid = end; - -- rc = qp->q.fragments && (end - start) > max; -+ rc = qp->q.fragments_tail && (end - start) > max; - - if (rc) { - struct net *net; -@@ -308,7 +322,6 @@ static int ip_frag_too_far(struct ipq *q - - static int ip_frag_reinit(struct ipq *qp) - { -- struct sk_buff *fp; - unsigned int sum_truesize = 0; - - if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { -@@ -316,20 +329,14 @@ static int ip_frag_reinit(struct ipq *qp - return -ETIMEDOUT; - } - -- fp = qp->q.fragments; -- do { -- struct sk_buff *xp = fp->next; -- -- sum_truesize += fp->truesize; -- kfree_skb(fp); -- fp = xp; -- } while (fp); -+ sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments); - sub_frag_mem_limit(qp->q.net, sum_truesize); - - qp->q.flags = 0; - qp->q.len = 0; - qp->q.meat = 0; - qp->q.fragments = NULL; -+ qp->q.rb_fragments = RB_ROOT; - qp->q.fragments_tail = NULL; - qp->iif = 0; - qp->ecn = 0; -@@ -341,7 +348,8 @@ static int ip_frag_reinit(struct ipq *qp - static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) - { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); -- struct sk_buff *prev, *next; -+ struct rb_node **rbn, *parent; -+ struct sk_buff *skb1; - struct net_device *dev; - unsigned int fragsize; - int flags, offset; -@@ -404,56 +412,60 @@ static int ip_frag_queue(struct ipq *qp, - if (err) - goto err; - -- /* Find out which fragments are in front and at the back of us -- * in the chain of fragments so far. We must know where to put -- * this fragment, right? -- */ -- prev = qp->q.fragments_tail; -- if (!prev || FRAG_CB(prev)->offset < offset) { -- next = NULL; -- goto found; -- } -- prev = NULL; -- for (next = qp->q.fragments; next != NULL; next = next->next) { -- if (FRAG_CB(next)->offset >= offset) -- break; /* bingo! */ -- prev = next; -- } -+ /* Note : skb->rbnode and skb->dev share the same location. */ -+ dev = skb->dev; -+ /* Makes sure compiler wont do silly aliasing games */ -+ barrier(); - --found: - /* RFC5722, Section 4, amended by Errata ID : 3089 - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments) MUST be silently discarded. - * -- * We do the same here for IPv4. -+ * We do the same here for IPv4 (and increment an snmp counter). - */ -- /* Is there an overlap with the previous fragment? */ -- if (prev && -- (FRAG_CB(prev)->offset + prev->len) > offset) -- goto discard_qp; -- -- /* Is there an overlap with the next fragment? */ -- if (next && FRAG_CB(next)->offset < end) -- goto discard_qp; -- -- FRAG_CB(skb)->offset = offset; - -- /* Insert this fragment in the chain of fragments. */ -- skb->next = next; -- if (!next) -+ /* Find out where to put this fragment. */ -+ skb1 = qp->q.fragments_tail; -+ if (!skb1) { -+ /* This is the first fragment we've received. 
*/ -+ rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node); -+ qp->q.fragments_tail = skb; -+ } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) { -+ /* This is the common/special case: skb goes to the end. */ -+ /* Detect and discard overlaps. */ -+ if (offset < (FRAG_CB(skb1)->offset + skb1->len)) -+ goto discard_qp; -+ /* Insert after skb1. */ -+ rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right); - qp->q.fragments_tail = skb; -- if (prev) -- prev->next = skb; -- else -- qp->q.fragments = skb; -+ } else { -+ /* Binary search. Note that skb can become the first fragment, but -+ * not the last (covered above). */ -+ rbn = &qp->q.rb_fragments.rb_node; -+ do { -+ parent = *rbn; -+ skb1 = rb_to_skb(parent); -+ if (end <= FRAG_CB(skb1)->offset) -+ rbn = &parent->rb_left; -+ else if (offset >= FRAG_CB(skb1)->offset + skb1->len) -+ rbn = &parent->rb_right; -+ else /* Found an overlap with skb1. */ -+ goto discard_qp; -+ } while (*rbn); -+ /* Here we have parent properly set, and rbn pointing to -+ * one of its NULL left/right children. Insert skb. */ -+ rb_link_node(&skb->rbnode, parent, rbn); -+ } -+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - -- dev = skb->dev; - if (dev) { - qp->iif = dev->ifindex; - skb->dev = NULL; - } -+ FRAG_CB(skb)->offset = offset; -+ - qp->q.stamp = skb->tstamp; - qp->q.meat += skb->len; - qp->ecn |= ecn; -@@ -475,7 +487,7 @@ found: - unsigned long orefdst = skb->_skb_refdst; - - skb->_skb_refdst = 0UL; -- err = ip_frag_reasm(qp, prev, dev); -+ err = ip_frag_reasm(qp, skb, dev); - skb->_skb_refdst = orefdst; - return err; - } -@@ -492,15 +504,15 @@ err: - return err; - } - -- - /* Build a new IP datagram from all its fragments. */ -- --static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, -+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct net_device *dev) - { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct iphdr *iph; -- struct sk_buff *fp, *head = qp->q.fragments; -+ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); -+ struct sk_buff **nextp; /* To build frag_list. */ -+ struct rb_node *rbn; - int len; - int ihlen; - int err; -@@ -514,25 +526,21 @@ static int ip_frag_reasm(struct ipq *qp, - goto out_fail; - } - /* Make the one we just received the head. */ -- if (prev) { -- head = prev->next; -- fp = skb_clone(head, GFP_ATOMIC); -+ if (head != skb) { -+ fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - goto out_nomem; - -- fp->next = head->next; -- if (!fp->next) -+ rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments); -+ if (qp->q.fragments_tail == skb) - qp->q.fragments_tail = fp; -- prev->next = fp; -- -- skb_morph(head, qp->q.fragments); -- head->next = qp->q.fragments->next; -- -- consume_skb(qp->q.fragments); -- qp->q.fragments = head; -+ skb_morph(skb, head); -+ rb_replace_node(&head->rbnode, &skb->rbnode, -+ &qp->q.rb_fragments); -+ consume_skb(head); -+ head = skb; - } - -- WARN_ON(!head); - WARN_ON(FRAG_CB(head)->offset != 0); - - /* Allocate a new buffer for the datagram. 
*/ -@@ -557,24 +565,35 @@ static int ip_frag_reasm(struct ipq *qp, - clone = alloc_skb(0, GFP_ATOMIC); - if (!clone) - goto out_nomem; -- clone->next = head->next; -- head->next = clone; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; -- head->data_len -= clone->len; -- head->len -= clone->len; -+ skb->truesize += clone->truesize; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(qp->q.net, clone->truesize); -+ skb_shinfo(head)->frag_list = clone; -+ nextp = &clone->next; -+ } else { -+ nextp = &skb_shinfo(head)->frag_list; - } - -- skb_shinfo(head)->frag_list = head->next; - skb_push(head, head->data - skb_network_header(head)); - -- for (fp=head->next; fp; fp = fp->next) { -+ /* Traverse the tree in order, to build frag_list. */ -+ rbn = rb_next(&head->rbnode); -+ rb_erase(&head->rbnode, &qp->q.rb_fragments); -+ while (rbn) { -+ struct rb_node *rbnext = rb_next(rbn); -+ fp = rb_to_skb(rbn); -+ rb_erase(rbn, &qp->q.rb_fragments); -+ rbn = rbnext; -+ *nextp = fp; -+ nextp = &fp->next; -+ fp->prev = NULL; -+ memset(&fp->rbnode, 0, sizeof(fp->rbnode)); - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) -@@ -585,7 +604,9 @@ static int ip_frag_reasm(struct ipq *qp, - } - sub_frag_mem_limit(qp->q.net, head->truesize); - -+ *nextp = NULL; - head->next = NULL; -+ head->prev = NULL; - head->dev = dev; - head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); -@@ -613,6 +634,7 @@ static int ip_frag_reasm(struct ipq *qp, - - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); - qp->q.fragments = NULL; -+ qp->q.rb_fragments = RB_ROOT; - qp->q.fragments_tail = NULL; - return 0; - ---- a/net/ipv6/netfilter/nf_conntrack_reasm.c -+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c -@@ -472,6 +472,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, - head->csum); - - fq->q.fragments = NULL; -+ fq->q.rb_fragments = RB_ROOT; - fq->q.fragments_tail = NULL; - - /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ ---- a/net/ipv6/reassembly.c -+++ b/net/ipv6/reassembly.c -@@ -499,6 +499,7 @@ static int ip6_frag_reasm(struct frag_qu - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); - rcu_read_unlock(); - fq->q.fragments = NULL; -+ fq->q.rb_fragments = RB_ROOT; - fq->q.fragments_tail = NULL; - return 1; - diff --git a/queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch b/queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch deleted file mode 100644 index a68b23b6703..00000000000 --- a/queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch +++ /dev/null @@ -1,33 +0,0 @@ -From 70837ffe3085c9a91488b52ca13ac84424da1042 Mon Sep 17 00:00:00 2001 -From: Dan Carpenter -Date: Mon, 6 Aug 2018 22:17:35 +0300 -Subject: ipv4: frags: precedence bug in ip_expire() - -From: Dan Carpenter - -commit 70837ffe3085c9a91488b52ca13ac84424da1042 upstream. - -We accidentally removed the parentheses here, but they are required -because '!' has higher precedence than '&'. - -Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.") -Signed-off-by: Dan Carpenter -Signed-off-by: David S. 
Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman - ---- - net/ipv4/ip_fragment.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -257,7 +257,7 @@ static void ip_expire(unsigned long arg) - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); - -- if (!qp->q.flags & INET_FRAG_FIRST_IN) -+ if (!(qp->q.flags & INET_FRAG_FIRST_IN)) - goto out; - - /* sk_buff::dev and sk_buff::rbnode are unionized. So we diff --git a/queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch b/queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch deleted file mode 100644 index a054cde03f7..00000000000 --- a/queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 Mon Sep 17 00:00:00 2001 -From: Florian Westphal -Date: Fri, 3 Aug 2018 02:22:20 +0200 -Subject: ipv6: defrag: drop non-last frags smaller than min mtu - -From: Florian Westphal - -commit 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 upstream. - -don't bother with pathological cases, they only waste cycles. -IPv6 requires a minimum MTU of 1280 so we should never see fragments -smaller than this (except last frag). - -v3: don't use awkward "-offset + len" -v2: drop IPv4 part, which added same check w. IPV4_MIN_MTU (68). - There were concerns that there could be even smaller frags - generated by intermediate nodes, e.g. on radio networks. - -Cc: Peter Oskolkov -Cc: Eric Dumazet -Signed-off-by: Florian Westphal -Signed-off-by: David S. Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman ---- - net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++++ - net/ipv6/reassembly.c | 4 ++++ - 2 files changed, 8 insertions(+) - ---- a/net/ipv6/netfilter/nf_conntrack_reasm.c -+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c -@@ -602,6 +602,10 @@ struct sk_buff *nf_ct_frag6_gather(struc - hdr = ipv6_hdr(clone); - fhdr = (struct frag_hdr *)skb_transport_header(clone); - -+ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && -+ fhdr->frag_off & htons(IP6_MF)) -+ goto ret_orig; -+ - skb_orphan(skb); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); ---- a/net/ipv6/reassembly.c -+++ b/net/ipv6/reassembly.c -@@ -549,6 +549,10 @@ static int ipv6_frag_rcv(struct sk_buff - return 1; - } - -+ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && -+ fhdr->frag_off & htons(IP6_MF)) -+ goto fail_hdr; -+ - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); - if (fq) { diff --git a/queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch b/queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch deleted file mode 100644 index 6094b9c9f64..00000000000 --- a/queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch +++ /dev/null @@ -1,83 +0,0 @@ -From ade446403bfb79d3528d56071a84b15351a139ad Mon Sep 17 00:00:00 2001 -From: Michal Kubecek -Date: Thu, 13 Dec 2018 17:23:32 +0100 -Subject: net: ipv4: do not handle duplicate fragments as overlapping - -From: Michal Kubecek - -commit ade446403bfb79d3528d56071a84b15351a139ad upstream. - -Since commit 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping -segments.") IPv4 reassembly code drops the whole queue whenever an -overlapping fragment is received. 
However, the test is written in a way -which detects duplicate fragments as overlapping so that in environments -with many duplicate packets, fragmented packets may be undeliverable. - -Add an extra test and for (potentially) duplicate fragment, only drop the -new fragment rather than the whole queue. Only starting offset and length -are checked, not the contents of the fragments as that would be too -expensive. For similar reason, linear list ("run") of a rbtree node is not -iterated, we only check if the new fragment is a subset of the interval -covered by existing consecutive fragments. - -v2: instead of an exact check iterating through linear list of an rbtree -node, only check if the new fragment is subset of the "run" (suggested -by Eric Dumazet) - -Fixes: 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping segments.") -Signed-off-by: Michal Kubecek -Signed-off-by: David S. Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman - ---- - net/ipv4/ip_fragment.c | 18 ++++++++++++------ - 1 file changed, 12 insertions(+), 6 deletions(-) - ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -400,10 +400,10 @@ static int ip_frag_queue(struct ipq *qp, - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct rb_node **rbn, *parent; - struct sk_buff *skb1, *prev_tail; -+ int ihl, end, skb1_run_end; - struct net_device *dev; - unsigned int fragsize; - int flags, offset; -- int ihl, end; - int err = -ENOENT; - u8 ecn; - -@@ -473,7 +473,9 @@ static int ip_frag_queue(struct ipq *qp, - * overlapping fragment, the entire datagram (and any constituent - * fragments) MUST be silently discarded. - * -- * We do the same here for IPv4 (and increment an snmp counter). -+ * We do the same here for IPv4 (and increment an snmp counter) but -+ * we do not want to drop the whole queue in response to a duplicate -+ * fragment. - */ - - /* Find out where to put this fragment. */ -@@ -497,13 +499,17 @@ static int ip_frag_queue(struct ipq *qp, - do { - parent = *rbn; - skb1 = rb_to_skb(parent); -+ skb1_run_end = FRAG_CB(skb1)->offset + -+ FRAG_CB(skb1)->frag_run_len; - if (end <= FRAG_CB(skb1)->offset) - rbn = &parent->rb_left; -- else if (offset >= FRAG_CB(skb1)->offset + -- FRAG_CB(skb1)->frag_run_len) -+ else if (offset >= skb1_run_end) - rbn = &parent->rb_right; -- else /* Found an overlap with skb1. */ -- goto discard_qp; -+ else if (offset >= FRAG_CB(skb1)->offset && -+ end <= skb1_run_end) -+ goto err; /* No new data, potential duplicate */ -+ else -+ goto discard_qp; /* Found an overlap */ - } while (*rbn); - /* Here we have parent properly set, and rbn pointing to - * one of its NULL left/right children. Insert skb. diff --git a/queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch b/queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch deleted file mode 100644 index 17fe45cc0f1..00000000000 --- a/queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch +++ /dev/null @@ -1,66 +0,0 @@ -From 385114dec8a49b5e5945e77ba7de6356106713f4 Mon Sep 17 00:00:00 2001 -From: Peter Oskolkov -Date: Thu, 2 Aug 2018 23:34:38 +0000 -Subject: net: modify skb_rbtree_purge to return the truesize of all purged skbs. - -From: Peter Oskolkov - -commit 385114dec8a49b5e5945e77ba7de6356106713f4 upstream. - -Tested: see the next patch is the series. - -Suggested-by: Eric Dumazet -Signed-off-by: Peter Oskolkov -Signed-off-by: Eric Dumazet -Cc: Florian Westphal -Signed-off-by: David S. 
Miller -Signed-off-by: Mao Wenan -Signed-off-by: Greg Kroah-Hartman - ---- - include/linux/skbuff.h | 2 +- - net/core/skbuff.c | 6 +++++- - 2 files changed, 6 insertions(+), 2 deletions(-) - ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -2273,7 +2273,7 @@ static inline void __skb_queue_purge(str - kfree_skb(skb); - } - --void skb_rbtree_purge(struct rb_root *root); -+unsigned int skb_rbtree_purge(struct rb_root *root); - - void *netdev_alloc_frag(unsigned int fragsz); - ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -2380,23 +2380,27 @@ EXPORT_SYMBOL(skb_queue_purge); - /** - * skb_rbtree_purge - empty a skb rbtree - * @root: root of the rbtree to empty -+ * Return value: the sum of truesizes of all purged skbs. - * - * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from - * the list and one reference dropped. This function does not take - * any lock. Synchronization should be handled by the caller (e.g., TCP - * out-of-order queue is protected by the socket lock). - */ --void skb_rbtree_purge(struct rb_root *root) -+unsigned int skb_rbtree_purge(struct rb_root *root) - { - struct rb_node *p = rb_first(root); -+ unsigned int sum = 0; - - while (p) { - struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); - - p = rb_next(p); - rb_erase(&skb->rbnode, root); -+ sum += skb->truesize; - kfree_skb(skb); - } -+ return sum; - } - - /** diff --git a/queue-4.4/series b/queue-4.4/series index e1a91c500dd..9bd0d1d9dba 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -52,14 +52,3 @@ cifs-always-resolve-hostname-before-reconnecting.patch drivers-core-remove-glue-dirs-from-sysfs-earlier.patch mm-migrate-don-t-rely-on-__pagemovable-of-newpage-after-unlocking-it.patch fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch -ip-discard-ipv4-datagrams-with-overlapping-segments.patch -net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch -inet-frags-get-rif-of-inet_frag_evicting.patch -ip-use-rb-trees-for-ip-frag-queue.patch -ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch -ip-add-helpers-to-process-in-order-fragments-faster.patch -ip-process-in-order-fragments-efficiently.patch -net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch -ip-frags-fix-crash-in-ip_do_fragment.patch -ipv4-frags-precedence-bug-in-ip_expire.patch -fix-net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch
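
The first patch removed above (the fixup for ade446403bfb) revolves around a classic error-path pitfall: a drop branch jumps to a shared cleanup label without assigning the error code first, so the caller sees a stale status for a buffer that has already been freed; the fixup moves `err = -EINVAL;` ahead of the insertion logic so both the overlap (`discard_qp`) and the duplicate (`goto err`) paths report the same failure. The sketch below is a minimal, stand-alone C illustration of that control-flow pattern only; it is not the kernel code, and `queue_fragment()`, `struct frag`, and the `dup` flag are invented stand-ins.

```c
/*
 * Illustration of the error-path pitfall addressed by the dropped fixup
 * ("Fix net: ipv4: do not handle duplicate fragments as overlapping").
 * Hypothetical names; only the goto/err control flow mirrors the patch.
 */
#include <errno.h>
#include <stdlib.h>

struct frag {
	int offset;
	int len;
};

/* Returns 0 on success or a negative errno; frees @f on any drop path. */
static int queue_fragment(struct frag *f, int dup, int overlap)
{
	int err = -ENOENT;	/* stale initial value */

	if (overlap)
		goto discard;	/* error code is set at the label below */

	/*
	 * Buggy shape before the fixup: the duplicate branch jumps to the
	 * cleanup label without setting err, so the caller receives the
	 * stale -ENOENT for a fragment that has already been freed.
	 * The fixup's approach is to assign err = -EINVAL before reaching
	 * either drop branch, so both return the same failure.
	 */
	if (dup)
		goto err;

	/* ... insertion into the queue would happen here ... */
	return 0;

discard:
	err = -EINVAL;
err:
	free(f);
	return err;
}
```

Applied to the deleted hunk, hoisting the `err = -EINVAL;` assignment above the "Find out where to put this fragment" logic (and removing it from `discard_qp:`) gives the duplicate-drop path the same well-defined return value as the overlap path, which is what the upstream commit 0ff89efb5246 ("ip: fail fast on IP defrag errors") made unnecessary in newer trees.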