+++ /dev/null
-From foo@baz Thu Jan 31 15:59:51 CET 2019
-Date: Thu, 31 Jan 2019 15:59:51 +0100
-To: Greg KH <gregkh@linuxfoundation.org>
-From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-Subject: Fix "net: ipv4: do not handle duplicate fragments as overlapping"
-
-From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
-ade446403bfb ("net: ipv4: do not handle duplicate fragments as
-overlapping") was backported to many stable trees, but it had a problem
-that was "accidentally" fixed by the upstream commit 0ff89efb5246 ("ip:
-fail fast on IP defrag errors")
-
-This is the fixup for that problem as we do not want the larger patch in
-the older stable trees.
-
-Fixes: ade446403bfb ("net: ipv4: do not handle duplicate fragments as overlapping")
-Reported-by: Ivan Babrou <ivan@cloudflare.com>
-Reported-by: Eric Dumazet <edumazet@google.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- net/ipv4/ip_fragment.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -478,6 +478,7 @@ static int ip_frag_queue(struct ipq *qp,
- * fragment.
- */
-
-+ err = -EINVAL;
- /* Find out where to put this fragment. */
- prev_tail = qp->q.fragments_tail;
- if (!prev_tail)
-@@ -556,7 +557,6 @@ static int ip_frag_queue(struct ipq *qp,
-
- discard_qp:
- ipq_kill(qp);
-- err = -EINVAL;
- IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
- err:
- kfree_skb(skb);
+++ /dev/null
-From 399d1404be660d355192ff4df5ccc3f4159ec1e4 Mon Sep 17 00:00:00 2001
-From: Eric Dumazet <edumazet@google.com>
-Date: Sat, 31 Mar 2018 12:58:51 -0700
-Subject: inet: frags: get rid of inet_frag_evicting()
-
-From: Eric Dumazet <edumazet@google.com>
-
-commit 399d1404be660d355192ff4df5ccc3f4159ec1e4 upstream.
-
-This refactors ip_expire() since one indentation level is removed.
-
-Note: in the future, we should try hard to avoid the skb_clone()
-since this is a serious performance cost.
-Under DDOS, the ICMP message won't be sent because of rate limits.
-
-Fact that ip6_expire_frag_queue() does not use skb_clone() is
-disturbing too. Presumably IPv6 should have the same
-issue as the one we fixed in commit ec4fbd64751d
-("inet: frag: release spinlock before calling icmp_send()")
-
-Signed-off-by: Eric Dumazet <edumazet@google.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- include/net/inet_frag.h | 5 ---
- net/ipv4/ip_fragment.c | 66 +++++++++++++++++++++++-------------------------
- net/ipv6/reassembly.c | 4 --
- 3 files changed, 32 insertions(+), 43 deletions(-)
-
---- a/include/net/inet_frag.h
-+++ b/include/net/inet_frag.h
-@@ -123,11 +123,6 @@ static inline void inet_frag_put(struct
- inet_frag_destroy(q, f);
- }
-
--static inline bool inet_frag_evicting(struct inet_frag_queue *q)
--{
-- return !hlist_unhashed(&q->list_evictor);
--}
--
- /* Memory Tracking Functions. */
-
- static inline int frag_mem_limit(struct netns_frags *nf)
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -194,8 +194,11 @@ static bool frag_expire_skip_icmp(u32 us
- */
- static void ip_expire(unsigned long arg)
- {
-- struct ipq *qp;
-+ struct sk_buff *clone, *head;
-+ const struct iphdr *iph;
- struct net *net;
-+ struct ipq *qp;
-+ int err;
-
- qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
- net = container_of(qp->q.net, struct net, ipv4.frags);
-@@ -209,45 +212,40 @@ static void ip_expire(unsigned long arg)
- ipq_kill(qp);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
-
-- if (!inet_frag_evicting(&qp->q)) {
-- struct sk_buff *clone, *head = qp->q.fragments;
-- const struct iphdr *iph;
-- int err;
--
-- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
-+ head = qp->q.fragments;
-
-- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
-- goto out;
-+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
-
-- head->dev = dev_get_by_index_rcu(net, qp->iif);
-- if (!head->dev)
-- goto out;
-+ if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
-+ goto out;
-
-+ head->dev = dev_get_by_index_rcu(net, qp->iif);
-+ if (!head->dev)
-+ goto out;
-
-- /* skb has no dst, perform route lookup again */
-- iph = ip_hdr(head);
-- err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-+ /* skb has no dst, perform route lookup again */
-+ iph = ip_hdr(head);
-+ err = ip_route_input_noref(head, iph->daddr, iph->saddr,
- iph->tos, head->dev);
-- if (err)
-- goto out;
-+ if (err)
-+ goto out;
-+
-+ /* Only an end host needs to send an ICMP
-+ * "Fragment Reassembly Timeout" message, per RFC792.
-+ */
-+ if (frag_expire_skip_icmp(qp->user) &&
-+ (skb_rtable(head)->rt_type != RTN_LOCAL))
-+ goto out;
-+
-+ clone = skb_clone(head, GFP_ATOMIC);
-
-- /* Only an end host needs to send an ICMP
-- * "Fragment Reassembly Timeout" message, per RFC792.
-- */
-- if (frag_expire_skip_icmp(qp->user) &&
-- (skb_rtable(head)->rt_type != RTN_LOCAL))
-- goto out;
--
-- clone = skb_clone(head, GFP_ATOMIC);
--
-- /* Send an ICMP "Fragment Reassembly Timeout" message. */
-- if (clone) {
-- spin_unlock(&qp->q.lock);
-- icmp_send(clone, ICMP_TIME_EXCEEDED,
-- ICMP_EXC_FRAGTIME, 0);
-- consume_skb(clone);
-- goto out_rcu_unlock;
-- }
-+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
-+ if (clone) {
-+ spin_unlock(&qp->q.lock);
-+ icmp_send(clone, ICMP_TIME_EXCEEDED,
-+ ICMP_EXC_FRAGTIME, 0);
-+ consume_skb(clone);
-+ goto out_rcu_unlock;
- }
- out:
- spin_unlock(&qp->q.lock);
---- a/net/ipv6/reassembly.c
-+++ b/net/ipv6/reassembly.c
-@@ -146,10 +146,6 @@ void ip6_expire_frag_queue(struct net *n
- goto out_rcu_unlock;
-
- IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
--
-- if (inet_frag_evicting(&fq->q))
-- goto out_rcu_unlock;
--
- IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
-
- /* Don't send error if the first segment did not arrive. */
+++ /dev/null
-From 353c9cb360874e737fb000545f783df756c06f9a Mon Sep 17 00:00:00 2001
-From: Peter Oskolkov <posk@google.com>
-Date: Sat, 11 Aug 2018 20:27:24 +0000
-Subject: ip: add helpers to process in-order fragments faster.
-
-From: Peter Oskolkov <posk@google.com>
-
-commit 353c9cb360874e737fb000545f783df756c06f9a upstream.
-
-This patch introduces several helper functions/macros that will be
-used in the follow-up patch. No runtime changes yet.
-
-The new logic (fully implemented in the second patch) is as follows:
-
-* Nodes in the rb-tree will now contain not single fragments, but lists
- of consecutive fragments ("runs").
-
-* At each point in time, the current "active" run at the tail is
- maintained/tracked. Fragments that arrive in-order, adjacent
- to the previous tail fragment, are added to this tail run without
- triggering the re-balancing of the rb-tree.
-
-* If a fragment arrives out of order with the offset _before_ the tail run,
- it is inserted into the rb-tree as a single fragment.
-
-* If a fragment arrives after the current tail fragment (with a gap),
- it starts a new "tail" run, and is inserted into the rb-tree
- at the end as the head of the new run.
-
-skb->cb is used to store additional information
-needed here (suggested by Eric Dumazet).
-
-Reported-by: Willem de Bruijn <willemb@google.com>
-Signed-off-by: Peter Oskolkov <posk@google.com>
-Cc: Eric Dumazet <edumazet@google.com>
-Cc: Florian Westphal <fw@strlen.de>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- include/net/inet_frag.h | 4 ++
- net/ipv4/ip_fragment.c | 74 +++++++++++++++++++++++++++++++++++++++++++++---
- 2 files changed, 74 insertions(+), 4 deletions(-)
-
---- a/include/net/inet_frag.h
-+++ b/include/net/inet_frag.h
-@@ -48,6 +48,7 @@ struct inet_frag_queue {
- struct sk_buff *fragments; /* Used in IPv6. */
- struct rb_root rb_fragments; /* Used in IPv4. */
- struct sk_buff *fragments_tail;
-+ struct sk_buff *last_run_head;
- ktime_t stamp;
- int len;
- int meat;
-@@ -118,6 +119,9 @@ struct inet_frag_queue *inet_frag_find(s
- void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
- const char *prefix);
-
-+/* Free all skbs in the queue; return the sum of their truesizes. */
-+unsigned int inet_frag_rbtree_purge(struct rb_root *root);
-+
- static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
- {
- if (atomic_dec_and_test(&q->refcnt))
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -58,13 +58,57 @@
- static int sysctl_ipfrag_max_dist __read_mostly = 64;
- static const char ip_frag_cache_name[] = "ip4-frags";
-
--struct ipfrag_skb_cb
--{
-+/* Use skb->cb to track consecutive/adjacent fragments coming at
-+ * the end of the queue. Nodes in the rb-tree queue will
-+ * contain "runs" of one or more adjacent fragments.
-+ *
-+ * Invariants:
-+ * - next_frag is NULL at the tail of a "run";
-+ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
-+ */
-+struct ipfrag_skb_cb {
- struct inet_skb_parm h;
-- int offset;
-+ int offset;
-+ struct sk_buff *next_frag;
-+ int frag_run_len;
- };
-
--#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
-+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
-+
-+static void ip4_frag_init_run(struct sk_buff *skb)
-+{
-+ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
-+
-+ FRAG_CB(skb)->next_frag = NULL;
-+ FRAG_CB(skb)->frag_run_len = skb->len;
-+}
-+
-+/* Append skb to the last "run". */
-+static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
-+ struct sk_buff *skb)
-+{
-+ RB_CLEAR_NODE(&skb->rbnode);
-+ FRAG_CB(skb)->next_frag = NULL;
-+
-+ FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
-+ FRAG_CB(q->fragments_tail)->next_frag = skb;
-+ q->fragments_tail = skb;
-+}
-+
-+/* Create a new "run" with the skb. */
-+static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
-+{
-+ if (q->last_run_head)
-+ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
-+ &q->last_run_head->rbnode.rb_right);
-+ else
-+ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
-+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
-+
-+ ip4_frag_init_run(skb);
-+ q->fragments_tail = skb;
-+ q->last_run_head = skb;
-+}
-
- /* Describe an entry in the "incomplete datagrams" queue. */
- struct ipq {
-@@ -721,6 +765,28 @@ struct sk_buff *ip_check_defrag(struct n
- }
- EXPORT_SYMBOL(ip_check_defrag);
-
-+unsigned int inet_frag_rbtree_purge(struct rb_root *root)
-+{
-+ struct rb_node *p = rb_first(root);
-+ unsigned int sum = 0;
-+
-+ while (p) {
-+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
-+
-+ p = rb_next(p);
-+ rb_erase(&skb->rbnode, root);
-+ while (skb) {
-+ struct sk_buff *next = FRAG_CB(skb)->next_frag;
-+
-+ sum += skb->truesize;
-+ kfree_skb(skb);
-+ skb = next;
-+ }
-+ }
-+ return sum;
-+}
-+EXPORT_SYMBOL(inet_frag_rbtree_purge);
-+
- #ifdef CONFIG_SYSCTL
- static int zero;
-
+++ /dev/null
-From 7969e5c40dfd04799d4341f1b7cd266b6e47f227 Mon Sep 17 00:00:00 2001
-From: Peter Oskolkov <posk@google.com>
-Date: Thu, 2 Aug 2018 23:34:37 +0000
-Subject: ip: discard IPv4 datagrams with overlapping segments.
-
-From: Peter Oskolkov <posk@google.com>
-
-commit 7969e5c40dfd04799d4341f1b7cd266b6e47f227 upstream.
-
-This behavior is required in IPv6, and there is little need
-to tolerate overlapping fragments in IPv4. This change
-simplifies the code and eliminates potential DDoS attack vectors.
-
-Tested: ran ip_defrag selftest (not yet available upstream).
-
-Suggested-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Peter Oskolkov <posk@google.com>
-Signed-off-by: Eric Dumazet <edumazet@google.com>
-Cc: Florian Westphal <fw@strlen.de>
-Acked-by: Stephen Hemminger <stephen@networkplumber.org>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- include/uapi/linux/snmp.h | 1
- net/ipv4/ip_fragment.c | 73 ++++++++++++----------------------------------
- net/ipv4/proc.c | 1
- 3 files changed, 22 insertions(+), 53 deletions(-)
-
---- a/include/uapi/linux/snmp.h
-+++ b/include/uapi/linux/snmp.h
-@@ -55,6 +55,7 @@ enum
- IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */
- IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */
- IPSTATS_MIB_CEPKTS, /* InCEPkts */
-+ IPSTATS_MIB_REASM_OVERLAPS, /* ReasmOverlaps */
- __IPSTATS_MIB_MAX
- };
-
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -342,6 +342,7 @@ static int ip_frag_reinit(struct ipq *qp
- /* Add new segment to existing queue. */
- static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
- {
-+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct sk_buff *prev, *next;
- struct net_device *dev;
- unsigned int fragsize;
-@@ -422,60 +423,22 @@ static int ip_frag_queue(struct ipq *qp,
- }
-
- found:
-- /* We found where to put this one. Check for overlap with
-- * preceding fragment, and, if needed, align things so that
-- * any overlaps are eliminated.
-+ /* RFC5722, Section 4, amended by Errata ID : 3089
-+ * When reassembling an IPv6 datagram, if
-+ * one or more its constituent fragments is determined to be an
-+ * overlapping fragment, the entire datagram (and any constituent
-+ * fragments) MUST be silently discarded.
-+ *
-+ * We do the same here for IPv4.
- */
-- if (prev) {
-- int i = (FRAG_CB(prev)->offset + prev->len) - offset;
--
-- if (i > 0) {
-- offset += i;
-- err = -EINVAL;
-- if (end <= offset)
-- goto err;
-- err = -ENOMEM;
-- if (!pskb_pull(skb, i))
-- goto err;
-- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
-- skb->ip_summed = CHECKSUM_NONE;
-- }
-- }
--
-- err = -ENOMEM;
--
-- while (next && FRAG_CB(next)->offset < end) {
-- int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
--
-- if (i < next->len) {
-- /* Eat head of the next overlapped fragment
-- * and leave the loop. The next ones cannot overlap.
-- */
-- if (!pskb_pull(next, i))
-- goto err;
-- FRAG_CB(next)->offset += i;
-- qp->q.meat -= i;
-- if (next->ip_summed != CHECKSUM_UNNECESSARY)
-- next->ip_summed = CHECKSUM_NONE;
-- break;
-- } else {
-- struct sk_buff *free_it = next;
--
-- /* Old fragment is completely overridden with
-- * new one drop it.
-- */
-- next = next->next;
--
-- if (prev)
-- prev->next = next;
-- else
-- qp->q.fragments = next;
--
-- qp->q.meat -= free_it->len;
-- sub_frag_mem_limit(qp->q.net, free_it->truesize);
-- kfree_skb(free_it);
-- }
-- }
-+ /* Is there an overlap with the previous fragment? */
-+ if (prev &&
-+ (FRAG_CB(prev)->offset + prev->len) > offset)
-+ goto discard_qp;
-+
-+ /* Is there an overlap with the next fragment? */
-+ if (next && FRAG_CB(next)->offset < end)
-+ goto discard_qp;
-
- FRAG_CB(skb)->offset = offset;
-
-@@ -522,6 +485,10 @@ found:
- skb_dst_drop(skb);
- return -EINPROGRESS;
-
-+discard_qp:
-+ ipq_kill(qp);
-+ err = -EINVAL;
-+ IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
- err:
- kfree_skb(skb);
- return err;
---- a/net/ipv4/proc.c
-+++ b/net/ipv4/proc.c
-@@ -132,6 +132,7 @@ static const struct snmp_mib snmp4_ipext
- SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
- SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
- SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
-+ SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
- SNMP_MIB_SENTINEL
- };
-
+++ /dev/null
-From 5d407b071dc369c26a38398326ee2be53651cfe4 Mon Sep 17 00:00:00 2001
-From: Taehee Yoo <ap420073@gmail.com>
-Date: Mon, 10 Sep 2018 02:47:05 +0900
-Subject: ip: frags: fix crash in ip_do_fragment()
-
-From: Taehee Yoo <ap420073@gmail.com>
-
-commit 5d407b071dc369c26a38398326ee2be53651cfe4 upstream.
-
-A kernel crash occurs when a defragmented packet is fragmented
-in ip_do_fragment().
-In defragment routine, skb_orphan() is called and
-skb->ip_defrag_offset is set. but skb->sk and
-skb->ip_defrag_offset are same union member. so that
-frag->sk is not NULL.
-Hence a crash occurs in the skb->sk check routine in ip_do_fragment() when a
-defragmented packet is fragmented.
-
-test commands:
- %iptables -t nat -I POSTROUTING -j MASQUERADE
- %hping3 192.168.4.2 -s 1000 -p 2000 -d 60000
-
-splat looks like:
-[ 261.069429] kernel BUG at net/ipv4/ip_output.c:636!
-[ 261.075753] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
-[ 261.083854] CPU: 1 PID: 1349 Comm: hping3 Not tainted 4.19.0-rc2+ #3
-[ 261.100977] RIP: 0010:ip_do_fragment+0x1613/0x2600
-[ 261.106945] Code: e8 e2 38 e3 fe 4c 8b 44 24 18 48 8b 74 24 08 e9 92 f6 ff ff 80 3c 02 00 0f 85 da 07 00 00 48 8b b5 d0 00 00 00 e9 25 f6 ff ff <0f> 0b 0f 0b 44 8b 54 24 58 4c 8b 4c 24 18 4c 8b 5c 24 60 4c 8b 6c
-[ 261.127015] RSP: 0018:ffff8801031cf2c0 EFLAGS: 00010202
-[ 261.134156] RAX: 1ffff1002297537b RBX: ffffed0020639e6e RCX: 0000000000000004
-[ 261.142156] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880114ba9bd8
-[ 261.150157] RBP: ffff880114ba8a40 R08: ffffed0022975395 R09: ffffed0022975395
-[ 261.158157] R10: 0000000000000001 R11: ffffed0022975394 R12: ffff880114ba9ca4
-[ 261.166159] R13: 0000000000000010 R14: ffff880114ba9bc0 R15: dffffc0000000000
-[ 261.174169] FS: 00007fbae2199700(0000) GS:ffff88011b400000(0000) knlGS:0000000000000000
-[ 261.183012] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
-[ 261.189013] CR2: 00005579244fe000 CR3: 0000000119bf4000 CR4: 00000000001006e0
-[ 261.198158] Call Trace:
-[ 261.199018] ? dst_output+0x180/0x180
-[ 261.205011] ? save_trace+0x300/0x300
-[ 261.209018] ? ip_copy_metadata+0xb00/0xb00
-[ 261.213034] ? sched_clock_local+0xd4/0x140
-[ 261.218158] ? kill_l4proto+0x120/0x120 [nf_conntrack]
-[ 261.223014] ? rt_cpu_seq_stop+0x10/0x10
-[ 261.227014] ? find_held_lock+0x39/0x1c0
-[ 261.233008] ip_finish_output+0x51d/0xb50
-[ 261.237006] ? ip_fragment.constprop.56+0x220/0x220
-[ 261.243011] ? nf_ct_l4proto_register_one+0x5b0/0x5b0 [nf_conntrack]
-[ 261.250152] ? rcu_is_watching+0x77/0x120
-[ 261.255010] ? nf_nat_ipv4_out+0x1e/0x2b0 [nf_nat_ipv4]
-[ 261.261033] ? nf_hook_slow+0xb1/0x160
-[ 261.265007] ip_output+0x1c7/0x710
-[ 261.269005] ? ip_mc_output+0x13f0/0x13f0
-[ 261.273002] ? __local_bh_enable_ip+0xe9/0x1b0
-[ 261.278152] ? ip_fragment.constprop.56+0x220/0x220
-[ 261.282996] ? nf_hook_slow+0xb1/0x160
-[ 261.287007] raw_sendmsg+0x21f9/0x4420
-[ 261.291008] ? dst_output+0x180/0x180
-[ 261.297003] ? sched_clock_cpu+0x126/0x170
-[ 261.301003] ? find_held_lock+0x39/0x1c0
-[ 261.306155] ? stop_critical_timings+0x420/0x420
-[ 261.311004] ? check_flags.part.36+0x450/0x450
-[ 261.315005] ? _raw_spin_unlock_irq+0x29/0x40
-[ 261.320995] ? _raw_spin_unlock_irq+0x29/0x40
-[ 261.326142] ? cyc2ns_read_end+0x10/0x10
-[ 261.330139] ? raw_bind+0x280/0x280
-[ 261.334138] ? sched_clock_cpu+0x126/0x170
-[ 261.338995] ? check_flags.part.36+0x450/0x450
-[ 261.342991] ? __lock_acquire+0x4500/0x4500
-[ 261.348994] ? inet_sendmsg+0x11c/0x500
-[ 261.352989] ? dst_output+0x180/0x180
-[ 261.357012] inet_sendmsg+0x11c/0x500
-[ ... ]
-
-v2:
- - clear skb->sk at reassembly routine. (Eric Dumazet)
-
-Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
-Suggested-by: Eric Dumazet <edumazet@google.com>
-Signed-off-by: Taehee Yoo <ap420073@gmail.com>
-Reviewed-by: Eric Dumazet <edumazet@google.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- net/ipv4/ip_fragment.c | 1 +
- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 +
- 2 files changed, 2 insertions(+)
-
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -661,6 +661,7 @@ static int ip_frag_reasm(struct ipq *qp,
- nextp = &fp->next;
- fp->prev = NULL;
- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
-+ fp->sk = NULL;
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
---- a/net/ipv6/netfilter/nf_conntrack_reasm.c
-+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
-@@ -454,6 +454,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
-+ fp->sk = NULL;
- }
- sub_frag_mem_limit(fq->q.net, head->truesize);
-
+++ /dev/null
-From a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c Mon Sep 17 00:00:00 2001
-From: Peter Oskolkov <posk@google.com>
-Date: Sat, 11 Aug 2018 20:27:25 +0000
-Subject: ip: process in-order fragments efficiently
-
-From: Peter Oskolkov <posk@google.com>
-
-commit a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c upstream.
-
-This patch changes the runtime behavior of IP defrag queue:
-incoming in-order fragments are added to the end of the current
-list/"run" of in-order fragments at the tail.
-
-On some workloads, UDP stream performance is substantially improved:
-
-RX: ./udp_stream -F 10 -T 2 -l 60
-TX: ./udp_stream -c -H <host> -F 10 -T 5 -l 60
-
-with this patchset applied on a 10Gbps receiver:
-
- throughput=9524.18
- throughput_units=Mbit/s
-
-upstream (net-next):
-
- throughput=4608.93
- throughput_units=Mbit/s
-
-Reported-by: Willem de Bruijn <willemb@google.com>
-Signed-off-by: Peter Oskolkov <posk@google.com>
-Cc: Eric Dumazet <edumazet@google.com>
-Cc: Florian Westphal <fw@strlen.de>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- net/ipv4/inet_fragment.c | 2
- net/ipv4/ip_fragment.c | 110 +++++++++++++++++++++++++++++------------------
- 2 files changed, 70 insertions(+), 42 deletions(-)
-
---- a/net/ipv4/inet_fragment.c
-+++ b/net/ipv4/inet_fragment.c
-@@ -315,7 +315,7 @@ void inet_frag_destroy(struct inet_frag_
- fp = xp;
- } while (fp);
- } else {
-- sum_truesize = skb_rbtree_purge(&q->rb_fragments);
-+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
- }
- sum = sum_truesize + f->qsize;
-
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -139,8 +139,8 @@ int ip_frag_mem(struct net *net)
- return sum_frag_mem_limit(&net->ipv4.frags);
- }
-
--static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
-- struct net_device *dev);
-+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-+ struct sk_buff *prev_tail, struct net_device *dev);
-
- struct ip4_create_arg {
- struct iphdr *iph;
-@@ -271,7 +271,12 @@ static void ip_expire(unsigned long arg)
- head = skb_rb_first(&qp->q.rb_fragments);
- if (!head)
- goto out;
-- rb_erase(&head->rbnode, &qp->q.rb_fragments);
-+ if (FRAG_CB(head)->next_frag)
-+ rb_replace_node(&head->rbnode,
-+ &FRAG_CB(head)->next_frag->rbnode,
-+ &qp->q.rb_fragments);
-+ else
-+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
- memset(&head->rbnode, 0, sizeof(head->rbnode));
- barrier();
- }
-@@ -373,7 +378,7 @@ static int ip_frag_reinit(struct ipq *qp
- return -ETIMEDOUT;
- }
-
-- sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
-+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
- sub_frag_mem_limit(qp->q.net, sum_truesize);
-
- qp->q.flags = 0;
-@@ -382,6 +387,7 @@ static int ip_frag_reinit(struct ipq *qp
- qp->q.fragments = NULL;
- qp->q.rb_fragments = RB_ROOT;
- qp->q.fragments_tail = NULL;
-+ qp->q.last_run_head = NULL;
- qp->iif = 0;
- qp->ecn = 0;
-
-@@ -393,7 +399,7 @@ static int ip_frag_queue(struct ipq *qp,
- {
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct rb_node **rbn, *parent;
-- struct sk_buff *skb1;
-+ struct sk_buff *skb1, *prev_tail;
- struct net_device *dev;
- unsigned int fragsize;
- int flags, offset;
-@@ -471,38 +477,41 @@ static int ip_frag_queue(struct ipq *qp,
- */
-
- /* Find out where to put this fragment. */
-- skb1 = qp->q.fragments_tail;
-- if (!skb1) {
-- /* This is the first fragment we've received. */
-- rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
-- qp->q.fragments_tail = skb;
-- } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) {
-- /* This is the common/special case: skb goes to the end. */
-+ prev_tail = qp->q.fragments_tail;
-+ if (!prev_tail)
-+ ip4_frag_create_run(&qp->q, skb); /* First fragment. */
-+ else if (FRAG_CB(prev_tail)->offset + prev_tail->len < end) {
-+ /* This is the common case: skb goes to the end. */
- /* Detect and discard overlaps. */
-- if (offset < (FRAG_CB(skb1)->offset + skb1->len))
-+ if (offset < FRAG_CB(prev_tail)->offset + prev_tail->len)
- goto discard_qp;
-- /* Insert after skb1. */
-- rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
-- qp->q.fragments_tail = skb;
-+ if (offset == FRAG_CB(prev_tail)->offset + prev_tail->len)
-+ ip4_frag_append_to_last_run(&qp->q, skb);
-+ else
-+ ip4_frag_create_run(&qp->q, skb);
- } else {
-- /* Binary search. Note that skb can become the first fragment, but
-- * not the last (covered above). */
-+ /* Binary search. Note that skb can become the first fragment,
-+ * but not the last (covered above).
-+ */
- rbn = &qp->q.rb_fragments.rb_node;
- do {
- parent = *rbn;
- skb1 = rb_to_skb(parent);
- if (end <= FRAG_CB(skb1)->offset)
- rbn = &parent->rb_left;
-- else if (offset >= FRAG_CB(skb1)->offset + skb1->len)
-+ else if (offset >= FRAG_CB(skb1)->offset +
-+ FRAG_CB(skb1)->frag_run_len)
- rbn = &parent->rb_right;
- else /* Found an overlap with skb1. */
- goto discard_qp;
- } while (*rbn);
- /* Here we have parent properly set, and rbn pointing to
-- * one of its NULL left/right children. Insert skb. */
-+ * one of its NULL left/right children. Insert skb.
-+ */
-+ ip4_frag_init_run(skb);
- rb_link_node(&skb->rbnode, parent, rbn);
-+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
- }
-- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
-
- if (dev) {
- qp->iif = dev->ifindex;
-@@ -531,7 +540,7 @@ static int ip_frag_queue(struct ipq *qp,
- unsigned long orefdst = skb->_skb_refdst;
-
- skb->_skb_refdst = 0UL;
-- err = ip_frag_reasm(qp, skb, dev);
-+ err = ip_frag_reasm(qp, skb, prev_tail, dev);
- skb->_skb_refdst = orefdst;
- return err;
- }
-@@ -550,7 +559,7 @@ err:
-
- /* Build a new IP datagram from all its fragments. */
- static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-- struct net_device *dev)
-+ struct sk_buff *prev_tail, struct net_device *dev)
- {
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct iphdr *iph;
-@@ -575,10 +584,16 @@ static int ip_frag_reasm(struct ipq *qp,
- if (!fp)
- goto out_nomem;
-
-- rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
-+ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
-+ if (RB_EMPTY_NODE(&skb->rbnode))
-+ FRAG_CB(prev_tail)->next_frag = fp;
-+ else
-+ rb_replace_node(&skb->rbnode, &fp->rbnode,
-+ &qp->q.rb_fragments);
- if (qp->q.fragments_tail == skb)
- qp->q.fragments_tail = fp;
- skb_morph(skb, head);
-+ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
- rb_replace_node(&head->rbnode, &skb->rbnode,
- &qp->q.rb_fragments);
- consume_skb(head);
-@@ -614,7 +629,7 @@ static int ip_frag_reasm(struct ipq *qp,
- for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
- plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
- clone->len = clone->data_len = head->data_len - plen;
-- skb->truesize += clone->truesize;
-+ head->truesize += clone->truesize;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(qp->q.net, clone->truesize);
-@@ -627,24 +642,36 @@ static int ip_frag_reasm(struct ipq *qp,
- skb_push(head, head->data - skb_network_header(head));
-
- /* Traverse the tree in order, to build frag_list. */
-+ fp = FRAG_CB(head)->next_frag;
- rbn = rb_next(&head->rbnode);
- rb_erase(&head->rbnode, &qp->q.rb_fragments);
-- while (rbn) {
-- struct rb_node *rbnext = rb_next(rbn);
-- fp = rb_to_skb(rbn);
-- rb_erase(rbn, &qp->q.rb_fragments);
-- rbn = rbnext;
-- *nextp = fp;
-- nextp = &fp->next;
-- fp->prev = NULL;
-- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
-- head->data_len += fp->len;
-- head->len += fp->len;
-- if (head->ip_summed != fp->ip_summed)
-- head->ip_summed = CHECKSUM_NONE;
-- else if (head->ip_summed == CHECKSUM_COMPLETE)
-- head->csum = csum_add(head->csum, fp->csum);
-- head->truesize += fp->truesize;
-+ while (rbn || fp) {
-+ /* fp points to the next sk_buff in the current run;
-+ * rbn points to the next run.
-+ */
-+ /* Go through the current run. */
-+ while (fp) {
-+ *nextp = fp;
-+ nextp = &fp->next;
-+ fp->prev = NULL;
-+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
-+ head->data_len += fp->len;
-+ head->len += fp->len;
-+ if (head->ip_summed != fp->ip_summed)
-+ head->ip_summed = CHECKSUM_NONE;
-+ else if (head->ip_summed == CHECKSUM_COMPLETE)
-+ head->csum = csum_add(head->csum, fp->csum);
-+ head->truesize += fp->truesize;
-+ fp = FRAG_CB(fp)->next_frag;
-+ }
-+ /* Move to the next run. */
-+ if (rbn) {
-+ struct rb_node *rbnext = rb_next(rbn);
-+
-+ fp = rb_to_skb(rbn);
-+ rb_erase(rbn, &qp->q.rb_fragments);
-+ rbn = rbnext;
-+ }
- }
- sub_frag_mem_limit(qp->q.net, head->truesize);
-
-@@ -680,6 +707,7 @@ static int ip_frag_reasm(struct ipq *qp,
- qp->q.fragments = NULL;
- qp->q.rb_fragments = RB_ROOT;
- qp->q.fragments_tail = NULL;
-+ qp->q.last_run_head = NULL;
- return 0;
-
- out_nomem:
+++ /dev/null
-From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001
-From: Peter Oskolkov <posk@google.com>
-Date: Thu, 2 Aug 2018 23:34:39 +0000
-Subject: ip: use rb trees for IP frag queue.
-
-From: Peter Oskolkov <posk@google.com>
-
-commit fa0f527358bd900ef92f925878ed6bfbd51305cc upstream.
-
-Similar to TCP OOO RX queue, it makes sense to use rb trees to store
-IP fragments, so that OOO fragments are inserted faster.
-
-Tested:
-
-- a follow-up patch contains a rather comprehensive ip defrag
- self-test (functional)
-- ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`:
- netstat --statistics
- Ip:
- 282078937 total packets received
- 0 forwarded
- 0 incoming packets discarded
- 946760 incoming packets delivered
- 18743456 requests sent out
- 101 fragments dropped after timeout
- 282077129 reassemblies required
- 944952 packets reassembled ok
- 262734239 packet reassembles failed
- (The numbers/stats above are somewhat better re:
- reassemblies vs a kernel without this patchset. More
- comprehensive performance testing TBD).
-
-Reported-by: Jann Horn <jannh@google.com>
-Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
-Suggested-by: Eric Dumazet <edumazet@google.com>
-Signed-off-by: Peter Oskolkov <posk@google.com>
-Signed-off-by: Eric Dumazet <edumazet@google.com>
-Cc: Florian Westphal <fw@strlen.de>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- include/linux/skbuff.h | 2
- include/net/inet_frag.h | 3
- net/ipv4/inet_fragment.c | 14 +-
- net/ipv4/ip_fragment.c | 190 +++++++++++++++++---------------
- net/ipv6/netfilter/nf_conntrack_reasm.c | 1
- net/ipv6/reassembly.c | 1
- 6 files changed, 120 insertions(+), 91 deletions(-)
-
---- a/include/linux/skbuff.h
-+++ b/include/linux/skbuff.h
-@@ -556,7 +556,7 @@ struct sk_buff {
- struct skb_mstamp skb_mstamp;
- };
- };
-- struct rb_node rbnode; /* used in netem & tcp stack */
-+ struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
- };
- struct sock *sk;
- struct net_device *dev;
---- a/include/net/inet_frag.h
-+++ b/include/net/inet_frag.h
-@@ -45,7 +45,8 @@ struct inet_frag_queue {
- struct timer_list timer;
- struct hlist_node list;
- atomic_t refcnt;
-- struct sk_buff *fragments;
-+ struct sk_buff *fragments; /* Used in IPv6. */
-+ struct rb_root rb_fragments; /* Used in IPv4. */
- struct sk_buff *fragments_tail;
- ktime_t stamp;
- int len;
---- a/net/ipv4/inet_fragment.c
-+++ b/net/ipv4/inet_fragment.c
-@@ -306,12 +306,16 @@ void inet_frag_destroy(struct inet_frag_
- /* Release all fragment data. */
- fp = q->fragments;
- nf = q->net;
-- while (fp) {
-- struct sk_buff *xp = fp->next;
-+ if (fp) {
-+ do {
-+ struct sk_buff *xp = fp->next;
-
-- sum_truesize += fp->truesize;
-- frag_kfree_skb(nf, f, fp);
-- fp = xp;
-+ sum_truesize += fp->truesize;
-+ kfree_skb(fp);
-+ fp = xp;
-+ } while (fp);
-+ } else {
-+ sum_truesize = skb_rbtree_purge(&q->rb_fragments);
- }
- sum = sum_truesize + f->qsize;
-
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -194,7 +194,7 @@ static bool frag_expire_skip_icmp(u32 us
- */
- static void ip_expire(unsigned long arg)
- {
-- struct sk_buff *clone, *head;
-+ struct sk_buff *head = NULL;
- const struct iphdr *iph;
- struct net *net;
- struct ipq *qp;
-@@ -211,14 +211,31 @@ static void ip_expire(unsigned long arg)
-
- ipq_kill(qp);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
--
-- head = qp->q.fragments;
--
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
-
-- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
-+ if (!qp->q.flags & INET_FRAG_FIRST_IN)
- goto out;
-
-+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
-+ * pull the head out of the tree in order to be able to
-+ * deal with head->dev.
-+ */
-+ if (qp->q.fragments) {
-+ head = qp->q.fragments;
-+ qp->q.fragments = head->next;
-+ } else {
-+ head = skb_rb_first(&qp->q.rb_fragments);
-+ if (!head)
-+ goto out;
-+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
-+ memset(&head->rbnode, 0, sizeof(head->rbnode));
-+ barrier();
-+ }
-+ if (head == qp->q.fragments_tail)
-+ qp->q.fragments_tail = NULL;
-+
-+ sub_frag_mem_limit(qp->q.net, head->truesize);
-+
- head->dev = dev_get_by_index_rcu(net, qp->iif);
- if (!head->dev)
- goto out;
-@@ -237,20 +254,17 @@ static void ip_expire(unsigned long arg)
- (skb_rtable(head)->rt_type != RTN_LOCAL))
- goto out;
-
-- clone = skb_clone(head, GFP_ATOMIC);
--
- /* Send an ICMP "Fragment Reassembly Timeout" message. */
-- if (clone) {
-- spin_unlock(&qp->q.lock);
-- icmp_send(clone, ICMP_TIME_EXCEEDED,
-- ICMP_EXC_FRAGTIME, 0);
-- consume_skb(clone);
-- goto out_rcu_unlock;
-- }
-+ spin_unlock(&qp->q.lock);
-+ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-+ goto out_rcu_unlock;
-+
- out:
- spin_unlock(&qp->q.lock);
- out_rcu_unlock:
- rcu_read_unlock();
-+ if (head)
-+ kfree_skb(head);
- ipq_put(qp);
- }
-
-@@ -294,7 +308,7 @@ static int ip_frag_too_far(struct ipq *q
- end = atomic_inc_return(&peer->rid);
- qp->rid = end;
-
-- rc = qp->q.fragments && (end - start) > max;
-+ rc = qp->q.fragments_tail && (end - start) > max;
-
- if (rc) {
- struct net *net;
-@@ -308,7 +322,6 @@ static int ip_frag_too_far(struct ipq *q
-
- static int ip_frag_reinit(struct ipq *qp)
- {
-- struct sk_buff *fp;
- unsigned int sum_truesize = 0;
-
- if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
-@@ -316,20 +329,14 @@ static int ip_frag_reinit(struct ipq *qp
- return -ETIMEDOUT;
- }
-
-- fp = qp->q.fragments;
-- do {
-- struct sk_buff *xp = fp->next;
--
-- sum_truesize += fp->truesize;
-- kfree_skb(fp);
-- fp = xp;
-- } while (fp);
-+ sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
- sub_frag_mem_limit(qp->q.net, sum_truesize);
-
- qp->q.flags = 0;
- qp->q.len = 0;
- qp->q.meat = 0;
- qp->q.fragments = NULL;
-+ qp->q.rb_fragments = RB_ROOT;
- qp->q.fragments_tail = NULL;
- qp->iif = 0;
- qp->ecn = 0;
-@@ -341,7 +348,8 @@ static int ip_frag_reinit(struct ipq *qp
- static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
- {
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
-- struct sk_buff *prev, *next;
-+ struct rb_node **rbn, *parent;
-+ struct sk_buff *skb1;
- struct net_device *dev;
- unsigned int fragsize;
- int flags, offset;
-@@ -404,56 +412,60 @@ static int ip_frag_queue(struct ipq *qp,
- if (err)
- goto err;
-
-- /* Find out which fragments are in front and at the back of us
-- * in the chain of fragments so far. We must know where to put
-- * this fragment, right?
-- */
-- prev = qp->q.fragments_tail;
-- if (!prev || FRAG_CB(prev)->offset < offset) {
-- next = NULL;
-- goto found;
-- }
-- prev = NULL;
-- for (next = qp->q.fragments; next != NULL; next = next->next) {
-- if (FRAG_CB(next)->offset >= offset)
-- break; /* bingo! */
-- prev = next;
-- }
-+ /* Note : skb->rbnode and skb->dev share the same location. */
-+ dev = skb->dev;
-+ /* Makes sure compiler wont do silly aliasing games */
-+ barrier();
-
--found:
- /* RFC5722, Section 4, amended by Errata ID : 3089
- * When reassembling an IPv6 datagram, if
- * one or more its constituent fragments is determined to be an
- * overlapping fragment, the entire datagram (and any constituent
- * fragments) MUST be silently discarded.
- *
-- * We do the same here for IPv4.
-+ * We do the same here for IPv4 (and increment an snmp counter).
- */
-- /* Is there an overlap with the previous fragment? */
-- if (prev &&
-- (FRAG_CB(prev)->offset + prev->len) > offset)
-- goto discard_qp;
--
-- /* Is there an overlap with the next fragment? */
-- if (next && FRAG_CB(next)->offset < end)
-- goto discard_qp;
--
-- FRAG_CB(skb)->offset = offset;
-
-- /* Insert this fragment in the chain of fragments. */
-- skb->next = next;
-- if (!next)
-+ /* Find out where to put this fragment. */
-+ skb1 = qp->q.fragments_tail;
-+ if (!skb1) {
-+ /* This is the first fragment we've received. */
-+ rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
-+ qp->q.fragments_tail = skb;
-+ } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) {
-+ /* This is the common/special case: skb goes to the end. */
-+ /* Detect and discard overlaps. */
-+ if (offset < (FRAG_CB(skb1)->offset + skb1->len))
-+ goto discard_qp;
-+ /* Insert after skb1. */
-+ rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
- qp->q.fragments_tail = skb;
-- if (prev)
-- prev->next = skb;
-- else
-- qp->q.fragments = skb;
-+ } else {
-+ /* Binary search. Note that skb can become the first fragment, but
-+ * not the last (covered above). */
-+ rbn = &qp->q.rb_fragments.rb_node;
-+ do {
-+ parent = *rbn;
-+ skb1 = rb_to_skb(parent);
-+ if (end <= FRAG_CB(skb1)->offset)
-+ rbn = &parent->rb_left;
-+ else if (offset >= FRAG_CB(skb1)->offset + skb1->len)
-+ rbn = &parent->rb_right;
-+ else /* Found an overlap with skb1. */
-+ goto discard_qp;
-+ } while (*rbn);
-+ /* Here we have parent properly set, and rbn pointing to
-+ * one of its NULL left/right children. Insert skb. */
-+ rb_link_node(&skb->rbnode, parent, rbn);
-+ }
-+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
-
-- dev = skb->dev;
- if (dev) {
- qp->iif = dev->ifindex;
- skb->dev = NULL;
- }
-+ FRAG_CB(skb)->offset = offset;
-+
- qp->q.stamp = skb->tstamp;
- qp->q.meat += skb->len;
- qp->ecn |= ecn;
-@@ -475,7 +487,7 @@ found:
- unsigned long orefdst = skb->_skb_refdst;
-
- skb->_skb_refdst = 0UL;
-- err = ip_frag_reasm(qp, prev, dev);
-+ err = ip_frag_reasm(qp, skb, dev);
- skb->_skb_refdst = orefdst;
- return err;
- }
-@@ -492,15 +504,15 @@ err:
- return err;
- }
-
--
- /* Build a new IP datagram from all its fragments. */
--
--static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
-+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct net_device *dev)
- {
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct iphdr *iph;
-- struct sk_buff *fp, *head = qp->q.fragments;
-+ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
-+ struct sk_buff **nextp; /* To build frag_list. */
-+ struct rb_node *rbn;
- int len;
- int ihlen;
- int err;
-@@ -514,25 +526,21 @@ static int ip_frag_reasm(struct ipq *qp,
- goto out_fail;
- }
- /* Make the one we just received the head. */
-- if (prev) {
-- head = prev->next;
-- fp = skb_clone(head, GFP_ATOMIC);
-+ if (head != skb) {
-+ fp = skb_clone(skb, GFP_ATOMIC);
- if (!fp)
- goto out_nomem;
-
-- fp->next = head->next;
-- if (!fp->next)
-+ rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
-+ if (qp->q.fragments_tail == skb)
- qp->q.fragments_tail = fp;
-- prev->next = fp;
--
-- skb_morph(head, qp->q.fragments);
-- head->next = qp->q.fragments->next;
--
-- consume_skb(qp->q.fragments);
-- qp->q.fragments = head;
-+ skb_morph(skb, head);
-+ rb_replace_node(&head->rbnode, &skb->rbnode,
-+ &qp->q.rb_fragments);
-+ consume_skb(head);
-+ head = skb;
- }
-
-- WARN_ON(!head);
- WARN_ON(FRAG_CB(head)->offset != 0);
-
- /* Allocate a new buffer for the datagram. */
-@@ -557,24 +565,35 @@ static int ip_frag_reasm(struct ipq *qp,
- clone = alloc_skb(0, GFP_ATOMIC);
- if (!clone)
- goto out_nomem;
-- clone->next = head->next;
-- head->next = clone;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_frag_list_init(head);
- for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
- plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
- clone->len = clone->data_len = head->data_len - plen;
-- head->data_len -= clone->len;
-- head->len -= clone->len;
-+ skb->truesize += clone->truesize;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(qp->q.net, clone->truesize);
-+ skb_shinfo(head)->frag_list = clone;
-+ nextp = &clone->next;
-+ } else {
-+ nextp = &skb_shinfo(head)->frag_list;
- }
-
-- skb_shinfo(head)->frag_list = head->next;
- skb_push(head, head->data - skb_network_header(head));
-
-- for (fp=head->next; fp; fp = fp->next) {
-+ /* Traverse the tree in order, to build frag_list. */
-+ rbn = rb_next(&head->rbnode);
-+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
-+ while (rbn) {
-+ struct rb_node *rbnext = rb_next(rbn);
-+ fp = rb_to_skb(rbn);
-+ rb_erase(rbn, &qp->q.rb_fragments);
-+ rbn = rbnext;
-+ *nextp = fp;
-+ nextp = &fp->next;
-+ fp->prev = NULL;
-+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
-@@ -585,7 +604,9 @@ static int ip_frag_reasm(struct ipq *qp,
- }
- sub_frag_mem_limit(qp->q.net, head->truesize);
-
-+ *nextp = NULL;
- head->next = NULL;
-+ head->prev = NULL;
- head->dev = dev;
- head->tstamp = qp->q.stamp;
- IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
-@@ -613,6 +634,7 @@ static int ip_frag_reasm(struct ipq *qp,
-
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
- qp->q.fragments = NULL;
-+ qp->q.rb_fragments = RB_ROOT;
- qp->q.fragments_tail = NULL;
- return 0;
-
---- a/net/ipv6/netfilter/nf_conntrack_reasm.c
-+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
-@@ -472,6 +472,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
- head->csum);
-
- fq->q.fragments = NULL;
-+ fq->q.rb_fragments = RB_ROOT;
- fq->q.fragments_tail = NULL;
-
- /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
---- a/net/ipv6/reassembly.c
-+++ b/net/ipv6/reassembly.c
-@@ -499,6 +499,7 @@ static int ip6_frag_reasm(struct frag_qu
- IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
- rcu_read_unlock();
- fq->q.fragments = NULL;
-+ fq->q.rb_fragments = RB_ROOT;
- fq->q.fragments_tail = NULL;
- return 1;
-
+++ /dev/null
-From 70837ffe3085c9a91488b52ca13ac84424da1042 Mon Sep 17 00:00:00 2001
-From: Dan Carpenter <dan.carpenter@oracle.com>
-Date: Mon, 6 Aug 2018 22:17:35 +0300
-Subject: ipv4: frags: precedence bug in ip_expire()
-
-From: Dan Carpenter <dan.carpenter@oracle.com>
-
-commit 70837ffe3085c9a91488b52ca13ac84424da1042 upstream.
-
-We accidentally removed the parentheses here, but they are required
-because '!' has higher precedence than '&'.
-
-Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
-Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- net/ipv4/ip_fragment.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -257,7 +257,7 @@ static void ip_expire(unsigned long arg)
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
-
-- if (!qp->q.flags & INET_FRAG_FIRST_IN)
-+ if (!(qp->q.flags & INET_FRAG_FIRST_IN))
- goto out;
-
- /* sk_buff::dev and sk_buff::rbnode are unionized. So we
+++ /dev/null
-From 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 Mon Sep 17 00:00:00 2001
-From: Florian Westphal <fw@strlen.de>
-Date: Fri, 3 Aug 2018 02:22:20 +0200
-Subject: ipv6: defrag: drop non-last frags smaller than min mtu
-
-From: Florian Westphal <fw@strlen.de>
-
-commit 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 upstream.
-
-don't bother with pathological cases, they only waste cycles.
-IPv6 requires a minimum MTU of 1280 so we should never see fragments
-smaller than this (except last frag).
-
-v3: don't use awkward "-offset + len"
-v2: drop IPv4 part, which added same check w. IPV4_MIN_MTU (68).
- There were concerns that there could be even smaller frags
- generated by intermediate nodes, e.g. on radio networks.
-
-Cc: Peter Oskolkov <posk@google.com>
-Cc: Eric Dumazet <edumazet@google.com>
-Signed-off-by: Florian Westphal <fw@strlen.de>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++++
- net/ipv6/reassembly.c | 4 ++++
- 2 files changed, 8 insertions(+)
-
---- a/net/ipv6/netfilter/nf_conntrack_reasm.c
-+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
-@@ -602,6 +602,10 @@ struct sk_buff *nf_ct_frag6_gather(struc
- hdr = ipv6_hdr(clone);
- fhdr = (struct frag_hdr *)skb_transport_header(clone);
-
-+ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
-+ fhdr->frag_off & htons(IP6_MF))
-+ goto ret_orig;
-+
- skb_orphan(skb);
- fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
- skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
---- a/net/ipv6/reassembly.c
-+++ b/net/ipv6/reassembly.c
-@@ -549,6 +549,10 @@ static int ipv6_frag_rcv(struct sk_buff
- return 1;
- }
-
-+ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
-+ fhdr->frag_off & htons(IP6_MF))
-+ goto fail_hdr;
-+
- fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
- skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
- if (fq) {
+++ /dev/null
-From ade446403bfb79d3528d56071a84b15351a139ad Mon Sep 17 00:00:00 2001
-From: Michal Kubecek <mkubecek@suse.cz>
-Date: Thu, 13 Dec 2018 17:23:32 +0100
-Subject: net: ipv4: do not handle duplicate fragments as overlapping
-
-From: Michal Kubecek <mkubecek@suse.cz>
-
-commit ade446403bfb79d3528d56071a84b15351a139ad upstream.
-
-Since commit 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping
-segments.") IPv4 reassembly code drops the whole queue whenever an
-overlapping fragment is received. However, the test is written in a way
-which detects duplicate fragments as overlapping so that in environments
-with many duplicate packets, fragmented packets may be undeliverable.
-
-Add an extra test and for (potentially) duplicate fragment, only drop the
-new fragment rather than the whole queue. Only starting offset and length
-are checked, not the contents of the fragments as that would be too
-expensive. For similar reason, linear list ("run") of a rbtree node is not
-iterated, we only check if the new fragment is a subset of the interval
-covered by existing consecutive fragments.
-
-v2: instead of an exact check iterating through linear list of an rbtree
-node, only check if the new fragment is subset of the "run" (suggested
-by Eric Dumazet)
-
-Fixes: 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping segments.")
-Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- net/ipv4/ip_fragment.c | 18 ++++++++++++------
- 1 file changed, 12 insertions(+), 6 deletions(-)
-
---- a/net/ipv4/ip_fragment.c
-+++ b/net/ipv4/ip_fragment.c
-@@ -400,10 +400,10 @@ static int ip_frag_queue(struct ipq *qp,
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct rb_node **rbn, *parent;
- struct sk_buff *skb1, *prev_tail;
-+ int ihl, end, skb1_run_end;
- struct net_device *dev;
- unsigned int fragsize;
- int flags, offset;
-- int ihl, end;
- int err = -ENOENT;
- u8 ecn;
-
-@@ -473,7 +473,9 @@ static int ip_frag_queue(struct ipq *qp,
- * overlapping fragment, the entire datagram (and any constituent
- * fragments) MUST be silently discarded.
- *
-- * We do the same here for IPv4 (and increment an snmp counter).
-+ * We do the same here for IPv4 (and increment an snmp counter) but
-+ * we do not want to drop the whole queue in response to a duplicate
-+ * fragment.
- */
-
- /* Find out where to put this fragment. */
-@@ -497,13 +499,17 @@ static int ip_frag_queue(struct ipq *qp,
- do {
- parent = *rbn;
- skb1 = rb_to_skb(parent);
-+ skb1_run_end = FRAG_CB(skb1)->offset +
-+ FRAG_CB(skb1)->frag_run_len;
- if (end <= FRAG_CB(skb1)->offset)
- rbn = &parent->rb_left;
-- else if (offset >= FRAG_CB(skb1)->offset +
-- FRAG_CB(skb1)->frag_run_len)
-+ else if (offset >= skb1_run_end)
- rbn = &parent->rb_right;
-- else /* Found an overlap with skb1. */
-- goto discard_qp;
-+ else if (offset >= FRAG_CB(skb1)->offset &&
-+ end <= skb1_run_end)
-+ goto err; /* No new data, potential duplicate */
-+ else
-+ goto discard_qp; /* Found an overlap */
- } while (*rbn);
- /* Here we have parent properly set, and rbn pointing to
- * one of its NULL left/right children. Insert skb.
+++ /dev/null
-From 385114dec8a49b5e5945e77ba7de6356106713f4 Mon Sep 17 00:00:00 2001
-From: Peter Oskolkov <posk@google.com>
-Date: Thu, 2 Aug 2018 23:34:38 +0000
-Subject: net: modify skb_rbtree_purge to return the truesize of all purged skbs.
-
-From: Peter Oskolkov <posk@google.com>
-
-commit 385114dec8a49b5e5945e77ba7de6356106713f4 upstream.
-
-Tested: see the next patch is the series.
-
-Suggested-by: Eric Dumazet <edumazet@google.com>
-Signed-off-by: Peter Oskolkov <posk@google.com>
-Signed-off-by: Eric Dumazet <edumazet@google.com>
-Cc: Florian Westphal <fw@strlen.de>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Mao Wenan <maowenan@huawei.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- include/linux/skbuff.h | 2 +-
- net/core/skbuff.c | 6 +++++-
- 2 files changed, 6 insertions(+), 2 deletions(-)
-
---- a/include/linux/skbuff.h
-+++ b/include/linux/skbuff.h
-@@ -2273,7 +2273,7 @@ static inline void __skb_queue_purge(str
- kfree_skb(skb);
- }
-
--void skb_rbtree_purge(struct rb_root *root);
-+unsigned int skb_rbtree_purge(struct rb_root *root);
-
- void *netdev_alloc_frag(unsigned int fragsz);
-
---- a/net/core/skbuff.c
-+++ b/net/core/skbuff.c
-@@ -2380,23 +2380,27 @@ EXPORT_SYMBOL(skb_queue_purge);
- /**
- * skb_rbtree_purge - empty a skb rbtree
- * @root: root of the rbtree to empty
-+ * Return value: the sum of truesizes of all purged skbs.
- *
- * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
- * the list and one reference dropped. This function does not take
- * any lock. Synchronization should be handled by the caller (e.g., TCP
- * out-of-order queue is protected by the socket lock).
- */
--void skb_rbtree_purge(struct rb_root *root)
-+unsigned int skb_rbtree_purge(struct rb_root *root)
- {
- struct rb_node *p = rb_first(root);
-+ unsigned int sum = 0;
-
- while (p) {
- struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
-
- p = rb_next(p);
- rb_erase(&skb->rbnode, root);
-+ sum += skb->truesize;
- kfree_skb(skb);
- }
-+ return sum;
- }
-
- /**
+++ /dev/null
-ip-discard-ipv4-datagrams-with-overlapping-segments.patch
-net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch
-inet-frags-get-rif-of-inet_frag_evicting.patch
-ip-use-rb-trees-for-ip-frag-queue.patch
-ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch
-ip-add-helpers-to-process-in-order-fragments-faster.patch
-ip-process-in-order-fragments-efficiently.patch
-net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch
-ip-frags-fix-crash-in-ip_do_fragment.patch
-ipv4-frags-precedence-bug-in-ip_expire.patch
-fix-net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch