--- /dev/null
+From 79f546a696bff2590169fb5684e23d65f4d9f591 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 11 May 2018 11:20:57 +1000
+Subject: fs: don't scan the inode cache before SB_BORN is set
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 79f546a696bff2590169fb5684e23d65f4d9f591 upstream.
+
+We recently had an oops reported on a 4.14 kernel in
+xfs_reclaim_inodes_count() where sb->s_fs_info pointed to garbage
+and so the m_perag_tree lookup walked into lala land. It produces
+an oops down this path during the failed mount:
+
+ radix_tree_gang_lookup_tag+0xc4/0x130
+ xfs_perag_get_tag+0x37/0xf0
+ xfs_reclaim_inodes_count+0x32/0x40
+ xfs_fs_nr_cached_objects+0x11/0x20
+ super_cache_count+0x35/0xc0
+ shrink_slab.part.66+0xb1/0x370
+ shrink_node+0x7e/0x1a0
+ try_to_free_pages+0x199/0x470
+ __alloc_pages_slowpath+0x3a1/0xd20
+ __alloc_pages_nodemask+0x1c3/0x200
+ cache_grow_begin+0x20b/0x2e0
+ fallback_alloc+0x160/0x200
+ kmem_cache_alloc+0x111/0x4e0
+
+The problem is that the superblock shrinker is running before the
+filesystem structures it depends on have been fully set up. i.e.
+the shrinker is registered in sget(), before ->fill_super() has been
+called, and the shrinker can call into the filesystem before
+fill_super() does its setup work. Essentially we are exposed to
+both use-after-free and use-before-initialisation bugs here.
+
+To fix this, add a check for the SB_BORN flag in super_cache_count.
+In general, this flag is not set until mount_fs() completes
+successfully, so we know that it is set after the filesystem
+setup has completed. This matches the trylock_super() behaviour
+which will not let super_cache_scan() run if SB_BORN is not set, and
+hence will not allow the superblock shrinker to enter the
+filesystem while it is being set up or after it has failed setup
+and is being torn down.
+
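+The fix follows the usual flag-guarded publish pattern: order all the
+setup stores before the flag store on the write side, and the flag load
+before any dependent loads on the read side. A minimal sketch of the
+pairing (condensed from the two hunks below):
+
+  /* Writer, mount_fs(): publish the superblock, then the flag. */
+  smp_wmb();                      /* order setup stores before MS_BORN */
+  sb->s_flags |= MS_BORN;
+
+  /* Reader, super_cache_count(): check the flag, then read. */
+  if (!(sb->s_flags & MS_BORN))   /* not fully set up yet */
+          return 0;
+  smp_rmb();                      /* order MS_BORN load before contents */
+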
+Cc: stable@kernel.org
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Aaron Lu <aaron.lu@linux.alibaba.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/super.c | 30 ++++++++++++++++++++++++------
+ 1 file changed, 24 insertions(+), 6 deletions(-)
+
+--- a/fs/super.c
++++ b/fs/super.c
+@@ -118,13 +118,23 @@ static unsigned long super_cache_count(s
+ sb = container_of(shrink, struct super_block, s_shrink);
+
+ /*
+- * Don't call trylock_super as it is a potential
+- * scalability bottleneck. The counts could get updated
+- * between super_cache_count and super_cache_scan anyway.
+- * Call to super_cache_count with shrinker_rwsem held
+- * ensures the safety of call to list_lru_shrink_count() and
+- * s_op->nr_cached_objects().
++ * We don't call trylock_super() here as it is a scalability bottleneck,
++ * so we're exposed to partial setup state. The shrinker rwsem does not
++ * protect filesystem operations backing list_lru_shrink_count() or
++ * s_op->nr_cached_objects(). Counts can change between
++ * super_cache_count and super_cache_scan, so we really don't need locks
++ * here.
++ *
++ * However, if we are currently mounting the superblock, the underlying
++ * filesystem might be in a state of partial construction and hence it
++ * is dangerous to access it. trylock_super() uses a MS_BORN check to
++ * avoid this situation, so do the same here. The memory barrier is
++ * matched with the one in mount_fs() as we don't hold locks here.
+ */
++ if (!(sb->s_flags & MS_BORN))
++ return 0;
++ smp_rmb();
++
+ if (sb->s_op && sb->s_op->nr_cached_objects)
+ total_objects = sb->s_op->nr_cached_objects(sb, sc);
+
+@@ -1133,6 +1143,14 @@ mount_fs(struct file_system_type *type,
+ sb = root->d_sb;
+ BUG_ON(!sb);
+ WARN_ON(!sb->s_bdi);
++
++ /*
++ * Write barrier is for super_cache_count(). We place it before setting
++ * MS_BORN as the data dependency between the two functions is the
++ * superblock structure contents that we just set up, not the MS_BORN
++ * flag.
++ */
++ smp_wmb();
+ sb->s_flags |= MS_BORN;
+
+ error = security_sb_kern_mount(sb, flags, secdata);
--- /dev/null
+From 399d1404be660d355192ff4df5ccc3f4159ec1e4 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 31 Mar 2018 12:58:51 -0700
+Subject: inet: frags: get rid of inet_frag_evicting()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 399d1404be660d355192ff4df5ccc3f4159ec1e4 upstream.
+
+This refactors ip_expire() since one indentation level is removed.
+
+Note: in the future, we should try hard to avoid the skb_clone()
+since this is a serious performance cost.
+Under DDoS, the ICMP message won't be sent because of rate limits.
+
+The fact that ip6_expire_frag_queue() does not use skb_clone() is
+disturbing too. Presumably IPv6 has the same issue as the one
+we fixed in commit ec4fbd64751d
+("inet: frag: release spinlock before calling icmp_send()")
+
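+For reference, this keeps the unlock-before-send dance from ec4fbd64751d
+(condensed from the hunk below): icmp_send() must not run under the frag
+queue spinlock.
+
+  clone = skb_clone(head, GFP_ATOMIC);
+  if (clone) {
+          spin_unlock(&qp->q.lock);
+          icmp_send(clone, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+          consume_skb(clone);
+          goto out_rcu_unlock;
+  }
+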
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/net/inet_frag.h | 5 ---
+ net/ipv4/ip_fragment.c | 66 +++++++++++++++++++++++-------------------------
+ net/ipv6/reassembly.c | 4 --
+ 3 files changed, 32 insertions(+), 43 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -123,11 +123,6 @@ static inline void inet_frag_put(struct
+ inet_frag_destroy(q, f);
+ }
+
+-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+-{
+- return !hlist_unhashed(&q->list_evictor);
+-}
+-
+ /* Memory Tracking Functions. */
+
+ static inline int frag_mem_limit(struct netns_frags *nf)
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -194,8 +194,11 @@ static bool frag_expire_skip_icmp(u32 us
+ */
+ static void ip_expire(unsigned long arg)
+ {
+- struct ipq *qp;
++ struct sk_buff *clone, *head;
++ const struct iphdr *iph;
+ struct net *net;
++ struct ipq *qp;
++ int err;
+
+ qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+ net = container_of(qp->q.net, struct net, ipv4.frags);
+@@ -209,45 +212,40 @@ static void ip_expire(unsigned long arg)
+ ipq_kill(qp);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+
+- if (!inet_frag_evicting(&qp->q)) {
+- struct sk_buff *clone, *head = qp->q.fragments;
+- const struct iphdr *iph;
+- int err;
+-
+- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
++ head = qp->q.fragments;
+
+- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+- goto out;
++ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+
+- head->dev = dev_get_by_index_rcu(net, qp->iif);
+- if (!head->dev)
+- goto out;
++ if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
++ goto out;
+
++ head->dev = dev_get_by_index_rcu(net, qp->iif);
++ if (!head->dev)
++ goto out;
+
+- /* skb has no dst, perform route lookup again */
+- iph = ip_hdr(head);
+- err = ip_route_input_noref(head, iph->daddr, iph->saddr,
++ /* skb has no dst, perform route lookup again */
++ iph = ip_hdr(head);
++ err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+ iph->tos, head->dev);
+- if (err)
+- goto out;
++ if (err)
++ goto out;
++
++ /* Only an end host needs to send an ICMP
++ * "Fragment Reassembly Timeout" message, per RFC792.
++ */
++ if (frag_expire_skip_icmp(qp->user) &&
++ (skb_rtable(head)->rt_type != RTN_LOCAL))
++ goto out;
++
++ clone = skb_clone(head, GFP_ATOMIC);
+
+- /* Only an end host needs to send an ICMP
+- * "Fragment Reassembly Timeout" message, per RFC792.
+- */
+- if (frag_expire_skip_icmp(qp->user) &&
+- (skb_rtable(head)->rt_type != RTN_LOCAL))
+- goto out;
+-
+- clone = skb_clone(head, GFP_ATOMIC);
+-
+- /* Send an ICMP "Fragment Reassembly Timeout" message. */
+- if (clone) {
+- spin_unlock(&qp->q.lock);
+- icmp_send(clone, ICMP_TIME_EXCEEDED,
+- ICMP_EXC_FRAGTIME, 0);
+- consume_skb(clone);
+- goto out_rcu_unlock;
+- }
++ /* Send an ICMP "Fragment Reassembly Timeout" message. */
++ if (clone) {
++ spin_unlock(&qp->q.lock);
++ icmp_send(clone, ICMP_TIME_EXCEEDED,
++ ICMP_EXC_FRAGTIME, 0);
++ consume_skb(clone);
++ goto out_rcu_unlock;
+ }
+ out:
+ spin_unlock(&qp->q.lock);
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -146,10 +146,6 @@ void ip6_expire_frag_queue(struct net *n
+ goto out_rcu_unlock;
+
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+-
+- if (inet_frag_evicting(&fq->q))
+- goto out_rcu_unlock;
+-
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+
+ /* Don't send error if the first segment did not arrive. */
--- /dev/null
+From 353c9cb360874e737fb000545f783df756c06f9a Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Sat, 11 Aug 2018 20:27:24 +0000
+Subject: ip: add helpers to process in-order fragments faster.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 353c9cb360874e737fb000545f783df756c06f9a upstream.
+
+This patch introduces several helper functions/macros that will be
+used in the follow-up patch. No runtime changes yet.
+
+The new logic (fully implemented in the second patch) is as follows:
+
+* Nodes in the rb-tree will now contain not single fragments, but lists
+ of consecutive fragments ("runs").
+
+* At each point in time, the current "active" run at the tail is
+ maintained/tracked. Fragments that arrive in-order, adjacent
+ to the previous tail fragment, are added to this tail run without
+ triggering the re-balancing of the rb-tree.
+
+* If a fragment arrives out of order with the offset _before_ the tail run,
+ it is inserted into the rb-tree as a single fragment.
+
+* If a fragment arrives after the current tail fragment (with a gap),
+ it starts a new "tail" run, and is inserted into the rb-tree
+ at the end as the head of the new run.
+
+skb->cb is used to store additional information
+needed here (suggested by Eric Dumazet).
+
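+A condensed sketch of the resulting three-way decision (using the helpers
+introduced below; overlap handling and locking omitted):
+
+  prev_tail = q->fragments_tail;
+  if (!prev_tail) {
+          ip4_frag_create_run(q, skb);         /* first fragment */
+  } else if (offset == FRAG_CB(prev_tail)->offset + prev_tail->len) {
+          ip4_frag_append_to_last_run(q, skb); /* in order: extend tail run */
+  } else if (offset > FRAG_CB(prev_tail)->offset + prev_tail->len) {
+          ip4_frag_create_run(q, skb);         /* gap: new tail run */
+  } else {
+          /* Before the tail run: rb-tree binary-search insert of a
+           * single-fragment run (see the follow-up patch).
+           */
+  }
+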
+Reported-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h | 4 ++
+ net/ipv4/ip_fragment.c | 74 +++++++++++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 74 insertions(+), 4 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -48,6 +48,7 @@ struct inet_frag_queue {
+ struct sk_buff *fragments; /* Used in IPv6. */
+ struct rb_root rb_fragments; /* Used in IPv4. */
+ struct sk_buff *fragments_tail;
++ struct sk_buff *last_run_head;
+ ktime_t stamp;
+ int len;
+ int meat;
+@@ -118,6 +119,9 @@ struct inet_frag_queue *inet_frag_find(s
+ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+ const char *prefix);
+
++/* Free all skbs in the queue; return the sum of their truesizes. */
++unsigned int inet_frag_rbtree_purge(struct rb_root *root);
++
+ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
+ {
+ if (atomic_dec_and_test(&q->refcnt))
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -58,13 +58,57 @@
+ static int sysctl_ipfrag_max_dist __read_mostly = 64;
+ static const char ip_frag_cache_name[] = "ip4-frags";
+
+-struct ipfrag_skb_cb
+-{
++/* Use skb->cb to track consecutive/adjacent fragments coming at
++ * the end of the queue. Nodes in the rb-tree queue will
++ * contain "runs" of one or more adjacent fragments.
++ *
++ * Invariants:
++ * - next_frag is NULL at the tail of a "run";
++ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
++ */
++struct ipfrag_skb_cb {
+ struct inet_skb_parm h;
+- int offset;
++ int offset;
++ struct sk_buff *next_frag;
++ int frag_run_len;
+ };
+
+-#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
++#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
++
++static void ip4_frag_init_run(struct sk_buff *skb)
++{
++ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
++
++ FRAG_CB(skb)->next_frag = NULL;
++ FRAG_CB(skb)->frag_run_len = skb->len;
++}
++
++/* Append skb to the last "run". */
++static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
++ struct sk_buff *skb)
++{
++ RB_CLEAR_NODE(&skb->rbnode);
++ FRAG_CB(skb)->next_frag = NULL;
++
++ FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
++ FRAG_CB(q->fragments_tail)->next_frag = skb;
++ q->fragments_tail = skb;
++}
++
++/* Create a new "run" with the skb. */
++static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
++{
++ if (q->last_run_head)
++ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
++ &q->last_run_head->rbnode.rb_right);
++ else
++ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
++ rb_insert_color(&skb->rbnode, &q->rb_fragments);
++
++ ip4_frag_init_run(skb);
++ q->fragments_tail = skb;
++ q->last_run_head = skb;
++}
+
+ /* Describe an entry in the "incomplete datagrams" queue. */
+ struct ipq {
+@@ -721,6 +765,28 @@ struct sk_buff *ip_check_defrag(struct n
+ }
+ EXPORT_SYMBOL(ip_check_defrag);
+
++unsigned int inet_frag_rbtree_purge(struct rb_root *root)
++{
++ struct rb_node *p = rb_first(root);
++ unsigned int sum = 0;
++
++ while (p) {
++ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
++
++ p = rb_next(p);
++ rb_erase(&skb->rbnode, root);
++ while (skb) {
++ struct sk_buff *next = FRAG_CB(skb)->next_frag;
++
++ sum += skb->truesize;
++ kfree_skb(skb);
++ skb = next;
++ }
++ }
++ return sum;
++}
++EXPORT_SYMBOL(inet_frag_rbtree_purge);
++
+ #ifdef CONFIG_SYSCTL
+ static int zero;
+
--- /dev/null
+From 7969e5c40dfd04799d4341f1b7cd266b6e47f227 Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Thu, 2 Aug 2018 23:34:37 +0000
+Subject: ip: discard IPv4 datagrams with overlapping segments.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 7969e5c40dfd04799d4341f1b7cd266b6e47f227 upstream.
+
+This behavior is required in IPv6, and there is little need
+to tolerate overlapping fragments in IPv4. This change
+simplifies the code and eliminates potential DDoS attack vectors.
+
+Tested: ran ip_defrag selftest (not yet available upstream).
+
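+The test applied below to the prev/next neighbours is the standard
+half-open interval overlap check; as a standalone predicate (illustrative
+helper, not one the patch adds):
+
+  /* Fragments [ao, ae) and [bo, be) overlap iff each one starts
+   * before the other one ends.
+   */
+  static bool frags_overlap(int ao, int ae, int bo, int be)
+  {
+          return ao < be && bo < ae;
+  }
+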
+Suggested-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Acked-by: Stephen Hemminger <stephen@networkplumber.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/uapi/linux/snmp.h | 1
+ net/ipv4/ip_fragment.c | 73 ++++++++++++----------------------------------
+ net/ipv4/proc.c | 1
+ 3 files changed, 22 insertions(+), 53 deletions(-)
+
+--- a/include/uapi/linux/snmp.h
++++ b/include/uapi/linux/snmp.h
+@@ -55,6 +55,7 @@ enum
+ IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */
+ IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */
+ IPSTATS_MIB_CEPKTS, /* InCEPkts */
++ IPSTATS_MIB_REASM_OVERLAPS, /* ReasmOverlaps */
+ __IPSTATS_MIB_MAX
+ };
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -342,6 +342,7 @@ static int ip_frag_reinit(struct ipq *qp
+ /* Add new segment to existing queue. */
+ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+ {
++ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct sk_buff *prev, *next;
+ struct net_device *dev;
+ unsigned int fragsize;
+@@ -422,60 +423,22 @@ static int ip_frag_queue(struct ipq *qp,
+ }
+
+ found:
+- /* We found where to put this one. Check for overlap with
+- * preceding fragment, and, if needed, align things so that
+- * any overlaps are eliminated.
++ /* RFC5722, Section 4, amended by Errata ID : 3089
++ * When reassembling an IPv6 datagram, if
++ * one or more its constituent fragments is determined to be an
++ * overlapping fragment, the entire datagram (and any constituent
++ * fragments) MUST be silently discarded.
++ *
++ * We do the same here for IPv4.
+ */
+- if (prev) {
+- int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+-
+- if (i > 0) {
+- offset += i;
+- err = -EINVAL;
+- if (end <= offset)
+- goto err;
+- err = -ENOMEM;
+- if (!pskb_pull(skb, i))
+- goto err;
+- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+- skb->ip_summed = CHECKSUM_NONE;
+- }
+- }
+-
+- err = -ENOMEM;
+-
+- while (next && FRAG_CB(next)->offset < end) {
+- int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+-
+- if (i < next->len) {
+- /* Eat head of the next overlapped fragment
+- * and leave the loop. The next ones cannot overlap.
+- */
+- if (!pskb_pull(next, i))
+- goto err;
+- FRAG_CB(next)->offset += i;
+- qp->q.meat -= i;
+- if (next->ip_summed != CHECKSUM_UNNECESSARY)
+- next->ip_summed = CHECKSUM_NONE;
+- break;
+- } else {
+- struct sk_buff *free_it = next;
+-
+- /* Old fragment is completely overridden with
+- * new one drop it.
+- */
+- next = next->next;
+-
+- if (prev)
+- prev->next = next;
+- else
+- qp->q.fragments = next;
+-
+- qp->q.meat -= free_it->len;
+- sub_frag_mem_limit(qp->q.net, free_it->truesize);
+- kfree_skb(free_it);
+- }
+- }
++ /* Is there an overlap with the previous fragment? */
++ if (prev &&
++ (FRAG_CB(prev)->offset + prev->len) > offset)
++ goto discard_qp;
++
++ /* Is there an overlap with the next fragment? */
++ if (next && FRAG_CB(next)->offset < end)
++ goto discard_qp;
+
+ FRAG_CB(skb)->offset = offset;
+
+@@ -522,6 +485,10 @@ found:
+ skb_dst_drop(skb);
+ return -EINPROGRESS;
+
++discard_qp:
++ ipq_kill(qp);
++ err = -EINVAL;
++ IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
+ err:
+ kfree_skb(skb);
+ return err;
+--- a/net/ipv4/proc.c
++++ b/net/ipv4/proc.c
+@@ -132,6 +132,7 @@ static const struct snmp_mib snmp4_ipext
+ SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+ SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+ SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
++ SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
+ SNMP_MIB_SENTINEL
+ };
+
--- /dev/null
+From 5d407b071dc369c26a38398326ee2be53651cfe4 Mon Sep 17 00:00:00 2001
+From: Taehee Yoo <ap420073@gmail.com>
+Date: Mon, 10 Sep 2018 02:47:05 +0900
+Subject: ip: frags: fix crash in ip_do_fragment()
+
+From: Taehee Yoo <ap420073@gmail.com>
+
+commit 5d407b071dc369c26a38398326ee2be53651cfe4 upstream.
+
+A kernel crash occurs when a defragmented packet is fragmented
+again in ip_do_fragment().
+In the defragment routine, skb_orphan() is called and
+skb->ip_defrag_offset is set. But skb->sk and
+skb->ip_defrag_offset are the same union member, so
+frag->sk is not NULL.
+Hence the crash occurs at the skb->sk check in ip_do_fragment() when
+a defragmented packet is fragmented.
+
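+An abridged view of the aliasing the message describes (upstream
+struct sk_buff; illustration only):
+
+  union {
+          struct sock *sk;       /* tested by ip_do_fragment() */
+          int ip_defrag_offset;  /* written while queued for reassembly */
+  };
+
+Storing ip_defrag_offset leaves a non-NULL value visible through ->sk,
+which trips the sk check in ip_do_fragment(); the fix below clears
+fp->sk in the reassembly loop before the skb is reused.
+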
+test commands:
+ %iptables -t nat -I POSTROUTING -j MASQUERADE
+ %hping3 192.168.4.2 -s 1000 -p 2000 -d 60000
+
+splat looks like:
+[ 261.069429] kernel BUG at net/ipv4/ip_output.c:636!
+[ 261.075753] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
+[ 261.083854] CPU: 1 PID: 1349 Comm: hping3 Not tainted 4.19.0-rc2+ #3
+[ 261.100977] RIP: 0010:ip_do_fragment+0x1613/0x2600
+[ 261.106945] Code: e8 e2 38 e3 fe 4c 8b 44 24 18 48 8b 74 24 08 e9 92 f6 ff ff 80 3c 02 00 0f 85 da 07 00 00 48 8b b5 d0 00 00 00 e9 25 f6 ff ff <0f> 0b 0f 0b 44 8b 54 24 58 4c 8b 4c 24 18 4c 8b 5c 24 60 4c 8b 6c
+[ 261.127015] RSP: 0018:ffff8801031cf2c0 EFLAGS: 00010202
+[ 261.134156] RAX: 1ffff1002297537b RBX: ffffed0020639e6e RCX: 0000000000000004
+[ 261.142156] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880114ba9bd8
+[ 261.150157] RBP: ffff880114ba8a40 R08: ffffed0022975395 R09: ffffed0022975395
+[ 261.158157] R10: 0000000000000001 R11: ffffed0022975394 R12: ffff880114ba9ca4
+[ 261.166159] R13: 0000000000000010 R14: ffff880114ba9bc0 R15: dffffc0000000000
+[ 261.174169] FS: 00007fbae2199700(0000) GS:ffff88011b400000(0000) knlGS:0000000000000000
+[ 261.183012] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 261.189013] CR2: 00005579244fe000 CR3: 0000000119bf4000 CR4: 00000000001006e0
+[ 261.198158] Call Trace:
+[ 261.199018] ? dst_output+0x180/0x180
+[ 261.205011] ? save_trace+0x300/0x300
+[ 261.209018] ? ip_copy_metadata+0xb00/0xb00
+[ 261.213034] ? sched_clock_local+0xd4/0x140
+[ 261.218158] ? kill_l4proto+0x120/0x120 [nf_conntrack]
+[ 261.223014] ? rt_cpu_seq_stop+0x10/0x10
+[ 261.227014] ? find_held_lock+0x39/0x1c0
+[ 261.233008] ip_finish_output+0x51d/0xb50
+[ 261.237006] ? ip_fragment.constprop.56+0x220/0x220
+[ 261.243011] ? nf_ct_l4proto_register_one+0x5b0/0x5b0 [nf_conntrack]
+[ 261.250152] ? rcu_is_watching+0x77/0x120
+[ 261.255010] ? nf_nat_ipv4_out+0x1e/0x2b0 [nf_nat_ipv4]
+[ 261.261033] ? nf_hook_slow+0xb1/0x160
+[ 261.265007] ip_output+0x1c7/0x710
+[ 261.269005] ? ip_mc_output+0x13f0/0x13f0
+[ 261.273002] ? __local_bh_enable_ip+0xe9/0x1b0
+[ 261.278152] ? ip_fragment.constprop.56+0x220/0x220
+[ 261.282996] ? nf_hook_slow+0xb1/0x160
+[ 261.287007] raw_sendmsg+0x21f9/0x4420
+[ 261.291008] ? dst_output+0x180/0x180
+[ 261.297003] ? sched_clock_cpu+0x126/0x170
+[ 261.301003] ? find_held_lock+0x39/0x1c0
+[ 261.306155] ? stop_critical_timings+0x420/0x420
+[ 261.311004] ? check_flags.part.36+0x450/0x450
+[ 261.315005] ? _raw_spin_unlock_irq+0x29/0x40
+[ 261.320995] ? _raw_spin_unlock_irq+0x29/0x40
+[ 261.326142] ? cyc2ns_read_end+0x10/0x10
+[ 261.330139] ? raw_bind+0x280/0x280
+[ 261.334138] ? sched_clock_cpu+0x126/0x170
+[ 261.338995] ? check_flags.part.36+0x450/0x450
+[ 261.342991] ? __lock_acquire+0x4500/0x4500
+[ 261.348994] ? inet_sendmsg+0x11c/0x500
+[ 261.352989] ? dst_output+0x180/0x180
+[ 261.357012] inet_sendmsg+0x11c/0x500
+[ ... ]
+
+v2:
+ - clear skb->sk in the reassembly routine. (Eric Dumazet)
+
+Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Taehee Yoo <ap420073@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/ip_fragment.c | 1 +
+ net/ipv6/netfilter/nf_conntrack_reasm.c | 1 +
+ 2 files changed, 2 insertions(+)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -661,6 +661,7 @@ static int ip_frag_reasm(struct ipq *qp,
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
++ fp->sk = NULL;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -454,6 +454,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
++ fp->sk = NULL;
+ }
+ sub_frag_mem_limit(fq->q.net, head->truesize);
+
--- /dev/null
+From a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Sat, 11 Aug 2018 20:27:25 +0000
+Subject: ip: process in-order fragments efficiently
+
+From: Peter Oskolkov <posk@google.com>
+
+commit a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c upstream.
+
+This patch changes the runtime behavior of IP defrag queue:
+incoming in-order fragments are added to the end of the current
+list/"run" of in-order fragments at the tail.
+
+On some workloads, UDP stream performance is substantially improved:
+
+RX: ./udp_stream -F 10 -T 2 -l 60
+TX: ./udp_stream -c -H <host> -F 10 -T 5 -l 60
+
+with this patchset applied on a 10Gbps receiver:
+
+ throughput=9524.18
+ throughput_units=Mbit/s
+
+upstream (net-next):
+
+ throughput=4608.93
+ throughput_units=Mbit/s
+
+Reported-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/inet_fragment.c | 2
+ net/ipv4/ip_fragment.c | 110 +++++++++++++++++++++++++++++------------------
+ 2 files changed, 70 insertions(+), 42 deletions(-)
+
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -315,7 +315,7 @@ void inet_frag_destroy(struct inet_frag_
+ fp = xp;
+ } while (fp);
+ } else {
+- sum_truesize = skb_rbtree_purge(&q->rb_fragments);
++ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
+ }
+ sum = sum_truesize + f->qsize;
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -139,8 +139,8 @@ int ip_frag_mem(struct net *net)
+ return sum_frag_mem_limit(&net->ipv4.frags);
+ }
+
+-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+- struct net_device *dev);
++static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
++ struct sk_buff *prev_tail, struct net_device *dev);
+
+ struct ip4_create_arg {
+ struct iphdr *iph;
+@@ -271,7 +271,12 @@ static void ip_expire(unsigned long arg)
+ head = skb_rb_first(&qp->q.rb_fragments);
+ if (!head)
+ goto out;
+- rb_erase(&head->rbnode, &qp->q.rb_fragments);
++ if (FRAG_CB(head)->next_frag)
++ rb_replace_node(&head->rbnode,
++ &FRAG_CB(head)->next_frag->rbnode,
++ &qp->q.rb_fragments);
++ else
++ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+ }
+@@ -373,7 +378,7 @@ static int ip_frag_reinit(struct ipq *qp
+ return -ETIMEDOUT;
+ }
+
+- sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
++ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
+ sub_frag_mem_limit(qp->q.net, sum_truesize);
+
+ qp->q.flags = 0;
+@@ -382,6 +387,7 @@ static int ip_frag_reinit(struct ipq *qp
+ qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
+ qp->q.fragments_tail = NULL;
++ qp->q.last_run_head = NULL;
+ qp->iif = 0;
+ qp->ecn = 0;
+
+@@ -393,7 +399,7 @@ static int ip_frag_queue(struct ipq *qp,
+ {
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct rb_node **rbn, *parent;
+- struct sk_buff *skb1;
++ struct sk_buff *skb1, *prev_tail;
+ struct net_device *dev;
+ unsigned int fragsize;
+ int flags, offset;
+@@ -471,38 +477,41 @@ static int ip_frag_queue(struct ipq *qp,
+ */
+
+ /* Find out where to put this fragment. */
+- skb1 = qp->q.fragments_tail;
+- if (!skb1) {
+- /* This is the first fragment we've received. */
+- rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+- qp->q.fragments_tail = skb;
+- } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) {
+- /* This is the common/special case: skb goes to the end. */
++ prev_tail = qp->q.fragments_tail;
++ if (!prev_tail)
++ ip4_frag_create_run(&qp->q, skb); /* First fragment. */
++ else if (FRAG_CB(prev_tail)->offset + prev_tail->len < end) {
++ /* This is the common case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+- if (offset < (FRAG_CB(skb1)->offset + skb1->len))
++ if (offset < FRAG_CB(prev_tail)->offset + prev_tail->len)
+ goto discard_qp;
+- /* Insert after skb1. */
+- rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+- qp->q.fragments_tail = skb;
++ if (offset == FRAG_CB(prev_tail)->offset + prev_tail->len)
++ ip4_frag_append_to_last_run(&qp->q, skb);
++ else
++ ip4_frag_create_run(&qp->q, skb);
+ } else {
+- /* Binary search. Note that skb can become the first fragment, but
+- * not the last (covered above). */
++ /* Binary search. Note that skb can become the first fragment,
++ * but not the last (covered above).
++ */
+ rbn = &qp->q.rb_fragments.rb_node;
+ do {
+ parent = *rbn;
+ skb1 = rb_to_skb(parent);
+ if (end <= FRAG_CB(skb1)->offset)
+ rbn = &parent->rb_left;
+- else if (offset >= FRAG_CB(skb1)->offset + skb1->len)
++ else if (offset >= FRAG_CB(skb1)->offset +
++ FRAG_CB(skb1)->frag_run_len)
+ rbn = &parent->rb_right;
+ else /* Found an overlap with skb1. */
+ goto discard_qp;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+- * one of its NULL left/right children. Insert skb. */
++ * one of its NULL left/right children. Insert skb.
++ */
++ ip4_frag_init_run(skb);
+ rb_link_node(&skb->rbnode, parent, rbn);
++ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+ }
+- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+
+ if (dev) {
+ qp->iif = dev->ifindex;
+@@ -531,7 +540,7 @@ static int ip_frag_queue(struct ipq *qp,
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+- err = ip_frag_reasm(qp, skb, dev);
++ err = ip_frag_reasm(qp, skb, prev_tail, dev);
+ skb->_skb_refdst = orefdst;
+ return err;
+ }
+@@ -550,7 +559,7 @@ err:
+
+ /* Build a new IP datagram from all its fragments. */
+ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+- struct net_device *dev)
++ struct sk_buff *prev_tail, struct net_device *dev)
+ {
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct iphdr *iph;
+@@ -575,10 +584,16 @@ static int ip_frag_reasm(struct ipq *qp,
+ if (!fp)
+ goto out_nomem;
+
+- rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
++ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
++ if (RB_EMPTY_NODE(&skb->rbnode))
++ FRAG_CB(prev_tail)->next_frag = fp;
++ else
++ rb_replace_node(&skb->rbnode, &fp->rbnode,
++ &qp->q.rb_fragments);
+ if (qp->q.fragments_tail == skb)
+ qp->q.fragments_tail = fp;
+ skb_morph(skb, head);
++ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &qp->q.rb_fragments);
+ consume_skb(head);
+@@ -614,7 +629,7 @@ static int ip_frag_reasm(struct ipq *qp,
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->len = clone->data_len = head->data_len - plen;
+- skb->truesize += clone->truesize;
++ head->truesize += clone->truesize;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ add_frag_mem_limit(qp->q.net, clone->truesize);
+@@ -627,24 +642,36 @@ static int ip_frag_reasm(struct ipq *qp,
+ skb_push(head, head->data - skb_network_header(head));
+
+ /* Traverse the tree in order, to build frag_list. */
++ fp = FRAG_CB(head)->next_frag;
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+- while (rbn) {
+- struct rb_node *rbnext = rb_next(rbn);
+- fp = rb_to_skb(rbn);
+- rb_erase(rbn, &qp->q.rb_fragments);
+- rbn = rbnext;
+- *nextp = fp;
+- nextp = &fp->next;
+- fp->prev = NULL;
+- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+- head->data_len += fp->len;
+- head->len += fp->len;
+- if (head->ip_summed != fp->ip_summed)
+- head->ip_summed = CHECKSUM_NONE;
+- else if (head->ip_summed == CHECKSUM_COMPLETE)
+- head->csum = csum_add(head->csum, fp->csum);
+- head->truesize += fp->truesize;
++ while (rbn || fp) {
++ /* fp points to the next sk_buff in the current run;
++ * rbn points to the next run.
++ */
++ /* Go through the current run. */
++ while (fp) {
++ *nextp = fp;
++ nextp = &fp->next;
++ fp->prev = NULL;
++ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
++ head->data_len += fp->len;
++ head->len += fp->len;
++ if (head->ip_summed != fp->ip_summed)
++ head->ip_summed = CHECKSUM_NONE;
++ else if (head->ip_summed == CHECKSUM_COMPLETE)
++ head->csum = csum_add(head->csum, fp->csum);
++ head->truesize += fp->truesize;
++ fp = FRAG_CB(fp)->next_frag;
++ }
++ /* Move to the next run. */
++ if (rbn) {
++ struct rb_node *rbnext = rb_next(rbn);
++
++ fp = rb_to_skb(rbn);
++ rb_erase(rbn, &qp->q.rb_fragments);
++ rbn = rbnext;
++ }
+ }
+ sub_frag_mem_limit(qp->q.net, head->truesize);
+
+@@ -680,6 +707,7 @@ static int ip_frag_reasm(struct ipq *qp,
+ qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
+ qp->q.fragments_tail = NULL;
++ qp->q.last_run_head = NULL;
+ return 0;
+
+ out_nomem:
--- /dev/null
+From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Thu, 2 Aug 2018 23:34:39 +0000
+Subject: ip: use rb trees for IP frag queue.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit fa0f527358bd900ef92f925878ed6bfbd51305cc upstream.
+
+Similar to TCP OOO RX queue, it makes sense to use rb trees to store
+IP fragments, so that OOO fragments are inserted faster.
+
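+The gain is algorithmic: an out-of-order fragment previously cost an O(n)
+walk of the singly linked fragment list, while an rb-tree keyed by
+fragment offset inserts in O(log n). A minimal sketch of the keyed insert
+with overlap detection (condensed from ip_frag_queue() below):
+
+  struct rb_node **rbn = &q->rb_fragments.rb_node, *parent = NULL;
+
+  while (*rbn) {
+          struct sk_buff *skb1;
+
+          parent = *rbn;
+          skb1 = rb_entry(parent, struct sk_buff, rbnode);
+          if (end <= FRAG_CB(skb1)->offset)
+                  rbn = &parent->rb_left;   /* sorts before skb1 */
+          else if (offset >= FRAG_CB(skb1)->offset + skb1->len)
+                  rbn = &parent->rb_right;  /* sorts after skb1 */
+          else
+                  goto discard_qp;          /* overlap: drop the queue */
+  }
+  rb_link_node(&skb->rbnode, parent, rbn);
+  rb_insert_color(&skb->rbnode, &q->rb_fragments);
+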
+Tested:
+
+- a follow-up patch contains a rather comprehensive ip defrag
+ self-test (functional)
+- ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`:
+ netstat --statistics
+ Ip:
+ 282078937 total packets received
+ 0 forwarded
+ 0 incoming packets discarded
+ 946760 incoming packets delivered
+ 18743456 requests sent out
+ 101 fragments dropped after timeout
+ 282077129 reassemblies required
+ 944952 packets reassembled ok
+ 262734239 packet reassembles failed
+ (The numbers/stats above are somewhat better re:
+ reassemblies vs a kernel without this patchset. More
+ comprehensive performance testing TBD).
+
+Reported-by: Jann Horn <jannh@google.com>
+Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/skbuff.h | 2
+ include/net/inet_frag.h | 3
+ net/ipv4/inet_fragment.c | 14 +-
+ net/ipv4/ip_fragment.c | 190 +++++++++++++++++---------------
+ net/ipv6/netfilter/nf_conntrack_reasm.c | 1
+ net/ipv6/reassembly.c | 1
+ 6 files changed, 120 insertions(+), 91 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -556,7 +556,7 @@ struct sk_buff {
+ struct skb_mstamp skb_mstamp;
+ };
+ };
+- struct rb_node rbnode; /* used in netem & tcp stack */
++ struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
+ };
+ struct sock *sk;
+ struct net_device *dev;
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -45,7 +45,8 @@ struct inet_frag_queue {
+ struct timer_list timer;
+ struct hlist_node list;
+ atomic_t refcnt;
+- struct sk_buff *fragments;
++ struct sk_buff *fragments; /* Used in IPv6. */
++ struct rb_root rb_fragments; /* Used in IPv4. */
+ struct sk_buff *fragments_tail;
+ ktime_t stamp;
+ int len;
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -306,12 +306,16 @@ void inet_frag_destroy(struct inet_frag_
+ /* Release all fragment data. */
+ fp = q->fragments;
+ nf = q->net;
+- while (fp) {
+- struct sk_buff *xp = fp->next;
++ if (fp) {
++ do {
++ struct sk_buff *xp = fp->next;
+
+- sum_truesize += fp->truesize;
+- frag_kfree_skb(nf, f, fp);
+- fp = xp;
++ sum_truesize += fp->truesize;
++ kfree_skb(fp);
++ fp = xp;
++ } while (fp);
++ } else {
++ sum_truesize = skb_rbtree_purge(&q->rb_fragments);
+ }
+ sum = sum_truesize + f->qsize;
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -194,7 +194,7 @@ static bool frag_expire_skip_icmp(u32 us
+ */
+ static void ip_expire(unsigned long arg)
+ {
+- struct sk_buff *clone, *head;
++ struct sk_buff *head = NULL;
+ const struct iphdr *iph;
+ struct net *net;
+ struct ipq *qp;
+@@ -211,14 +211,31 @@ static void ip_expire(unsigned long arg)
+
+ ipq_kill(qp);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+-
+- head = qp->q.fragments;
+-
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+
+- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
++ if (!qp->q.flags & INET_FRAG_FIRST_IN)
+ goto out;
+
++ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
++ * pull the head out of the tree in order to be able to
++ * deal with head->dev.
++ */
++ if (qp->q.fragments) {
++ head = qp->q.fragments;
++ qp->q.fragments = head->next;
++ } else {
++ head = skb_rb_first(&qp->q.rb_fragments);
++ if (!head)
++ goto out;
++ rb_erase(&head->rbnode, &qp->q.rb_fragments);
++ memset(&head->rbnode, 0, sizeof(head->rbnode));
++ barrier();
++ }
++ if (head == qp->q.fragments_tail)
++ qp->q.fragments_tail = NULL;
++
++ sub_frag_mem_limit(qp->q.net, head->truesize);
++
+ head->dev = dev_get_by_index_rcu(net, qp->iif);
+ if (!head->dev)
+ goto out;
+@@ -237,20 +254,17 @@ static void ip_expire(unsigned long arg)
+ (skb_rtable(head)->rt_type != RTN_LOCAL))
+ goto out;
+
+- clone = skb_clone(head, GFP_ATOMIC);
+-
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
+- if (clone) {
+- spin_unlock(&qp->q.lock);
+- icmp_send(clone, ICMP_TIME_EXCEEDED,
+- ICMP_EXC_FRAGTIME, 0);
+- consume_skb(clone);
+- goto out_rcu_unlock;
+- }
++ spin_unlock(&qp->q.lock);
++ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
++ goto out_rcu_unlock;
++
+ out:
+ spin_unlock(&qp->q.lock);
+ out_rcu_unlock:
+ rcu_read_unlock();
++ if (head)
++ kfree_skb(head);
+ ipq_put(qp);
+ }
+
+@@ -294,7 +308,7 @@ static int ip_frag_too_far(struct ipq *q
+ end = atomic_inc_return(&peer->rid);
+ qp->rid = end;
+
+- rc = qp->q.fragments && (end - start) > max;
++ rc = qp->q.fragments_tail && (end - start) > max;
+
+ if (rc) {
+ struct net *net;
+@@ -308,7 +322,6 @@ static int ip_frag_too_far(struct ipq *q
+
+ static int ip_frag_reinit(struct ipq *qp)
+ {
+- struct sk_buff *fp;
+ unsigned int sum_truesize = 0;
+
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+@@ -316,20 +329,14 @@ static int ip_frag_reinit(struct ipq *qp
+ return -ETIMEDOUT;
+ }
+
+- fp = qp->q.fragments;
+- do {
+- struct sk_buff *xp = fp->next;
+-
+- sum_truesize += fp->truesize;
+- kfree_skb(fp);
+- fp = xp;
+- } while (fp);
++ sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+ sub_frag_mem_limit(qp->q.net, sum_truesize);
+
+ qp->q.flags = 0;
+ qp->q.len = 0;
+ qp->q.meat = 0;
+ qp->q.fragments = NULL;
++ qp->q.rb_fragments = RB_ROOT;
+ qp->q.fragments_tail = NULL;
+ qp->iif = 0;
+ qp->ecn = 0;
+@@ -341,7 +348,8 @@ static int ip_frag_reinit(struct ipq *qp
+ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+ {
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+- struct sk_buff *prev, *next;
++ struct rb_node **rbn, *parent;
++ struct sk_buff *skb1;
+ struct net_device *dev;
+ unsigned int fragsize;
+ int flags, offset;
+@@ -404,56 +412,60 @@ static int ip_frag_queue(struct ipq *qp,
+ if (err)
+ goto err;
+
+- /* Find out which fragments are in front and at the back of us
+- * in the chain of fragments so far. We must know where to put
+- * this fragment, right?
+- */
+- prev = qp->q.fragments_tail;
+- if (!prev || FRAG_CB(prev)->offset < offset) {
+- next = NULL;
+- goto found;
+- }
+- prev = NULL;
+- for (next = qp->q.fragments; next != NULL; next = next->next) {
+- if (FRAG_CB(next)->offset >= offset)
+- break; /* bingo! */
+- prev = next;
+- }
++ /* Note : skb->rbnode and skb->dev share the same location. */
++ dev = skb->dev;
++ /* Makes sure compiler wont do silly aliasing games */
++ barrier();
+
+-found:
+ /* RFC5722, Section 4, amended by Errata ID : 3089
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ *
+- * We do the same here for IPv4.
++ * We do the same here for IPv4 (and increment an snmp counter).
+ */
+- /* Is there an overlap with the previous fragment? */
+- if (prev &&
+- (FRAG_CB(prev)->offset + prev->len) > offset)
+- goto discard_qp;
+-
+- /* Is there an overlap with the next fragment? */
+- if (next && FRAG_CB(next)->offset < end)
+- goto discard_qp;
+-
+- FRAG_CB(skb)->offset = offset;
+
+- /* Insert this fragment in the chain of fragments. */
+- skb->next = next;
+- if (!next)
++ /* Find out where to put this fragment. */
++ skb1 = qp->q.fragments_tail;
++ if (!skb1) {
++ /* This is the first fragment we've received. */
++ rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
++ qp->q.fragments_tail = skb;
++ } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) {
++ /* This is the common/special case: skb goes to the end. */
++ /* Detect and discard overlaps. */
++ if (offset < (FRAG_CB(skb1)->offset + skb1->len))
++ goto discard_qp;
++ /* Insert after skb1. */
++ rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+ qp->q.fragments_tail = skb;
+- if (prev)
+- prev->next = skb;
+- else
+- qp->q.fragments = skb;
++ } else {
++ /* Binary search. Note that skb can become the first fragment, but
++ * not the last (covered above). */
++ rbn = &qp->q.rb_fragments.rb_node;
++ do {
++ parent = *rbn;
++ skb1 = rb_to_skb(parent);
++ if (end <= FRAG_CB(skb1)->offset)
++ rbn = &parent->rb_left;
++ else if (offset >= FRAG_CB(skb1)->offset + skb1->len)
++ rbn = &parent->rb_right;
++ else /* Found an overlap with skb1. */
++ goto discard_qp;
++ } while (*rbn);
++ /* Here we have parent properly set, and rbn pointing to
++ * one of its NULL left/right children. Insert skb. */
++ rb_link_node(&skb->rbnode, parent, rbn);
++ }
++ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+
+- dev = skb->dev;
+ if (dev) {
+ qp->iif = dev->ifindex;
+ skb->dev = NULL;
+ }
++ FRAG_CB(skb)->offset = offset;
++
+ qp->q.stamp = skb->tstamp;
+ qp->q.meat += skb->len;
+ qp->ecn |= ecn;
+@@ -475,7 +487,7 @@ found:
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+- err = ip_frag_reasm(qp, prev, dev);
++ err = ip_frag_reasm(qp, skb, dev);
+ skb->_skb_refdst = orefdst;
+ return err;
+ }
+@@ -492,15 +504,15 @@ err:
+ return err;
+ }
+
+-
+ /* Build a new IP datagram from all its fragments. */
+-
+-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
++static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+ struct net_device *dev)
+ {
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct iphdr *iph;
+- struct sk_buff *fp, *head = qp->q.fragments;
++ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
++ struct sk_buff **nextp; /* To build frag_list. */
++ struct rb_node *rbn;
+ int len;
+ int ihlen;
+ int err;
+@@ -514,25 +526,21 @@ static int ip_frag_reasm(struct ipq *qp,
+ goto out_fail;
+ }
+ /* Make the one we just received the head. */
+- if (prev) {
+- head = prev->next;
+- fp = skb_clone(head, GFP_ATOMIC);
++ if (head != skb) {
++ fp = skb_clone(skb, GFP_ATOMIC);
+ if (!fp)
+ goto out_nomem;
+
+- fp->next = head->next;
+- if (!fp->next)
++ rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
++ if (qp->q.fragments_tail == skb)
+ qp->q.fragments_tail = fp;
+- prev->next = fp;
+-
+- skb_morph(head, qp->q.fragments);
+- head->next = qp->q.fragments->next;
+-
+- consume_skb(qp->q.fragments);
+- qp->q.fragments = head;
++ skb_morph(skb, head);
++ rb_replace_node(&head->rbnode, &skb->rbnode,
++ &qp->q.rb_fragments);
++ consume_skb(head);
++ head = skb;
+ }
+
+- WARN_ON(!head);
+ WARN_ON(FRAG_CB(head)->offset != 0);
+
+ /* Allocate a new buffer for the datagram. */
+@@ -557,24 +565,35 @@ static int ip_frag_reasm(struct ipq *qp,
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
+ goto out_nomem;
+- clone->next = head->next;
+- head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->len = clone->data_len = head->data_len - plen;
+- head->data_len -= clone->len;
+- head->len -= clone->len;
++ skb->truesize += clone->truesize;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ add_frag_mem_limit(qp->q.net, clone->truesize);
++ skb_shinfo(head)->frag_list = clone;
++ nextp = &clone->next;
++ } else {
++ nextp = &skb_shinfo(head)->frag_list;
+ }
+
+- skb_shinfo(head)->frag_list = head->next;
+ skb_push(head, head->data - skb_network_header(head));
+
+- for (fp=head->next; fp; fp = fp->next) {
++ /* Traverse the tree in order, to build frag_list. */
++ rbn = rb_next(&head->rbnode);
++ rb_erase(&head->rbnode, &qp->q.rb_fragments);
++ while (rbn) {
++ struct rb_node *rbnext = rb_next(rbn);
++ fp = rb_to_skb(rbn);
++ rb_erase(rbn, &qp->q.rb_fragments);
++ rbn = rbnext;
++ *nextp = fp;
++ nextp = &fp->next;
++ fp->prev = NULL;
++ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+@@ -585,7 +604,9 @@ static int ip_frag_reasm(struct ipq *qp,
+ }
+ sub_frag_mem_limit(qp->q.net, head->truesize);
+
++ *nextp = NULL;
+ head->next = NULL;
++ head->prev = NULL;
+ head->dev = dev;
+ head->tstamp = qp->q.stamp;
+ IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+@@ -613,6 +634,7 @@ static int ip_frag_reasm(struct ipq *qp,
+
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
+ qp->q.fragments = NULL;
++ qp->q.rb_fragments = RB_ROOT;
+ qp->q.fragments_tail = NULL;
+ return 0;
+
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -472,6 +472,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
+ head->csum);
+
+ fq->q.fragments = NULL;
++ fq->q.rb_fragments = RB_ROOT;
+ fq->q.fragments_tail = NULL;
+
+ /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -499,6 +499,7 @@ static int ip6_frag_reasm(struct frag_qu
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+ rcu_read_unlock();
+ fq->q.fragments = NULL;
++ fq->q.rb_fragments = RB_ROOT;
+ fq->q.fragments_tail = NULL;
+ return 1;
+
--- /dev/null
+From 70837ffe3085c9a91488b52ca13ac84424da1042 Mon Sep 17 00:00:00 2001
+From: Dan Carpenter <dan.carpenter@oracle.com>
+Date: Mon, 6 Aug 2018 22:17:35 +0300
+Subject: ipv4: frags: precedence bug in ip_expire()
+
+From: Dan Carpenter <dan.carpenter@oracle.com>
+
+commit 70837ffe3085c9a91488b52ca13ac84424da1042 upstream.
+
+We accidentally removed the parentheses here, but they are required
+because '!' has higher precedence than '&'.
+
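+Concretely, because '!' binds tighter than '&':
+
+  !qp->q.flags & INET_FRAG_FIRST_IN    /* (!qp->q.flags) & INET_FRAG_FIRST_IN */
+  !(qp->q.flags & INET_FRAG_FIRST_IN)  /* intended: the FIRST_IN bit is clear */
+
+so the unparenthesized form tests the logical negation of the whole flags
+word against the flag bit, not the bit itself.
+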
+Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
+Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/ip_fragment.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -257,7 +257,7 @@ static void ip_expire(unsigned long arg)
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+
+- if (!qp->q.flags & INET_FRAG_FIRST_IN)
++ if (!(qp->q.flags & INET_FRAG_FIRST_IN))
+ goto out;
+
+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
--- /dev/null
+From 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 Mon Sep 17 00:00:00 2001
+From: Florian Westphal <fw@strlen.de>
+Date: Fri, 3 Aug 2018 02:22:20 +0200
+Subject: ipv6: defrag: drop non-last frags smaller than min mtu
+
+From: Florian Westphal <fw@strlen.de>
+
+commit 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 upstream.
+
+Don't bother with pathological cases; they only waste cycles.
+IPv6 requires a minimum MTU of 1280 so we should never see fragments
+smaller than this (except last frag).
+
+v3: don't use awkward "-offset + len"
+v2: drop IPv4 part, which added same check w. IPV4_MIN_MTU (68).
+ There were concerns that there could be even smaller frags
+ generated by intermediate nodes, e.g. on radio networks.
+
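+A sketch of the test both reassembly entry points gain below (the helper
+name is illustrative; the patch open-codes it):
+
+  /* Non-last fragment (IP6_MF set) smaller than the IPv6 minimum MTU. */
+  static bool frag6_too_small(const struct sk_buff *skb,
+                              const struct frag_hdr *fhdr)
+  {
+          return skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
+                 fhdr->frag_off & htons(IP6_MF);
+  }
+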
+Cc: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++++
+ net/ipv6/reassembly.c | 4 ++++
+ 2 files changed, 8 insertions(+)
+
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -602,6 +602,10 @@ struct sk_buff *nf_ct_frag6_gather(struc
+ hdr = ipv6_hdr(clone);
+ fhdr = (struct frag_hdr *)skb_transport_header(clone);
+
++ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
++ fhdr->frag_off & htons(IP6_MF))
++ goto ret_orig;
++
+ skb_orphan(skb);
+ fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
+ skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -549,6 +549,10 @@ static int ipv6_frag_rcv(struct sk_buff
+ return 1;
+ }
+
++ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
++ fhdr->frag_off & htons(IP6_MF))
++ goto fail_hdr;
++
+ fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
+ skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+ if (fq) {
--- /dev/null
+From ade446403bfb79d3528d56071a84b15351a139ad Mon Sep 17 00:00:00 2001
+From: Michal Kubecek <mkubecek@suse.cz>
+Date: Thu, 13 Dec 2018 17:23:32 +0100
+Subject: net: ipv4: do not handle duplicate fragments as overlapping
+
+From: Michal Kubecek <mkubecek@suse.cz>
+
+commit ade446403bfb79d3528d56071a84b15351a139ad upstream.
+
+Since commit 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping
+segments.") IPv4 reassembly code drops the whole queue whenever an
+overlapping fragment is received. However, the test is written in a way
+which detects duplicate fragments as overlapping so that in environments
+with many duplicate packets, fragmented packets may be undeliverable.
+
+Add an extra test, and for a (potentially) duplicate fragment, drop only
+the new fragment rather than the whole queue. Only the starting offset and
+length are checked, not the contents of the fragments, as that would be too
+expensive. For a similar reason, the linear list ("run") of an rbtree node
+is not iterated; we only check whether the new fragment is a subset of the
+interval covered by the existing consecutive fragments.
+
+v2: instead of an exact check iterating through the linear list of an
+rbtree node, only check if the new fragment is a subset of the "run"
+(suggested by Eric Dumazet)
+
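+In interval terms, the new fragment [offset, end) only counts as a
+potential duplicate when it is fully contained in the run it collides
+with; as a standalone predicate (illustrative helper, the patch
+open-codes the comparison):
+
+  /* Half-open intervals: is [offset, end) within [run_start, run_end)? */
+  static bool frag_is_subset(int offset, int end,
+                             int run_start, int run_end)
+  {
+          return offset >= run_start && end <= run_end;
+  }
+
+Any other overlap still drops the whole queue, as before.
+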
+Fixes: 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping segments.")
+Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/ip_fragment.c | 18 ++++++++++++------
+ 1 file changed, 12 insertions(+), 6 deletions(-)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -400,10 +400,10 @@ static int ip_frag_queue(struct ipq *qp,
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct rb_node **rbn, *parent;
+ struct sk_buff *skb1, *prev_tail;
++ int ihl, end, skb1_run_end;
+ struct net_device *dev;
+ unsigned int fragsize;
+ int flags, offset;
+- int ihl, end;
+ int err = -ENOENT;
+ u8 ecn;
+
+@@ -473,7 +473,9 @@ static int ip_frag_queue(struct ipq *qp,
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ *
+- * We do the same here for IPv4 (and increment an snmp counter).
++ * We do the same here for IPv4 (and increment an snmp counter) but
++ * we do not want to drop the whole queue in response to a duplicate
++ * fragment.
+ */
+
+ /* Find out where to put this fragment. */
+@@ -497,13 +499,17 @@ static int ip_frag_queue(struct ipq *qp,
+ do {
+ parent = *rbn;
+ skb1 = rb_to_skb(parent);
++ skb1_run_end = FRAG_CB(skb1)->offset +
++ FRAG_CB(skb1)->frag_run_len;
+ if (end <= FRAG_CB(skb1)->offset)
+ rbn = &parent->rb_left;
+- else if (offset >= FRAG_CB(skb1)->offset +
+- FRAG_CB(skb1)->frag_run_len)
++ else if (offset >= skb1_run_end)
+ rbn = &parent->rb_right;
+- else /* Found an overlap with skb1. */
+- goto discard_qp;
++ else if (offset >= FRAG_CB(skb1)->offset &&
++ end <= skb1_run_end)
++ goto err; /* No new data, potential duplicate */
++ else
++ goto discard_qp; /* Found an overlap */
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb.
--- /dev/null
+From 385114dec8a49b5e5945e77ba7de6356106713f4 Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Thu, 2 Aug 2018 23:34:38 +0000
+Subject: net: modify skb_rbtree_purge to return the truesize of all purged skbs.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 385114dec8a49b5e5945e77ba7de6356106713f4 upstream.
+
+Tested: see the next patch in the series.
+
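+The intended call site, as it appears in the next patch of the series:
+
+  sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+  sub_frag_mem_limit(qp->q.net, sum_truesize);
+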
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/skbuff.h | 2 +-
+ net/core/skbuff.c | 6 +++++-
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2273,7 +2273,7 @@ static inline void __skb_queue_purge(str
+ kfree_skb(skb);
+ }
+
+-void skb_rbtree_purge(struct rb_root *root);
++unsigned int skb_rbtree_purge(struct rb_root *root);
+
+ void *netdev_alloc_frag(unsigned int fragsz);
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -2380,23 +2380,27 @@ EXPORT_SYMBOL(skb_queue_purge);
+ /**
+ * skb_rbtree_purge - empty a skb rbtree
+ * @root: root of the rbtree to empty
++ * Return value: the sum of truesizes of all purged skbs.
+ *
+ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+ * the list and one reference dropped. This function does not take
+ * any lock. Synchronization should be handled by the caller (e.g., TCP
+ * out-of-order queue is protected by the socket lock).
+ */
+-void skb_rbtree_purge(struct rb_root *root)
++unsigned int skb_rbtree_purge(struct rb_root *root)
+ {
+ struct rb_node *p = rb_first(root);
++ unsigned int sum = 0;
+
+ while (p) {
+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, root);
++ sum += skb->truesize;
+ kfree_skb(skb);
+ }
++ return sum;
+ }
+
+ /**
cifs-always-resolve-hostname-before-reconnecting.patch
drivers-core-remove-glue-dirs-from-sysfs-earlier.patch
mm-migrate-don-t-rely-on-__pagemovable-of-newpage-after-unlocking-it.patch
+fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch
+ip-discard-ipv4-datagrams-with-overlapping-segments.patch
+net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch
+inet-frags-get-rid-of-inet_frag_evicting.patch
+ip-use-rb-trees-for-ip-frag-queue.patch
+ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch
+ip-add-helpers-to-process-in-order-fragments-faster.patch
+ip-process-in-order-fragments-efficiently.patch
+net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch
+ip-frags-fix-crash-in-ip_do_fragment.patch
+ipv4-frags-precedence-bug-in-ip_expire.patch