git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 4 Feb 2019 10:01:28 +0000 (11:01 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 4 Feb 2019 10:01:28 +0000 (11:01 +0100)
added patches:
fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch
inet-frags-get-rif-of-inet_frag_evicting.patch
ip-add-helpers-to-process-in-order-fragments-faster.patch
ip-discard-ipv4-datagrams-with-overlapping-segments.patch
ip-frags-fix-crash-in-ip_do_fragment.patch
ip-process-in-order-fragments-efficiently.patch
ip-use-rb-trees-for-ip-frag-queue.patch
ipv4-frags-precedence-bug-in-ip_expire.patch
ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch
net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch
net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch

12 files changed:
queue-4.4/fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch [new file with mode: 0644]
queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch [new file with mode: 0644]
queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch [new file with mode: 0644]
queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch [new file with mode: 0644]
queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch [new file with mode: 0644]
queue-4.4/ip-process-in-order-fragments-efficiently.patch [new file with mode: 0644]
queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch [new file with mode: 0644]
queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch [new file with mode: 0644]
queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch [new file with mode: 0644]
queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch [new file with mode: 0644]
queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch [new file with mode: 0644]
queue-4.4/series

diff --git a/queue-4.4/fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch b/queue-4.4/fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch
new file mode 100644 (file)
index 0000000..056f0fe
--- /dev/null
@@ -0,0 +1,101 @@
+From 79f546a696bff2590169fb5684e23d65f4d9f591 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 11 May 2018 11:20:57 +1000
+Subject: fs: don't scan the inode cache before SB_BORN is set
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 79f546a696bff2590169fb5684e23d65f4d9f591 upstream.
+
+We recently had an oops reported on a 4.14 kernel in
+xfs_reclaim_inodes_count() where sb->s_fs_info pointed to garbage
+and so the m_perag_tree lookup walked into lala land.  It produces
+an oops down this path during the failed mount:
+
+  radix_tree_gang_lookup_tag+0xc4/0x130
+  xfs_perag_get_tag+0x37/0xf0
+  xfs_reclaim_inodes_count+0x32/0x40
+  xfs_fs_nr_cached_objects+0x11/0x20
+  super_cache_count+0x35/0xc0
+  shrink_slab.part.66+0xb1/0x370
+  shrink_node+0x7e/0x1a0
+  try_to_free_pages+0x199/0x470
+  __alloc_pages_slowpath+0x3a1/0xd20
+  __alloc_pages_nodemask+0x1c3/0x200
+  cache_grow_begin+0x20b/0x2e0
+  fallback_alloc+0x160/0x200
+  kmem_cache_alloc+0x111/0x4e0
+
+The problem is that the superblock shrinker is running before the
+filesystem structures it depends on have been fully set up. i.e.
+the shrinker is registered in sget(), before ->fill_super() has been
+called, and the shrinker can call into the filesystem before
+fill_super() does its setup work. Essentially we are exposed to
+both use-after-free and use-before-initialisation bugs here.
+
+To fix this, add a check for the SB_BORN flag in super_cache_count.
+In general, this flag is not set until ->fs_mount() completes
+successfully, so we know that it is set after the filesystem
+setup has completed. This matches the trylock_super() behaviour
+which will not let super_cache_scan() run if SB_BORN is not set, and
+hence will not allow the superblock shrinker to enter the
+filesystem while it is being set up or after it has failed setup
+and is being torn down.
+
+Cc: stable@kernel.org
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Aaron Lu <aaron.lu@linux.alibaba.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/super.c |   30 ++++++++++++++++++++++++------
+ 1 file changed, 24 insertions(+), 6 deletions(-)
+
+--- a/fs/super.c
++++ b/fs/super.c
+@@ -118,13 +118,23 @@ static unsigned long super_cache_count(s
+       sb = container_of(shrink, struct super_block, s_shrink);
+       /*
+-       * Don't call trylock_super as it is a potential
+-       * scalability bottleneck. The counts could get updated
+-       * between super_cache_count and super_cache_scan anyway.
+-       * Call to super_cache_count with shrinker_rwsem held
+-       * ensures the safety of call to list_lru_shrink_count() and
+-       * s_op->nr_cached_objects().
++       * We don't call trylock_super() here as it is a scalability bottleneck,
++       * so we're exposed to partial setup state. The shrinker rwsem does not
++       * protect filesystem operations backing list_lru_shrink_count() or
++       * s_op->nr_cached_objects(). Counts can change between
++       * super_cache_count and super_cache_scan, so we really don't need locks
++       * here.
++       *
++       * However, if we are currently mounting the superblock, the underlying
++       * filesystem might be in a state of partial construction and hence it
++       * is dangerous to access it.  trylock_super() uses a MS_BORN check to
++       * avoid this situation, so do the same here. The memory barrier is
++       * matched with the one in mount_fs() as we don't hold locks here.
+        */
++      if (!(sb->s_flags & MS_BORN))
++              return 0;
++      smp_rmb();
++
+       if (sb->s_op && sb->s_op->nr_cached_objects)
+               total_objects = sb->s_op->nr_cached_objects(sb, sc);
+@@ -1133,6 +1143,14 @@ mount_fs(struct file_system_type *type,
+       sb = root->d_sb;
+       BUG_ON(!sb);
+       WARN_ON(!sb->s_bdi);
++
++      /*
++       * Write barrier is for super_cache_count(). We place it before setting
++       * MS_BORN as the data dependency between the two functions is the
++       * superblock structure contents that we just set up, not the MS_BORN
++       * flag.
++       */
++      smp_wmb();
+       sb->s_flags |= MS_BORN;
+       error = security_sb_kern_mount(sb, flags, secdata);
diff --git a/queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch b/queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch
new file mode 100644 (file)
index 0000000..99306ea
--- /dev/null
@@ -0,0 +1,147 @@
+From 399d1404be660d355192ff4df5ccc3f4159ec1e4 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 31 Mar 2018 12:58:51 -0700
+Subject: inet: frags: get rid of inet_frag_evicting()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 399d1404be660d355192ff4df5ccc3f4159ec1e4 upstream.
+
+This refactors ip_expire(), removing one level of indentation.
+
+Note: in the future, we should try hard to avoid the skb_clone()
+since this is a serious performance cost.
+Under DDOS, the ICMP message won't be sent because of rate limits.
+
+The fact that ip6_expire_frag_queue() does not use skb_clone() is
+disturbing too. Presumably IPv6 has the same
+issue as the one we fixed in commit ec4fbd64751d
+("inet: frag: release spinlock before calling icmp_send()")
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/net/inet_frag.h |    5 ---
+ net/ipv4/ip_fragment.c  |   66 +++++++++++++++++++++++-------------------------
+ net/ipv6/reassembly.c   |    4 --
+ 3 files changed, 32 insertions(+), 43 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -123,11 +123,6 @@ static inline void inet_frag_put(struct
+               inet_frag_destroy(q, f);
+ }
+-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+-{
+-      return !hlist_unhashed(&q->list_evictor);
+-}
+-
+ /* Memory Tracking Functions. */
+ static inline int frag_mem_limit(struct netns_frags *nf)
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -194,8 +194,11 @@ static bool frag_expire_skip_icmp(u32 us
+  */
+ static void ip_expire(unsigned long arg)
+ {
+-      struct ipq *qp;
++      struct sk_buff *clone, *head;
++      const struct iphdr *iph;
+       struct net *net;
++      struct ipq *qp;
++      int err;
+       qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+       net = container_of(qp->q.net, struct net, ipv4.frags);
+@@ -209,45 +212,40 @@ static void ip_expire(unsigned long arg)
+       ipq_kill(qp);
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+-      if (!inet_frag_evicting(&qp->q)) {
+-              struct sk_buff *clone, *head = qp->q.fragments;
+-              const struct iphdr *iph;
+-              int err;
+-
+-              IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
++      head = qp->q.fragments;
+-              if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+-                      goto out;
++      IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+-              head->dev = dev_get_by_index_rcu(net, qp->iif);
+-              if (!head->dev)
+-                      goto out;
++      if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
++              goto out;
++      head->dev = dev_get_by_index_rcu(net, qp->iif);
++      if (!head->dev)
++              goto out;
+-              /* skb has no dst, perform route lookup again */
+-              iph = ip_hdr(head);
+-              err = ip_route_input_noref(head, iph->daddr, iph->saddr,
++      /* skb has no dst, perform route lookup again */
++      iph = ip_hdr(head);
++      err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+                                          iph->tos, head->dev);
+-              if (err)
+-                      goto out;
++      if (err)
++              goto out;
++
++      /* Only an end host needs to send an ICMP
++       * "Fragment Reassembly Timeout" message, per RFC792.
++       */
++      if (frag_expire_skip_icmp(qp->user) &&
++          (skb_rtable(head)->rt_type != RTN_LOCAL))
++              goto out;
++
++      clone = skb_clone(head, GFP_ATOMIC);
+-              /* Only an end host needs to send an ICMP
+-               * "Fragment Reassembly Timeout" message, per RFC792.
+-               */
+-              if (frag_expire_skip_icmp(qp->user) &&
+-                  (skb_rtable(head)->rt_type != RTN_LOCAL))
+-                      goto out;
+-
+-              clone = skb_clone(head, GFP_ATOMIC);
+-
+-              /* Send an ICMP "Fragment Reassembly Timeout" message. */
+-              if (clone) {
+-                      spin_unlock(&qp->q.lock);
+-                      icmp_send(clone, ICMP_TIME_EXCEEDED,
+-                                ICMP_EXC_FRAGTIME, 0);
+-                      consume_skb(clone);
+-                      goto out_rcu_unlock;
+-              }
++      /* Send an ICMP "Fragment Reassembly Timeout" message. */
++      if (clone) {
++              spin_unlock(&qp->q.lock);
++              icmp_send(clone, ICMP_TIME_EXCEEDED,
++                        ICMP_EXC_FRAGTIME, 0);
++              consume_skb(clone);
++              goto out_rcu_unlock;
+       }
+ out:
+       spin_unlock(&qp->q.lock);
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -146,10 +146,6 @@ void ip6_expire_frag_queue(struct net *n
+               goto out_rcu_unlock;
+       IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+-
+-      if (inet_frag_evicting(&fq->q))
+-              goto out_rcu_unlock;
+-
+       IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+       /* Don't send error if the first segment did not arrive. */
diff --git a/queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch b/queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch
new file mode 100644 (file)
index 0000000..7ac5d7b
--- /dev/null
@@ -0,0 +1,157 @@
+From 353c9cb360874e737fb000545f783df756c06f9a Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Sat, 11 Aug 2018 20:27:24 +0000
+Subject: ip: add helpers to process in-order fragments faster.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 353c9cb360874e737fb000545f783df756c06f9a upstream.
+
+This patch introduces several helper functions/macros that will be
+used in the follow-up patch. No runtime changes yet.
+
+The new logic (fully implemented in the second patch) is as follows:
+
+* Nodes in the rb-tree will now contain not single fragments, but lists
+  of consecutive fragments ("runs").
+
+* At each point in time, the current "active" run at the tail is
+  maintained/tracked. Fragments that arrive in-order, adjacent
+  to the previous tail fragment, are added to this tail run without
+  triggering the re-balancing of the rb-tree.
+
+* If a fragment arrives out of order with the offset _before_ the tail run,
+  it is inserted into the rb-tree as a single fragment.
+
+* If a fragment arrives after the current tail fragment (with a gap),
+  it starts a new "tail" run, as is inserted into the rb-tree
+  at the end as the head of the new run.
+
+skb->cb is used to store additional information
+needed here (suggested by Eric Dumazet).
+
+Reported-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h |    4 ++
+ net/ipv4/ip_fragment.c  |   74 +++++++++++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 74 insertions(+), 4 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -48,6 +48,7 @@ struct inet_frag_queue {
+       struct sk_buff          *fragments;  /* Used in IPv6. */
+       struct rb_root          rb_fragments; /* Used in IPv4. */
+       struct sk_buff          *fragments_tail;
++      struct sk_buff          *last_run_head;
+       ktime_t                 stamp;
+       int                     len;
+       int                     meat;
+@@ -118,6 +119,9 @@ struct inet_frag_queue *inet_frag_find(s
+ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+                                  const char *prefix);
++/* Free all skbs in the queue; return the sum of their truesizes. */
++unsigned int inet_frag_rbtree_purge(struct rb_root *root);
++
+ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
+ {
+       if (atomic_dec_and_test(&q->refcnt))
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -58,13 +58,57 @@
+ static int sysctl_ipfrag_max_dist __read_mostly = 64;
+ static const char ip_frag_cache_name[] = "ip4-frags";
+-struct ipfrag_skb_cb
+-{
++/* Use skb->cb to track consecutive/adjacent fragments coming at
++ * the end of the queue. Nodes in the rb-tree queue will
++ * contain "runs" of one or more adjacent fragments.
++ *
++ * Invariants:
++ * - next_frag is NULL at the tail of a "run";
++ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
++ */
++struct ipfrag_skb_cb {
+       struct inet_skb_parm    h;
+-      int                     offset;
++      int                     offset;
++      struct sk_buff          *next_frag;
++      int                     frag_run_len;
+ };
+-#define FRAG_CB(skb)  ((struct ipfrag_skb_cb *)((skb)->cb))
++#define FRAG_CB(skb)          ((struct ipfrag_skb_cb *)((skb)->cb))
++
++static void ip4_frag_init_run(struct sk_buff *skb)
++{
++      BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
++
++      FRAG_CB(skb)->next_frag = NULL;
++      FRAG_CB(skb)->frag_run_len = skb->len;
++}
++
++/* Append skb to the last "run". */
++static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
++                                      struct sk_buff *skb)
++{
++      RB_CLEAR_NODE(&skb->rbnode);
++      FRAG_CB(skb)->next_frag = NULL;
++
++      FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
++      FRAG_CB(q->fragments_tail)->next_frag = skb;
++      q->fragments_tail = skb;
++}
++
++/* Create a new "run" with the skb. */
++static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
++{
++      if (q->last_run_head)
++              rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
++                           &q->last_run_head->rbnode.rb_right);
++      else
++              rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
++      rb_insert_color(&skb->rbnode, &q->rb_fragments);
++
++      ip4_frag_init_run(skb);
++      q->fragments_tail = skb;
++      q->last_run_head = skb;
++}
+ /* Describe an entry in the "incomplete datagrams" queue. */
+ struct ipq {
+@@ -721,6 +765,28 @@ struct sk_buff *ip_check_defrag(struct n
+ }
+ EXPORT_SYMBOL(ip_check_defrag);
++unsigned int inet_frag_rbtree_purge(struct rb_root *root)
++{
++      struct rb_node *p = rb_first(root);
++      unsigned int sum = 0;
++
++      while (p) {
++              struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
++
++              p = rb_next(p);
++              rb_erase(&skb->rbnode, root);
++              while (skb) {
++                      struct sk_buff *next = FRAG_CB(skb)->next_frag;
++
++                      sum += skb->truesize;
++                      kfree_skb(skb);
++                      skb = next;
++              }
++      }
++      return sum;
++}
++EXPORT_SYMBOL(inet_frag_rbtree_purge);
++
+ #ifdef CONFIG_SYSCTL
+ static int zero;
diff --git a/queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch b/queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch
new file mode 100644 (file)
index 0000000..09d8221
--- /dev/null
@@ -0,0 +1,146 @@
+From 7969e5c40dfd04799d4341f1b7cd266b6e47f227 Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Thu, 2 Aug 2018 23:34:37 +0000
+Subject: ip: discard IPv4 datagrams with overlapping segments.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 7969e5c40dfd04799d4341f1b7cd266b6e47f227 upstream.
+
+This behavior is required in IPv6, and there is little need
+to tolerate overlapping fragments in IPv4. This change
+simplifies the code and eliminates potential DDoS attack vectors.
+
+Tested: ran ip_defrag selftest (not yet available upstream).
+
+Suggested-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Acked-by: Stephen Hemminger <stephen@networkplumber.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/uapi/linux/snmp.h |    1 +
+ net/ipv4/ip_fragment.c    |   73 ++++++++++++----------------------------------
+ net/ipv4/proc.c           |    1 +
+ 3 files changed, 22 insertions(+), 53 deletions(-)
+
+--- a/include/uapi/linux/snmp.h
++++ b/include/uapi/linux/snmp.h
+@@ -55,6 +55,7 @@ enum
+       IPSTATS_MIB_ECT1PKTS,                   /* InECT1Pkts */
+       IPSTATS_MIB_ECT0PKTS,                   /* InECT0Pkts */
+       IPSTATS_MIB_CEPKTS,                     /* InCEPkts */
++      IPSTATS_MIB_REASM_OVERLAPS,             /* ReasmOverlaps */
+       __IPSTATS_MIB_MAX
+ };
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -342,6 +342,7 @@ static int ip_frag_reinit(struct ipq *qp
+ /* Add new segment to existing queue. */
+ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+ {
++      struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct sk_buff *prev, *next;
+       struct net_device *dev;
+       unsigned int fragsize;
+@@ -422,60 +423,22 @@ static int ip_frag_queue(struct ipq *qp,
+       }
+ found:
+-      /* We found where to put this one.  Check for overlap with
+-       * preceding fragment, and, if needed, align things so that
+-       * any overlaps are eliminated.
++      /* RFC5722, Section 4, amended by Errata ID : 3089
++       *                          When reassembling an IPv6 datagram, if
++       *   one or more its constituent fragments is determined to be an
++       *   overlapping fragment, the entire datagram (and any constituent
++       *   fragments) MUST be silently discarded.
++       *
++       * We do the same here for IPv4.
+        */
+-      if (prev) {
+-              int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+-
+-              if (i > 0) {
+-                      offset += i;
+-                      err = -EINVAL;
+-                      if (end <= offset)
+-                              goto err;
+-                      err = -ENOMEM;
+-                      if (!pskb_pull(skb, i))
+-                              goto err;
+-                      if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+-                              skb->ip_summed = CHECKSUM_NONE;
+-              }
+-      }
+-
+-      err = -ENOMEM;
+-
+-      while (next && FRAG_CB(next)->offset < end) {
+-              int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+-
+-              if (i < next->len) {
+-                      /* Eat head of the next overlapped fragment
+-                       * and leave the loop. The next ones cannot overlap.
+-                       */
+-                      if (!pskb_pull(next, i))
+-                              goto err;
+-                      FRAG_CB(next)->offset += i;
+-                      qp->q.meat -= i;
+-                      if (next->ip_summed != CHECKSUM_UNNECESSARY)
+-                              next->ip_summed = CHECKSUM_NONE;
+-                      break;
+-              } else {
+-                      struct sk_buff *free_it = next;
+-
+-                      /* Old fragment is completely overridden with
+-                       * new one drop it.
+-                       */
+-                      next = next->next;
+-
+-                      if (prev)
+-                              prev->next = next;
+-                      else
+-                              qp->q.fragments = next;
+-
+-                      qp->q.meat -= free_it->len;
+-                      sub_frag_mem_limit(qp->q.net, free_it->truesize);
+-                      kfree_skb(free_it);
+-              }
+-      }
++      /* Is there an overlap with the previous fragment? */
++      if (prev &&
++          (FRAG_CB(prev)->offset + prev->len) > offset)
++              goto discard_qp;
++
++      /* Is there an overlap with the next fragment? */
++      if (next && FRAG_CB(next)->offset < end)
++              goto discard_qp;
+       FRAG_CB(skb)->offset = offset;
+@@ -522,6 +485,10 @@ found:
+       skb_dst_drop(skb);
+       return -EINPROGRESS;
++discard_qp:
++      ipq_kill(qp);
++      err = -EINVAL;
++      IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
+ err:
+       kfree_skb(skb);
+       return err;
+--- a/net/ipv4/proc.c
++++ b/net/ipv4/proc.c
+@@ -132,6 +132,7 @@ static const struct snmp_mib snmp4_ipext
+       SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+       SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+       SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
++      SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
+       SNMP_MIB_SENTINEL
+ };
diff --git a/queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch b/queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch
new file mode 100644 (file)
index 0000000..f4c8b52
--- /dev/null
@@ -0,0 +1,110 @@
+From 5d407b071dc369c26a38398326ee2be53651cfe4 Mon Sep 17 00:00:00 2001
+From: Taehee Yoo <ap420073@gmail.com>
+Date: Mon, 10 Sep 2018 02:47:05 +0900
+Subject: ip: frags: fix crash in ip_do_fragment()
+
+From: Taehee Yoo <ap420073@gmail.com>
+
+commit 5d407b071dc369c26a38398326ee2be53651cfe4 upstream.
+
+A kernel crash occurs when a defragmented packet is fragmented
+in ip_do_fragment().
+In the defragment routine, skb_orphan() is called and
+skb->ip_defrag_offset is set, but skb->sk and
+skb->ip_defrag_offset are the same union member, so
+frag->sk is not NULL.
+Hence the crash occurs in the skb->sk check in ip_do_fragment() when
+a defragmented packet is fragmented.
+
+test commands:
+   %iptables -t nat -I POSTROUTING -j MASQUERADE
+   %hping3 192.168.4.2 -s 1000 -p 2000 -d 60000
+
+splat looks like:
+[  261.069429] kernel BUG at net/ipv4/ip_output.c:636!
+[  261.075753] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
+[  261.083854] CPU: 1 PID: 1349 Comm: hping3 Not tainted 4.19.0-rc2+ #3
+[  261.100977] RIP: 0010:ip_do_fragment+0x1613/0x2600
+[  261.106945] Code: e8 e2 38 e3 fe 4c 8b 44 24 18 48 8b 74 24 08 e9 92 f6 ff ff 80 3c 02 00 0f 85 da 07 00 00 48 8b b5 d0 00 00 00 e9 25 f6 ff ff <0f> 0b 0f 0b 44 8b 54 24 58 4c 8b 4c 24 18 4c 8b 5c 24 60 4c 8b 6c
+[  261.127015] RSP: 0018:ffff8801031cf2c0 EFLAGS: 00010202
+[  261.134156] RAX: 1ffff1002297537b RBX: ffffed0020639e6e RCX: 0000000000000004
+[  261.142156] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880114ba9bd8
+[  261.150157] RBP: ffff880114ba8a40 R08: ffffed0022975395 R09: ffffed0022975395
+[  261.158157] R10: 0000000000000001 R11: ffffed0022975394 R12: ffff880114ba9ca4
+[  261.166159] R13: 0000000000000010 R14: ffff880114ba9bc0 R15: dffffc0000000000
+[  261.174169] FS:  00007fbae2199700(0000) GS:ffff88011b400000(0000) knlGS:0000000000000000
+[  261.183012] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  261.189013] CR2: 00005579244fe000 CR3: 0000000119bf4000 CR4: 00000000001006e0
+[  261.198158] Call Trace:
+[  261.199018]  ? dst_output+0x180/0x180
+[  261.205011]  ? save_trace+0x300/0x300
+[  261.209018]  ? ip_copy_metadata+0xb00/0xb00
+[  261.213034]  ? sched_clock_local+0xd4/0x140
+[  261.218158]  ? kill_l4proto+0x120/0x120 [nf_conntrack]
+[  261.223014]  ? rt_cpu_seq_stop+0x10/0x10
+[  261.227014]  ? find_held_lock+0x39/0x1c0
+[  261.233008]  ip_finish_output+0x51d/0xb50
+[  261.237006]  ? ip_fragment.constprop.56+0x220/0x220
+[  261.243011]  ? nf_ct_l4proto_register_one+0x5b0/0x5b0 [nf_conntrack]
+[  261.250152]  ? rcu_is_watching+0x77/0x120
+[  261.255010]  ? nf_nat_ipv4_out+0x1e/0x2b0 [nf_nat_ipv4]
+[  261.261033]  ? nf_hook_slow+0xb1/0x160
+[  261.265007]  ip_output+0x1c7/0x710
+[  261.269005]  ? ip_mc_output+0x13f0/0x13f0
+[  261.273002]  ? __local_bh_enable_ip+0xe9/0x1b0
+[  261.278152]  ? ip_fragment.constprop.56+0x220/0x220
+[  261.282996]  ? nf_hook_slow+0xb1/0x160
+[  261.287007]  raw_sendmsg+0x21f9/0x4420
+[  261.291008]  ? dst_output+0x180/0x180
+[  261.297003]  ? sched_clock_cpu+0x126/0x170
+[  261.301003]  ? find_held_lock+0x39/0x1c0
+[  261.306155]  ? stop_critical_timings+0x420/0x420
+[  261.311004]  ? check_flags.part.36+0x450/0x450
+[  261.315005]  ? _raw_spin_unlock_irq+0x29/0x40
+[  261.320995]  ? _raw_spin_unlock_irq+0x29/0x40
+[  261.326142]  ? cyc2ns_read_end+0x10/0x10
+[  261.330139]  ? raw_bind+0x280/0x280
+[  261.334138]  ? sched_clock_cpu+0x126/0x170
+[  261.338995]  ? check_flags.part.36+0x450/0x450
+[  261.342991]  ? __lock_acquire+0x4500/0x4500
+[  261.348994]  ? inet_sendmsg+0x11c/0x500
+[  261.352989]  ? dst_output+0x180/0x180
+[  261.357012]  inet_sendmsg+0x11c/0x500
+[ ... ]
+
+v2:
+ - clear skb->sk in the reassembly routine. (Eric Dumazet)
+
+Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Taehee Yoo <ap420073@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/ip_fragment.c                  |    1 +
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    1 +
+ 2 files changed, 2 insertions(+)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -661,6 +661,7 @@ static int ip_frag_reasm(struct ipq *qp,
+                       nextp = &fp->next;
+                       fp->prev = NULL;
+                       memset(&fp->rbnode, 0, sizeof(fp->rbnode));
++                      fp->sk = NULL;
+                       head->data_len += fp->len;
+                       head->len += fp->len;
+                       if (head->ip_summed != fp->ip_summed)
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -454,6 +454,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
+               else if (head->ip_summed == CHECKSUM_COMPLETE)
+                       head->csum = csum_add(head->csum, fp->csum);
+               head->truesize += fp->truesize;
++              fp->sk = NULL;
+       }
+       sub_frag_mem_limit(fq->q.net, head->truesize);
diff --git a/queue-4.4/ip-process-in-order-fragments-efficiently.patch b/queue-4.4/ip-process-in-order-fragments-efficiently.patch
new file mode 100644 (file)
index 0000000..33b5def
--- /dev/null
@@ -0,0 +1,268 @@
+From a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Sat, 11 Aug 2018 20:27:25 +0000
+Subject: ip: process in-order fragments efficiently
+
+From: Peter Oskolkov <posk@google.com>
+
+commit a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c upstream.
+
+This patch changes the runtime behavior of the IP defrag queue:
+incoming in-order fragments are added to the end of the current
+list/"run" of in-order fragments at the tail.
+
+On some workloads, UDP stream performance is substantially improved:
+
+RX: ./udp_stream -F 10 -T 2 -l 60
+TX: ./udp_stream -c -H <host> -F 10 -T 5 -l 60
+
+with this patchset applied on a 10Gbps receiver:
+
+  throughput=9524.18
+  throughput_units=Mbit/s
+
+upstream (net-next):
+
+  throughput=4608.93
+  throughput_units=Mbit/s
+
+Reported-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/inet_fragment.c |    2 +-
+ net/ipv4/ip_fragment.c   |  110 +++++++++++++++++++++++++++++------------------
+ 2 files changed, 70 insertions(+), 42 deletions(-)
+
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -315,7 +315,7 @@ void inet_frag_destroy(struct inet_frag_
+                       fp = xp;
+               } while (fp);
+       } else {
+-              sum_truesize = skb_rbtree_purge(&q->rb_fragments);
++              sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
+       }
+       sum = sum_truesize + f->qsize;
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -139,8 +139,8 @@ int ip_frag_mem(struct net *net)
+       return sum_frag_mem_limit(&net->ipv4.frags);
+ }
+-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+-                       struct net_device *dev);
++static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
++                      struct sk_buff *prev_tail, struct net_device *dev);
+ struct ip4_create_arg {
+       struct iphdr *iph;
+@@ -271,7 +271,12 @@ static void ip_expire(unsigned long arg)
+               head = skb_rb_first(&qp->q.rb_fragments);
+               if (!head)
+                       goto out;
+-              rb_erase(&head->rbnode, &qp->q.rb_fragments);
++              if (FRAG_CB(head)->next_frag)
++                      rb_replace_node(&head->rbnode,
++                                      &FRAG_CB(head)->next_frag->rbnode,
++                                      &qp->q.rb_fragments);
++              else
++                      rb_erase(&head->rbnode, &qp->q.rb_fragments);
+               memset(&head->rbnode, 0, sizeof(head->rbnode));
+               barrier();
+       }
+@@ -373,7 +378,7 @@ static int ip_frag_reinit(struct ipq *qp
+               return -ETIMEDOUT;
+       }
+-      sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
++      sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
+       sub_frag_mem_limit(qp->q.net, sum_truesize);
+       qp->q.flags = 0;
+@@ -382,6 +387,7 @@ static int ip_frag_reinit(struct ipq *qp
+       qp->q.fragments = NULL;
+       qp->q.rb_fragments = RB_ROOT;
+       qp->q.fragments_tail = NULL;
++      qp->q.last_run_head = NULL;
+       qp->iif = 0;
+       qp->ecn = 0;
+@@ -393,7 +399,7 @@ static int ip_frag_queue(struct ipq *qp,
+ {
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct rb_node **rbn, *parent;
+-      struct sk_buff *skb1;
++      struct sk_buff *skb1, *prev_tail;
+       struct net_device *dev;
+       unsigned int fragsize;
+       int flags, offset;
+@@ -471,38 +477,41 @@ static int ip_frag_queue(struct ipq *qp,
+        */
+       /* Find out where to put this fragment.  */
+-      skb1 = qp->q.fragments_tail;
+-      if (!skb1) {
+-              /* This is the first fragment we've received. */
+-              rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+-              qp->q.fragments_tail = skb;
+-      } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) {
+-              /* This is the common/special case: skb goes to the end. */
++      prev_tail = qp->q.fragments_tail;
++      if (!prev_tail)
++              ip4_frag_create_run(&qp->q, skb);  /* First fragment. */
++      else if (FRAG_CB(prev_tail)->offset + prev_tail->len < end) {
++              /* This is the common case: skb goes to the end. */
+               /* Detect and discard overlaps. */
+-              if (offset < (FRAG_CB(skb1)->offset + skb1->len))
++              if (offset < FRAG_CB(prev_tail)->offset + prev_tail->len)
+                       goto discard_qp;
+-              /* Insert after skb1. */
+-              rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+-              qp->q.fragments_tail = skb;
++              if (offset == FRAG_CB(prev_tail)->offset + prev_tail->len)
++                      ip4_frag_append_to_last_run(&qp->q, skb);
++              else
++                      ip4_frag_create_run(&qp->q, skb);
+       } else {
+-              /* Binary search. Note that skb can become the first fragment, but
+-               * not the last (covered above). */
++              /* Binary search. Note that skb can become the first fragment,
++               * but not the last (covered above).
++               */
+               rbn = &qp->q.rb_fragments.rb_node;
+               do {
+                       parent = *rbn;
+                       skb1 = rb_to_skb(parent);
+                       if (end <= FRAG_CB(skb1)->offset)
+                               rbn = &parent->rb_left;
+-                      else if (offset >= FRAG_CB(skb1)->offset + skb1->len)
++                      else if (offset >= FRAG_CB(skb1)->offset +
++                                              FRAG_CB(skb1)->frag_run_len)
+                               rbn = &parent->rb_right;
+                       else /* Found an overlap with skb1. */
+                               goto discard_qp;
+               } while (*rbn);
+               /* Here we have parent properly set, and rbn pointing to
+-               * one of its NULL left/right children. Insert skb. */
++               * one of its NULL left/right children. Insert skb.
++               */
++              ip4_frag_init_run(skb);
+               rb_link_node(&skb->rbnode, parent, rbn);
++              rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+       }
+-      rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+       if (dev) {
+               qp->iif = dev->ifindex;
+@@ -531,7 +540,7 @@ static int ip_frag_queue(struct ipq *qp,
+               unsigned long orefdst = skb->_skb_refdst;
+               skb->_skb_refdst = 0UL;
+-              err = ip_frag_reasm(qp, skb, dev);
++              err = ip_frag_reasm(qp, skb, prev_tail, dev);
+               skb->_skb_refdst = orefdst;
+               return err;
+       }
+@@ -550,7 +559,7 @@ err:
+ /* Build a new IP datagram from all its fragments. */
+ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+-                       struct net_device *dev)
++                       struct sk_buff *prev_tail, struct net_device *dev)
+ {
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct iphdr *iph;
+@@ -575,10 +584,16 @@ static int ip_frag_reasm(struct ipq *qp,
+               if (!fp)
+                       goto out_nomem;
+-              rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
++              FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
++              if (RB_EMPTY_NODE(&skb->rbnode))
++                      FRAG_CB(prev_tail)->next_frag = fp;
++              else
++                      rb_replace_node(&skb->rbnode, &fp->rbnode,
++                                      &qp->q.rb_fragments);
+               if (qp->q.fragments_tail == skb)
+                       qp->q.fragments_tail = fp;
+               skb_morph(skb, head);
++              FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
+               rb_replace_node(&head->rbnode, &skb->rbnode,
+                               &qp->q.rb_fragments);
+               consume_skb(head);
+@@ -614,7 +629,7 @@ static int ip_frag_reasm(struct ipq *qp,
+               for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+                       plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+               clone->len = clone->data_len = head->data_len - plen;
+-              skb->truesize += clone->truesize;
++              head->truesize += clone->truesize;
+               clone->csum = 0;
+               clone->ip_summed = head->ip_summed;
+               add_frag_mem_limit(qp->q.net, clone->truesize);
+@@ -627,24 +642,36 @@ static int ip_frag_reasm(struct ipq *qp,
+       skb_push(head, head->data - skb_network_header(head));
+       /* Traverse the tree in order, to build frag_list. */
++      fp = FRAG_CB(head)->next_frag;
+       rbn = rb_next(&head->rbnode);
+       rb_erase(&head->rbnode, &qp->q.rb_fragments);
+-      while (rbn) {
+-              struct rb_node *rbnext = rb_next(rbn);
+-              fp = rb_to_skb(rbn);
+-              rb_erase(rbn, &qp->q.rb_fragments);
+-              rbn = rbnext;
+-              *nextp = fp;
+-              nextp = &fp->next;
+-              fp->prev = NULL;
+-              memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+-              head->data_len += fp->len;
+-              head->len += fp->len;
+-              if (head->ip_summed != fp->ip_summed)
+-                      head->ip_summed = CHECKSUM_NONE;
+-              else if (head->ip_summed == CHECKSUM_COMPLETE)
+-                      head->csum = csum_add(head->csum, fp->csum);
+-              head->truesize += fp->truesize;
++      while (rbn || fp) {
++              /* fp points to the next sk_buff in the current run;
++               * rbn points to the next run.
++               */
++              /* Go through the current run. */
++              while (fp) {
++                      *nextp = fp;
++                      nextp = &fp->next;
++                      fp->prev = NULL;
++                      memset(&fp->rbnode, 0, sizeof(fp->rbnode));
++                      head->data_len += fp->len;
++                      head->len += fp->len;
++                      if (head->ip_summed != fp->ip_summed)
++                              head->ip_summed = CHECKSUM_NONE;
++                      else if (head->ip_summed == CHECKSUM_COMPLETE)
++                              head->csum = csum_add(head->csum, fp->csum);
++                      head->truesize += fp->truesize;
++                      fp = FRAG_CB(fp)->next_frag;
++              }
++              /* Move to the next run. */
++              if (rbn) {
++                      struct rb_node *rbnext = rb_next(rbn);
++
++                      fp = rb_to_skb(rbn);
++                      rb_erase(rbn, &qp->q.rb_fragments);
++                      rbn = rbnext;
++              }
+       }
+       sub_frag_mem_limit(qp->q.net, head->truesize);
+@@ -680,6 +707,7 @@ static int ip_frag_reasm(struct ipq *qp,
+       qp->q.fragments = NULL;
+       qp->q.rb_fragments = RB_ROOT;
+       qp->q.fragments_tail = NULL;
++      qp->q.last_run_head = NULL;
+       return 0;
+ out_nomem:
diff --git a/queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch b/queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch
new file mode 100644 (file)
index 0000000..cd5b3a0
--- /dev/null
@@ -0,0 +1,462 @@
+From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Thu, 2 Aug 2018 23:34:39 +0000
+Subject: ip: use rb trees for IP frag queue.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit fa0f527358bd900ef92f925878ed6bfbd51305cc upstream.
+
+Similar to TCP OOO RX queue, it makes sense to use rb trees to store
+IP fragments, so that OOO fragments are inserted faster.
+
+Tested:
+
+- a follow-up patch contains a rather comprehensive ip defrag
+  self-test (functional)
+- ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`:
+    netstat --statistics
+    Ip:
+        282078937 total packets received
+        0 forwarded
+        0 incoming packets discarded
+        946760 incoming packets delivered
+        18743456 requests sent out
+        101 fragments dropped after timeout
+        282077129 reassemblies required
+        944952 packets reassembled ok
+        262734239 packet reassembles failed
+   (The numbers/stats above are somewhat better re:
+    reassemblies vs a kernel without this patchset. More
+    comprehensive performance testing TBD).
+
+Reported-by: Jann Horn <jannh@google.com>
+Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/skbuff.h                  |    2 +-
+ include/net/inet_frag.h                 |    3 ++-
+ net/ipv4/inet_fragment.c                |   14 +-
+ net/ipv4/ip_fragment.c                  |  190 +++++++++++++++++---------------
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    1 +
+ net/ipv6/reassembly.c                   |    1 +
+ 6 files changed, 120 insertions(+), 91 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -556,7 +556,7 @@ struct sk_buff {
+                               struct skb_mstamp skb_mstamp;
+                       };
+               };
+-              struct rb_node  rbnode; /* used in netem & tcp stack */
++              struct rb_node  rbnode; /* used in netem, ip4 defrag, and tcp stack */
+       };
+       struct sock             *sk;
+       struct net_device       *dev;
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -45,7 +45,8 @@ struct inet_frag_queue {
+       struct timer_list       timer;
+       struct hlist_node       list;
+       atomic_t                refcnt;
+-      struct sk_buff          *fragments;
++      struct sk_buff          *fragments;  /* Used in IPv6. */
++      struct rb_root          rb_fragments; /* Used in IPv4. */
+       struct sk_buff          *fragments_tail;
+       ktime_t                 stamp;
+       int                     len;
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -306,12 +306,16 @@ void inet_frag_destroy(struct inet_frag_
+       /* Release all fragment data. */
+       fp = q->fragments;
+       nf = q->net;
+-      while (fp) {
+-              struct sk_buff *xp = fp->next;
++      if (fp) {
++              do {
++                      struct sk_buff *xp = fp->next;
+-              sum_truesize += fp->truesize;
+-              frag_kfree_skb(nf, f, fp);
+-              fp = xp;
++                      sum_truesize += fp->truesize;
++                      kfree_skb(fp);
++                      fp = xp;
++              } while (fp);
++      } else {
++              sum_truesize = skb_rbtree_purge(&q->rb_fragments);
+       }
+       sum = sum_truesize + f->qsize;
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -194,7 +194,7 @@ static bool frag_expire_skip_icmp(u32 us
+  */
+ static void ip_expire(unsigned long arg)
+ {
+-      struct sk_buff *clone, *head;
++      struct sk_buff *head = NULL;
+       const struct iphdr *iph;
+       struct net *net;
+       struct ipq *qp;
+@@ -211,14 +211,31 @@ static void ip_expire(unsigned long arg)
+       ipq_kill(qp);
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+-
+-      head = qp->q.fragments;
+-
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+-      if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
++      if (!qp->q.flags & INET_FRAG_FIRST_IN)
+               goto out;
++      /* sk_buff::dev and sk_buff::rbnode are unionized. So we
++       * pull the head out of the tree in order to be able to
++       * deal with head->dev.
++       */
++      if (qp->q.fragments) {
++              head = qp->q.fragments;
++              qp->q.fragments = head->next;
++      } else {
++              head = skb_rb_first(&qp->q.rb_fragments);
++              if (!head)
++                      goto out;
++              rb_erase(&head->rbnode, &qp->q.rb_fragments);
++              memset(&head->rbnode, 0, sizeof(head->rbnode));
++              barrier();
++      }
++      if (head == qp->q.fragments_tail)
++              qp->q.fragments_tail = NULL;
++
++      sub_frag_mem_limit(qp->q.net, head->truesize);
++
+       head->dev = dev_get_by_index_rcu(net, qp->iif);
+       if (!head->dev)
+               goto out;
+@@ -237,20 +254,17 @@ static void ip_expire(unsigned long arg)
+           (skb_rtable(head)->rt_type != RTN_LOCAL))
+               goto out;
+-      clone = skb_clone(head, GFP_ATOMIC);
+-
+       /* Send an ICMP "Fragment Reassembly Timeout" message. */
+-      if (clone) {
+-              spin_unlock(&qp->q.lock);
+-              icmp_send(clone, ICMP_TIME_EXCEEDED,
+-                        ICMP_EXC_FRAGTIME, 0);
+-              consume_skb(clone);
+-              goto out_rcu_unlock;
+-      }
++      spin_unlock(&qp->q.lock);
++      icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
++      goto out_rcu_unlock;
++
+ out:
+       spin_unlock(&qp->q.lock);
+ out_rcu_unlock:
+       rcu_read_unlock();
++      if (head)
++              kfree_skb(head);
+       ipq_put(qp);
+ }
+@@ -294,7 +308,7 @@ static int ip_frag_too_far(struct ipq *q
+       end = atomic_inc_return(&peer->rid);
+       qp->rid = end;
+-      rc = qp->q.fragments && (end - start) > max;
++      rc = qp->q.fragments_tail && (end - start) > max;
+       if (rc) {
+               struct net *net;
+@@ -308,7 +322,6 @@ static int ip_frag_too_far(struct ipq *q
+ static int ip_frag_reinit(struct ipq *qp)
+ {
+-      struct sk_buff *fp;
+       unsigned int sum_truesize = 0;
+       if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+@@ -316,20 +329,14 @@ static int ip_frag_reinit(struct ipq *qp
+               return -ETIMEDOUT;
+       }
+-      fp = qp->q.fragments;
+-      do {
+-              struct sk_buff *xp = fp->next;
+-
+-              sum_truesize += fp->truesize;
+-              kfree_skb(fp);
+-              fp = xp;
+-      } while (fp);
++      sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+       sub_frag_mem_limit(qp->q.net, sum_truesize);
+       qp->q.flags = 0;
+       qp->q.len = 0;
+       qp->q.meat = 0;
+       qp->q.fragments = NULL;
++      qp->q.rb_fragments = RB_ROOT;
+       qp->q.fragments_tail = NULL;
+       qp->iif = 0;
+       qp->ecn = 0;
+@@ -341,7 +348,8 @@ static int ip_frag_reinit(struct ipq *qp
+ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+ {
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+-      struct sk_buff *prev, *next;
++      struct rb_node **rbn, *parent;
++      struct sk_buff *skb1;
+       struct net_device *dev;
+       unsigned int fragsize;
+       int flags, offset;
+@@ -404,56 +412,60 @@ static int ip_frag_queue(struct ipq *qp,
+       if (err)
+               goto err;
+-      /* Find out which fragments are in front and at the back of us
+-       * in the chain of fragments so far.  We must know where to put
+-       * this fragment, right?
+-       */
+-      prev = qp->q.fragments_tail;
+-      if (!prev || FRAG_CB(prev)->offset < offset) {
+-              next = NULL;
+-              goto found;
+-      }
+-      prev = NULL;
+-      for (next = qp->q.fragments; next != NULL; next = next->next) {
+-              if (FRAG_CB(next)->offset >= offset)
+-                      break;  /* bingo! */
+-              prev = next;
+-      }
++      /* Note : skb->rbnode and skb->dev share the same location. */
++      dev = skb->dev;
++      /* Makes sure compiler wont do silly aliasing games */
++      barrier();
+-found:
+       /* RFC5722, Section 4, amended by Errata ID : 3089
+        *                          When reassembling an IPv6 datagram, if
+        *   one or more its constituent fragments is determined to be an
+        *   overlapping fragment, the entire datagram (and any constituent
+        *   fragments) MUST be silently discarded.
+        *
+-       * We do the same here for IPv4.
++       * We do the same here for IPv4 (and increment an snmp counter).
+        */
+-      /* Is there an overlap with the previous fragment? */
+-      if (prev &&
+-          (FRAG_CB(prev)->offset + prev->len) > offset)
+-              goto discard_qp;
+-
+-      /* Is there an overlap with the next fragment? */
+-      if (next && FRAG_CB(next)->offset < end)
+-              goto discard_qp;
+-
+-      FRAG_CB(skb)->offset = offset;
+-      /* Insert this fragment in the chain of fragments. */
+-      skb->next = next;
+-      if (!next)
++      /* Find out where to put this fragment.  */
++      skb1 = qp->q.fragments_tail;
++      if (!skb1) {
++              /* This is the first fragment we've received. */
++              rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
++              qp->q.fragments_tail = skb;
++      } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) {
++              /* This is the common/special case: skb goes to the end. */
++              /* Detect and discard overlaps. */
++              if (offset < (FRAG_CB(skb1)->offset + skb1->len))
++                      goto discard_qp;
++              /* Insert after skb1. */
++              rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+               qp->q.fragments_tail = skb;
+-      if (prev)
+-              prev->next = skb;
+-      else
+-              qp->q.fragments = skb;
++      } else {
++              /* Binary search. Note that skb can become the first fragment, but
++               * not the last (covered above). */
++              rbn = &qp->q.rb_fragments.rb_node;
++              do {
++                      parent = *rbn;
++                      skb1 = rb_to_skb(parent);
++                      if (end <= FRAG_CB(skb1)->offset)
++                              rbn = &parent->rb_left;
++                      else if (offset >= FRAG_CB(skb1)->offset + skb1->len)
++                              rbn = &parent->rb_right;
++                      else /* Found an overlap with skb1. */
++                              goto discard_qp;
++              } while (*rbn);
++              /* Here we have parent properly set, and rbn pointing to
++               * one of its NULL left/right children. Insert skb. */
++              rb_link_node(&skb->rbnode, parent, rbn);
++      }
++      rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+-      dev = skb->dev;
+       if (dev) {
+               qp->iif = dev->ifindex;
+               skb->dev = NULL;
+       }
++      FRAG_CB(skb)->offset = offset;
++
+       qp->q.stamp = skb->tstamp;
+       qp->q.meat += skb->len;
+       qp->ecn |= ecn;
+@@ -475,7 +487,7 @@ found:
+               unsigned long orefdst = skb->_skb_refdst;
+               skb->_skb_refdst = 0UL;
+-              err = ip_frag_reasm(qp, prev, dev);
++              err = ip_frag_reasm(qp, skb, dev);
+               skb->_skb_refdst = orefdst;
+               return err;
+       }
+@@ -492,15 +504,15 @@ err:
+       return err;
+ }
+-
+ /* Build a new IP datagram from all its fragments. */
+-
+-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
++static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+                        struct net_device *dev)
+ {
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct iphdr *iph;
+-      struct sk_buff *fp, *head = qp->q.fragments;
++      struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
++      struct sk_buff **nextp; /* To build frag_list. */
++      struct rb_node *rbn;
+       int len;
+       int ihlen;
+       int err;
+@@ -514,25 +526,21 @@ static int ip_frag_reasm(struct ipq *qp,
+               goto out_fail;
+       }
+       /* Make the one we just received the head. */
+-      if (prev) {
+-              head = prev->next;
+-              fp = skb_clone(head, GFP_ATOMIC);
++      if (head != skb) {
++              fp = skb_clone(skb, GFP_ATOMIC);
+               if (!fp)
+                       goto out_nomem;
+-              fp->next = head->next;
+-              if (!fp->next)
++              rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
++              if (qp->q.fragments_tail == skb)
+                       qp->q.fragments_tail = fp;
+-              prev->next = fp;
+-
+-              skb_morph(head, qp->q.fragments);
+-              head->next = qp->q.fragments->next;
+-
+-              consume_skb(qp->q.fragments);
+-              qp->q.fragments = head;
++              skb_morph(skb, head);
++              rb_replace_node(&head->rbnode, &skb->rbnode,
++                              &qp->q.rb_fragments);
++              consume_skb(head);
++              head = skb;
+       }
+-      WARN_ON(!head);
+       WARN_ON(FRAG_CB(head)->offset != 0);
+       /* Allocate a new buffer for the datagram. */
+@@ -557,24 +565,35 @@ static int ip_frag_reasm(struct ipq *qp,
+               clone = alloc_skb(0, GFP_ATOMIC);
+               if (!clone)
+                       goto out_nomem;
+-              clone->next = head->next;
+-              head->next = clone;
+               skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+               skb_frag_list_init(head);
+               for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+                       plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+               clone->len = clone->data_len = head->data_len - plen;
+-              head->data_len -= clone->len;
+-              head->len -= clone->len;
++              skb->truesize += clone->truesize;
+               clone->csum = 0;
+               clone->ip_summed = head->ip_summed;
+               add_frag_mem_limit(qp->q.net, clone->truesize);
++              skb_shinfo(head)->frag_list = clone;
++              nextp = &clone->next;
++      } else {
++              nextp = &skb_shinfo(head)->frag_list;
+       }
+-      skb_shinfo(head)->frag_list = head->next;
+       skb_push(head, head->data - skb_network_header(head));
+-      for (fp=head->next; fp; fp = fp->next) {
++      /* Traverse the tree in order, to build frag_list. */
++      rbn = rb_next(&head->rbnode);
++      rb_erase(&head->rbnode, &qp->q.rb_fragments);
++      while (rbn) {
++              struct rb_node *rbnext = rb_next(rbn);
++              fp = rb_to_skb(rbn);
++              rb_erase(rbn, &qp->q.rb_fragments);
++              rbn = rbnext;
++              *nextp = fp;
++              nextp = &fp->next;
++              fp->prev = NULL;
++              memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+               head->data_len += fp->len;
+               head->len += fp->len;
+               if (head->ip_summed != fp->ip_summed)
+@@ -585,7 +604,9 @@ static int ip_frag_reasm(struct ipq *qp,
+       }
+       sub_frag_mem_limit(qp->q.net, head->truesize);
++      *nextp = NULL;
+       head->next = NULL;
++      head->prev = NULL;
+       head->dev = dev;
+       head->tstamp = qp->q.stamp;
+       IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+@@ -613,6 +634,7 @@ static int ip_frag_reasm(struct ipq *qp,
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
+       qp->q.fragments = NULL;
++      qp->q.rb_fragments = RB_ROOT;
+       qp->q.fragments_tail = NULL;
+       return 0;
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -472,6 +472,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
+                                         head->csum);
+       fq->q.fragments = NULL;
++      fq->q.rb_fragments = RB_ROOT;
+       fq->q.fragments_tail = NULL;
+       /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -499,6 +499,7 @@ static int ip6_frag_reasm(struct frag_qu
+       IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+       rcu_read_unlock();
+       fq->q.fragments = NULL;
++      fq->q.rb_fragments = RB_ROOT;
+       fq->q.fragments_tail = NULL;
+       return 1;
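
Aside: the heart of the ip_frag_reasm() hunk above is the in-order walk that
flattens the fragment rbtree into the head skb's frag_list. A minimal sketch
of that traversal pattern, using the kernel's rbtree API but with an invented
node type (a simplified stand-in for sk_buff, not the kernel's structures):

#include <linux/rbtree.h>

struct frag_node {
	struct rb_node rbnode;
	struct frag_node *next;
};

/* Flatten an rbtree into a singly linked list, in key order. */
static struct frag_node *flatten_tree(struct rb_root *root)
{
	struct rb_node *rbn = rb_first(root);
	struct frag_node *head = NULL, **nextp = &head;

	while (rbn) {
		/* rb_next() must run before rb_erase() invalidates rbn,
		 * exactly as the patch caches rbnext above.
		 */
		struct rb_node *rbnext = rb_next(rbn);
		struct frag_node *n = rb_entry(rbn, struct frag_node, rbnode);

		rb_erase(rbn, root);
		*nextp = n;		/* append in traversal order */
		nextp = &n->next;
		rbn = rbnext;
	}
	*nextp = NULL;			/* terminate the list */
	return head;
}
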
diff --git a/queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch b/queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch
new file mode 100644
index 0000000..a68b23b
--- /dev/null
@@ -0,0 +1,33 @@
+From 70837ffe3085c9a91488b52ca13ac84424da1042 Mon Sep 17 00:00:00 2001
+From: Dan Carpenter <dan.carpenter@oracle.com>
+Date: Mon, 6 Aug 2018 22:17:35 +0300
+Subject: ipv4: frags: precedence bug in ip_expire()
+
+From: Dan Carpenter <dan.carpenter@oracle.com>
+
+commit 70837ffe3085c9a91488b52ca13ac84424da1042 upstream.
+
+We accidentally removed the parentheses here, but they are required
+because '!' has higher precedence than '&'.
+
+Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
+Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/ip_fragment.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -257,7 +257,7 @@ static void ip_expire(unsigned long arg)
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+-      if (!qp->q.flags & INET_FRAG_FIRST_IN)
++      if (!(qp->q.flags & INET_FRAG_FIRST_IN))
+               goto out;
+       /* sk_buff::dev and sk_buff::rbnode are unionized. So we
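
Aside: the precedence problem is easy to reproduce in userspace. '!' binds
tighter than '&', so the unparenthesized test collapses to 0 whenever flags
is non-zero. A small sketch (the flag bit value is illustrative):

#include <stdio.h>

#define INET_FRAG_FIRST_IN 0x04	/* illustrative bit value */

int main(void)
{
	unsigned int flags = 0x02;	/* FIRST_IN bit NOT set */

	/* Buggy: parses as (!flags) & 0x04, i.e. 0 & 0x04 -- prints 0 */
	printf("buggy: %d\n", !flags & INET_FRAG_FIRST_IN);

	/* Fixed: tests the intended bit -- prints 1 */
	printf("fixed: %d\n", !(flags & INET_FRAG_FIRST_IN));
	return 0;
}
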
diff --git a/queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch b/queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch
new file mode 100644
index 0000000..a054cde
--- /dev/null
@@ -0,0 +1,55 @@
+From 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 Mon Sep 17 00:00:00 2001
+From: Florian Westphal <fw@strlen.de>
+Date: Fri, 3 Aug 2018 02:22:20 +0200
+Subject: ipv6: defrag: drop non-last frags smaller than min mtu
+
+From: Florian Westphal <fw@strlen.de>
+
+commit 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 upstream.
+
+Don't bother with pathological cases; they only waste cycles.
+IPv6 requires a minimum MTU of 1280, so we should never see fragments
+smaller than this (except the last frag).
+
+v3: don't use awkward "-offset + len"
+v2: drop IPv4 part, which added same check w. IPV4_MIN_MTU (68).
+    There were concerns that there could be even smaller frags
+    generated by intermediate nodes, e.g. on radio networks.
+
+Cc: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    4 ++++
+ net/ipv6/reassembly.c                   |    4 ++++
+ 2 files changed, 8 insertions(+)
+
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -602,6 +602,10 @@ struct sk_buff *nf_ct_frag6_gather(struc
+       hdr = ipv6_hdr(clone);
+       fhdr = (struct frag_hdr *)skb_transport_header(clone);
++      if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
++          fhdr->frag_off & htons(IP6_MF))
++              goto ret_orig;
++
+       skb_orphan(skb);
+       fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
+                    skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -549,6 +549,10 @@ static int ipv6_frag_rcv(struct sk_buff
+               return 1;
+       }
++      if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
++          fhdr->frag_off & htons(IP6_MF))
++              goto fail_hdr;
++
+       fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
+                    skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+       if (fq) {
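
Aside: the guard added to both files is the same three-line test. Pulled out
on its own it reads as below (a sketch: the helper name is invented; the skb
accessors, IPV6_MIN_MTU (1280) and IP6_MF are the kernel's):

/* True for a fragment that is both smaller than the IPv6 minimum MTU
 * and not the last fragment (IP6_MF still set) -- i.e. one to drop.
 * Hypothetical helper for illustration only; assumes kernel headers.
 */
static bool ip6_frag_too_small(const struct sk_buff *skb,
			       const struct frag_hdr *fhdr)
{
	return skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
	       (fhdr->frag_off & htons(IP6_MF));
}
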
diff --git a/queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch b/queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch
new file mode 100644
index 0000000..6094b9c
--- /dev/null
@@ -0,0 +1,83 @@
+From ade446403bfb79d3528d56071a84b15351a139ad Mon Sep 17 00:00:00 2001
+From: Michal Kubecek <mkubecek@suse.cz>
+Date: Thu, 13 Dec 2018 17:23:32 +0100
+Subject: net: ipv4: do not handle duplicate fragments as overlapping
+
+From: Michal Kubecek <mkubecek@suse.cz>
+
+commit ade446403bfb79d3528d56071a84b15351a139ad upstream.
+
+Since commit 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping
+segments.") IPv4 reassembly code drops the whole queue whenever an
+overlapping fragment is received. However, the test is written in a way
+which detects duplicate fragments as overlapping so that in environments
+with many duplicate packets, fragmented packets may be undeliverable.
+
+Add an extra test and, for a (potentially) duplicate fragment, only drop
+the new fragment rather than the whole queue. Only the starting offset and
+length are checked, not the contents of the fragments, as that would be
+too expensive. For the same reason, the linear list ("run") of an rbtree
+node is not iterated; we only check whether the new fragment is a subset
+of the interval covered by existing consecutive fragments.
+
+v2: instead of an exact check iterating through linear list of an rbtree
+node, only check if the new fragment is subset of the "run" (suggested
+by Eric Dumazet)
+
+Fixes: 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping segments.")
+Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/ip_fragment.c |   18 ++++++++++++------
+ 1 file changed, 12 insertions(+), 6 deletions(-)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -400,10 +400,10 @@ static int ip_frag_queue(struct ipq *qp,
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct rb_node **rbn, *parent;
+       struct sk_buff *skb1, *prev_tail;
++      int ihl, end, skb1_run_end;
+       struct net_device *dev;
+       unsigned int fragsize;
+       int flags, offset;
+-      int ihl, end;
+       int err = -ENOENT;
+       u8 ecn;
+@@ -473,7 +473,9 @@ static int ip_frag_queue(struct ipq *qp,
+        *   overlapping fragment, the entire datagram (and any constituent
+        *   fragments) MUST be silently discarded.
+        *
+-       * We do the same here for IPv4 (and increment an snmp counter).
++       * We do the same here for IPv4 (and increment an snmp counter) but
++       * we do not want to drop the whole queue in response to a duplicate
++       * fragment.
+        */
+       /* Find out where to put this fragment.  */
+@@ -497,13 +499,17 @@ static int ip_frag_queue(struct ipq *qp,
+               do {
+                       parent = *rbn;
+                       skb1 = rb_to_skb(parent);
++                      skb1_run_end = FRAG_CB(skb1)->offset +
++                                     FRAG_CB(skb1)->frag_run_len;
+                       if (end <= FRAG_CB(skb1)->offset)
+                               rbn = &parent->rb_left;
+-                      else if (offset >= FRAG_CB(skb1)->offset +
+-                                              FRAG_CB(skb1)->frag_run_len)
++                      else if (offset >= skb1_run_end)
+                               rbn = &parent->rb_right;
+-                      else /* Found an overlap with skb1. */
+-                              goto discard_qp;
++                      else if (offset >= FRAG_CB(skb1)->offset &&
++                               end <= skb1_run_end)
++                              goto err; /* No new data, potential duplicate */
++                      else
++                              goto discard_qp; /* Found an overlap */
+               } while (*rbn);
+               /* Here we have parent properly set, and rbn pointing to
+                * one of its NULL left/right children. Insert skb.
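
Aside: the rbtree descent above reduces to a four-way comparison of the new
fragment [offset, end) against a node's run [run_start, run_end). A
standalone sketch of that decision (names invented for illustration):

enum frag_pos { GO_LEFT, GO_RIGHT, DUPLICATE, OVERLAP };

/* Classify a new fragment against an existing run of fragments. */
static enum frag_pos classify(int offset, int end,
			      int run_start, int run_end)
{
	if (end <= run_start)
		return GO_LEFT;		/* entirely before the run */
	if (offset >= run_end)
		return GO_RIGHT;	/* entirely after the run */
	if (offset >= run_start && end <= run_end)
		return DUPLICATE;	/* subset: no new data, drop skb only */
	return OVERLAP;			/* partial overlap: drop whole queue */
}
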
diff --git a/queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch b/queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch
new file mode 100644
index 0000000..17fe45c
--- /dev/null
@@ -0,0 +1,66 @@
+From 385114dec8a49b5e5945e77ba7de6356106713f4 Mon Sep 17 00:00:00 2001
+From: Peter Oskolkov <posk@google.com>
+Date: Thu, 2 Aug 2018 23:34:38 +0000
+Subject: net: modify skb_rbtree_purge to return the truesize of all purged skbs.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 385114dec8a49b5e5945e77ba7de6356106713f4 upstream.
+
+Tested: see the next patch in the series.
+
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/skbuff.h |    2 +-
+ net/core/skbuff.c      |    6 +++++-
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2273,7 +2273,7 @@ static inline void __skb_queue_purge(str
+               kfree_skb(skb);
+ }
+-void skb_rbtree_purge(struct rb_root *root);
++unsigned int skb_rbtree_purge(struct rb_root *root);
+ void *netdev_alloc_frag(unsigned int fragsz);
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -2380,23 +2380,27 @@ EXPORT_SYMBOL(skb_queue_purge);
+ /**
+  *    skb_rbtree_purge - empty a skb rbtree
+  *    @root: root of the rbtree to empty
++ *    Return value: the sum of truesizes of all purged skbs.
+  *
+  *    Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+  *    the list and one reference dropped. This function does not take
+  *    any lock. Synchronization should be handled by the caller (e.g., TCP
+  *    out-of-order queue is protected by the socket lock).
+  */
+-void skb_rbtree_purge(struct rb_root *root)
++unsigned int skb_rbtree_purge(struct rb_root *root)
+ {
+       struct rb_node *p = rb_first(root);
++      unsigned int sum = 0;
+       while (p) {
+               struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+               p = rb_next(p);
+               rb_erase(&skb->rbnode, root);
++              sum += skb->truesize;
+               kfree_skb(skb);
+       }
++      return sum;
+ }
+ /**
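
Aside: with the new return value, a caller can purge a fragment tree and
release the accounted memory in one step. The follow-up rbtree patches use
it roughly like this (a paraphrased sketch, not verbatim kernel code;
sub_frag_mem_limit() and q->net are as used in the patches above):

/* Purge all queued fragments and subtract their total truesize
 * from the per-netns fragment memory accounting.
 */
unsigned int sum_truesize = skb_rbtree_purge(&q->rb_fragments);

sub_frag_mem_limit(q->net, sum_truesize);
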
diff --git a/queue-4.4/series b/queue-4.4/series
index 4f4710deb1c05871b1ae54d48981151460a3a5be..6f645796b98f2cd9700ee9eb2d4135628b8ff4f4 100644
@@ -52,3 +52,14 @@ mm-oom-fix-use-after-free-in-oom_kill_process.patch
 cifs-always-resolve-hostname-before-reconnecting.patch
 drivers-core-remove-glue-dirs-from-sysfs-earlier.patch
 mm-migrate-don-t-rely-on-__pagemovable-of-newpage-after-unlocking-it.patch
+fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch
+ip-discard-ipv4-datagrams-with-overlapping-segments.patch
+net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch
+inet-frags-get-rif-of-inet_frag_evicting.patch
+ip-use-rb-trees-for-ip-frag-queue.patch
+ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch
+ip-add-helpers-to-process-in-order-fragments-faster.patch
+ip-process-in-order-fragments-efficiently.patch
+net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch
+ip-frags-fix-crash-in-ip_do_fragment.patch
+ipv4-frags-precedence-bug-in-ip_expire.patch