git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 7 Feb 2019 11:22:59 +0000 (12:22 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 7 Feb 2019 11:22:59 +0000 (12:22 +0100)
added patches:
inet-frags-add-a-pointer-to-struct-netns_frags.patch
inet-frags-better-deal-with-smp-races.patch
inet-frags-break-the-2gb-limit-for-frags-storage.patch
inet-frags-change-inet_frags_init_net-return-value.patch
inet-frags-do-not-clone-skb-in-ip_expire.patch
inet-frags-fix-ip6frag_low_thresh-boundary.patch
inet-frags-get-rid-of-ipfrag_skb_cb-frag_cb.patch
inet-frags-get-rif-of-inet_frag_evicting.patch
inet-frags-refactor-ipfrag_init.patch
inet-frags-refactor-ipv6_frag_init.patch
inet-frags-refactor-lowpan_net_frag_init.patch
inet-frags-remove-inet_frag_maybe_warn_overflow.patch
inet-frags-remove-some-helpers.patch
inet-frags-reorganize-struct-netns_frags.patch
inet-frags-use-rhashtables-for-reassembly-units.patch
ip-add-helpers-to-process-in-order-fragments-faster.patch
ip-discard-ipv4-datagrams-with-overlapping-segments.patch
ip-frags-fix-crash-in-ip_do_fragment.patch
ip-process-in-order-fragments-efficiently.patch
ip-use-rb-trees-for-ip-frag-queue.patch
ipfrag-really-prevent-allocation-on-netns-exit.patch
ipv4-frags-precedence-bug-in-ip_expire.patch
ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch
ipv6-frags-rewrite-ip6_expire_frag_queue.patch
net-fix-pskb_trim_rcsum_slow-with-odd-trim-offset.patch
net-ieee802154-6lowpan-fix-frag-reassembly.patch
net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch
net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch
net-pskb_trim_rcsum-and-checksum_complete-are-friends.patch
rhashtable-add-rhashtable_lookup.patch
rhashtable-add-rhashtable_lookup_get_insert_key.patch
rhashtable-add-schedule-points.patch
rhashtable-reorganize-struct-rhashtable-layout.patch

34 files changed:
queue-4.4/inet-frags-add-a-pointer-to-struct-netns_frags.patch [new file with mode: 0644]
queue-4.4/inet-frags-better-deal-with-smp-races.patch [new file with mode: 0644]
queue-4.4/inet-frags-break-the-2gb-limit-for-frags-storage.patch [new file with mode: 0644]
queue-4.4/inet-frags-change-inet_frags_init_net-return-value.patch [new file with mode: 0644]
queue-4.4/inet-frags-do-not-clone-skb-in-ip_expire.patch [new file with mode: 0644]
queue-4.4/inet-frags-fix-ip6frag_low_thresh-boundary.patch [new file with mode: 0644]
queue-4.4/inet-frags-get-rid-of-ipfrag_skb_cb-frag_cb.patch [new file with mode: 0644]
queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch [new file with mode: 0644]
queue-4.4/inet-frags-refactor-ipfrag_init.patch [new file with mode: 0644]
queue-4.4/inet-frags-refactor-ipv6_frag_init.patch [new file with mode: 0644]
queue-4.4/inet-frags-refactor-lowpan_net_frag_init.patch [new file with mode: 0644]
queue-4.4/inet-frags-remove-inet_frag_maybe_warn_overflow.patch [new file with mode: 0644]
queue-4.4/inet-frags-remove-some-helpers.patch [new file with mode: 0644]
queue-4.4/inet-frags-reorganize-struct-netns_frags.patch [new file with mode: 0644]
queue-4.4/inet-frags-use-rhashtables-for-reassembly-units.patch [new file with mode: 0644]
queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch [new file with mode: 0644]
queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch [new file with mode: 0644]
queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch [new file with mode: 0644]
queue-4.4/ip-process-in-order-fragments-efficiently.patch [new file with mode: 0644]
queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch [new file with mode: 0644]
queue-4.4/ipfrag-really-prevent-allocation-on-netns-exit.patch [new file with mode: 0644]
queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch [new file with mode: 0644]
queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch [new file with mode: 0644]
queue-4.4/ipv6-frags-rewrite-ip6_expire_frag_queue.patch [new file with mode: 0644]
queue-4.4/net-fix-pskb_trim_rcsum_slow-with-odd-trim-offset.patch [new file with mode: 0644]
queue-4.4/net-ieee802154-6lowpan-fix-frag-reassembly.patch [new file with mode: 0644]
queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch [new file with mode: 0644]
queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch [new file with mode: 0644]
queue-4.4/net-pskb_trim_rcsum-and-checksum_complete-are-friends.patch [new file with mode: 0644]
queue-4.4/rhashtable-add-rhashtable_lookup.patch [new file with mode: 0644]
queue-4.4/rhashtable-add-rhashtable_lookup_get_insert_key.patch [new file with mode: 0644]
queue-4.4/rhashtable-add-schedule-points.patch [new file with mode: 0644]
queue-4.4/rhashtable-reorganize-struct-rhashtable-layout.patch [new file with mode: 0644]
queue-4.4/series [new file with mode: 0644]

diff --git a/queue-4.4/inet-frags-add-a-pointer-to-struct-netns_frags.patch b/queue-4.4/inet-frags-add-a-pointer-to-struct-netns_frags.patch
new file mode 100644 (file)
index 0000000..3fc76d4
--- /dev/null
@@ -0,0 +1,420 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:50 -0700
+Subject: inet: frags: add a pointer to struct netns_frags
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 093ba72914b696521e4885756a68a3332782c8de upstream.
+
+In order to simplify the API, add to struct netns_frags a pointer to
+the struct inet_frags it belongs to. This will let us make things less complex.
+
+These functions no longer have a struct inet_frags parameter:
+
+inet_frag_destroy(struct inet_frag_queue *q  /*, struct inet_frags *f */)
+inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */)
+inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */)
+inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */)
+ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.4: inet_frag_{kill,put}() are called in some
+ different places; update all calls]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h                 |   11 ++++++-----
+ include/net/ipv6.h                      |    3 +--
+ net/ieee802154/6lowpan/reassembly.c     |   13 +++++++------
+ net/ipv4/inet_fragment.c                |   17 ++++++++++-------
+ net/ipv4/ip_fragment.c                  |   10 ++++++----
+ net/ipv6/netfilter/nf_conntrack_reasm.c |   16 +++++++++-------
+ net/ipv6/reassembly.c                   |   20 ++++++++++----------
+ 7 files changed, 49 insertions(+), 41 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -8,6 +8,7 @@ struct netns_frags {
+       int                     timeout;
+       int                     high_thresh;
+       int                     low_thresh;
++      struct inet_frags       *f;
+ };
+ /**
+@@ -108,20 +109,20 @@ static inline int inet_frags_init_net(st
+       atomic_set(&nf->mem, 0);
+       return 0;
+ }
+-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
++void inet_frags_exit_net(struct netns_frags *nf);
+-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
+-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
++void inet_frag_kill(struct inet_frag_queue *q);
++void inet_frag_destroy(struct inet_frag_queue *q);
+ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
+               struct inet_frags *f, void *key, unsigned int hash);
+ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+                                  const char *prefix);
+-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
++static inline void inet_frag_put(struct inet_frag_queue *q)
+ {
+       if (atomic_dec_and_test(&q->refcnt))
+-              inet_frag_destroy(q, f);
++              inet_frag_destroy(q);
+ }
+ static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+--- a/include/net/ipv6.h
++++ b/include/net/ipv6.h
+@@ -534,8 +534,7 @@ struct frag_queue {
+       u8                      ecn;
+ };
+-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
+-                         struct inet_frags *frags);
++void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
+ static inline bool ipv6_addr_any(const struct in6_addr *a)
+ {
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -93,10 +93,10 @@ static void lowpan_frag_expire(unsigned
+       if (fq->q.flags & INET_FRAG_COMPLETE)
+               goto out;
+-      inet_frag_kill(&fq->q, &lowpan_frags);
++      inet_frag_kill(&fq->q);
+ out:
+       spin_unlock(&fq->q.lock);
+-      inet_frag_put(&fq->q, &lowpan_frags);
++      inet_frag_put(&fq->q);
+ }
+ static inline struct lowpan_frag_queue *
+@@ -229,7 +229,7 @@ static int lowpan_frag_reasm(struct lowp
+       struct sk_buff *fp, *head = fq->q.fragments;
+       int sum_truesize;
+-      inet_frag_kill(&fq->q, &lowpan_frags);
++      inet_frag_kill(&fq->q);
+       /* Make the one we just received the head. */
+       if (prev) {
+@@ -437,7 +437,7 @@ int lowpan_frag_rcv(struct sk_buff *skb,
+               ret = lowpan_frag_queue(fq, skb, frag_type);
+               spin_unlock(&fq->q.lock);
+-              inet_frag_put(&fq->q, &lowpan_frags);
++              inet_frag_put(&fq->q);
+               return ret;
+       }
+@@ -585,13 +585,14 @@ static int __net_init lowpan_frags_init_
+       ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
++      ieee802154_lowpan->frags.f = &lowpan_frags;
+       res = inet_frags_init_net(&ieee802154_lowpan->frags);
+       if (res < 0)
+               return res;
+       res = lowpan_frags_ns_sysctl_register(net);
+       if (res < 0)
+-              inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
++              inet_frags_exit_net(&ieee802154_lowpan->frags);
+       return res;
+ }
+@@ -601,7 +602,7 @@ static void __net_exit lowpan_frags_exit
+               net_ieee802154_lowpan(net);
+       lowpan_frags_ns_sysctl_unregister(net);
+-      inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
++      inet_frags_exit_net(&ieee802154_lowpan->frags);
+ }
+ static struct pernet_operations lowpan_frags_ops = {
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -219,8 +219,9 @@ void inet_frags_fini(struct inet_frags *
+ }
+ EXPORT_SYMBOL(inet_frags_fini);
+-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
++void inet_frags_exit_net(struct netns_frags *nf)
+ {
+      struct inet_frags *f = nf->f;
+       unsigned int seq;
+       int i;
+@@ -264,23 +265,23 @@ __acquires(hb->chain_lock)
+       return hb;
+ }
+-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
++static inline void fq_unlink(struct inet_frag_queue *fq)
+ {
+       struct inet_frag_bucket *hb;
+-      hb = get_frag_bucket_locked(fq, f);
++      hb = get_frag_bucket_locked(fq, fq->net->f);
+       hlist_del(&fq->list);
+       fq->flags |= INET_FRAG_COMPLETE;
+       spin_unlock(&hb->chain_lock);
+ }
+-void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
++void inet_frag_kill(struct inet_frag_queue *fq)
+ {
+       if (del_timer(&fq->timer))
+               atomic_dec(&fq->refcnt);
+       if (!(fq->flags & INET_FRAG_COMPLETE)) {
+-              fq_unlink(fq, f);
++              fq_unlink(fq);
+               atomic_dec(&fq->refcnt);
+       }
+ }
+@@ -294,11 +295,12 @@ static inline void frag_kfree_skb(struct
+       kfree_skb(skb);
+ }
+-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
++void inet_frag_destroy(struct inet_frag_queue *q)
+ {
+       struct sk_buff *fp;
+       struct netns_frags *nf;
+       unsigned int sum, sum_truesize = 0;
++      struct inet_frags *f;
+       WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
+       WARN_ON(del_timer(&q->timer) != 0);
+@@ -306,6 +308,7 @@ void inet_frag_destroy(struct inet_frag_
+       /* Release all fragment data. */
+       fp = q->fragments;
+       nf = q->net;
++      f = nf->f;
+       while (fp) {
+               struct sk_buff *xp = fp->next;
+@@ -341,7 +344,7 @@ static struct inet_frag_queue *inet_frag
+                       atomic_inc(&qp->refcnt);
+                       spin_unlock(&hb->chain_lock);
+                       qp_in->flags |= INET_FRAG_COMPLETE;
+-                      inet_frag_put(qp_in, f);
++                      inet_frag_put(qp_in);
+                       return qp;
+               }
+       }
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -169,7 +169,7 @@ static void ip4_frag_free(struct inet_fr
+ static void ipq_put(struct ipq *ipq)
+ {
+-      inet_frag_put(&ipq->q, &ip4_frags);
++      inet_frag_put(&ipq->q);
+ }
+ /* Kill ipq entry. It is not destroyed immediately,
+@@ -177,7 +177,7 @@ static void ipq_put(struct ipq *ipq)
+  */
+ static void ipq_kill(struct ipq *ipq)
+ {
+-      inet_frag_kill(&ipq->q, &ip4_frags);
++      inet_frag_kill(&ipq->q);
+ }
+ static bool frag_expire_skip_icmp(u32 user)
+@@ -878,19 +878,21 @@ static int __net_init ipv4_frags_init_ne
+        */
+       net->ipv4.frags.timeout = IP_FRAG_TIME;
++      net->ipv4.frags.f = &ip4_frags;
++
+       res = inet_frags_init_net(&net->ipv4.frags);
+       if (res < 0)
+               return res;
+       res = ip4_frags_ns_ctl_register(net);
+       if (res < 0)
+-              inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
++              inet_frags_exit_net(&net->ipv4.frags);
+       return res;
+ }
+ static void __net_exit ipv4_frags_exit_net(struct net *net)
+ {
+       ip4_frags_ns_ctl_unregister(net);
+-      inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
++      inet_frags_exit_net(&net->ipv4.frags);
+ }
+ static struct pernet_operations ip4_frags_ops = {
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -184,7 +184,7 @@ static void nf_ct_frag6_expire(unsigned
+       fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
+       net = container_of(fq->q.net, struct net, nf_frag.frags);
+-      ip6_expire_frag_queue(net, fq, &nf_frags);
++      ip6_expire_frag_queue(net, fq);
+ }
+ /* Creation primitives. */
+@@ -362,7 +362,7 @@ found:
+       return 0;
+ discard_fq:
+-      inet_frag_kill(&fq->q, &nf_frags);
++      inet_frag_kill(&fq->q);
+ err:
+       return -1;
+ }
+@@ -383,7 +383,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
+       int    payload_len;
+       u8 ecn;
+-      inet_frag_kill(&fq->q, &nf_frags);
++      inet_frag_kill(&fq->q);
+       WARN_ON(head == NULL);
+       WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
+@@ -614,7 +614,7 @@ struct sk_buff *nf_ct_frag6_gather(struc
+       if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
+               spin_unlock_bh(&fq->q.lock);
+               pr_debug("Can't insert skb to queue\n");
+-              inet_frag_put(&fq->q, &nf_frags);
++              inet_frag_put(&fq->q);
+               goto ret_orig;
+       }
+@@ -626,7 +626,7 @@ struct sk_buff *nf_ct_frag6_gather(struc
+       }
+       spin_unlock_bh(&fq->q.lock);
+-      inet_frag_put(&fq->q, &nf_frags);
++      inet_frag_put(&fq->q);
+       return ret_skb;
+ ret_orig:
+@@ -655,19 +655,21 @@ static int nf_ct_net_init(struct net *ne
+       net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
++      net->nf_frag.frags.f = &nf_frags;
++
+       res = inet_frags_init_net(&net->nf_frag.frags);
+       if (res < 0)
+               return res;
+       res = nf_ct_frag6_sysctl_register(net);
+       if (res < 0)
+-              inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
++              inet_frags_exit_net(&net->nf_frag.frags);
+       return res;
+ }
+ static void nf_ct_net_exit(struct net *net)
+ {
+       nf_ct_frags6_sysctl_unregister(net);
+-      inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
++      inet_frags_exit_net(&net->nf_frag.frags);
+ }
+ static struct pernet_operations nf_ct_net_ops = {
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -128,8 +128,7 @@ void ip6_frag_init(struct inet_frag_queu
+ }
+ EXPORT_SYMBOL(ip6_frag_init);
+-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
+-                         struct inet_frags *frags)
++void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
+ {
+       struct net_device *dev = NULL;
+@@ -138,7 +137,7 @@ void ip6_expire_frag_queue(struct net *n
+       if (fq->q.flags & INET_FRAG_COMPLETE)
+               goto out;
+-      inet_frag_kill(&fq->q, frags);
++      inet_frag_kill(&fq->q);
+       rcu_read_lock();
+       dev = dev_get_by_index_rcu(net, fq->iif);
+@@ -166,7 +165,7 @@ out_rcu_unlock:
+       rcu_read_unlock();
+ out:
+       spin_unlock(&fq->q.lock);
+-      inet_frag_put(&fq->q, frags);
++      inet_frag_put(&fq->q);
+ }
+ EXPORT_SYMBOL(ip6_expire_frag_queue);
+@@ -178,7 +177,7 @@ static void ip6_frag_expire(unsigned lon
+       fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
+       net = container_of(fq->q.net, struct net, ipv6.frags);
+-      ip6_expire_frag_queue(net, fq, &ip6_frags);
++      ip6_expire_frag_queue(net, fq);
+ }
+ static struct frag_queue *
+@@ -359,7 +358,7 @@ found:
+       return -1;
+ discard_fq:
+-      inet_frag_kill(&fq->q, &ip6_frags);
++      inet_frag_kill(&fq->q);
+ err:
+       IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+                        IPSTATS_MIB_REASMFAILS);
+@@ -386,7 +385,7 @@ static int ip6_frag_reasm(struct frag_qu
+       int sum_truesize;
+       u8 ecn;
+-      inet_frag_kill(&fq->q, &ip6_frags);
++      inet_frag_kill(&fq->q);
+       ecn = ip_frag_ecn_table[fq->ecn];
+       if (unlikely(ecn == 0xff))
+@@ -562,7 +561,7 @@ static int ipv6_frag_rcv(struct sk_buff
+               ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
+               spin_unlock(&fq->q.lock);
+-              inet_frag_put(&fq->q, &ip6_frags);
++              inet_frag_put(&fq->q);
+               return ret;
+       }
+@@ -713,6 +712,7 @@ static int __net_init ipv6_frags_init_ne
+       net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
++      net->ipv6.frags.f = &ip6_frags;
+       res = inet_frags_init_net(&net->ipv6.frags);
+       if (res < 0)
+@@ -720,14 +720,14 @@ static int __net_init ipv6_frags_init_ne
+       res = ip6_frags_ns_sysctl_register(net);
+       if (res < 0)
+-              inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
++              inet_frags_exit_net(&net->ipv6.frags);
+       return res;
+ }
+ static void __net_exit ipv6_frags_exit_net(struct net *net)
+ {
+       ip6_frags_ns_sysctl_unregister(net);
+-      inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
++      inet_frags_exit_net(&net->ipv6.frags);
+ }
+ static struct pernet_operations ip6_frags_ops = {
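
The API change above follows a well-known pattern: give each per-namespace object a back-pointer to its shared ops structure, so helpers can drop an explicit parameter. A minimal userspace sketch of that pattern (illustrative names only, not part of the queued patch):

    #include <stdio.h>

    struct netns_frags_demo;

    struct inet_frags_ops {                     /* shared, per-protocol state */
            const char *name;
            void (*destructor)(struct netns_frags_demo *nf);
    };

    struct netns_frags_demo {                   /* per-namespace state */
            long mem;
            const struct inet_frags_ops *f;     /* the back-pointer the patch adds */
    };

    static void demo_destructor(struct netns_frags_demo *nf)
    {
            printf("%s: releasing per-netns state (mem=%ld)\n",
                   nf->f->name, nf->mem);
    }

    /* Before: frags_exit_net(nf, f).  After: the ops come from nf->f. */
    static void frags_exit_net(struct netns_frags_demo *nf)
    {
            nf->f->destructor(nf);
    }

    int main(void)
    {
            static const struct inet_frags_ops ip4_ops = {
                    .name       = "ip4-frags-demo",
                    .destructor = demo_destructor,
            };
            struct netns_frags_demo ns = { .mem = 0, .f = &ip4_ops };

            frags_exit_net(&ns);    /* no second argument needed any more */
            return 0;
    }
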
diff --git a/queue-4.4/inet-frags-better-deal-with-smp-races.patch b/queue-4.4/inet-frags-better-deal-with-smp-races.patch
new file mode 100644 (file)
index 0000000..1350b0d
--- /dev/null
@@ -0,0 +1,85 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 8 Nov 2018 17:34:27 -0800
+Subject: inet: frags: better deal with smp races
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 0d5b9311baf27bb545f187f12ecfd558220c607d upstream.
+
+Multiple cpus might attempt to insert a new fragment in rhashtable,
+if for example RPS is buggy, as reported by 배석진 in
+https://patchwork.ozlabs.org/patch/994601/
+
+We use rhashtable_lookup_get_insert_key() instead of
+rhashtable_insert_fast() to let cpus losing the race
+free their own inet_frag_queue and use the one that
+was inserted by another cpu.
+
+Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: 배석진 <soukjin.bae@samsung.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/inet_fragment.c |   28 +++++++++++++++-------------
+ 1 file changed, 15 insertions(+), 13 deletions(-)
+
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -188,21 +188,22 @@ static struct inet_frag_queue *inet_frag
+ }
+ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+-                                              void *arg)
++                                              void *arg,
++                                              struct inet_frag_queue **prev)
+ {
+       struct inet_frags *f = nf->f;
+       struct inet_frag_queue *q;
+-      int err;
+       q = inet_frag_alloc(nf, f, arg);
+-      if (!q)
++      if (!q) {
++              *prev = ERR_PTR(-ENOMEM);
+               return NULL;
+-
++      }
+       mod_timer(&q->timer, jiffies + nf->timeout);
+-      err = rhashtable_insert_fast(&nf->rhashtable, &q->node,
+-                                   f->rhash_params);
+-      if (err < 0) {
++      *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
++                                               &q->node, f->rhash_params);
++      if (*prev) {
+               q->flags |= INET_FRAG_COMPLETE;
+               inet_frag_kill(q);
+               inet_frag_destroy(q);
+@@ -215,17 +216,18 @@ EXPORT_SYMBOL(inet_frag_create);
+ /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
+ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
+ {
+-      struct inet_frag_queue *fq;
++      struct inet_frag_queue *fq = NULL, *prev;
+       rcu_read_lock();
+-      fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
+-      if (fq) {
++      prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
++      if (!prev)
++              fq = inet_frag_create(nf, key, &prev);
++      if (prev && !IS_ERR(prev)) {
++              fq = prev;
+               if (!atomic_inc_not_zero(&fq->refcnt))
+                       fq = NULL;
+-              rcu_read_unlock();
+-              return fq;
+       }
+       rcu_read_unlock();
+-      return inet_frag_create(nf, key);
++      return fq;
+ }
+ EXPORT_SYMBOL(inet_frag_find);
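
The fix above leans on the rhashtable_lookup_get_insert_key() contract: a cpu that loses a concurrent insert gets back the entry the winner inserted, frees its own candidate, and uses the winner's. A minimal userspace sketch of that contract, with a mutex and a one-slot table standing in for the rhashtable (illustrative only):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct frag_demo { int key; };

    /* One-slot "hash table" standing in for the rhashtable. */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct frag_demo *slot;

    /* Insert q unless an entry with the same key already exists; on a lost
     * race, return the previously inserted entry instead of an error. */
    static struct frag_demo *insert_or_get(struct frag_demo *q)
    {
            struct frag_demo *prev = NULL;

            pthread_mutex_lock(&lock);
            if (slot && slot->key == q->key)
                    prev = slot;
            else
                    slot = q;
            pthread_mutex_unlock(&lock);
            return prev;
    }

    static struct frag_demo *find_or_create(int key)
    {
            struct frag_demo *q = malloc(sizeof(*q));
            struct frag_demo *prev;

            q->key = key;
            prev = insert_or_get(q);
            if (prev) {     /* lost the race: drop ours, use the winner's */
                    free(q);
                    return prev;
            }
            return q;
    }

    static void *worker(void *arg)
    {
            (void)arg;
            return find_or_create(42);
    }

    int main(void)
    {
            pthread_t a, b;
            void *qa, *qb;

            pthread_create(&a, NULL, worker, NULL);
            pthread_create(&b, NULL, worker, NULL);
            pthread_join(a, &qa);
            pthread_join(b, &qb);
            printf("both threads ended up with the same queue: %s\n",
                   qa == qb ? "yes" : "no");
            free(qa);
            return 0;
    }
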
diff --git a/queue-4.4/inet-frags-break-the-2gb-limit-for-frags-storage.patch b/queue-4.4/inet-frags-break-the-2gb-limit-for-frags-storage.patch
new file mode 100644 (file)
index 0000000..c40c6d9
--- /dev/null
@@ -0,0 +1,256 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:30:00 -0700
+Subject: inet: frags: break the 2GB limit for frags storage
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 3e67f106f619dcfaf6f4e2039599bdb69848c714 upstream.
+
+Some users are willing to provision huge amounts of memory to be able
+to perform reassembly reasonably well under pressure.
+
+Current memory tracking is using one atomic_t and integers.
+
+Switch to atomic_long_t so that 64bit arches can use more than 2GB,
+without any cost for 32bit arches.
+
+Note that this patch avoids an overflow error if high_thresh was set
+to ~2GB, since this test in inet_frag_alloc() was never true:
+
+if (... || frag_mem_limit(nf) > nf->high_thresh)
+
+Tested:
+
+$ echo 16000000000 >/proc/sys/net/ipv4/ipfrag_high_thresh
+
+<frag DDOS>
+
+$ grep FRAG /proc/net/sockstat
+FRAG: inuse 14705885 memory 16000002880
+
+$ nstat -n ; sleep 1 ; nstat | grep Reas
+IpReasmReqds                    3317150            0.0
+IpReasmFails                    3317112            0.0
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/networking/ip-sysctl.txt  |    4 ++--
+ include/net/inet_frag.h                 |   20 ++++++++++----------
+ net/ieee802154/6lowpan/reassembly.c     |   10 +++++-----
+ net/ipv4/ip_fragment.c                  |   10 +++++-----
+ net/ipv4/proc.c                         |    2 +-
+ net/ipv6/netfilter/nf_conntrack_reasm.c |   10 +++++-----
+ net/ipv6/proc.c                         |    2 +-
+ net/ipv6/reassembly.c                   |    6 +++---
+ 8 files changed, 32 insertions(+), 32 deletions(-)
+
+--- a/Documentation/networking/ip-sysctl.txt
++++ b/Documentation/networking/ip-sysctl.txt
+@@ -112,10 +112,10 @@ min_adv_mss - INTEGER
+ IP Fragmentation:
+-ipfrag_high_thresh - INTEGER
++ipfrag_high_thresh - LONG INTEGER
+       Maximum memory used to reassemble IP fragments.
+-ipfrag_low_thresh - INTEGER
++ipfrag_low_thresh - LONG INTEGER
+       (Obsolete since linux-4.17)
+       Maximum memory used to reassemble IP fragments before the kernel
+       begins to remove incomplete fragment queues to free up resources.
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -7,11 +7,11 @@ struct netns_frags {
+       struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
+       /* Keep atomic mem on separate cachelines in structs that include it */
+-      atomic_t                mem ____cacheline_aligned_in_smp;
++      atomic_long_t           mem ____cacheline_aligned_in_smp;
+       /* sysctls */
++      long                    high_thresh;
++      long                    low_thresh;
+       int                     timeout;
+-      int                     high_thresh;
+-      int                     low_thresh;
+       struct inet_frags       *f;
+ };
+@@ -101,7 +101,7 @@ void inet_frags_fini(struct inet_frags *
+ static inline int inet_frags_init_net(struct netns_frags *nf)
+ {
+-      atomic_set(&nf->mem, 0);
++      atomic_long_set(&nf->mem, 0);
+       return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
+ }
+ void inet_frags_exit_net(struct netns_frags *nf);
+@@ -118,19 +118,19 @@ static inline void inet_frag_put(struct
+ /* Memory Tracking Functions. */
+-static inline int frag_mem_limit(struct netns_frags *nf)
++static inline long frag_mem_limit(const struct netns_frags *nf)
+ {
+-      return atomic_read(&nf->mem);
++      return atomic_long_read(&nf->mem);
+ }
+-static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
++static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
+ {
+-      atomic_sub(i, &nf->mem);
++      atomic_long_sub(val, &nf->mem);
+ }
+-static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
++static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
+ {
+-      atomic_add(i, &nf->mem);
++      atomic_long_add(val, &nf->mem);
+ }
+ /* RFC 3168 support :
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -410,23 +410,23 @@ err:
+ }
+ #ifdef CONFIG_SYSCTL
+-static int zero;
++static long zero;
+ static struct ctl_table lowpan_frags_ns_ctl_table[] = {
+       {
+               .procname       = "6lowpanfrag_high_thresh",
+               .data           = &init_net.ieee802154_lowpan.frags.high_thresh,
+-              .maxlen         = sizeof(int),
++              .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec_minmax,
++              .proc_handler   = proc_doulongvec_minmax,
+               .extra1         = &init_net.ieee802154_lowpan.frags.low_thresh
+       },
+       {
+               .procname       = "6lowpanfrag_low_thresh",
+               .data           = &init_net.ieee802154_lowpan.frags.low_thresh,
+-              .maxlen         = sizeof(int),
++              .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec_minmax,
++              .proc_handler   = proc_doulongvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &init_net.ieee802154_lowpan.frags.high_thresh
+       },
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -682,23 +682,23 @@ struct sk_buff *ip_check_defrag(struct n
+ EXPORT_SYMBOL(ip_check_defrag);
+ #ifdef CONFIG_SYSCTL
+-static int zero;
++static long zero;
+ static struct ctl_table ip4_frags_ns_ctl_table[] = {
+       {
+               .procname       = "ipfrag_high_thresh",
+               .data           = &init_net.ipv4.frags.high_thresh,
+-              .maxlen         = sizeof(int),
++              .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec_minmax,
++              .proc_handler   = proc_doulongvec_minmax,
+               .extra1         = &init_net.ipv4.frags.low_thresh
+       },
+       {
+               .procname       = "ipfrag_low_thresh",
+               .data           = &init_net.ipv4.frags.low_thresh,
+-              .maxlen         = sizeof(int),
++              .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec_minmax,
++              .proc_handler   = proc_doulongvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &init_net.ipv4.frags.high_thresh
+       },
+--- a/net/ipv4/proc.c
++++ b/net/ipv4/proc.c
+@@ -71,7 +71,7 @@ static int sockstat_seq_show(struct seq_
+                  sock_prot_inuse_get(net, &udplite_prot));
+       seq_printf(seq, "RAW: inuse %d\n",
+                  sock_prot_inuse_get(net, &raw_prot));
+-      seq_printf(seq,  "FRAG: inuse %u memory %u\n",
++      seq_printf(seq,  "FRAG: inuse %u memory %lu\n",
+                  atomic_read(&net->ipv4.frags.rhashtable.nelems),
+                  frag_mem_limit(&net->ipv4.frags));
+       return 0;
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -64,7 +64,7 @@ struct nf_ct_frag6_skb_cb
+ static struct inet_frags nf_frags;
+ #ifdef CONFIG_SYSCTL
+-static int zero;
++static long zero;
+ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
+       {
+@@ -77,18 +77,18 @@ static struct ctl_table nf_ct_frag6_sysc
+       {
+               .procname       = "nf_conntrack_frag6_low_thresh",
+               .data           = &init_net.nf_frag.frags.low_thresh,
+-              .maxlen         = sizeof(unsigned int),
++              .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec_minmax,
++              .proc_handler   = proc_doulongvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &init_net.nf_frag.frags.high_thresh
+       },
+       {
+               .procname       = "nf_conntrack_frag6_high_thresh",
+               .data           = &init_net.nf_frag.frags.high_thresh,
+-              .maxlen         = sizeof(unsigned int),
++              .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec_minmax,
++              .proc_handler   = proc_doulongvec_minmax,
+               .extra1         = &init_net.nf_frag.frags.low_thresh
+       },
+       { }
+--- a/net/ipv6/proc.c
++++ b/net/ipv6/proc.c
+@@ -42,7 +42,7 @@ static int sockstat6_seq_show(struct seq
+                       sock_prot_inuse_get(net, &udplitev6_prot));
+       seq_printf(seq, "RAW6: inuse %d\n",
+                      sock_prot_inuse_get(net, &rawv6_prot));
+-      seq_printf(seq, "FRAG6: inuse %u memory %u\n",
++      seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
+                  atomic_read(&net->ipv6.frags.rhashtable.nelems),
+                  frag_mem_limit(&net->ipv6.frags));
+       return 0;
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -545,15 +545,15 @@ static struct ctl_table ip6_frags_ns_ctl
+       {
+               .procname       = "ip6frag_high_thresh",
+               .data           = &init_net.ipv6.frags.high_thresh,
+-              .maxlen         = sizeof(int),
++              .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec_minmax,
++              .proc_handler   = proc_doulongvec_minmax,
+               .extra1         = &init_net.ipv6.frags.low_thresh
+       },
+       {
+               .procname       = "ip6frag_low_thresh",
+               .data           = &init_net.ipv6.frags.low_thresh,
+-              .maxlen         = sizeof(int),
++              .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
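
Why the atomic_t switch matters: a 32-bit counter wraps once accounted memory passes 2GB, so a test like frag_mem_limit(nf) > nf->high_thresh can never fire for thresholds near that mark. A small sketch of the wraparound (userspace C, assumes an LP64 target; unsigned arithmetic keeps the wrap well-defined):

    #include <stdio.h>

    int main(void)
    {
            unsigned int mem32 = 0; /* old atomic_t-style accounting (32 bits) */
            long         mem64 = 0; /* new atomic_long_t-style accounting      */
            long         high_thresh = 2L << 30;   /* ~2GB threshold           */
            long         i;

            for (i = 0; i < 3072; i++) {  /* account 3GB in 1MB increments */
                    mem32 += 1U << 20;
                    mem64 += 1L << 20;
            }
            /* The 32-bit counter wrapped; viewed as a signed value it is
             * negative, so "mem > high_thresh" can never fire. */
            printf("32-bit counter as signed: %d\n", (int)mem32);
            printf("64-bit counter          : %ld (over thresh: %s)\n",
                   mem64, mem64 > high_thresh ? "yes" : "no");
            return 0;
    }
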
diff --git a/queue-4.4/inet-frags-change-inet_frags_init_net-return-value.patch b/queue-4.4/inet-frags-change-inet_frags_init_net-return-value.patch
new file mode 100644 (file)
index 0000000..125a106
--- /dev/null
@@ -0,0 +1,143 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:49 -0700
+Subject: inet: frags: change inet_frags_init_net() return value
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 787bea7748a76130566f881c2342a0be4127d182 upstream.
+
+We will soon initialize one rhashtable per struct netns_frags
+in inet_frags_init_net().
+
+This patch changes the return value to eventually propagate an
+error.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h                 |    3 ++-
+ net/ieee802154/6lowpan/reassembly.c     |   11 ++++++++---
+ net/ipv4/ip_fragment.c                  |   12 +++++++++---
+ net/ipv6/netfilter/nf_conntrack_reasm.c |   12 +++++++++---
+ net/ipv6/reassembly.c                   |   11 +++++++++--
+ 5 files changed, 37 insertions(+), 12 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -103,9 +103,10 @@ struct inet_frags {
+ int inet_frags_init(struct inet_frags *);
+ void inet_frags_fini(struct inet_frags *);
+-static inline void inet_frags_init_net(struct netns_frags *nf)
++static inline int inet_frags_init_net(struct netns_frags *nf)
+ {
+       atomic_set(&nf->mem, 0);
++      return 0;
+ }
+ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -580,14 +580,19 @@ static int __net_init lowpan_frags_init_
+ {
+       struct netns_ieee802154_lowpan *ieee802154_lowpan =
+               net_ieee802154_lowpan(net);
++      int res;
+       ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+-      inet_frags_init_net(&ieee802154_lowpan->frags);
+-
+-      return lowpan_frags_ns_sysctl_register(net);
++      res = inet_frags_init_net(&ieee802154_lowpan->frags);
++      if (res < 0)
++              return res;
++      res = lowpan_frags_ns_sysctl_register(net);
++      if (res < 0)
++              inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
++      return res;
+ }
+ static void __net_exit lowpan_frags_exit_net(struct net *net)
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -853,6 +853,8 @@ static void __init ip4_frags_ctl_registe
+ static int __net_init ipv4_frags_init_net(struct net *net)
+ {
++      int res;
++
+       /* Fragment cache limits.
+        *
+        * The fragment memory accounting code, (tries to) account for
+@@ -876,9 +878,13 @@ static int __net_init ipv4_frags_init_ne
+        */
+       net->ipv4.frags.timeout = IP_FRAG_TIME;
+-      inet_frags_init_net(&net->ipv4.frags);
+-
+-      return ip4_frags_ns_ctl_register(net);
++      res = inet_frags_init_net(&net->ipv4.frags);
++      if (res < 0)
++              return res;
++      res = ip4_frags_ns_ctl_register(net);
++      if (res < 0)
++              inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
++      return res;
+ }
+ static void __net_exit ipv4_frags_exit_net(struct net *net)
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -650,12 +650,18 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_or
+ static int nf_ct_net_init(struct net *net)
+ {
++      int res;
++
+       net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
+-      inet_frags_init_net(&net->nf_frag.frags);
+-
+-      return nf_ct_frag6_sysctl_register(net);
++      res = inet_frags_init_net(&net->nf_frag.frags);
++      if (res < 0)
++              return res;
++      res = nf_ct_frag6_sysctl_register(net);
++      if (res < 0)
++              inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
++      return res;
+ }
+ static void nf_ct_net_exit(struct net *net)
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -708,13 +708,20 @@ static void ip6_frags_sysctl_unregister(
+ static int __net_init ipv6_frags_init_net(struct net *net)
+ {
++      int res;
++
+       net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+-      inet_frags_init_net(&net->ipv6.frags);
++      res = inet_frags_init_net(&net->ipv6.frags);
++      if (res < 0)
++              return res;
+-      return ip6_frags_ns_sysctl_register(net);
++      res = ip6_frags_ns_sysctl_register(net);
++      if (res < 0)
++              inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
++      return res;
+ }
+ static void __net_exit ipv6_frags_exit_net(struct net *net)
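
All the per-netns init paths touched above converge on the same shape: each setup step either succeeds or is unwound before the error is propagated to the caller. A compact sketch of that shape with stand-in functions (illustrative, not kernel code):

    #include <stdio.h>

    /* Stand-ins, each reporting failure with a negative code. */
    static int inet_frags_init_net_demo(void)  { return 0; }
    static int sysctl_register_demo(void)      { return -1; /* simulate failure */ }
    static void inet_frags_exit_net_demo(void) { puts("rolled back frag state"); }

    /* Each step either succeeds or is unwound before the error propagates. */
    static int frags_init_net_demo(void)
    {
            int res;

            res = inet_frags_init_net_demo();
            if (res < 0)
                    return res;
            res = sysctl_register_demo();
            if (res < 0)
                    inet_frags_exit_net_demo();
            return res;
    }

    int main(void)
    {
            printf("init: %d\n", frags_init_net_demo());
            return 0;
    }
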
diff --git a/queue-4.4/inet-frags-do-not-clone-skb-in-ip_expire.patch b/queue-4.4/inet-frags-do-not-clone-skb-in-ip_expire.patch
new file mode 100644 (file)
index 0000000..7a660d4
--- /dev/null
@@ -0,0 +1,62 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:30:01 -0700
+Subject: inet: frags: do not clone skb in ip_expire()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 1eec5d5670084ee644597bd26c25e22c69b9f748 upstream.
+
+An skb_clone() was added in commit ec4fbd64751d ("inet: frag: release
+spinlock before calling icmp_send()")
+
+While it fixed the bug at the time, it also added a very high cost
+for DDOS frags, as the ICMP rate limit is applied after this
+expensive operation (skb_clone() + consume_skb(), implying memory
+allocations, copy, and freeing).
+
+We can use skb_get(head) here; all we want is to make sure the skb
+won't be freed by another cpu.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_fragment.c |   16 ++++++----------
+ 1 file changed, 6 insertions(+), 10 deletions(-)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -143,8 +143,8 @@ static bool frag_expire_skip_icmp(u32 us
+  */
+ static void ip_expire(unsigned long arg)
+ {
+-      struct sk_buff *clone, *head;
+       const struct iphdr *iph;
++      struct sk_buff *head;
+       struct net *net;
+       struct ipq *qp;
+       int err;
+@@ -187,16 +187,12 @@ static void ip_expire(unsigned long arg)
+           (skb_rtable(head)->rt_type != RTN_LOCAL))
+               goto out;
+-      clone = skb_clone(head, GFP_ATOMIC);
++      skb_get(head);
++      spin_unlock(&qp->q.lock);
++      icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
++      kfree_skb(head);
++      goto out_rcu_unlock;
+-      /* Send an ICMP "Fragment Reassembly Timeout" message. */
+-      if (clone) {
+-              spin_unlock(&qp->q.lock);
+-              icmp_send(clone, ICMP_TIME_EXCEEDED,
+-                        ICMP_EXC_FRAGTIME, 0);
+-              consume_skb(clone);
+-              goto out_rcu_unlock;
+-      }
+ out:
+       spin_unlock(&qp->q.lock);
+ out_rcu_unlock:
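
The gain here is replacing a clone, which allocates and copies, with a reference-count increment that merely pins the buffer. A toy refcounted buffer makes the difference visible (illustrative userspace C; a real skb_clone() copies metadata rather than payload, but still allocates):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct buf { int refcnt; size_t len; unsigned char *data; };

    /* skb_get()-style: O(1), just pin the buffer. */
    static struct buf *buf_get(struct buf *b) { b->refcnt++; return b; }

    static void buf_put(struct buf *b)
    {
            if (--b->refcnt == 0) {
                    free(b->data);
                    free(b);
            }
    }

    /* skb_clone()-style stand-in: allocates and copies, the cost the
     * patch removes from the frag-expiry path. */
    static struct buf *buf_clone(const struct buf *b)
    {
            struct buf *c = malloc(sizeof(*c));

            c->refcnt = 1;
            c->len = b->len;
            c->data = malloc(b->len);
            memcpy(c->data, b->data, b->len);
            return c;
    }

    int main(void)
    {
            struct buf *head = malloc(sizeof(*head));

            head->refcnt = 1;
            head->len = 1500;
            head->data = calloc(1, head->len);

            struct buf *pinned = buf_get(head);   /* cheap: refcnt is now 2 */
            buf_put(pinned);                      /* e.g. after icmp_send() */

            struct buf *cloned = buf_clone(head); /* the expensive alternative */
            buf_put(cloned);
            buf_put(head);
            return 0;
    }
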
diff --git a/queue-4.4/inet-frags-fix-ip6frag_low_thresh-boundary.patch b/queue-4.4/inet-frags-fix-ip6frag_low_thresh-boundary.patch
new file mode 100644 (file)
index 0000000..5bbdf81
--- /dev/null
@@ -0,0 +1,212 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:30:06 -0700
+Subject: inet: frags: fix ip6frag_low_thresh boundary
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 3d23401283e80ceb03f765842787e0e79ff598b7 upstream.
+
+Giving an integer to proc_doulongvec_minmax() is dangerous on 64bit
+arches, since the linker might place a non-zero value next to it,
+preventing any change to ip6frag_low_thresh.
+
+ip6frag_low_thresh is no longer used in the kernel, but we do not
+want to prematurely break user scripts that want to change it.
+
+Since specifying a minimal value of 0 for proc_doulongvec_minmax()
+is moot, let's remove these zero values in all defrag units.
+
+Fixes: 6e00f7dd5e4e ("ipv6: frags: fix /proc/sys/net/ipv6/ip6frag_low_thresh")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Maciej Żenczykowski <maze@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ieee802154/6lowpan/reassembly.c     |    2 -
+ net/ipv4/ip_fragment.c                  |   40 ++++++++++++--------------------
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    2 -
+ net/ipv6/reassembly.c                   |    4 ---
+ 4 files changed, 17 insertions(+), 31 deletions(-)
+
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -410,7 +410,6 @@ err:
+ }
+ #ifdef CONFIG_SYSCTL
+-static long zero;
+ static struct ctl_table lowpan_frags_ns_ctl_table[] = {
+       {
+@@ -427,7 +426,6 @@ static struct ctl_table lowpan_frags_ns_
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = proc_doulongvec_minmax,
+-              .extra1         = &zero,
+               .extra2         = &init_net.ieee802154_lowpan.frags.high_thresh
+       },
+       {
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -58,14 +58,6 @@
+ static int sysctl_ipfrag_max_dist __read_mostly = 64;
+ static const char ip_frag_cache_name[] = "ip4-frags";
+-struct ipfrag_skb_cb
+-{
+-      struct inet_skb_parm    h;
+-      int                     offset;
+-};
+-
+-#define FRAG_CB(skb)  ((struct ipfrag_skb_cb *)((skb)->cb))
+-
+ /* Describe an entry in the "incomplete datagrams" queue. */
+ struct ipq {
+       struct inet_frag_queue q;
+@@ -353,13 +345,13 @@ static int ip_frag_queue(struct ipq *qp,
+        * this fragment, right?
+        */
+       prev = qp->q.fragments_tail;
+-      if (!prev || FRAG_CB(prev)->offset < offset) {
++      if (!prev || prev->ip_defrag_offset < offset) {
+               next = NULL;
+               goto found;
+       }
+       prev = NULL;
+       for (next = qp->q.fragments; next != NULL; next = next->next) {
+-              if (FRAG_CB(next)->offset >= offset)
++              if (next->ip_defrag_offset >= offset)
+                       break;  /* bingo! */
+               prev = next;
+       }
+@@ -370,7 +362,7 @@ found:
+        * any overlaps are eliminated.
+        */
+       if (prev) {
+-              int i = (FRAG_CB(prev)->offset + prev->len) - offset;
++              int i = (prev->ip_defrag_offset + prev->len) - offset;
+               if (i > 0) {
+                       offset += i;
+@@ -387,8 +379,8 @@ found:
+       err = -ENOMEM;
+-      while (next && FRAG_CB(next)->offset < end) {
+-              int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
++      while (next && next->ip_defrag_offset < end) {
++              int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
+               if (i < next->len) {
+                       /* Eat head of the next overlapped fragment
+@@ -396,7 +388,7 @@ found:
+                        */
+                       if (!pskb_pull(next, i))
+                               goto err;
+-                      FRAG_CB(next)->offset += i;
++                      next->ip_defrag_offset += i;
+                       qp->q.meat -= i;
+                       if (next->ip_summed != CHECKSUM_UNNECESSARY)
+                               next->ip_summed = CHECKSUM_NONE;
+@@ -420,7 +412,13 @@ found:
+               }
+       }
+-      FRAG_CB(skb)->offset = offset;
++      /* Note : skb->ip_defrag_offset and skb->dev share the same location */
++      dev = skb->dev;
++      if (dev)
++              qp->iif = dev->ifindex;
++      /* Makes sure compiler wont do silly aliasing games */
++      barrier();
++      skb->ip_defrag_offset = offset;
+       /* Insert this fragment in the chain of fragments. */
+       skb->next = next;
+@@ -431,11 +429,6 @@ found:
+       else
+               qp->q.fragments = skb;
+-      dev = skb->dev;
+-      if (dev) {
+-              qp->iif = dev->ifindex;
+-              skb->dev = NULL;
+-      }
+       qp->q.stamp = skb->tstamp;
+       qp->q.meat += skb->len;
+       qp->ecn |= ecn;
+@@ -511,7 +504,7 @@ static int ip_frag_reasm(struct ipq *qp,
+       }
+       WARN_ON(!head);
+-      WARN_ON(FRAG_CB(head)->offset != 0);
++      WARN_ON(head->ip_defrag_offset != 0);
+       /* Allocate a new buffer for the datagram. */
+       ihlen = ip_hdrlen(head);
+@@ -678,7 +671,7 @@ struct sk_buff *ip_check_defrag(struct n
+ EXPORT_SYMBOL(ip_check_defrag);
+ #ifdef CONFIG_SYSCTL
+-static long zero;
++static int dist_min;
+ static struct ctl_table ip4_frags_ns_ctl_table[] = {
+       {
+@@ -695,7 +688,6 @@ static struct ctl_table ip4_frags_ns_ctl
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = proc_doulongvec_minmax,
+-              .extra1         = &zero,
+               .extra2         = &init_net.ipv4.frags.high_thresh
+       },
+       {
+@@ -724,7 +716,7 @@ static struct ctl_table ip4_frags_ctl_ta
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+-              .extra1         = &zero
++              .extra1         = &dist_min,
+       },
+       { }
+ };
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -64,7 +64,6 @@ struct nf_ct_frag6_skb_cb
+ static struct inet_frags nf_frags;
+ #ifdef CONFIG_SYSCTL
+-static long zero;
+ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
+       {
+@@ -80,7 +79,6 @@ static struct ctl_table nf_ct_frag6_sysc
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = proc_doulongvec_minmax,
+-              .extra1         = &zero,
+               .extra2         = &init_net.nf_frag.frags.high_thresh
+       },
+       {
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -547,7 +547,6 @@ static const struct inet6_protocol frag_
+ };
+ #ifdef CONFIG_SYSCTL
+-static int zero;
+ static struct ctl_table ip6_frags_ns_ctl_table[] = {
+       {
+@@ -563,8 +562,7 @@ static struct ctl_table ip6_frags_ns_ctl
+               .data           = &init_net.ipv6.frags.low_thresh,
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec_minmax,
+-              .extra1         = &zero,
++              .proc_handler   = proc_doulongvec_minmax,
+               .extra2         = &init_net.ipv6.frags.high_thresh
+       },
+       {
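
The danger the message describes is concrete on LP64 systems: a handler that believes the sysctl data is a long reads sizeof(long) bytes from a 4-byte int slot and picks up whatever the linker placed next to it. A small demonstration (userspace C, assumes a 64-bit little-endian target; names are illustrative):

    #include <stdio.h>
    #include <string.h>

    /* Two adjacent 4-byte slots, laid out the way a linker might place them. */
    static struct {
            int low_thresh;   /* declared as int ...                         */
            int neighbour;    /* ... but followed by unrelated non-zero data */
    } table = { 0, 0x7fffffff };

    int main(void)
    {
            long as_long;

            /* A handler that believes the data is a long reads sizeof(long)
             * bytes on a 64-bit arch and picks up the neighbour as well. */
            memcpy(&as_long, &table, sizeof(as_long));
            printf("int value: %d, misread as long: %ld\n",
                   table.low_thresh, as_long);
            return 0;
    }
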
diff --git a/queue-4.4/inet-frags-get-rid-of-ipfrag_skb_cb-frag_cb.patch b/queue-4.4/inet-frags-get-rid-of-ipfrag_skb_cb-frag_cb.patch
new file mode 100644 (file)
index 0000000..acf588d
--- /dev/null
@@ -0,0 +1,48 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:30:05 -0700
+Subject: inet: frags: get rid of ipfrag_skb_cb/FRAG_CB
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit bf66337140c64c27fa37222b7abca7e49d63fb57 upstream.
+
+ip_defrag uses skb->cb[] to store the fragment offset, and unfortunately
+this integer is currently in a different cache line than skb->next,
+meaning that we use two cache lines per skb when finding the insertion point.
+
+By aliasing skb->ip_defrag_offset and skb->dev, we pack all the fields
+in a single cache line and save precious memory bandwidth.
+
+Note that after the fast path added by Changli Gao in commit
+d6bebca92c66 ("fragment: add fast path for in-order fragments")
+this change won't help the fast path, since we still need
+to access prev->len (2nd cache line), but it will show great
+benefits when the slow path is entered, since we perform
+a linear scan of a potentially long list.
+
+Also, note that this potential long list is an attack vector,
+we might consider also using an rb-tree there eventually.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/skbuff.h |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -558,6 +558,11 @@ struct sk_buff {
+               };
+               struct rb_node  rbnode; /* used in netem & tcp stack */
+       };
++
++      union {
++              int                     ip_defrag_offset;
++      };
++
+       struct sock             *sk;
+       struct net_device       *dev;
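
The idea is that while an skb sits on the defrag queue its ->dev slot is dead, so the same storage can hold the fragment offset and the list walk stays within one cache line. A sketch of that aliasing idea (illustrative C11; note the 4.4 hunk above differs in detail from the upstream union with ->dev):

    #include <stdio.h>

    struct fake_netdev { int ifindex; };

    /* While a packet sits on the defrag queue its ->dev slot is dead, so
     * the same bytes can hold the fragment offset instead. */
    struct fake_skb {
            struct fake_skb *next;
            union {
                    struct fake_netdev *dev;   /* valid before queueing    */
                    int ip_defrag_offset;      /* valid while on the queue */
            };
    };

    int main(void)
    {
            struct fake_netdev eth0 = { .ifindex = 2 };
            struct fake_skb skb = { .next = NULL, .dev = &eth0 };
            int iif;

            iif = skb.dev->ifindex;      /* save what we still need from ->dev */
            skb.ip_defrag_offset = 1480; /* then reuse the storage             */

            printf("iif=%d offset=%d\n", iif, skb.ip_defrag_offset);
            return 0;
    }
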
diff --git a/queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch b/queue-4.4/inet-frags-get-rif-of-inet_frag_evicting.patch
new file mode 100644 (file)
index 0000000..c1c9ae7
--- /dev/null
@@ -0,0 +1,148 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:58 -0700
+Subject: inet: frags: get rif of inet_frag_evicting()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 399d1404be660d355192ff4df5ccc3f4159ec1e4 upstream.
+
+This also refactors ip_expire(), removing one indentation level.
+
+Note: in the future, we should try hard to avoid the skb_clone()
+since this is a serious performance cost.
+Under DDOS, the ICMP message won't be sent because of rate limits.
+
+The fact that ip6_expire_frag_queue() does not use skb_clone() is
+disturbing too. Presumably IPv6 has the same
+issue as the one we fixed in commit ec4fbd64751d
+("inet: frag: release spinlock before calling icmp_send()")
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+[bwh: Backported to 4.4: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h |    5 ---
+ net/ipv4/ip_fragment.c  |   65 +++++++++++++++++++++++-------------------------
+ net/ipv6/reassembly.c   |    4 --
+ 3 files changed, 32 insertions(+), 42 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -118,11 +118,6 @@ static inline void inet_frag_put(struct
+               inet_frag_destroy(q);
+ }
+-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+-{
+-      return false;
+-}
+-
+ /* Memory Tracking Functions. */
+ static inline int frag_mem_limit(struct netns_frags *nf)
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -143,8 +143,11 @@ static bool frag_expire_skip_icmp(u32 us
+  */
+ static void ip_expire(unsigned long arg)
+ {
+-      struct ipq *qp;
++      struct sk_buff *clone, *head;
++      const struct iphdr *iph;
+       struct net *net;
++      struct ipq *qp;
++      int err;
+       qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+       net = container_of(qp->q.net, struct net, ipv4.frags);
+@@ -158,45 +161,41 @@ static void ip_expire(unsigned long arg)
+       ipq_kill(qp);
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+-      if (!inet_frag_evicting(&qp->q)) {
+-              struct sk_buff *clone, *head = qp->q.fragments;
+-              const struct iphdr *iph;
+-              int err;
++      head = qp->q.fragments;
+-              IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
++      IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+-              if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+-                      goto out;
++      if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
++              goto out;
+-              head->dev = dev_get_by_index_rcu(net, qp->iif);
+-              if (!head->dev)
+-                      goto out;
++      head->dev = dev_get_by_index_rcu(net, qp->iif);
++      if (!head->dev)
++              goto out;
+-              /* skb has no dst, perform route lookup again */
+-              iph = ip_hdr(head);
+-              err = ip_route_input_noref(head, iph->daddr, iph->saddr,
++      /* skb has no dst, perform route lookup again */
++      iph = ip_hdr(head);
++      err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+                                          iph->tos, head->dev);
+-              if (err)
+-                      goto out;
++      if (err)
++              goto out;
++
++      /* Only an end host needs to send an ICMP
++       * "Fragment Reassembly Timeout" message, per RFC792.
++       */
++      if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
++          (skb_rtable(head)->rt_type != RTN_LOCAL))
++              goto out;
++
++      clone = skb_clone(head, GFP_ATOMIC);
+-              /* Only an end host needs to send an ICMP
+-               * "Fragment Reassembly Timeout" message, per RFC792.
+-               */
+-              if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+-                  (skb_rtable(head)->rt_type != RTN_LOCAL))
+-                      goto out;
+-
+-              clone = skb_clone(head, GFP_ATOMIC);
+-
+-              /* Send an ICMP "Fragment Reassembly Timeout" message. */
+-              if (clone) {
+-                      spin_unlock(&qp->q.lock);
+-                      icmp_send(clone, ICMP_TIME_EXCEEDED,
+-                                ICMP_EXC_FRAGTIME, 0);
+-                      consume_skb(clone);
+-                      goto out_rcu_unlock;
+-              }
++      /* Send an ICMP "Fragment Reassembly Timeout" message. */
++      if (clone) {
++              spin_unlock(&qp->q.lock);
++              icmp_send(clone, ICMP_TIME_EXCEEDED,
++                        ICMP_EXC_FRAGTIME, 0);
++              consume_skb(clone);
++              goto out_rcu_unlock;
+       }
+ out:
+       spin_unlock(&qp->q.lock);
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -106,10 +106,6 @@ void ip6_expire_frag_queue(struct net *n
+               goto out_rcu_unlock;
+       IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+-
+-      if (inet_frag_evicting(&fq->q))
+-              goto out_rcu_unlock;
+-
+       IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+       /* Don't send error if the first segment did not arrive. */
diff --git a/queue-4.4/inet-frags-refactor-ipfrag_init.patch b/queue-4.4/inet-frags-refactor-ipfrag_init.patch
new file mode 100644 (file)
index 0000000..e4768af
--- /dev/null
@@ -0,0 +1,38 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:51 -0700
+Subject: inet: frags: refactor ipfrag_init()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 483a6e4fa055123142d8956866fe2aa9c98d546d upstream.
+
+We need to call inet_frags_init() before register_pernet_subsys(),
+as a prereq for the following patch ("inet: frags: use rhashtables for reassembly units")
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_fragment.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -902,8 +902,6 @@ static struct pernet_operations ip4_frag
+ void __init ipfrag_init(void)
+ {
+-      ip4_frags_ctl_register();
+-      register_pernet_subsys(&ip4_frags_ops);
+       ip4_frags.hashfn = ip4_hashfn;
+       ip4_frags.constructor = ip4_frag_init;
+       ip4_frags.destructor = ip4_frag_free;
+@@ -914,4 +912,6 @@ void __init ipfrag_init(void)
+       ip4_frags.frags_cache_name = ip_frag_cache_name;
+       if (inet_frags_init(&ip4_frags))
+               panic("IP: failed to allocate ip4_frags cache\n");
++      ip4_frags_ctl_register();
++      register_pernet_subsys(&ip4_frags_ops);
+ }
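The reorder in this patch matters because register_pernet_subsys() triggers the per-netns init path, and once the rhashtable conversion lands, inet_frags_init_net() dereferences the shared ip4_frags state (its rhash_params), so inet_frags_init() must have run first. A minimal userspace sketch of the same ordering hazard, using hypothetical names rather than the kernel API:

    /* Sketch, not kernel code: per-namespace init consults shared state,
     * so the shared state must be initialised before namespaces register. */
    #include <assert.h>

    struct shared_frags { int initialised; };  /* stands in for ip4_frags */
    static struct shared_frags shared;

    static int per_ns_init(void)  /* what register_pernet_subsys() triggers */
    {
            assert(shared.initialised); /* the pre-fix ordering trips this */
            return 0;
    }

    int main(void)
    {
            shared.initialised = 1;     /* inet_frags_init() first ...     */
            return per_ns_init();       /* ... then pernet registration    */
    }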
diff --git a/queue-4.4/inet-frags-refactor-ipv6_frag_init.patch b/queue-4.4/inet-frags-refactor-ipv6_frag_init.patch
new file mode 100644 (file)
index 0000000..d8e9d21
--- /dev/null
@@ -0,0 +1,75 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:52 -0700
+Subject: inet: frags: refactor ipv6_frag_init()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 5b975bab23615cd0fdf67af6c9298eb01c4b9f61 upstream.
+
+We want to call inet_frags_init() earlier.
+
+This is a prereq to "inet: frags: use rhashtables for reassembly units"
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.4: Also delete a redundant assignment to
+ ip6_frags.skb_free]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/reassembly.c |   26 ++++++++++++++------------
+ 1 file changed, 14 insertions(+), 12 deletions(-)
+
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -739,10 +739,21 @@ int __init ipv6_frag_init(void)
+ {
+       int ret;
+-      ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
++      ip6_frags.hashfn = ip6_hashfn;
++      ip6_frags.constructor = ip6_frag_init;
++      ip6_frags.destructor = NULL;
++      ip6_frags.qsize = sizeof(struct frag_queue);
++      ip6_frags.match = ip6_frag_match;
++      ip6_frags.frag_expire = ip6_frag_expire;
++      ip6_frags.frags_cache_name = ip6_frag_cache_name;
++      ret = inet_frags_init(&ip6_frags);
+       if (ret)
+               goto out;
++      ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
++      if (ret)
++              goto err_protocol;
++
+       ret = ip6_frags_sysctl_register();
+       if (ret)
+               goto err_sysctl;
+@@ -751,17 +762,6 @@ int __init ipv6_frag_init(void)
+       if (ret)
+               goto err_pernet;
+-      ip6_frags.hashfn = ip6_hashfn;
+-      ip6_frags.constructor = ip6_frag_init;
+-      ip6_frags.destructor = NULL;
+-      ip6_frags.skb_free = NULL;
+-      ip6_frags.qsize = sizeof(struct frag_queue);
+-      ip6_frags.match = ip6_frag_match;
+-      ip6_frags.frag_expire = ip6_frag_expire;
+-      ip6_frags.frags_cache_name = ip6_frag_cache_name;
+-      ret = inet_frags_init(&ip6_frags);
+-      if (ret)
+-              goto err_pernet;
+ out:
+       return ret;
+@@ -769,6 +769,8 @@ err_pernet:
+       ip6_frags_sysctl_unregister();
+ err_sysctl:
+       inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
++err_protocol:
++      inet_frags_fini(&ip6_frags);
+       goto out;
+ }
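The new err_protocol label in this patch keeps teardown a mirror image of setup: each failing step jumps to a label that undoes exactly the steps already completed, in reverse order. A compact userspace sketch of this goto-unwind idiom, with hypothetical step names rather than the kernel functions:

    /* Sketch of the goto-unwind idiom: undo completed steps in reverse. */
    #include <stdio.h>

    static int  step_a(void) { return 0; }  /* like inet_frags_init()           */
    static int  step_b(void) { return 0; }  /* like inet6_add_protocol()        */
    static int  step_c(void) { return -1; } /* like ip6_frags_sysctl_register() */
    static void undo_b(void) { puts("undo b"); }
    static void undo_a(void) { puts("undo a"); }

    static int init(void)
    {
            int ret;

            ret = step_a();
            if (ret)
                    goto out;
            ret = step_b();
            if (ret)
                    goto err_a;      /* only step_a to undo */
            ret = step_c();
            if (ret)
                    goto err_b;      /* undo b, then fall through to undo a */
    out:
            return ret;
    err_b:
            undo_b();
    err_a:
            undo_a();
            goto out;
    }

    int main(void) { return init() ? 1 : 0; }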
diff --git a/queue-4.4/inet-frags-refactor-lowpan_net_frag_init.patch b/queue-4.4/inet-frags-refactor-lowpan_net_frag_init.patch
new file mode 100644 (file)
index 0000000..a8cf8b4
--- /dev/null
@@ -0,0 +1,62 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:53 -0700
+Subject: inet: frags: refactor lowpan_net_frag_init()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 807f1844df4ac23594268fa9f41902d0549e92aa upstream.
+
+We want to call lowpan_net_frag_init() earlier.
+This is similar to commit "inet: frags: refactor ipv6_frag_init()".
+
+This is a prereq to "inet: frags: use rhashtables for reassembly units"
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ieee802154/6lowpan/reassembly.c |   20 +++++++++++---------
+ 1 file changed, 11 insertions(+), 9 deletions(-)
+
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -614,14 +614,6 @@ int __init lowpan_net_frag_init(void)
+ {
+       int ret;
+-      ret = lowpan_frags_sysctl_register();
+-      if (ret)
+-              return ret;
+-
+-      ret = register_pernet_subsys(&lowpan_frags_ops);
+-      if (ret)
+-              goto err_pernet;
+-
+       lowpan_frags.hashfn = lowpan_hashfn;
+       lowpan_frags.constructor = lowpan_frag_init;
+       lowpan_frags.destructor = NULL;
+@@ -632,11 +624,21 @@ int __init lowpan_net_frag_init(void)
+       lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
+       ret = inet_frags_init(&lowpan_frags);
+       if (ret)
+-              goto err_pernet;
++              goto out;
++
++      ret = lowpan_frags_sysctl_register();
++      if (ret)
++              goto err_sysctl;
++      ret = register_pernet_subsys(&lowpan_frags_ops);
++      if (ret)
++              goto err_pernet;
++out:
+       return ret;
+ err_pernet:
+       lowpan_frags_sysctl_unregister();
++err_sysctl:
++      inet_frags_fini(&lowpan_frags);
+       return ret;
+ }
diff --git a/queue-4.4/inet-frags-remove-inet_frag_maybe_warn_overflow.patch b/queue-4.4/inet-frags-remove-inet_frag_maybe_warn_overflow.patch
new file mode 100644 (file)
index 0000000..6459688
--- /dev/null
@@ -0,0 +1,112 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:59 -0700
+Subject: inet: frags: remove inet_frag_maybe_warn_overflow()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 2d44ed22e607f9a285b049de2263e3840673a260 upstream.
+
+This function is obsolete after the rhashtable addition to inet defrag.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h                 |    2 --
+ net/ieee802154/6lowpan/reassembly.c     |    5 ++---
+ net/ipv4/inet_fragment.c                |   11 -----------
+ net/ipv4/ip_fragment.c                  |    5 ++---
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    5 ++---
+ net/ipv6/reassembly.c                   |    5 ++---
+ 6 files changed, 8 insertions(+), 25 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -109,8 +109,6 @@ void inet_frags_exit_net(struct netns_fr
+ void inet_frag_kill(struct inet_frag_queue *q);
+ void inet_frag_destroy(struct inet_frag_queue *q);
+ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
+-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+-                                 const char *prefix);
+ static inline void inet_frag_put(struct inet_frag_queue *q)
+ {
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -83,10 +83,9 @@ fq_find(struct net *net, const struct lo
+       key.dst = *dst;
+       q = inet_frag_find(&ieee802154_lowpan->frags, &key);
+-      if (IS_ERR_OR_NULL(q)) {
+-              inet_frag_maybe_warn_overflow(q, pr_fmt());
++      if (!q)
+               return NULL;
+-      }
++
+       return container_of(q, struct lowpan_frag_queue, q);
+ }
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -225,14 +225,3 @@ struct inet_frag_queue *inet_frag_find(s
+       return inet_frag_create(nf, key);
+ }
+ EXPORT_SYMBOL(inet_frag_find);
+-
+-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+-                                 const char *prefix)
+-{
+-      static const char msg[] = "inet_frag_find: Fragment hash bucket"
+-              " list length grew over limit. Dropping fragment.\n";
+-
+-      if (PTR_ERR(q) == -ENOBUFS)
+-              net_dbg_ratelimited("%s%s", prefix, msg);
+-}
+-EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -221,10 +221,9 @@ static struct ipq *ip_find(struct net *n
+       struct inet_frag_queue *q;
+       q = inet_frag_find(&net->ipv4.frags, &key);
+-      if (IS_ERR_OR_NULL(q)) {
+-              inet_frag_maybe_warn_overflow(q, pr_fmt());
++      if (!q)
+               return NULL;
+-      }
++
+       return container_of(q, struct ipq, q);
+ }
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -184,10 +184,9 @@ static struct frag_queue *fq_find(struct
+       struct inet_frag_queue *q;
+       q = inet_frag_find(&net->nf_frag.frags, &key);
+-      if (IS_ERR_OR_NULL(q)) {
+-              inet_frag_maybe_warn_overflow(q, pr_fmt());
++      if (!q)
+               return NULL;
+-      }
++
+       return container_of(q, struct frag_queue, q);
+ }
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -154,10 +154,9 @@ fq_find(struct net *net, __be32 id, cons
+               key.iif = 0;
+       q = inet_frag_find(&net->ipv6.frags, &key);
+-      if (IS_ERR_OR_NULL(q)) {
+-              inet_frag_maybe_warn_overflow(q, pr_fmt());
++      if (!q)
+               return NULL;
+-      }
++
+       return container_of(q, struct frag_queue, q);
+ }
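With the rhashtable lookup in place, inet_frag_find() returns either a queue or NULL and never an ERR_PTR-encoded error, which is why every caller above swaps IS_ERR_OR_NULL() for a plain NULL test. A toy userspace sketch of the two calling conventions; the ERR_PTR encoding is re-implemented here purely for illustration:

    /* Sketch: the old API could encode errors in the pointer (ERR_PTR
     * style); the new API returns a valid object or NULL, so !q suffices. */
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    static void *ERR_PTR(long err) { return (void *)(intptr_t)err; }
    static int IS_ERR_OR_NULL(const void *p)
    {
            return !p || (uintptr_t)p >= (uintptr_t)-4095;
    }

    static void *old_find(void) { return ERR_PTR(-ENOBUFS); } /* old style */
    static void *new_find(void) { return NULL; }              /* new style */

    int main(void)
    {
            if (IS_ERR_OR_NULL(old_find()))  /* old caller pattern */
                    puts("old: no queue");
            if (!new_find())                 /* new caller pattern */
                    puts("new: no queue");
            return 0;
    }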
diff --git a/queue-4.4/inet-frags-remove-some-helpers.patch b/queue-4.4/inet-frags-remove-some-helpers.patch
new file mode 100644 (file)
index 0000000..5ebad50
--- /dev/null
@@ -0,0 +1,126 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:57 -0700
+Subject: inet: frags: remove some helpers
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 6befe4a78b1553edb6eed3a78b4bcd9748526672 upstream.
+
+Remove sum_frag_mem_limit(), ip_frag_mem() & ip6_frag_mem()
+
+Also, since we use rhashtables, we can bring back the number of fragments
+in "grep FRAG /proc/net/sockstat /proc/net/sockstat6" that was
+removed in commit 434d305405ab ("inet: frag: don't account number
+of fragment queues")
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h |    5 -----
+ include/net/ip.h        |    1 -
+ include/net/ipv6.h      |    7 -------
+ net/ipv4/ip_fragment.c  |    5 -----
+ net/ipv4/proc.c         |    6 +++---
+ net/ipv6/proc.c         |    5 +++--
+ 6 files changed, 6 insertions(+), 23 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -140,11 +140,6 @@ static inline void add_frag_mem_limit(st
+       atomic_add(i, &nf->mem);
+ }
+-static inline int sum_frag_mem_limit(struct netns_frags *nf)
+-{
+-      return atomic_read(&nf->mem);
+-}
+-
+ /* RFC 3168 support :
+  * We want to check ECN values of all fragments, do detect invalid combinations.
+  * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
+--- a/include/net/ip.h
++++ b/include/net/ip.h
+@@ -524,7 +524,6 @@ static inline struct sk_buff *ip_check_d
+       return skb;
+ }
+ #endif
+-int ip_frag_mem(struct net *net);
+ /*
+  *    Functions provided by ip_forward.c
+--- a/include/net/ipv6.h
++++ b/include/net/ipv6.h
+@@ -320,13 +320,6 @@ static inline bool ipv6_accept_ra(struct
+           idev->cnf.accept_ra;
+ }
+-#if IS_ENABLED(CONFIG_IPV6)
+-static inline int ip6_frag_mem(struct net *net)
+-{
+-      return sum_frag_mem_limit(&net->ipv6.frags);
+-}
+-#endif
+-
+ #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */
+ #define IPV6_FRAG_LOW_THRESH  (3 * 1024*1024) /* 3145728 */
+ #define IPV6_FRAG_TIMEOUT     (60 * HZ)       /* 60 seconds */
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -84,11 +84,6 @@ static u8 ip4_frag_ecn(u8 tos)
+ static struct inet_frags ip4_frags;
+-int ip_frag_mem(struct net *net)
+-{
+-      return sum_frag_mem_limit(&net->ipv4.frags);
+-}
+-
+ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+                        struct net_device *dev);
+--- a/net/ipv4/proc.c
++++ b/net/ipv4/proc.c
+@@ -52,7 +52,6 @@
+ static int sockstat_seq_show(struct seq_file *seq, void *v)
+ {
+       struct net *net = seq->private;
+-      unsigned int frag_mem;
+       int orphans, sockets;
+       local_bh_disable();
+@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_
+                  sock_prot_inuse_get(net, &udplite_prot));
+       seq_printf(seq, "RAW: inuse %d\n",
+                  sock_prot_inuse_get(net, &raw_prot));
+-      frag_mem = ip_frag_mem(net);
+-      seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
++      seq_printf(seq,  "FRAG: inuse %u memory %u\n",
++                 atomic_read(&net->ipv4.frags.rhashtable.nelems),
++                 frag_mem_limit(&net->ipv4.frags));
+       return 0;
+ }
+--- a/net/ipv6/proc.c
++++ b/net/ipv6/proc.c
+@@ -33,7 +33,6 @@
+ static int sockstat6_seq_show(struct seq_file *seq, void *v)
+ {
+       struct net *net = seq->private;
+-      unsigned int frag_mem = ip6_frag_mem(net);
+       seq_printf(seq, "TCP6: inuse %d\n",
+                      sock_prot_inuse_get(net, &tcpv6_prot));
+@@ -43,7 +42,9 @@ static int sockstat6_seq_show(struct seq
+                       sock_prot_inuse_get(net, &udplitev6_prot));
+       seq_printf(seq, "RAW6: inuse %d\n",
+                      sock_prot_inuse_get(net, &rawv6_prot));
+-      seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
++      seq_printf(seq, "FRAG6: inuse %u memory %u\n",
++                 atomic_read(&net->ipv6.frags.rhashtable.nelems),
++                 frag_mem_limit(&net->ipv6.frags));
+       return 0;
+ }
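Before this patch, "FRAG: inuse" could only report !!frag_mem, a 0/1 flag; reading the rhashtable element count makes it a real count of live reassembly queues again. A toy sketch of the output change, with made-up sample values:

    /* Sketch of the sockstat output change (sample values are made up). */
    #include <stdio.h>

    int main(void)
    {
            unsigned int frag_mem = 4096;  /* bytes held by fragments */
            unsigned int nelems   = 3;     /* live reassembly queues  */

            /* old: inuse collapsed to a 0/1 flag */
            printf("FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
            /* new: inuse is the rhashtable element count */
            printf("FRAG: inuse %u memory %u\n", nelems, frag_mem);
            return 0;
    }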
diff --git a/queue-4.4/inet-frags-reorganize-struct-netns_frags.patch b/queue-4.4/inet-frags-reorganize-struct-netns_frags.patch
new file mode 100644 (file)
index 0000000..38b4633
--- /dev/null
@@ -0,0 +1,45 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:30:04 -0700
+Subject: inet: frags: reorganize struct netns_frags
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit c2615cf5a761b32bf74e85bddc223dfff3d9b9f0 upstream.
+
+Put the read-mostly fields in a separate cache line
+at the beginning of struct netns_frags, to reduce
+false sharing noticed in inet_frag_kill()
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.4: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h |    9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -4,15 +4,16 @@
+ #include <linux/rhashtable.h>
+ struct netns_frags {
+-      struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
+-
+-      /* Keep atomic mem on separate cachelines in structs that include it */
+-      atomic_long_t           mem ____cacheline_aligned_in_smp;
+       /* sysctls */
+       long                    high_thresh;
+       long                    low_thresh;
+       int                     timeout;
+       struct inet_frags       *f;
++
++      struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
++
++      /* Keep atomic mem on separate cachelines in structs that include it */
++      atomic_long_t           mem ____cacheline_aligned_in_smp;
+ };
+ /**
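After the reorder, the sysctl fields that every lookup reads sit together at the front of the struct, while the frequently written rhashtable and mem counter each start on their own cache line, so writers to those hot fields no longer invalidate the line readers poll. A userspace sketch of the layout trick, assuming 64-byte cache lines; C11 alignas stands in for ____cacheline_aligned_in_smp:

    /* Sketch: isolate frequently-written fields on their own cache lines
     * so stores to them do not evict the read-mostly line. Assumes 64-byte
     * lines; the real kernel macro queries the architecture. */
    #include <stdalign.h>
    #include <stddef.h>
    #include <stdio.h>

    #define CACHELINE 64

    struct toy_netns_frags {
            /* read-mostly sysctls, consulted on every lookup */
            long high_thresh;
            long low_thresh;
            int  timeout;

            /* frequently written; each starts a fresh cache line */
            alignas(CACHELINE) long table_state; /* stands in for rhashtable */
            alignas(CACHELINE) long mem;         /* stands in for atomic mem */
    };

    int main(void)
    {
            printf("offset of mem: %zu\n",
                   offsetof(struct toy_netns_frags, mem));
            return 0;
    }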
diff --git a/queue-4.4/inet-frags-use-rhashtables-for-reassembly-units.patch b/queue-4.4/inet-frags-use-rhashtables-for-reassembly-units.patch
new file mode 100644 (file)
index 0000000..72cf29d
--- /dev/null
@@ -0,0 +1,1332 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:56 -0700
+Subject: inet: frags: use rhashtables for reassembly units
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 648700f76b03b7e8149d13cc2bdb3355035258a9 upstream.
+
+Some applications still rely on IP fragmentation, and to be fair the linux
+reassembly unit does not work under any serious load.
+
+It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)
+
+A work queue is supposed to garbage collect items when the host is under
+memory pressure, and to do a hash rebuild, changing the seed used in hash
+computations.
+
+This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
+occurring every 5 seconds if the host is under fire.
+
+Then there is the problem of sharing this hash table among all netns.
+
+It is time to switch to rhashtables, and to allocate one of them per netns
+to speed up netns dismantle, since this is a critical metric these days.
+
+Lookups now use RCU. A followup patch will even remove
+the refcount hold/release left from the prior implementation and save
+a couple of atomic operations.
+
+Before this patch, 16 cpus (a 16-RX-queue NIC) could not handle more
+than 1 Mpps of frags DDOS.
+
+After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB
+of storage for the fragments (the exact number depends on frags being
+evicted after timeout).
+
+$ grep FRAG /proc/net/sockstat
+FRAG: inuse 1966916 memory 2140004608
+
+A followup patch will change the limits for 64bit arches.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: Florian Westphal <fw@strlen.de>
+Cc: Jesper Dangaard Brouer <brouer@redhat.com>
+Cc: Alexander Aring <alex.aring@gmail.com>
+Cc: Stefan Schmidt <stefan@osg.samsung.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.4: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/networking/ip-sysctl.txt  |    7 
+ include/net/inet_frag.h                 |   81 +++----
+ include/net/ipv6.h                      |   16 -
+ net/ieee802154/6lowpan/6lowpan_i.h      |   26 --
+ net/ieee802154/6lowpan/reassembly.c     |   91 +++-----
+ net/ipv4/inet_fragment.c                |  349 ++++++--------------------------
+ net/ipv4/ip_fragment.c                  |  112 ++++------
+ net/ipv6/netfilter/nf_conntrack_reasm.c |   51 +---
+ net/ipv6/reassembly.c                   |  110 ++++------
+ 9 files changed, 267 insertions(+), 576 deletions(-)
+
+--- a/Documentation/networking/ip-sysctl.txt
++++ b/Documentation/networking/ip-sysctl.txt
+@@ -113,13 +113,10 @@ min_adv_mss - INTEGER
+ IP Fragmentation:
+ ipfrag_high_thresh - INTEGER
+-      Maximum memory used to reassemble IP fragments. When
+-      ipfrag_high_thresh bytes of memory is allocated for this purpose,
+-      the fragment handler will toss packets until ipfrag_low_thresh
+-      is reached. This also serves as a maximum limit to namespaces
+-      different from the initial one.
++      Maximum memory used to reassemble IP fragments.
+ ipfrag_low_thresh - INTEGER
++      (Obsolete since linux-4.17)
+       Maximum memory used to reassemble IP fragments before the kernel
+       begins to remove incomplete fragment queues to free up resources.
+       The kernel still accepts new fragments for defragmentation.
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -1,7 +1,11 @@
+ #ifndef __NET_FRAG_H__
+ #define __NET_FRAG_H__
++#include <linux/rhashtable.h>
++
+ struct netns_frags {
++      struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
++
+       /* Keep atomic mem on separate cachelines in structs that include it */
+       atomic_t                mem ____cacheline_aligned_in_smp;
+       /* sysctls */
+@@ -24,12 +28,30 @@ enum {
+       INET_FRAG_COMPLETE      = BIT(2),
+ };
++struct frag_v4_compare_key {
++      __be32          saddr;
++      __be32          daddr;
++      u32             user;
++      u32             vif;
++      __be16          id;
++      u16             protocol;
++};
++
++struct frag_v6_compare_key {
++      struct in6_addr saddr;
++      struct in6_addr daddr;
++      u32             user;
++      __be32          id;
++      u32             iif;
++};
++
+ /**
+  * struct inet_frag_queue - fragment queue
+  *
+- * @lock: spinlock protecting the queue
++ * @node: rhash node
++ * @key: keys identifying this frag.
+  * @timer: queue expiration timer
+- * @list: hash bucket list
++ * @lock: spinlock protecting this frag
+  * @refcnt: reference count of the queue
+  * @fragments: received fragments head
+  * @fragments_tail: received fragments tail
+@@ -39,12 +61,16 @@ enum {
+  * @flags: fragment queue flags
+  * @max_size: maximum received fragment size
+  * @net: namespace that this frag belongs to
+- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
++ * @rcu: rcu head for freeing deferral
+  */
+ struct inet_frag_queue {
+-      spinlock_t              lock;
++      struct rhash_head       node;
++      union {
++              struct frag_v4_compare_key v4;
++              struct frag_v6_compare_key v6;
++      } key;
+       struct timer_list       timer;
+-      struct hlist_node       list;
++      spinlock_t              lock;
+       atomic_t                refcnt;
+       struct sk_buff          *fragments;
+       struct sk_buff          *fragments_tail;
+@@ -53,45 +79,13 @@ struct inet_frag_queue {
+       int                     meat;
+       __u8                    flags;
+       u16                     max_size;
+-      struct netns_frags      *net;
+-      struct hlist_node       list_evictor;
+-};
+-
+-#define INETFRAGS_HASHSZ      1024
+-
+-/* averaged:
+- * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
+- *           rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
+- *           struct frag_queue))
+- */
+-#define INETFRAGS_MAXDEPTH    128
+-
+-struct inet_frag_bucket {
+-      struct hlist_head       chain;
+-      spinlock_t              chain_lock;
++      struct netns_frags      *net;
++      struct rcu_head         rcu;
+ };
+ struct inet_frags {
+-      struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
+-
+-      struct work_struct      frags_work;
+-      unsigned int next_bucket;
+-      unsigned long last_rebuild_jiffies;
+-      bool rebuild;
+-
+-      /* The first call to hashfn is responsible to initialize
+-       * rnd. This is best done with net_get_random_once.
+-       *
+-       * rnd_seqlock is used to let hash insertion detect
+-       * when it needs to re-lookup the hash chain to use.
+-       */
+-      u32                     rnd;
+-      seqlock_t               rnd_seqlock;
+       int                     qsize;
+-      unsigned int            (*hashfn)(const struct inet_frag_queue *);
+-      bool                    (*match)(const struct inet_frag_queue *q,
+-                                       const void *arg);
+       void                    (*constructor)(struct inet_frag_queue *q,
+                                              const void *arg);
+       void                    (*destructor)(struct inet_frag_queue *);
+@@ -99,6 +93,7 @@ struct inet_frags {
+       void                    (*frag_expire)(unsigned long data);
+       struct kmem_cache       *frags_cachep;
+       const char              *frags_cache_name;
++      struct rhashtable_params rhash_params;
+ };
+ int inet_frags_init(struct inet_frags *);
+@@ -107,15 +102,13 @@ void inet_frags_fini(struct inet_frags *
+ static inline int inet_frags_init_net(struct netns_frags *nf)
+ {
+       atomic_set(&nf->mem, 0);
+-      return 0;
++      return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
+ }
+ void inet_frags_exit_net(struct netns_frags *nf);
+ void inet_frag_kill(struct inet_frag_queue *q);
+ void inet_frag_destroy(struct inet_frag_queue *q);
+-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
+-              struct inet_frags *f, void *key, unsigned int hash);
+-
++struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
+ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+                                  const char *prefix);
+@@ -127,7 +120,7 @@ static inline void inet_frag_put(struct
+ static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+ {
+-      return !hlist_unhashed(&q->list_evictor);
++      return false;
+ }
+ /* Memory Tracking Functions. */
+--- a/include/net/ipv6.h
++++ b/include/net/ipv6.h
+@@ -505,17 +505,8 @@ enum ip6_defrag_users {
+       __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
+ };
+-struct ip6_create_arg {
+-      __be32 id;
+-      u32 user;
+-      const struct in6_addr *src;
+-      const struct in6_addr *dst;
+-      int iif;
+-      u8 ecn;
+-};
+-
+ void ip6_frag_init(struct inet_frag_queue *q, const void *a);
+-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
++extern const struct rhashtable_params ip6_rhash_params;
+ /*
+  *    Equivalent of ipv4 struct ip
+@@ -523,11 +514,6 @@ bool ip6_frag_match(const struct inet_fr
+ struct frag_queue {
+       struct inet_frag_queue  q;
+-      __be32                  id;             /* fragment id          */
+-      u32                     user;
+-      struct in6_addr         saddr;
+-      struct in6_addr         daddr;
+-
+       int                     iif;
+       unsigned int            csum;
+       __u16                   nhoffset;
+--- a/net/ieee802154/6lowpan/6lowpan_i.h
++++ b/net/ieee802154/6lowpan/6lowpan_i.h
+@@ -16,37 +16,19 @@ typedef unsigned __bitwise__ lowpan_rx_r
+ #define LOWPAN_DISPATCH_FRAG1           0xc0
+ #define LOWPAN_DISPATCH_FRAGN           0xe0
+-struct lowpan_create_arg {
++struct frag_lowpan_compare_key {
+       u16 tag;
+       u16 d_size;
+-      const struct ieee802154_addr *src;
+-      const struct ieee802154_addr *dst;
++      const struct ieee802154_addr src;
++      const struct ieee802154_addr dst;
+ };
+-/* Equivalent of ipv4 struct ip
++/* Equivalent of ipv4 struct ipq
+  */
+ struct lowpan_frag_queue {
+       struct inet_frag_queue  q;
+-
+-      u16                     tag;
+-      u16                     d_size;
+-      struct ieee802154_addr  saddr;
+-      struct ieee802154_addr  daddr;
+ };
+-static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
+-{
+-      switch (a->mode) {
+-      case IEEE802154_ADDR_LONG:
+-              return (((__force u64)a->extended_addr) >> 32) ^
+-                      (((__force u64)a->extended_addr) & 0xffffffff);
+-      case IEEE802154_ADDR_SHORT:
+-              return (__force u32)(a->short_addr);
+-      default:
+-              return 0;
+-      }
+-}
+-
+ /* private device info */
+ struct lowpan_dev_info {
+       struct net_device       *wdev; /* wpan device ptr */
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
+ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
+                            struct sk_buff *prev, struct net_device *ldev);
+-static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
+-                                   const struct ieee802154_addr *saddr,
+-                                   const struct ieee802154_addr *daddr)
+-{
+-      net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
+-      return jhash_3words(ieee802154_addr_hash(saddr),
+-                          ieee802154_addr_hash(daddr),
+-                          (__force u32)(tag + (d_size << 16)),
+-                          lowpan_frags.rnd);
+-}
+-
+-static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
+-{
+-      const struct lowpan_frag_queue *fq;
+-
+-      fq = container_of(q, struct lowpan_frag_queue, q);
+-      return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
+-}
+-
+-static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
+-{
+-      const struct lowpan_frag_queue *fq;
+-      const struct lowpan_create_arg *arg = a;
+-
+-      fq = container_of(q, struct lowpan_frag_queue, q);
+-      return  fq->tag == arg->tag && fq->d_size == arg->d_size &&
+-              ieee802154_addr_equal(&fq->saddr, arg->src) &&
+-              ieee802154_addr_equal(&fq->daddr, arg->dst);
+-}
+-
+ static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
+ {
+-      const struct lowpan_create_arg *arg = a;
++      const struct frag_lowpan_compare_key *key = a;
+       struct lowpan_frag_queue *fq;
+       fq = container_of(q, struct lowpan_frag_queue, q);
+-      fq->tag = arg->tag;
+-      fq->d_size = arg->d_size;
+-      fq->saddr = *arg->src;
+-      fq->daddr = *arg->dst;
++      BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
++      memcpy(&q->key, key, sizeof(*key));
+ }
+ static void lowpan_frag_expire(unsigned long data)
+@@ -104,21 +72,17 @@ fq_find(struct net *net, const struct lo
+       const struct ieee802154_addr *src,
+       const struct ieee802154_addr *dst)
+ {
+-      struct inet_frag_queue *q;
+-      struct lowpan_create_arg arg;
+-      unsigned int hash;
+       struct netns_ieee802154_lowpan *ieee802154_lowpan =
+               net_ieee802154_lowpan(net);
++      struct frag_lowpan_compare_key key = {
++              .tag = cb->d_tag,
++              .d_size = cb->d_size,
++              .src = *src,
++              .dst = *dst,
++      };
++      struct inet_frag_queue *q;
+-      arg.tag = cb->d_tag;
+-      arg.d_size = cb->d_size;
+-      arg.src = src;
+-      arg.dst = dst;
+-
+-      hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
+-
+-      q = inet_frag_find(&ieee802154_lowpan->frags,
+-                         &lowpan_frags, &arg, hash);
++      q = inet_frag_find(&ieee802154_lowpan->frags, &key);
+       if (IS_ERR_OR_NULL(q)) {
+               inet_frag_maybe_warn_overflow(q, pr_fmt());
+               return NULL;
+@@ -610,18 +574,47 @@ static struct pernet_operations lowpan_f
+       .exit = lowpan_frags_exit_net,
+ };
++static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
++{
++      return jhash2(data,
++                    sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
++}
++
++static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
++{
++      const struct inet_frag_queue *fq = data;
++
++      return jhash2((const u32 *)&fq->key,
++                    sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
++}
++
++static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
++{
++      const struct frag_lowpan_compare_key *key = arg->key;
++      const struct inet_frag_queue *fq = ptr;
++
++      return !!memcmp(&fq->key, key, sizeof(*key));
++}
++
++static const struct rhashtable_params lowpan_rhash_params = {
++      .head_offset            = offsetof(struct inet_frag_queue, node),
++      .hashfn                 = lowpan_key_hashfn,
++      .obj_hashfn             = lowpan_obj_hashfn,
++      .obj_cmpfn              = lowpan_obj_cmpfn,
++      .automatic_shrinking    = true,
++};
++
+ int __init lowpan_net_frag_init(void)
+ {
+       int ret;
+-      lowpan_frags.hashfn = lowpan_hashfn;
+       lowpan_frags.constructor = lowpan_frag_init;
+       lowpan_frags.destructor = NULL;
+       lowpan_frags.skb_free = NULL;
+       lowpan_frags.qsize = sizeof(struct frag_queue);
+-      lowpan_frags.match = lowpan_frag_match;
+       lowpan_frags.frag_expire = lowpan_frag_expire;
+       lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
++      lowpan_frags.rhash_params = lowpan_rhash_params;
+       ret = inet_frags_init(&lowpan_frags);
+       if (ret)
+               goto out;
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -25,12 +25,6 @@
+ #include <net/inet_frag.h>
+ #include <net/inet_ecn.h>
+-#define INETFRAGS_EVICT_BUCKETS   128
+-#define INETFRAGS_EVICT_MAX     512
+-
+-/* don't rebuild inetfrag table with new secret more often than this */
+-#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
+-
+ /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+  * Value : 0xff if frame should be dropped.
+  *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+@@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
+ };
+ EXPORT_SYMBOL(ip_frag_ecn_table);
+-static unsigned int
+-inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
+-{
+-      return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
+-}
+-
+-static bool inet_frag_may_rebuild(struct inet_frags *f)
+-{
+-      return time_after(jiffies,
+-             f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
+-}
+-
+-static void inet_frag_secret_rebuild(struct inet_frags *f)
+-{
+-      int i;
+-
+-      write_seqlock_bh(&f->rnd_seqlock);
+-
+-      if (!inet_frag_may_rebuild(f))
+-              goto out;
+-
+-      get_random_bytes(&f->rnd, sizeof(u32));
+-
+-      for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+-              struct inet_frag_bucket *hb;
+-              struct inet_frag_queue *q;
+-              struct hlist_node *n;
+-
+-              hb = &f->hash[i];
+-              spin_lock(&hb->chain_lock);
+-
+-              hlist_for_each_entry_safe(q, n, &hb->chain, list) {
+-                      unsigned int hval = inet_frag_hashfn(f, q);
+-
+-                      if (hval != i) {
+-                              struct inet_frag_bucket *hb_dest;
+-
+-                              hlist_del(&q->list);
+-
+-                              /* Relink to new hash chain. */
+-                              hb_dest = &f->hash[hval];
+-
+-                              /* This is the only place where we take
+-                               * another chain_lock while already holding
+-                               * one.  As this will not run concurrently,
+-                               * we cannot deadlock on hb_dest lock below, if its
+-                               * already locked it will be released soon since
+-                               * other caller cannot be waiting for hb lock
+-                               * that we've taken above.
+-                               */
+-                              spin_lock_nested(&hb_dest->chain_lock,
+-                                               SINGLE_DEPTH_NESTING);
+-                              hlist_add_head(&q->list, &hb_dest->chain);
+-                              spin_unlock(&hb_dest->chain_lock);
+-                      }
+-              }
+-              spin_unlock(&hb->chain_lock);
+-      }
+-
+-      f->rebuild = false;
+-      f->last_rebuild_jiffies = jiffies;
+-out:
+-      write_sequnlock_bh(&f->rnd_seqlock);
+-}
+-
+-static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
+-{
+-      if (!hlist_unhashed(&q->list_evictor))
+-              return false;
+-
+-      return q->net->low_thresh == 0 ||
+-             frag_mem_limit(q->net) >= q->net->low_thresh;
+-}
+-
+-static unsigned int
+-inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
+-{
+-      struct inet_frag_queue *fq;
+-      struct hlist_node *n;
+-      unsigned int evicted = 0;
+-      HLIST_HEAD(expired);
+-
+-      spin_lock(&hb->chain_lock);
+-
+-      hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
+-              if (!inet_fragq_should_evict(fq))
+-                      continue;
+-
+-              if (!del_timer(&fq->timer))
+-                      continue;
+-
+-              hlist_add_head(&fq->list_evictor, &expired);
+-              ++evicted;
+-      }
+-
+-      spin_unlock(&hb->chain_lock);
+-
+-      hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
+-              f->frag_expire((unsigned long) fq);
+-
+-      return evicted;
+-}
+-
+-static void inet_frag_worker(struct work_struct *work)
+-{
+-      unsigned int budget = INETFRAGS_EVICT_BUCKETS;
+-      unsigned int i, evicted = 0;
+-      struct inet_frags *f;
+-
+-      f = container_of(work, struct inet_frags, frags_work);
+-
+-      BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
+-
+-      local_bh_disable();
+-
+-      for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+-              evicted += inet_evict_bucket(f, &f->hash[i]);
+-              i = (i + 1) & (INETFRAGS_HASHSZ - 1);
+-              if (evicted > INETFRAGS_EVICT_MAX)
+-                      break;
+-      }
+-
+-      f->next_bucket = i;
+-
+-      local_bh_enable();
+-
+-      if (f->rebuild && inet_frag_may_rebuild(f))
+-              inet_frag_secret_rebuild(f);
+-}
+-
+-static void inet_frag_schedule_worker(struct inet_frags *f)
+-{
+-      if (unlikely(!work_pending(&f->frags_work)))
+-              schedule_work(&f->frags_work);
+-}
+-
+ int inet_frags_init(struct inet_frags *f)
+ {
+-      int i;
+-
+-      INIT_WORK(&f->frags_work, inet_frag_worker);
+-
+-      for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+-              struct inet_frag_bucket *hb = &f->hash[i];
+-
+-              spin_lock_init(&hb->chain_lock);
+-              INIT_HLIST_HEAD(&hb->chain);
+-      }
+-
+-      seqlock_init(&f->rnd_seqlock);
+-      f->last_rebuild_jiffies = 0;
+       f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
+                                           NULL);
+       if (!f->frags_cachep)
+@@ -214,66 +59,42 @@ EXPORT_SYMBOL(inet_frags_init);
+ void inet_frags_fini(struct inet_frags *f)
+ {
+-      cancel_work_sync(&f->frags_work);
++      /* We must wait that all inet_frag_destroy_rcu() have completed. */
++      rcu_barrier();
++
+       kmem_cache_destroy(f->frags_cachep);
++      f->frags_cachep = NULL;
+ }
+ EXPORT_SYMBOL(inet_frags_fini);
+-void inet_frags_exit_net(struct netns_frags *nf)
++static void inet_frags_free_cb(void *ptr, void *arg)
+ {
+-      struct inet_frags *f =nf->f;
+-      unsigned int seq;
+-      int i;
++      struct inet_frag_queue *fq = ptr;
+-      nf->low_thresh = 0;
+-
+-evict_again:
+-      local_bh_disable();
+-      seq = read_seqbegin(&f->rnd_seqlock);
+-
+-      for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+-              inet_evict_bucket(f, &f->hash[i]);
+-
+-      local_bh_enable();
+-      cond_resched();
+-
+-      if (read_seqretry(&f->rnd_seqlock, seq) ||
+-          sum_frag_mem_limit(nf))
+-              goto evict_again;
+-}
+-EXPORT_SYMBOL(inet_frags_exit_net);
++      /* If we can not cancel the timer, it means this frag_queue
++       * is already disappearing, we have nothing to do.
++       * Otherwise, we own a refcount until the end of this function.
++       */
++      if (!del_timer(&fq->timer))
++              return;
+-static struct inet_frag_bucket *
+-get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
+-__acquires(hb->chain_lock)
+-{
+-      struct inet_frag_bucket *hb;
+-      unsigned int seq, hash;
+-
+- restart:
+-      seq = read_seqbegin(&f->rnd_seqlock);
+-
+-      hash = inet_frag_hashfn(f, fq);
+-      hb = &f->hash[hash];
+-
+-      spin_lock(&hb->chain_lock);
+-      if (read_seqretry(&f->rnd_seqlock, seq)) {
+-              spin_unlock(&hb->chain_lock);
+-              goto restart;
++      spin_lock_bh(&fq->lock);
++      if (!(fq->flags & INET_FRAG_COMPLETE)) {
++              fq->flags |= INET_FRAG_COMPLETE;
++              atomic_dec(&fq->refcnt);
+       }
++      spin_unlock_bh(&fq->lock);
+-      return hb;
++      inet_frag_put(fq);
+ }
+-static inline void fq_unlink(struct inet_frag_queue *fq)
++void inet_frags_exit_net(struct netns_frags *nf)
+ {
+-      struct inet_frag_bucket *hb;
++      nf->low_thresh = 0; /* prevent creation of new frags */
+-      hb = get_frag_bucket_locked(fq, fq->net->f);
+-      hlist_del(&fq->list);
+-      fq->flags |= INET_FRAG_COMPLETE;
+-      spin_unlock(&hb->chain_lock);
++      rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
+ }
++EXPORT_SYMBOL(inet_frags_exit_net);
+ void inet_frag_kill(struct inet_frag_queue *fq)
+ {
+@@ -281,7 +102,10 @@ void inet_frag_kill(struct inet_frag_que
+               atomic_dec(&fq->refcnt);
+       if (!(fq->flags & INET_FRAG_COMPLETE)) {
+-              fq_unlink(fq);
++              struct netns_frags *nf = fq->net;
++
++              fq->flags |= INET_FRAG_COMPLETE;
++              rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
+               atomic_dec(&fq->refcnt);
+       }
+ }
+@@ -295,6 +119,17 @@ static inline void frag_kfree_skb(struct
+       kfree_skb(skb);
+ }
++static void inet_frag_destroy_rcu(struct rcu_head *head)
++{
++      struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
++                                               rcu);
++      struct inet_frags *f = q->net->f;
++
++      if (f->destructor)
++              f->destructor(q);
++      kmem_cache_free(f->frags_cachep, q);
++}
++
+ void inet_frag_destroy(struct inet_frag_queue *q)
+ {
+       struct sk_buff *fp;
+@@ -318,55 +153,21 @@ void inet_frag_destroy(struct inet_frag_
+       }
+       sum = sum_truesize + f->qsize;
+-      if (f->destructor)
+-              f->destructor(q);
+-      kmem_cache_free(f->frags_cachep, q);
++      call_rcu(&q->rcu, inet_frag_destroy_rcu);
+       sub_frag_mem_limit(nf, sum);
+ }
+ EXPORT_SYMBOL(inet_frag_destroy);
+-static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
+-                                              struct inet_frag_queue *qp_in,
+-                                              struct inet_frags *f,
+-                                              void *arg)
+-{
+-      struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
+-      struct inet_frag_queue *qp;
+-
+-#ifdef CONFIG_SMP
+-      /* With SMP race we have to recheck hash table, because
+-       * such entry could have been created on other cpu before
+-       * we acquired hash bucket lock.
+-       */
+-      hlist_for_each_entry(qp, &hb->chain, list) {
+-              if (qp->net == nf && f->match(qp, arg)) {
+-                      atomic_inc(&qp->refcnt);
+-                      spin_unlock(&hb->chain_lock);
+-                      qp_in->flags |= INET_FRAG_COMPLETE;
+-                      inet_frag_put(qp_in);
+-                      return qp;
+-              }
+-      }
+-#endif
+-      qp = qp_in;
+-      if (!mod_timer(&qp->timer, jiffies + nf->timeout))
+-              atomic_inc(&qp->refcnt);
+-
+-      atomic_inc(&qp->refcnt);
+-      hlist_add_head(&qp->list, &hb->chain);
+-
+-      spin_unlock(&hb->chain_lock);
+-
+-      return qp;
+-}
+-
+ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+                                              struct inet_frags *f,
+                                              void *arg)
+ {
+       struct inet_frag_queue *q;
++      if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
++              return NULL;
++
+       q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
+       if (!q)
+               return NULL;
+@@ -377,64 +178,51 @@ static struct inet_frag_queue *inet_frag
+       setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
+       spin_lock_init(&q->lock);
+-      atomic_set(&q->refcnt, 1);
++      atomic_set(&q->refcnt, 3);
+       return q;
+ }
+ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+-                                              struct inet_frags *f,
+                                               void *arg)
+ {
++      struct inet_frags *f = nf->f;
+       struct inet_frag_queue *q;
++      int err;
+       q = inet_frag_alloc(nf, f, arg);
+       if (!q)
+               return NULL;
+-      return inet_frag_intern(nf, q, f, arg);
+-}
+-
+-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
+-                                     struct inet_frags *f, void *key,
+-                                     unsigned int hash)
+-{
+-      struct inet_frag_bucket *hb;
+-      struct inet_frag_queue *q;
+-      int depth = 0;
++      mod_timer(&q->timer, jiffies + nf->timeout);
+-      if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
+-              inet_frag_schedule_worker(f);
++      err = rhashtable_insert_fast(&nf->rhashtable, &q->node,
++                                   f->rhash_params);
++      if (err < 0) {
++              q->flags |= INET_FRAG_COMPLETE;
++              inet_frag_kill(q);
++              inet_frag_destroy(q);
+               return NULL;
+       }
++      return q;
++}
++EXPORT_SYMBOL(inet_frag_create);
+-      if (frag_mem_limit(nf) > nf->low_thresh)
+-              inet_frag_schedule_worker(f);
+-
+-      hash &= (INETFRAGS_HASHSZ - 1);
+-      hb = &f->hash[hash];
+-
+-      spin_lock(&hb->chain_lock);
+-      hlist_for_each_entry(q, &hb->chain, list) {
+-              if (q->net == nf && f->match(q, key)) {
+-                      atomic_inc(&q->refcnt);
+-                      spin_unlock(&hb->chain_lock);
+-                      return q;
+-              }
+-              depth++;
+-      }
+-      spin_unlock(&hb->chain_lock);
+-
+-      if (depth <= INETFRAGS_MAXDEPTH)
+-              return inet_frag_create(nf, f, key);
++/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
++struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
++{
++      struct inet_frag_queue *fq;
+-      if (inet_frag_may_rebuild(f)) {
+-              if (!f->rebuild)
+-                      f->rebuild = true;
+-              inet_frag_schedule_worker(f);
++      rcu_read_lock();
++      fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
++      if (fq) {
++              if (!atomic_inc_not_zero(&fq->refcnt))
++                      fq = NULL;
++              rcu_read_unlock();
++              return fq;
+       }
+-
+-      return ERR_PTR(-ENOBUFS);
++      rcu_read_unlock();
++      return inet_frag_create(nf, key);
+ }
+ EXPORT_SYMBOL(inet_frag_find);
+@@ -442,8 +230,7 @@ void inet_frag_maybe_warn_overflow(struc
+                                  const char *prefix)
+ {
+       static const char msg[] = "inet_frag_find: Fragment hash bucket"
+-              " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
+-              ". Dropping fragment.\n";
++              " list length grew over limit. Dropping fragment.\n";
+       if (PTR_ERR(q) == -ENOBUFS)
+               net_dbg_ratelimited("%s%s", prefix, msg);
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -70,15 +70,9 @@ struct ipfrag_skb_cb
+ struct ipq {
+       struct inet_frag_queue q;
+-      u32             user;
+-      __be32          saddr;
+-      __be32          daddr;
+-      __be16          id;
+-      u8              protocol;
+       u8              ecn; /* RFC3168 support */
+       u16             max_df_size; /* largest frag with DF set seen */
+       int             iif;
+-      int             vif;   /* L3 master device index */
+       unsigned int    rid;
+       struct inet_peer *peer;
+ };
+@@ -98,41 +92,6 @@ int ip_frag_mem(struct net *net)
+ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+                        struct net_device *dev);
+-struct ip4_create_arg {
+-      struct iphdr *iph;
+-      u32 user;
+-      int vif;
+-};
+-
+-static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
+-{
+-      net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
+-      return jhash_3words((__force u32)id << 16 | prot,
+-                          (__force u32)saddr, (__force u32)daddr,
+-                          ip4_frags.rnd);
+-}
+-
+-static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
+-{
+-      const struct ipq *ipq;
+-
+-      ipq = container_of(q, struct ipq, q);
+-      return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
+-}
+-
+-static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
+-{
+-      const struct ipq *qp;
+-      const struct ip4_create_arg *arg = a;
+-
+-      qp = container_of(q, struct ipq, q);
+-      return  qp->id == arg->iph->id &&
+-              qp->saddr == arg->iph->saddr &&
+-              qp->daddr == arg->iph->daddr &&
+-              qp->protocol == arg->iph->protocol &&
+-              qp->user == arg->user &&
+-              qp->vif == arg->vif;
+-}
+ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
+ {
+@@ -141,17 +100,12 @@ static void ip4_frag_init(struct inet_fr
+                                              frags);
+       struct net *net = container_of(ipv4, struct net, ipv4);
+-      const struct ip4_create_arg *arg = a;
++      const struct frag_v4_compare_key *key = a;
+-      qp->protocol = arg->iph->protocol;
+-      qp->id = arg->iph->id;
+-      qp->ecn = ip4_frag_ecn(arg->iph->tos);
+-      qp->saddr = arg->iph->saddr;
+-      qp->daddr = arg->iph->daddr;
+-      qp->vif = arg->vif;
+-      qp->user = arg->user;
++      q->key.v4 = *key;
++      qp->ecn = 0;
+       qp->peer = sysctl_ipfrag_max_dist ?
+-              inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
++              inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
+               NULL;
+ }
+@@ -234,7 +188,7 @@ static void ip_expire(unsigned long arg)
+               /* Only an end host needs to send an ICMP
+                * "Fragment Reassembly Timeout" message, per RFC792.
+                */
+-              if (frag_expire_skip_icmp(qp->user) &&
++              if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+                   (skb_rtable(head)->rt_type != RTN_LOCAL))
+                       goto out;
+@@ -262,17 +216,17 @@ out_rcu_unlock:
+ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
+                          u32 user, int vif)
+ {
++      struct frag_v4_compare_key key = {
++              .saddr = iph->saddr,
++              .daddr = iph->daddr,
++              .user = user,
++              .vif = vif,
++              .id = iph->id,
++              .protocol = iph->protocol,
++      };
+       struct inet_frag_queue *q;
+-      struct ip4_create_arg arg;
+-      unsigned int hash;
+-
+-      arg.iph = iph;
+-      arg.user = user;
+-      arg.vif = vif;
+-      hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+-
+-      q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
++      q = inet_frag_find(&net->ipv4.frags, &key);
+       if (IS_ERR_OR_NULL(q)) {
+               inet_frag_maybe_warn_overflow(q, pr_fmt());
+               return NULL;
+@@ -656,7 +610,7 @@ out_nomem:
+       err = -ENOMEM;
+       goto out_fail;
+ out_oversize:
+-      net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
++      net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
+ out_fail:
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+       return err;
+@@ -900,16 +854,48 @@ static struct pernet_operations ip4_frag
+       .exit = ipv4_frags_exit_net,
+ };
++
++static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
++{
++      return jhash2(data,
++                    sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
++}
++
++static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
++{
++      const struct inet_frag_queue *fq = data;
++
++      return jhash2((const u32 *)&fq->key.v4,
++                    sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
++}
++
++static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
++{
++      const struct frag_v4_compare_key *key = arg->key;
++      const struct inet_frag_queue *fq = ptr;
++
++      return !!memcmp(&fq->key, key, sizeof(*key));
++}
++
++static const struct rhashtable_params ip4_rhash_params = {
++      .head_offset            = offsetof(struct inet_frag_queue, node),
++      .key_offset             = offsetof(struct inet_frag_queue, key),
++      .key_len                = sizeof(struct frag_v4_compare_key),
++      .hashfn                 = ip4_key_hashfn,
++      .obj_hashfn             = ip4_obj_hashfn,
++      .obj_cmpfn              = ip4_obj_cmpfn,
++      .automatic_shrinking    = true,
++};
++
+ void __init ipfrag_init(void)
+ {
+-      ip4_frags.hashfn = ip4_hashfn;
+       ip4_frags.constructor = ip4_frag_init;
+       ip4_frags.destructor = ip4_frag_free;
+       ip4_frags.skb_free = NULL;
+       ip4_frags.qsize = sizeof(struct ipq);
+-      ip4_frags.match = ip4_frag_match;
+       ip4_frags.frag_expire = ip_expire;
+       ip4_frags.frags_cache_name = ip_frag_cache_name;
++      ip4_frags.rhash_params = ip4_rhash_params;
+       if (inet_frags_init(&ip4_frags))
+               panic("IP: failed to allocate ip4_frags cache\n");
+       ip4_frags_ctl_register();
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -153,23 +153,6 @@ static inline u8 ip6_frag_ecn(const stru
+       return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
+ }
+-static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
+-                               const struct in6_addr *daddr)
+-{
+-      net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
+-      return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
+-                          (__force u32)id, nf_frags.rnd);
+-}
+-
+-
+-static unsigned int nf_hashfn(const struct inet_frag_queue *q)
+-{
+-      const struct frag_queue *nq;
+-
+-      nq = container_of(q, struct frag_queue, q);
+-      return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
+-}
+-
+ static void nf_skb_free(struct sk_buff *skb)
+ {
+       if (NFCT_FRAG6_CB(skb)->orig)
+@@ -188,26 +171,19 @@ static void nf_ct_frag6_expire(unsigned
+ }
+ /* Creation primitives. */
+-static inline struct frag_queue *fq_find(struct net *net, __be32 id,
+-                                       u32 user, struct in6_addr *src,
+-                                       struct in6_addr *dst, int iif, u8 ecn)
++static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
++                                const struct ipv6hdr *hdr, int iif)
+ {
++      struct frag_v6_compare_key key = {
++              .id = id,
++              .saddr = hdr->saddr,
++              .daddr = hdr->daddr,
++              .user = user,
++              .iif = iif,
++      };
+       struct inet_frag_queue *q;
+-      struct ip6_create_arg arg;
+-      unsigned int hash;
+-
+-      arg.id = id;
+-      arg.user = user;
+-      arg.src = src;
+-      arg.dst = dst;
+-      arg.iif = iif;
+-      arg.ecn = ecn;
+-
+-      local_bh_disable();
+-      hash = nf_hash_frag(id, src, dst);
+-      q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
+-      local_bh_enable();
++      q = inet_frag_find(&net->nf_frag.frags, &key);
+       if (IS_ERR_OR_NULL(q)) {
+               inet_frag_maybe_warn_overflow(q, pr_fmt());
+               return NULL;
+@@ -602,8 +578,8 @@ struct sk_buff *nf_ct_frag6_gather(struc
+       fhdr = (struct frag_hdr *)skb_transport_header(clone);
+       skb_orphan(skb);
+-      fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
+-                   skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
++      fq = fq_find(net, fhdr->identification, user, hdr,
++                   skb->dev ? skb->dev->ifindex : 0);
+       if (fq == NULL) {
+               pr_debug("Can't find and can't create new queue\n");
+               goto ret_orig;
+@@ -681,14 +657,13 @@ int nf_ct_frag6_init(void)
+ {
+       int ret = 0;
+-      nf_frags.hashfn = nf_hashfn;
+       nf_frags.constructor = ip6_frag_init;
+       nf_frags.destructor = NULL;
+       nf_frags.skb_free = nf_skb_free;
+       nf_frags.qsize = sizeof(struct frag_queue);
+-      nf_frags.match = ip6_frag_match;
+       nf_frags.frag_expire = nf_ct_frag6_expire;
+       nf_frags.frags_cache_name = nf_frags_cache_name;
++      nf_frags.rhash_params = ip6_rhash_params;
+       ret = inet_frags_init(&nf_frags);
+       if (ret)
+               goto out;
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -79,52 +79,13 @@ static struct inet_frags ip6_frags;
+ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
+                         struct net_device *dev);
+-/*
+- * callers should be careful not to use the hash value outside the ipfrag_lock
+- * as doing so could race with ipfrag_hash_rnd being recalculated.
+- */
+-static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
+-                                  const struct in6_addr *daddr)
+-{
+-      net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
+-      return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
+-                          (__force u32)id, ip6_frags.rnd);
+-}
+-
+-static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
+-{
+-      const struct frag_queue *fq;
+-
+-      fq = container_of(q, struct frag_queue, q);
+-      return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
+-}
+-
+-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
+-{
+-      const struct frag_queue *fq;
+-      const struct ip6_create_arg *arg = a;
+-
+-      fq = container_of(q, struct frag_queue, q);
+-      return  fq->id == arg->id &&
+-              fq->user == arg->user &&
+-              ipv6_addr_equal(&fq->saddr, arg->src) &&
+-              ipv6_addr_equal(&fq->daddr, arg->dst) &&
+-              (arg->iif == fq->iif ||
+-               !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
+-                                             IPV6_ADDR_LINKLOCAL)));
+-}
+-EXPORT_SYMBOL(ip6_frag_match);
+-
+ void ip6_frag_init(struct inet_frag_queue *q, const void *a)
+ {
+       struct frag_queue *fq = container_of(q, struct frag_queue, q);
+-      const struct ip6_create_arg *arg = a;
++      const struct frag_v6_compare_key *key = a;
+-      fq->id = arg->id;
+-      fq->user = arg->user;
+-      fq->saddr = *arg->src;
+-      fq->daddr = *arg->dst;
+-      fq->ecn = arg->ecn;
++      q->key.v6 = *key;
++      fq->ecn = 0;
+ }
+ EXPORT_SYMBOL(ip6_frag_init);
+@@ -181,23 +142,22 @@ static void ip6_frag_expire(unsigned lon
+ }
+ static struct frag_queue *
+-fq_find(struct net *net, __be32 id, const struct in6_addr *src,
+-      const struct in6_addr *dst, int iif, u8 ecn)
++fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
+ {
++      struct frag_v6_compare_key key = {
++              .id = id,
++              .saddr = hdr->saddr,
++              .daddr = hdr->daddr,
++              .user = IP6_DEFRAG_LOCAL_DELIVER,
++              .iif = iif,
++      };
+       struct inet_frag_queue *q;
+-      struct ip6_create_arg arg;
+-      unsigned int hash;
+-
+-      arg.id = id;
+-      arg.user = IP6_DEFRAG_LOCAL_DELIVER;
+-      arg.src = src;
+-      arg.dst = dst;
+-      arg.iif = iif;
+-      arg.ecn = ecn;
+-      hash = inet6_hash_frag(id, src, dst);
++      if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
++                                          IPV6_ADDR_LINKLOCAL)))
++              key.iif = 0;
+-      q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
++      q = inet_frag_find(&net->ipv6.frags, &key);
+       if (IS_ERR_OR_NULL(q)) {
+               inet_frag_maybe_warn_overflow(q, pr_fmt());
+               return NULL;
+@@ -523,6 +483,7 @@ static int ipv6_frag_rcv(struct sk_buff
+       struct frag_queue *fq;
+       const struct ipv6hdr *hdr = ipv6_hdr(skb);
+       struct net *net = dev_net(skb_dst(skb)->dev);
++      int iif;
+       if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
+               goto fail_hdr;
+@@ -551,13 +512,14 @@ static int ipv6_frag_rcv(struct sk_buff
+               return 1;
+       }
+-      fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
+-                   skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
++      iif = skb->dev ? skb->dev->ifindex : 0;
++      fq = fq_find(net, fhdr->identification, hdr, iif);
+       if (fq) {
+               int ret;
+               spin_lock(&fq->q.lock);
++              fq->iif = iif;
+               ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
+               spin_unlock(&fq->q.lock);
+@@ -735,17 +697,47 @@ static struct pernet_operations ip6_frag
+       .exit = ipv6_frags_exit_net,
+ };
++static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
++{
++      return jhash2(data,
++                    sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
++}
++
++static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
++{
++      const struct inet_frag_queue *fq = data;
++
++      return jhash2((const u32 *)&fq->key.v6,
++                    sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
++}
++
++static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
++{
++      const struct frag_v6_compare_key *key = arg->key;
++      const struct inet_frag_queue *fq = ptr;
++
++      return !!memcmp(&fq->key, key, sizeof(*key));
++}
++
++const struct rhashtable_params ip6_rhash_params = {
++      .head_offset            = offsetof(struct inet_frag_queue, node),
++      .hashfn                 = ip6_key_hashfn,
++      .obj_hashfn             = ip6_obj_hashfn,
++      .obj_cmpfn              = ip6_obj_cmpfn,
++      .automatic_shrinking    = true,
++};
++EXPORT_SYMBOL(ip6_rhash_params);
++
+ int __init ipv6_frag_init(void)
+ {
+       int ret;
+-      ip6_frags.hashfn = ip6_hashfn;
+       ip6_frags.constructor = ip6_frag_init;
+       ip6_frags.destructor = NULL;
+       ip6_frags.qsize = sizeof(struct frag_queue);
+-      ip6_frags.match = ip6_frag_match;
+       ip6_frags.frag_expire = ip6_frag_expire;
+       ip6_frags.frags_cache_name = ip6_frag_cache_name;
++      ip6_frags.rhash_params = ip6_rhash_params;
+       ret = inet_frags_init(&ip6_frags);
+       if (ret)
+               goto out;
diff --git a/queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch b/queue-4.4/ip-add-helpers-to-process-in-order-fragments-faster.patch
new file mode 100644 (file)
index 0000000..93f1a16
--- /dev/null
@@ -0,0 +1,164 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Peter Oskolkov <posk@google.com>
+Date: Wed, 10 Oct 2018 12:30:14 -0700
+Subject: ip: add helpers to process in-order fragments faster.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 353c9cb360874e737fb000545f783df756c06f9a upstream.
+
+This patch introduces several helper functions/macros that will be
+used in the follow-up patch. No runtime changes yet.
+
+The new logic (fully implemented in the second patch) is as follows:
+
+* Nodes in the rb-tree will now contain not single fragments, but lists
+  of consecutive fragments ("runs").
+
+* At each point in time, the current "active" run at the tail is
+  maintained/tracked. Fragments that arrive in-order, adjacent
+  to the previous tail fragment, are added to this tail run without
+  triggering the re-balancing of the rb-tree.
+
+* If a fragment arrives out of order with the offset _before_ the tail run,
+  it is inserted into the rb-tree as a single fragment.
+
+* If a fragment arrives after the current tail fragment (with a gap),
+  it starts a new "tail" run and is inserted into the rb-tree
+  at the end as the head of the new run.
+
+skb->cb is used to store additional information
+needed here (suggested by Eric Dumazet).
+
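+As an illustrative sketch (not part of this patch), the run
+bookkeeping implemented by the helpers below reduces to linking the
+new skb after the old tail and charging its length to the head of
+the run; run_append() is a hypothetical condensation of
+ip4_frag_append_to_last_run(), with next_frag and frag_run_len
+living in skb->cb:
+
+  /* head: first skb of the current tail run (tracks frag_run_len);
+   * tail: last skb of that run (its next_frag is NULL).
+   */
+  static void run_append(struct sk_buff *head, struct sk_buff *tail,
+                         struct sk_buff *skb)
+  {
+          FRAG_CB(skb)->next_frag = NULL;           /* skb is the new tail */
+          FRAG_CB(tail)->next_frag = skb;           /* link after old tail */
+          FRAG_CB(head)->frag_run_len += skb->len;  /* head sums run length */
+  }
+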
+Reported-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h |    6 +++
+ net/ipv4/ip_fragment.c  |   73 ++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 79 insertions(+)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -55,7 +55,9 @@ struct frag_v6_compare_key {
+  * @lock: spinlock protecting this frag
+  * @refcnt: reference count of the queue
+  * @fragments: received fragments head
++ * @rb_fragments: received fragments rb-tree root
+  * @fragments_tail: received fragments tail
++ * @last_run_head: the head of the last "run". see ip_fragment.c
+  * @stamp: timestamp of the last received fragment
+  * @len: total length of the original datagram
+  * @meat: length of received fragments so far
+@@ -76,6 +78,7 @@ struct inet_frag_queue {
+       struct sk_buff          *fragments;  /* Used in IPv6. */
+       struct rb_root          rb_fragments; /* Used in IPv4. */
+       struct sk_buff          *fragments_tail;
++      struct sk_buff          *last_run_head;
+       ktime_t                 stamp;
+       int                     len;
+       int                     meat;
+@@ -112,6 +115,9 @@ void inet_frag_kill(struct inet_frag_que
+ void inet_frag_destroy(struct inet_frag_queue *q);
+ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
++/* Free all skbs in the queue; return the sum of their truesizes. */
++unsigned int inet_frag_rbtree_purge(struct rb_root *root);
++
+ static inline void inet_frag_put(struct inet_frag_queue *q)
+ {
+       if (atomic_dec_and_test(&q->refcnt))
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -58,6 +58,57 @@
+ static int sysctl_ipfrag_max_dist __read_mostly = 64;
+ static const char ip_frag_cache_name[] = "ip4-frags";
++/* Use skb->cb to track consecutive/adjacent fragments coming at
++ * the end of the queue. Nodes in the rb-tree queue will
++ * contain "runs" of one or more adjacent fragments.
++ *
++ * Invariants:
++ * - next_frag is NULL at the tail of a "run";
++ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
++ */
++struct ipfrag_skb_cb {
++      struct inet_skb_parm    h;
++      struct sk_buff          *next_frag;
++      int                     frag_run_len;
++};
++
++#define FRAG_CB(skb)          ((struct ipfrag_skb_cb *)((skb)->cb))
++
++static void ip4_frag_init_run(struct sk_buff *skb)
++{
++      BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
++
++      FRAG_CB(skb)->next_frag = NULL;
++      FRAG_CB(skb)->frag_run_len = skb->len;
++}
++
++/* Append skb to the last "run". */
++static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
++                                      struct sk_buff *skb)
++{
++      RB_CLEAR_NODE(&skb->rbnode);
++      FRAG_CB(skb)->next_frag = NULL;
++
++      FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
++      FRAG_CB(q->fragments_tail)->next_frag = skb;
++      q->fragments_tail = skb;
++}
++
++/* Create a new "run" with the skb. */
++static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
++{
++      if (q->last_run_head)
++              rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
++                           &q->last_run_head->rbnode.rb_right);
++      else
++              rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
++      rb_insert_color(&skb->rbnode, &q->rb_fragments);
++
++      ip4_frag_init_run(skb);
++      q->fragments_tail = skb;
++      q->last_run_head = skb;
++}
++
+ /* Describe an entry in the "incomplete datagrams" queue. */
+ struct ipq {
+       struct inet_frag_queue q;
+@@ -658,6 +709,28 @@ struct sk_buff *ip_check_defrag(struct n
+ }
+ EXPORT_SYMBOL(ip_check_defrag);
++unsigned int inet_frag_rbtree_purge(struct rb_root *root)
++{
++      struct rb_node *p = rb_first(root);
++      unsigned int sum = 0;
++
++      while (p) {
++              struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
++
++              p = rb_next(p);
++              rb_erase(&skb->rbnode, root);
++              while (skb) {
++                      struct sk_buff *next = FRAG_CB(skb)->next_frag;
++
++                      sum += skb->truesize;
++                      kfree_skb(skb);
++                      skb = next;
++              }
++      }
++      return sum;
++}
++EXPORT_SYMBOL(inet_frag_rbtree_purge);
++
+ #ifdef CONFIG_SYSCTL
+ static int dist_min;
diff --git a/queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch b/queue-4.4/ip-discard-ipv4-datagrams-with-overlapping-segments.patch
new file mode 100644 (file)
index 0000000..5e5dbe3
--- /dev/null
@@ -0,0 +1,150 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Peter Oskolkov <posk@google.com>
+Date: Wed, 10 Oct 2018 12:30:07 -0700
+Subject: ip: discard IPv4 datagrams with overlapping segments.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 7969e5c40dfd04799d4341f1b7cd266b6e47f227 upstream.
+
+This behavior is required in IPv6, and there is little need
+to tolerate overlapping fragments in IPv4. This change
+simplifies the code and eliminates potential DDoS attack vectors.
+
+Tested: ran ip_defrag selftest (not yet available upstream).
+
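+In outline (a sketch only; the real check in ip_frag_queue() below
+works directly on the queue and jumps to discard_qp), with prev/next
+being the sorted neighbours of the incoming fragment and
+frag_overlaps() a hypothetical helper:
+
+  static bool frag_overlaps(int offset, int end,
+                            const struct sk_buff *prev,
+                            const struct sk_buff *next)
+  {
+          /* Tail of the previous fragment reaches past our start? */
+          if (prev && prev->ip_defrag_offset + prev->len > offset)
+                  return true;
+          /* Next fragment starts before our end? */
+          if (next && next->ip_defrag_offset < end)
+                  return true;
+          return false;   /* no overlap: safe to queue the fragment */
+  }
+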
+Suggested-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Acked-by: Stephen Hemminger <stephen@networkplumber.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+[bwh: Backported to 4.4:
+ - s/__IP_INC_STATS/IP_INC_STATS_BH/
+ - Deleted code is slightly different]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/uapi/linux/snmp.h |    1 
+ net/ipv4/ip_fragment.c    |   72 ++++++++++++----------------------------------
+ net/ipv4/proc.c           |    1 
+ 3 files changed, 22 insertions(+), 52 deletions(-)
+
+--- a/include/uapi/linux/snmp.h
++++ b/include/uapi/linux/snmp.h
+@@ -55,6 +55,7 @@ enum
+       IPSTATS_MIB_ECT1PKTS,                   /* InECT1Pkts */
+       IPSTATS_MIB_ECT0PKTS,                   /* InECT0Pkts */
+       IPSTATS_MIB_CEPKTS,                     /* InCEPkts */
++      IPSTATS_MIB_REASM_OVERLAPS,             /* ReasmOverlaps */
+       __IPSTATS_MIB_MAX
+ };
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -277,6 +277,7 @@ static int ip_frag_reinit(struct ipq *qp
+ /* Add new segment to existing queue. */
+ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+ {
++      struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct sk_buff *prev, *next;
+       struct net_device *dev;
+       unsigned int fragsize;
+@@ -357,60 +358,23 @@ static int ip_frag_queue(struct ipq *qp,
+       }
+ found:
+-      /* We found where to put this one.  Check for overlap with
+-       * preceding fragment, and, if needed, align things so that
+-       * any overlaps are eliminated.
++      /* RFC5722, Section 4, amended by Errata ID : 3089
++       *                          When reassembling an IPv6 datagram, if
++       *   one or more its constituent fragments is determined to be an
++       *   overlapping fragment, the entire datagram (and any constituent
++       *   fragments) MUST be silently discarded.
++       *
++       * We do the same here for IPv4.
+        */
+-      if (prev) {
+-              int i = (prev->ip_defrag_offset + prev->len) - offset;
+-              if (i > 0) {
+-                      offset += i;
+-                      err = -EINVAL;
+-                      if (end <= offset)
+-                              goto err;
+-                      err = -ENOMEM;
+-                      if (!pskb_pull(skb, i))
+-                              goto err;
+-                      if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+-                              skb->ip_summed = CHECKSUM_NONE;
+-              }
+-      }
+-
+-      err = -ENOMEM;
+-
+-      while (next && next->ip_defrag_offset < end) {
+-              int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
+-
+-              if (i < next->len) {
+-                      /* Eat head of the next overlapped fragment
+-                       * and leave the loop. The next ones cannot overlap.
+-                       */
+-                      if (!pskb_pull(next, i))
+-                              goto err;
+-                      next->ip_defrag_offset += i;
+-                      qp->q.meat -= i;
+-                      if (next->ip_summed != CHECKSUM_UNNECESSARY)
+-                              next->ip_summed = CHECKSUM_NONE;
+-                      break;
+-              } else {
+-                      struct sk_buff *free_it = next;
+-
+-                      /* Old fragment is completely overridden with
+-                       * new one drop it.
+-                       */
+-                      next = next->next;
+-
+-                      if (prev)
+-                              prev->next = next;
+-                      else
+-                              qp->q.fragments = next;
+-
+-                      qp->q.meat -= free_it->len;
+-                      sub_frag_mem_limit(qp->q.net, free_it->truesize);
+-                      kfree_skb(free_it);
+-              }
+-      }
++      /* Is there an overlap with the previous fragment? */
++      if (prev &&
++          (prev->ip_defrag_offset + prev->len) > offset)
++              goto discard_qp;
++
++      /* Is there an overlap with the next fragment? */
++      if (next && next->ip_defrag_offset < end)
++              goto discard_qp;
+       /* Note : skb->ip_defrag_offset and skb->dev share the same location */
+       dev = skb->dev;
+@@ -458,6 +422,10 @@ found:
+       skb_dst_drop(skb);
+       return -EINPROGRESS;
++discard_qp:
++      inet_frag_kill(&qp->q);
++      err = -EINVAL;
++      IP_INC_STATS_BH(net, IPSTATS_MIB_REASM_OVERLAPS);
+ err:
+       kfree_skb(skb);
+       return err;
+--- a/net/ipv4/proc.c
++++ b/net/ipv4/proc.c
+@@ -132,6 +132,7 @@ static const struct snmp_mib snmp4_ipext
+       SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+       SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+       SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
++      SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
+       SNMP_MIB_SENTINEL
+ };
diff --git a/queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch b/queue-4.4/ip-frags-fix-crash-in-ip_do_fragment.patch
new file mode 100644 (file)
index 0000000..d5f12cf
--- /dev/null
@@ -0,0 +1,110 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Taehee Yoo <ap420073@gmail.com>
+Date: Wed, 10 Oct 2018 12:30:16 -0700
+Subject: ip: frags: fix crash in ip_do_fragment()
+
+From: Taehee Yoo <ap420073@gmail.com>
+
+commit 5d407b071dc369c26a38398326ee2be53651cfe4 upstream.
+
+A kernel crash occurs when a defragmented packet is fragmented
+in ip_do_fragment().
+In the defragment routine, skb_orphan() is called and
+skb->ip_defrag_offset is set, but skb->sk and
+skb->ip_defrag_offset are the same union member, so
+frag->sk is not NULL.
+Hence the crash occurs in the skb->sk check in ip_do_fragment() when
+a defragmented packet is fragmented.
+
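+The union in question (abridged sketch of the sk_buff layout after
+the rb-tree backport; see include/linux/skbuff.h):
+
+  struct sk_buff {
+          ...
+          union {
+                  struct sock     *sk;               /* owning socket */
+                  int             ip_defrag_offset;  /* reused by defrag */
+          };
+          ...
+  };
+
+Writing ip_defrag_offset thus leaves a stale non-NULL value in sk,
+which the fragmentation path later interprets as a socket pointer;
+the fix below clears fp->sk while building the frag_list.
+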
+test commands:
+   %iptables -t nat -I POSTROUTING -j MASQUERADE
+   %hping3 192.168.4.2 -s 1000 -p 2000 -d 60000
+
+splat looks like:
+[  261.069429] kernel BUG at net/ipv4/ip_output.c:636!
+[  261.075753] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
+[  261.083854] CPU: 1 PID: 1349 Comm: hping3 Not tainted 4.19.0-rc2+ #3
+[  261.100977] RIP: 0010:ip_do_fragment+0x1613/0x2600
+[  261.106945] Code: e8 e2 38 e3 fe 4c 8b 44 24 18 48 8b 74 24 08 e9 92 f6 ff ff 80 3c 02 00 0f 85 da 07 00 00 48 8b b5 d0 00 00 00 e9 25 f6 ff ff <0f> 0b 0f 0b 44 8b 54 24 58 4c 8b 4c 24 18 4c 8b 5c 24 60 4c 8b 6c
+[  261.127015] RSP: 0018:ffff8801031cf2c0 EFLAGS: 00010202
+[  261.134156] RAX: 1ffff1002297537b RBX: ffffed0020639e6e RCX: 0000000000000004
+[  261.142156] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880114ba9bd8
+[  261.150157] RBP: ffff880114ba8a40 R08: ffffed0022975395 R09: ffffed0022975395
+[  261.158157] R10: 0000000000000001 R11: ffffed0022975394 R12: ffff880114ba9ca4
+[  261.166159] R13: 0000000000000010 R14: ffff880114ba9bc0 R15: dffffc0000000000
+[  261.174169] FS:  00007fbae2199700(0000) GS:ffff88011b400000(0000) knlGS:0000000000000000
+[  261.183012] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  261.189013] CR2: 00005579244fe000 CR3: 0000000119bf4000 CR4: 00000000001006e0
+[  261.198158] Call Trace:
+[  261.199018]  ? dst_output+0x180/0x180
+[  261.205011]  ? save_trace+0x300/0x300
+[  261.209018]  ? ip_copy_metadata+0xb00/0xb00
+[  261.213034]  ? sched_clock_local+0xd4/0x140
+[  261.218158]  ? kill_l4proto+0x120/0x120 [nf_conntrack]
+[  261.223014]  ? rt_cpu_seq_stop+0x10/0x10
+[  261.227014]  ? find_held_lock+0x39/0x1c0
+[  261.233008]  ip_finish_output+0x51d/0xb50
+[  261.237006]  ? ip_fragment.constprop.56+0x220/0x220
+[  261.243011]  ? nf_ct_l4proto_register_one+0x5b0/0x5b0 [nf_conntrack]
+[  261.250152]  ? rcu_is_watching+0x77/0x120
+[  261.255010]  ? nf_nat_ipv4_out+0x1e/0x2b0 [nf_nat_ipv4]
+[  261.261033]  ? nf_hook_slow+0xb1/0x160
+[  261.265007]  ip_output+0x1c7/0x710
+[  261.269005]  ? ip_mc_output+0x13f0/0x13f0
+[  261.273002]  ? __local_bh_enable_ip+0xe9/0x1b0
+[  261.278152]  ? ip_fragment.constprop.56+0x220/0x220
+[  261.282996]  ? nf_hook_slow+0xb1/0x160
+[  261.287007]  raw_sendmsg+0x21f9/0x4420
+[  261.291008]  ? dst_output+0x180/0x180
+[  261.297003]  ? sched_clock_cpu+0x126/0x170
+[  261.301003]  ? find_held_lock+0x39/0x1c0
+[  261.306155]  ? stop_critical_timings+0x420/0x420
+[  261.311004]  ? check_flags.part.36+0x450/0x450
+[  261.315005]  ? _raw_spin_unlock_irq+0x29/0x40
+[  261.320995]  ? _raw_spin_unlock_irq+0x29/0x40
+[  261.326142]  ? cyc2ns_read_end+0x10/0x10
+[  261.330139]  ? raw_bind+0x280/0x280
+[  261.334138]  ? sched_clock_cpu+0x126/0x170
+[  261.338995]  ? check_flags.part.36+0x450/0x450
+[  261.342991]  ? __lock_acquire+0x4500/0x4500
+[  261.348994]  ? inet_sendmsg+0x11c/0x500
+[  261.352989]  ? dst_output+0x180/0x180
+[  261.357012]  inet_sendmsg+0x11c/0x500
+[ ... ]
+
+v2:
+ - clear skb->sk in the reassembly routine. (Eric Dumazet)
+
+Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Taehee Yoo <ap420073@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_fragment.c                  |    1 +
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    1 +
+ 2 files changed, 2 insertions(+)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -599,6 +599,7 @@ static int ip_frag_reasm(struct ipq *qp,
+                       nextp = &fp->next;
+                       fp->prev = NULL;
+                       memset(&fp->rbnode, 0, sizeof(fp->rbnode));
++                      fp->sk = NULL;
+                       head->data_len += fp->len;
+                       head->len += fp->len;
+                       if (head->ip_summed != fp->ip_summed)
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -427,6 +427,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
+               else if (head->ip_summed == CHECKSUM_COMPLETE)
+                       head->csum = csum_add(head->csum, fp->csum);
+               head->truesize += fp->truesize;
++              fp->sk = NULL;
+       }
+       sub_frag_mem_limit(fq->q.net, head->truesize);
diff --git a/queue-4.4/ip-process-in-order-fragments-efficiently.patch b/queue-4.4/ip-process-in-order-fragments-efficiently.patch
new file mode 100644 (file)
index 0000000..58a44fc
--- /dev/null
@@ -0,0 +1,269 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Peter Oskolkov <posk@google.com>
+Date: Wed, 10 Oct 2018 12:30:15 -0700
+Subject: ip: process in-order fragments efficiently
+
+From: Peter Oskolkov <posk@google.com>
+
+commit a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c upstream.
+
+This patch changes the runtime behavior of the IP defrag queue:
+incoming in-order fragments are appended to the current list/"run"
+of in-order fragments at the tail of the queue.
+
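+The tail fast path (sketch; this is the shape of the code added to
+ip_frag_queue() below) extends the current run only when the new
+fragment is exactly adjacent, so in-order arrival never rebalances
+the tree:
+
+  if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
+          ip4_frag_append_to_last_run(&qp->q, skb);  /* adjacent: O(1) */
+  else
+          ip4_frag_create_run(&qp->q, skb);          /* gap: new tree node */
+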
+On some workloads, UDP stream performance is substantially improved:
+
+RX: ./udp_stream -F 10 -T 2 -l 60
+TX: ./udp_stream -c -H <host> -F 10 -T 5 -l 60
+
+with this patchset applied on a 10Gbps receiver:
+
+  throughput=9524.18
+  throughput_units=Mbit/s
+
+upstream (net-next):
+
+  throughput=4608.93
+  throughput_units=Mbit/s
+
+Reported-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/inet_fragment.c |    2 
+ net/ipv4/ip_fragment.c   |  110 +++++++++++++++++++++++++++++------------------
+ 2 files changed, 70 insertions(+), 42 deletions(-)
+
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -153,7 +153,7 @@ void inet_frag_destroy(struct inet_frag_
+                       fp = xp;
+               } while (fp);
+       } else {
+-              sum_truesize = skb_rbtree_purge(&q->rb_fragments);
++              sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
+       }
+       sum = sum_truesize + f->qsize;
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -127,8 +127,8 @@ static u8 ip4_frag_ecn(u8 tos)
+ static struct inet_frags ip4_frags;
+-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+-                       struct net_device *dev);
++static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
++                       struct sk_buff *prev_tail, struct net_device *dev);
+ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
+@@ -219,7 +219,12 @@ static void ip_expire(unsigned long arg)
+               head = skb_rb_first(&qp->q.rb_fragments);
+               if (!head)
+                       goto out;
+-              rb_erase(&head->rbnode, &qp->q.rb_fragments);
++              if (FRAG_CB(head)->next_frag)
++                      rb_replace_node(&head->rbnode,
++                                      &FRAG_CB(head)->next_frag->rbnode,
++                                      &qp->q.rb_fragments);
++              else
++                      rb_erase(&head->rbnode, &qp->q.rb_fragments);
+               memset(&head->rbnode, 0, sizeof(head->rbnode));
+               barrier();
+       }
+@@ -320,7 +325,7 @@ static int ip_frag_reinit(struct ipq *qp
+               return -ETIMEDOUT;
+       }
+-      sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
++      sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
+       sub_frag_mem_limit(qp->q.net, sum_truesize);
+       qp->q.flags = 0;
+@@ -329,6 +334,7 @@ static int ip_frag_reinit(struct ipq *qp
+       qp->q.fragments = NULL;
+       qp->q.rb_fragments = RB_ROOT;
+       qp->q.fragments_tail = NULL;
++      qp->q.last_run_head = NULL;
+       qp->iif = 0;
+       qp->ecn = 0;
+@@ -340,7 +346,7 @@ static int ip_frag_queue(struct ipq *qp,
+ {
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct rb_node **rbn, *parent;
+-      struct sk_buff *skb1;
++      struct sk_buff *skb1, *prev_tail;
+       struct net_device *dev;
+       unsigned int fragsize;
+       int flags, offset;
+@@ -418,38 +424,41 @@ static int ip_frag_queue(struct ipq *qp,
+        */
+       /* Find out where to put this fragment.  */
+-      skb1 = qp->q.fragments_tail;
+-      if (!skb1) {
+-              /* This is the first fragment we've received. */
+-              rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+-              qp->q.fragments_tail = skb;
+-      } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+-              /* This is the common/special case: skb goes to the end. */
++      prev_tail = qp->q.fragments_tail;
++      if (!prev_tail)
++              ip4_frag_create_run(&qp->q, skb);  /* First fragment. */
++      else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
++              /* This is the common case: skb goes to the end. */
+               /* Detect and discard overlaps. */
+-              if (offset < (skb1->ip_defrag_offset + skb1->len))
++              if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
+                       goto discard_qp;
+-              /* Insert after skb1. */
+-              rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+-              qp->q.fragments_tail = skb;
++              if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
++                      ip4_frag_append_to_last_run(&qp->q, skb);
++              else
++                      ip4_frag_create_run(&qp->q, skb);
+       } else {
+-              /* Binary search. Note that skb can become the first fragment, but
+-               * not the last (covered above). */
++              /* Binary search. Note that skb can become the first fragment,
++               * but not the last (covered above).
++               */
+               rbn = &qp->q.rb_fragments.rb_node;
+               do {
+                       parent = *rbn;
+                       skb1 = rb_to_skb(parent);
+                       if (end <= skb1->ip_defrag_offset)
+                               rbn = &parent->rb_left;
+-                      else if (offset >= skb1->ip_defrag_offset + skb1->len)
++                      else if (offset >= skb1->ip_defrag_offset +
++                                              FRAG_CB(skb1)->frag_run_len)
+                               rbn = &parent->rb_right;
+                       else /* Found an overlap with skb1. */
+                               goto discard_qp;
+               } while (*rbn);
+               /* Here we have parent properly set, and rbn pointing to
+-               * one of its NULL left/right children. Insert skb. */
++               * one of its NULL left/right children. Insert skb.
++               */
++              ip4_frag_init_run(skb);
+               rb_link_node(&skb->rbnode, parent, rbn);
++              rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+       }
+-      rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+       if (dev)
+               qp->iif = dev->ifindex;
+@@ -476,7 +485,7 @@ static int ip_frag_queue(struct ipq *qp,
+               unsigned long orefdst = skb->_skb_refdst;
+               skb->_skb_refdst = 0UL;
+-              err = ip_frag_reasm(qp, skb, dev);
++              err = ip_frag_reasm(qp, skb, prev_tail, dev);
+               skb->_skb_refdst = orefdst;
+               return err;
+       }
+@@ -495,7 +504,7 @@ err:
+ /* Build a new IP datagram from all its fragments. */
+ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+-                       struct net_device *dev)
++                       struct sk_buff *prev_tail, struct net_device *dev)
+ {
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct iphdr *iph;
+@@ -519,10 +528,16 @@ static int ip_frag_reasm(struct ipq *qp,
+               fp = skb_clone(skb, GFP_ATOMIC);
+               if (!fp)
+                       goto out_nomem;
+-              rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
++              FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
++              if (RB_EMPTY_NODE(&skb->rbnode))
++                      FRAG_CB(prev_tail)->next_frag = fp;
++              else
++                      rb_replace_node(&skb->rbnode, &fp->rbnode,
++                                      &qp->q.rb_fragments);
+               if (qp->q.fragments_tail == skb)
+                       qp->q.fragments_tail = fp;
+               skb_morph(skb, head);
++              FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
+               rb_replace_node(&head->rbnode, &skb->rbnode,
+                               &qp->q.rb_fragments);
+               consume_skb(head);
+@@ -558,7 +573,7 @@ static int ip_frag_reasm(struct ipq *qp,
+               for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+                       plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+               clone->len = clone->data_len = head->data_len - plen;
+-              skb->truesize += clone->truesize;
++              head->truesize += clone->truesize;
+               clone->csum = 0;
+               clone->ip_summed = head->ip_summed;
+               add_frag_mem_limit(qp->q.net, clone->truesize);
+@@ -571,24 +586,36 @@ static int ip_frag_reasm(struct ipq *qp,
+       skb_push(head, head->data - skb_network_header(head));
+       /* Traverse the tree in order, to build frag_list. */
++      fp = FRAG_CB(head)->next_frag;
+       rbn = rb_next(&head->rbnode);
+       rb_erase(&head->rbnode, &qp->q.rb_fragments);
+-      while (rbn) {
+-              struct rb_node *rbnext = rb_next(rbn);
+-              fp = rb_to_skb(rbn);
+-              rb_erase(rbn, &qp->q.rb_fragments);
+-              rbn = rbnext;
+-              *nextp = fp;
+-              nextp = &fp->next;
+-              fp->prev = NULL;
+-              memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+-              head->data_len += fp->len;
+-              head->len += fp->len;
+-              if (head->ip_summed != fp->ip_summed)
+-                      head->ip_summed = CHECKSUM_NONE;
+-              else if (head->ip_summed == CHECKSUM_COMPLETE)
+-                      head->csum = csum_add(head->csum, fp->csum);
+-              head->truesize += fp->truesize;
++      while (rbn || fp) {
++              /* fp points to the next sk_buff in the current run;
++               * rbn points to the next run.
++               */
++              /* Go through the current run. */
++              while (fp) {
++                      *nextp = fp;
++                      nextp = &fp->next;
++                      fp->prev = NULL;
++                      memset(&fp->rbnode, 0, sizeof(fp->rbnode));
++                      head->data_len += fp->len;
++                      head->len += fp->len;
++                      if (head->ip_summed != fp->ip_summed)
++                              head->ip_summed = CHECKSUM_NONE;
++                      else if (head->ip_summed == CHECKSUM_COMPLETE)
++                              head->csum = csum_add(head->csum, fp->csum);
++                      head->truesize += fp->truesize;
++                      fp = FRAG_CB(fp)->next_frag;
++              }
++              /* Move to the next run. */
++              if (rbn) {
++                      struct rb_node *rbnext = rb_next(rbn);
++
++                      fp = rb_to_skb(rbn);
++                      rb_erase(rbn, &qp->q.rb_fragments);
++                      rbn = rbnext;
++              }
+       }
+       sub_frag_mem_limit(qp->q.net, head->truesize);
+@@ -624,6 +651,7 @@ static int ip_frag_reasm(struct ipq *qp,
+       qp->q.fragments = NULL;
+       qp->q.rb_fragments = RB_ROOT;
+       qp->q.fragments_tail = NULL;
++      qp->q.last_run_head = NULL;
+       return 0;
+ out_nomem:
diff --git a/queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch b/queue-4.4/ip-use-rb-trees-for-ip-frag-queue.patch
new file mode 100644 (file)
index 0000000..442c05e
--- /dev/null
@@ -0,0 +1,466 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Peter Oskolkov <posk@google.com>
+Date: Wed, 10 Oct 2018 12:30:13 -0700
+Subject: ip: use rb trees for IP frag queue.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit fa0f527358bd900ef92f925878ed6bfbd51305cc upstream.
+
+Similar to TCP OOO RX queue, it makes sense to use rb trees to store
+IP fragments, so that OOO fragments are inserted faster.
+
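+The win is the usual list-vs-tree trade: placing an out-of-order
+fragment becomes an O(log n) descent instead of an O(n) list walk.
+A condensed sketch of the insertion by offset (the real loop in
+ip_frag_queue() below also rejects overlaps during the descent):
+
+  rbn = &qp->q.rb_fragments.rb_node;
+  parent = NULL;
+  while (*rbn) {
+          parent = *rbn;
+          skb1 = rb_to_skb(parent);
+          if (end <= skb1->ip_defrag_offset)
+                  rbn = &parent->rb_left;   /* new fragment sorts before */
+          else
+                  rbn = &parent->rb_right;  /* new fragment sorts after */
+  }
+  rb_link_node(&skb->rbnode, parent, rbn);
+  rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+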
+Tested:
+
+- a follow-up patch contains a rather comprehensive ip defrag
+  self-test (functional)
+- ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`:
+    netstat --statistics
+    Ip:
+        282078937 total packets received
+        0 forwarded
+        0 incoming packets discarded
+        946760 incoming packets delivered
+        18743456 requests sent out
+        101 fragments dropped after timeout
+        282077129 reassemblies required
+        944952 packets reassembled ok
+        262734239 packet reassembles failed
+   (The numbers/stats above are somewhat better re:
+    reassemblies vs a kernel without this patchset. More
+    comprehensive performance testing TBD).
+
+Reported-by: Jann Horn <jannh@google.com>
+Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+[bwh: Backported to 4.4:
+ - Keep using frag_kfree_skb() in inet_frag_destroy()
+ - Adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/skbuff.h                  |    4 
+ include/net/inet_frag.h                 |    3 
+ net/ipv4/inet_fragment.c                |   14 +-
+ net/ipv4/ip_fragment.c                  |  182 +++++++++++++++++---------------
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    1 
+ net/ipv6/reassembly.c                   |    1 
+ 6 files changed, 116 insertions(+), 89 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -556,14 +556,14 @@ struct sk_buff {
+                               struct skb_mstamp skb_mstamp;
+                       };
+               };
+-              struct rb_node  rbnode; /* used in netem & tcp stack */
++              struct rb_node          rbnode; /* used in netem, ip4 defrag, and tcp stack */
+       };
+       union {
++              struct sock             *sk;
+               int                     ip_defrag_offset;
+       };
+-      struct sock             *sk;
+       struct net_device       *dev;
+       /*
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -73,7 +73,8 @@ struct inet_frag_queue {
+       struct timer_list       timer;
+       spinlock_t              lock;
+       atomic_t                refcnt;
+-      struct sk_buff          *fragments;
++      struct sk_buff          *fragments;  /* Used in IPv6. */
++      struct rb_root          rb_fragments; /* Used in IPv4. */
+       struct sk_buff          *fragments_tail;
+       ktime_t                 stamp;
+       int                     len;
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -144,12 +144,16 @@ void inet_frag_destroy(struct inet_frag_
+       fp = q->fragments;
+       nf = q->net;
+       f = nf->f;
+-      while (fp) {
+-              struct sk_buff *xp = fp->next;
++      if (fp) {
++              do {
++                      struct sk_buff *xp = fp->next;
+-              sum_truesize += fp->truesize;
+-              frag_kfree_skb(nf, f, fp);
+-              fp = xp;
++                      sum_truesize += fp->truesize;
++                      frag_kfree_skb(nf, f, fp);
++                      fp = xp;
++              } while (fp);
++      } else {
++              sum_truesize = skb_rbtree_purge(&q->rb_fragments);
+       }
+       sum = sum_truesize + f->qsize;
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -136,7 +136,7 @@ static bool frag_expire_skip_icmp(u32 us
+ static void ip_expire(unsigned long arg)
+ {
+       const struct iphdr *iph;
+-      struct sk_buff *head;
++      struct sk_buff *head = NULL;
+       struct net *net;
+       struct ipq *qp;
+       int err;
+@@ -152,14 +152,31 @@ static void ip_expire(unsigned long arg)
+       ipq_kill(qp);
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+-
+-      head = qp->q.fragments;
+-
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+-      if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
++      if (!qp->q.flags & INET_FRAG_FIRST_IN)
+               goto out;
++      /* sk_buff::dev and sk_buff::rbnode are unionized. So we
++       * pull the head out of the tree in order to be able to
++       * deal with head->dev.
++       */
++      if (qp->q.fragments) {
++              head = qp->q.fragments;
++              qp->q.fragments = head->next;
++      } else {
++              head = skb_rb_first(&qp->q.rb_fragments);
++              if (!head)
++                      goto out;
++              rb_erase(&head->rbnode, &qp->q.rb_fragments);
++              memset(&head->rbnode, 0, sizeof(head->rbnode));
++              barrier();
++      }
++      if (head == qp->q.fragments_tail)
++              qp->q.fragments_tail = NULL;
++
++      sub_frag_mem_limit(qp->q.net, head->truesize);
++
+       head->dev = dev_get_by_index_rcu(net, qp->iif);
+       if (!head->dev)
+               goto out;
+@@ -179,16 +196,16 @@ static void ip_expire(unsigned long arg)
+           (skb_rtable(head)->rt_type != RTN_LOCAL))
+               goto out;
+-      skb_get(head);
+       spin_unlock(&qp->q.lock);
+       icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+-      kfree_skb(head);
+       goto out_rcu_unlock;
+ out:
+       spin_unlock(&qp->q.lock);
+ out_rcu_unlock:
+       rcu_read_unlock();
++      if (head)
++              kfree_skb(head);
+       ipq_put(qp);
+ }
+@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *q
+       end = atomic_inc_return(&peer->rid);
+       qp->rid = end;
+-      rc = qp->q.fragments && (end - start) > max;
++      rc = qp->q.fragments_tail && (end - start) > max;
+       if (rc) {
+               struct net *net;
+@@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *q
+ static int ip_frag_reinit(struct ipq *qp)
+ {
+-      struct sk_buff *fp;
+       unsigned int sum_truesize = 0;
+       if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+@@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp
+               return -ETIMEDOUT;
+       }
+-      fp = qp->q.fragments;
+-      do {
+-              struct sk_buff *xp = fp->next;
+-
+-              sum_truesize += fp->truesize;
+-              kfree_skb(fp);
+-              fp = xp;
+-      } while (fp);
++      sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+       sub_frag_mem_limit(qp->q.net, sum_truesize);
+       qp->q.flags = 0;
+       qp->q.len = 0;
+       qp->q.meat = 0;
+       qp->q.fragments = NULL;
++      qp->q.rb_fragments = RB_ROOT;
+       qp->q.fragments_tail = NULL;
+       qp->iif = 0;
+       qp->ecn = 0;
+@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp
+ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+ {
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+-      struct sk_buff *prev, *next;
++      struct rb_node **rbn, *parent;
++      struct sk_buff *skb1;
+       struct net_device *dev;
+       unsigned int fragsize;
+       int flags, offset;
+@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp,
+       if (err)
+               goto err;
+-      /* Find out which fragments are in front and at the back of us
+-       * in the chain of fragments so far.  We must know where to put
+-       * this fragment, right?
+-       */
+-      prev = qp->q.fragments_tail;
+-      if (!prev || prev->ip_defrag_offset < offset) {
+-              next = NULL;
+-              goto found;
+-      }
+-      prev = NULL;
+-      for (next = qp->q.fragments; next != NULL; next = next->next) {
+-              if (next->ip_defrag_offset >= offset)
+-                      break;  /* bingo! */
+-              prev = next;
+-      }
++      /* Note : skb->rbnode and skb->dev share the same location. */
++      dev = skb->dev;
++      /* Makes sure compiler wont do silly aliasing games */
++      barrier();
+-found:
+       /* RFC5722, Section 4, amended by Errata ID : 3089
+        *                          When reassembling an IPv6 datagram, if
+        *   one or more its constituent fragments is determined to be an
+        *   overlapping fragment, the entire datagram (and any constituent
+        *   fragments) MUST be silently discarded.
+        *
+-       * We do the same here for IPv4.
++       * We do the same here for IPv4 (and increment an snmp counter).
+        */
+-      /* Is there an overlap with the previous fragment? */
+-      if (prev &&
+-          (prev->ip_defrag_offset + prev->len) > offset)
+-              goto discard_qp;
+-
+-      /* Is there an overlap with the next fragment? */
+-      if (next && next->ip_defrag_offset < end)
+-              goto discard_qp;
++      /* Find out where to put this fragment.  */
++      skb1 = qp->q.fragments_tail;
++      if (!skb1) {
++              /* This is the first fragment we've received. */
++              rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
++              qp->q.fragments_tail = skb;
++      } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
++              /* This is the common/special case: skb goes to the end. */
++              /* Detect and discard overlaps. */
++              if (offset < (skb1->ip_defrag_offset + skb1->len))
++                      goto discard_qp;
++              /* Insert after skb1. */
++              rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
++              qp->q.fragments_tail = skb;
++      } else {
++              /* Binary search. Note that skb can become the first fragment, but
++               * not the last (covered above). */
++              rbn = &qp->q.rb_fragments.rb_node;
++              do {
++                      parent = *rbn;
++                      skb1 = rb_to_skb(parent);
++                      if (end <= skb1->ip_defrag_offset)
++                              rbn = &parent->rb_left;
++                      else if (offset >= skb1->ip_defrag_offset + skb1->len)
++                              rbn = &parent->rb_right;
++                      else /* Found an overlap with skb1. */
++                              goto discard_qp;
++              } while (*rbn);
++              /* Here we have parent properly set, and rbn pointing to
++               * one of its NULL left/right children. Insert skb. */
++              rb_link_node(&skb->rbnode, parent, rbn);
++      }
++      rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
+-      /* Note : skb->ip_defrag_offset and skb->dev share the same location */
+-      dev = skb->dev;
+       if (dev)
+               qp->iif = dev->ifindex;
+-      /* Makes sure compiler wont do silly aliasing games */
+-      barrier();
+       skb->ip_defrag_offset = offset;
+-      /* Insert this fragment in the chain of fragments. */
+-      skb->next = next;
+-      if (!next)
+-              qp->q.fragments_tail = skb;
+-      if (prev)
+-              prev->next = skb;
+-      else
+-              qp->q.fragments = skb;
+-
+       qp->q.stamp = skb->tstamp;
+       qp->q.meat += skb->len;
+       qp->ecn |= ecn;
+@@ -414,7 +425,7 @@ found:
+               unsigned long orefdst = skb->_skb_refdst;
+               skb->_skb_refdst = 0UL;
+-              err = ip_frag_reasm(qp, prev, dev);
++              err = ip_frag_reasm(qp, skb, dev);
+               skb->_skb_refdst = orefdst;
+               return err;
+       }
+@@ -431,15 +442,15 @@ err:
+       return err;
+ }
+-
+ /* Build a new IP datagram from all its fragments. */
+-
+-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
++static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+                        struct net_device *dev)
+ {
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct iphdr *iph;
+-      struct sk_buff *fp, *head = qp->q.fragments;
++      struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
++      struct sk_buff **nextp; /* To build frag_list. */
++      struct rb_node *rbn;
+       int len;
+       int ihlen;
+       int err;
+@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp,
+               goto out_fail;
+       }
+       /* Make the one we just received the head. */
+-      if (prev) {
+-              head = prev->next;
+-              fp = skb_clone(head, GFP_ATOMIC);
++      if (head != skb) {
++              fp = skb_clone(skb, GFP_ATOMIC);
+               if (!fp)
+                       goto out_nomem;
+-
+-              fp->next = head->next;
+-              if (!fp->next)
++              rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
++              if (qp->q.fragments_tail == skb)
+                       qp->q.fragments_tail = fp;
+-              prev->next = fp;
+-
+-              skb_morph(head, qp->q.fragments);
+-              head->next = qp->q.fragments->next;
+-
+-              consume_skb(qp->q.fragments);
+-              qp->q.fragments = head;
++              skb_morph(skb, head);
++              rb_replace_node(&head->rbnode, &skb->rbnode,
++                              &qp->q.rb_fragments);
++              consume_skb(head);
++              head = skb;
+       }
+-      WARN_ON(!head);
+       WARN_ON(head->ip_defrag_offset != 0);
+       /* Allocate a new buffer for the datagram. */
+@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp,
+               clone = alloc_skb(0, GFP_ATOMIC);
+               if (!clone)
+                       goto out_nomem;
+-              clone->next = head->next;
+-              head->next = clone;
+               skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+               skb_frag_list_init(head);
+               for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+                       plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+               clone->len = clone->data_len = head->data_len - plen;
+-              head->data_len -= clone->len;
+-              head->len -= clone->len;
++              skb->truesize += clone->truesize;
+               clone->csum = 0;
+               clone->ip_summed = head->ip_summed;
+               add_frag_mem_limit(qp->q.net, clone->truesize);
++              skb_shinfo(head)->frag_list = clone;
++              nextp = &clone->next;
++      } else {
++              nextp = &skb_shinfo(head)->frag_list;
+       }
+-      skb_shinfo(head)->frag_list = head->next;
+       skb_push(head, head->data - skb_network_header(head));
+-      for (fp=head->next; fp; fp = fp->next) {
++      /* Traverse the tree in order, to build frag_list. */
++      rbn = rb_next(&head->rbnode);
++      rb_erase(&head->rbnode, &qp->q.rb_fragments);
++      while (rbn) {
++              struct rb_node *rbnext = rb_next(rbn);
++              fp = rb_to_skb(rbn);
++              rb_erase(rbn, &qp->q.rb_fragments);
++              rbn = rbnext;
++              *nextp = fp;
++              nextp = &fp->next;
++              fp->prev = NULL;
++              memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+               head->data_len += fp->len;
+               head->len += fp->len;
+               if (head->ip_summed != fp->ip_summed)
+@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp,
+       }
+       sub_frag_mem_limit(qp->q.net, head->truesize);
++      *nextp = NULL;
+       head->next = NULL;
++      head->prev = NULL;
+       head->dev = dev;
+       head->tstamp = qp->q.stamp;
+       IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp,
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
+       qp->q.fragments = NULL;
++      qp->q.rb_fragments = RB_ROOT;
+       qp->q.fragments_tail = NULL;
+       return 0;
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -445,6 +445,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
+                                         head->csum);
+       fq->q.fragments = NULL;
++      fq->q.rb_fragments = RB_ROOT;
+       fq->q.fragments_tail = NULL;
+       /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -465,6 +465,7 @@ static int ip6_frag_reasm(struct frag_qu
+       IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+       rcu_read_unlock();
+       fq->q.fragments = NULL;
++      fq->q.rb_fragments = RB_ROOT;
+       fq->q.fragments_tail = NULL;
+       return 1;
diff --git a/queue-4.4/ipfrag-really-prevent-allocation-on-netns-exit.patch b/queue-4.4/ipfrag-really-prevent-allocation-on-netns-exit.patch
new file mode 100644 (file)
index 0000000..ea66096
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Fri, 6 Jul 2018 12:30:20 +0200
+Subject: ipfrag: really prevent allocation on netns exit
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit f6f2a4a2eb92bc73671204198bb2f8ab53ff59fb upstream.
+
+Setting the low threshold to 0 has no effect on frags allocation;
+we need to clear high_thresh instead.
+
+The code predates commit 648700f76b03 ("inet: frags:
+use rhashtables for reassembly units"); before that commit,
+the assignment had a different role: preventing concurrent eviction
+from the worker and the netns cleanup helper.
+
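+Zeroing high_thresh works because the allocation path refuses to
+create a new queue once the fragment memory accounted against the
+netns exceeds that threshold; with high_thresh set to 0 the guard in
+inet_frag_alloc() trips as soon as anything is accounted (sketch):
+
+  if (frag_mem_limit(nf) > nf->high_thresh)  /* always true once > 0 */
+          return NULL;
+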
+Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/inet_fragment.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -90,7 +90,7 @@ static void inet_frags_free_cb(void *ptr
+ void inet_frags_exit_net(struct netns_frags *nf)
+ {
+-      nf->low_thresh = 0; /* prevent creation of new frags */
++      nf->high_thresh = 0; /* prevent creation of new frags */
+       rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
+ }
diff --git a/queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch b/queue-4.4/ipv4-frags-precedence-bug-in-ip_expire.patch
new file mode 100644 (file)
index 0000000..f042a34
--- /dev/null
@@ -0,0 +1,33 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Dan Carpenter <dan.carpenter@oracle.com>
+Date: Wed, 10 Oct 2018 12:30:17 -0700
+Subject: ipv4: frags: precedence bug in ip_expire()
+
+From: Dan Carpenter <dan.carpenter@oracle.com>
+
+commit 70837ffe3085c9a91488b52ca13ac84424da1042 upstream.
+
+We accidentally removed the parentheses here, but they are required
+because '!' has higher precedence than '&'.
+
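+Spelled out (illustrative):
+
+  /* Buggy: parsed as (!qp->q.flags) & INET_FRAG_FIRST_IN. The '!'
+   * yields 0 or 1, so the expression no longer tests the flag bit.
+   */
+  if (!qp->q.flags & INET_FRAG_FIRST_IN)
+          goto out;
+
+  /* Fixed: test whether INET_FRAG_FIRST_IN is clear. */
+  if (!(qp->q.flags & INET_FRAG_FIRST_IN))
+          goto out;
+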
+Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
+Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_fragment.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -205,7 +205,7 @@ static void ip_expire(unsigned long arg)
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+-      if (!qp->q.flags & INET_FRAG_FIRST_IN)
++      if (!(qp->q.flags & INET_FRAG_FIRST_IN))
+               goto out;
+       /* sk_buff::dev and sk_buff::rbnode are unionized. So we
diff --git a/queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch b/queue-4.4/ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch
new file mode 100644 (file)
index 0000000..1b665f2
--- /dev/null
@@ -0,0 +1,58 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Florian Westphal <fw@strlen.de>
+Date: Wed, 10 Oct 2018 12:30:10 -0700
+Subject: ipv6: defrag: drop non-last frags smaller than min mtu
+
+From: Florian Westphal <fw@strlen.de>
+
+commit 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 upstream.
+
+Don't bother with pathological cases; they only waste cycles.
+IPv6 requires a minimum MTU of 1280, so we should never see fragments
+smaller than this (except the last frag).
+
+v3: don't use awkward "-offset + len"
+v2: drop IPv4 part, which added same check w. IPV4_MIN_MTU (68).
+    There were concerns that there could be even smaller frags
+    generated by intermediate nodes, e.g. on radio networks.
+
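+The check itself is two lines (sketch; the backported hunks below
+wire it into ipv6_frag_rcv() and nf_ct_frag6_gather() with their own
+error labels):
+
+  /* A non-last IPv6 fragment must fill the 1280-byte minimum MTU. */
+  if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
+      fhdr->frag_off & htons(IP6_MF))
+          goto fail;   /* hypothetical label */
+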
+Cc: Peter Oskolkov <posk@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+[bwh: Backported to 4.4: In nf_ct_frag6_gather() use clone instead of skb,
+ and goto ret_orig in case of error]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    4 ++++
+ net/ipv6/reassembly.c                   |    4 ++++
+ 2 files changed, 8 insertions(+)
+
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -574,6 +574,10 @@ struct sk_buff *nf_ct_frag6_gather(struc
+       hdr = ipv6_hdr(clone);
+       fhdr = (struct frag_hdr *)skb_transport_header(clone);
++      if (clone->len - skb_network_offset(clone) < IPV6_MIN_MTU &&
++          fhdr->frag_off & htons(IP6_MF))
++              goto ret_orig;
++
+       skb_orphan(skb);
+       fq = fq_find(net, fhdr->identification, user, hdr,
+                    skb->dev ? skb->dev->ifindex : 0);
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -515,6 +515,10 @@ static int ipv6_frag_rcv(struct sk_buff
+               return 1;
+       }
++      if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
++          fhdr->frag_off & htons(IP6_MF))
++              goto fail_hdr;
++
+       iif = skb->dev ? skb->dev->ifindex : 0;
+       fq = fq_find(net, fhdr->identification, hdr, iif);
+       if (fq) {
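The check added in both places above boils down to one predicate. A minimal userspace sketch, assuming host byte order for frag_off (the kernel tests the on-wire big-endian field against htons(IP6_MF)):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define IPV6_MIN_MTU 1280
    #define IP6_MF       0x0001   /* more-fragments bit, host order here */

    /* A non-last fragment (MF set) smaller than the IPv6 minimum MTU can
     * only come from a broken or malicious sender, so drop it early. */
    static bool drop_small_nonlast_frag(uint32_t l3_len, uint16_t frag_off)
    {
            return l3_len < IPV6_MIN_MTU && (frag_off & IP6_MF);
    }

    int main(void)
    {
            printf("%d\n", drop_small_nonlast_frag(100, IP6_MF));  /* 1: drop */
            printf("%d\n", drop_small_nonlast_frag(100, 0));       /* 0: last frag */
            printf("%d\n", drop_small_nonlast_frag(1280, IP6_MF)); /* 0: big enough */
            return 0;
    }
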
diff --git a/queue-4.4/ipv6-frags-rewrite-ip6_expire_frag_queue.patch b/queue-4.4/ipv6-frags-rewrite-ip6_expire_frag_queue.patch
new file mode 100644
index 0000000..b22e8f6
--- /dev/null
@@ -0,0 +1,76 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:30:02 -0700
+Subject: ipv6: frags: rewrite ip6_expire_frag_queue()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 05c0b86b9696802fd0ce5676a92a63f1b455bdf3 upstream.
+
+Make it similar to IPv4 ip_expire(), and release the lock
+before calling icmp functions.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.4: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/reassembly.c |   24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -92,7 +92,9 @@ EXPORT_SYMBOL(ip6_frag_init);
+ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
+ {
+       struct net_device *dev = NULL;
++      struct sk_buff *head;
++      rcu_read_lock();
+       spin_lock(&fq->q.lock);
+       if (fq->q.flags & INET_FRAG_COMPLETE)
+@@ -100,28 +102,34 @@ void ip6_expire_frag_queue(struct net *n
+       inet_frag_kill(&fq->q);
+-      rcu_read_lock();
+       dev = dev_get_by_index_rcu(net, fq->iif);
+       if (!dev)
+-              goto out_rcu_unlock;
++              goto out;
+       IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+       IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+       /* Don't send error if the first segment did not arrive. */
+-      if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments)
+-              goto out_rcu_unlock;
++      head = fq->q.fragments;
++      if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
++              goto out;
+       /* But use as source device on which LAST ARRIVED
+        * segment was received. And do not use fq->dev
+        * pointer directly, device might already disappeared.
+        */
+-      fq->q.fragments->dev = dev;
+-      icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
+-out_rcu_unlock:
+-      rcu_read_unlock();
++      head->dev = dev;
++      skb_get(head);
++      spin_unlock(&fq->q.lock);
++
++      icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
++      kfree_skb(head);
++      goto out_rcu_unlock;
++
+ out:
+       spin_unlock(&fq->q.lock);
++out_rcu_unlock:
++      rcu_read_unlock();
+       inet_frag_put(&fq->q);
+ }
+ EXPORT_SYMBOL(ip6_expire_frag_queue);
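The shape of the rewrite above is the classic "pin, unlock, then do the slow work" pattern. A hedged userspace sketch follows, with pthreads and C11 atomics standing in for the kernel's spinlock and skb reference counting; all names are invented for illustration.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct frag_queue {               /* hypothetical stand-in */
            pthread_mutex_t lock;
            atomic_int refcnt;
            int id;
    };

    static void fq_put(struct frag_queue *fq)
    {
            if (atomic_fetch_sub(&fq->refcnt, 1) == 1)
                    free(fq);
    }

    static void expire(struct frag_queue *fq)
    {
            pthread_mutex_lock(&fq->lock);
            atomic_fetch_add(&fq->refcnt, 1);   /* like skb_get(head) */
            pthread_mutex_unlock(&fq->lock);

            /* Slow work happens with the lock released, as icmpv6_send()
             * now does in ip6_expire_frag_queue(). */
            printf("sending ICMP time-exceeded for queue %d\n", fq->id);

            fq_put(fq);                         /* like kfree_skb(head) */
    }

    int main(void)
    {
            struct frag_queue *fq = malloc(sizeof(*fq));

            pthread_mutex_init(&fq->lock, NULL);
            atomic_init(&fq->refcnt, 1);
            fq->id = 1;
            expire(fq);
            fq_put(fq);    /* drop the caller's reference */
            return 0;
    }
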
diff --git a/queue-4.4/net-fix-pskb_trim_rcsum_slow-with-odd-trim-offset.patch b/queue-4.4/net-fix-pskb_trim_rcsum_slow-with-odd-trim-offset.patch
new file mode 100644
index 0000000..b898ab7
--- /dev/null
@@ -0,0 +1,48 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Dimitris Michailidis <dmichail@google.com>
+Date: Fri, 19 Oct 2018 17:07:13 -0700
+Subject: net: fix pskb_trim_rcsum_slow() with odd trim offset
+
+From: Dimitris Michailidis <dmichail@google.com>
+
+commit d55bef5059dd057bd077155375c581b49d25be7e upstream.
+
+We've been getting checksum errors involving small UDP packets, usually
+59B packets with 1 extra non-zero padding byte. netdev_rx_csum_fault()
+has been complaining that HW is providing bad checksums. Turns out the
+problem is in pskb_trim_rcsum_slow(), introduced in commit 88078d98d1bb
+("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends").
+
+The source of the problem is that when the bytes we are trimming start
+at an odd address, as in the case of the 1 padding byte above,
+skb_checksum() returns a byte-swapped value. We cannot just combine this
+with skb->csum using csum_sub(). We need to use csum_block_sub() here,
+which takes into account the parity of the start address and handles the
+swapping.
+
+Matches existing code in __skb_postpull_rcsum() and esp_remove_trailer().
+
+Fixes: 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends")
+Signed-off-by: Dimitris Michailidis <dmichail@google.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -1509,8 +1509,9 @@ int pskb_trim_rcsum_slow(struct sk_buff
+       if (skb->ip_summed == CHECKSUM_COMPLETE) {
+               int delta = skb->len - len;
+-              skb->csum = csum_sub(skb->csum,
+-                                   skb_checksum(skb, len, delta, 0));
++              skb->csum = csum_block_sub(skb->csum,
++                                         skb_checksum(skb, len, delta, 0),
++                                         len);
+       }
+       return __pskb_trim(skb, len);
+ }
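Why the parity of the trim offset matters: in the 16-bit one's-complement sum, a byte at an even offset contributes as a high byte and at an odd offset as a low byte, so a sub-checksum computed from an odd start is byte-swapped relative to its contribution inside the full sum. A small sketch with a toy RFC 1071-style checksum (my helper, not the kernel's):

    #include <stdint.h>
    #include <stdio.h>

    /* Toy RFC 1071-style checksum over a byte range. */
    static uint32_t csum(const uint8_t *p, unsigned int len)
    {
            uint32_t sum = 0;

            for (; len > 1; p += 2, len -= 2)
                    sum += (uint32_t)(p[0] << 8 | p[1]);
            if (len)
                    sum += (uint32_t)p[0] << 8;
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return sum;
    }

    int main(void)
    {
            uint8_t buf[5] = { 0x12, 0x34, 0x56, 0x78, 0x9a };
            uint32_t full = csum(buf, 5);
            uint32_t tail = csum(buf + 3, 2);  /* trim 2 bytes at odd offset 3 */

            /* tail is 0x789a, but inside 'full' those bytes contribute
             * 0x0078 + 0x9a00, i.e. the byte-swapped value.  Subtracting
             * 'tail' as-is (what csum_sub() would do) therefore corrupts
             * the checksum; csum_block_sub() swaps it for odd offsets. */
            printf("full=%04x tail=%04x\n", full, tail);
            return 0;
    }
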
diff --git a/queue-4.4/net-ieee802154-6lowpan-fix-frag-reassembly.patch b/queue-4.4/net-ieee802154-6lowpan-fix-frag-reassembly.patch
new file mode 100644
index 0000000..5f436f7
--- /dev/null
@@ -0,0 +1,71 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Alexander Aring <aring@mojatatu.com>
+Date: Fri, 20 Apr 2018 14:54:13 -0400
+Subject: net: ieee802154: 6lowpan: fix frag reassembly
+
+From: Alexander Aring <aring@mojatatu.com>
+
+commit f18fa5de5ba7f1d6650951502bb96a6e4715a948 upstream.
+
+This patch initializes the stack variables which are used in
+frag_lowpan_compare_key to zero. In my case there are padding bytes in the
+structures ieee802154_addr as well as in frag_lowpan_compare_key. Without
+the initialization the key variable contains random bytes, so a compare of
+two keys by memcmp works incorrectly.
+
+Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
+Signed-off-by: Alexander Aring <aring@mojatatu.com>
+Reported-by: Stefan Schmidt <stefan@osg.samsung.com>
+Signed-off-by: Stefan Schmidt <stefan@osg.samsung.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ieee802154/6lowpan/6lowpan_i.h  |    4 ++--
+ net/ieee802154/6lowpan/reassembly.c |   14 +++++++-------
+ 2 files changed, 9 insertions(+), 9 deletions(-)
+
+--- a/net/ieee802154/6lowpan/6lowpan_i.h
++++ b/net/ieee802154/6lowpan/6lowpan_i.h
+@@ -19,8 +19,8 @@ typedef unsigned __bitwise__ lowpan_rx_r
+ struct frag_lowpan_compare_key {
+       u16 tag;
+       u16 d_size;
+-      const struct ieee802154_addr src;
+-      const struct ieee802154_addr dst;
++      struct ieee802154_addr src;
++      struct ieee802154_addr dst;
+ };
+ /* Equivalent of ipv4 struct ipq
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -74,14 +74,14 @@ fq_find(struct net *net, const struct lo
+ {
+       struct netns_ieee802154_lowpan *ieee802154_lowpan =
+               net_ieee802154_lowpan(net);
+-      struct frag_lowpan_compare_key key = {
+-              .tag = cb->d_tag,
+-              .d_size = cb->d_size,
+-              .src = *src,
+-              .dst = *dst,
+-      };
++      struct frag_lowpan_compare_key key = {};
+       struct inet_frag_queue *q;
++      key.tag = cb->d_tag;
++      key.d_size = cb->d_size;
++      key.src = *src;
++      key.dst = *dst;
++
+       q = inet_frag_find(&ieee802154_lowpan->frags, &key);
+       if (IS_ERR_OR_NULL(q)) {
+               inet_frag_maybe_warn_overflow(q, pr_fmt());
+@@ -372,7 +372,7 @@ int lowpan_frag_rcv(struct sk_buff *skb,
+       struct lowpan_frag_queue *fq;
+       struct net *net = dev_net(skb->dev);
+       struct lowpan_802154_cb *cb = lowpan_802154_cb(skb);
+-      struct ieee802154_hdr hdr;
++      struct ieee802154_hdr hdr = {};
+       int err;
+       if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
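The underlying C pitfall is worth spelling out: memcmp() over a struct also compares padding bytes, which field-wise initialization leaves undefined. A stand-alone sketch (the struct is a simplified stand-in for frag_lowpan_compare_key, and whether garbage actually appears depends on the compiler and prior stack contents):

    #include <stdio.h>
    #include <string.h>

    struct key {                    /* simplified stand-in */
            unsigned short tag;
            /* compilers typically insert 6 padding bytes here */
            unsigned long long addr;
    };

    int main(void)
    {
            struct key a, b;

            /* Field-wise initialization leaves the padding undefined,
             * so memcmp() may report a mismatch for equal keys. */
            a.tag = 1; a.addr = 2;
            b.tag = 1; b.addr = 2;
            printf("maybe nonzero: %d\n", memcmp(&a, &b, sizeof(a)));

            /* Zeroing the whole object first, as the patch does with
             * the "= {}" initializer, makes memcmp() well-defined. */
            memset(&a, 0, sizeof(a)); a.tag = 1; a.addr = 2;
            memset(&b, 0, sizeof(b)); b.tag = 1; b.addr = 2;
            printf("always zero:   %d\n", memcmp(&a, &b, sizeof(b)));
            return 0;
    }
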
diff --git a/queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch b/queue-4.4/net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch
new file mode 100644
index 0000000..3205702
--- /dev/null
@@ -0,0 +1,98 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Michal Kubecek <mkubecek@suse.cz>
+Date: Thu, 13 Dec 2018 17:23:32 +0100
+Subject: net: ipv4: do not handle duplicate fragments as overlapping
+
+From: Michal Kubecek <mkubecek@suse.cz>
+
+commit ade446403bfb79d3528d56071a84b15351a139ad upstream.
+
+Since commit 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping
+segments.") IPv4 reassembly code drops the whole queue whenever an
+overlapping fragment is received. However, the test is written in a way
+which detects duplicate fragments as overlapping so that in environments
+with many duplicate packets, fragmented packets may be undeliverable.
+
+Add an extra test and, for a (potentially) duplicate fragment, only drop the
+new fragment rather than the whole queue. Only the starting offset and length
+are checked, not the contents of the fragments, as that would be too
+expensive. For a similar reason, the linear list ("run") of an rbtree node is
+not iterated; we only check whether the new fragment is a subset of the
+interval covered by existing consecutive fragments.
+
+v2: instead of an exact check iterating through the linear list of an rbtree
+node, only check whether the new fragment is a subset of the "run" (suggested
+by Eric Dumazet)
+
+Fixes: 7969e5c40dfd ("ip: discard IPv4 datagrams with overlapping segments.")
+Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+[bwh: Backported to 4.4:
+ - goto discard_qp, not err, in case of overlap
+ - Set the err variable earlier, as done upstream in commit 0ff89efb5246
+   "ip: fail fast on IP defrag errors"]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_fragment.c |   20 +++++++++++++-------
+ 1 file changed, 13 insertions(+), 7 deletions(-)
+
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -347,10 +347,10 @@ static int ip_frag_queue(struct ipq *qp,
+       struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct rb_node **rbn, *parent;
+       struct sk_buff *skb1, *prev_tail;
++      int ihl, end, skb1_run_end;
+       struct net_device *dev;
+       unsigned int fragsize;
+       int flags, offset;
+-      int ihl, end;
+       int err = -ENOENT;
+       u8 ecn;
+@@ -420,9 +420,12 @@ static int ip_frag_queue(struct ipq *qp,
+        *   overlapping fragment, the entire datagram (and any constituent
+        *   fragments) MUST be silently discarded.
+        *
+-       * We do the same here for IPv4 (and increment an snmp counter).
++       * We do the same here for IPv4 (and increment an snmp counter) but
++       * we do not want to drop the whole queue in response to a duplicate
++       * fragment.
+        */
++      err = -EINVAL;
+       /* Find out where to put this fragment.  */
+       prev_tail = qp->q.fragments_tail;
+       if (!prev_tail)
+@@ -444,13 +447,17 @@ static int ip_frag_queue(struct ipq *qp,
+               do {
+                       parent = *rbn;
+                       skb1 = rb_to_skb(parent);
++                      skb1_run_end = skb1->ip_defrag_offset +
++                                     FRAG_CB(skb1)->frag_run_len;
+                       if (end <= skb1->ip_defrag_offset)
+                               rbn = &parent->rb_left;
+-                      else if (offset >= skb1->ip_defrag_offset +
+-                                              FRAG_CB(skb1)->frag_run_len)
++                      else if (offset >= skb1_run_end)
+                               rbn = &parent->rb_right;
+-                      else /* Found an overlap with skb1. */
+-                              goto discard_qp;
++                      else if (offset >= skb1->ip_defrag_offset &&
++                               end <= skb1_run_end)
++                              goto err; /* No new data, potential duplicate */
++                      else
++                              goto discard_qp; /* Found an overlap */
+               } while (*rbn);
+               /* Here we have parent properly set, and rbn pointing to
+                * one of its NULL left/right children. Insert skb.
+@@ -495,7 +502,6 @@ static int ip_frag_queue(struct ipq *qp,
+ discard_qp:
+       inet_frag_kill(&qp->q);
+-      err = -EINVAL;
+       IP_INC_STATS_BH(net, IPSTATS_MIB_REASM_OVERLAPS);
+ err:
+       kfree_skb(skb);
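The new rbtree descent above can be read as a four-way classification of the incoming fragment against a node's run of consecutive fragments. A simplified sketch of that decision, with invented names:

    #include <stdio.h>

    enum verdict { GO_LEFT, GO_RIGHT, DROP_FRAG, DROP_QUEUE };

    /* Sketch of the descent in ip_frag_queue(): compare the new
     * fragment [offset, end) against a node's run [run_start, run_end). */
    static enum verdict classify(int offset, int end,
                                 int run_start, int run_end)
    {
            if (end <= run_start)
                    return GO_LEFT;    /* entirely before the run */
            if (offset >= run_end)
                    return GO_RIGHT;   /* entirely after the run */
            if (offset >= run_start && end <= run_end)
                    return DROP_FRAG;  /* subset: no new data, drop frag */
            return DROP_QUEUE;         /* genuine overlap: drop the queue */
    }

    int main(void)
    {
            /* A retransmitted duplicate of [100, 200) against run
             * [100, 300): only the duplicate is dropped now. */
            printf("%d\n", classify(100, 200, 100, 300) == DROP_FRAG);   /* 1 */
            /* A partial overlap [250, 400) is still treated as hostile. */
            printf("%d\n", classify(250, 400, 100, 300) == DROP_QUEUE);  /* 1 */
            return 0;
    }
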
diff --git a/queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch b/queue-4.4/net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch
new file mode 100644
index 0000000..619d7ff
--- /dev/null
@@ -0,0 +1,66 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Peter Oskolkov <posk@google.com>
+Date: Wed, 10 Oct 2018 12:30:09 -0700
+Subject: net: modify skb_rbtree_purge to return the truesize of all purged skbs.
+
+From: Peter Oskolkov <posk@google.com>
+
+commit 385114dec8a49b5e5945e77ba7de6356106713f4 upstream.
+
+Tested: see the next patch in the series.
+
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Oskolkov <posk@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Mao Wenan <maowenan@huawei.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/skbuff.h |    2 +-
+ net/core/skbuff.c      |    6 +++++-
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2278,7 +2278,7 @@ static inline void __skb_queue_purge(str
+               kfree_skb(skb);
+ }
+-void skb_rbtree_purge(struct rb_root *root);
++unsigned int skb_rbtree_purge(struct rb_root *root);
+ void *netdev_alloc_frag(unsigned int fragsz);
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -2380,23 +2380,27 @@ EXPORT_SYMBOL(skb_queue_purge);
+ /**
+  *    skb_rbtree_purge - empty a skb rbtree
+  *    @root: root of the rbtree to empty
++ *    Return value: the sum of truesizes of all purged skbs.
+  *
+  *    Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+  *    the list and one reference dropped. This function does not take
+  *    any lock. Synchronization should be handled by the caller (e.g., TCP
+  *    out-of-order queue is protected by the socket lock).
+  */
+-void skb_rbtree_purge(struct rb_root *root)
++unsigned int skb_rbtree_purge(struct rb_root *root)
+ {
+       struct rb_node *p = rb_first(root);
++      unsigned int sum = 0;
+       while (p) {
+               struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+               p = rb_next(p);
+               rb_erase(&skb->rbnode, root);
++              sum += skb->truesize;
+               kfree_skb(skb);
+       }
++      return sum;
+ }
+ /**
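The contract change above is small: the purge now reports how much memory it released so the caller can fix up its accounting in one step. A toy userspace analogue, using a singly linked list in place of the skb rbtree:

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
            struct node *next;
            unsigned int truesize;   /* memory charged for this buffer */
    };

    /* Purge every node and report the total bytes released, so the
     * caller can adjust its memory accounting in one step. */
    static unsigned int purge(struct node **head)
    {
            unsigned int sum = 0;

            while (*head) {
                    struct node *n = *head;

                    *head = n->next;
                    sum += n->truesize;
                    free(n);
            }
            return sum;
    }

    int main(void)
    {
            struct node *list = NULL;

            for (int i = 1; i <= 3; i++) {
                    struct node *n = malloc(sizeof(*n));

                    n->truesize = 100 * i;
                    n->next = list;
                    list = n;
            }
            printf("freed %u bytes\n", purge(&list));  /* 600 */
            return 0;
    }
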
diff --git a/queue-4.4/net-pskb_trim_rcsum-and-checksum_complete-are-friends.patch b/queue-4.4/net-pskb_trim_rcsum-and-checksum_complete-are-friends.patch
new file mode 100644
index 0000000..c3071b3
--- /dev/null
@@ -0,0 +1,73 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:30:11 -0700
+Subject: net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 88078d98d1bb085d72af8437707279e203524fa5 upstream.
+
+After working on IP defragmentation lately, I found that some large
+packets defeat the CHECKSUM_COMPLETE optimization because of the NIC adding
+zero padding on the last (small) fragment.
+
+While removing the padding with pskb_trim_rcsum(), we set skb->ip_summed
+to CHECKSUM_NONE, forcing a full csum validation, even if all prior
+fragments had CHECKSUM_COMPLETE set.
+
+We can instead compute the checksum of the part we are trimming,
+usually smaller than the part we keep.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/skbuff.h |    5 ++---
+ net/core/skbuff.c      |   14 ++++++++++++++
+ 2 files changed, 16 insertions(+), 3 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2796,6 +2796,7 @@ static inline unsigned char *skb_push_rc
+       return skb->data;
+ }
++int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
+ /**
+  *    pskb_trim_rcsum - trim received skb and update checksum
+  *    @skb: buffer to trim
+@@ -2810,9 +2811,7 @@ static inline int pskb_trim_rcsum(struct
+ {
+       if (likely(len >= skb->len))
+               return 0;
+-      if (skb->ip_summed == CHECKSUM_COMPLETE)
+-              skb->ip_summed = CHECKSUM_NONE;
+-      return __pskb_trim(skb, len);
++      return pskb_trim_rcsum_slow(skb, len);
+ }
+ #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -1502,6 +1502,20 @@ done:
+ }
+ EXPORT_SYMBOL(___pskb_trim);
++/* Note : use pskb_trim_rcsum() instead of calling this directly
++ */
++int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
++{
++      if (skb->ip_summed == CHECKSUM_COMPLETE) {
++              int delta = skb->len - len;
++
++              skb->csum = csum_sub(skb->csum,
++                                   skb_checksum(skb, len, delta, 0));
++      }
++      return __pskb_trim(skb, len);
++}
++EXPORT_SYMBOL(pskb_trim_rcsum_slow);
++
+ /**
+  *    __pskb_pull_tail - advance tail of skb header
+  *    @skb: buffer to reallocate
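The trick rests on a property of the one's-complement sum: the checksum of the kept bytes equals the full checksum minus the checksum of the trimmed bytes. A toy demonstration with an even trim offset (the odd-offset case needs the byte swap added by the pskb_trim_rcsum_slow() fix earlier in this queue); the csum() and fold() helpers are mine, not the kernel's:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Fold a 32-bit accumulator into a 16-bit one's-complement sum. */
    static uint16_t fold(uint32_t sum)
    {
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)sum;
    }

    /* Toy RFC 1071-style checksum over a byte range. */
    static uint16_t csum(const uint8_t *p, unsigned int len)
    {
            uint32_t sum = 0;

            for (; len > 1; p += 2, len -= 2)
                    sum += (uint32_t)(p[0] << 8 | p[1]);
            if (len)
                    sum += (uint32_t)p[0] << 8;
            return fold(sum);
    }

    int main(void)
    {
            uint8_t buf[6] = { 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc };
            uint16_t full = csum(buf, 6);      /* checksum before trimming */
            uint16_t tail = csum(buf + 4, 2);  /* checksum of trimmed padding */

            /* One's-complement subtraction, the csum_sub() idea: add the
             * complement of the tail instead of rechecksumming the head. */
            uint16_t head = fold(full + (uint16_t)~tail);

            assert(head == csum(buf, 4));
            printf("full=%04x tail=%04x head=%04x\n", full, tail, head);
            return 0;
    }
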
diff --git a/queue-4.4/rhashtable-add-rhashtable_lookup.patch b/queue-4.4/rhashtable-add-rhashtable_lookup.patch
new file mode 100644
index 0000000..1822daf
--- /dev/null
@@ -0,0 +1,119 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Fri, 7 Dec 2018 17:16:46 +0000
+Subject: rhashtable: Add rhashtable_lookup()
+
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+
+Extracted from commit ca26893f05e8 "rhashtable: Add rhlist interface".
+
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/rhashtable.h |   69 +++++++++++++++++++++++++++++++++------------
+ 1 file changed, 52 insertions(+), 17 deletions(-)
+
+--- a/include/linux/rhashtable.h
++++ b/include/linux/rhashtable.h
+@@ -515,18 +515,8 @@ static inline int rhashtable_compare(str
+       return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
+ }
+-/**
+- * rhashtable_lookup_fast - search hash table, inlined version
+- * @ht:               hash table
+- * @key:      the pointer to the key
+- * @params:   hash table parameters
+- *
+- * Computes the hash value for the key and traverses the bucket chain looking
+- * for a entry with an identical key. The first matching entry is returned.
+- *
+- * Returns the first entry on which the compare function returned true.
+- */
+-static inline void *rhashtable_lookup_fast(
++/* Internal function, do not use. */
++static inline struct rhash_head *__rhashtable_lookup(
+       struct rhashtable *ht, const void *key,
+       const struct rhashtable_params params)
+ {
+@@ -538,8 +528,6 @@ static inline void *rhashtable_lookup_fa
+       struct rhash_head *he;
+       unsigned int hash;
+-      rcu_read_lock();
+-
+       tbl = rht_dereference_rcu(ht->tbl, ht);
+ restart:
+       hash = rht_key_hashfn(ht, tbl, key, params);
+@@ -548,8 +536,7 @@ restart:
+                   params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+                   rhashtable_compare(&arg, rht_obj(ht, he)))
+                       continue;
+-              rcu_read_unlock();
+-              return rht_obj(ht, he);
++              return he;
+       }
+       /* Ensure we see any new tables. */
+@@ -558,11 +545,59 @@ restart:
+       tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+       if (unlikely(tbl))
+               goto restart;
+-      rcu_read_unlock();
+       return NULL;
+ }
++/**
++ * rhashtable_lookup - search hash table
++ * @ht:               hash table
++ * @key:      the pointer to the key
++ * @params:   hash table parameters
++ *
++ * Computes the hash value for the key and traverses the bucket chain looking
++ * for a entry with an identical key. The first matching entry is returned.
++ *
++ * This must only be called under the RCU read lock.
++ *
++ * Returns the first entry on which the compare function returned true.
++ */
++static inline void *rhashtable_lookup(
++      struct rhashtable *ht, const void *key,
++      const struct rhashtable_params params)
++{
++      struct rhash_head *he = __rhashtable_lookup(ht, key, params);
++
++      return he ? rht_obj(ht, he) : NULL;
++}
++
++/**
++ * rhashtable_lookup_fast - search hash table, without RCU read lock
++ * @ht:               hash table
++ * @key:      the pointer to the key
++ * @params:   hash table parameters
++ *
++ * Computes the hash value for the key and traverses the bucket chain looking
++ * for a entry with an identical key. The first matching entry is returned.
++ *
++ * Only use this function when you have other mechanisms guaranteeing
++ * that the object won't go away after the RCU read lock is released.
++ *
++ * Returns the first entry on which the compare function returned true.
++ */
++static inline void *rhashtable_lookup_fast(
++      struct rhashtable *ht, const void *key,
++      const struct rhashtable_params params)
++{
++      void *obj;
++
++      rcu_read_lock();
++      obj = rhashtable_lookup(ht, key, params);
++      rcu_read_unlock();
++
++      return obj;
++}
++
+ /* Internal function, please use rhashtable_insert_fast() instead. This
+  * function returns the existing element already in hashes in there is a clash,
+  * otherwise it returns an error via ERR_PTR().
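The refactoring above splits one entry point into a bare lookup that must run under the RCU read lock and a _fast wrapper that takes the lock itself. A userspace sketch of the same calling pattern, with a pthread rwlock standing in for RCU (the analogy covers only the lock discipline, not RCU semantics):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int table_value = 42;        /* stand-in for the hash table */

    /* Bare lookup: the caller must already hold the read lock,
     * mirroring rhashtable_lookup()'s RCU requirement. */
    static int lookup(void)
    {
            return table_value;
    }

    /* Wrapper: takes and drops the lock around the bare lookup,
     * mirroring what rhashtable_lookup_fast() now does with RCU. */
    static int lookup_fast(void)
    {
            int v;

            pthread_rwlock_rdlock(&table_lock);
            v = lookup();
            pthread_rwlock_unlock(&table_lock);
            return v;
    }

    int main(void)
    {
            printf("%d\n", lookup_fast());
            return 0;
    }
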
diff --git a/queue-4.4/rhashtable-add-rhashtable_lookup_get_insert_key.patch b/queue-4.4/rhashtable-add-rhashtable_lookup_get_insert_key.patch
new file mode 100644
index 0000000..73e2d15
--- /dev/null
@@ -0,0 +1,216 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+Date: Wed, 24 Aug 2016 12:31:31 +0200
+Subject: rhashtable: add rhashtable_lookup_get_insert_key()
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+commit 5ca8cc5bf11faed257c762018aea9106d529232f upstream.
+
+This patch modifies __rhashtable_insert_fast() so it returns the
+existing object that clashes with the one that you want to insert.
+In case the object is successfully inserted, NULL is returned.
+Otherwise, you get an error via ERR_PTR().
+
+This patch adapts the existing callers of __rhashtable_insert_fast()
+so they handle this new logic, and it adds a new
+rhashtable_lookup_get_insert_key() interface to fetch this existing
+object.
+
+nf_tables needs this change to improve handling of EEXIST cases via
+honoring the NLM_F_EXCL flag and by checking if the data part of the
+mapping matches what we have.
+
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: Thomas Graf <tgraf@suug.ch>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/rhashtable.h |   70 ++++++++++++++++++++++++++++++++++++---------
+ lib/rhashtable.c           |   10 ++++--
+ 2 files changed, 64 insertions(+), 16 deletions(-)
+
+--- a/include/linux/rhashtable.h
++++ b/include/linux/rhashtable.h
+@@ -343,7 +343,8 @@ int rhashtable_init(struct rhashtable *h
+ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
+                                           const void *key,
+                                           struct rhash_head *obj,
+-                                          struct bucket_table *old_tbl);
++                                          struct bucket_table *old_tbl,
++                                          void **data);
+ int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
+ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter);
+@@ -562,8 +563,11 @@ restart:
+       return NULL;
+ }
+-/* Internal function, please use rhashtable_insert_fast() instead */
+-static inline int __rhashtable_insert_fast(
++/* Internal function, please use rhashtable_insert_fast() instead. This
++ * function returns the existing element already in hashes in there is a clash,
++ * otherwise it returns an error via ERR_PTR().
++ */
++static inline void *__rhashtable_insert_fast(
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params)
+ {
+@@ -576,6 +580,7 @@ static inline int __rhashtable_insert_fa
+       spinlock_t *lock;
+       unsigned int elasticity;
+       unsigned int hash;
++      void *data = NULL;
+       int err;
+ restart:
+@@ -600,11 +605,14 @@ restart:
+       new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+       if (unlikely(new_tbl)) {
+-              tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
++              tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data);
+               if (!IS_ERR_OR_NULL(tbl))
+                       goto slow_path;
+               err = PTR_ERR(tbl);
++              if (err == -EEXIST)
++                      err = 0;
++
+               goto out;
+       }
+@@ -618,25 +626,25 @@ slow_path:
+               err = rhashtable_insert_rehash(ht, tbl);
+               rcu_read_unlock();
+               if (err)
+-                      return err;
++                      return ERR_PTR(err);
+               goto restart;
+       }
+-      err = -EEXIST;
++      err = 0;
+       elasticity = ht->elasticity;
+       rht_for_each(head, tbl, hash) {
+               if (key &&
+                   unlikely(!(params.obj_cmpfn ?
+                              params.obj_cmpfn(&arg, rht_obj(ht, head)) :
+-                             rhashtable_compare(&arg, rht_obj(ht, head)))))
++                             rhashtable_compare(&arg, rht_obj(ht, head))))) {
++                      data = rht_obj(ht, head);
+                       goto out;
++              }
+               if (!--elasticity)
+                       goto slow_path;
+       }
+-      err = 0;
+-
+       head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+       RCU_INIT_POINTER(obj->next, head);
+@@ -651,7 +659,7 @@ out:
+       spin_unlock_bh(lock);
+       rcu_read_unlock();
+-      return err;
++      return err ? ERR_PTR(err) : data;
+ }
+ /**
+@@ -674,7 +682,13 @@ static inline int rhashtable_insert_fast
+       struct rhashtable *ht, struct rhash_head *obj,
+       const struct rhashtable_params params)
+ {
+-      return __rhashtable_insert_fast(ht, NULL, obj, params);
++      void *ret;
++
++      ret = __rhashtable_insert_fast(ht, NULL, obj, params);
++      if (IS_ERR(ret))
++              return PTR_ERR(ret);
++
++      return ret == NULL ? 0 : -EEXIST;
+ }
+ /**
+@@ -703,11 +717,15 @@ static inline int rhashtable_lookup_inse
+       const struct rhashtable_params params)
+ {
+       const char *key = rht_obj(ht, obj);
++      void *ret;
+       BUG_ON(ht->p.obj_hashfn);
+-      return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
+-                                      params);
++      ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params);
++      if (IS_ERR(ret))
++              return PTR_ERR(ret);
++
++      return ret == NULL ? 0 : -EEXIST;
+ }
+ /**
+@@ -736,6 +754,32 @@ static inline int rhashtable_lookup_inse
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params)
+ {
++      void *ret;
++
++      BUG_ON(!ht->p.obj_hashfn || !key);
++
++      ret = __rhashtable_insert_fast(ht, key, obj, params);
++      if (IS_ERR(ret))
++              return PTR_ERR(ret);
++
++      return ret == NULL ? 0 : -EEXIST;
++}
++
++/**
++ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
++ * @ht:               hash table
++ * @obj:      pointer to hash head inside object
++ * @params:   hash table parameters
++ * @data:     pointer to element data already in hashes
++ *
++ * Just like rhashtable_lookup_insert_key(), but this function returns the
++ * object if it exists, NULL if it does not and the insertion was successful,
++ * and an ERR_PTR otherwise.
++ */
++static inline void *rhashtable_lookup_get_insert_key(
++      struct rhashtable *ht, const void *key, struct rhash_head *obj,
++      const struct rhashtable_params params)
++{
+       BUG_ON(!ht->p.obj_hashfn || !key);
+       return __rhashtable_insert_fast(ht, key, obj, params);
+--- a/lib/rhashtable.c
++++ b/lib/rhashtable.c
+@@ -441,7 +441,8 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_reha
+ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
+                                           const void *key,
+                                           struct rhash_head *obj,
+-                                          struct bucket_table *tbl)
++                                          struct bucket_table *tbl,
++                                          void **data)
+ {
+       struct rhash_head *head;
+       unsigned int hash;
+@@ -452,8 +453,11 @@ struct bucket_table *rhashtable_insert_s
+       spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+       err = -EEXIST;
+-      if (key && rhashtable_lookup_fast(ht, key, ht->p))
+-              goto exit;
++      if (key) {
++              *data = rhashtable_lookup_fast(ht, key, ht->p);
++              if (*data)
++                      goto exit;
++      }
+       err = -E2BIG;
+       if (unlikely(rht_grow_above_max(ht, tbl)))
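The new interface folds three outcomes into one return value using the kernel's ERR_PTR encoding: NULL for a successful insert, a valid pointer for the clashing element, and an encoded negative errno for failure. A self-contained userspace sketch of that convention with a one-element stand-in table (assumes a flat address space where the top 4095 values are never valid pointers, as the kernel macros do):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095UL

    static void *ERR_PTR(long err)      { return (void *)err; }
    static long  PTR_ERR(const void *p) { return (long)p; }
    static int   IS_ERR(const void *p)
    {
            return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    static int slot, slot_used;          /* one-element "hash table" */

    static void *lookup_get_insert(int key)
    {
            if (key < 0)
                    return ERR_PTR(-EINVAL);   /* error path */
            if (slot_used && slot == key)
                    return &slot;              /* clash: existing element */
            slot = key;
            slot_used = 1;
            return NULL;                       /* inserted successfully */
    }

    int main(void)
    {
            void *ret = lookup_get_insert(7);

            printf("first:  %s\n", ret ? "clash" : "inserted");
            ret = lookup_get_insert(7);
            printf("second: %s\n", ret && !IS_ERR(ret) ? "clash" : "inserted");
            ret = lookup_get_insert(-1);
            if (IS_ERR(ret))
                    printf("error:  %ld\n", PTR_ERR(ret));   /* -22 */
            return 0;
    }
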
diff --git a/queue-4.4/rhashtable-add-schedule-points.patch b/queue-4.4/rhashtable-add-schedule-points.patch
new file mode 100644
index 0000000..340318c
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:29:55 -0700
+Subject: rhashtable: add schedule points
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit ae6da1f503abb5a5081f9f6c4a6881de97830f3e upstream.
+
+Rehashing and destroying a large hash table takes a lot of time
+and happens in process context. It is safe to add cond_resched()
+in rhashtable_rehash_table() and rhashtable_free_and_destroy().
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/rhashtable.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/lib/rhashtable.c
++++ b/lib/rhashtable.c
+@@ -250,8 +250,10 @@ static int rhashtable_rehash_table(struc
+       if (!new_tbl)
+               return 0;
+-      for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
++      for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+               rhashtable_rehash_chain(ht, old_hash);
++              cond_resched();
++      }
+       /* Publish the new table pointer. */
+       rcu_assign_pointer(ht->tbl, new_tbl);
+@@ -842,6 +844,7 @@ void rhashtable_free_and_destroy(struct
+               for (i = 0; i < tbl->size; i++) {
+                       struct rhash_head *pos, *next;
++                      cond_resched();
+                       for (pos = rht_dereference(tbl->buckets[i], ht),
+                            next = !rht_is_a_nulls(pos) ?
+                                       rht_dereference(pos->next, ht) : NULL;
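Because both loops run in process context, each iteration can legally give up the CPU. A userspace analogue of the idea, with sched_yield() standing in for the kernel's cond_resched():

    #include <sched.h>
    #include <stdio.h>

    #define NR_BUCKETS 4096

    /* A long rehash/teardown loop yields the CPU once per chain so
     * other tasks can run, instead of hogging it for the whole pass. */
    int main(void)
    {
            for (unsigned int hash = 0; hash < NR_BUCKETS; hash++) {
                    /* ... rehash one chain / free one bucket here ... */
                    sched_yield();
            }
            printf("rehashed %d chains\n", NR_BUCKETS);
            return 0;
    }
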
diff --git a/queue-4.4/rhashtable-reorganize-struct-rhashtable-layout.patch b/queue-4.4/rhashtable-reorganize-struct-rhashtable-layout.patch
new file mode 100644
index 0000000..cbd07ff
--- /dev/null
@@ -0,0 +1,51 @@
+From foo@baz Thu Feb  7 12:09:56 CET 2019
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 10 Oct 2018 12:30:03 -0700
+Subject: rhashtable: reorganize struct rhashtable layout
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit e5d672a0780d9e7118caad4c171ec88b8299398d upstream.
+
+While under a frags DDoS I noticed unfortunate false sharing between
+@nelems and @params.automatic_shrinking.
+
+Move @nelems to the end of struct rhashtable so that the first cache line
+is shared between all cpus, because it is almost never dirtied.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/rhashtable.h |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/include/linux/rhashtable.h
++++ b/include/linux/rhashtable.h
+@@ -133,23 +133,23 @@ struct rhashtable_params {
+ /**
+  * struct rhashtable - Hash table handle
+  * @tbl: Bucket table
+- * @nelems: Number of elements in table
+  * @key_len: Key length for hashfn
+  * @elasticity: Maximum chain length before rehash
+  * @p: Configuration parameters
+  * @run_work: Deferred worker to expand/shrink asynchronously
+  * @mutex: Mutex to protect current/future table swapping
+  * @lock: Spin lock to protect walker list
++ * @nelems: Number of elements in table
+  */
+ struct rhashtable {
+       struct bucket_table __rcu       *tbl;
+-      atomic_t                        nelems;
+       unsigned int                    key_len;
+       unsigned int                    elasticity;
+       struct rhashtable_params        p;
+       struct work_struct              run_work;
+       struct mutex                    mutex;
+       spinlock_t                      lock;
++      atomic_t                        nelems;
+ };
+ /**
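The fix is purely a layout change: keep the read-mostly fields together in the first cache line and move the write-hot counter away from them. A userspace model follows (the kernel patch only reorders fields; the explicit alignas(64) here just makes the cache-line boundary visible, and 64 bytes is an assumed line size):

    #include <stdalign.h>
    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct toy_rhashtable {
            void         *tbl;          /* read-mostly */
            unsigned int  key_len;      /* read-mostly */
            unsigned int  elasticity;   /* read-mostly */
            /* ... params, work item, locks ... */
            alignas(64) atomic_uint nelems;  /* dirtied on every insert/remove */
    };

    int main(void)
    {
            /* The hot counter no longer shares a cache line with the
             * configuration fields that every lookup reads. */
            printf("nelems lives at offset %zu\n",
                   offsetof(struct toy_rhashtable, nelems));
            return 0;
    }
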
diff --git a/queue-4.4/series b/queue-4.4/series
new file mode 100644
index 0000000..cfaf490
--- /dev/null
@@ -0,0 +1,33 @@
+inet-frags-change-inet_frags_init_net-return-value.patch
+inet-frags-add-a-pointer-to-struct-netns_frags.patch
+inet-frags-refactor-ipfrag_init.patch
+inet-frags-refactor-ipv6_frag_init.patch
+inet-frags-refactor-lowpan_net_frag_init.patch
+rhashtable-add-rhashtable_lookup_get_insert_key.patch
+rhashtable-add-rhashtable_lookup.patch
+rhashtable-add-schedule-points.patch
+inet-frags-use-rhashtables-for-reassembly-units.patch
+net-ieee802154-6lowpan-fix-frag-reassembly.patch
+ipfrag-really-prevent-allocation-on-netns-exit.patch
+inet-frags-remove-some-helpers.patch
+inet-frags-get-rif-of-inet_frag_evicting.patch
+inet-frags-remove-inet_frag_maybe_warn_overflow.patch
+inet-frags-break-the-2gb-limit-for-frags-storage.patch
+inet-frags-do-not-clone-skb-in-ip_expire.patch
+ipv6-frags-rewrite-ip6_expire_frag_queue.patch
+rhashtable-reorganize-struct-rhashtable-layout.patch
+inet-frags-reorganize-struct-netns_frags.patch
+inet-frags-get-rid-of-ipfrag_skb_cb-frag_cb.patch
+inet-frags-fix-ip6frag_low_thresh-boundary.patch
+ip-discard-ipv4-datagrams-with-overlapping-segments.patch
+net-modify-skb_rbtree_purge-to-return-the-truesize-of-all-purged-skbs.patch
+ipv6-defrag-drop-non-last-frags-smaller-than-min-mtu.patch
+net-pskb_trim_rcsum-and-checksum_complete-are-friends.patch
+ip-use-rb-trees-for-ip-frag-queue.patch
+ip-add-helpers-to-process-in-order-fragments-faster.patch
+ip-process-in-order-fragments-efficiently.patch
+ip-frags-fix-crash-in-ip_do_fragment.patch
+ipv4-frags-precedence-bug-in-ip_expire.patch
+inet-frags-better-deal-with-smp-races.patch
+net-fix-pskb_trim_rcsum_slow-with-odd-trim-offset.patch
+net-ipv4-do-not-handle-duplicate-fragments-as-overlapping.patch