From: Greg Kroah-Hartman
Date: Tue, 30 Apr 2019 10:56:25 +0000 (+0200)
Subject: 4.9-stable patches
X-Git-Tag: v4.9.172~5
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7e3ded1b5cb3616c2e9567fa6d5c5fa82a6dcb63;p=thirdparty%2Fkernel%2Fstable-queue.git

4.9-stable patches

added patches:
	ipv6-frags-fix-a-lockdep-false-positive.patch
	ipv6-remove-dependency-of-nf_defrag_ipv6-on-ipv6-module.patch
	net-ip-defrag-encapsulate-rbtree-defrag-code-into-callable-functions.patch
	net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch
	net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch
---

diff --git a/queue-4.9/ipv6-frags-fix-a-lockdep-false-positive.patch b/queue-4.9/ipv6-frags-fix-a-lockdep-false-positive.patch
new file mode 100644
index 00000000000..2d2ed145ccd
--- /dev/null
+++ b/queue-4.9/ipv6-frags-fix-a-lockdep-false-positive.patch
@@ -0,0 +1,103 @@
+From foo@baz Tue 30 Apr 2019 12:43:33 PM CEST
+From: Peter Oskolkov
+Date: Fri, 26 Apr 2019 08:41:04 -0700
+Subject: ipv6: frags: fix a lockdep false positive
+To: Greg Kroah-Hartman , stable@vger.kernel.org, netdev@vger.kernel.org
+Cc: Peter Oskolkov , David Miller , Eric Dumazet , Sasha Levin , Captain Wiggum , Lars Persson
+Message-ID: <20190426154108.52277-2-posk@google.com>
+
+From: Eric Dumazet
+
+[ Upstream commit 415787d7799f4fccbe8d49cb0b8e5811be6b0389 ]
+
+lockdep does not know that the locks used by IPv4 defrag
+and IPv6 reassembly units are of different classes.
+
+It complains because of the following chains:
+
+1) sch_direct_xmit() (lock txq->_xmit_lock)
+   dev_hard_start_xmit()
+    xmit_one()
+     dev_queue_xmit_nit()
+      packet_rcv_fanout()
+       ip_check_defrag()
+        ip_defrag()
+         spin_lock() (lock frag queue spinlock)
+
+2) ip6_input_finish()
+    ipv6_frag_rcv() (lock frag queue spinlock)
+     ip6_frag_queue()
+      icmpv6_param_prob() (lock txq->_xmit_lock at some point)
+
+We could add lockdep annotations, but we can also make sure IPv6
+calls icmpv6_param_prob() only after the release of the frag queue
+spinlock, since this naturally makes the frag queue spinlock a leaf
+in the lock hierarchy.
+
+Signed-off-by: Eric Dumazet
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv6/reassembly.c |   23 ++++++++++++-----------
+ 1 file changed, 12 insertions(+), 11 deletions(-)
+
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -169,7 +169,8 @@ fq_find(struct net *net, __be32 id, cons
+ }
+ 
+ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
+-			  struct frag_hdr *fhdr, int nhoff)
++			  struct frag_hdr *fhdr, int nhoff,
++			  u32 *prob_offset)
+ {
+ 	struct sk_buff *prev, *next;
+ 	struct net_device *dev;
+@@ -185,11 +186,7 @@ static int ip6_frag_qu
+ 		  ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
+ 
+ 	if ((unsigned int)end > IPV6_MAXPLEN) {
+-		__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+-				IPSTATS_MIB_INHDRERRORS);
+-		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+-				  ((u8 *)&fhdr->frag_off -
+-				   skb_network_header(skb)));
++		*prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb);
+ 		return -1;
+ 	}
+ 
+@@ -220,10 +217,7 @@ static int ip6_frag_qu
+ 			/* RFC2460 says always send parameter problem in
+ 			 * this case. -DaveM
+ 			 */
+-			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+-					IPSTATS_MIB_INHDRERRORS);
+-			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+-					  offsetof(struct ipv6hdr, payload_len));
++			*prob_offset = offsetof(struct ipv6hdr, payload_len);
+ 			return -1;
+ 		}
+ 		if (end > fq->q.len) {
+@@ -524,15 +518,22 @@ static int ipv6_frag_rcv
+ 	iif = skb->dev ? skb->dev->ifindex : 0;
+ 	fq = fq_find(net, fhdr->identification, hdr, iif);
+ 	if (fq) {
++		u32 prob_offset = 0;
+ 		int ret;
+ 
+ 		spin_lock(&fq->q.lock);
+ 
+ 		fq->iif = iif;
+-		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
++		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff,
++				     &prob_offset);
+ 
+ 		spin_unlock(&fq->q.lock);
+ 		inet_frag_put(&fq->q);
++		if (prob_offset) {
++			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
++					IPSTATS_MIB_INHDRERRORS);
++			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset);
++		}
+ 		return ret;
+ 	}
+ 
diff --git a/queue-4.9/ipv6-remove-dependency-of-nf_defrag_ipv6-on-ipv6-module.patch b/queue-4.9/ipv6-remove-dependency-of-nf_defrag_ipv6-on-ipv6-module.patch
new file mode 100644
index 00000000000..32b74ad87b7
--- /dev/null
+++ b/queue-4.9/ipv6-remove-dependency-of-nf_defrag_ipv6-on-ipv6-module.patch
@@ -0,0 +1,400 @@
+From foo@baz Tue 30 Apr 2019 12:43:33 PM CEST
+From: Peter Oskolkov
+Date: Fri, 26 Apr 2019 08:41:06 -0700
+Subject: ipv6: remove dependency of nf_defrag_ipv6 on ipv6 module
+To: Greg Kroah-Hartman , stable@vger.kernel.org, netdev@vger.kernel.org
+Cc: Peter Oskolkov , David Miller , Eric Dumazet , Sasha Levin , Captain Wiggum , Lars Persson , Florian Westphal , Pablo Neira Ayuso
+Message-ID: <20190426154108.52277-4-posk@google.com>
+
+From: Florian Westphal
+
+[ Upstream commit 70b095c84326640eeacfd69a411db8fc36e8ab1a ]
+
+IPV6=m
+DEFRAG_IPV6=m
+CONNTRACK=y yields:
+
+net/netfilter/nf_conntrack_proto.o: In function `nf_ct_netns_do_get':
+net/netfilter/nf_conntrack_proto.c:802: undefined reference to `nf_defrag_ipv6_enable'
+net/netfilter/nf_conntrack_proto.o:(.rodata+0x640): undefined reference to `nf_conntrack_l4proto_icmpv6'
+
+Setting DEFRAG_IPV6=y causes undefined references to ip6_rhash_params,
+ip6_frag_init and ip6_expire_frag_queue, so IPV6=y would need to be
+forced too.
+
+This patch gets rid of the 'followup linker error' by removing
+the dependency of netfilter ipv6 defrag on ipv6.ko symbols.
+
+Shared code is placed into a header, then used from both.
+ +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman +--- + include/net/ipv6.h | 29 -------- + include/net/ipv6_frag.h | 104 ++++++++++++++++++++++++++++++ + net/ieee802154/6lowpan/reassembly.c | 2 + net/ipv6/netfilter/nf_conntrack_reasm.c | 17 +++- + net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 3 + net/ipv6/reassembly.c | 92 ++------------------------ + net/openvswitch/conntrack.c | 1 + 7 files changed, 126 insertions(+), 122 deletions(-) + create mode 100644 include/net/ipv6_frag.h + +--- a/include/net/ipv6.h ++++ b/include/net/ipv6.h +@@ -511,35 +511,6 @@ static inline bool ipv6_prefix_equal(con + } + #endif + +-struct inet_frag_queue; +- +-enum ip6_defrag_users { +- IP6_DEFRAG_LOCAL_DELIVER, +- IP6_DEFRAG_CONNTRACK_IN, +- __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, +- IP6_DEFRAG_CONNTRACK_OUT, +- __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, +- IP6_DEFRAG_CONNTRACK_BRIDGE_IN, +- __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, +-}; +- +-void ip6_frag_init(struct inet_frag_queue *q, const void *a); +-extern const struct rhashtable_params ip6_rhash_params; +- +-/* +- * Equivalent of ipv4 struct ip +- */ +-struct frag_queue { +- struct inet_frag_queue q; +- +- int iif; +- unsigned int csum; +- __u16 nhoffset; +- u8 ecn; +-}; +- +-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); +- + static inline bool ipv6_addr_any(const struct in6_addr *a) + { + #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 +--- /dev/null ++++ b/include/net/ipv6_frag.h +@@ -0,0 +1,104 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _IPV6_FRAG_H ++#define _IPV6_FRAG_H ++#include ++#include ++#include ++#include ++ ++enum ip6_defrag_users { ++ IP6_DEFRAG_LOCAL_DELIVER, ++ IP6_DEFRAG_CONNTRACK_IN, ++ __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, ++ IP6_DEFRAG_CONNTRACK_OUT, ++ __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, ++ IP6_DEFRAG_CONNTRACK_BRIDGE_IN, ++ __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, ++}; ++ ++/* ++ * Equivalent of ipv4 struct ip ++ */ ++struct frag_queue { ++ struct inet_frag_queue q; ++ ++ int iif; ++ __u16 nhoffset; ++ u8 ecn; ++}; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static inline void ip6frag_init(struct inet_frag_queue *q, const void *a) ++{ ++ struct frag_queue *fq = container_of(q, struct frag_queue, q); ++ const struct frag_v6_compare_key *key = a; ++ ++ q->key.v6 = *key; ++ fq->ecn = 0; ++} ++ ++static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed) ++{ ++ return jhash2(data, ++ sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); ++} ++ ++static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed) ++{ ++ const struct inet_frag_queue *fq = data; ++ ++ return jhash2((const u32 *)&fq->key.v6, ++ sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); ++} ++ ++static inline int ++ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) ++{ ++ const struct frag_v6_compare_key *key = arg->key; ++ const struct inet_frag_queue *fq = ptr; ++ ++ return !!memcmp(&fq->key, key, sizeof(*key)); ++} ++ ++static inline void ++ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) ++{ ++ struct net_device *dev = NULL; ++ struct sk_buff *head; ++ ++ rcu_read_lock(); ++ spin_lock(&fq->q.lock); ++ ++ if (fq->q.flags & INET_FRAG_COMPLETE) ++ goto out; ++ ++ inet_frag_kill(&fq->q); ++ ++ dev = 
dev_get_by_index_rcu(net, fq->iif); ++ if (!dev) ++ goto out; ++ ++ __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); ++ __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); ++ ++ /* Don't send error if the first segment did not arrive. */ ++ head = fq->q.fragments; ++ if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) ++ goto out; ++ ++ head->dev = dev; ++ skb_get(head); ++ spin_unlock(&fq->q.lock); ++ ++ icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); ++ kfree_skb(head); ++ goto out_rcu_unlock; ++ ++out: ++ spin_unlock(&fq->q.lock); ++out_rcu_unlock: ++ rcu_read_unlock(); ++ inet_frag_put(&fq->q); ++} ++#endif ++#endif +--- a/net/ieee802154/6lowpan/reassembly.c ++++ b/net/ieee802154/6lowpan/reassembly.c +@@ -25,7 +25,7 @@ + + #include + #include +-#include ++#include + #include + + #include "6lowpan_i.h" +--- a/net/ipv6/netfilter/nf_conntrack_reasm.c ++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c +@@ -33,9 +33,8 @@ + + #include + #include +-#include ++#include + +-#include + #include + #include + #include +@@ -158,7 +157,7 @@ static void nf_ct_frag6_expire(unsigned + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + net = container_of(fq->q.net, struct net, nf_frag.frags); + +- ip6_expire_frag_queue(net, fq); ++ ip6frag_expire_frag_queue(net, fq); + } + + /* Creation primitives. */ +@@ -634,16 +633,24 @@ static struct pernet_operations nf_ct_ne + .exit = nf_ct_net_exit, + }; + ++static const struct rhashtable_params nfct_rhash_params = { ++ .head_offset = offsetof(struct inet_frag_queue, node), ++ .hashfn = ip6frag_key_hashfn, ++ .obj_hashfn = ip6frag_obj_hashfn, ++ .obj_cmpfn = ip6frag_obj_cmpfn, ++ .automatic_shrinking = true, ++}; ++ + int nf_ct_frag6_init(void) + { + int ret = 0; + +- nf_frags.constructor = ip6_frag_init; ++ nf_frags.constructor = ip6frag_init; + nf_frags.destructor = NULL; + nf_frags.qsize = sizeof(struct frag_queue); + nf_frags.frag_expire = nf_ct_frag6_expire; + nf_frags.frags_cache_name = nf_frags_cache_name; +- nf_frags.rhash_params = ip6_rhash_params; ++ nf_frags.rhash_params = nfct_rhash_params; + ret = inet_frags_init(&nf_frags); + if (ret) + goto out; +--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c ++++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +@@ -14,8 +14,7 @@ + #include + #include + #include +-#include +-#include ++#include + + #include + #include +--- a/net/ipv6/reassembly.c ++++ b/net/ipv6/reassembly.c +@@ -57,7 +57,7 @@ + #include + #include + #include +-#include ++#include + #include + + static const char ip6_frag_cache_name[] = "ip6-frags"; +@@ -79,61 +79,6 @@ static struct inet_frags ip6_frags; + static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, + struct net_device *dev); + +-void ip6_frag_init(struct inet_frag_queue *q, const void *a) +-{ +- struct frag_queue *fq = container_of(q, struct frag_queue, q); +- const struct frag_v6_compare_key *key = a; +- +- q->key.v6 = *key; +- fq->ecn = 0; +-} +-EXPORT_SYMBOL(ip6_frag_init); +- +-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) +-{ +- struct net_device *dev = NULL; +- struct sk_buff *head; +- +- rcu_read_lock(); +- spin_lock(&fq->q.lock); +- +- if (fq->q.flags & INET_FRAG_COMPLETE) +- goto out; +- +- inet_frag_kill(&fq->q); +- +- dev = dev_get_by_index_rcu(net, fq->iif); +- if (!dev) +- goto out; +- +- __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); +- __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); +- +- /* Don't send error if the first segment did 
not arrive. */ +- head = fq->q.fragments; +- if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) +- goto out; +- +- /* But use as source device on which LAST ARRIVED +- * segment was received. And do not use fq->dev +- * pointer directly, device might already disappeared. +- */ +- head->dev = dev; +- skb_get(head); +- spin_unlock(&fq->q.lock); +- +- icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); +- kfree_skb(head); +- goto out_rcu_unlock; +- +-out: +- spin_unlock(&fq->q.lock); +-out_rcu_unlock: +- rcu_read_unlock(); +- inet_frag_put(&fq->q); +-} +-EXPORT_SYMBOL(ip6_expire_frag_queue); +- + static void ip6_frag_expire(unsigned long data) + { + struct frag_queue *fq; +@@ -142,7 +87,7 @@ static void ip6_frag_expire(unsigned lon + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + net = container_of(fq->q.net, struct net, ipv6.frags); + +- ip6_expire_frag_queue(net, fq); ++ ip6frag_expire_frag_queue(net, fq); + } + + static struct frag_queue * +@@ -701,42 +646,19 @@ static struct pernet_operations ip6_frag + .exit = ipv6_frags_exit_net, + }; + +-static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) +-{ +- return jhash2(data, +- sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +-} +- +-static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) +-{ +- const struct inet_frag_queue *fq = data; +- +- return jhash2((const u32 *)&fq->key.v6, +- sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +-} +- +-static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +-{ +- const struct frag_v6_compare_key *key = arg->key; +- const struct inet_frag_queue *fq = ptr; +- +- return !!memcmp(&fq->key, key, sizeof(*key)); +-} +- +-const struct rhashtable_params ip6_rhash_params = { ++static const struct rhashtable_params ip6_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), +- .hashfn = ip6_key_hashfn, +- .obj_hashfn = ip6_obj_hashfn, +- .obj_cmpfn = ip6_obj_cmpfn, ++ .hashfn = ip6frag_key_hashfn, ++ .obj_hashfn = ip6frag_obj_hashfn, ++ .obj_cmpfn = ip6frag_obj_cmpfn, + .automatic_shrinking = true, + }; +-EXPORT_SYMBOL(ip6_rhash_params); + + int __init ipv6_frag_init(void) + { + int ret; + +- ip6_frags.constructor = ip6_frag_init; ++ ip6_frags.constructor = ip6frag_init; + ip6_frags.destructor = NULL; + ip6_frags.qsize = sizeof(struct frag_queue); + ip6_frags.frag_expire = ip6_frag_expire; +--- a/net/openvswitch/conntrack.c ++++ b/net/openvswitch/conntrack.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_NF_NAT_NEEDED + #include diff --git a/queue-4.9/net-ip-defrag-encapsulate-rbtree-defrag-code-into-callable-functions.patch b/queue-4.9/net-ip-defrag-encapsulate-rbtree-defrag-code-into-callable-functions.patch new file mode 100644 index 00000000000..69a7ef55963 --- /dev/null +++ b/queue-4.9/net-ip-defrag-encapsulate-rbtree-defrag-code-into-callable-functions.patch @@ -0,0 +1,776 @@ +From foo@baz Tue 30 Apr 2019 12:43:33 PM CEST +From: Peter Oskolkov +Date: Fri, 26 Apr 2019 08:41:05 -0700 +Subject: net: IP defrag: encapsulate rbtree defrag code into callable functions +To: Greg Kroah-Hartman , stable@vger.kernel.org, netdev@vger.kernel.org +Cc: Peter Oskolkov , David Miller , Eric Dumazet , Sasha Levin , Captain Wiggum , Lars Persson , Peter Oskolkov , Florian Westphal , Tom Herbert +Message-ID: <20190426154108.52277-3-posk@google.com> + +From: Peter Oskolkov + +[ Upstream commit c23f35d19db3b36ffb9e04b08f1d91565d15f84f ] + +This is a refactoring patch: without changing 
runtime behavior, +it moves rbtree-related code from IPv4-specific files/functions +into .h/.c defrag files shared with IPv6 defragmentation code. + +Signed-off-by: Peter Oskolkov +Cc: Eric Dumazet +Cc: Florian Westphal +Cc: Tom Herbert +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/inet_frag.h | 16 ++ + net/ipv4/inet_fragment.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++ + net/ipv4/ip_fragment.c | 295 +++++------------------------------------------ + 3 files changed, 342 insertions(+), 262 deletions(-) + +--- a/include/net/inet_frag.h ++++ b/include/net/inet_frag.h +@@ -76,8 +76,8 @@ struct inet_frag_queue { + struct timer_list timer; + spinlock_t lock; + atomic_t refcnt; +- struct sk_buff *fragments; /* Used in IPv6. */ +- struct rb_root rb_fragments; /* Used in IPv4. */ ++ struct sk_buff *fragments; /* used in 6lopwpan IPv6. */ ++ struct rb_root rb_fragments; /* Used in IPv4/IPv6. */ + struct sk_buff *fragments_tail; + struct sk_buff *last_run_head; + ktime_t stamp; +@@ -152,4 +152,16 @@ static inline void add_frag_mem_limit(st + + extern const u8 ip_frag_ecn_table[16]; + ++/* Return values of inet_frag_queue_insert() */ ++#define IPFRAG_OK 0 ++#define IPFRAG_DUP 1 ++#define IPFRAG_OVERLAP 2 ++int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, ++ int offset, int end); ++void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, ++ struct sk_buff *parent); ++void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, ++ void *reasm_data); ++struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); ++ + #endif +--- a/net/ipv4/inet_fragment.c ++++ b/net/ipv4/inet_fragment.c +@@ -24,6 +24,62 @@ + #include + #include + #include ++#include ++#include ++ ++/* Use skb->cb to track consecutive/adjacent fragments coming at ++ * the end of the queue. Nodes in the rb-tree queue will ++ * contain "runs" of one or more adjacent fragments. ++ * ++ * Invariants: ++ * - next_frag is NULL at the tail of a "run"; ++ * - the head of a "run" has the sum of all fragment lengths in frag_run_len. ++ */ ++struct ipfrag_skb_cb { ++ union { ++ struct inet_skb_parm h4; ++ struct inet6_skb_parm h6; ++ }; ++ struct sk_buff *next_frag; ++ int frag_run_len; ++}; ++ ++#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) ++ ++static void fragcb_clear(struct sk_buff *skb) ++{ ++ RB_CLEAR_NODE(&skb->rbnode); ++ FRAG_CB(skb)->next_frag = NULL; ++ FRAG_CB(skb)->frag_run_len = skb->len; ++} ++ ++/* Append skb to the last "run". */ ++static void fragrun_append_to_last(struct inet_frag_queue *q, ++ struct sk_buff *skb) ++{ ++ fragcb_clear(skb); ++ ++ FRAG_CB(q->last_run_head)->frag_run_len += skb->len; ++ FRAG_CB(q->fragments_tail)->next_frag = skb; ++ q->fragments_tail = skb; ++} ++ ++/* Create a new "run" with the skb. */ ++static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) ++{ ++ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); ++ fragcb_clear(skb); ++ ++ if (q->last_run_head) ++ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, ++ &q->last_run_head->rbnode.rb_right); ++ else ++ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); ++ rb_insert_color(&skb->rbnode, &q->rb_fragments); ++ ++ q->fragments_tail = skb; ++ q->last_run_head = skb; ++} + + /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements + * Value : 0xff if frame should be dropped. 
+@@ -122,6 +178,28 @@ static void inet_frag_destroy_rcu(struct + kmem_cache_free(f->frags_cachep, q); + } + ++unsigned int inet_frag_rbtree_purge(struct rb_root *root) ++{ ++ struct rb_node *p = rb_first(root); ++ unsigned int sum = 0; ++ ++ while (p) { ++ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); ++ ++ p = rb_next(p); ++ rb_erase(&skb->rbnode, root); ++ while (skb) { ++ struct sk_buff *next = FRAG_CB(skb)->next_frag; ++ ++ sum += skb->truesize; ++ kfree_skb(skb); ++ skb = next; ++ } ++ } ++ return sum; ++} ++EXPORT_SYMBOL(inet_frag_rbtree_purge); ++ + void inet_frag_destroy(struct inet_frag_queue *q) + { + struct sk_buff *fp; +@@ -223,3 +301,218 @@ struct inet_frag_queue *inet_frag_find(s + return fq; + } + EXPORT_SYMBOL(inet_frag_find); ++ ++int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, ++ int offset, int end) ++{ ++ struct sk_buff *last = q->fragments_tail; ++ ++ /* RFC5722, Section 4, amended by Errata ID : 3089 ++ * When reassembling an IPv6 datagram, if ++ * one or more its constituent fragments is determined to be an ++ * overlapping fragment, the entire datagram (and any constituent ++ * fragments) MUST be silently discarded. ++ * ++ * Duplicates, however, should be ignored (i.e. skb dropped, but the ++ * queue/fragments kept for later reassembly). ++ */ ++ if (!last) ++ fragrun_create(q, skb); /* First fragment. */ ++ else if (last->ip_defrag_offset + last->len < end) { ++ /* This is the common case: skb goes to the end. */ ++ /* Detect and discard overlaps. */ ++ if (offset < last->ip_defrag_offset + last->len) ++ return IPFRAG_OVERLAP; ++ if (offset == last->ip_defrag_offset + last->len) ++ fragrun_append_to_last(q, skb); ++ else ++ fragrun_create(q, skb); ++ } else { ++ /* Binary search. Note that skb can become the first fragment, ++ * but not the last (covered above). ++ */ ++ struct rb_node **rbn, *parent; ++ ++ rbn = &q->rb_fragments.rb_node; ++ do { ++ struct sk_buff *curr; ++ int curr_run_end; ++ ++ parent = *rbn; ++ curr = rb_to_skb(parent); ++ curr_run_end = curr->ip_defrag_offset + ++ FRAG_CB(curr)->frag_run_len; ++ if (end <= curr->ip_defrag_offset) ++ rbn = &parent->rb_left; ++ else if (offset >= curr_run_end) ++ rbn = &parent->rb_right; ++ else if (offset >= curr->ip_defrag_offset && ++ end <= curr_run_end) ++ return IPFRAG_DUP; ++ else ++ return IPFRAG_OVERLAP; ++ } while (*rbn); ++ /* Here we have parent properly set, and rbn pointing to ++ * one of its NULL left/right children. Insert skb. 
++ */ ++ fragcb_clear(skb); ++ rb_link_node(&skb->rbnode, parent, rbn); ++ rb_insert_color(&skb->rbnode, &q->rb_fragments); ++ } ++ ++ skb->ip_defrag_offset = offset; ++ ++ return IPFRAG_OK; ++} ++EXPORT_SYMBOL(inet_frag_queue_insert); ++ ++void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, ++ struct sk_buff *parent) ++{ ++ struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); ++ struct sk_buff **nextp; ++ int delta; ++ ++ if (head != skb) { ++ fp = skb_clone(skb, GFP_ATOMIC); ++ if (!fp) ++ return NULL; ++ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; ++ if (RB_EMPTY_NODE(&skb->rbnode)) ++ FRAG_CB(parent)->next_frag = fp; ++ else ++ rb_replace_node(&skb->rbnode, &fp->rbnode, ++ &q->rb_fragments); ++ if (q->fragments_tail == skb) ++ q->fragments_tail = fp; ++ skb_morph(skb, head); ++ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; ++ rb_replace_node(&head->rbnode, &skb->rbnode, ++ &q->rb_fragments); ++ consume_skb(head); ++ head = skb; ++ } ++ WARN_ON(head->ip_defrag_offset != 0); ++ ++ delta = -head->truesize; ++ ++ /* Head of list must not be cloned. */ ++ if (skb_unclone(head, GFP_ATOMIC)) ++ return NULL; ++ ++ delta += head->truesize; ++ if (delta) ++ add_frag_mem_limit(q->net, delta); ++ ++ /* If the first fragment is fragmented itself, we split ++ * it to two chunks: the first with data and paged part ++ * and the second, holding only fragments. ++ */ ++ if (skb_has_frag_list(head)) { ++ struct sk_buff *clone; ++ int i, plen = 0; ++ ++ clone = alloc_skb(0, GFP_ATOMIC); ++ if (!clone) ++ return NULL; ++ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; ++ skb_frag_list_init(head); ++ for (i = 0; i < skb_shinfo(head)->nr_frags; i++) ++ plen += skb_frag_size(&skb_shinfo(head)->frags[i]); ++ clone->data_len = head->data_len - plen; ++ clone->len = clone->data_len; ++ head->truesize += clone->truesize; ++ clone->csum = 0; ++ clone->ip_summed = head->ip_summed; ++ add_frag_mem_limit(q->net, clone->truesize); ++ skb_shinfo(head)->frag_list = clone; ++ nextp = &clone->next; ++ } else { ++ nextp = &skb_shinfo(head)->frag_list; ++ } ++ ++ return nextp; ++} ++EXPORT_SYMBOL(inet_frag_reasm_prepare); ++ ++void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, ++ void *reasm_data) ++{ ++ struct sk_buff **nextp = (struct sk_buff **)reasm_data; ++ struct rb_node *rbn; ++ struct sk_buff *fp; ++ ++ skb_push(head, head->data - skb_network_header(head)); ++ ++ /* Traverse the tree in order, to build frag_list. */ ++ fp = FRAG_CB(head)->next_frag; ++ rbn = rb_next(&head->rbnode); ++ rb_erase(&head->rbnode, &q->rb_fragments); ++ while (rbn || fp) { ++ /* fp points to the next sk_buff in the current run; ++ * rbn points to the next run. ++ */ ++ /* Go through the current run. */ ++ while (fp) { ++ *nextp = fp; ++ nextp = &fp->next; ++ fp->prev = NULL; ++ memset(&fp->rbnode, 0, sizeof(fp->rbnode)); ++ fp->sk = NULL; ++ head->data_len += fp->len; ++ head->len += fp->len; ++ if (head->ip_summed != fp->ip_summed) ++ head->ip_summed = CHECKSUM_NONE; ++ else if (head->ip_summed == CHECKSUM_COMPLETE) ++ head->csum = csum_add(head->csum, fp->csum); ++ head->truesize += fp->truesize; ++ fp = FRAG_CB(fp)->next_frag; ++ } ++ /* Move to the next run. 
*/ ++ if (rbn) { ++ struct rb_node *rbnext = rb_next(rbn); ++ ++ fp = rb_to_skb(rbn); ++ rb_erase(rbn, &q->rb_fragments); ++ rbn = rbnext; ++ } ++ } ++ sub_frag_mem_limit(q->net, head->truesize); ++ ++ *nextp = NULL; ++ head->next = NULL; ++ head->prev = NULL; ++ head->tstamp = q->stamp; ++} ++EXPORT_SYMBOL(inet_frag_reasm_finish); ++ ++struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) ++{ ++ struct sk_buff *head; ++ ++ if (q->fragments) { ++ head = q->fragments; ++ q->fragments = head->next; ++ } else { ++ struct sk_buff *skb; ++ ++ head = skb_rb_first(&q->rb_fragments); ++ if (!head) ++ return NULL; ++ skb = FRAG_CB(head)->next_frag; ++ if (skb) ++ rb_replace_node(&head->rbnode, &skb->rbnode, ++ &q->rb_fragments); ++ else ++ rb_erase(&head->rbnode, &q->rb_fragments); ++ memset(&head->rbnode, 0, sizeof(head->rbnode)); ++ barrier(); ++ } ++ if (head == q->fragments_tail) ++ q->fragments_tail = NULL; ++ ++ sub_frag_mem_limit(q->net, head->truesize); ++ ++ return head; ++} ++EXPORT_SYMBOL(inet_frag_pull_head); +--- a/net/ipv4/ip_fragment.c ++++ b/net/ipv4/ip_fragment.c +@@ -56,57 +56,6 @@ + */ + static const char ip_frag_cache_name[] = "ip4-frags"; + +-/* Use skb->cb to track consecutive/adjacent fragments coming at +- * the end of the queue. Nodes in the rb-tree queue will +- * contain "runs" of one or more adjacent fragments. +- * +- * Invariants: +- * - next_frag is NULL at the tail of a "run"; +- * - the head of a "run" has the sum of all fragment lengths in frag_run_len. +- */ +-struct ipfrag_skb_cb { +- struct inet_skb_parm h; +- struct sk_buff *next_frag; +- int frag_run_len; +-}; +- +-#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) +- +-static void ip4_frag_init_run(struct sk_buff *skb) +-{ +- BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); +- +- FRAG_CB(skb)->next_frag = NULL; +- FRAG_CB(skb)->frag_run_len = skb->len; +-} +- +-/* Append skb to the last "run". */ +-static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, +- struct sk_buff *skb) +-{ +- RB_CLEAR_NODE(&skb->rbnode); +- FRAG_CB(skb)->next_frag = NULL; +- +- FRAG_CB(q->last_run_head)->frag_run_len += skb->len; +- FRAG_CB(q->fragments_tail)->next_frag = skb; +- q->fragments_tail = skb; +-} +- +-/* Create a new "run" with the skb. */ +-static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) +-{ +- if (q->last_run_head) +- rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, +- &q->last_run_head->rbnode.rb_right); +- else +- rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); +- rb_insert_color(&skb->rbnode, &q->rb_fragments); +- +- ip4_frag_init_run(skb); +- q->fragments_tail = skb; +- q->last_run_head = skb; +-} +- + /* Describe an entry in the "incomplete datagrams" queue. */ + struct ipq { + struct inet_frag_queue q; +@@ -210,27 +159,9 @@ static void ip_expire(unsigned long arg) + * pull the head out of the tree in order to be able to + * deal with head->dev. 
+ */ +- if (qp->q.fragments) { +- head = qp->q.fragments; +- qp->q.fragments = head->next; +- } else { +- head = skb_rb_first(&qp->q.rb_fragments); +- if (!head) +- goto out; +- if (FRAG_CB(head)->next_frag) +- rb_replace_node(&head->rbnode, +- &FRAG_CB(head)->next_frag->rbnode, +- &qp->q.rb_fragments); +- else +- rb_erase(&head->rbnode, &qp->q.rb_fragments); +- memset(&head->rbnode, 0, sizeof(head->rbnode)); +- barrier(); +- } +- if (head == qp->q.fragments_tail) +- qp->q.fragments_tail = NULL; +- +- sub_frag_mem_limit(qp->q.net, head->truesize); +- ++ head = inet_frag_pull_head(&qp->q); ++ if (!head) ++ goto out; + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out; +@@ -343,12 +274,10 @@ static int ip_frag_reinit(struct ipq *qp + static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) + { + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); +- struct rb_node **rbn, *parent; +- struct sk_buff *skb1, *prev_tail; +- int ihl, end, skb1_run_end; ++ int ihl, end, flags, offset; ++ struct sk_buff *prev_tail; + struct net_device *dev; + unsigned int fragsize; +- int flags, offset; + int err = -ENOENT; + u8 ecn; + +@@ -380,7 +309,7 @@ static int ip_frag_queue(struct ipq *qp, + */ + if (end < qp->q.len || + ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) +- goto err; ++ goto discard_qp; + qp->q.flags |= INET_FRAG_LAST_IN; + qp->q.len = end; + } else { +@@ -392,82 +321,33 @@ static int ip_frag_queue(struct ipq *qp, + if (end > qp->q.len) { + /* Some bits beyond end -> corruption. */ + if (qp->q.flags & INET_FRAG_LAST_IN) +- goto err; ++ goto discard_qp; + qp->q.len = end; + } + } + if (end == offset) +- goto err; ++ goto discard_qp; + + err = -ENOMEM; + if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) +- goto err; ++ goto discard_qp; + + err = pskb_trim_rcsum(skb, end - offset); + if (err) +- goto err; ++ goto discard_qp; + + /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + +- /* RFC5722, Section 4, amended by Errata ID : 3089 +- * When reassembling an IPv6 datagram, if +- * one or more its constituent fragments is determined to be an +- * overlapping fragment, the entire datagram (and any constituent +- * fragments) MUST be silently discarded. +- * +- * We do the same here for IPv4 (and increment an snmp counter) but +- * we do not want to drop the whole queue in response to a duplicate +- * fragment. +- */ +- +- err = -EINVAL; +- /* Find out where to put this fragment. */ + prev_tail = qp->q.fragments_tail; +- if (!prev_tail) +- ip4_frag_create_run(&qp->q, skb); /* First fragment. */ +- else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { +- /* This is the common case: skb goes to the end. */ +- /* Detect and discard overlaps. */ +- if (offset < prev_tail->ip_defrag_offset + prev_tail->len) +- goto discard_qp; +- if (offset == prev_tail->ip_defrag_offset + prev_tail->len) +- ip4_frag_append_to_last_run(&qp->q, skb); +- else +- ip4_frag_create_run(&qp->q, skb); +- } else { +- /* Binary search. Note that skb can become the first fragment, +- * but not the last (covered above). 
+- */ +- rbn = &qp->q.rb_fragments.rb_node; +- do { +- parent = *rbn; +- skb1 = rb_to_skb(parent); +- skb1_run_end = skb1->ip_defrag_offset + +- FRAG_CB(skb1)->frag_run_len; +- if (end <= skb1->ip_defrag_offset) +- rbn = &parent->rb_left; +- else if (offset >= skb1_run_end) +- rbn = &parent->rb_right; +- else if (offset >= skb1->ip_defrag_offset && +- end <= skb1_run_end) +- goto err; /* No new data, potential duplicate */ +- else +- goto discard_qp; /* Found an overlap */ +- } while (*rbn); +- /* Here we have parent properly set, and rbn pointing to +- * one of its NULL left/right children. Insert skb. +- */ +- ip4_frag_init_run(skb); +- rb_link_node(&skb->rbnode, parent, rbn); +- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); +- } ++ err = inet_frag_queue_insert(&qp->q, skb, offset, end); ++ if (err) ++ goto insert_error; + + if (dev) + qp->iif = dev->ifindex; +- skb->ip_defrag_offset = offset; + + qp->q.stamp = skb->tstamp; + qp->q.meat += skb->len; +@@ -492,15 +372,24 @@ static int ip_frag_queue(struct ipq *qp, + skb->_skb_refdst = 0UL; + err = ip_frag_reasm(qp, skb, prev_tail, dev); + skb->_skb_refdst = orefdst; ++ if (err) ++ inet_frag_kill(&qp->q); + return err; + } + + skb_dst_drop(skb); + return -EINPROGRESS; + ++insert_error: ++ if (err == IPFRAG_DUP) { ++ kfree_skb(skb); ++ return -EINVAL; ++ } ++ err = -EINVAL; ++ __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); + discard_qp: + inet_frag_kill(&qp->q); +- __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); ++ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); + err: + kfree_skb(skb); + return err; +@@ -512,12 +401,8 @@ static int ip_frag_reasm(struct ipq *qp, + { + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct iphdr *iph; +- struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); +- struct sk_buff **nextp; /* To build frag_list. */ +- struct rb_node *rbn; +- int len; +- int ihlen; +- int err; ++ void *reasm_data; ++ int len, err; + u8 ecn; + + ipq_kill(qp); +@@ -527,111 +412,23 @@ static int ip_frag_reasm(struct ipq *qp, + err = -EINVAL; + goto out_fail; + } +- /* Make the one we just received the head. */ +- if (head != skb) { +- fp = skb_clone(skb, GFP_ATOMIC); +- if (!fp) +- goto out_nomem; +- FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; +- if (RB_EMPTY_NODE(&skb->rbnode)) +- FRAG_CB(prev_tail)->next_frag = fp; +- else +- rb_replace_node(&skb->rbnode, &fp->rbnode, +- &qp->q.rb_fragments); +- if (qp->q.fragments_tail == skb) +- qp->q.fragments_tail = fp; +- skb_morph(skb, head); +- FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; +- rb_replace_node(&head->rbnode, &skb->rbnode, +- &qp->q.rb_fragments); +- consume_skb(head); +- head = skb; +- } +- +- WARN_ON(head->ip_defrag_offset != 0); + +- /* Allocate a new buffer for the datagram. */ +- ihlen = ip_hdrlen(head); +- len = ihlen + qp->q.len; ++ /* Make the one we just received the head. */ ++ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); ++ if (!reasm_data) ++ goto out_nomem; + ++ len = ip_hdrlen(skb) + qp->q.len; + err = -E2BIG; + if (len > 65535) + goto out_oversize; + +- /* Head of list must not be cloned. */ +- if (skb_unclone(head, GFP_ATOMIC)) +- goto out_nomem; +- +- /* If the first fragment is fragmented itself, we split +- * it to two chunks: the first with data and paged part +- * and the second, holding only fragments. 
*/ +- if (skb_has_frag_list(head)) { +- struct sk_buff *clone; +- int i, plen = 0; +- +- clone = alloc_skb(0, GFP_ATOMIC); +- if (!clone) +- goto out_nomem; +- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; +- skb_frag_list_init(head); +- for (i = 0; i < skb_shinfo(head)->nr_frags; i++) +- plen += skb_frag_size(&skb_shinfo(head)->frags[i]); +- clone->len = clone->data_len = head->data_len - plen; +- head->truesize += clone->truesize; +- clone->csum = 0; +- clone->ip_summed = head->ip_summed; +- add_frag_mem_limit(qp->q.net, clone->truesize); +- skb_shinfo(head)->frag_list = clone; +- nextp = &clone->next; +- } else { +- nextp = &skb_shinfo(head)->frag_list; +- } +- +- skb_push(head, head->data - skb_network_header(head)); +- +- /* Traverse the tree in order, to build frag_list. */ +- fp = FRAG_CB(head)->next_frag; +- rbn = rb_next(&head->rbnode); +- rb_erase(&head->rbnode, &qp->q.rb_fragments); +- while (rbn || fp) { +- /* fp points to the next sk_buff in the current run; +- * rbn points to the next run. +- */ +- /* Go through the current run. */ +- while (fp) { +- *nextp = fp; +- nextp = &fp->next; +- fp->prev = NULL; +- memset(&fp->rbnode, 0, sizeof(fp->rbnode)); +- fp->sk = NULL; +- head->data_len += fp->len; +- head->len += fp->len; +- if (head->ip_summed != fp->ip_summed) +- head->ip_summed = CHECKSUM_NONE; +- else if (head->ip_summed == CHECKSUM_COMPLETE) +- head->csum = csum_add(head->csum, fp->csum); +- head->truesize += fp->truesize; +- fp = FRAG_CB(fp)->next_frag; +- } +- /* Move to the next run. */ +- if (rbn) { +- struct rb_node *rbnext = rb_next(rbn); +- +- fp = rb_to_skb(rbn); +- rb_erase(rbn, &qp->q.rb_fragments); +- rbn = rbnext; +- } +- } +- sub_frag_mem_limit(qp->q.net, head->truesize); ++ inet_frag_reasm_finish(&qp->q, skb, reasm_data); + +- *nextp = NULL; +- head->next = NULL; +- head->prev = NULL; +- head->dev = dev; +- head->tstamp = qp->q.stamp; +- IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); ++ skb->dev = dev; ++ IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); + +- iph = ip_hdr(head); ++ iph = ip_hdr(skb); + iph->tot_len = htons(len); + iph->tos |= ecn; + +@@ -644,7 +441,7 @@ static int ip_frag_reasm(struct ipq *qp, + * from one very small df-fragment and one large non-df frag. 
+	 */
+	if (qp->max_df_size == qp->q.max_size) {
+-		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
++		IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
+ 		iph->frag_off = htons(IP_DF);
+ 	} else {
+ 		iph->frag_off = 0;
+ 	}
+@@ -742,28 +539,6 @@ struct sk_buff *ip_check_defrag(struct n
+ }
+ EXPORT_SYMBOL(ip_check_defrag);
+ 
+-unsigned int inet_frag_rbtree_purge(struct rb_root *root)
+-{
+-	struct rb_node *p = rb_first(root);
+-	unsigned int sum = 0;
+-
+-	while (p) {
+-		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+-
+-		p = rb_next(p);
+-		rb_erase(&skb->rbnode, root);
+-		while (skb) {
+-			struct sk_buff *next = FRAG_CB(skb)->next_frag;
+-
+-			sum += skb->truesize;
+-			kfree_skb(skb);
+-			skb = next;
+-		}
+-	}
+-	return sum;
+-}
+-EXPORT_SYMBOL(inet_frag_rbtree_purge);
+-
+ #ifdef CONFIG_SYSCTL
+ static int dist_min;
+ 
diff --git a/queue-4.9/net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch b/queue-4.9/net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch
new file mode 100644
index 00000000000..a00e19d24de
--- /dev/null
+++ b/queue-4.9/net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch
@@ -0,0 +1,449 @@
+From foo@baz Tue 30 Apr 2019 12:43:33 PM CEST
+From: Peter Oskolkov
+Date: Fri, 26 Apr 2019 08:41:07 -0700
+Subject: net: IP6 defrag: use rbtrees for IPv6 defrag
+To: Greg Kroah-Hartman , stable@vger.kernel.org, netdev@vger.kernel.org
+Cc: Peter Oskolkov , David Miller , Eric Dumazet , Sasha Levin , Captain Wiggum , Lars Persson , Peter Oskolkov , Tom Herbert , Florian Westphal
+Message-ID: <20190426154108.52277-5-posk@google.com>
+
+From: Peter Oskolkov
+
+[ Upstream commit d4289fcc9b16b89619ee1c54f829e05e56de8b9a ]
+
+Currently, IPv6 defragmentation code drops non-last fragments that
+are smaller than 1280 bytes: see
+commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu")
+
+This behavior is not specified in IPv6 RFCs and appears to break
+compatibility with some IPv6 implementations, as reported here:
+https://www.spinics.net/lists/netdev/msg543846.html
+
+This patch re-uses common IP defragmentation queueing and reassembly
+code in IPv6, removing the 1280 byte restriction.
+
+Signed-off-by: Peter Oskolkov
+Reported-by: Tom Herbert
+Cc: Eric Dumazet
+Cc: Florian Westphal
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/net/ipv6_frag.h |   11 +-
+ net/ipv6/reassembly.c   |  248 ++++++++++++++----------------------------------
+ 2 files changed, 82 insertions(+), 177 deletions(-)
+
+--- a/include/net/ipv6_frag.h
++++ b/include/net/ipv6_frag.h
+@@ -82,8 +82,15 @@ ip6frag_expire_frag_queue(struct net *ne
+ 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+ 
+ 	/* Don't send error if the first segment did not arrive. */
+-	head = fq->q.fragments;
+-	if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
++	if (!(fq->q.flags & INET_FRAG_FIRST_IN))
++		goto out;
++
++	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
++	 * pull the head out of the tree in order to be able to
++	 * deal with head->dev.
++ */ ++ head = inet_frag_pull_head(&fq->q); ++ if (!head) + goto out; + + head->dev = dev; +--- a/net/ipv6/reassembly.c ++++ b/net/ipv6/reassembly.c +@@ -62,13 +62,6 @@ + + static const char ip6_frag_cache_name[] = "ip6-frags"; + +-struct ip6frag_skb_cb { +- struct inet6_skb_parm h; +- int offset; +-}; +- +-#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) +- + static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) + { + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); +@@ -76,8 +69,8 @@ static u8 ip6_frag_ecn(const struct ipv6 + + static struct inet_frags ip6_frags; + +-static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, +- struct net_device *dev); ++static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, ++ struct sk_buff *prev_tail, struct net_device *dev); + + static void ip6_frag_expire(unsigned long data) + { +@@ -117,21 +110,26 @@ static int ip6_frag_queue(struct frag_qu + struct frag_hdr *fhdr, int nhoff, + u32 *prob_offset) + { +- struct sk_buff *prev, *next; +- struct net_device *dev; +- int offset, end; + struct net *net = dev_net(skb_dst(skb)->dev); ++ int offset, end, fragsize; ++ struct sk_buff *prev_tail; ++ struct net_device *dev; ++ int err = -ENOENT; + u8 ecn; + + if (fq->q.flags & INET_FRAG_COMPLETE) + goto err; + ++ err = -EINVAL; + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); ++ /* note that if prob_offset is set, the skb is freed elsewhere, ++ * we do not free it here. ++ */ + return -1; + } + +@@ -151,7 +149,7 @@ static int ip6_frag_queue(struct frag_qu + */ + if (end < fq->q.len || + ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) +- goto err; ++ goto discard_fq; + fq->q.flags |= INET_FRAG_LAST_IN; + fq->q.len = end; + } else { +@@ -168,75 +166,45 @@ static int ip6_frag_queue(struct frag_qu + if (end > fq->q.len) { + /* Some bits beyond end -> corruption. */ + if (fq->q.flags & INET_FRAG_LAST_IN) +- goto err; ++ goto discard_fq; + fq->q.len = end; + } + } + + if (end == offset) +- goto err; ++ goto discard_fq; + ++ err = -ENOMEM; + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) +- goto err; +- +- if (pskb_trim_rcsum(skb, end - offset)) +- goto err; +- +- /* Find out which fragments are in front and at the back of us +- * in the chain of fragments so far. We must know where to put +- * this fragment, right? +- */ +- prev = fq->q.fragments_tail; +- if (!prev || FRAG6_CB(prev)->offset < offset) { +- next = NULL; +- goto found; +- } +- prev = NULL; +- for (next = fq->q.fragments; next != NULL; next = next->next) { +- if (FRAG6_CB(next)->offset >= offset) +- break; /* bingo! */ +- prev = next; +- } +- +-found: +- /* RFC5722, Section 4, amended by Errata ID : 3089 +- * When reassembling an IPv6 datagram, if +- * one or more its constituent fragments is determined to be an +- * overlapping fragment, the entire datagram (and any constituent +- * fragments) MUST be silently discarded. +- */ +- +- /* Check for overlap with preceding fragment. */ +- if (prev && +- (FRAG6_CB(prev)->offset + prev->len) > offset) + goto discard_fq; + +- /* Look for overlap with succeeding segment. 
*/ +- if (next && FRAG6_CB(next)->offset < end) ++ err = pskb_trim_rcsum(skb, end - offset); ++ if (err) + goto discard_fq; + +- FRAG6_CB(skb)->offset = offset; ++ /* Note : skb->rbnode and skb->dev share the same location. */ ++ dev = skb->dev; ++ /* Makes sure compiler wont do silly aliasing games */ ++ barrier(); + +- /* Insert this fragment in the chain of fragments. */ +- skb->next = next; +- if (!next) +- fq->q.fragments_tail = skb; +- if (prev) +- prev->next = skb; +- else +- fq->q.fragments = skb; ++ prev_tail = fq->q.fragments_tail; ++ err = inet_frag_queue_insert(&fq->q, skb, offset, end); ++ if (err) ++ goto insert_error; + +- dev = skb->dev; +- if (dev) { ++ if (dev) + fq->iif = dev->ifindex; +- skb->dev = NULL; +- } ++ + fq->q.stamp = skb->tstamp; + fq->q.meat += skb->len; + fq->ecn |= ecn; + add_frag_mem_limit(fq->q.net, skb->truesize); + ++ fragsize = -skb_network_offset(skb) + skb->len; ++ if (fragsize > fq->q.max_size) ++ fq->q.max_size = fragsize; ++ + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. + */ +@@ -247,44 +215,48 @@ found: + + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { +- int res; + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; +- res = ip6_frag_reasm(fq, prev, dev); ++ err = ip6_frag_reasm(fq, skb, prev_tail, dev); + skb->_skb_refdst = orefdst; +- return res; ++ return err; + } + + skb_dst_drop(skb); +- return -1; ++ return -EINPROGRESS; + ++insert_error: ++ if (err == IPFRAG_DUP) { ++ kfree_skb(skb); ++ return -EINVAL; ++ } ++ err = -EINVAL; ++ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_REASM_OVERLAPS); + discard_fq: + inet_frag_kill(&fq->q); +-err: + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS); ++err: + kfree_skb(skb); +- return -1; ++ return err; + } + + /* + * Check if this packet is complete. +- * Returns NULL on failure by any reason, and pointer +- * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +-static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, +- struct net_device *dev) ++static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, ++ struct sk_buff *prev_tail, struct net_device *dev) + { + struct net *net = container_of(fq->q.net, struct net, ipv6.frags); +- struct sk_buff *fp, *head = fq->q.fragments; +- int payload_len; + unsigned int nhoff; +- int sum_truesize; ++ void *reasm_data; ++ int payload_len; + u8 ecn; + + inet_frag_kill(&fq->q); +@@ -293,113 +265,40 @@ static int ip6_frag_reasm(struct frag_qu + if (unlikely(ecn == 0xff)) + goto out_fail; + +- /* Make the one we just received the head. */ +- if (prev) { +- head = prev->next; +- fp = skb_clone(head, GFP_ATOMIC); +- +- if (!fp) +- goto out_oom; +- +- fp->next = head->next; +- if (!fp->next) +- fq->q.fragments_tail = fp; +- prev->next = fp; +- +- skb_morph(head, fq->q.fragments); +- head->next = fq->q.fragments->next; +- +- consume_skb(fq->q.fragments); +- fq->q.fragments = head; +- } +- +- WARN_ON(head == NULL); +- WARN_ON(FRAG6_CB(head)->offset != 0); ++ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); ++ if (!reasm_data) ++ goto out_oom; + +- /* Unfragmented part is taken from the first segment. 
*/ +- payload_len = ((head->data - skb_network_header(head)) - ++ payload_len = ((skb->data - skb_network_header(skb)) - + sizeof(struct ipv6hdr) + fq->q.len - + sizeof(struct frag_hdr)); + if (payload_len > IPV6_MAXPLEN) + goto out_oversize; + +- /* Head of list must not be cloned. */ +- if (skb_unclone(head, GFP_ATOMIC)) +- goto out_oom; +- +- /* If the first fragment is fragmented itself, we split +- * it to two chunks: the first with data and paged part +- * and the second, holding only fragments. */ +- if (skb_has_frag_list(head)) { +- struct sk_buff *clone; +- int i, plen = 0; +- +- clone = alloc_skb(0, GFP_ATOMIC); +- if (!clone) +- goto out_oom; +- clone->next = head->next; +- head->next = clone; +- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; +- skb_frag_list_init(head); +- for (i = 0; i < skb_shinfo(head)->nr_frags; i++) +- plen += skb_frag_size(&skb_shinfo(head)->frags[i]); +- clone->len = clone->data_len = head->data_len - plen; +- head->data_len -= clone->len; +- head->len -= clone->len; +- clone->csum = 0; +- clone->ip_summed = head->ip_summed; +- add_frag_mem_limit(fq->q.net, clone->truesize); +- } +- + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + nhoff = fq->nhoffset; +- skb_network_header(head)[nhoff] = skb_transport_header(head)[0]; +- memmove(head->head + sizeof(struct frag_hdr), head->head, +- (head->data - head->head) - sizeof(struct frag_hdr)); +- if (skb_mac_header_was_set(head)) +- head->mac_header += sizeof(struct frag_hdr); +- head->network_header += sizeof(struct frag_hdr); +- +- skb_reset_transport_header(head); +- skb_push(head, head->data - skb_network_header(head)); +- +- sum_truesize = head->truesize; +- for (fp = head->next; fp;) { +- bool headstolen; +- int delta; +- struct sk_buff *next = fp->next; +- +- sum_truesize += fp->truesize; +- if (head->ip_summed != fp->ip_summed) +- head->ip_summed = CHECKSUM_NONE; +- else if (head->ip_summed == CHECKSUM_COMPLETE) +- head->csum = csum_add(head->csum, fp->csum); +- +- if (skb_try_coalesce(head, fp, &headstolen, &delta)) { +- kfree_skb_partial(fp, headstolen); +- } else { +- if (!skb_shinfo(head)->frag_list) +- skb_shinfo(head)->frag_list = fp; +- head->data_len += fp->len; +- head->len += fp->len; +- head->truesize += fp->truesize; +- } +- fp = next; +- } +- sub_frag_mem_limit(fq->q.net, sum_truesize); +- +- head->next = NULL; +- head->dev = dev; +- head->tstamp = fq->q.stamp; +- ipv6_hdr(head)->payload_len = htons(payload_len); +- ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); +- IP6CB(head)->nhoff = nhoff; +- IP6CB(head)->flags |= IP6SKB_FRAGMENTED; ++ skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0]; ++ memmove(skb->head + sizeof(struct frag_hdr), skb->head, ++ (skb->data - skb->head) - sizeof(struct frag_hdr)); ++ if (skb_mac_header_was_set(skb)) ++ skb->mac_header += sizeof(struct frag_hdr); ++ skb->network_header += sizeof(struct frag_hdr); ++ ++ skb_reset_transport_header(skb); ++ ++ inet_frag_reasm_finish(&fq->q, skb, reasm_data); ++ ++ skb->dev = dev; ++ ipv6_hdr(skb)->payload_len = htons(payload_len); ++ ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); ++ IP6CB(skb)->nhoff = nhoff; ++ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; ++ IP6CB(skb)->frag_max_size = fq->q.max_size; + + /* Yes, and fold redundant checksum back. 
8) */
+-	skb_postpush_rcsum(head, skb_network_header(head),
+-			   skb_network_header_len(head));
++	skb_postpush_rcsum(skb, skb_network_header(skb),
++			   skb_network_header_len(skb));
+ 
+ 	rcu_read_lock();
+ 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+@@ -407,6 +306,7 @@ static int ip6_frag_reasm(struct frag_qu
+ 	fq->q.fragments = NULL;
+ 	fq->q.rb_fragments = RB_ROOT;
+ 	fq->q.fragments_tail = NULL;
++	fq->q.last_run_head = NULL;
+ 	return 1;
+ 
+ out_oversize:
+@@ -418,6 +318,7 @@ out_fail:
+ 	rcu_read_lock();
+ 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+ 	rcu_read_unlock();
++	inet_frag_kill(&fq->q);
+ 	return -1;
+ }
+ 
+@@ -456,10 +357,6 @@ static int ipv6_frag_rcv
+ 		return 1;
+ 	}
+ 
+-	if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
+-	    fhdr->frag_off & htons(IP6_MF))
+-		goto fail_hdr;
+-
+ 	iif = skb->dev ? skb->dev->ifindex : 0;
+ 	fq = fq_find(net, fhdr->identification, hdr, iif);
+ 	if (fq) {
+@@ -477,6 +374,7 @@ static int ipv6_frag_rcv
+ 		if (prob_offset) {
+ 			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ 					IPSTATS_MIB_INHDRERRORS);
++			/* icmpv6_param_prob() calls kfree_skb(skb) */
+ 			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset);
+ 		}
+ 		return ret;
diff --git a/queue-4.9/net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch b/queue-4.9/net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch
new file mode 100644
index 00000000000..6495f15309a
--- /dev/null
+++ b/queue-4.9/net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch
@@ -0,0 +1,396 @@
+From foo@baz Tue 30 Apr 2019 12:43:33 PM CEST
+From: Peter Oskolkov
+Date: Fri, 26 Apr 2019 08:41:08 -0700
+Subject: net: IP6 defrag: use rbtrees in nf_conntrack_reasm.c
+To: Greg Kroah-Hartman , stable@vger.kernel.org, netdev@vger.kernel.org
+Cc: Peter Oskolkov , David Miller , Eric Dumazet , Sasha Levin , Captain Wiggum , Lars Persson , Peter Oskolkov , Tom Herbert , Florian Westphal
+Message-ID: <20190426154108.52277-6-posk@google.com>
+
+From: Peter Oskolkov
+
+[ Upstream commit 997dd96471641e147cb2c33ad54284000d0f5e35 ]
+
+Currently, IPv6 defragmentation code drops non-last fragments that
+are smaller than 1280 bytes: see
+commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu")
+
+This behavior is not specified in IPv6 RFCs and appears to break
+compatibility with some IPv6 implementations, as reported here:
+https://www.spinics.net/lists/netdev/msg543846.html
+
+This patch re-uses common IP defragmentation queueing and reassembly
+code in IP6 defragmentation in nf_conntrack, removing the 1280 byte
+restriction.
+
+Signed-off-by: Peter Oskolkov
+Reported-by: Tom Herbert
+Cc: Eric Dumazet
+Cc: Florian Westphal
+Signed-off-by: David S. 
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv6/netfilter/nf_conntrack_reasm.c |  260 +++++++++-----------------------
+ 1 file changed, 74 insertions(+), 186 deletions(-)
+
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -51,14 +51,6 @@
+
+ static const char nf_frags_cache_name[] = "nf-frags";
+
+-struct nf_ct_frag6_skb_cb
+-{
+-	struct inet6_skb_parm	h;
+-	int			offset;
+-};
+-
+-#define NFCT_FRAG6_CB(skb)	((struct nf_ct_frag6_skb_cb *)((skb)->cb))
+-
+ static struct inet_frags nf_frags;
+
+ #ifdef CONFIG_SYSCTL
+@@ -144,6 +136,9 @@ static void __net_exit nf_ct_frags6_sysc
+ }
+ #endif
+
++static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
++			     struct sk_buff *prev_tail, struct net_device *dev);
++
+ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
+ {
+ 	return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
+@@ -184,9 +179,10 @@ static struct frag_queue *fq_find(struct
+ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
+ 			     const struct frag_hdr *fhdr, int nhoff)
+ {
+-	struct sk_buff *prev, *next;
+ 	unsigned int payload_len;
+-	int offset, end;
++	struct net_device *dev;
++	struct sk_buff *prev;
++	int offset, end, err;
+ 	u8 ecn;
+
+ 	if (fq->q.flags & INET_FRAG_COMPLETE) {
+@@ -261,55 +257,19 @@ static int nf_ct_frag6_queue(struct frag
+ 		goto err;
+ 	}
+
+-	/* Find out which fragments are in front and at the back of us
+-	 * in the chain of fragments so far.  We must know where to put
+-	 * this fragment, right?
+-	 */
++	/* Note : skb->rbnode and skb->dev share the same location. */
++	dev = skb->dev;
++	/* Makes sure compiler wont do silly aliasing games */
++	barrier();
++
+ 	prev = fq->q.fragments_tail;
+-	if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) {
+-		next = NULL;
+-		goto found;
+-	}
+-	prev = NULL;
+-	for (next = fq->q.fragments; next != NULL; next = next->next) {
+-		if (NFCT_FRAG6_CB(next)->offset >= offset)
+-			break;	/* bingo! */
+-		prev = next;
+-	}
+-
+-found:
+-	/* RFC5722, Section 4:
+-	 * When reassembling an IPv6 datagram, if
+-	 * one or more its constituent fragments is determined to be an
+-	 * overlapping fragment, the entire datagram (and any constituent
+-	 * fragments, including those not yet received) MUST be silently
+-	 * discarded.
+-	 */
++	err = inet_frag_queue_insert(&fq->q, skb, offset, end);
++	if (err)
++		goto insert_error;
++
++	if (dev)
++		fq->iif = dev->ifindex;
+
+-	/* Check for overlap with preceding fragment. */
+-	if (prev &&
+-	    (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset)
+-		goto discard_fq;
+-
+-	/* Look for overlap with succeeding segment. */
+-	if (next && NFCT_FRAG6_CB(next)->offset < end)
+-		goto discard_fq;
+-
+-	NFCT_FRAG6_CB(skb)->offset = offset;
+-
+-	/* Insert this fragment in the chain of fragments. */
+-	skb->next = next;
+-	if (!next)
+-		fq->q.fragments_tail = skb;
+-	if (prev)
+-		prev->next = skb;
+-	else
+-		fq->q.fragments = skb;
+-
+-	if (skb->dev) {
+-		fq->iif = skb->dev->ifindex;
+-		skb->dev = NULL;
+-	}
+ 	fq->q.stamp = skb->tstamp;
+ 	fq->q.meat += skb->len;
+ 	fq->ecn |= ecn;
+@@ -325,11 +285,25 @@ found:
+ 		fq->q.flags |= INET_FRAG_FIRST_IN;
+ 	}
+
+-	return 0;
++	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
++	    fq->q.meat == fq->q.len) {
++		unsigned long orefdst = skb->_skb_refdst;
++
++		skb->_skb_refdst = 0UL;
++		err = nf_ct_frag6_reasm(fq, skb, prev, dev);
++		skb->_skb_refdst = orefdst;
++		return err;
++	}
++
++	skb_dst_drop(skb);
++	return -EINPROGRESS;
+
+-discard_fq:
++insert_error:
++	if (err == IPFRAG_DUP)
++		goto err;
+ 	inet_frag_kill(&fq->q);
+ err:
++	skb_dst_drop(skb);
+ 	return -EINVAL;
+ }
+
+@@ -339,141 +313,67 @@ err:
+  * It is called with locked fq, and caller must check that
+  * queue is eligible for reassembly i.e. it is not COMPLETE,
+  * the last and the first frames arrived and all the bits are here.
+- *
+- * returns true if *prev skb has been transformed into the reassembled
+- * skb, false otherwise.
+  */
+-static bool
+-nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev)
++static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
++			     struct sk_buff *prev_tail, struct net_device *dev)
+ {
+-	struct sk_buff *fp, *head = fq->q.fragments;
+-	int    payload_len;
++	void *reasm_data;
++	int payload_len;
+ 	u8 ecn;
+
+ 	inet_frag_kill(&fq->q);
+
+-	WARN_ON(head == NULL);
+-	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
+-
+ 	ecn = ip_frag_ecn_table[fq->ecn];
+ 	if (unlikely(ecn == 0xff))
+-		return false;
++		goto err;
++
++	reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
++	if (!reasm_data)
++		goto err;
+
+-	/* Unfragmented part is taken from the first segment. */
+-	payload_len = ((head->data - skb_network_header(head)) -
++	payload_len = ((skb->data - skb_network_header(skb)) -
+ 		       sizeof(struct ipv6hdr) + fq->q.len -
+ 		       sizeof(struct frag_hdr));
+ 	if (payload_len > IPV6_MAXPLEN) {
+ 		net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n",
+ 				    payload_len);
+-		return false;
+-	}
+-
+-	/* Head of list must not be cloned. */
+-	if (skb_unclone(head, GFP_ATOMIC))
+-		return false;
+-
+-	/* If the first fragment is fragmented itself, we split
+-	 * it to two chunks: the first with data and paged part
+-	 * and the second, holding only fragments. */
+-	if (skb_has_frag_list(head)) {
+-		struct sk_buff *clone;
+-		int i, plen = 0;
+-
+-		clone = alloc_skb(0, GFP_ATOMIC);
+-		if (clone == NULL)
+-			return false;
+-
+-		clone->next = head->next;
+-		head->next = clone;
+-		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+-		skb_frag_list_init(head);
+-		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+-			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+-		clone->len = clone->data_len = head->data_len - plen;
+-		head->data_len -= clone->len;
+-		head->len -= clone->len;
+-		clone->csum = 0;
+-		clone->ip_summed = head->ip_summed;
+-
+-		add_frag_mem_limit(fq->q.net, clone->truesize);
+-	}
+-
+-	/* morph head into last received skb: prev.
+-	 *
+-	 * This allows callers of ipv6 conntrack defrag to continue
+-	 * to use the last skb(frag) passed into the reasm engine.
+-	 * The last skb frag 'silently' turns into the full reassembled skb.
+-	 *
+-	 * Since prev is also part of q->fragments we have to clone it first.
+-	 */
+-	if (head != prev) {
+-		struct sk_buff *iter;
+-
+-		fp = skb_clone(prev, GFP_ATOMIC);
+-		if (!fp)
+-			return false;
+-
+-		fp->next = prev->next;
+-
+-		iter = head;
+-		while (iter) {
+-			if (iter->next == prev) {
+-				iter->next = fp;
+-				break;
+-			}
+-			iter = iter->next;
+-		}
+-
+-		skb_morph(prev, head);
+-		prev->next = head->next;
+-		consume_skb(head);
+-		head = prev;
++		goto err;
+ 	}
+
+ 	/* We have to remove fragment header from datagram and to relocate
+ 	 * header in order to calculate ICV correctly. */
+-	skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0];
+-	memmove(head->head + sizeof(struct frag_hdr), head->head,
+-		(head->data - head->head) - sizeof(struct frag_hdr));
+-	head->mac_header += sizeof(struct frag_hdr);
+-	head->network_header += sizeof(struct frag_hdr);
+-
+-	skb_shinfo(head)->frag_list = head->next;
+-	skb_reset_transport_header(head);
+-	skb_push(head, head->data - skb_network_header(head));
+-
+-	for (fp = head->next; fp; fp = fp->next) {
+-		head->data_len += fp->len;
+-		head->len += fp->len;
+-		if (head->ip_summed != fp->ip_summed)
+-			head->ip_summed = CHECKSUM_NONE;
+-		else if (head->ip_summed == CHECKSUM_COMPLETE)
+-			head->csum = csum_add(head->csum, fp->csum);
+-		head->truesize += fp->truesize;
+-		fp->sk = NULL;
+-	}
+-	sub_frag_mem_limit(fq->q.net, head->truesize);
+-
+-	head->ignore_df = 1;
+-	head->next = NULL;
+-	head->dev = dev;
+-	head->tstamp = fq->q.stamp;
+-	ipv6_hdr(head)->payload_len = htons(payload_len);
+-	ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
+-	IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
++	skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0];
++	memmove(skb->head + sizeof(struct frag_hdr), skb->head,
++		(skb->data - skb->head) - sizeof(struct frag_hdr));
++	skb->mac_header += sizeof(struct frag_hdr);
++	skb->network_header += sizeof(struct frag_hdr);
++
++	skb_reset_transport_header(skb);
++
++	inet_frag_reasm_finish(&fq->q, skb, reasm_data);
++
++	skb->ignore_df = 1;
++	skb->dev = dev;
++	ipv6_hdr(skb)->payload_len = htons(payload_len);
++	ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
++	IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+
+ 	/* Yes, and fold redundant checksum back. 8) */
+-	if (head->ip_summed == CHECKSUM_COMPLETE)
+-		head->csum = csum_partial(skb_network_header(head),
+-					  skb_network_header_len(head),
+-					  head->csum);
++	if (skb->ip_summed == CHECKSUM_COMPLETE)
++		skb->csum = csum_partial(skb_network_header(skb),
++					 skb_network_header_len(skb),
++					 skb->csum);
+
+ 	fq->q.fragments = NULL;
+ 	fq->q.rb_fragments = RB_ROOT;
+ 	fq->q.fragments_tail = NULL;
++	fq->q.last_run_head = NULL;
++
++	return 0;
+
+-	return true;
++err:
++	inet_frag_kill(&fq->q);
++	return -EINVAL;
+ }
+
+ /*
+@@ -542,7 +442,6 @@ find_prev_fhdr(struct sk_buff *skb, u8 *
+ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
+ {
+ 	u16 savethdr = skb->transport_header;
+-	struct net_device *dev = skb->dev;
+ 	int fhoff, nhoff, ret;
+ 	struct frag_hdr *fhdr;
+ 	struct frag_queue *fq;
+@@ -565,10 +464,6 @@ int nf_ct_frag6_gather(struct net *net,
+ 	hdr = ipv6_hdr(skb);
+ 	fhdr = (struct frag_hdr *)skb_transport_header(skb);
+
+-	if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
+-	    fhdr->frag_off & htons(IP6_MF))
+-		return -EINVAL;
+-
+ 	skb_orphan(skb);
+ 	fq = fq_find(net, fhdr->identification, user, hdr,
+ 		     skb->dev ? skb->dev->ifindex : 0);
+@@ -580,24 +475,17 @@ int nf_ct_frag6_gather(struct net *net,
+ 	spin_lock_bh(&fq->q.lock);
+
+ 	ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff);
+-	if (ret < 0) {
+-		if (ret == -EPROTO) {
+-			skb->transport_header = savethdr;
+-			ret = 0;
+-		}
+-		goto out_unlock;
++	if (ret == -EPROTO) {
++		skb->transport_header = savethdr;
++		ret = 0;
+ 	}
+
+ 	/* after queue has assumed skb ownership, only 0 or -EINPROGRESS
+ 	 * must be returned.
+ 	 */
+-	ret = -EINPROGRESS;
+-	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+-	    fq->q.meat == fq->q.len &&
+-	    nf_ct_frag6_reasm(fq, skb, dev))
+-		ret = 0;
++	if (ret)
++		ret = -EINPROGRESS;
+
+-out_unlock:
+ 	spin_unlock_bh(&fq->q.lock);
+ 	inet_frag_put(&fq->q);
+ 	return ret;
diff --git a/queue-4.9/series b/queue-4.9/series
index 43c3b76ebe4..5790b670f1d 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -34,3 +34,8 @@ net-rds-exchange-of-8k-and-1m-pool.patch
 team-fix-possible-recursive-locking-when-add-slaves.patch
 net-stmmac-move-stmmac_check_ether_addr-to-driver-probe.patch
 ipv4-set-the-tcp_min_rtt_wlen-range-from-0-to-one-day.patch
+ipv6-frags-fix-a-lockdep-false-positive.patch
+net-ip-defrag-encapsulate-rbtree-defrag-code-into-callable-functions.patch
+ipv6-remove-dependency-of-nf_defrag_ipv6-on-ipv6-module.patch
+net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch
+net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch
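
Note on the rbnode/dev aliasing handled in nf_ct_frag6_queue() above: in
struct sk_buff, rbnode and dev occupy overlapping storage, so once
inet_frag_queue_insert() links the skb into the queue's rbtree the device
pointer is clobbered. That is why the patched code copies skb->dev into a
local variable, with a barrier() so the compiler cannot defer the read, before
inserting. The standalone C sketch below illustrates the hazard; it is not
kernel code, and all names in it (fake_skb, fake_rb_node, net_dev_stub,
rbtree_insert) are invented for illustration only.

	#include <stdio.h>

	struct fake_rb_node { void *left, *right, *parent; };
	struct net_dev_stub { int ifindex; };

	struct fake_skb {
		union {				/* mirrors the sk_buff overlap */
			struct fake_rb_node rbnode;
			struct net_dev_stub *dev;
		};
	};

	static void rbtree_insert(struct fake_skb *skb)
	{
		/* Linking into the tree reuses the union storage and
		 * thereby clobbers skb->dev.
		 */
		skb->rbnode.left = NULL;
		skb->rbnode.right = NULL;
		skb->rbnode.parent = NULL;
	}

	int main(void)
	{
		struct net_dev_stub eth0 = { .ifindex = 2 };
		struct fake_skb skb = { .dev = &eth0 };
		struct net_dev_stub *dev;

		/* Read the device out of the union first, as
		 * nf_ct_frag6_queue() does, and keep the compiler from
		 * deferring that read past the insertion (the role
		 * barrier() plays in the patch).
		 */
		dev = skb.dev;
		__asm__ __volatile__("" ::: "memory");

		rbtree_insert(&skb);	/* skb.dev is no longer meaningful */

		if (dev)
			printf("fq->iif = %d\n", dev->ifindex);
		return 0;
	}

The same saved pointer is what lets nf_ct_frag6_reasm() restore skb->dev on
the reassembled skb after the tree has been torn back down.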