From: Sasha Levin Date: Mon, 22 Apr 2019 02:26:14 +0000 (-0400) Subject: net fix for 4.19 X-Git-Tag: v3.18.139~48 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e82bcef496123aafbd80fd42663677e9aa3cc2a3;p=thirdparty%2Fkernel%2Fstable-queue.git net fix for 4.19 Signed-off-by: Sasha Levin --- diff --git a/queue-4.19/net-ip-defrag-encapsulate-rbtree-defrag-code-into-ca.patch b/queue-4.19/net-ip-defrag-encapsulate-rbtree-defrag-code-into-ca.patch new file mode 100644 index 00000000000..fa8cc3e375a --- /dev/null +++ b/queue-4.19/net-ip-defrag-encapsulate-rbtree-defrag-code-into-ca.patch @@ -0,0 +1,748 @@ +From 7905dc07a3d1aac701694adcb0c0cb4439a47117 Mon Sep 17 00:00:00 2001 +From: Peter Oskolkov +Date: Mon, 8 Apr 2019 17:10:03 -0700 +Subject: net: IP defrag: encapsulate rbtree defrag code into callable + functions + +[ Upstream commit c23f35d19db3b36ffb9e04b08f1d91565d15f84f ] + +This is a refactoring patch: without changing runtime behavior, +it moves rbtree-related code from IPv4-specific files/functions +into .h/.c defrag files shared with IPv6 defragmentation code. + +Signed-off-by: Peter Oskolkov +Cc: Eric Dumazet +Cc: Florian Westphal +Cc: Tom Herbert +Signed-off-by: David S. Miller +Acked-by: David S. Miller +Signed-off-by: Sasha Levin +--- + include/net/inet_frag.h | 16 ++- + net/ipv4/inet_fragment.c | 293 +++++++++++++++++++++++++++++++++++++++ + net/ipv4/ip_fragment.c | 290 ++++---------------------------------- + 3 files changed, 335 insertions(+), 264 deletions(-) + +diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h +index 1662cbc0b46b..b02bf737d019 100644 +--- a/include/net/inet_frag.h ++++ b/include/net/inet_frag.h +@@ -77,8 +77,8 @@ struct inet_frag_queue { + struct timer_list timer; + spinlock_t lock; + refcount_t refcnt; +- struct sk_buff *fragments; /* Used in IPv6. */ +- struct rb_root rb_fragments; /* Used in IPv4. */ ++ struct sk_buff *fragments; /* used in 6lopwpan IPv6. */ ++ struct rb_root rb_fragments; /* Used in IPv4/IPv6. */ + struct sk_buff *fragments_tail; + struct sk_buff *last_run_head; + ktime_t stamp; +@@ -153,4 +153,16 @@ static inline void add_frag_mem_limit(struct netns_frags *nf, long val) + + extern const u8 ip_frag_ecn_table[16]; + ++/* Return values of inet_frag_queue_insert() */ ++#define IPFRAG_OK 0 ++#define IPFRAG_DUP 1 ++#define IPFRAG_OVERLAP 2 ++int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, ++ int offset, int end); ++void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, ++ struct sk_buff *parent); ++void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, ++ void *reasm_data); ++struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); ++ + #endif +diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c +index 760a9e52e02b..9f69411251d0 100644 +--- a/net/ipv4/inet_fragment.c ++++ b/net/ipv4/inet_fragment.c +@@ -25,6 +25,62 @@ + #include + #include + #include ++#include ++#include ++ ++/* Use skb->cb to track consecutive/adjacent fragments coming at ++ * the end of the queue. Nodes in the rb-tree queue will ++ * contain "runs" of one or more adjacent fragments. ++ * ++ * Invariants: ++ * - next_frag is NULL at the tail of a "run"; ++ * - the head of a "run" has the sum of all fragment lengths in frag_run_len. ++ */ ++struct ipfrag_skb_cb { ++ union { ++ struct inet_skb_parm h4; ++ struct inet6_skb_parm h6; ++ }; ++ struct sk_buff *next_frag; ++ int frag_run_len; ++}; ++ ++#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) ++ ++static void fragcb_clear(struct sk_buff *skb) ++{ ++ RB_CLEAR_NODE(&skb->rbnode); ++ FRAG_CB(skb)->next_frag = NULL; ++ FRAG_CB(skb)->frag_run_len = skb->len; ++} ++ ++/* Append skb to the last "run". */ ++static void fragrun_append_to_last(struct inet_frag_queue *q, ++ struct sk_buff *skb) ++{ ++ fragcb_clear(skb); ++ ++ FRAG_CB(q->last_run_head)->frag_run_len += skb->len; ++ FRAG_CB(q->fragments_tail)->next_frag = skb; ++ q->fragments_tail = skb; ++} ++ ++/* Create a new "run" with the skb. */ ++static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) ++{ ++ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); ++ fragcb_clear(skb); ++ ++ if (q->last_run_head) ++ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, ++ &q->last_run_head->rbnode.rb_right); ++ else ++ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); ++ rb_insert_color(&skb->rbnode, &q->rb_fragments); ++ ++ q->fragments_tail = skb; ++ q->last_run_head = skb; ++} + + /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements + * Value : 0xff if frame should be dropped. +@@ -123,6 +179,28 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) + kmem_cache_free(f->frags_cachep, q); + } + ++unsigned int inet_frag_rbtree_purge(struct rb_root *root) ++{ ++ struct rb_node *p = rb_first(root); ++ unsigned int sum = 0; ++ ++ while (p) { ++ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); ++ ++ p = rb_next(p); ++ rb_erase(&skb->rbnode, root); ++ while (skb) { ++ struct sk_buff *next = FRAG_CB(skb)->next_frag; ++ ++ sum += skb->truesize; ++ kfree_skb(skb); ++ skb = next; ++ } ++ } ++ return sum; ++} ++EXPORT_SYMBOL(inet_frag_rbtree_purge); ++ + void inet_frag_destroy(struct inet_frag_queue *q) + { + struct sk_buff *fp; +@@ -224,3 +302,218 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) + return fq; + } + EXPORT_SYMBOL(inet_frag_find); ++ ++int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, ++ int offset, int end) ++{ ++ struct sk_buff *last = q->fragments_tail; ++ ++ /* RFC5722, Section 4, amended by Errata ID : 3089 ++ * When reassembling an IPv6 datagram, if ++ * one or more its constituent fragments is determined to be an ++ * overlapping fragment, the entire datagram (and any constituent ++ * fragments) MUST be silently discarded. ++ * ++ * Duplicates, however, should be ignored (i.e. skb dropped, but the ++ * queue/fragments kept for later reassembly). ++ */ ++ if (!last) ++ fragrun_create(q, skb); /* First fragment. */ ++ else if (last->ip_defrag_offset + last->len < end) { ++ /* This is the common case: skb goes to the end. */ ++ /* Detect and discard overlaps. */ ++ if (offset < last->ip_defrag_offset + last->len) ++ return IPFRAG_OVERLAP; ++ if (offset == last->ip_defrag_offset + last->len) ++ fragrun_append_to_last(q, skb); ++ else ++ fragrun_create(q, skb); ++ } else { ++ /* Binary search. Note that skb can become the first fragment, ++ * but not the last (covered above). ++ */ ++ struct rb_node **rbn, *parent; ++ ++ rbn = &q->rb_fragments.rb_node; ++ do { ++ struct sk_buff *curr; ++ int curr_run_end; ++ ++ parent = *rbn; ++ curr = rb_to_skb(parent); ++ curr_run_end = curr->ip_defrag_offset + ++ FRAG_CB(curr)->frag_run_len; ++ if (end <= curr->ip_defrag_offset) ++ rbn = &parent->rb_left; ++ else if (offset >= curr_run_end) ++ rbn = &parent->rb_right; ++ else if (offset >= curr->ip_defrag_offset && ++ end <= curr_run_end) ++ return IPFRAG_DUP; ++ else ++ return IPFRAG_OVERLAP; ++ } while (*rbn); ++ /* Here we have parent properly set, and rbn pointing to ++ * one of its NULL left/right children. Insert skb. ++ */ ++ fragcb_clear(skb); ++ rb_link_node(&skb->rbnode, parent, rbn); ++ rb_insert_color(&skb->rbnode, &q->rb_fragments); ++ } ++ ++ skb->ip_defrag_offset = offset; ++ ++ return IPFRAG_OK; ++} ++EXPORT_SYMBOL(inet_frag_queue_insert); ++ ++void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, ++ struct sk_buff *parent) ++{ ++ struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); ++ struct sk_buff **nextp; ++ int delta; ++ ++ if (head != skb) { ++ fp = skb_clone(skb, GFP_ATOMIC); ++ if (!fp) ++ return NULL; ++ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; ++ if (RB_EMPTY_NODE(&skb->rbnode)) ++ FRAG_CB(parent)->next_frag = fp; ++ else ++ rb_replace_node(&skb->rbnode, &fp->rbnode, ++ &q->rb_fragments); ++ if (q->fragments_tail == skb) ++ q->fragments_tail = fp; ++ skb_morph(skb, head); ++ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; ++ rb_replace_node(&head->rbnode, &skb->rbnode, ++ &q->rb_fragments); ++ consume_skb(head); ++ head = skb; ++ } ++ WARN_ON(head->ip_defrag_offset != 0); ++ ++ delta = -head->truesize; ++ ++ /* Head of list must not be cloned. */ ++ if (skb_unclone(head, GFP_ATOMIC)) ++ return NULL; ++ ++ delta += head->truesize; ++ if (delta) ++ add_frag_mem_limit(q->net, delta); ++ ++ /* If the first fragment is fragmented itself, we split ++ * it to two chunks: the first with data and paged part ++ * and the second, holding only fragments. ++ */ ++ if (skb_has_frag_list(head)) { ++ struct sk_buff *clone; ++ int i, plen = 0; ++ ++ clone = alloc_skb(0, GFP_ATOMIC); ++ if (!clone) ++ return NULL; ++ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; ++ skb_frag_list_init(head); ++ for (i = 0; i < skb_shinfo(head)->nr_frags; i++) ++ plen += skb_frag_size(&skb_shinfo(head)->frags[i]); ++ clone->data_len = head->data_len - plen; ++ clone->len = clone->data_len; ++ head->truesize += clone->truesize; ++ clone->csum = 0; ++ clone->ip_summed = head->ip_summed; ++ add_frag_mem_limit(q->net, clone->truesize); ++ skb_shinfo(head)->frag_list = clone; ++ nextp = &clone->next; ++ } else { ++ nextp = &skb_shinfo(head)->frag_list; ++ } ++ ++ return nextp; ++} ++EXPORT_SYMBOL(inet_frag_reasm_prepare); ++ ++void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, ++ void *reasm_data) ++{ ++ struct sk_buff **nextp = (struct sk_buff **)reasm_data; ++ struct rb_node *rbn; ++ struct sk_buff *fp; ++ ++ skb_push(head, head->data - skb_network_header(head)); ++ ++ /* Traverse the tree in order, to build frag_list. */ ++ fp = FRAG_CB(head)->next_frag; ++ rbn = rb_next(&head->rbnode); ++ rb_erase(&head->rbnode, &q->rb_fragments); ++ while (rbn || fp) { ++ /* fp points to the next sk_buff in the current run; ++ * rbn points to the next run. ++ */ ++ /* Go through the current run. */ ++ while (fp) { ++ *nextp = fp; ++ nextp = &fp->next; ++ fp->prev = NULL; ++ memset(&fp->rbnode, 0, sizeof(fp->rbnode)); ++ fp->sk = NULL; ++ head->data_len += fp->len; ++ head->len += fp->len; ++ if (head->ip_summed != fp->ip_summed) ++ head->ip_summed = CHECKSUM_NONE; ++ else if (head->ip_summed == CHECKSUM_COMPLETE) ++ head->csum = csum_add(head->csum, fp->csum); ++ head->truesize += fp->truesize; ++ fp = FRAG_CB(fp)->next_frag; ++ } ++ /* Move to the next run. */ ++ if (rbn) { ++ struct rb_node *rbnext = rb_next(rbn); ++ ++ fp = rb_to_skb(rbn); ++ rb_erase(rbn, &q->rb_fragments); ++ rbn = rbnext; ++ } ++ } ++ sub_frag_mem_limit(q->net, head->truesize); ++ ++ *nextp = NULL; ++ skb_mark_not_on_list(head); ++ head->prev = NULL; ++ head->tstamp = q->stamp; ++} ++EXPORT_SYMBOL(inet_frag_reasm_finish); ++ ++struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) ++{ ++ struct sk_buff *head; ++ ++ if (q->fragments) { ++ head = q->fragments; ++ q->fragments = head->next; ++ } else { ++ struct sk_buff *skb; ++ ++ head = skb_rb_first(&q->rb_fragments); ++ if (!head) ++ return NULL; ++ skb = FRAG_CB(head)->next_frag; ++ if (skb) ++ rb_replace_node(&head->rbnode, &skb->rbnode, ++ &q->rb_fragments); ++ else ++ rb_erase(&head->rbnode, &q->rb_fragments); ++ memset(&head->rbnode, 0, sizeof(head->rbnode)); ++ barrier(); ++ } ++ if (head == q->fragments_tail) ++ q->fragments_tail = NULL; ++ ++ sub_frag_mem_limit(q->net, head->truesize); ++ ++ return head; ++} ++EXPORT_SYMBOL(inet_frag_pull_head); +diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c +index d95b32af4a0e..15c609e6f12e 100644 +--- a/net/ipv4/ip_fragment.c ++++ b/net/ipv4/ip_fragment.c +@@ -57,57 +57,6 @@ + */ + static const char ip_frag_cache_name[] = "ip4-frags"; + +-/* Use skb->cb to track consecutive/adjacent fragments coming at +- * the end of the queue. Nodes in the rb-tree queue will +- * contain "runs" of one or more adjacent fragments. +- * +- * Invariants: +- * - next_frag is NULL at the tail of a "run"; +- * - the head of a "run" has the sum of all fragment lengths in frag_run_len. +- */ +-struct ipfrag_skb_cb { +- struct inet_skb_parm h; +- struct sk_buff *next_frag; +- int frag_run_len; +-}; +- +-#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) +- +-static void ip4_frag_init_run(struct sk_buff *skb) +-{ +- BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); +- +- FRAG_CB(skb)->next_frag = NULL; +- FRAG_CB(skb)->frag_run_len = skb->len; +-} +- +-/* Append skb to the last "run". */ +-static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, +- struct sk_buff *skb) +-{ +- RB_CLEAR_NODE(&skb->rbnode); +- FRAG_CB(skb)->next_frag = NULL; +- +- FRAG_CB(q->last_run_head)->frag_run_len += skb->len; +- FRAG_CB(q->fragments_tail)->next_frag = skb; +- q->fragments_tail = skb; +-} +- +-/* Create a new "run" with the skb. */ +-static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) +-{ +- if (q->last_run_head) +- rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, +- &q->last_run_head->rbnode.rb_right); +- else +- rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); +- rb_insert_color(&skb->rbnode, &q->rb_fragments); +- +- ip4_frag_init_run(skb); +- q->fragments_tail = skb; +- q->last_run_head = skb; +-} +- + /* Describe an entry in the "incomplete datagrams" queue. */ + struct ipq { + struct inet_frag_queue q; +@@ -212,27 +161,9 @@ static void ip_expire(struct timer_list *t) + * pull the head out of the tree in order to be able to + * deal with head->dev. + */ +- if (qp->q.fragments) { +- head = qp->q.fragments; +- qp->q.fragments = head->next; +- } else { +- head = skb_rb_first(&qp->q.rb_fragments); +- if (!head) +- goto out; +- if (FRAG_CB(head)->next_frag) +- rb_replace_node(&head->rbnode, +- &FRAG_CB(head)->next_frag->rbnode, +- &qp->q.rb_fragments); +- else +- rb_erase(&head->rbnode, &qp->q.rb_fragments); +- memset(&head->rbnode, 0, sizeof(head->rbnode)); +- barrier(); +- } +- if (head == qp->q.fragments_tail) +- qp->q.fragments_tail = NULL; +- +- sub_frag_mem_limit(qp->q.net, head->truesize); +- ++ head = inet_frag_pull_head(&qp->q); ++ if (!head) ++ goto out; + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out; +@@ -345,12 +276,10 @@ static int ip_frag_reinit(struct ipq *qp) + static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) + { + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); +- struct rb_node **rbn, *parent; +- struct sk_buff *skb1, *prev_tail; +- int ihl, end, skb1_run_end; ++ int ihl, end, flags, offset; ++ struct sk_buff *prev_tail; + struct net_device *dev; + unsigned int fragsize; +- int flags, offset; + int err = -ENOENT; + u8 ecn; + +@@ -414,62 +343,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + +- /* RFC5722, Section 4, amended by Errata ID : 3089 +- * When reassembling an IPv6 datagram, if +- * one or more its constituent fragments is determined to be an +- * overlapping fragment, the entire datagram (and any constituent +- * fragments) MUST be silently discarded. +- * +- * We do the same here for IPv4 (and increment an snmp counter) but +- * we do not want to drop the whole queue in response to a duplicate +- * fragment. +- */ +- +- err = -EINVAL; +- /* Find out where to put this fragment. */ + prev_tail = qp->q.fragments_tail; +- if (!prev_tail) +- ip4_frag_create_run(&qp->q, skb); /* First fragment. */ +- else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { +- /* This is the common case: skb goes to the end. */ +- /* Detect and discard overlaps. */ +- if (offset < prev_tail->ip_defrag_offset + prev_tail->len) +- goto discard_qp; +- if (offset == prev_tail->ip_defrag_offset + prev_tail->len) +- ip4_frag_append_to_last_run(&qp->q, skb); +- else +- ip4_frag_create_run(&qp->q, skb); +- } else { +- /* Binary search. Note that skb can become the first fragment, +- * but not the last (covered above). +- */ +- rbn = &qp->q.rb_fragments.rb_node; +- do { +- parent = *rbn; +- skb1 = rb_to_skb(parent); +- skb1_run_end = skb1->ip_defrag_offset + +- FRAG_CB(skb1)->frag_run_len; +- if (end <= skb1->ip_defrag_offset) +- rbn = &parent->rb_left; +- else if (offset >= skb1_run_end) +- rbn = &parent->rb_right; +- else if (offset >= skb1->ip_defrag_offset && +- end <= skb1_run_end) +- goto err; /* No new data, potential duplicate */ +- else +- goto discard_qp; /* Found an overlap */ +- } while (*rbn); +- /* Here we have parent properly set, and rbn pointing to +- * one of its NULL left/right children. Insert skb. +- */ +- ip4_frag_init_run(skb); +- rb_link_node(&skb->rbnode, parent, rbn); +- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); +- } ++ err = inet_frag_queue_insert(&qp->q, skb, offset, end); ++ if (err) ++ goto insert_error; + + if (dev) + qp->iif = dev->ifindex; +- skb->ip_defrag_offset = offset; + + qp->q.stamp = skb->tstamp; + qp->q.meat += skb->len; +@@ -500,9 +380,16 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) + skb_dst_drop(skb); + return -EINPROGRESS; + ++insert_error: ++ if (err == IPFRAG_DUP) { ++ kfree_skb(skb); ++ return -EINVAL; ++ } ++ err = -EINVAL; ++ __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); + discard_qp: + inet_frag_kill(&qp->q); +- __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); ++ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); + err: + kfree_skb(skb); + return err; +@@ -514,13 +401,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, + { + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct iphdr *iph; +- struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); +- struct sk_buff **nextp; /* To build frag_list. */ +- struct rb_node *rbn; +- int len; +- int ihlen; +- int delta; +- int err; ++ void *reasm_data; ++ int len, err; + u8 ecn; + + ipq_kill(qp); +@@ -530,117 +412,23 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, + err = -EINVAL; + goto out_fail; + } +- /* Make the one we just received the head. */ +- if (head != skb) { +- fp = skb_clone(skb, GFP_ATOMIC); +- if (!fp) +- goto out_nomem; +- FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; +- if (RB_EMPTY_NODE(&skb->rbnode)) +- FRAG_CB(prev_tail)->next_frag = fp; +- else +- rb_replace_node(&skb->rbnode, &fp->rbnode, +- &qp->q.rb_fragments); +- if (qp->q.fragments_tail == skb) +- qp->q.fragments_tail = fp; +- skb_morph(skb, head); +- FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; +- rb_replace_node(&head->rbnode, &skb->rbnode, +- &qp->q.rb_fragments); +- consume_skb(head); +- head = skb; +- } +- +- WARN_ON(head->ip_defrag_offset != 0); + +- /* Allocate a new buffer for the datagram. */ +- ihlen = ip_hdrlen(head); +- len = ihlen + qp->q.len; ++ /* Make the one we just received the head. */ ++ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); ++ if (!reasm_data) ++ goto out_nomem; + ++ len = ip_hdrlen(skb) + qp->q.len; + err = -E2BIG; + if (len > 65535) + goto out_oversize; + +- delta = - head->truesize; +- +- /* Head of list must not be cloned. */ +- if (skb_unclone(head, GFP_ATOMIC)) +- goto out_nomem; +- +- delta += head->truesize; +- if (delta) +- add_frag_mem_limit(qp->q.net, delta); +- +- /* If the first fragment is fragmented itself, we split +- * it to two chunks: the first with data and paged part +- * and the second, holding only fragments. */ +- if (skb_has_frag_list(head)) { +- struct sk_buff *clone; +- int i, plen = 0; +- +- clone = alloc_skb(0, GFP_ATOMIC); +- if (!clone) +- goto out_nomem; +- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; +- skb_frag_list_init(head); +- for (i = 0; i < skb_shinfo(head)->nr_frags; i++) +- plen += skb_frag_size(&skb_shinfo(head)->frags[i]); +- clone->len = clone->data_len = head->data_len - plen; +- head->truesize += clone->truesize; +- clone->csum = 0; +- clone->ip_summed = head->ip_summed; +- add_frag_mem_limit(qp->q.net, clone->truesize); +- skb_shinfo(head)->frag_list = clone; +- nextp = &clone->next; +- } else { +- nextp = &skb_shinfo(head)->frag_list; +- } ++ inet_frag_reasm_finish(&qp->q, skb, reasm_data); + +- skb_push(head, head->data - skb_network_header(head)); ++ skb->dev = dev; ++ IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); + +- /* Traverse the tree in order, to build frag_list. */ +- fp = FRAG_CB(head)->next_frag; +- rbn = rb_next(&head->rbnode); +- rb_erase(&head->rbnode, &qp->q.rb_fragments); +- while (rbn || fp) { +- /* fp points to the next sk_buff in the current run; +- * rbn points to the next run. +- */ +- /* Go through the current run. */ +- while (fp) { +- *nextp = fp; +- nextp = &fp->next; +- fp->prev = NULL; +- memset(&fp->rbnode, 0, sizeof(fp->rbnode)); +- fp->sk = NULL; +- head->data_len += fp->len; +- head->len += fp->len; +- if (head->ip_summed != fp->ip_summed) +- head->ip_summed = CHECKSUM_NONE; +- else if (head->ip_summed == CHECKSUM_COMPLETE) +- head->csum = csum_add(head->csum, fp->csum); +- head->truesize += fp->truesize; +- fp = FRAG_CB(fp)->next_frag; +- } +- /* Move to the next run. */ +- if (rbn) { +- struct rb_node *rbnext = rb_next(rbn); +- +- fp = rb_to_skb(rbn); +- rb_erase(rbn, &qp->q.rb_fragments); +- rbn = rbnext; +- } +- } +- sub_frag_mem_limit(qp->q.net, head->truesize); +- +- *nextp = NULL; +- head->next = NULL; +- head->prev = NULL; +- head->dev = dev; +- head->tstamp = qp->q.stamp; +- IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); +- +- iph = ip_hdr(head); ++ iph = ip_hdr(skb); + iph->tot_len = htons(len); + iph->tos |= ecn; + +@@ -653,7 +441,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, + * from one very small df-fragment and one large non-df frag. + */ + if (qp->max_df_size == qp->q.max_size) { +- IPCB(head)->flags |= IPSKB_FRAG_PMTU; ++ IPCB(skb)->flags |= IPSKB_FRAG_PMTU; + iph->frag_off = htons(IP_DF); + } else { + iph->frag_off = 0; +@@ -751,28 +539,6 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) + } + EXPORT_SYMBOL(ip_check_defrag); + +-unsigned int inet_frag_rbtree_purge(struct rb_root *root) +-{ +- struct rb_node *p = rb_first(root); +- unsigned int sum = 0; +- +- while (p) { +- struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); +- +- p = rb_next(p); +- rb_erase(&skb->rbnode, root); +- while (skb) { +- struct sk_buff *next = FRAG_CB(skb)->next_frag; +- +- sum += skb->truesize; +- kfree_skb(skb); +- skb = next; +- } +- } +- return sum; +-} +-EXPORT_SYMBOL(inet_frag_rbtree_purge); +- + #ifdef CONFIG_SYSCTL + static int dist_min; + +-- +2.19.1 + diff --git a/queue-4.19/net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch b/queue-4.19/net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch new file mode 100644 index 00000000000..0d802c4bda2 --- /dev/null +++ b/queue-4.19/net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch @@ -0,0 +1,407 @@ +From a4875906060ceefdd345989aed28133d070053d3 Mon Sep 17 00:00:00 2001 +From: Peter Oskolkov +Date: Mon, 8 Apr 2019 17:10:04 -0700 +Subject: net: IP6 defrag: use rbtrees for IPv6 defrag + +[ Upstream commit d4289fcc9b16b89619ee1c54f829e05e56de8b9a ] + +Currently, IPv6 defragmentation code drops non-last fragments that +are smaller than 1280 bytes: see +commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu") + +This behavior is not specified in IPv6 RFCs and appears to break +compatibility with some IPv6 implemenations, as reported here: +https://www.spinics.net/lists/netdev/msg543846.html + +This patch re-uses common IP defragmentation queueing and reassembly +code in IPv6, removing the 1280 byte restriction. + +Signed-off-by: Peter Oskolkov +Reported-by: Tom Herbert +Cc: Eric Dumazet +Cc: Florian Westphal +Signed-off-by: David S. Miller +Acked-by: David S. Miller +Signed-off-by: Sasha Levin +--- + include/net/ipv6_frag.h | 11 +- + net/ipv6/reassembly.c | 233 +++++++++++----------------------------- + 2 files changed, 71 insertions(+), 173 deletions(-) + +diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h +index 6ced1e6899b6..28aa9b30aece 100644 +--- a/include/net/ipv6_frag.h ++++ b/include/net/ipv6_frag.h +@@ -82,8 +82,15 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + + /* Don't send error if the first segment did not arrive. */ +- head = fq->q.fragments; +- if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) ++ if (!(fq->q.flags & INET_FRAG_FIRST_IN)) ++ goto out; ++ ++ /* sk_buff::dev and sk_buff::rbnode are unionized. So we ++ * pull the head out of the tree in order to be able to ++ * deal with head->dev. ++ */ ++ head = inet_frag_pull_head(&fq->q); ++ if (!head) + goto out; + + head->dev = dev; +diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c +index 7c943392c128..642f9f53b01d 100644 +--- a/net/ipv6/reassembly.c ++++ b/net/ipv6/reassembly.c +@@ -69,8 +69,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) + + static struct inet_frags ip6_frags; + +-static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, +- struct net_device *dev); ++static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, ++ struct sk_buff *prev_tail, struct net_device *dev); + + static void ip6_frag_expire(struct timer_list *t) + { +@@ -111,21 +111,26 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, int nhoff, + u32 *prob_offset) + { +- struct sk_buff *prev, *next; +- struct net_device *dev; +- int offset, end, fragsize; + struct net *net = dev_net(skb_dst(skb)->dev); ++ int offset, end, fragsize; ++ struct sk_buff *prev_tail; ++ struct net_device *dev; ++ int err = -ENOENT; + u8 ecn; + + if (fq->q.flags & INET_FRAG_COMPLETE) + goto err; + ++ err = -EINVAL; + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); ++ /* note that if prob_offset is set, the skb is freed elsewhere, ++ * we do not free it here. ++ */ + return -1; + } + +@@ -170,62 +175,27 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, + if (end == offset) + goto err; + ++ err = -ENOMEM; + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) + goto err; + +- if (pskb_trim_rcsum(skb, end - offset)) +- goto err; +- +- /* Find out which fragments are in front and at the back of us +- * in the chain of fragments so far. We must know where to put +- * this fragment, right? +- */ +- prev = fq->q.fragments_tail; +- if (!prev || prev->ip_defrag_offset < offset) { +- next = NULL; +- goto found; +- } +- prev = NULL; +- for (next = fq->q.fragments; next != NULL; next = next->next) { +- if (next->ip_defrag_offset >= offset) +- break; /* bingo! */ +- prev = next; +- } +- +-found: +- /* RFC5722, Section 4, amended by Errata ID : 3089 +- * When reassembling an IPv6 datagram, if +- * one or more its constituent fragments is determined to be an +- * overlapping fragment, the entire datagram (and any constituent +- * fragments) MUST be silently discarded. +- */ +- +- /* Check for overlap with preceding fragment. */ +- if (prev && +- (prev->ip_defrag_offset + prev->len) > offset) +- goto discard_fq; +- +- /* Look for overlap with succeeding segment. */ +- if (next && next->ip_defrag_offset < end) ++ err = pskb_trim_rcsum(skb, end - offset); ++ if (err) + goto discard_fq; + +- /* Note : skb->ip_defrag_offset and skb->dev share the same location */ ++ /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; +- if (dev) +- fq->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); +- skb->ip_defrag_offset = offset; + +- /* Insert this fragment in the chain of fragments. */ +- skb->next = next; +- if (!next) +- fq->q.fragments_tail = skb; +- if (prev) +- prev->next = skb; +- else +- fq->q.fragments = skb; ++ prev_tail = fq->q.fragments_tail; ++ err = inet_frag_queue_insert(&fq->q, skb, offset, end); ++ if (err) ++ goto insert_error; ++ ++ if (dev) ++ fq->iif = dev->ifindex; + + fq->q.stamp = skb->tstamp; + fq->q.meat += skb->len; +@@ -246,44 +216,48 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, + + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { +- int res; + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; +- res = ip6_frag_reasm(fq, prev, dev); ++ err = ip6_frag_reasm(fq, skb, prev_tail, dev); + skb->_skb_refdst = orefdst; +- return res; ++ return err; + } + + skb_dst_drop(skb); +- return -1; ++ return -EINPROGRESS; + ++insert_error: ++ if (err == IPFRAG_DUP) { ++ kfree_skb(skb); ++ return -EINVAL; ++ } ++ err = -EINVAL; ++ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_REASM_OVERLAPS); + discard_fq: + inet_frag_kill(&fq->q); +-err: + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS); ++err: + kfree_skb(skb); +- return -1; ++ return err; + } + + /* + * Check if this packet is complete. +- * Returns NULL on failure by any reason, and pointer +- * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +-static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, +- struct net_device *dev) ++static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, ++ struct sk_buff *prev_tail, struct net_device *dev) + { + struct net *net = container_of(fq->q.net, struct net, ipv6.frags); +- struct sk_buff *fp, *head = fq->q.fragments; +- int payload_len, delta; + unsigned int nhoff; +- int sum_truesize; ++ void *reasm_data; ++ int payload_len; + u8 ecn; + + inet_frag_kill(&fq->q); +@@ -292,121 +266,40 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, + if (unlikely(ecn == 0xff)) + goto out_fail; + +- /* Make the one we just received the head. */ +- if (prev) { +- head = prev->next; +- fp = skb_clone(head, GFP_ATOMIC); +- +- if (!fp) +- goto out_oom; +- +- fp->next = head->next; +- if (!fp->next) +- fq->q.fragments_tail = fp; +- prev->next = fp; +- +- skb_morph(head, fq->q.fragments); +- head->next = fq->q.fragments->next; +- +- consume_skb(fq->q.fragments); +- fq->q.fragments = head; +- } +- +- WARN_ON(head == NULL); +- WARN_ON(head->ip_defrag_offset != 0); ++ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); ++ if (!reasm_data) ++ goto out_oom; + +- /* Unfragmented part is taken from the first segment. */ +- payload_len = ((head->data - skb_network_header(head)) - ++ payload_len = ((skb->data - skb_network_header(skb)) - + sizeof(struct ipv6hdr) + fq->q.len - + sizeof(struct frag_hdr)); + if (payload_len > IPV6_MAXPLEN) + goto out_oversize; + +- delta = - head->truesize; +- +- /* Head of list must not be cloned. */ +- if (skb_unclone(head, GFP_ATOMIC)) +- goto out_oom; +- +- delta += head->truesize; +- if (delta) +- add_frag_mem_limit(fq->q.net, delta); +- +- /* If the first fragment is fragmented itself, we split +- * it to two chunks: the first with data and paged part +- * and the second, holding only fragments. */ +- if (skb_has_frag_list(head)) { +- struct sk_buff *clone; +- int i, plen = 0; +- +- clone = alloc_skb(0, GFP_ATOMIC); +- if (!clone) +- goto out_oom; +- clone->next = head->next; +- head->next = clone; +- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; +- skb_frag_list_init(head); +- for (i = 0; i < skb_shinfo(head)->nr_frags; i++) +- plen += skb_frag_size(&skb_shinfo(head)->frags[i]); +- clone->len = clone->data_len = head->data_len - plen; +- head->data_len -= clone->len; +- head->len -= clone->len; +- clone->csum = 0; +- clone->ip_summed = head->ip_summed; +- add_frag_mem_limit(fq->q.net, clone->truesize); +- } +- + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + nhoff = fq->nhoffset; +- skb_network_header(head)[nhoff] = skb_transport_header(head)[0]; +- memmove(head->head + sizeof(struct frag_hdr), head->head, +- (head->data - head->head) - sizeof(struct frag_hdr)); +- if (skb_mac_header_was_set(head)) +- head->mac_header += sizeof(struct frag_hdr); +- head->network_header += sizeof(struct frag_hdr); +- +- skb_reset_transport_header(head); +- skb_push(head, head->data - skb_network_header(head)); +- +- sum_truesize = head->truesize; +- for (fp = head->next; fp;) { +- bool headstolen; +- int delta; +- struct sk_buff *next = fp->next; +- +- sum_truesize += fp->truesize; +- if (head->ip_summed != fp->ip_summed) +- head->ip_summed = CHECKSUM_NONE; +- else if (head->ip_summed == CHECKSUM_COMPLETE) +- head->csum = csum_add(head->csum, fp->csum); +- +- if (skb_try_coalesce(head, fp, &headstolen, &delta)) { +- kfree_skb_partial(fp, headstolen); +- } else { +- fp->sk = NULL; +- if (!skb_shinfo(head)->frag_list) +- skb_shinfo(head)->frag_list = fp; +- head->data_len += fp->len; +- head->len += fp->len; +- head->truesize += fp->truesize; +- } +- fp = next; +- } +- sub_frag_mem_limit(fq->q.net, sum_truesize); ++ skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0]; ++ memmove(skb->head + sizeof(struct frag_hdr), skb->head, ++ (skb->data - skb->head) - sizeof(struct frag_hdr)); ++ if (skb_mac_header_was_set(skb)) ++ skb->mac_header += sizeof(struct frag_hdr); ++ skb->network_header += sizeof(struct frag_hdr); ++ ++ skb_reset_transport_header(skb); ++ ++ inet_frag_reasm_finish(&fq->q, skb, reasm_data); + +- head->next = NULL; +- head->dev = dev; +- head->tstamp = fq->q.stamp; +- ipv6_hdr(head)->payload_len = htons(payload_len); +- ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); +- IP6CB(head)->nhoff = nhoff; +- IP6CB(head)->flags |= IP6SKB_FRAGMENTED; +- IP6CB(head)->frag_max_size = fq->q.max_size; ++ skb->dev = dev; ++ ipv6_hdr(skb)->payload_len = htons(payload_len); ++ ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); ++ IP6CB(skb)->nhoff = nhoff; ++ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; ++ IP6CB(skb)->frag_max_size = fq->q.max_size; + + /* Yes, and fold redundant checksum back. 8) */ +- skb_postpush_rcsum(head, skb_network_header(head), +- skb_network_header_len(head)); ++ skb_postpush_rcsum(skb, skb_network_header(skb), ++ skb_network_header_len(skb)); + + rcu_read_lock(); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); +@@ -414,6 +307,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, + fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; + fq->q.fragments_tail = NULL; ++ fq->q.last_run_head = NULL; + return 1; + + out_oversize: +@@ -463,10 +357,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) + return 1; + } + +- if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && +- fhdr->frag_off & htons(IP6_MF)) +- goto fail_hdr; +- + iif = skb->dev ? skb->dev->ifindex : 0; + fq = fq_find(net, fhdr->identification, hdr, iif); + if (fq) { +@@ -484,6 +374,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) + if (prob_offset) { + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), + IPSTATS_MIB_INHDRERRORS); ++ /* icmpv6_param_prob() calls kfree_skb(skb) */ + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); + } + return ret; +-- +2.19.1 + diff --git a/queue-4.19/net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch b/queue-4.19/net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch new file mode 100644 index 00000000000..937e39eec5f --- /dev/null +++ b/queue-4.19/net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch @@ -0,0 +1,392 @@ +From d1647fd4aa1024ca0c3361080279a663334751b9 Mon Sep 17 00:00:00 2001 +From: Peter Oskolkov +Date: Mon, 8 Apr 2019 17:10:05 -0700 +Subject: net: IP6 defrag: use rbtrees in nf_conntrack_reasm.c + +[ Upstream commit 997dd96471641e147cb2c33ad54284000d0f5e35 ] + +Currently, IPv6 defragmentation code drops non-last fragments that +are smaller than 1280 bytes: see +commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu") + +This behavior is not specified in IPv6 RFCs and appears to break +compatibility with some IPv6 implemenations, as reported here: +https://www.spinics.net/lists/netdev/msg543846.html + +This patch re-uses common IP defragmentation queueing and reassembly +code in IP6 defragmentation in nf_conntrack, removing the 1280 byte +restriction. + +Signed-off-by: Peter Oskolkov +Reported-by: Tom Herbert +Cc: Eric Dumazet +Cc: Florian Westphal +Signed-off-by: David S. Miller +Acked-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/ipv6/netfilter/nf_conntrack_reasm.c | 260 +++++++----------------- + 1 file changed, 71 insertions(+), 189 deletions(-) + +diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c +index 043ed8eb0ab9..cb1b4772dac0 100644 +--- a/net/ipv6/netfilter/nf_conntrack_reasm.c ++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c +@@ -136,6 +136,9 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) + } + #endif + ++static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, ++ struct sk_buff *prev_tail, struct net_device *dev); ++ + static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) + { + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); +@@ -177,9 +180,10 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, + static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, + const struct frag_hdr *fhdr, int nhoff) + { +- struct sk_buff *prev, *next; + unsigned int payload_len; +- int offset, end; ++ struct net_device *dev; ++ struct sk_buff *prev; ++ int offset, end, err; + u8 ecn; + + if (fq->q.flags & INET_FRAG_COMPLETE) { +@@ -254,55 +258,18 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, + goto err; + } + +- /* Find out which fragments are in front and at the back of us +- * in the chain of fragments so far. We must know where to put +- * this fragment, right? +- */ +- prev = fq->q.fragments_tail; +- if (!prev || prev->ip_defrag_offset < offset) { +- next = NULL; +- goto found; +- } +- prev = NULL; +- for (next = fq->q.fragments; next != NULL; next = next->next) { +- if (next->ip_defrag_offset >= offset) +- break; /* bingo! */ +- prev = next; +- } +- +-found: +- /* RFC5722, Section 4: +- * When reassembling an IPv6 datagram, if +- * one or more its constituent fragments is determined to be an +- * overlapping fragment, the entire datagram (and any constituent +- * fragments, including those not yet received) MUST be silently +- * discarded. +- */ +- +- /* Check for overlap with preceding fragment. */ +- if (prev && +- (prev->ip_defrag_offset + prev->len) > offset) +- goto discard_fq; +- +- /* Look for overlap with succeeding segment. */ +- if (next && next->ip_defrag_offset < end) +- goto discard_fq; +- +- /* Note : skb->ip_defrag_offset and skb->dev share the same location */ +- if (skb->dev) +- fq->iif = skb->dev->ifindex; ++ /* Note : skb->rbnode and skb->dev share the same location. */ ++ dev = skb->dev; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); +- skb->ip_defrag_offset = offset; + +- /* Insert this fragment in the chain of fragments. */ +- skb->next = next; +- if (!next) +- fq->q.fragments_tail = skb; +- if (prev) +- prev->next = skb; +- else +- fq->q.fragments = skb; ++ prev = fq->q.fragments_tail; ++ err = inet_frag_queue_insert(&fq->q, skb, offset, end); ++ if (err) ++ goto insert_error; ++ ++ if (dev) ++ fq->iif = dev->ifindex; + + fq->q.stamp = skb->tstamp; + fq->q.meat += skb->len; +@@ -319,11 +286,25 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, + fq->q.flags |= INET_FRAG_FIRST_IN; + } + +- return 0; ++ if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && ++ fq->q.meat == fq->q.len) { ++ unsigned long orefdst = skb->_skb_refdst; ++ ++ skb->_skb_refdst = 0UL; ++ err = nf_ct_frag6_reasm(fq, skb, prev, dev); ++ skb->_skb_refdst = orefdst; ++ return err; ++ } ++ ++ skb_dst_drop(skb); ++ return -EINPROGRESS; + +-discard_fq: ++insert_error: ++ if (err == IPFRAG_DUP) ++ goto err; + inet_frag_kill(&fq->q); + err: ++ skb_dst_drop(skb); + return -EINVAL; + } + +@@ -333,147 +314,67 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. +- * +- * returns true if *prev skb has been transformed into the reassembled +- * skb, false otherwise. + */ +-static bool +-nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) ++static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, ++ struct sk_buff *prev_tail, struct net_device *dev) + { +- struct sk_buff *fp, *head = fq->q.fragments; +- int payload_len, delta; ++ void *reasm_data; ++ int payload_len; + u8 ecn; + + inet_frag_kill(&fq->q); + +- WARN_ON(head == NULL); +- WARN_ON(head->ip_defrag_offset != 0); +- + ecn = ip_frag_ecn_table[fq->ecn]; + if (unlikely(ecn == 0xff)) +- return false; ++ goto err; ++ ++ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); ++ if (!reasm_data) ++ goto err; + +- /* Unfragmented part is taken from the first segment. */ +- payload_len = ((head->data - skb_network_header(head)) - ++ payload_len = ((skb->data - skb_network_header(skb)) - + sizeof(struct ipv6hdr) + fq->q.len - + sizeof(struct frag_hdr)); + if (payload_len > IPV6_MAXPLEN) { + net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", + payload_len); +- return false; +- } +- +- delta = - head->truesize; +- +- /* Head of list must not be cloned. */ +- if (skb_unclone(head, GFP_ATOMIC)) +- return false; +- +- delta += head->truesize; +- if (delta) +- add_frag_mem_limit(fq->q.net, delta); +- +- /* If the first fragment is fragmented itself, we split +- * it to two chunks: the first with data and paged part +- * and the second, holding only fragments. */ +- if (skb_has_frag_list(head)) { +- struct sk_buff *clone; +- int i, plen = 0; +- +- clone = alloc_skb(0, GFP_ATOMIC); +- if (clone == NULL) +- return false; +- +- clone->next = head->next; +- head->next = clone; +- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; +- skb_frag_list_init(head); +- for (i = 0; i < skb_shinfo(head)->nr_frags; i++) +- plen += skb_frag_size(&skb_shinfo(head)->frags[i]); +- clone->len = clone->data_len = head->data_len - plen; +- head->data_len -= clone->len; +- head->len -= clone->len; +- clone->csum = 0; +- clone->ip_summed = head->ip_summed; +- +- add_frag_mem_limit(fq->q.net, clone->truesize); +- } +- +- /* morph head into last received skb: prev. +- * +- * This allows callers of ipv6 conntrack defrag to continue +- * to use the last skb(frag) passed into the reasm engine. +- * The last skb frag 'silently' turns into the full reassembled skb. +- * +- * Since prev is also part of q->fragments we have to clone it first. +- */ +- if (head != prev) { +- struct sk_buff *iter; +- +- fp = skb_clone(prev, GFP_ATOMIC); +- if (!fp) +- return false; +- +- fp->next = prev->next; +- +- iter = head; +- while (iter) { +- if (iter->next == prev) { +- iter->next = fp; +- break; +- } +- iter = iter->next; +- } +- +- skb_morph(prev, head); +- prev->next = head->next; +- consume_skb(head); +- head = prev; ++ goto err; + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ +- skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0]; +- memmove(head->head + sizeof(struct frag_hdr), head->head, +- (head->data - head->head) - sizeof(struct frag_hdr)); +- head->mac_header += sizeof(struct frag_hdr); +- head->network_header += sizeof(struct frag_hdr); +- +- skb_shinfo(head)->frag_list = head->next; +- skb_reset_transport_header(head); +- skb_push(head, head->data - skb_network_header(head)); +- +- for (fp = head->next; fp; fp = fp->next) { +- head->data_len += fp->len; +- head->len += fp->len; +- if (head->ip_summed != fp->ip_summed) +- head->ip_summed = CHECKSUM_NONE; +- else if (head->ip_summed == CHECKSUM_COMPLETE) +- head->csum = csum_add(head->csum, fp->csum); +- head->truesize += fp->truesize; +- fp->sk = NULL; +- } +- sub_frag_mem_limit(fq->q.net, head->truesize); ++ skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0]; ++ memmove(skb->head + sizeof(struct frag_hdr), skb->head, ++ (skb->data - skb->head) - sizeof(struct frag_hdr)); ++ skb->mac_header += sizeof(struct frag_hdr); ++ skb->network_header += sizeof(struct frag_hdr); ++ ++ skb_reset_transport_header(skb); ++ ++ inet_frag_reasm_finish(&fq->q, skb, reasm_data); + +- head->ignore_df = 1; +- head->next = NULL; +- head->dev = dev; +- head->tstamp = fq->q.stamp; +- ipv6_hdr(head)->payload_len = htons(payload_len); +- ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); +- IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; ++ skb->ignore_df = 1; ++ skb->dev = dev; ++ ipv6_hdr(skb)->payload_len = htons(payload_len); ++ ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); ++ IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; + + /* Yes, and fold redundant checksum back. 8) */ +- if (head->ip_summed == CHECKSUM_COMPLETE) +- head->csum = csum_partial(skb_network_header(head), +- skb_network_header_len(head), +- head->csum); ++ if (skb->ip_summed == CHECKSUM_COMPLETE) ++ skb->csum = csum_partial(skb_network_header(skb), ++ skb_network_header_len(skb), ++ skb->csum); + + fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; + fq->q.fragments_tail = NULL; ++ fq->q.last_run_head = NULL; + +- return true; ++ return 0; ++ ++err: ++ inet_frag_kill(&fq->q); ++ return -EINVAL; + } + + /* +@@ -542,7 +443,6 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) + int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) + { + u16 savethdr = skb->transport_header; +- struct net_device *dev = skb->dev; + int fhoff, nhoff, ret; + struct frag_hdr *fhdr; + struct frag_queue *fq; +@@ -565,10 +465,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) + hdr = ipv6_hdr(skb); + fhdr = (struct frag_hdr *)skb_transport_header(skb); + +- if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && +- fhdr->frag_off & htons(IP6_MF)) +- return -EINVAL; +- + skb_orphan(skb); + fq = fq_find(net, fhdr->identification, user, hdr, + skb->dev ? skb->dev->ifindex : 0); +@@ -580,31 +476,17 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) + spin_lock_bh(&fq->q.lock); + + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); +- if (ret < 0) { +- if (ret == -EPROTO) { +- skb->transport_header = savethdr; +- ret = 0; +- } +- goto out_unlock; ++ if (ret == -EPROTO) { ++ skb->transport_header = savethdr; ++ ret = 0; + } + + /* after queue has assumed skb ownership, only 0 or -EINPROGRESS + * must be returned. + */ +- ret = -EINPROGRESS; +- if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && +- fq->q.meat == fq->q.len) { +- unsigned long orefdst = skb->_skb_refdst; +- +- skb->_skb_refdst = 0UL; +- if (nf_ct_frag6_reasm(fq, skb, dev)) +- ret = 0; +- skb->_skb_refdst = orefdst; +- } else { +- skb_dst_drop(skb); +- } ++ if (ret) ++ ret = -EINPROGRESS; + +-out_unlock: + spin_unlock_bh(&fq->q.lock); + inet_frag_put(&fq->q); + return ret; +-- +2.19.1 + diff --git a/queue-4.19/series b/queue-4.19/series index c23d61d855a..117fa46c645 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -22,3 +22,6 @@ sch_cake-make-sure-we-can-write-the-ip-header-before-changing-dscp-bits.patch nfp-flower-replace-cfi-with-vlan-present.patch nfp-flower-remove-vlan-cfi-bit-from-push-vlan-action.patch sch_cake-simplify-logic-in-cake_select_tin.patch +net-ip-defrag-encapsulate-rbtree-defrag-code-into-ca.patch +net-ip6-defrag-use-rbtrees-for-ipv6-defrag.patch +net-ip6-defrag-use-rbtrees-in-nf_conntrack_reasm.c.patch