]>
Commit | Line | Data |
---|---|---|
82b2f3ba SL |
1 | From 3cce1561230cd3939fe4662c2a899a322b51f76a Mon Sep 17 00:00:00 2001 |
2 | From: Peter Oskolkov <posk@google.com> | |
3 | Date: Tue, 23 Apr 2019 10:48:24 -0700 | |
4 | Subject: net: IP6 defrag: use rbtrees for IPv6 defrag | |
5 | ||
6 | [ Upstream commit d4289fcc9b16b89619ee1c54f829e05e56de8b9a ] | |
7 | ||
8 | Currently, IPv6 defragmentation code drops non-last fragments that | |
9 | are smaller than 1280 bytes: see | |
10 | commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu") | |
11 | ||
12 | This behavior is not specified in IPv6 RFCs and appears to break | |
13 | compatibility with some IPv6 implementations, as reported here: | |
14 | https://www.spinics.net/lists/netdev/msg543846.html | |
15 | ||
16 | This patch re-uses common IP defragmentation queueing and reassembly | |
17 | code in IPv6, removing the 1280 byte restriction. | |
18 | ||
19 | v2: change handling of overlaps to match that of upstream. | |
20 | ||
21 | Signed-off-by: Peter Oskolkov <posk@google.com> | |
22 | Reported-by: Tom Herbert <tom@herbertland.com> | |
23 | Cc: Eric Dumazet <edumazet@google.com> | |
24 | Cc: Florian Westphal <fw@strlen.de> | |
25 | Signed-off-by: David S. Miller <davem@davemloft.net> | |
26 | Signed-off-by: Sasha Levin <sashal@kernel.org> | |
27 | --- | |
28 | include/net/ipv6_frag.h | 11 +- | |
29 | net/ipv6/reassembly.c | 249 +++++++++++----------------------------- | |
30 | 2 files changed, 77 insertions(+), 183 deletions(-) | |
31 | ||
32 | diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h | |
33 | index 6ced1e6899b6..28aa9b30aece 100644 | |
34 | --- a/include/net/ipv6_frag.h | |
35 | +++ b/include/net/ipv6_frag.h | |
36 | @@ -82,8 +82,15 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) | |
37 | __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); | |
38 | ||
39 | /* Don't send error if the first segment did not arrive. */ | |
40 | - head = fq->q.fragments; | |
41 | - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) | |
42 | + if (!(fq->q.flags & INET_FRAG_FIRST_IN)) | |
43 | + goto out; | |
44 | + | |
45 | + /* sk_buff::dev and sk_buff::rbnode are unionized. So we | |
46 | + * pull the head out of the tree in order to be able to | |
47 | + * deal with head->dev. | |
48 | + */ | |
49 | + head = inet_frag_pull_head(&fq->q); | |
50 | + if (!head) | |
51 | goto out; | |
52 | ||
53 | head->dev = dev; | |
54 | diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c | |
55 | index e5ab3b7813d6..fe797b29ca89 100644 | |
56 | --- a/net/ipv6/reassembly.c | |
57 | +++ b/net/ipv6/reassembly.c | |
58 | @@ -62,13 +62,6 @@ | |
59 | ||
60 | static const char ip6_frag_cache_name[] = "ip6-frags"; | |
61 | ||
62 | -struct ip6frag_skb_cb { | |
63 | - struct inet6_skb_parm h; | |
64 | - int offset; | |
65 | -}; | |
66 | - | |
67 | -#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) | |
68 | - | |
69 | static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) | |
70 | { | |
71 | return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); | |
72 | @@ -76,8 +69,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) | |
73 | ||
74 | static struct inet_frags ip6_frags; | |
75 | ||
76 | -static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, | |
77 | - struct net_device *dev); | |
78 | +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, | |
79 | + struct sk_buff *prev_tail, struct net_device *dev); | |
80 | ||
81 | static void ip6_frag_expire(struct timer_list *t) | |
82 | { | |
83 | @@ -118,21 +111,26 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, | |
84 | struct frag_hdr *fhdr, int nhoff, | |
85 | u32 *prob_offset) | |
86 | { | |
87 | - struct sk_buff *prev, *next; | |
88 | - struct net_device *dev; | |
89 | - int offset, end, fragsize; | |
90 | struct net *net = dev_net(skb_dst(skb)->dev); | |
91 | + int offset, end, fragsize; | |
92 | + struct sk_buff *prev_tail; | |
93 | + struct net_device *dev; | |
94 | + int err = -ENOENT; | |
95 | u8 ecn; | |
96 | ||
97 | if (fq->q.flags & INET_FRAG_COMPLETE) | |
98 | goto err; | |
99 | ||
100 | + err = -EINVAL; | |
101 | offset = ntohs(fhdr->frag_off) & ~0x7; | |
102 | end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - | |
103 | ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); | |
104 | ||
105 | if ((unsigned int)end > IPV6_MAXPLEN) { | |
106 | *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); | |
107 | + /* note that if prob_offset is set, the skb is freed elsewhere, | |
108 | + * we do not free it here. | |
109 | + */ | |
110 | return -1; | |
111 | } | |
112 | ||
113 | @@ -152,7 +150,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, | |
114 | */ | |
115 | if (end < fq->q.len || | |
116 | ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) | |
117 | - goto err; | |
118 | + goto discard_fq; | |
119 | fq->q.flags |= INET_FRAG_LAST_IN; | |
120 | fq->q.len = end; | |
121 | } else { | |
122 | @@ -169,70 +167,36 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, | |
123 | if (end > fq->q.len) { | |
124 | /* Some bits beyond end -> corruption. */ | |
125 | if (fq->q.flags & INET_FRAG_LAST_IN) | |
126 | - goto err; | |
127 | + goto discard_fq; | |
128 | fq->q.len = end; | |
129 | } | |
130 | } | |
131 | ||
132 | if (end == offset) | |
133 | - goto err; | |
134 | + goto discard_fq; | |
135 | ||
136 | + err = -ENOMEM; | |
137 | /* Point into the IP datagram 'data' part. */ | |
138 | if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) | |
139 | - goto err; | |
140 | - | |
141 | - if (pskb_trim_rcsum(skb, end - offset)) | |
142 | - goto err; | |
143 | - | |
144 | - /* Find out which fragments are in front and at the back of us | |
145 | - * in the chain of fragments so far. We must know where to put | |
146 | - * this fragment, right? | |
147 | - */ | |
148 | - prev = fq->q.fragments_tail; | |
149 | - if (!prev || FRAG6_CB(prev)->offset < offset) { | |
150 | - next = NULL; | |
151 | - goto found; | |
152 | - } | |
153 | - prev = NULL; | |
154 | - for (next = fq->q.fragments; next != NULL; next = next->next) { | |
155 | - if (FRAG6_CB(next)->offset >= offset) | |
156 | - break; /* bingo! */ | |
157 | - prev = next; | |
158 | - } | |
159 | - | |
160 | -found: | |
161 | - /* RFC5722, Section 4, amended by Errata ID : 3089 | |
162 | - * When reassembling an IPv6 datagram, if | |
163 | - * one or more its constituent fragments is determined to be an | |
164 | - * overlapping fragment, the entire datagram (and any constituent | |
165 | - * fragments) MUST be silently discarded. | |
166 | - */ | |
167 | - | |
168 | - /* Check for overlap with preceding fragment. */ | |
169 | - if (prev && | |
170 | - (FRAG6_CB(prev)->offset + prev->len) > offset) | |
171 | goto discard_fq; | |
172 | ||
173 | - /* Look for overlap with succeeding segment. */ | |
174 | - if (next && FRAG6_CB(next)->offset < end) | |
175 | + err = pskb_trim_rcsum(skb, end - offset); | |
176 | + if (err) | |
177 | goto discard_fq; | |
178 | ||
179 | - FRAG6_CB(skb)->offset = offset; | |
180 | + /* Note : skb->rbnode and skb->dev share the same location. */ | |
181 | + dev = skb->dev; | |
182 | + /* Makes sure compiler wont do silly aliasing games */ | |
183 | + barrier(); | |
184 | ||
185 | - /* Insert this fragment in the chain of fragments. */ | |
186 | - skb->next = next; | |
187 | - if (!next) | |
188 | - fq->q.fragments_tail = skb; | |
189 | - if (prev) | |
190 | - prev->next = skb; | |
191 | - else | |
192 | - fq->q.fragments = skb; | |
193 | + prev_tail = fq->q.fragments_tail; | |
194 | + err = inet_frag_queue_insert(&fq->q, skb, offset, end); | |
195 | + if (err) | |
196 | + goto insert_error; | |
197 | ||
198 | - dev = skb->dev; | |
199 | - if (dev) { | |
200 | + if (dev) | |
201 | fq->iif = dev->ifindex; | |
202 | - skb->dev = NULL; | |
203 | - } | |
204 | + | |
205 | fq->q.stamp = skb->tstamp; | |
206 | fq->q.meat += skb->len; | |
207 | fq->ecn |= ecn; | |
208 | @@ -252,44 +216,48 @@ found: | |
209 | ||
210 | if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && | |
211 | fq->q.meat == fq->q.len) { | |
212 | - int res; | |
213 | unsigned long orefdst = skb->_skb_refdst; | |
214 | ||
215 | skb->_skb_refdst = 0UL; | |
216 | - res = ip6_frag_reasm(fq, prev, dev); | |
217 | + err = ip6_frag_reasm(fq, skb, prev_tail, dev); | |
218 | skb->_skb_refdst = orefdst; | |
219 | - return res; | |
220 | + return err; | |
221 | } | |
222 | ||
223 | skb_dst_drop(skb); | |
224 | - return -1; | |
225 | + return -EINPROGRESS; | |
226 | ||
227 | +insert_error: | |
228 | + if (err == IPFRAG_DUP) { | |
229 | + kfree_skb(skb); | |
230 | + return -EINVAL; | |
231 | + } | |
232 | + err = -EINVAL; | |
233 | + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), | |
234 | + IPSTATS_MIB_REASM_OVERLAPS); | |
235 | discard_fq: | |
236 | inet_frag_kill(&fq->q); | |
237 | -err: | |
238 | __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), | |
239 | IPSTATS_MIB_REASMFAILS); | |
240 | +err: | |
241 | kfree_skb(skb); | |
242 | - return -1; | |
243 | + return err; | |
244 | } | |
245 | ||
246 | /* | |
247 | * Check if this packet is complete. | |
248 | - * Returns NULL on failure by any reason, and pointer | |
249 | - * to current nexthdr field in reassembled frame. | |
250 | * | |
251 | * It is called with locked fq, and caller must check that | |
252 | * queue is eligible for reassembly i.e. it is not COMPLETE, | |
253 | * the last and the first frames arrived and all the bits are here. | |
254 | */ | |
255 | -static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, | |
256 | - struct net_device *dev) | |
257 | +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, | |
258 | + struct sk_buff *prev_tail, struct net_device *dev) | |
259 | { | |
260 | struct net *net = container_of(fq->q.net, struct net, ipv6.frags); | |
261 | - struct sk_buff *fp, *head = fq->q.fragments; | |
262 | - int payload_len, delta; | |
263 | unsigned int nhoff; | |
264 | - int sum_truesize; | |
265 | + void *reasm_data; | |
266 | + int payload_len; | |
267 | u8 ecn; | |
268 | ||
269 | inet_frag_kill(&fq->q); | |
270 | @@ -298,120 +266,40 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, | |
271 | if (unlikely(ecn == 0xff)) | |
272 | goto out_fail; | |
273 | ||
274 | - /* Make the one we just received the head. */ | |
275 | - if (prev) { | |
276 | - head = prev->next; | |
277 | - fp = skb_clone(head, GFP_ATOMIC); | |
278 | - | |
279 | - if (!fp) | |
280 | - goto out_oom; | |
281 | - | |
282 | - fp->next = head->next; | |
283 | - if (!fp->next) | |
284 | - fq->q.fragments_tail = fp; | |
285 | - prev->next = fp; | |
286 | - | |
287 | - skb_morph(head, fq->q.fragments); | |
288 | - head->next = fq->q.fragments->next; | |
289 | - | |
290 | - consume_skb(fq->q.fragments); | |
291 | - fq->q.fragments = head; | |
292 | - } | |
293 | - | |
294 | - WARN_ON(head == NULL); | |
295 | - WARN_ON(FRAG6_CB(head)->offset != 0); | |
296 | + reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); | |
297 | + if (!reasm_data) | |
298 | + goto out_oom; | |
299 | ||
300 | - /* Unfragmented part is taken from the first segment. */ | |
301 | - payload_len = ((head->data - skb_network_header(head)) - | |
302 | + payload_len = ((skb->data - skb_network_header(skb)) - | |
303 | sizeof(struct ipv6hdr) + fq->q.len - | |
304 | sizeof(struct frag_hdr)); | |
305 | if (payload_len > IPV6_MAXPLEN) | |
306 | goto out_oversize; | |
307 | ||
308 | - delta = - head->truesize; | |
309 | - | |
310 | - /* Head of list must not be cloned. */ | |
311 | - if (skb_unclone(head, GFP_ATOMIC)) | |
312 | - goto out_oom; | |
313 | - | |
314 | - delta += head->truesize; | |
315 | - if (delta) | |
316 | - add_frag_mem_limit(fq->q.net, delta); | |
317 | - | |
318 | - /* If the first fragment is fragmented itself, we split | |
319 | - * it to two chunks: the first with data and paged part | |
320 | - * and the second, holding only fragments. */ | |
321 | - if (skb_has_frag_list(head)) { | |
322 | - struct sk_buff *clone; | |
323 | - int i, plen = 0; | |
324 | - | |
325 | - clone = alloc_skb(0, GFP_ATOMIC); | |
326 | - if (!clone) | |
327 | - goto out_oom; | |
328 | - clone->next = head->next; | |
329 | - head->next = clone; | |
330 | - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; | |
331 | - skb_frag_list_init(head); | |
332 | - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) | |
333 | - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); | |
334 | - clone->len = clone->data_len = head->data_len - plen; | |
335 | - head->data_len -= clone->len; | |
336 | - head->len -= clone->len; | |
337 | - clone->csum = 0; | |
338 | - clone->ip_summed = head->ip_summed; | |
339 | - add_frag_mem_limit(fq->q.net, clone->truesize); | |
340 | - } | |
341 | - | |
342 | /* We have to remove fragment header from datagram and to relocate | |
343 | * header in order to calculate ICV correctly. */ | |
344 | nhoff = fq->nhoffset; | |
345 | - skb_network_header(head)[nhoff] = skb_transport_header(head)[0]; | |
346 | - memmove(head->head + sizeof(struct frag_hdr), head->head, | |
347 | - (head->data - head->head) - sizeof(struct frag_hdr)); | |
348 | - if (skb_mac_header_was_set(head)) | |
349 | - head->mac_header += sizeof(struct frag_hdr); | |
350 | - head->network_header += sizeof(struct frag_hdr); | |
351 | - | |
352 | - skb_reset_transport_header(head); | |
353 | - skb_push(head, head->data - skb_network_header(head)); | |
354 | - | |
355 | - sum_truesize = head->truesize; | |
356 | - for (fp = head->next; fp;) { | |
357 | - bool headstolen; | |
358 | - int delta; | |
359 | - struct sk_buff *next = fp->next; | |
360 | - | |
361 | - sum_truesize += fp->truesize; | |
362 | - if (head->ip_summed != fp->ip_summed) | |
363 | - head->ip_summed = CHECKSUM_NONE; | |
364 | - else if (head->ip_summed == CHECKSUM_COMPLETE) | |
365 | - head->csum = csum_add(head->csum, fp->csum); | |
366 | - | |
367 | - if (skb_try_coalesce(head, fp, &headstolen, &delta)) { | |
368 | - kfree_skb_partial(fp, headstolen); | |
369 | - } else { | |
370 | - if (!skb_shinfo(head)->frag_list) | |
371 | - skb_shinfo(head)->frag_list = fp; | |
372 | - head->data_len += fp->len; | |
373 | - head->len += fp->len; | |
374 | - head->truesize += fp->truesize; | |
375 | - } | |
376 | - fp = next; | |
377 | - } | |
378 | - sub_frag_mem_limit(fq->q.net, sum_truesize); | |
379 | + skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0]; | |
380 | + memmove(skb->head + sizeof(struct frag_hdr), skb->head, | |
381 | + (skb->data - skb->head) - sizeof(struct frag_hdr)); | |
382 | + if (skb_mac_header_was_set(skb)) | |
383 | + skb->mac_header += sizeof(struct frag_hdr); | |
384 | + skb->network_header += sizeof(struct frag_hdr); | |
385 | + | |
386 | + skb_reset_transport_header(skb); | |
387 | + | |
388 | + inet_frag_reasm_finish(&fq->q, skb, reasm_data); | |
389 | ||
390 | - head->next = NULL; | |
391 | - head->dev = dev; | |
392 | - head->tstamp = fq->q.stamp; | |
393 | - ipv6_hdr(head)->payload_len = htons(payload_len); | |
394 | - ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); | |
395 | - IP6CB(head)->nhoff = nhoff; | |
396 | - IP6CB(head)->flags |= IP6SKB_FRAGMENTED; | |
397 | - IP6CB(head)->frag_max_size = fq->q.max_size; | |
398 | + skb->dev = dev; | |
399 | + ipv6_hdr(skb)->payload_len = htons(payload_len); | |
400 | + ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); | |
401 | + IP6CB(skb)->nhoff = nhoff; | |
402 | + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; | |
403 | + IP6CB(skb)->frag_max_size = fq->q.max_size; | |
404 | ||
405 | /* Yes, and fold redundant checksum back. 8) */ | |
406 | - skb_postpush_rcsum(head, skb_network_header(head), | |
407 | - skb_network_header_len(head)); | |
408 | + skb_postpush_rcsum(skb, skb_network_header(skb), | |
409 | + skb_network_header_len(skb)); | |
410 | ||
411 | rcu_read_lock(); | |
412 | __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); | |
413 | @@ -419,6 +307,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, | |
414 | fq->q.fragments = NULL; | |
415 | fq->q.rb_fragments = RB_ROOT; | |
416 | fq->q.fragments_tail = NULL; | |
417 | + fq->q.last_run_head = NULL; | |
418 | return 1; | |
419 | ||
420 | out_oversize: | |
421 | @@ -430,6 +319,7 @@ out_fail: | |
422 | rcu_read_lock(); | |
423 | __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); | |
424 | rcu_read_unlock(); | |
425 | + inet_frag_kill(&fq->q); | |
426 | return -1; | |
427 | } | |
428 | ||
429 | @@ -468,10 +358,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) | |
430 | return 1; | |
431 | } | |
432 | ||
433 | - if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && | |
434 | - fhdr->frag_off & htons(IP6_MF)) | |
435 | - goto fail_hdr; | |
436 | - | |
437 | iif = skb->dev ? skb->dev->ifindex : 0; | |
438 | fq = fq_find(net, fhdr->identification, hdr, iif); | |
439 | if (fq) { | |
440 | @@ -489,6 +375,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) | |
441 | if (prob_offset) { | |
442 | __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), | |
443 | IPSTATS_MIB_INHDRERRORS); | |
444 | + /* icmpv6_param_prob() calls kfree_skb(skb) */ | |
445 | icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); | |
446 | } | |
447 | return ret; | |
448 | -- | |
449 | 2.19.1 | |
450 |