]> git.ipfire.org Git - thirdparty/linux.git/blame - net/core/skbuff.c
qlcnic: Fix protcol type in case of inband vlan.
[thirdparty/linux.git] / net / core / skbuff.c
CommitLineData
1da177e4
LT
1/*
2 * Routines having to do with the 'struct sk_buff' memory handlers.
3 *
113aa838 4 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
1da177e4
LT
5 * Florian La Roche <rzsfl@rz.uni-sb.de>
6 *
1da177e4
LT
7 * Fixes:
8 * Alan Cox : Fixed the worst of the load
9 * balancer bugs.
10 * Dave Platt : Interrupt stacking fix.
11 * Richard Kooijman : Timestamp fixes.
12 * Alan Cox : Changed buffer format.
13 * Alan Cox : destructor hook for AF_UNIX etc.
14 * Linus Torvalds : Better skb_clone.
15 * Alan Cox : Added skb_copy.
16 * Alan Cox : Added all the changed routines Linus
17 * only put in the headers
18 * Ray VanTassle : Fixed --skb->lock in free
19 * Alan Cox : skb_copy copy arp field
20 * Andi Kleen : slabified it.
21 * Robert Olsson : Removed skb_head_pool
22 *
23 * NOTE:
24 * The __skb_ routines should be called with interrupts
25 * disabled, or you better be *real* sure that the operation is atomic
26 * with respect to whatever list is being frobbed (e.g. via lock_sock()
27 * or via disabling bottom half handlers, etc).
28 *
29 * This program is free software; you can redistribute it and/or
30 * modify it under the terms of the GNU General Public License
31 * as published by the Free Software Foundation; either version
32 * 2 of the License, or (at your option) any later version.
33 */
34
35/*
36 * The functions in this file will not compile correctly with gcc 2.4.x
37 */
38
e005d193
JP
39#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
40
1da177e4
LT
41#include <linux/module.h>
42#include <linux/types.h>
43#include <linux/kernel.h>
fe55f6d5 44#include <linux/kmemcheck.h>
1da177e4
LT
45#include <linux/mm.h>
46#include <linux/interrupt.h>
47#include <linux/in.h>
48#include <linux/inet.h>
49#include <linux/slab.h>
50#include <linux/netdevice.h>
51#ifdef CONFIG_NET_CLS_ACT
52#include <net/pkt_sched.h>
53#endif
54#include <linux/string.h>
55#include <linux/skbuff.h>
9c55e01c 56#include <linux/splice.h>
1da177e4
LT
57#include <linux/cache.h>
58#include <linux/rtnetlink.h>
59#include <linux/init.h>
716ea3a7 60#include <linux/scatterlist.h>
ac45f602 61#include <linux/errqueue.h>
268bb0ce 62#include <linux/prefetch.h>
1da177e4
LT
63
64#include <net/protocol.h>
65#include <net/dst.h>
66#include <net/sock.h>
67#include <net/checksum.h>
68#include <net/xfrm.h>
69
70#include <asm/uaccess.h>
ad8d75ff 71#include <trace/events/skb.h>
51c56b00 72#include <linux/highmem.h>
a1f8e7f7 73
d7e8883c 74struct kmem_cache *skbuff_head_cache __read_mostly;
e18b890b 75static struct kmem_cache *skbuff_fclone_cache __read_mostly;
1da177e4 76
9c55e01c
JA
77static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
78 struct pipe_buffer *buf)
79{
8b9d3728 80 put_page(buf->page);
9c55e01c
JA
81}
82
83static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
84 struct pipe_buffer *buf)
85{
8b9d3728 86 get_page(buf->page);
9c55e01c
JA
87}
88
/*
 * Pipe ->steal hook: always returns 1, regardless of @pipe and @buf.
 * NOTE(review): presumably a non-zero return tells the pipe layer the page
 * cannot be stolen - confirm against the pipe_buf_operations contract.
 */
static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}
94
95
96/* Pipe buffer operations for a socket. */
28dfef8f 97static const struct pipe_buf_operations sock_pipe_buf_ops = {
9c55e01c
JA
98 .can_merge = 0,
99 .map = generic_pipe_buf_map,
100 .unmap = generic_pipe_buf_unmap,
101 .confirm = generic_pipe_buf_confirm,
102 .release = sock_pipe_buf_release,
103 .steal = sock_pipe_buf_steal,
104 .get = sock_pipe_buf_get,
105};
106
1da177e4
LT
107/*
108 * Keep out-of-line to prevent kernel bloat.
109 * __builtin_return_address is not used because it is not always
110 * reliable.
111 */
112
113/**
114 * skb_over_panic - private function
115 * @skb: buffer
116 * @sz: size
117 * @here: address
118 *
119 * Out of line support code for skb_put(). Not user callable.
120 */
ccb7c773 121static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
1da177e4 122{
e005d193
JP
123 pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
124 __func__, here, skb->len, sz, skb->head, skb->data,
125 (unsigned long)skb->tail, (unsigned long)skb->end,
126 skb->dev ? skb->dev->name : "<NULL>");
1da177e4
LT
127 BUG();
128}
129
130/**
131 * skb_under_panic - private function
132 * @skb: buffer
133 * @sz: size
134 * @here: address
135 *
136 * Out of line support code for skb_push(). Not user callable.
137 */
138
ccb7c773 139static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
1da177e4 140{
e005d193
JP
141 pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
142 __func__, here, skb->len, sz, skb->head, skb->data,
143 (unsigned long)skb->tail, (unsigned long)skb->end,
144 skb->dev ? skb->dev->name : "<NULL>");
1da177e4
LT
145 BUG();
146}
147
148/* Allocate a new skbuff. We do this ourselves so we can fill in a few
149 * 'private' fields and also do memory statistics to find all the
150 * [BEEP] leaks.
151 *
152 */
153
154/**
d179cd12 155 * __alloc_skb - allocate a network buffer
1da177e4
LT
156 * @size: size to allocate
157 * @gfp_mask: allocation mask
c83c2486
RD
158 * @fclone: allocate from fclone cache instead of head cache
159 * and allocate a cloned (child) skb
b30973f8 160 * @node: numa node to allocate memory on
1da177e4
LT
161 *
162 * Allocate a new &sk_buff. The returned buffer has no headroom and a
163 * tail room of size bytes. The object has a reference count of one.
164 * The return is the buffer. On a failure the return is %NULL.
165 *
166 * Buffers may only be allocated from interrupts using a @gfp_mask of
167 * %GFP_ATOMIC.
168 */
dd0fc66f 169struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
b30973f8 170 int fclone, int node)
1da177e4 171{
e18b890b 172 struct kmem_cache *cache;
4947d3ef 173 struct skb_shared_info *shinfo;
1da177e4
LT
174 struct sk_buff *skb;
175 u8 *data;
176
8798b3fb
HX
177 cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
178
1da177e4 179 /* Get the HEAD */
b30973f8 180 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
1da177e4
LT
181 if (!skb)
182 goto out;
ec7d2f2c 183 prefetchw(skb);
1da177e4 184
87fb4b7b
ED
185 /* We do our best to align skb_shared_info on a separate cache
186 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
187 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
188 * Both skb->head and skb_shared_info are cache line aligned.
189 */
bc417e30 190 size = SKB_DATA_ALIGN(size);
87fb4b7b
ED
191 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
192 data = kmalloc_node_track_caller(size, gfp_mask, node);
1da177e4
LT
193 if (!data)
194 goto nodata;
87fb4b7b
ED
195 /* kmalloc(size) might give us more room than requested.
196 * Put skb_shared_info exactly at the end of allocated zone,
197 * to allow max possible filling before reallocation.
198 */
199 size = SKB_WITH_OVERHEAD(ksize(data));
ec7d2f2c 200 prefetchw(data + size);
1da177e4 201
ca0605a7 202 /*
c8005785
JB
203 * Only clear those fields we need to clear, not those that we will
204 * actually initialise below. Hence, don't put any more fields after
205 * the tail pointer in struct sk_buff!
ca0605a7
ACM
206 */
207 memset(skb, 0, offsetof(struct sk_buff, tail));
87fb4b7b
ED
208 /* Account for allocated memory : skb + skb->head */
209 skb->truesize = SKB_TRUESIZE(size);
1da177e4
LT
210 atomic_set(&skb->users, 1);
211 skb->head = data;
212 skb->data = data;
27a884dc 213 skb_reset_tail_pointer(skb);
4305b541 214 skb->end = skb->tail + size;
19633e12
SH
215#ifdef NET_SKBUFF_DATA_USES_OFFSET
216 skb->mac_header = ~0U;
217#endif
218
4947d3ef
BL
219 /* make sure we initialize shinfo sequentially */
220 shinfo = skb_shinfo(skb);
ec7d2f2c 221 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
4947d3ef 222 atomic_set(&shinfo->dataref, 1);
c2aa3665 223 kmemcheck_annotate_variable(shinfo->destructor_arg);
4947d3ef 224
d179cd12
DM
225 if (fclone) {
226 struct sk_buff *child = skb + 1;
227 atomic_t *fclone_ref = (atomic_t *) (child + 1);
1da177e4 228
fe55f6d5
VN
229 kmemcheck_annotate_bitfield(child, flags1);
230 kmemcheck_annotate_bitfield(child, flags2);
d179cd12
DM
231 skb->fclone = SKB_FCLONE_ORIG;
232 atomic_set(fclone_ref, 1);
233
234 child->fclone = SKB_FCLONE_UNAVAILABLE;
235 }
1da177e4
LT
236out:
237 return skb;
238nodata:
8798b3fb 239 kmem_cache_free(cache, skb);
1da177e4
LT
240 skb = NULL;
241 goto out;
1da177e4 242}
b4ac530f 243EXPORT_SYMBOL(__alloc_skb);
1da177e4 244
b2b5ce9d
ED
245/**
246 * build_skb - build a network buffer
247 * @data: data buffer provided by caller
d3836f21 248 * @frag_size: size of fragment, or 0 if head was kmalloced
b2b5ce9d
ED
249 *
250 * Allocate a new &sk_buff. Caller provides space holding head and
251 * skb_shared_info. @data must have been allocated by kmalloc()
252 * The return is the new skb buffer.
253 * On a failure the return is %NULL, and @data is not freed.
254 * Notes :
255 * Before IO, driver allocates only data buffer where NIC put incoming frame
256 * Driver should add room at head (NET_SKB_PAD) and
257 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
258 * After IO, driver calls build_skb(), to allocate sk_buff and populate it
259 * before giving packet to stack.
260 * RX rings only contains data buffers, not full skbs.
261 */
d3836f21 262struct sk_buff *build_skb(void *data, unsigned int frag_size)
b2b5ce9d
ED
263{
264 struct skb_shared_info *shinfo;
265 struct sk_buff *skb;
d3836f21 266 unsigned int size = frag_size ? : ksize(data);
b2b5ce9d
ED
267
268 skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
269 if (!skb)
270 return NULL;
271
d3836f21 272 size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
b2b5ce9d
ED
273
274 memset(skb, 0, offsetof(struct sk_buff, tail));
275 skb->truesize = SKB_TRUESIZE(size);
d3836f21 276 skb->head_frag = frag_size != 0;
b2b5ce9d
ED
277 atomic_set(&skb->users, 1);
278 skb->head = data;
279 skb->data = data;
280 skb_reset_tail_pointer(skb);
281 skb->end = skb->tail + size;
282#ifdef NET_SKBUFF_DATA_USES_OFFSET
283 skb->mac_header = ~0U;
284#endif
285
286 /* make sure we initialize shinfo sequentially */
287 shinfo = skb_shinfo(skb);
288 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
289 atomic_set(&shinfo->dataref, 1);
290 kmemcheck_annotate_variable(shinfo->destructor_arg);
291
292 return skb;
293}
294EXPORT_SYMBOL(build_skb);
295
a1c7fff7
ED
296struct netdev_alloc_cache {
297 struct page *page;
298 unsigned int offset;
299};
300static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
301
6f532612
ED
302/**
303 * netdev_alloc_frag - allocate a page fragment
304 * @fragsz: fragment size
305 *
306 * Allocates a frag from a page for receive buffer.
307 * Uses GFP_ATOMIC allocations.
308 */
309void *netdev_alloc_frag(unsigned int fragsz)
310{
311 struct netdev_alloc_cache *nc;
312 void *data = NULL;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 nc = &__get_cpu_var(netdev_alloc_cache);
317 if (unlikely(!nc->page)) {
318refill:
319 nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD);
320 nc->offset = 0;
321 }
322 if (likely(nc->page)) {
323 if (nc->offset + fragsz > PAGE_SIZE) {
324 put_page(nc->page);
325 goto refill;
326 }
327 data = page_address(nc->page) + nc->offset;
328 nc->offset += fragsz;
329 get_page(nc->page);
330 }
331 local_irq_restore(flags);
332 return data;
333}
334EXPORT_SYMBOL(netdev_alloc_frag);
335
8af27456
CH
336/**
337 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
338 * @dev: network device to receive on
339 * @length: length to allocate
340 * @gfp_mask: get_free_pages mask, passed to alloc_skb
341 *
342 * Allocate a new &sk_buff and assign it a usage count of one. The
343 * buffer has unspecified headroom built in. Users should allocate
344 * the headroom they think they need without accounting for the
345 * built in space. The built in space is used for optimisations.
346 *
347 * %NULL is returned if there is no free memory.
348 */
349struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
6f532612 350 unsigned int length, gfp_t gfp_mask)
8af27456 351{
6f532612 352 struct sk_buff *skb = NULL;
a1c7fff7
ED
353 unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
354 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
355
356 if (fragsz <= PAGE_SIZE && !(gfp_mask & __GFP_WAIT)) {
6f532612 357 void *data = netdev_alloc_frag(fragsz);
a1c7fff7 358
6f532612
ED
359 if (likely(data)) {
360 skb = build_skb(data, fragsz);
361 if (unlikely(!skb))
362 put_page(virt_to_head_page(data));
a1c7fff7 363 }
a1c7fff7
ED
364 } else {
365 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
366 }
7b2e497a 367 if (likely(skb)) {
8af27456 368 skb_reserve(skb, NET_SKB_PAD);
7b2e497a
CH
369 skb->dev = dev;
370 }
8af27456
CH
371 return skb;
372}
b4ac530f 373EXPORT_SYMBOL(__netdev_alloc_skb);
1da177e4 374
654bed16 375void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
50269e19 376 int size, unsigned int truesize)
654bed16
PZ
377{
378 skb_fill_page_desc(skb, i, page, off, size);
379 skb->len += size;
380 skb->data_len += size;
50269e19 381 skb->truesize += truesize;
654bed16
PZ
382}
383EXPORT_SYMBOL(skb_add_rx_frag);
384
27b437c8 385static void skb_drop_list(struct sk_buff **listp)
1da177e4 386{
27b437c8 387 struct sk_buff *list = *listp;
1da177e4 388
27b437c8 389 *listp = NULL;
1da177e4
LT
390
391 do {
392 struct sk_buff *this = list;
393 list = list->next;
394 kfree_skb(this);
395 } while (list);
396}
397
27b437c8
HX
398static inline void skb_drop_fraglist(struct sk_buff *skb)
399{
400 skb_drop_list(&skb_shinfo(skb)->frag_list);
401}
402
1da177e4
LT
403static void skb_clone_fraglist(struct sk_buff *skb)
404{
405 struct sk_buff *list;
406
fbb398a8 407 skb_walk_frags(skb, list)
1da177e4
LT
408 skb_get(list);
409}
410
d3836f21
ED
411static void skb_free_head(struct sk_buff *skb)
412{
413 if (skb->head_frag)
414 put_page(virt_to_head_page(skb->head));
415 else
416 kfree(skb->head);
417}
418
5bba1712 419static void skb_release_data(struct sk_buff *skb)
1da177e4
LT
420{
421 if (!skb->cloned ||
422 !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
423 &skb_shinfo(skb)->dataref)) {
424 if (skb_shinfo(skb)->nr_frags) {
425 int i;
426 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
ea2ab693 427 skb_frag_unref(skb, i);
1da177e4
LT
428 }
429
a6686f2f
SM
430 /*
431 * If skb buf is from userspace, we need to notify the caller
432 * the lower device DMA has done;
433 */
434 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
435 struct ubuf_info *uarg;
436
437 uarg = skb_shinfo(skb)->destructor_arg;
438 if (uarg->callback)
439 uarg->callback(uarg);
440 }
441
21dc3301 442 if (skb_has_frag_list(skb))
1da177e4
LT
443 skb_drop_fraglist(skb);
444
d3836f21 445 skb_free_head(skb);
1da177e4
LT
446 }
447}
448
449/*
450 * Free an skbuff by memory without cleaning the state.
451 */
2d4baff8 452static void kfree_skbmem(struct sk_buff *skb)
1da177e4 453{
d179cd12
DM
454 struct sk_buff *other;
455 atomic_t *fclone_ref;
456
d179cd12
DM
457 switch (skb->fclone) {
458 case SKB_FCLONE_UNAVAILABLE:
459 kmem_cache_free(skbuff_head_cache, skb);
460 break;
461
462 case SKB_FCLONE_ORIG:
463 fclone_ref = (atomic_t *) (skb + 2);
464 if (atomic_dec_and_test(fclone_ref))
465 kmem_cache_free(skbuff_fclone_cache, skb);
466 break;
467
468 case SKB_FCLONE_CLONE:
469 fclone_ref = (atomic_t *) (skb + 1);
470 other = skb - 1;
471
472 /* The clone portion is available for
473 * fast-cloning again.
474 */
475 skb->fclone = SKB_FCLONE_UNAVAILABLE;
476
477 if (atomic_dec_and_test(fclone_ref))
478 kmem_cache_free(skbuff_fclone_cache, other);
479 break;
3ff50b79 480 }
1da177e4
LT
481}
482
04a4bb55 483static void skb_release_head_state(struct sk_buff *skb)
1da177e4 484{
adf30907 485 skb_dst_drop(skb);
1da177e4
LT
486#ifdef CONFIG_XFRM
487 secpath_put(skb->sp);
488#endif
9c2b3328
SH
489 if (skb->destructor) {
490 WARN_ON(in_irq());
1da177e4
LT
491 skb->destructor(skb);
492 }
a3bf7ae9 493#if IS_ENABLED(CONFIG_NF_CONNTRACK)
5f79e0f9 494 nf_conntrack_put(skb->nfct);
2fc72c7b
KK
495#endif
496#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
9fb9cbb1
YK
497 nf_conntrack_put_reasm(skb->nfct_reasm);
498#endif
1da177e4
LT
499#ifdef CONFIG_BRIDGE_NETFILTER
500 nf_bridge_put(skb->nf_bridge);
501#endif
1da177e4
LT
502/* XXX: IS this still necessary? - JHS */
503#ifdef CONFIG_NET_SCHED
504 skb->tc_index = 0;
505#ifdef CONFIG_NET_CLS_ACT
506 skb->tc_verd = 0;
1da177e4
LT
507#endif
508#endif
04a4bb55
LB
509}
510
/*
 * Free everything but the sk_buff shell: first the head state, then the
 * data area.
 */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);

	skb_release_data(skb);
}
517
/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */
void __kfree_skb(struct sk_buff *skb)
{
	/* Attached state and data go first ... */
	skb_release_all(skb);

	/* ... then the sk_buff memory itself. */
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
1da177e4 533