]> git.ipfire.org Git - thirdparty/linux.git/blame - net/core/skbuff.c
net: Store virtual address instead of page in netdev_alloc_cache
[thirdparty/linux.git] / net / core / skbuff.c
CommitLineData
1da177e4
LT
1/*
2 * Routines having to do with the 'struct sk_buff' memory handlers.
3 *
113aa838 4 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
1da177e4
LT
5 * Florian La Roche <rzsfl@rz.uni-sb.de>
6 *
1da177e4
LT
7 * Fixes:
8 * Alan Cox : Fixed the worst of the load
9 * balancer bugs.
10 * Dave Platt : Interrupt stacking fix.
11 * Richard Kooijman : Timestamp fixes.
12 * Alan Cox : Changed buffer format.
13 * Alan Cox : destructor hook for AF_UNIX etc.
14 * Linus Torvalds : Better skb_clone.
15 * Alan Cox : Added skb_copy.
16 * Alan Cox : Added all the changed routines Linus
17 * only put in the headers
18 * Ray VanTassle : Fixed --skb->lock in free
19 * Alan Cox : skb_copy copy arp field
20 * Andi Kleen : slabified it.
21 * Robert Olsson : Removed skb_head_pool
22 *
23 * NOTE:
24 * The __skb_ routines should be called with interrupts
25 * disabled, or you better be *real* sure that the operation is atomic
26 * with respect to whatever list is being frobbed (e.g. via lock_sock()
27 * or via disabling bottom half handlers, etc).
28 *
29 * This program is free software; you can redistribute it and/or
30 * modify it under the terms of the GNU General Public License
31 * as published by the Free Software Foundation; either version
32 * 2 of the License, or (at your option) any later version.
33 */
34
35/*
36 * The functions in this file will not compile correctly with gcc 2.4.x
37 */
38
e005d193
JP
39#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
40
1da177e4
LT
41#include <linux/module.h>
42#include <linux/types.h>
43#include <linux/kernel.h>
fe55f6d5 44#include <linux/kmemcheck.h>
1da177e4
LT
45#include <linux/mm.h>
46#include <linux/interrupt.h>
47#include <linux/in.h>
48#include <linux/inet.h>
49#include <linux/slab.h>
de960aa9
FW
50#include <linux/tcp.h>
51#include <linux/udp.h>
1da177e4
LT
52#include <linux/netdevice.h>
53#ifdef CONFIG_NET_CLS_ACT
54#include <net/pkt_sched.h>
55#endif
56#include <linux/string.h>
57#include <linux/skbuff.h>
9c55e01c 58#include <linux/splice.h>
1da177e4
LT
59#include <linux/cache.h>
60#include <linux/rtnetlink.h>
61#include <linux/init.h>
716ea3a7 62#include <linux/scatterlist.h>
ac45f602 63#include <linux/errqueue.h>
268bb0ce 64#include <linux/prefetch.h>
0d5501c1 65#include <linux/if_vlan.h>
1da177e4
LT
66
67#include <net/protocol.h>
68#include <net/dst.h>
69#include <net/sock.h>
70#include <net/checksum.h>
ed1f50c3 71#include <net/ip6_checksum.h>
1da177e4
LT
72#include <net/xfrm.h>
73
74#include <asm/uaccess.h>
ad8d75ff 75#include <trace/events/skb.h>
51c56b00 76#include <linux/highmem.h>
b245be1f
WB
77#include <linux/capability.h>
78#include <linux/user_namespace.h>
a1f8e7f7 79
d7e8883c 80struct kmem_cache *skbuff_head_cache __read_mostly;
e18b890b 81static struct kmem_cache *skbuff_fclone_cache __read_mostly;
1da177e4 82
1da177e4 83/**
f05de73b
JS
84 * skb_panic - private function for out-of-line support
85 * @skb: buffer
86 * @sz: size
87 * @addr: address
99d5851e 88 * @msg: skb_over_panic or skb_under_panic
1da177e4 89 *
f05de73b
JS
90 * Out-of-line support for skb_put() and skb_push().
91 * Called via the wrapper skb_over_panic() or skb_under_panic().
92 * Keep out of line to prevent kernel bloat.
93 * __builtin_return_address is not used because it is not always reliable.
1da177e4 94 */
f05de73b 95static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
99d5851e 96 const char msg[])
1da177e4 97{
e005d193 98 pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
99d5851e 99 msg, addr, skb->len, sz, skb->head, skb->data,
e005d193
JP
100 (unsigned long)skb->tail, (unsigned long)skb->end,
101 skb->dev ? skb->dev->name : "<NULL>");
1da177e4
LT
102 BUG();
103}
104
/*
 * Thin wrapper around skb_panic(); __func__ makes the emitted message
 * carry this function's name as its prefix.
 */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}
109
f05de73b
JS
/*
 * Thin wrapper around skb_panic(); __func__ makes the emitted message
 * carry this function's name as its prefix.
 */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}
c93bdd0e
MG
114
115/*
116 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
117 * the caller if emergency pfmemalloc reserves are being used. If it is and
118 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
119 * may be used. Otherwise, the packet data may be discarded until enough
120 * memory is free
121 */
122#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
123 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
61c5e88a 124
125static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
126 unsigned long ip, bool *pfmemalloc)
c93bdd0e
MG
127{
128 void *obj;
129 bool ret_pfmemalloc = false;
130
131 /*
132 * Try a regular allocation, when that fails and we're not entitled
133 * to the reserves, fail.
134 */
135 obj = kmalloc_node_track_caller(size,
136 flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
137 node);
138 if (obj || !(gfp_pfmemalloc_allowed(flags)))
139 goto out;
140
141 /* Try again but now we are using pfmemalloc reserves */
142 ret_pfmemalloc = true;
143 obj = kmalloc_node_track_caller(size, flags, node);
144
145out:
146 if (pfmemalloc)
147 *pfmemalloc = ret_pfmemalloc;
148
149 return obj;
150}
151
1da177e4
LT
152/* Allocate a new skbuff. We do this ourselves so we can fill in a few
153 * 'private' fields and also do memory statistics to find all the
154 * [BEEP] leaks.
155 *
156 */
157
0ebd0ac5
PM
158struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
159{
160 struct sk_buff *skb;
161
162 /* Get the HEAD */
163 skb = kmem_cache_alloc_node(skbuff_head_cache,
164 gfp_mask & ~__GFP_DMA, node);
165 if (!skb)
166 goto out;
167
168 /*
169 * Only clear those fields we need to clear, not those that we will
170 * actually initialise below. Hence, don't put any more fields after
171 * the tail pointer in struct sk_buff!
172 */
173 memset(skb, 0, offsetof(struct sk_buff, tail));
5e71d9d7 174 skb->head = NULL;
0ebd0ac5
PM
175 skb->truesize = sizeof(struct sk_buff);
176 atomic_set(&skb->users, 1);
177
35d04610 178 skb->mac_header = (typeof(skb->mac_header))~0U;
0ebd0ac5
PM
179out:
180 return skb;
181}
182
1da177e4 183/**
d179cd12 184 * __alloc_skb - allocate a network buffer
1da177e4
LT
185 * @size: size to allocate
186 * @gfp_mask: allocation mask
c93bdd0e
MG
187 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
188 * instead of head cache and allocate a cloned (child) skb.
189 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
190 * allocations in case the data is required for writeback
b30973f8 191 * @node: numa node to allocate memory on
1da177e4
LT
192 *
193 * Allocate a new &sk_buff. The returned buffer has no headroom and a
94b6042c
BH
194 * tail room of at least size bytes. The object has a reference count
195 * of one. The return is the buffer. On a failure the return is %NULL.
1da177e4
LT
196 *
197 * Buffers may only be allocated from interrupts using a @gfp_mask of
198 * %GFP_ATOMIC.
199 */
dd0fc66f 200struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
c93bdd0e 201 int flags, int node)
1da177e4 202{
e18b890b 203 struct kmem_cache *cache;
4947d3ef 204 struct skb_shared_info *shinfo;
1da177e4
LT
205 struct sk_buff *skb;
206 u8 *data;
c93bdd0e 207 bool pfmemalloc;
1da177e4 208
c93bdd0e
MG
209 cache = (flags & SKB_ALLOC_FCLONE)
210 ? skbuff_fclone_cache : skbuff_head_cache;
211
212 if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
213 gfp_mask |= __GFP_MEMALLOC;
8798b3fb 214
1da177e4 215 /* Get the HEAD */
b30973f8 216 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
1da177e4
LT
217 if (!skb)
218 goto out;
ec7d2f2c 219 prefetchw(skb);
1da177e4 220
87fb4b7b
ED
221 /* We do our best to align skb_shared_info on a separate cache
222 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
223 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
224 * Both skb->head and skb_shared_info are cache line aligned.
225 */
bc417e30 226 size = SKB_DATA_ALIGN(size);
87fb4b7b 227 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
c93bdd0e 228 data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
1da177e4
LT
229 if (!data)
230 goto nodata;
87fb4b7b
ED
231 /* kmalloc(size) might give us more room than requested.
232 * Put skb_shared_info exactly at the end of allocated zone,
233 * to allow max possible filling before reallocation.
234 */
235 size = SKB_WITH_OVERHEAD(ksize(data));
ec7d2f2c 236 prefetchw(data + size);
1da177e4 237
ca0605a7 238 /*
c8005785
JB
239 * Only clear those fields we need to clear, not those that we will
240 * actually initialise below. Hence, don't put any more fields after
241 * the tail pointer in struct sk_buff!
ca0605a7
ACM
242 */
243 memset(skb, 0, offsetof(struct sk_buff, tail));
87fb4b7b
ED
244 /* Account for allocated memory : skb + skb->head */
245 skb->truesize = SKB_TRUESIZE(size);
c93bdd0e 246 skb->pfmemalloc = pfmemalloc;
1da177e4
LT
247 atomic_set(&skb->users, 1);
248 skb->head = data;
249 skb->data = data;
27a884dc 250 skb_reset_tail_pointer(skb);
4305b541 251 skb->end = skb->tail + size;
35d04610
CW
252 skb->mac_header = (typeof(skb->mac_header))~0U;
253 skb->transport_header = (typeof(skb->transport_header))~0U;
19633e12 254
4947d3ef
BL
255 /* make sure we initialize shinfo sequentially */
256 shinfo = skb_shinfo(skb);
ec7d2f2c 257 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
4947d3ef 258 atomic_set(&shinfo->dataref, 1);
c2aa3665 259 kmemcheck_annotate_variable(shinfo->destructor_arg);
4947d3ef 260
c93bdd0e 261 if (flags & SKB_ALLOC_FCLONE) {
d0bf4a9e 262 struct sk_buff_fclones *fclones;
1da177e4 263
d0bf4a9e
ED
264 fclones = container_of(skb, struct sk_buff_fclones, skb1);
265
266 kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
d179cd12 267 skb->fclone = SKB_FCLONE_ORIG;
d0bf4a9e 268 atomic_set(&fclones->fclone_ref, 1);
d179cd12 269
6ffe75eb 270 fclones->skb2.fclone = SKB_FCLONE_CLONE;
d0bf4a9e 271 fclones->skb2.pfmemalloc = pfmemalloc;
d179cd12 272 }
1da177e4
LT
273out:
274 return skb;
275nodata:
8798b3fb 276 kmem_cache_free(cache, skb);
1da177e4
LT
277 skb = NULL;
278 goto out;
1da177e4 279}
b4ac530f 280EXPORT_SYMBOL(__alloc_skb);
1da177e4 281
b2b5ce9d 282/**
2ea2f62c 283 * __build_skb - build a network buffer
b2b5ce9d 284 * @data: data buffer provided by caller
2ea2f62c 285 * @frag_size: size of data, or 0 if head was kmalloced
b2b5ce9d
ED
286 *
287 * Allocate a new &sk_buff. Caller provides space holding head and
deceb4c0 288 * skb_shared_info. @data must have been allocated by kmalloc() only if
2ea2f62c
ED
289 * @frag_size is 0, otherwise data should come from the page allocator
290 * or vmalloc()
b2b5ce9d
ED
291 * The return is the new skb buffer.
292 * On a failure the return is %NULL, and @data is not freed.
293 * Notes :
294 * Before IO, driver allocates only data buffer where NIC put incoming frame
295 * Driver should add room at head (NET_SKB_PAD) and
296 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
297 * After IO, driver calls build_skb(), to allocate sk_buff and populate it
298 * before giving packet to stack.
299 * RX rings only contains data buffers, not full skbs.
300 */
2ea2f62c 301struct sk_buff *__build_skb(void *data, unsigned int frag_size)
b2b5ce9d
ED
302{
303 struct skb_shared_info *shinfo;
304 struct sk_buff *skb;
d3836f21 305 unsigned int size = frag_size ? : ksize(data);
b2b5ce9d
ED
306
307 skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
308 if (!skb)
309 return NULL;
310
d3836f21 311 size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
b2b5ce9d
ED
312
313 memset(skb, 0, offsetof(struct sk_buff, tail));
314 skb->truesize = SKB_TRUESIZE(size);
315 atomic_set(&skb->users, 1);
316 skb->head = data;
317 skb->data = data;
318 skb_reset_tail_pointer(skb);
319 skb->end = skb->tail + size;
35d04610
CW
320 skb->mac_header = (typeof(skb->mac_header))~0U;
321 skb->transport_header = (typeof(skb->transport_header))~0U;
b2b5ce9d
ED
322
323 /* make sure we initialize shinfo sequentially */
324 shinfo = skb_shinfo(skb);
325 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
326 atomic_set(&shinfo->dataref, 1);
327 kmemcheck_annotate_variable(shinfo->destructor_arg);
328
329 return skb;
330}
2ea2f62c
ED
331
332/* build_skb() is wrapper over __build_skb(), that specifically
333 * takes care of skb->head and skb->pfmemalloc
334 * This means that if @frag_size is not zero, then @data must be backed
335 * by a page fragment, not kmalloc() or vmalloc()
336 */
337struct sk_buff *build_skb(void *data, unsigned int frag_size)
338{
339 struct sk_buff *skb = __build_skb(data, frag_size);
340
341 if (skb && frag_size) {
342 skb->head_frag = 1;
343 if (virt_to_head_page(data)->pfmemalloc)
344 skb->pfmemalloc = 1;
345 }
346 return skb;
347}
b2b5ce9d
ED
348EXPORT_SYMBOL(build_skb);
349
a1c7fff7 350struct netdev_alloc_cache {
0e392508
AD
351 void * va;
352#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
353 __u16 offset;
354 __u16 size;
355#else
356 __u32 offset;
357#endif
69b08f62
ED
358 /* we maintain a pagecount bias, so that we dont dirty cache line
359 * containing page->_count every time we allocate a fragment.
360 */
361 unsigned int pagecnt_bias;
9451980a 362 bool pfmemalloc;
a1c7fff7
ED
363};
364static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
ffde7328 365static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
a1c7fff7 366
ffde7328
AD
367static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
368 gfp_t gfp_mask)
6f532612 369{
ffde7328
AD
370 struct page *page = NULL;
371 gfp_t gfp = gfp_mask;
372
0e392508
AD
373#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
374 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
375 __GFP_NOMEMALLOC;
376 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
377 NETDEV_FRAG_PAGE_MAX_ORDER);
378 nc->size = page ? NETDEV_FRAG_PAGE_MAX_SIZE : PAGE_SIZE;
379#endif
ffde7328
AD
380 if (unlikely(!page))
381 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
382
0e392508 383 nc->va = page ? page_address(page) : NULL;
ffde7328
AD
384
385 return page;
386}
387
9451980a 388static void *__alloc_page_frag(struct netdev_alloc_cache *nc,
ffde7328
AD
389 unsigned int fragsz, gfp_t gfp_mask)
390{
0e392508
AD
391 unsigned int size = PAGE_SIZE;
392 struct page *page;
ffde7328
AD
393 int offset;
394
0e392508 395 if (unlikely(!nc->va)) {
6f532612 396refill:
ffde7328
AD
397 page = __page_frag_refill(nc, gfp_mask);
398 if (!page)
399 return NULL;
400
0e392508
AD
401#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
402 /* if size can vary use size else just use PAGE_SIZE */
403 size = nc->size;
404#endif
4c450583
ED
405 /* Even if we own the page, we do not use atomic_set().
406 * This would break get_page_unless_zero() users.
407 */
ffde7328
AD
408 atomic_add(size - 1, &page->_count);
409
410 /* reset page count bias and offset to start of new frag */
9451980a 411 nc->pfmemalloc = page->pfmemalloc;
ffde7328 412 nc->pagecnt_bias = size;
0e392508 413 nc->offset = size;
6f532612 414 }
540eb7bf 415
0e392508 416 offset = nc->offset - fragsz;
ffde7328 417 if (unlikely(offset < 0)) {
0e392508
AD
418 page = virt_to_page(nc->va);
419
ffde7328
AD
420 if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
421 goto refill;
422
0e392508
AD
423#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
424 /* if size can vary use size else just use PAGE_SIZE */
425 size = nc->size;
426#endif
ffde7328
AD
427 /* OK, page count is 0, we can safely set it */
428 atomic_set(&page->_count, size);
429
430 /* reset page count bias and offset to start of new frag */
431 nc->pagecnt_bias = size;
432 offset = size - fragsz;
6f532612 433 }
540eb7bf 434
540eb7bf 435 nc->pagecnt_bias--;
0e392508 436 nc->offset = offset;
ffde7328 437
0e392508 438 return nc->va + offset;
ffde7328
AD
439}
440
441static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
442{
9451980a 443 struct netdev_alloc_cache *nc;
ffde7328
AD
444 unsigned long flags;
445 void *data;
446
447 local_irq_save(flags);
9451980a
AD
448 nc = this_cpu_ptr(&netdev_alloc_cache);
449 data = __alloc_page_frag(nc, fragsz, gfp_mask);
6f532612
ED
450 local_irq_restore(flags);
451 return data;
452}
c93bdd0e
MG
453
454/**
455 * netdev_alloc_frag - allocate a page fragment
456 * @fragsz: fragment size
457 *
458 * Allocates a frag from a page for receive buffer.
459 * Uses GFP_ATOMIC allocations.
460 */
461void *netdev_alloc_frag(unsigned int fragsz)
462{
463 return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
464}
6f532612
ED
465EXPORT_SYMBOL(netdev_alloc_frag);
466
ffde7328
AD
467static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
468{
9451980a
AD
469 struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
470
471 return __alloc_page_frag(nc, fragsz, gfp_mask);
ffde7328
AD
472}
473
474void *napi_alloc_frag(unsigned int fragsz)
475{
476 return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
477}
478EXPORT_SYMBOL(napi_alloc_frag);
479
fd11a83d
AD
480/**
481 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
482 * @dev: network device to receive on
483 * @length: length to allocate
484 * @gfp_mask: get_free_pages mask, passed to alloc_skb
485 *
486 * Allocate a new &sk_buff and assign it a usage count of one. The
487 * buffer has NET_SKB_PAD headroom built in. Users should allocate
488 * the headroom they think they need without accounting for the
489 * built in space. The built in space is used for optimisations.
490 *
491 * %NULL is returned if there is no free memory.
492 */
9451980a
AD
493struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
494 gfp_t gfp_mask)
fd11a83d 495{
9451980a
AD
496 struct netdev_alloc_cache *nc;
497 unsigned long flags;
fd11a83d 498 struct sk_buff *skb;
9451980a
AD
499 bool pfmemalloc;
500 void *data;
501
502 len += NET_SKB_PAD;
fd11a83d 503
9451980a
AD
504 if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
505 (gfp_mask & (__GFP_WAIT | GFP_DMA)))
506 return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
fd11a83d 507
9451980a
AD
508 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
509 len = SKB_DATA_ALIGN(len);
510
511 if (sk_memalloc_socks())
512 gfp_mask |= __GFP_MEMALLOC;
513
514 local_irq_save(flags);
515
516 nc = this_cpu_ptr(&netdev_alloc_cache);
517 data = __alloc_page_frag(nc, len, gfp_mask);
518 pfmemalloc = nc->pfmemalloc;
519
520 local_irq_restore(flags);
521
522 if (unlikely(!data))
523 return NULL;
524
525 skb = __build_skb(data, len);
526 if (unlikely(!skb)) {
527 put_page(virt_to_head_page(data));
528 return NULL;
7b2e497a 529 }
fd11a83d 530
9451980a
AD
531 /* use OR instead of assignment to avoid clearing of bits in mask */
532 if (pfmemalloc)
533 skb->pfmemalloc = 1;
534 skb->head_frag = 1;
535
536 skb_reserve(skb, NET_SKB_PAD);
537 skb->dev = dev;
538
8af27456
CH
539 return skb;
540}
b4ac530f 541EXPORT_SYMBOL(__netdev_alloc_skb);
1da177e4 542
fd11a83d
AD
543/**
544 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
545 * @napi: napi instance this buffer was allocated for
546 * @length: length to allocate
547 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
548 *
549 * Allocate a new sk_buff for use in NAPI receive. This buffer will
550 * attempt to allocate the head from a special reserved region used
551 * only for NAPI Rx allocation. By doing this we can save several
552 * CPU cycles by avoiding having to disable and re-enable IRQs.
553 *
554 * %NULL is returned if there is no free memory.
555 */
9451980a
AD
556struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
557 gfp_t gfp_mask)
fd11a83d 558{
9451980a 559 struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
fd11a83d 560 struct sk_buff *skb;
9451980a
AD
561 void *data;
562
563 len += NET_SKB_PAD + NET_IP_ALIGN;
fd11a83d 564
9451980a
AD
565 if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
566 (gfp_mask & (__GFP_WAIT | GFP_DMA)))
567 return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
568
569 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
570 len = SKB_DATA_ALIGN(len);
571
572 if (sk_memalloc_socks())
573 gfp_mask |= __GFP_MEMALLOC;
fd11a83d 574
9451980a
AD
575 data = __alloc_page_frag(nc, len, gfp_mask);
576 if (unlikely(!data))
577 return NULL;
578
579 skb = __build_skb(data, len);
580 if (unlikely(!skb)) {
581 put_page(virt_to_head_page(data));
582 return NULL;
fd11a83d
AD
583 }
584
9451980a
AD
585 /* use OR instead of assignment to avoid clearing of bits in mask */
586 if (nc->pfmemalloc)
587 skb->pfmemalloc = 1;
588 skb->head_frag = 1;
589
590 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
591 skb->dev = napi->dev;
592
fd11a83d
AD
593 return skb;
594}
595EXPORT_SYMBOL(__napi_alloc_skb);
596
654bed16 597void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
50269e19 598 int size, unsigned int truesize)
654bed16
PZ
599{
600 skb_fill_page_desc(skb, i, page, off, size);
601 skb->len += size;
602 skb->data_len += size;
50269e19 603 skb->truesize += truesize;
654bed16
PZ
604}
605EXPORT_SYMBOL(skb_add_rx_frag);
606
f8e617e1
JW
607void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
608 unsigned int truesize)
609{
610 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
611
612 skb_frag_size_add(frag, size);
613 skb->len += size;
614 skb->data_len += size;
615 skb->truesize += truesize;
616}
617EXPORT_SYMBOL(skb_coalesce_rx_frag);
618
27b437c8 619static void skb_drop_list(struct sk_buff **listp)
1da177e4 620{
bd8a7036 621 kfree_skb_list(*listp);
27b437c8 622 *listp = NULL;
1da177e4
LT
623}
624
27b437c8
HX
625static inline void skb_drop_fraglist(struct sk_buff *skb)
626{
627 skb_drop_list(&skb_shinfo(skb)->frag_list);
628}
629
1da177e4
LT
630static void skb_clone_fraglist(struct sk_buff *skb)
631{
632 struct sk_buff *list;
633
fbb398a8 634 skb_walk_frags(skb, list)
1da177e4
LT
635 skb_get(list);
636}
637
d3836f21
ED
638static void skb_free_head(struct sk_buff *skb)
639{
640 if (skb->head_frag)
641 put_page(virt_to_head_page(skb->head));
642 else
643 kfree(skb->head);
644}
645
5bba1712 646static void skb_release_data(struct sk_buff *skb)
1da177e4 647{
ff04a771
ED
648 struct skb_shared_info *shinfo = skb_shinfo(skb);
649 int i;
1da177e4 650
ff04a771
ED
651 if (skb->cloned &&
652 atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
653 &shinfo->dataref))
654 return;
a6686f2f 655
ff04a771
ED
656 for (i = 0; i < shinfo->nr_frags; i++)
657 __skb_frag_unref(&shinfo->frags[i]);
a6686f2f 658
ff04a771
ED
659 /*
660 * If skb buf is from userspace, we need to notify the caller
661 * the lower device DMA has done;
662 */
663 if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
664 struct ubuf_info *uarg;
1da177e4 665
ff04a771
ED
666 uarg = shinfo->destructor_arg;
667 if (uarg->callback)
668 uarg->callback(uarg, true);
1da177e4 669 }
ff04a771
ED
670
671 if (shinfo->frag_list)
672 kfree_skb_list(shinfo->frag_list);
673
674 skb_free_head(skb);
1da177e4
LT
675}
676
677/*
678 * Free an skbuff by memory without cleaning the state.
679 */
2d4baff8 680static void kfree_skbmem(struct sk_buff *skb)
1da177e4 681{
d0bf4a9e 682 struct sk_buff_fclones *fclones;
d179cd12 683
d179cd12
DM
684 switch (skb->fclone) {
685 case SKB_FCLONE_UNAVAILABLE:
686 kmem_cache_free(skbuff_head_cache, skb);
6ffe75eb 687 return;
d179cd12
DM
688
689 case SKB_FCLONE_ORIG:
d0bf4a9e 690 fclones = container_of(skb, struct sk_buff_fclones, skb1);
d179cd12 691
6ffe75eb
ED
692 /* We usually free the clone (TX completion) before original skb
693 * This test would have no chance to be true for the clone,
694 * while here, branch prediction will be good.
d179cd12 695 */
6ffe75eb
ED
696 if (atomic_read(&fclones->fclone_ref) == 1)
697 goto fastpath;
698 break;
e7820e39 699
6ffe75eb
ED
700 default: /* SKB_FCLONE_CLONE */
701 fclones = container_of(skb, struct sk_buff_fclones, skb2);
d179cd12 702 break;
3ff50b79 703 }
6ffe75eb
ED
704 if (!atomic_dec_and_test(&fclones->fclone_ref))
705 return;
706fastpath:
707 kmem_cache_free(skbuff_fclone_cache, fclones);
1da177e4
LT
708}
709
04a4bb55 710static void skb_release_head_state(struct sk_buff *skb)
1da177e4 711{
adf30907 712 skb_dst_drop(skb);
1da177e4
LT
713#ifdef CONFIG_XFRM
714 secpath_put(skb->sp);
715#endif
9c2b3328
SH
716 if (skb->destructor) {
717 WARN_ON(in_irq());
1da177e4
LT
718 skb->destructor(skb);
719 }
a3bf7ae9 720#if IS_ENABLED(CONFIG_NF_CONNTRACK)
5f79e0f9 721 nf_conntrack_put(skb->nfct);
2fc72c7b 722#endif
1109a90c 723#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
1da177e4
LT
724 nf_bridge_put(skb->nf_bridge);
725#endif
04a4bb55
LB
726}
727
728/* Free everything but the sk_buff shell. */
729static void skb_release_all(struct sk_buff *skb)
730{
731 skb_release_head_state(skb);
5e71d9d7 732 if (likely(skb->head))
0ebd0ac5 733 skb_release_data(skb);
2d4baff8
HX
734}
735
736/**
737 * __kfree_skb - private function
738 * @skb: buffer
739 *
740 * Free an sk_buff. Release anything attached to the buffer.
741 * Clean the state. This is an internal helper function. Users should
742 * always call kfree_skb
743 */
1da177e4 744
2d4baff8
HX
void __kfree_skb(struct sk_buff *skb)
{
	/* First everything attached to the skb, then the skb memory itself. */
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
1da177e4 751