From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: netvm: hook skb allocation to reserves
References: FATE#303834
Change the skb allocation API to indicate RX usage and use this to fall back to
the reserve when needed. SKBs allocated from the reserve are tagged in
skb->emergency.
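
In condensed form, the __alloc_skb() change below behaves like this sketch
(kmalloc_reserve() and net_skb_reserve come from the earlier reserve patches
in this series; error unwinding and the fclone path are omitted here):

	struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
				    int flags, int node)
	{
		int emergency = 0;
		struct sk_buff *skb;
		u8 *data;

		/* RX allocations may dip into the memalloc reserves */
		if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
			gfp_mask |= __GFP_MEMALLOC;

		skb = kmem_cache_alloc_node(skbuff_head_cache,
					    gfp_mask & ~__GFP_DMA, node);
		/* sets emergency when the data came from the reserve */
		data = kmalloc_reserve(SKB_DATA_ALIGN(size) +
				       sizeof(struct skb_shared_info),
				       gfp_mask, node,
				       &net_skb_reserve, &emergency);

		memset(skb, 0, offsetof(struct sk_buff, tail));
		skb->emergency = emergency;	/* tag reserve-backed skbs */
		/* ... regular skb and shinfo initialization follows ... */
		return skb;
	}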
Teach all other skb ops about emergency skbs and the reserve accounting.
Use the (new) packet split API to allocate and track fragment pages from the
emergency reserve. Do this using an atomic counter in page->frag_count. This is
needed because the fragments have a different sharing semantic than that
indicated by skb_shinfo()->dataref.
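
The resulting get/put discipline is small enough to show in full; these are
the helpers the net/core/skbuff.c hunks below add (mem_reserve_pages_charge()
is the reserve accounting helper from earlier in the series):

	/* take a fragment reference; emergency skbs also count it */
	static void skb_get_page(struct sk_buff *skb, struct page *page)
	{
		get_page(page);
		if (skb_emergency(skb))
			atomic_inc(&page->frag_count);
	}

	/* drop a fragment reference; the last emergency user gives the
	 * page's worth of reserve back */
	static void skb_put_page(struct sk_buff *skb, struct page *page)
	{
		if (skb_emergency(skb) && atomic_dec_and_test(&page->frag_count))
			mem_reserve_pages_charge(&net_skb_reserve, -1);
		put_page(page);
	}

skb_shinfo()->dataref tracks sharing of the skb's data area as a whole, while
individual fragment pages can additionally be shared across skbs (pskb_copy(),
skb_split(), skb_segment()), hence the separate per-page counter.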
Note that the decision to distinguish between regular and emergency SKBs allows
the accounting overhead to be limited to the latter kind.
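
Concretely, every accounting site first tests skb_emergency(), so a regular
skb only pays for one predicted-unlikely branch. The copy paths use the small
helper added below to propagate reserve access to the new skb:

	static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
	{
		if (skb_emergency(skb))
			return SKB_ALLOC_RX;	/* copies may use the reserve too */
		return 0;
	}

	/* e.g. skb_copy() allocates the new buffer with: */
	n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);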
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 include/linux/mm_types.h |    1 +
 include/linux/skbuff.h   |   27 +++++++--
 net/core/skbuff.c        |  135 +++++++++++++++++++++++++++++++++++++----------
 3 files changed, 132 insertions(+), 31 deletions(-)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -75,6 +75,7 @@ struct page {
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* SLUB: freelist req. slab lock */
 		int reserve;		/* page_alloc: page is a reserve page */
+		atomic_t frag_count;	/* skb fragment use count */
 	};
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -320,7 +320,10 @@ struct sk_buff {
 #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
 	__u8			do_not_encrypt:1;
 #endif
-	/* 0/13/14 bit hole */
+#ifdef CONFIG_NETVM
+	__u8			emergency:1;
+#endif
+	/* 12-16 bit hole */

 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
@@ -353,10 +356,22 @@ struct sk_buff {

 #include <asm/system.h>

+#define SKB_ALLOC_FCLONE	0x01
+#define SKB_ALLOC_RX		0x02
+
+static inline bool skb_emergency(const struct sk_buff *skb)
+{
+#ifdef CONFIG_NETVM
+	return unlikely(skb->emergency);
+#else
+	return false;
+#endif
+}
+
 extern void kfree_skb(struct sk_buff *skb);
 extern void __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone, int node);
+				   gfp_t priority, int flags, int node);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
@@ -366,7 +381,7 @@ static inline struct sk_buff *alloc_skb(
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1, -1);
+	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, -1);
 }

 extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
@@ -1216,7 +1231,8 @@ static inline void __skb_queue_purge(str
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
 					      gfp_t gfp_mask)
 {
-	struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+	struct sk_buff *skb =
+		__alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, -1);
 	if (likely(skb))
 		skb_reserve(skb, NET_SKB_PAD);
@@ -1247,6 +1263,7 @@ static inline struct sk_buff *netdev_all
 }

 extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
+extern void __netdev_free_page(struct net_device *dev, struct page *page);

 /**
  * netdev_alloc_page - allocate a page for ps-rx on a specific device
@@ -1263,7 +1280,7 @@ static inline struct page *netdev_alloc_
 static inline void netdev_free_page(struct net_device *dev, struct page *page)
 {
-	__free_page(page);
+	__netdev_free_page(dev, page);
 }
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -173,23 +173,29 @@ EXPORT_SYMBOL(skb_truesize_bug);
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int flags, int node)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	int emergency = 0;
+	int memalloc = sk_memalloc_socks();

-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	size = SKB_DATA_ALIGN(size);
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
+
+	if (memalloc && (flags & SKB_ALLOC_RX))
+		gfp_mask |= __GFP_MEMALLOC;

 	/* Get the HEAD */
 	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 	if (!skb)
 		goto out;

-	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask, node);
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+			gfp_mask, node, &net_skb_reserve, &emergency);
 	if (!data)
 		goto nodata;
@@ -199,6 +205,9 @@ struct sk_buff *__alloc_skb(unsigned int
 	 * the tail pointer in struct sk_buff!
 	 */
 	memset(skb, 0, offsetof(struct sk_buff, tail));
+#ifdef CONFIG_NETVM
+	skb->emergency = emergency;
+#endif
 	skb->truesize = size + sizeof(struct sk_buff);
 	atomic_set(&skb->users, 1);
@@ -215,7 +224,7 @@ struct sk_buff *__alloc_skb(unsigned int
 	shinfo->ip6_frag_id = 0;
 	shinfo->frag_list = NULL;

-	if (fclone) {
+	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -223,6 +232,9 @@ struct sk_buff *__alloc_skb(unsigned int
 		atomic_set(fclone_ref, 1);

 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+#ifdef CONFIG_NETVM
+		child->emergency = skb->emergency;
+#endif
 	}
@@ -251,7 +263,7 @@ struct sk_buff *__netdev_alloc_skb(struc
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct sk_buff *skb;

-	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, node);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
@@ -264,11 +276,19 @@ struct page *__netdev_alloc_page(struct
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct page *page;

-	page = alloc_pages_node(node, gfp_mask, 0);
+	page = alloc_pages_reserve(node, gfp_mask | __GFP_MEMALLOC, 0,
+			&net_skb_reserve, NULL);
 	return page;
 }
 EXPORT_SYMBOL(__netdev_alloc_page);

+void __netdev_free_page(struct net_device *dev, struct page *page)
+{
+	free_pages_reserve(page, 0, &net_skb_reserve, page->reserve);
+}
+EXPORT_SYMBOL(__netdev_free_page);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		     int size)
@@ -276,6 +296,27 @@ void skb_add_rx_frag(struct sk_buff *skb
 	skb->len += size;
 	skb->data_len += size;
 	skb->truesize += size;
+
+	/*
+	 * In the rare case that skb_emergency() != page->reserve we'll
+	 * skew the accounting slightly, but since it's only a 'small'
+	 * constant shift it's all right.
+	 */
+	if (skb_emergency(skb)) {
+		/*
+		 * We need to track fragment pages so that we properly
+		 * release their reserve in skb_put_page().
+		 */
+		atomic_set(&page->frag_count, 1);
+	} else if (unlikely(page->reserve)) {
+		/*
+		 * Release the reserve now, because normal skbs don't
+		 * do the emergency accounting.
+		 */
+		mem_reserve_pages_charge(&net_skb_reserve, -1);
+	}
 }
 EXPORT_SYMBOL(skb_add_rx_frag);
@@ -327,21 +368,38 @@ static void skb_clone_fraglist(struct sk
 		skb_get(list);
 }

+static void skb_get_page(struct sk_buff *skb, struct page *page)
+{
+	get_page(page);
+	if (skb_emergency(skb))
+		atomic_inc(&page->frag_count);
+}
+
+static void skb_put_page(struct sk_buff *skb, struct page *page)
+{
+	if (skb_emergency(skb) && atomic_dec_and_test(&page->frag_count))
+		mem_reserve_pages_charge(&net_skb_reserve, -1);
+	put_page(page);
+}
+
 static void skb_release_data(struct sk_buff *skb)
 {
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			       &skb_shinfo(skb)->dataref)) {
 		if (skb_shinfo(skb)->nr_frags) {
 			int i;
-			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-				put_page(skb_shinfo(skb)->frags[i].page);
+			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+				skb_put_page(skb,
+					     skb_shinfo(skb)->frags[i].page);
+			}
 		}

 		if (skb_shinfo(skb)->frag_list)
 			skb_drop_fraglist(skb);

-		kfree(skb->head);
+		kfree_reserve(skb->head, &net_skb_reserve, skb_emergency(skb));
 	}
 }
@@ -462,6 +520,9 @@ static void __copy_skb_header(struct sk_
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property = old->ipvs_property;
 #endif
+#ifdef CONFIG_NETVM
+	new->emergency = old->emergency;
+#endif
 	new->protocol = old->protocol;
 	new->mark = old->mark;
@@ -555,6 +616,9 @@ struct sk_buff *skb_clone(struct sk_buff
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		if (skb_emergency(skb))
+			gfp_mask |= __GFP_MEMALLOC;
+
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
@@ -586,6 +650,14 @@ static void copy_skb_header(struct sk_bu
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }

+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+	if (skb_emergency(skb))
+		return SKB_ALLOC_RX;
+	return 0;
+}
+
 /**
  * skb_copy - create private copy of an sk_buff
  * @skb: buffer to copy
@@ -606,15 +678,17 @@ static void copy_skb_header(struct sk_bu
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb->data - skb->head;
+	unsigned int size;
 	/*
 	 *	Allocate the copy buffer
 	 */
 	struct sk_buff *n;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end + skb->data_len, gfp_mask);
+	size = skb->end + skb->data_len;
 #else
-	n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
+	size = skb->end - skb->head + skb->data_len;
 #endif
+	n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
 	if (!n)
 		return NULL;
@@ -649,12 +723,14 @@ struct sk_buff *pskb_copy(struct sk_buff
 	/*
 	 *	Allocate the copy buffer
 	 */
 	struct sk_buff *n;
+	unsigned int size;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end, gfp_mask);
+	size = skb->end;
 #else
-	n = alloc_skb(skb->end - skb->head, gfp_mask);
+	size = skb->end - skb->head;
 #endif
+	n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
 	if (!n)
 		goto out;
@@ -673,8 +749,9 @@ struct sk_buff *pskb_copy(struct sk_buff
 	if (skb_shinfo(skb)->nr_frags) {
 		int i;
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
-			get_page(skb_shinfo(n)->frags[i].page);
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			skb_shinfo(n)->frags[i] = *frag;
+			skb_get_page(n, frag->page);
 		}
 		skb_shinfo(n)->nr_frags = i;
 	}
@@ -722,7 +799,11 @@ int pskb_expand_head(struct sk_buff *skb
 	size = SKB_DATA_ALIGN(size);

-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	if (skb_emergency(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+			       gfp_mask, -1, &net_skb_reserve, NULL);
 	if (!data)
 		goto nodata;
@@ -737,7 +818,7 @@ int pskb_expand_head(struct sk_buff *skb
 	       sizeof(struct skb_shared_info));

 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-		get_page(skb_shinfo(skb)->frags[i].page);
+		skb_get_page(skb, skb_shinfo(skb)->frags[i].page);

 	if (skb_shinfo(skb)->frag_list)
 		skb_clone_fraglist(skb);
@@ -816,8 +897,8 @@ struct sk_buff *skb_copy_expand(const st
 	/*
 	 *	Allocate the copy buffer
 	 */
-	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
-				      gfp_mask);
+	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+					gfp_mask, skb_alloc_rx_flag(skb), -1);
 	int oldheadroom = skb_headroom(skb);
 	int head_copy_len, head_copy_off;
@@ -1006,7 +1087,7 @@ drop_pages:
 		skb_shinfo(skb)->nr_frags = i;

 		for (; i < nfrags; i++)
-			put_page(skb_shinfo(skb)->frags[i].page);
+			skb_put_page(skb, skb_shinfo(skb)->frags[i].page);

 		if (skb_shinfo(skb)->frag_list)
 			skb_drop_fraglist(skb);
@@ -1175,7 +1256,7 @@ pull_pages:
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		if (skb_shinfo(skb)->frags[i].size <= eat) {
-			put_page(skb_shinfo(skb)->frags[i].page);
+			skb_put_page(skb, skb_shinfo(skb)->frags[i].page);
 			eat -= skb_shinfo(skb)->frags[i].size;
 		} else {
 			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
@@ -1925,6 +2006,7 @@ static inline void skb_split_no_header(s
 			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

 			if (pos < len) {
+				struct page *page = skb_shinfo(skb)->frags[i].page;
 				/* Split frag.
 				 * We have two variants in this case:
 				 * 1. Move all the frag to the second
@@ -1933,7 +2015,7 @@ static inline void skb_split_no_header(s
 				 *    where splitting is expensive.
 				 * 2. Split is accurately. We make this.
 				 */
-				get_page(skb_shinfo(skb)->frags[i].page);
+				skb_get_page(skb1, page);
 				skb_shinfo(skb1)->frags[0].page_offset += len - pos;
 				skb_shinfo(skb1)->frags[0].size -= len - pos;
 				skb_shinfo(skb)->frags[i].size = len - pos;
@@ -2264,7 +2346,8 @@ struct sk_buff *skb_segment(struct sk_bu
 		if (hsize > len || !sg)
 			hsize = len;

-		nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
+		nskb = __alloc_skb(hsize + doffset + headroom, GFP_ATOMIC,
+				   skb_alloc_rx_flag(skb), -1);
 		if (unlikely(!nskb))
 			goto err;
@@ -2302,7 +2385,7 @@ struct sk_buff *skb_segment(struct sk_bu
 			BUG_ON(i >= nfrags);

 			*frag = skb_shinfo(skb)->frags[i];
-			get_page(frag->page);
+			skb_get_page(nskb, frag->page);
 			size = frag->size;