From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: netvm: hook skb allocation to reserves
Patch-mainline: No
References: FATE#303834

Change the skb allocation API to indicate RX usage and use this to fall back
to the reserve when needed. SKBs allocated from the reserve are tagged in
skb->emergency.

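As an illustration, a driver RX path picks the fallback up transparently.
A minimal sketch, assuming this patch's API; the rx_refill() and RX_BUF_LEN
names are made up:

	/* Hypothetical driver RX refill, not part of this patch. */
	static struct sk_buff *rx_refill(struct net_device *dev)
	{
		/*
		 * __netdev_alloc_skb() now passes SKB_ALLOC_RX internally;
		 * under memory pressure the data may come from the
		 * emergency reserve, in which case skb->emergency is set
		 * and skb_emergency(skb) returns true.
		 */
		return __netdev_alloc_skb(dev, RX_BUF_LEN, GFP_ATOMIC);
	}
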
Teach all other skb ops about emergency skbs and the reserve accounting.

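For example, a receive-path consumer can key off the tag; a hypothetical
check (sk_has_memalloc() stands in for whatever predicate the protocol
layer uses, it is not defined by this patch) might read:

	/* Hypothetical: refuse reserve-backed packets on sockets that
	 * are not entitled to the emergency reserve. */
	if (skb_emergency(skb) && !sk_has_memalloc(sk))
		goto drop;
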
Use the (new) packet split API to allocate and track fragment pages from the
emergency reserve. Do this using an atomic counter, page->frag_count, which
overlays page->index. This is needed because the fragments have different
sharing semantics from those indicated by skb_shinfo()->dataref.

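The intended counter lifetime, as a worked example (skb and nskb are any two
emergency skbs that come to share one fragment page, e.g. via pskb_copy()):

	skb_add_rx_frag(skb, 0, page, 0, size);	/* page->frag_count = 1 */
	skb_get_page(nskb, page);		/* page->frag_count = 2 */
	skb_put_page(skb, page);		/* page->frag_count = 1 */
	skb_put_page(nskb, page);		/* 0: uncharge reserve, free */

shinfo->dataref cannot express this, because the last reference to a
fragment page may be dropped by a different skb than the one that charged
the reserve for it.
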
Note that the decision to distinguish between regular and emergency SKBs
allows the accounting overhead to be limited to the latter kind.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Suresh Jayaraman <sjayaraman@suse.de>

---
 include/linux/mm_types.h |    1
 include/linux/skbuff.h   |   27 +++++++--
 net/core/skbuff.c        |  135 +++++++++++++++++++++++++++++++++++++----------
 3 files changed, 132 insertions(+), 31 deletions(-)

--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -75,6 +75,7 @@ struct page {
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* SLUB: freelist req. slab lock */
 		int reserve;		/* page_alloc: page is a reserve page */
+		atomic_t frag_count;	/* skb fragment use count */
 	};
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -320,7 +320,10 @@ struct sk_buff {
 #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
 	__u8			do_not_encrypt:1;
 #endif
-	/* 0/13/14 bit hole */
+#ifdef CONFIG_NETVM
+	__u8			emergency:1;
+#endif
+	/* 12-16 bit hole */

 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
@@ -353,10 +356,22 @@ struct sk_buff {

 #include <asm/system.h>

+#define SKB_ALLOC_FCLONE	0x01
+#define SKB_ALLOC_RX		0x02
+
+static inline bool skb_emergency(const struct sk_buff *skb)
+{
+#ifdef CONFIG_NETVM
+	return unlikely(skb->emergency);
+#else
+	return false;
+#endif
+}
+
 extern void kfree_skb(struct sk_buff *skb);
 extern void __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone, int node);
+				   gfp_t priority, int flags, int node);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
@@ -366,7 +381,7 @@ static inline struct sk_buff *alloc_skb(
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1, -1);
+	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, -1);
 }

 extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
@@ -1216,7 +1231,8 @@ static inline void __skb_queue_purge(str
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
 					      gfp_t gfp_mask)
 {
-	struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+	struct sk_buff *skb =
+		__alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, -1);
 	if (likely(skb))
 		skb_reserve(skb, NET_SKB_PAD);
 	return skb;
@@ -1247,6 +1263,7 @@ static inline struct sk_buff *netdev_all
 }

 extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
+extern void __netdev_free_page(struct net_device *dev, struct page *page);

 /**
  *	netdev_alloc_page - allocate a page for ps-rx on a specific device
@@ -1263,7 +1280,7 @@ static inline struct page *netdev_alloc_

 static inline void netdev_free_page(struct net_device *dev, struct page *page)
 {
-	__free_page(page);
+	__netdev_free_page(dev, page);
 }

 /**
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -173,23 +173,29 @@ EXPORT_SYMBOL(skb_truesize_bug);
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int flags, int node)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	int emergency = 0;
+	int memalloc = sk_memalloc_socks();

-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	size = SKB_DATA_ALIGN(size);
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
+
+	if (memalloc && (flags & SKB_ALLOC_RX))
+		gfp_mask |= __GFP_MEMALLOC;

 	/* Get the HEAD */
 	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 	if (!skb)
 		goto out;

-	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask, node);
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+			gfp_mask, node, &net_skb_reserve, &emergency);
 	if (!data)
 		goto nodata;

@@ -199,6 +205,9 @@ struct sk_buff *__alloc_skb(unsigned int
 	 * the tail pointer in struct sk_buff!
 	 */
 	memset(skb, 0, offsetof(struct sk_buff, tail));
+#ifdef CONFIG_NETVM
+	skb->emergency = emergency;
+#endif
 	skb->truesize = size + sizeof(struct sk_buff);
 	atomic_set(&skb->users, 1);
 	skb->head = data;
@@ -215,7 +224,7 @@ struct sk_buff *__alloc_skb(unsigned int
 	shinfo->ip6_frag_id = 0;
 	shinfo->frag_list = NULL;

-	if (fclone) {
+	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);

@@ -223,6 +232,9 @@ struct sk_buff *__alloc_skb(unsigned int
 		atomic_set(fclone_ref, 1);

 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+#ifdef CONFIG_NETVM
+		child->emergency = skb->emergency;
+#endif
 	}
 out:
 	return skb;
@@ -251,7 +263,7 @@ struct sk_buff *__netdev_alloc_skb(struc
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct sk_buff *skb;

-	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, node);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
@@ -264,11 +276,19 @@ struct page *__netdev_alloc_page(struct
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct page *page;

-	page = alloc_pages_node(node, gfp_mask, 0);
+	page = alloc_pages_reserve(node, gfp_mask | __GFP_MEMALLOC, 0,
+			&net_skb_reserve, NULL);
+
 	return page;
 }
 EXPORT_SYMBOL(__netdev_alloc_page);

+void __netdev_free_page(struct net_device *dev, struct page *page)
+{
+	free_pages_reserve(page, 0, &net_skb_reserve, page->reserve);
+}
+EXPORT_SYMBOL(__netdev_free_page);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		     int size)
 {
@@ -276,6 +296,27 @@ void skb_add_rx_frag(struct sk_buff *skb
 	skb->len += size;
 	skb->data_len += size;
 	skb->truesize += size;
+
+#ifdef CONFIG_NETVM
+	/*
+	 * In the rare case that skb_emergency() != page->reserved we'll
+	 * skew the accounting slightly, but since its only a 'small' constant
+	 * shift its ok.
+	 */
+	if (skb_emergency(skb)) {
+		/*
+		 * We need to track fragment pages so that we properly
+		 * release their reserve in skb_put_page().
+		 */
+		atomic_set(&page->frag_count, 1);
+	} else if (unlikely(page->reserve)) {
+		/*
+		 * Release the reserve now, because normal skbs don't
+		 * do the emergency accounting.
+		 */
+		mem_reserve_pages_charge(&net_skb_reserve, -1);
+	}
+#endif
 }
 EXPORT_SYMBOL(skb_add_rx_frag);

@@ -327,21 +368,38 @@ static void skb_clone_fraglist(struct sk
 		skb_get(list);
 }

+static void skb_get_page(struct sk_buff *skb, struct page *page)
+{
+	get_page(page);
+	if (skb_emergency(skb))
+		atomic_inc(&page->frag_count);
+}
+
+static void skb_put_page(struct sk_buff *skb, struct page *page)
+{
+	if (skb_emergency(skb) && atomic_dec_and_test(&page->frag_count))
+		mem_reserve_pages_charge(&net_skb_reserve, -1);
+	put_page(page);
+}
+
 static void skb_release_data(struct sk_buff *skb)
 {
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 			       &skb_shinfo(skb)->dataref)) {
+
 		if (skb_shinfo(skb)->nr_frags) {
 			int i;
-			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-				put_page(skb_shinfo(skb)->frags[i].page);
+			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+				skb_put_page(skb,
+					     skb_shinfo(skb)->frags[i].page);
+			}
 		}

 		if (skb_shinfo(skb)->frag_list)
 			skb_drop_fraglist(skb);

-		kfree(skb->head);
+		kfree_reserve(skb->head, &net_skb_reserve, skb_emergency(skb));
 	}
 }

@@ -462,6 +520,9 @@ static void __copy_skb_header(struct sk_
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property	= old->ipvs_property;
 #endif
+#ifdef CONFIG_NETVM
+	new->emergency		= old->emergency;
+#endif
 	new->protocol		= old->protocol;
 	new->mark		= old->mark;
 	__nf_copy(new, old);
@@ -555,6 +616,9 @@ struct sk_buff *skb_clone(struct sk_buff
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		if (skb_emergency(skb))
+			gfp_mask |= __GFP_MEMALLOC;
+
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
@@ -586,6 +650,14 @@ static void copy_skb_header(struct sk_bu
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }

+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+	if (skb_emergency(skb))
+		return SKB_ALLOC_RX;
+
+	return 0;
+}
+
 /**
  *	skb_copy - create private copy of an sk_buff
  *	@skb: buffer to copy
@@ -606,15 +678,17 @@ static void copy_skb_header(struct sk_bu
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb->data - skb->head;
+	int size;
 	/*
 	 *	Allocate the copy buffer
 	 */
 	struct sk_buff *n;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end + skb->data_len, gfp_mask);
+	size = skb->end + skb->data_len;
 #else
-	n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
+	size = skb->end - skb->head + skb->data_len;
 #endif
+	n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
 	if (!n)
 		return NULL;

@@ -649,12 +723,14 @@ struct sk_buff *pskb_copy(struct sk_buff
 	/*
 	 *	Allocate the copy buffer
 	 */
+	int size;
 	struct sk_buff *n;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end, gfp_mask);
+	size = skb->end;
 #else
-	n = alloc_skb(skb->end - skb->head, gfp_mask);
+	size = skb->end - skb->head;
 #endif
+	n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
 	if (!n)
 		goto out;

@@ -673,8 +749,9 @@ struct sk_buff *pskb_copy(struct sk_buff
 		int i;

 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
-			get_page(skb_shinfo(n)->frags[i].page);
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			skb_shinfo(n)->frags[i] = *frag;
+			skb_get_page(n, frag->page);
 		}
 		skb_shinfo(n)->nr_frags = i;
 	}
@@ -722,7 +799,11 @@ int pskb_expand_head(struct sk_buff *skb

 	size = SKB_DATA_ALIGN(size);

-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	if (skb_emergency(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+			gfp_mask, -1, &net_skb_reserve, NULL);
 	if (!data)
 		goto nodata;

@@ -737,7 +818,7 @@ int pskb_expand_head(struct sk_buff *skb
 	       sizeof(struct skb_shared_info));

 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-		get_page(skb_shinfo(skb)->frags[i].page);
+		skb_get_page(skb, skb_shinfo(skb)->frags[i].page);

 	if (skb_shinfo(skb)->frag_list)
 		skb_clone_fraglist(skb);
@@ -816,8 +897,8 @@ struct sk_buff *skb_copy_expand(const st
 	/*
 	 *	Allocate the copy buffer
 	 */
-	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
-				      gfp_mask);
+	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+					gfp_mask, skb_alloc_rx_flag(skb), -1);
 	int oldheadroom = skb_headroom(skb);
 	int head_copy_len, head_copy_off;
 	int off;
@@ -1006,7 +1087,7 @@ drop_pages:
 		skb_shinfo(skb)->nr_frags = i;

 		for (; i < nfrags; i++)
-			put_page(skb_shinfo(skb)->frags[i].page);
+			skb_put_page(skb, skb_shinfo(skb)->frags[i].page);

 		if (skb_shinfo(skb)->frag_list)
 			skb_drop_fraglist(skb);
@@ -1175,7 +1256,7 @@ pull_pages:
 	k = 0;
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		if (skb_shinfo(skb)->frags[i].size <= eat) {
-			put_page(skb_shinfo(skb)->frags[i].page);
+			skb_put_page(skb, skb_shinfo(skb)->frags[i].page);
 			eat -= skb_shinfo(skb)->frags[i].size;
 		} else {
 			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
@@ -1925,6 +2006,7 @@ static inline void skb_split_no_header(s
 		skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

 		if (pos < len) {
+			struct page *page = skb_shinfo(skb)->frags[i].page;
 			/* Split frag.
 			 * We have two variants in this case:
 			 * 1. Move all the frag to the second
@@ -1933,7 +2015,7 @@ static inline void skb_split_no_header(s
 			 * where splitting is expensive.
 			 * 2. Split is accurately. We make this.
 			 */
-			get_page(skb_shinfo(skb)->frags[i].page);
+			skb_get_page(skb1, page);
 			skb_shinfo(skb1)->frags[0].page_offset += len - pos;
 			skb_shinfo(skb1)->frags[0].size -= len - pos;
 			skb_shinfo(skb)->frags[i].size = len - pos;
@@ -2264,7 +2346,8 @@ struct sk_buff *skb_segment(struct sk_bu
 		if (hsize > len || !sg)
 			hsize = len;

-		nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
+		nskb = __alloc_skb(hsize + doffset + headroom, GFP_ATOMIC,
+				   skb_alloc_rx_flag(skb), -1);
 		if (unlikely(!nskb))
 			goto err;

@@ -2302,7 +2385,7 @@ struct sk_buff *skb_segment(struct sk_bu
 		BUG_ON(i >= nfrags);

 		*frag = skb_shinfo(skb)->frags[i];
-		get_page(frag->page);
+		skb_get_page(nskb, frag->page);
 		size = frag->size;

 		if (pos < offset) {