From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: mm: memory reserve management
Patch-mainline: No
References: FATE#303834

Generic reserve management code.

It provides methods to reserve and charge. Upon this, generic alloc/free style
reserve pools could be built, which could fully replace mempool_t
functionality.

It should also allow for a Banker's algorithm replacement of __GFP_NOFAIL.

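A minimal usage sketch of the interface (illustrative only, not part of this
patch; the "foo" cache, the reserve name and the helper functions are made up,
and error handling is omitted):

	static struct mem_reserve foo_reserve;

	void foo_reserve_setup(struct kmem_cache *foo_cachep)
	{
		/* hang the reserve off the global root, back 16 objects */
		mem_reserve_init(&foo_reserve, "foo reserve", &mem_reserve_root);
		mem_reserve_kmem_cache_set(&foo_reserve, foo_cachep, 16);
	}

	void *foo_alloc(struct kmem_cache *foo_cachep, gfp_t gfp)
	{
		int emerg;
		void *obj;

		/* dips into the reserve only when the regular path fails */
		obj = kmem_cache_alloc_reserve(foo_cachep, gfp, -1,
					       &foo_reserve, &emerg);

		/* later: kmem_cache_free_reserve(foo_cachep, obj,
		 *				  &foo_reserve, emerg); */
		return obj;
	}
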
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Suresh Jayaraman <sjayaraman@suse.de>

---
 include/linux/reserve.h |  198 ++++++++++++++
 include/linux/slab.h    |   20 -
 mm/Makefile             |    2
 mm/reserve.c            |  637 ++++++++++++++++++++++++++++++++++++++++++++++++
 mm/slub.c               |    2
 5 files changed, 848 insertions(+), 11 deletions(-)

--- /dev/null
+++ b/include/linux/reserve.h
@@ -0,0 +1,198 @@
+/*
+ * Memory reserve management.
+ *
+ * Copyright (C) 2007-2008 Red Hat, Inc.,
+ *			   Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef _LINUX_RESERVE_H
+#define _LINUX_RESERVE_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+
+struct mem_reserve {
+	struct mem_reserve *parent;
+	struct list_head children;
+	struct list_head siblings;
+
+	const char *name;
+
+	long pages;
+	long limit;
+	long usage;
+	spinlock_t lock;	/* protects limit and usage */
+
+	wait_queue_head_t waitqueue;
+};
+
+extern struct mem_reserve mem_reserve_root;
+
+void mem_reserve_init(struct mem_reserve *res, const char *name,
+		      struct mem_reserve *parent);
+int mem_reserve_connect(struct mem_reserve *new_child,
+			struct mem_reserve *node);
+void mem_reserve_disconnect(struct mem_reserve *node);
+
+int mem_reserve_pages_set(struct mem_reserve *res, long pages);
+int mem_reserve_pages_add(struct mem_reserve *res, long pages);
+int mem_reserve_pages_charge(struct mem_reserve *res, long pages);
+
+int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes);
+int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes);
+
+struct kmem_cache;
+
+int mem_reserve_kmem_cache_set(struct mem_reserve *res,
+			       struct kmem_cache *s,
+			       int objects);
+int mem_reserve_kmem_cache_charge(struct mem_reserve *res,
+				  struct kmem_cache *s, long objs);
+
+void *___kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
+			 struct mem_reserve *res, int *emerg);
+
+static inline
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
+			struct mem_reserve *res, int *emerg)
+{
+	void *obj;
+
+	obj = __kmalloc_node_track_caller(size,
+			flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node, ip);
+	if (!obj)
+		obj = ___kmalloc_reserve(size, flags, node, ip, res, emerg);
+
+	return obj;
+}
+
+/**
+ * kmalloc_reserve() - kmalloc() and charge against @res for @emerg allocations
+ * @size - size of the requested memory region
+ * @gfp - allocation flags to use for this allocation
+ * @node - preferred memory node for this allocation
+ * @res - reserve to charge emergency allocations against
+ * @emerg - bit 0 is set when the allocation was an emergency allocation
+ *
+ * Returns NULL on failure
+ */
+#define kmalloc_reserve(size, gfp, node, res, emerg) \
+	__kmalloc_reserve(size, gfp, node, \
+			  __builtin_return_address(0), res, emerg)
+
+void __kfree_reserve(void *obj, struct mem_reserve *res, int emerg);
+
+/**
+ * kfree_reserve() - kfree() and uncharge against @res for @emerg allocations
+ * @obj - memory to free
+ * @res - reserve to uncharge emergency allocations from
+ * @emerg - was this an emergency allocation
+ */
+static inline
+void kfree_reserve(void *obj, struct mem_reserve *res, int emerg)
+{
+	if (unlikely(obj && res && emerg))
+		__kfree_reserve(obj, res, emerg);
+	else
+		kfree(obj);
+}
+
+void *__kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
+				 struct mem_reserve *res, int *emerg);
+
+/**
+ * kmem_cache_alloc_reserve() - kmem_cache_alloc() and charge against @res
+ * @s - kmem_cache to allocate from
+ * @gfp - allocation flags to use for this allocation
+ * @node - preferred memory node for this allocation
+ * @res - reserve to charge emergency allocations against
+ * @emerg - bit 0 is set when the allocation was an emergency allocation
+ *
+ * Returns NULL on failure
+ */
+static inline
+void *kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
+			       struct mem_reserve *res, int *emerg)
+{
+	void *obj;
+
+	obj = kmem_cache_alloc_node(s,
+			flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
+	if (!obj)
+		obj = __kmem_cache_alloc_reserve(s, flags, node, res, emerg);
+
+	return obj;
+}
+
+void __kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
+			       struct mem_reserve *res, int emerg);
+
+/**
+ * kmem_cache_free_reserve() - kmem_cache_free() and uncharge against @res
+ * @s - kmem_cache to free to
+ * @obj - memory to free
+ * @res - reserve to uncharge emergency allocations from
+ * @emerg - was this an emergency allocation
+ */
+static inline
+void kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
+			     struct mem_reserve *res, int emerg)
+{
+	if (unlikely(obj && res && emerg))
+		__kmem_cache_free_reserve(s, obj, res, emerg);
+	else
+		kmem_cache_free(s, obj);
+}
+
+struct page *__alloc_pages_reserve(int node, gfp_t flags, int order,
+				   struct mem_reserve *res, int *emerg);
+
+/**
+ * alloc_pages_reserve() - alloc_pages() and charge against @res
+ * @node - preferred memory node for this allocation
+ * @gfp - allocation flags to use for this allocation
+ * @order - page order
+ * @res - reserve to charge emergency allocations against
+ * @emerg - bit 0 is set when the allocation was an emergency allocation
+ *
+ * Returns NULL on failure
+ */
+static inline
+struct page *alloc_pages_reserve(int node, gfp_t flags, int order,
+				 struct mem_reserve *res, int *emerg)
+{
+	struct page *page;
+
+	page = alloc_pages_node(node,
+			flags | __GFP_NOMEMALLOC | __GFP_NOWARN, order);
+	if (!page)
+		page = __alloc_pages_reserve(node, flags, order, res, emerg);
+
+	return page;
+}
+
+void __free_pages_reserve(struct page *page, int order,
+			  struct mem_reserve *res, int emerg);
+
+/**
+ * free_pages_reserve() - __free_pages() and uncharge against @res
+ * @page - page to free
+ * @order - page order
+ * @res - reserve to uncharge emergency allocations from
+ * @emerg - was this an emergency allocation
+ */
+static inline
+void free_pages_reserve(struct page *page, int order,
+			struct mem_reserve *res, int emerg)
+{
+	if (unlikely(page && res && emerg))
+		__free_pages_reserve(page, order, res, emerg);
+	else
+		__free_pages(page, order);
+}
+
+#endif /* _LINUX_RESERVE_H */
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -230,13 +230,14 @@ static inline void *kmem_cache_alloc_nod
  */
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
 extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
-#define kmalloc_track_caller(size, flags) \
-	__kmalloc_track_caller(size, flags, __builtin_return_address(0))
 #else
-#define kmalloc_track_caller(size, flags) \
+#define __kmalloc_track_caller(size, flags, ip) \
 	__kmalloc(size, flags)
 #endif /* DEBUG_SLAB */
 
+#define kmalloc_track_caller(size, flags) \
+	__kmalloc_track_caller(size, flags, __builtin_return_address(0))
+
 #ifdef CONFIG_NUMA
 /*
  * kmalloc_node_track_caller is a special version of kmalloc_node that
@@ -248,21 +249,22 @@ extern void *__kmalloc_track_caller(size
  */
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
-#define kmalloc_node_track_caller(size, flags, node) \
-	__kmalloc_node_track_caller(size, flags, node, \
-			__builtin_return_address(0))
 #else
-#define kmalloc_node_track_caller(size, flags, node) \
+#define __kmalloc_node_track_caller(size, flags, node, ip) \
 	__kmalloc_node(size, flags, node)
 #endif
 
 #else /* CONFIG_NUMA */
 
-#define kmalloc_node_track_caller(size, flags, node) \
-	kmalloc_track_caller(size, flags)
+#define __kmalloc_node_track_caller(size, flags, node, ip) \
+	__kmalloc_track_caller(size, flags, ip)
 
 #endif /* DEBUG_SLAB */
 
+#define kmalloc_node_track_caller(size, flags, node) \
+	__kmalloc_node_track_caller(size, flags, node, \
+			__builtin_return_address(0))
+
 /*
  * Shortcuts
  */
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o mm_init.o $(mmu-y)
+			   page_isolation.o mm_init.o reserve.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
--- /dev/null
+++ b/mm/reserve.c
@@ -0,0 +1,637 @@
+/*
+ * Memory reserve management.
+ *
+ * Copyright (C) 2007-2008, Red Hat, Inc.,
+ *			    Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Description:
+ *
+ * Manage a set of memory reserves.
+ *
+ * A memory reserve is a reserve for a specified number of objects of a
+ * specified size. Since memory is managed in pages, this reserve demand is
+ * then translated into a page unit.
+ *
+ * So each reserve has a specified object limit, an object usage count and a
+ * number of pages required to back these objects.
+ *
+ * Usage is charged against a reserve; if the charge fails, the resource must
+ * not be allocated/used.
+ *
+ * The reserves are managed in a tree, and the resource demands (pages and
+ * limit) are propagated up the tree. Obviously the object limit will be
+ * meaningless as soon as units start mixing, but the required page reserve
+ * (being of one unit) is still valid at the root.
+ *
+ * It is the page demand of the root node that is used to set the global
+ * reserve (adjust_memalloc_reserve() which sets zone->pages_emerg).
+ *
+ * As long as a subtree has the same usage unit, an aggregate node can be used
+ * to charge against instead of the leaf nodes. However, be consistent about
+ * which node is charged; resource usage is not propagated up the tree (for
+ * performance reasons).
+ */
+
+#include <linux/reserve.h>
+#include <linux/mutex.h>
+#include <linux/mmzone.h>
+#include <linux/log2.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static DEFINE_MUTEX(mem_reserve_mutex);
+
+/**
+ * @mem_reserve_root - the global reserve root
+ *
+ * The global reserve is empty, and has no limit unit, it merely
+ * acts as an aggregation point for reserves and an interface to
+ * adjust_memalloc_reserve().
+ */
+struct mem_reserve mem_reserve_root = {
+	.children = LIST_HEAD_INIT(mem_reserve_root.children),
+	.siblings = LIST_HEAD_INIT(mem_reserve_root.siblings),
+	.name = "total reserve",
+	.lock = __SPIN_LOCK_UNLOCKED(mem_reserve_root.lock),
+	.waitqueue = __WAIT_QUEUE_HEAD_INITIALIZER(mem_reserve_root.waitqueue),
+};
+EXPORT_SYMBOL_GPL(mem_reserve_root);
+
+/**
+ * mem_reserve_init() - initialize a memory reserve object
+ * @res - the new reserve object
+ * @name - a name for this reserve
+ * @parent - when non NULL, the parent to connect to.
+ */
+void mem_reserve_init(struct mem_reserve *res, const char *name,
+		      struct mem_reserve *parent)
+{
+	memset(res, 0, sizeof(*res));
+	INIT_LIST_HEAD(&res->children);
+	INIT_LIST_HEAD(&res->siblings);
+	res->name = name;
+	spin_lock_init(&res->lock);
+	init_waitqueue_head(&res->waitqueue);
+
+	if (parent)
+		mem_reserve_connect(res, parent);
+}
+EXPORT_SYMBOL_GPL(mem_reserve_init);
+
+/*
+ * propagate the pages and limit changes up the (sub)tree.
+ */
+static void __calc_reserve(struct mem_reserve *res, long pages, long limit)
+{
+	unsigned long flags;
+
+	for ( ; res; res = res->parent) {
+		res->pages += pages;
+
+		if (limit) {
+			spin_lock_irqsave(&res->lock, flags);
+			res->limit += limit;
+			spin_unlock_irqrestore(&res->lock, flags);
+		}
+	}
+}
+
+/**
+ * __mem_reserve_add() - primitive to change the size of a reserve
+ * @res - reserve to change
+ * @pages - page delta
+ * @limit - usage limit delta
+ *
+ * Returns -ENOMEM when a size increase is not possible atm.
+ */
+static int __mem_reserve_add(struct mem_reserve *res, long pages, long limit)
+{
+	int ret = 0;
+	long reserve;
+
+	/*
+	 * This looks more complex than need be; that is because we handle
+	 * the case where @res isn't actually connected to mem_reserve_root.
+	 *
+	 * So, by propagating the new pages up the (sub)tree and computing
+	 * the difference in mem_reserve_root.pages we find if this action
+	 * affects the actual reserve.
+	 *
+	 * The (partial) propagation also means that mem_reserve_connect()
+	 * need only look at the direct child, since each disconnected
+	 * sub-tree is fully up-to-date.
+	 */
+	reserve = mem_reserve_root.pages;
+	__calc_reserve(res, pages, 0);
+	reserve = mem_reserve_root.pages - reserve;
+
+	if (reserve) {
+		ret = adjust_memalloc_reserve(reserve);
+		if (ret)
+			__calc_reserve(res, -pages, 0);
+	}
+
+	/*
+	 * Delay updating the limits until we've acquired the resources to
+	 * back it.
+	 */
+	if (!ret)
+		__calc_reserve(res, 0, limit);
+
+	return ret;
+}
+
+/**
+ * __mem_reserve_charge() - primitive to charge object usage of a reserve
+ * @res - reserve to charge
+ * @charge - size of the charge
+ *
+ * Returns non-zero on success, zero on failure.
+ */
+static
+int __mem_reserve_charge(struct mem_reserve *res, long charge)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&res->lock, flags);
+	if (charge < 0 || res->usage + charge < res->limit) {
+		res->usage += charge;
+		if (unlikely(res->usage < 0))
+			res->usage = 0;
+		ret = 1;
+	}
+	if (charge < 0)
+		wake_up_all(&res->waitqueue);
+	spin_unlock_irqrestore(&res->lock, flags);
+
+	return ret;
+}
+
+/**
+ * mem_reserve_connect() - connect a reserve to another in a child-parent relation
+ * @new_child - the reserve node to connect (child)
+ * @node - the reserve node to connect to (parent)
+ *
+ * Connecting a node results in an increase of the reserve by the amount of
+ * pages in @new_child->pages if @node has a connection to mem_reserve_root.
+ *
+ * Returns -ENOMEM when the new connection would increase the reserve (parent
+ * is connected to mem_reserve_root) and there is no memory to do so.
+ *
+ * On error, the child is _NOT_ connected.
+ */
+int mem_reserve_connect(struct mem_reserve *new_child, struct mem_reserve *node)
+{
+	int ret;
+
+	WARN_ON(!new_child->name);
+
+	mutex_lock(&mem_reserve_mutex);
+	if (new_child->parent) {
+		ret = -EEXIST;
+		goto unlock;
+	}
+	new_child->parent = node;
+	list_add(&new_child->siblings, &node->children);
+	ret = __mem_reserve_add(node, new_child->pages, new_child->limit);
+	if (ret) {
+		new_child->parent = NULL;
+		list_del_init(&new_child->siblings);
+	}
+unlock:
+	mutex_unlock(&mem_reserve_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mem_reserve_connect);
+
+/**
+ * mem_reserve_disconnect() - sever a node's connection to the reserve tree
+ * @node - the node to disconnect
+ *
+ * Disconnecting a node results in a reduction of the reserve by @node->pages
+ * if node had a connection to mem_reserve_root.
+ */
+void mem_reserve_disconnect(struct mem_reserve *node)
+{
+	int ret;
+
+	BUG_ON(!node->parent);
+
+	mutex_lock(&mem_reserve_mutex);
+	if (!node->parent) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+	ret = __mem_reserve_add(node->parent, -node->pages, -node->limit);
+	if (!ret) {
+		node->parent = NULL;
+		list_del_init(&node->siblings);
+	}
+unlock:
+	mutex_unlock(&mem_reserve_mutex);
+
+	/*
+	 * We cannot fail to shrink the reserves, can we?
+	 */
+	WARN_ON(ret);
+}
+EXPORT_SYMBOL_GPL(mem_reserve_disconnect);
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Simple output of the reserve tree in: /proc/reserve_info
+ * Example:
+ *
+ * localhost ~ # cat /proc/reserve_info
+ * 1:0 "total reserve" 6232K 0/278581
+ * 2:1 "total network reserve" 6232K 0/278581
+ * 3:2 "network TX reserve" 212K 0/53
+ * 4:3 "protocol TX pages" 212K 0/53
+ * 5:2 "network RX reserve" 6020K 0/278528
+ * 6:5 "IPv4 route cache" 5508K 0/16384
+ * 7:5 "SKB data reserve" 512K 0/262144
+ * 8:7 "IPv4 fragment cache" 512K 0/262144
+ */
+
+static void mem_reserve_show_item(struct seq_file *m, struct mem_reserve *res,
+				  unsigned int parent, unsigned int *id)
+{
+	struct mem_reserve *child;
+	unsigned int my_id = ++*id;
+
+	seq_printf(m, "%d:%d \"%s\" %ldK %ld/%ld\n",
+		   my_id, parent, res->name,
+		   res->pages << (PAGE_SHIFT - 10),
+		   res->usage, res->limit);
+
+	list_for_each_entry(child, &res->children, siblings)
+		mem_reserve_show_item(m, child, my_id, id);
+}
+
+static int mem_reserve_show(struct seq_file *m, void *v)
+{
+	unsigned int ident = 0;
+
+	mutex_lock(&mem_reserve_mutex);
+	mem_reserve_show_item(m, &mem_reserve_root, ident, &ident);
+	mutex_unlock(&mem_reserve_mutex);
+
+	return 0;
+}
+
+static int mem_reserve_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mem_reserve_show, NULL);
+}
+
+static const struct file_operations mem_reserve_operations = {
+	.open = mem_reserve_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static __init int mem_reserve_proc_init(void)
+{
+	proc_create("reserve_info", S_IRUSR, NULL, &mem_reserve_operations);
+	return 0;
+}
+
+module_init(mem_reserve_proc_init);
+
+#endif
+
+/*
+ * alloc_page helpers
+ */
+
+/**
+ * mem_reserve_pages_set() - set the reserve's size in pages
+ * @res - reserve to set
+ * @pages - size in pages to set it to
+ *
+ * Returns -ENOMEM when it fails to set the reserve. On failure the old size
+ * is preserved.
+ */
+int mem_reserve_pages_set(struct mem_reserve *res, long pages)
+{
+	int ret;
+
+	mutex_lock(&mem_reserve_mutex);
+	pages -= res->pages;
+	ret = __mem_reserve_add(res, pages, pages * PAGE_SIZE);
+	mutex_unlock(&mem_reserve_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mem_reserve_pages_set);
+
+/**
+ * mem_reserve_pages_add() - change the size in a relative way
+ * @res - reserve to change
+ * @pages - number of pages to add (or subtract when negative)
+ *
+ * Similar to mem_reserve_pages_set, except that the argument is relative
+ * instead of absolute.
+ *
+ * Returns -ENOMEM when it fails to increase.
+ */
+int mem_reserve_pages_add(struct mem_reserve *res, long pages)
+{
+	int ret;
+
+	mutex_lock(&mem_reserve_mutex);
+	ret = __mem_reserve_add(res, pages, pages * PAGE_SIZE);
+	mutex_unlock(&mem_reserve_mutex);
+
+	return ret;
+}
+
+/**
+ * mem_reserve_pages_charge() - charge page usage to a reserve
+ * @res - reserve to charge
+ * @pages - size to charge
+ *
+ * Returns non-zero on success.
+ */
+int mem_reserve_pages_charge(struct mem_reserve *res, long pages)
+{
+	return __mem_reserve_charge(res, pages * PAGE_SIZE);
+}
+EXPORT_SYMBOL_GPL(mem_reserve_pages_charge);
+
+/*
+ * kmalloc helpers
+ */
+
+/**
+ * mem_reserve_kmalloc_set() - set this reserve to @bytes worth of kmalloc
+ * @res - reserve to change
+ * @bytes - size in bytes to reserve
+ *
+ * Returns -ENOMEM on failure.
+ */
+int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes)
+{
+	int ret;
+	long pages;
+
+	mutex_lock(&mem_reserve_mutex);
+	pages = kmalloc_estimate_bytes(GFP_ATOMIC, bytes);
+	pages -= res->pages;
+	bytes -= res->limit;
+	ret = __mem_reserve_add(res, pages, bytes);
+	mutex_unlock(&mem_reserve_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_set);
+
+/**
+ * mem_reserve_kmalloc_charge() - charge bytes to a reserve
+ * @res - reserve to charge
+ * @bytes - bytes to charge
+ *
+ * Returns non-zero on success.
+ */
+int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes)
+{
+	if (bytes < 0)
+		bytes = -roundup_pow_of_two(-bytes);
+	else
+		bytes = roundup_pow_of_two(bytes);
+
+	return __mem_reserve_charge(res, bytes);
+}
+EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_charge);
+
+/*
+ * kmem_cache helpers
+ */
+
+/**
+ * mem_reserve_kmem_cache_set() - set reserve to @objects worth of kmem_cache_alloc of @s
+ * @res - reserve to set
+ * @s - kmem_cache to reserve from
+ * @objects - number of objects to reserve
+ *
+ * Returns -ENOMEM on failure.
+ */
+int mem_reserve_kmem_cache_set(struct mem_reserve *res, struct kmem_cache *s,
+			       int objects)
+{
+	int ret;
+	long pages, bytes;
+
+	mutex_lock(&mem_reserve_mutex);
+	pages = kmem_alloc_estimate(s, GFP_ATOMIC, objects);
+	pages -= res->pages;
+	bytes = objects * kmem_cache_size(s) - res->limit;
+	ret = __mem_reserve_add(res, pages, bytes);
+	mutex_unlock(&mem_reserve_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_set);
+
+/**
+ * mem_reserve_kmem_cache_charge() - charge (or uncharge) usage of objs
+ * @res - reserve to charge
+ * @objs - objects to charge for
+ *
+ * Returns non-zero on success.
+ */
+int mem_reserve_kmem_cache_charge(struct mem_reserve *res, struct kmem_cache *s,
+				  long objs)
+{
+	return __mem_reserve_charge(res, objs * kmem_cache_size(s));
+}
+EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_charge);
+
+/*
+ * Alloc wrappers.
+ *
+ * Actual usage is commented in linux/reserve.h where the interface functions
+ * live. Furthermore, the code is 3 instances of the same paradigm, hence only
+ * the first contains extensive comments.
+ */
+
+/*
+ * kmalloc/kfree
+ */
+
+void *___kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
+			 struct mem_reserve *res, int *emerg)
+{
+	void *obj;
+	gfp_t gfp;
+
+	/*
+	 * Try a regular allocation; when that fails and we're not entitled
+	 * to the reserves, fail.
+	 */
+	gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
+	obj = __kmalloc_node_track_caller(size, gfp, node, ip);
+
+	if (obj || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
+		goto out;
+
+	/*
+	 * If we were given a reserve to charge against, try that.
+	 */
+	if (res && !mem_reserve_kmalloc_charge(res, size)) {
+		/*
+		 * If we failed to charge and we're not allowed to wait for
+		 * it to succeed, bail.
+		 */
+		if (!(flags & __GFP_WAIT))
+			goto out;
+
+		/*
+		 * Wait for a successful charge against the reserve. All
+		 * uncharge operations against this reserve will wake us up.
+		 */
+		wait_event(res->waitqueue,
+			   mem_reserve_kmalloc_charge(res, size));
+
+		/*
+		 * After waiting for it, again try a regular allocation.
+		 * Pressure could have lifted during our sleep. If this
+		 * succeeds, uncharge the reserve.
+		 */
+		obj = __kmalloc_node_track_caller(size, gfp, node, ip);
+		if (obj) {
+			mem_reserve_kmalloc_charge(res, -size);
+			goto out;
+		}
+	}
+
+	/*
+	 * Regular allocation failed, and we've successfully charged our
+	 * requested usage against the reserve. Do the emergency allocation.
+	 */
+	obj = __kmalloc_node_track_caller(size, flags, node, ip);
+	WARN_ON(!obj);
+	if (emerg)
+		*emerg = 1;
+
+out:
+	return obj;
+}
+
+void __kfree_reserve(void *obj, struct mem_reserve *res, int emerg)
+{
+	/*
+	 * ksize gives the full allocated size vs the requested size we used to
+	 * charge; however since we round up to the nearest power of two, this
+	 * should all work nicely.
+	 */
+	size_t size = ksize(obj);
+
+	kfree(obj);
+	/*
+	 * Free before uncharge; this ensures memory is actually present when
+	 * a subsequent charge succeeds.
+	 */
+	mem_reserve_kmalloc_charge(res, -size);
+}
+
+/*
+ * kmem_cache_alloc/kmem_cache_free
+ */
+
+void *__kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
+				 struct mem_reserve *res, int *emerg)
+{
+	void *obj;
+	gfp_t gfp;
+
+	gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
+	obj = kmem_cache_alloc_node(s, gfp, node);
+
+	if (obj || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
+		goto out;
+
+	if (res && !mem_reserve_kmem_cache_charge(res, s, 1)) {
+		if (!(flags & __GFP_WAIT))
+			goto out;
+
+		wait_event(res->waitqueue,
+			   mem_reserve_kmem_cache_charge(res, s, 1));
+
+		obj = kmem_cache_alloc_node(s, gfp, node);
+		if (obj) {
+			mem_reserve_kmem_cache_charge(res, s, -1);
+			goto out;
+		}
+	}
+
+	obj = kmem_cache_alloc_node(s, flags, node);
+	WARN_ON(!obj);
+	if (emerg)
+		*emerg = 1;
+
+out:
+	return obj;
+}
+
+void __kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
+			       struct mem_reserve *res, int emerg)
+{
+	kmem_cache_free(s, obj);
+	mem_reserve_kmem_cache_charge(res, s, -1);
+}
+
+/*
+ * alloc_pages/free_pages
+ */
+
+struct page *__alloc_pages_reserve(int node, gfp_t flags, int order,
+				   struct mem_reserve *res, int *emerg)
+{
+	struct page *page;
+	gfp_t gfp;
+	long pages = 1 << order;
+
+	gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
+	page = alloc_pages_node(node, gfp, order);
+
+	if (page || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
+		goto out;
+
+	if (res && !mem_reserve_pages_charge(res, pages)) {
+		if (!(flags & __GFP_WAIT))
+			goto out;
+
+		wait_event(res->waitqueue,
+			   mem_reserve_pages_charge(res, pages));
+
+		page = alloc_pages_node(node, gfp, order);
+		if (page) {
+			mem_reserve_pages_charge(res, -pages);
+			goto out;
+		}
+	}
+
+	page = alloc_pages_node(node, flags, order);
+	WARN_ON(!page);
+	if (emerg)
+		*emerg = 1;
+
+out:
+	return page;
+}
+
+void __free_pages_reserve(struct page *page, int order,
+			  struct mem_reserve *res, int emerg)
+{
+	__free_pages(page, order);
+	mem_reserve_pages_charge(res, -(1 << order));
+}
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2728,6 +2728,7 @@ void *__kmalloc(size_t size, gfp_t flags
 }
 EXPORT_SYMBOL(__kmalloc);
 
+#ifdef CONFIG_NUMA
 static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 {
 	struct page *page = alloc_pages_node(node, flags | __GFP_COMP,
@@ -2739,7 +2740,6 @@ static void *kmalloc_large_node(size_t s
 	return NULL;
 }
 
-#ifdef CONFIG_NUMA
 void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	struct kmem_cache *s;