]>
Commit | Line | Data |
---|---|---|
00e5a55c BS |
1 | From: Peter Zijlstra <a.p.zijlstra@chello.nl> |
2 | Subject: mm: memory reserve management | |
3 | Patch-mainline: No | |
4 | References: FATE#303834 | |
5 | ||
6 | Generic reserve management code. | |
7 | ||
8 | It provides methods to reserve and charge. Upon this, generic alloc/free style | |
9 | reserve pools could be built, which could fully replace mempool_t | |
10 | functionality. | |
11 | ||
12 | It should also allow for a Banker's algorithm replacement of __GFP_NOFAIL. | |
13 | ||
14 | Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> | |
15 | Acked-by: Neil Brown <neilb@suse.de> | |
16 | Acked-by: Suresh Jayaraman <sjayaraman@suse.de> | |
17 | ||
18 | --- | |
19 | include/linux/reserve.h | 198 ++++++++++++++ | |
20 | include/linux/slab.h | 20 - | |
21 | mm/Makefile | 2 | |
22 | mm/reserve.c | 637 ++++++++++++++++++++++++++++++++++++++++++++++++ | |
23 | mm/slub.c | 2 | |
24 | 5 files changed, 848 insertions(+), 11 deletions(-) | |
25 | ||
26 | Index: linux-2.6.27/include/linux/reserve.h | |
27 | =================================================================== | |
28 | --- /dev/null | |
29 | +++ linux-2.6.27/include/linux/reserve.h | |
30 | @@ -0,0 +1,198 @@ | |
31 | +/* | |
32 | + * Memory reserve management. | |
33 | + * | |
34 | + * Copyright (C) 2007-2008 Red Hat, Inc., | |
35 | + * Peter Zijlstra <pzijlstr@redhat.com> | |
36 | + * | |
37 | + * This file contains the public data structure and API definitions. | |
38 | + */ | |
39 | + | |
40 | +#ifndef _LINUX_RESERVE_H | |
41 | +#define _LINUX_RESERVE_H | |
42 | + | |
43 | +#include <linux/list.h> | |
44 | +#include <linux/spinlock.h> | |
45 | +#include <linux/wait.h> | |
46 | +#include <linux/slab.h> | |
47 | + | |
48 | +struct mem_reserve { | |
49 | + struct mem_reserve *parent; | |
50 | + struct list_head children; | |
51 | + struct list_head siblings; | |
52 | + | |
53 | + const char *name; | |
54 | + | |
55 | + long pages; | |
56 | + long limit; | |
57 | + long usage; | |
58 | + spinlock_t lock; /* protects limit and usage */ | |
59 | + | |
60 | + wait_queue_head_t waitqueue; | |
61 | +}; | |
62 | + | |
63 | +extern struct mem_reserve mem_reserve_root; | |
64 | + | |
65 | +void mem_reserve_init(struct mem_reserve *res, const char *name, | |
66 | + struct mem_reserve *parent); | |
67 | +int mem_reserve_connect(struct mem_reserve *new_child, | |
68 | + struct mem_reserve *node); | |
69 | +void mem_reserve_disconnect(struct mem_reserve *node); | |
70 | + | |
71 | +int mem_reserve_pages_set(struct mem_reserve *res, long pages); | |
72 | +int mem_reserve_pages_add(struct mem_reserve *res, long pages); | |
73 | +int mem_reserve_pages_charge(struct mem_reserve *res, long pages); | |
74 | + | |
75 | +int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes); | |
76 | +int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes); | |
77 | + | |
78 | +struct kmem_cache; | |
79 | + | |
80 | +int mem_reserve_kmem_cache_set(struct mem_reserve *res, | |
81 | + struct kmem_cache *s, | |
82 | + int objects); | |
83 | +int mem_reserve_kmem_cache_charge(struct mem_reserve *res, | |
84 | + struct kmem_cache *s, long objs); | |
85 | + | |
86 | +void *___kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip, | |
87 | + struct mem_reserve *res, int *emerg); | |
88 | + | |
89 | +static inline | |
90 | +void *__kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip, | |
91 | + struct mem_reserve *res, int *emerg) | |
92 | +{ | |
93 | + void *obj; | |
94 | + | |
95 | + obj = __kmalloc_node_track_caller(size, | |
96 | + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node, ip); | |
97 | + if (!obj) | |
98 | + obj = ___kmalloc_reserve(size, flags, node, ip, res, emerg); | |
99 | + | |
100 | + return obj; | |
101 | +} | |
102 | + | |
103 | +/** | |
104 | + * kmalloc_reserve() - kmalloc() and charge against @res for @emerg allocations | |
105 | + * @size - size of the requested memory region | |
106 | + * @gfp - allocation flags to use for this allocation | |
107 | + * @node - preferred memory node for this allocation | |
108 | + * @res - reserve to charge emergency allocations against | |
109 | + * @emerg - bit 0 is set when the allocation was an emergency allocation | |
110 | + * | |
111 | + * Returns NULL on failure | |
112 | + */ | |
113 | +#define kmalloc_reserve(size, gfp, node, res, emerg) \ | |
114 | + __kmalloc_reserve(size, gfp, node, \ | |
115 | + __builtin_return_address(0), res, emerg) | |
116 | + | |
117 | +void __kfree_reserve(void *obj, struct mem_reserve *res, int emerg); | |
118 | + | |
119 | +/** | |
120 | + * kfree_reserve() - kfree() and uncharge against @res for @emerg allocations | |
121 | + * @obj - memory to free | |
122 | + * @res - reserve to uncharge emergency allocations from | |
123 | + * @emerg - was this an emergency allocation | |
124 | + */ | |
125 | +static inline | |
126 | +void kfree_reserve(void *obj, struct mem_reserve *res, int emerg) | |
127 | +{ | |
128 | + if (unlikely(obj && res && emerg)) | |
129 | + __kfree_reserve(obj, res, emerg); | |
130 | + else | |
131 | + kfree(obj); | |
132 | +} | |
133 | + | |
134 | +void *__kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node, | |
135 | + struct mem_reserve *res, int *emerg); | |
136 | + | |
137 | +/** | |
138 | + * kmem_cache_alloc_reserve() - kmem_cache_alloc() and charge against @res | |
139 | + * @s - kmem_cache to allocate from | |
140 | + * @gfp - allocation flags to use for this allocation | |
141 | + * @node - preferred memory node for this allocation | |
142 | + * @res - reserve to charge emergency allocations against | |
143 | + * @emerg - bit 0 is set when the allocation was an emergency allocation | |
144 | + * | |
145 | + * Returns NULL on failure | |
146 | + */ | |
147 | +static inline | |
148 | +void *kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node, | |
149 | + struct mem_reserve *res, int *emerg) | |
150 | +{ | |
151 | + void *obj; | |
152 | + | |
153 | + obj = kmem_cache_alloc_node(s, | |
154 | + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); | |
155 | + if (!obj) | |
156 | + obj = __kmem_cache_alloc_reserve(s, flags, node, res, emerg); | |
157 | + | |
158 | + return obj; | |
159 | +} | |
160 | + | |
161 | +void __kmem_cache_free_reserve(struct kmem_cache *s, void *obj, | |
162 | + struct mem_reserve *res, int emerg); | |
163 | + | |
164 | +/** | |
165 | + * kmem_cache_free_reserve() - kmem_cache_free() and uncharge against @res | |
166 | + * @s - kmem_cache to free to | |
167 | + * @obj - memory to free | |
168 | + * @res - reserve to uncharge emergency allocations from | |
169 | + * @emerg - was this an emergency allocation | |
170 | + */ | |
171 | +static inline | |
172 | +void kmem_cache_free_reserve(struct kmem_cache *s, void *obj, | |
173 | + struct mem_reserve *res, int emerg) | |
174 | +{ | |
175 | + if (unlikely(obj && res && emerg)) | |
176 | + __kmem_cache_free_reserve(s, obj, res, emerg); | |
177 | + else | |
178 | + kmem_cache_free(s, obj); | |
179 | +} | |
180 | + | |
181 | +struct page *__alloc_pages_reserve(int node, gfp_t flags, int order, | |
182 | + struct mem_reserve *res, int *emerg); | |
183 | + | |
184 | +/** | |
185 | + * alloc_pages_reserve() - alloc_pages() and charge against @res | |
186 | + * @node - preferred memory node for this allocation | |
187 | + * @gfp - allocation flags to use for this allocation | |
188 | + * @order - page order | |
189 | + * @res - reserve to charge emergency allocations against | |
190 | + * @emerg - bit 0 is set when the allocation was an emergency allocation | |
191 | + * | |
192 | + * Returns NULL on failure | |
193 | + */ | |
194 | +static inline | |
195 | +struct page *alloc_pages_reserve(int node, gfp_t flags, int order, | |
196 | + struct mem_reserve *res, int *emerg) | |
197 | +{ | |
198 | + struct page *page; | |
199 | + | |
200 | + page = alloc_pages_node(node, | |
201 | + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, order); | |
202 | + if (!page) | |
203 | + page = __alloc_pages_reserve(node, flags, order, res, emerg); | |
204 | + | |
205 | + return page; | |
206 | +} | |
207 | + | |
208 | +void __free_pages_reserve(struct page *page, int order, | |
209 | + struct mem_reserve *res, int emerg); | |
210 | + | |
211 | +/** | |
212 | + * free_pages_reserve() - __free_pages() and uncharge against @res | |
213 | + * @page - page to free | |
214 | + * @order - page order | |
215 | + * @res - reserve to uncharge emergency allocations from | |
216 | + * @emerg - was this an emergency allocation | |
217 | + */ | |
218 | +static inline | |
219 | +void free_pages_reserve(struct page *page, int order, | |
220 | + struct mem_reserve *res, int emerg) | |
221 | +{ | |
222 | + if (unlikely(page && res && emerg)) | |
223 | + __free_pages_reserve(page, order, res, emerg); | |
224 | + else | |
225 | + __free_pages(page, order); | |
226 | +} | |
227 | + | |
228 | +#endif /* _LINUX_RESERVE_H */ | |
229 | Index: linux-2.6.27/mm/Makefile | |
230 | =================================================================== | |
231 | --- linux-2.6.27.orig/mm/Makefile | |
232 | +++ linux-2.6.27/mm/Makefile | |
233 | @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o | |
234 | maccess.o page_alloc.o page-writeback.o pdflush.o \ | |
235 | readahead.o swap.o truncate.o vmscan.o \ | |
236 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | |
237 | - page_isolation.o mm_init.o $(mmu-y) | |
238 | + page_isolation.o mm_init.o reserve.o $(mmu-y) | |
239 | ||
240 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o | |
241 | obj-$(CONFIG_BOUNCE) += bounce.o | |
242 | Index: linux-2.6.27/mm/reserve.c | |
243 | =================================================================== | |
244 | --- /dev/null | |
245 | +++ linux-2.6.27/mm/reserve.c | |
246 | @@ -0,0 +1,637 @@ | |
247 | +/* | |
248 | + * Memory reserve management. | |
249 | + * | |
250 | + * Copyright (C) 2007-2008, Red Hat, Inc., | |
251 | + * Peter Zijlstra <pzijlstr@redhat.com> | |
252 | + * | |
253 | + * Description: | |
254 | + * | |
255 | + * Manage a set of memory reserves. | |
256 | + * | |
257 | + * A memory reserve is a reserve for a specified number of objects of a specified | |
258 | + * size. Since memory is managed in pages, this reserve demand is then | |
259 | + * translated into a page unit. | |
260 | + * | |
261 | + * So each reserve has a specified object limit, an object usage count and a | |
262 | + * number of pages required to back these objects. | |
263 | + * | |
264 | + * Usage is charged against a reserve, if the charge fails, the resource must | |
265 | + * not be allocated/used. | |
266 | + * | |
267 | + * The reserves are managed in a tree, and the resource demands (pages and | |
268 | + * limit) are propagated up the tree. Obviously the object limit will be | |
269 | + * meaningless as soon as the unit starts mixing, but the required page reserve | |
270 | + * (being of one unit) is still valid at the root. | |
271 | + * | |
272 | + * It is the page demand of the root node that is used to set the global | |
273 | + * reserve (adjust_memalloc_reserve() which sets zone->pages_emerg). | |
274 | + * | |
275 | + * As long as a subtree has the same usage unit, an aggregate node can be used | |
276 | + * to charge against, instead of the leaf nodes. However, do be consistent with | |
277 | + * who is charged, resource usage is not propagated up the tree (for | |
278 | + * performance reasons). | |
279 | + */ | |
280 | + | |
281 | +#include <linux/reserve.h> | |
282 | +#include <linux/mutex.h> | |
283 | +#include <linux/mmzone.h> | |
284 | +#include <linux/log2.h> | |
285 | +#include <linux/proc_fs.h> | |
286 | +#include <linux/seq_file.h> | |
287 | +#include <linux/module.h> | |
288 | +#include <linux/slab.h> | |
289 | +#include <linux/sched.h> | |
290 | +#include "internal.h" | |
291 | + | |
292 | +static DEFINE_MUTEX(mem_reserve_mutex); | |
293 | + | |
294 | +/** | |
295 | + * @mem_reserve_root - the global reserve root | |
296 | + * | |
297 | + * The global reserve is empty, and has no limit unit, it merely | |
298 | + * acts as an aggregation point for reserves and an interface to | |
299 | + * adjust_memalloc_reserve(). | |
300 | + */ | |
301 | +struct mem_reserve mem_reserve_root = { | |
302 | + .children = LIST_HEAD_INIT(mem_reserve_root.children), | |
303 | + .siblings = LIST_HEAD_INIT(mem_reserve_root.siblings), | |
304 | + .name = "total reserve", | |
305 | + .lock = __SPIN_LOCK_UNLOCKED(mem_reserve_root.lock), | |
306 | + .waitqueue = __WAIT_QUEUE_HEAD_INITIALIZER(mem_reserve_root.waitqueue), | |
307 | +}; | |
308 | +EXPORT_SYMBOL_GPL(mem_reserve_root); | |
309 | + | |
310 | +/** | |
311 | + * mem_reserve_init() - initialize a memory reserve object | |
312 | + * @res - the new reserve object | |
313 | + * @name - a name for this reserve | |
314 | + * @parent - when non NULL, the parent to connect to. | |
315 | + */ | |
316 | +void mem_reserve_init(struct mem_reserve *res, const char *name, | |
317 | + struct mem_reserve *parent) | |
318 | +{ | |
319 | + memset(res, 0, sizeof(*res)); | |
320 | + INIT_LIST_HEAD(&res->children); | |
321 | + INIT_LIST_HEAD(&res->siblings); | |
322 | + res->name = name; | |
323 | + spin_lock_init(&res->lock); | |
324 | + init_waitqueue_head(&res->waitqueue); | |
325 | + | |
326 | + if (parent) | |
327 | + mem_reserve_connect(res, parent); | |
328 | +} | |
329 | +EXPORT_SYMBOL_GPL(mem_reserve_init); | |
330 | + | |
331 | +/* | |
332 | + * propagate the pages and limit changes up the (sub)tree. | |
333 | + */ | |
334 | +static void __calc_reserve(struct mem_reserve *res, long pages, long limit) | |
335 | +{ | |
336 | + unsigned long flags; | |
337 | + | |
338 | + for ( ; res; res = res->parent) { | |
339 | + res->pages += pages; | |
340 | + | |
341 | + if (limit) { | |
342 | + spin_lock_irqsave(&res->lock, flags); | |
343 | + res->limit += limit; | |
344 | + spin_unlock_irqrestore(&res->lock, flags); | |
345 | + } | |
346 | + } | |
347 | +} | |
348 | + | |
349 | +/** | |
350 | + * __mem_reserve_add() - primitive to change the size of a reserve | |
351 | + * @res - reserve to change | |
352 | + * @pages - page delta | |
353 | + * @limit - usage limit delta | |
354 | + * | |
355 | + * Returns -ENOMEM when a size increase is not possible atm. | |
356 | + */ | |
357 | +static int __mem_reserve_add(struct mem_reserve *res, long pages, long limit) | |
358 | +{ | |
359 | + int ret = 0; | |
360 | + long reserve; | |
361 | + | |
362 | + /* | |
363 | + * This looks more complex than need be, that is because we handle | |
364 | + * the case where @res isn't actually connected to mem_reserve_root. | |
365 | + * | |
366 | + * So, by propagating the new pages up the (sub)tree and computing | |
367 | + * the difference in mem_reserve_root.pages we find if this action | |
368 | + * affects the actual reserve. | |
369 | + * | |
370 | + * The (partial) propagation also makes that mem_reserve_connect() | |
371 | + * needs only look at the direct child, since each disconnected | |
372 | + * sub-tree is fully up-to-date. | |
373 | + */ | |
374 | + reserve = mem_reserve_root.pages; | |
375 | + __calc_reserve(res, pages, 0); | |
376 | + reserve = mem_reserve_root.pages - reserve; | |
377 | + | |
378 | + if (reserve) { | |
379 | + ret = adjust_memalloc_reserve(reserve); | |
380 | + if (ret) | |
381 | + __calc_reserve(res, -pages, 0); | |
382 | + } | |
383 | + | |
384 | + /* | |
385 | + * Delay updating the limits until we've acquired the resources to | |
386 | + * back it. | |
387 | + */ | |
388 | + if (!ret) | |
389 | + __calc_reserve(res, 0, limit); | |
390 | + | |
391 | + return ret; | |
392 | +} | |
393 | + | |
394 | +/** | |
395 | + * __mem_reserve_charge() - primitive to charge object usage of a reserve | |
396 | + * @res - reserve to charge | |
397 | + * @charge - size of the charge | |
398 | + * | |
399 | + * Returns non-zero on success, zero on failure. | |
400 | + */ | |
401 | +static | |
402 | +int __mem_reserve_charge(struct mem_reserve *res, long charge) | |
403 | +{ | |
404 | + unsigned long flags; | |
405 | + int ret = 0; | |
406 | + | |
407 | + spin_lock_irqsave(&res->lock, flags); | |
408 | + if (charge < 0 || res->usage + charge < res->limit) { | |
409 | + res->usage += charge; | |
410 | + if (unlikely(res->usage < 0)) | |
411 | + res->usage = 0; | |
412 | + ret = 1; | |
413 | + } | |
414 | + if (charge < 0) | |
415 | + wake_up_all(&res->waitqueue); | |
416 | + spin_unlock_irqrestore(&res->lock, flags); | |
417 | + | |
418 | + return ret; | |
419 | +} | |
420 | + | |
421 | +/** | |
422 | + * mem_reserve_connect() - connect a reserve to another in a child-parent relation | |
423 | + * @new_child - the reserve node to connect (child) | |
424 | + * @node - the reserve node to connect to (parent) | |
425 | + * | |
426 | + * Connecting a node results in an increase of the reserve by the amount of | |
427 | + * pages in @new_child->pages if @node has a connection to mem_reserve_root. | |
428 | + * | |
429 | + * Returns -ENOMEM when the new connection would increase the reserve (parent | |
430 | + * is connected to mem_reserve_root) and there is no memory to do so. | |
431 | + * | |
432 | + * On error, the child is _NOT_ connected. | |
433 | + */ | |
434 | +int mem_reserve_connect(struct mem_reserve *new_child, struct mem_reserve *node) | |
435 | +{ | |
436 | + int ret; | |
437 | + | |
438 | + WARN_ON(!new_child->name); | |
439 | + | |
440 | + mutex_lock(&mem_reserve_mutex); | |
441 | + if (new_child->parent) { | |
442 | + ret = -EEXIST; | |
443 | + goto unlock; | |
444 | + } | |
445 | + new_child->parent = node; | |
446 | + list_add(&new_child->siblings, &node->children); | |
447 | + ret = __mem_reserve_add(node, new_child->pages, new_child->limit); | |
448 | + if (ret) { | |
449 | + new_child->parent = NULL; | |
450 | + list_del_init(&new_child->siblings); | |
451 | + } | |
452 | +unlock: | |
453 | + mutex_unlock(&mem_reserve_mutex); | |
454 | + | |
455 | + return ret; | |
456 | +} | |
457 | +EXPORT_SYMBOL_GPL(mem_reserve_connect); | |
458 | + | |
459 | +/** | |
460 | + * mem_reserve_disconnect() - sever a nodes connection to the reserve tree | |
461 | + * @node - the node to disconnect | |
462 | + * | |
463 | + * Disconnecting a node results in a reduction of the reserve by @node->pages | |
464 | + * if node had a connection to mem_reserve_root. | |
465 | + */ | |
466 | +void mem_reserve_disconnect(struct mem_reserve *node) | |
467 | +{ | |
468 | + int ret; | |
469 | + | |
470 | + BUG_ON(!node->parent); | |
471 | + | |
472 | + mutex_lock(&mem_reserve_mutex); | |
473 | + if (!node->parent) { | |
474 | + ret = -ENOENT; | |
475 | + goto unlock; | |
476 | + } | |
477 | + ret = __mem_reserve_add(node->parent, -node->pages, -node->limit); | |
478 | + if (!ret) { | |
479 | + node->parent = NULL; | |
480 | + list_del_init(&node->siblings); | |
481 | + } | |
482 | +unlock: | |
483 | + mutex_unlock(&mem_reserve_mutex); | |
484 | + | |
485 | + /* | |
486 | + * We cannot fail to shrink the reserves, can we? | |
487 | + */ | |
488 | + WARN_ON(ret); | |
489 | +} | |
490 | +EXPORT_SYMBOL_GPL(mem_reserve_disconnect); | |
491 | + | |
492 | +#ifdef CONFIG_PROC_FS | |
493 | + | |
494 | +/* | |
495 | + * Simple output of the reserve tree in: /proc/reserve_info | |
496 | + * Example: | |
497 | + * | |
498 | + * localhost ~ # cat /proc/reserve_info | |
499 | + * 1:0 "total reserve" 6232K 0/278581 | |
500 | + * 2:1 "total network reserve" 6232K 0/278581 | |
501 | + * 3:2 "network TX reserve" 212K 0/53 | |
502 | + * 4:3 "protocol TX pages" 212K 0/53 | |
503 | + * 5:2 "network RX reserve" 6020K 0/278528 | |
504 | + * 6:5 "IPv4 route cache" 5508K 0/16384 | |
505 | + * 7:5 "SKB data reserve" 512K 0/262144 | |
506 | + * 8:7 "IPv4 fragment cache" 512K 0/262144 | |
507 | + */ | |
508 | + | |
509 | +static void mem_reserve_show_item(struct seq_file *m, struct mem_reserve *res, | |
510 | + unsigned int parent, unsigned int *id) | |
511 | +{ | |
512 | + struct mem_reserve *child; | |
513 | + unsigned int my_id = ++*id; | |
514 | + | |
515 | + seq_printf(m, "%d:%d \"%s\" %ldK %ld/%ld\n", | |
516 | + my_id, parent, res->name, | |
517 | + res->pages << (PAGE_SHIFT - 10), | |
518 | + res->usage, res->limit); | |
519 | + | |
520 | + list_for_each_entry(child, &res->children, siblings) | |
521 | + mem_reserve_show_item(m, child, my_id, id); | |
522 | +} | |
523 | + | |
524 | +static int mem_reserve_show(struct seq_file *m, void *v) | |
525 | +{ | |
526 | + unsigned int ident = 0; | |
527 | + | |
528 | + mutex_lock(&mem_reserve_mutex); | |
529 | + mem_reserve_show_item(m, &mem_reserve_root, ident, &ident); | |
530 | + mutex_unlock(&mem_reserve_mutex); | |
531 | + | |
532 | + return 0; | |
533 | +} | |
534 | + | |
535 | +static int mem_reserve_open(struct inode *inode, struct file *file) | |
536 | +{ | |
537 | + return single_open(file, mem_reserve_show, NULL); | |
538 | +} | |
539 | + | |
540 | +static const struct file_operations mem_reserve_operations = { | |
541 | + .open = mem_reserve_open, | |
542 | + .read = seq_read, | |
543 | + .llseek = seq_lseek, | |
544 | + .release = single_release, | |
545 | +}; | |
546 | + | |
547 | +static __init int mem_reserve_proc_init(void) | |
548 | +{ | |
549 | + proc_create("reserve_info", S_IRUSR, NULL, &mem_reserve_operations); | |
550 | + return 0; | |
551 | +} | |
552 | + | |
553 | +module_init(mem_reserve_proc_init); | |
554 | + | |
555 | +#endif | |
556 | + | |
557 | +/* | |
558 | + * alloc_page helpers | |
559 | + */ | |
560 | + | |
561 | +/** | |
562 | + * mem_reserve_pages_set() - set reserves size in pages | |
563 | + * @res - reserve to set | |
564 | + * @pages - size in pages to set it to | |
565 | + * | |
566 | + * Returns -ENOMEM when it fails to set the reserve. On failure the old size | |
567 | + * is preserved. | |
568 | + */ | |
569 | +int mem_reserve_pages_set(struct mem_reserve *res, long pages) | |
570 | +{ | |
571 | + int ret; | |
572 | + | |
573 | + mutex_lock(&mem_reserve_mutex); | |
574 | + pages -= res->pages; | |
575 | + ret = __mem_reserve_add(res, pages, pages * PAGE_SIZE); | |
576 | + mutex_unlock(&mem_reserve_mutex); | |
577 | + | |
578 | + return ret; | |
579 | +} | |
580 | +EXPORT_SYMBOL_GPL(mem_reserve_pages_set); | |
581 | + | |
582 | +/** | |
583 | + * mem_reserve_pages_add() - change the size in a relative way | |
584 | + * @res - reserve to change | |
585 | + * @pages - number of pages to add (or subtract when negative) | |
586 | + * | |
587 | + * Similar to mem_reserve_pages_set, except that the argument is relative | |
588 | + * instead of absolute. | |
589 | + * | |
590 | + * Returns -ENOMEM when it fails to increase. | |
591 | + */ | |
592 | +int mem_reserve_pages_add(struct mem_reserve *res, long pages) | |
593 | +{ | |
594 | + int ret; | |
595 | + | |
596 | + mutex_lock(&mem_reserve_mutex); | |
597 | + ret = __mem_reserve_add(res, pages, pages * PAGE_SIZE); | |
598 | + mutex_unlock(&mem_reserve_mutex); | |
599 | + | |
600 | + return ret; | |
601 | +} | |
602 | + | |
603 | +/** | |
604 | + * mem_reserve_pages_charge() - charge page usage to a reserve | |
605 | + * @res - reserve to charge | |
606 | + * @pages - size to charge | |
607 | + * | |
608 | + * Returns non-zero on success. | |
609 | + */ | |
610 | +int mem_reserve_pages_charge(struct mem_reserve *res, long pages) | |
611 | +{ | |
612 | + return __mem_reserve_charge(res, pages * PAGE_SIZE); | |
613 | +} | |
614 | +EXPORT_SYMBOL_GPL(mem_reserve_pages_charge); | |
615 | + | |
616 | +/* | |
617 | + * kmalloc helpers | |
618 | + */ | |
619 | + | |
620 | +/** | |
621 | + * mem_reserve_kmalloc_set() - set this reserve to bytes worth of kmalloc | |
622 | + * @res - reserve to change | |
623 | + * @bytes - size in bytes to reserve | |
624 | + * | |
625 | + * Returns -ENOMEM on failure. | |
626 | + */ | |
627 | +int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes) | |
628 | +{ | |
629 | + int ret; | |
630 | + long pages; | |
631 | + | |
632 | + mutex_lock(&mem_reserve_mutex); | |
633 | + pages = kmalloc_estimate_bytes(GFP_ATOMIC, bytes); | |
634 | + pages -= res->pages; | |
635 | + bytes -= res->limit; | |
636 | + ret = __mem_reserve_add(res, pages, bytes); | |
637 | + mutex_unlock(&mem_reserve_mutex); | |
638 | + | |
639 | + return ret; | |
640 | +} | |
641 | +EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_set); | |
642 | + | |
643 | +/** | |
644 | + * mem_reserve_kmalloc_charge() - charge bytes to a reserve | |
645 | + * @res - reserve to charge | |
646 | + * @bytes - bytes to charge | |
647 | + * | |
648 | + * Returns non-zero on success. | |
649 | + */ | |
650 | +int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes) | |
651 | +{ | |
652 | + if (bytes < 0) | |
653 | + bytes = -roundup_pow_of_two(-bytes); | |
654 | + else | |
655 | + bytes = roundup_pow_of_two(bytes); | |
656 | + | |
657 | + return __mem_reserve_charge(res, bytes); | |
658 | +} | |
659 | +EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_charge); | |
660 | + | |
661 | +/* | |
662 | + * kmem_cache helpers | |
663 | + */ | |
664 | + | |
665 | +/** | |
666 | + * mem_reserve_kmem_cache_set() - set reserve to @objects worth of kmem_cache_alloc of @s | |
667 | + * @res - reserve to set | |
668 | + * @s - kmem_cache to reserve from | |
669 | + * @objects - number of objects to reserve | |
670 | + * | |
671 | + * Returns -ENOMEM on failure. | |
672 | + */ | |
673 | +int mem_reserve_kmem_cache_set(struct mem_reserve *res, struct kmem_cache *s, | |
674 | + int objects) | |
675 | +{ | |
676 | + int ret; | |
677 | + long pages, bytes; | |
678 | + | |
679 | + mutex_lock(&mem_reserve_mutex); | |
680 | + pages = kmem_alloc_estimate(s, GFP_ATOMIC, objects); | |
681 | + pages -= res->pages; | |
682 | + bytes = objects * kmem_cache_size(s) - res->limit; | |
683 | + ret = __mem_reserve_add(res, pages, bytes); | |
684 | + mutex_unlock(&mem_reserve_mutex); | |
685 | + | |
686 | + return ret; | |
687 | +} | |
688 | +EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_set); | |
689 | + | |
690 | +/** | |
691 | + * mem_reserve_kmem_cache_charge() - charge (or uncharge) usage of objs | |
692 | + * @res - reserve to charge | |
693 | + * @objs - objects to charge for | |
694 | + * | |
695 | + * Returns non-zero on success. | |
696 | + */ | |
697 | +int mem_reserve_kmem_cache_charge(struct mem_reserve *res, struct kmem_cache *s, | |
698 | + long objs) | |
699 | +{ | |
700 | + return __mem_reserve_charge(res, objs * kmem_cache_size(s)); | |
701 | +} | |
702 | +EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_charge); | |
703 | + | |
704 | +/* | |
705 | + * Alloc wrappers. | |
706 | + * | |
707 | + * Actual usage is commented in linux/reserve.h where the interface functions | |
708 | + * live. Furthermore, the code is 3 instances of the same paradigm, hence only | |
709 | + * the first contains extensive comments. | |
710 | + */ | |
711 | + | |
712 | +/* | |
713 | + * kmalloc/kfree | |
714 | + */ | |
715 | + | |
716 | +void *___kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip, | |
717 | + struct mem_reserve *res, int *emerg) | |
718 | +{ | |
719 | + void *obj; | |
720 | + gfp_t gfp; | |
721 | + | |
722 | + /* | |
723 | + * Try a regular allocation, when that fails and we're not entitled | |
724 | + * to the reserves, fail. | |
725 | + */ | |
726 | + gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN; | |
727 | + obj = __kmalloc_node_track_caller(size, gfp, node, ip); | |
728 | + | |
729 | + if (obj || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS)) | |
730 | + goto out; | |
731 | + | |
732 | + /* | |
733 | + * If we were given a reserve to charge against, try that. | |
734 | + */ | |
735 | + if (res && !mem_reserve_kmalloc_charge(res, size)) { | |
736 | + /* | |
737 | + * If we failed to charge and we're not allowed to wait for | |
738 | + * it to succeed, bail. | |
739 | + */ | |
740 | + if (!(flags & __GFP_WAIT)) | |
741 | + goto out; | |
742 | + | |
743 | + /* | |
744 | + * Wait for a successful charge against the reserve. All | |
745 | + * uncharge operations against this reserve will wake us up. | |
746 | + */ | |
747 | + wait_event(res->waitqueue, | |
748 | + mem_reserve_kmalloc_charge(res, size)); | |
749 | + | |
750 | + /* | |
751 | + * After waiting for it, again try a regular allocation. | |
752 | + * Pressure could have lifted during our sleep. If this | |
753 | + * succeeds, uncharge the reserve. | |
754 | + */ | |
755 | + obj = __kmalloc_node_track_caller(size, gfp, node, ip); | |
756 | + if (obj) { | |
757 | + mem_reserve_kmalloc_charge(res, -size); | |
758 | + goto out; | |
759 | + } | |
760 | + } | |
761 | + | |
762 | + /* | |
763 | + * Regular allocation failed, and we've successfully charged our | |
764 | + * requested usage against the reserve. Do the emergency allocation. | |
765 | + */ | |
766 | + obj = __kmalloc_node_track_caller(size, flags, node, ip); | |
767 | + WARN_ON(!obj); | |
768 | + if (emerg) | |
769 | + *emerg = 1; | |
770 | + | |
771 | +out: | |
772 | + return obj; | |
773 | +} | |
774 | + | |
775 | +void __kfree_reserve(void *obj, struct mem_reserve *res, int emerg) | |
776 | +{ | |
777 | + /* | |
778 | + * ksize gives the full allocated size vs the requested size we used to | |
779 | + * charge; however since we round up to the nearest power of two, this | |
780 | + * should all work nicely. | |
781 | + */ | |
782 | + size_t size = ksize(obj); | |
783 | + | |
784 | + kfree(obj); | |
785 | + /* | |
786 | + * Free before uncharge, this ensures memory is actually present when | |
787 | + * a subsequent charge succeeds. | |
788 | + */ | |
789 | + mem_reserve_kmalloc_charge(res, -size); | |
790 | +} | |
791 | + | |
792 | +/* | |
793 | + * kmem_cache_alloc/kmem_cache_free | |
794 | + */ | |
795 | + | |
796 | +void *__kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node, | |
797 | + struct mem_reserve *res, int *emerg) | |
798 | +{ | |
799 | + void *obj; | |
800 | + gfp_t gfp; | |
801 | + | |
802 | + gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN; | |
803 | + obj = kmem_cache_alloc_node(s, gfp, node); | |
804 | + | |
805 | + if (obj || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS)) | |
806 | + goto out; | |
807 | + | |
808 | + if (res && !mem_reserve_kmem_cache_charge(res, s, 1)) { | |
809 | + if (!(flags & __GFP_WAIT)) | |
810 | + goto out; | |
811 | + | |
812 | + wait_event(res->waitqueue, | |
813 | + mem_reserve_kmem_cache_charge(res, s, 1)); | |
814 | + | |
815 | + obj = kmem_cache_alloc_node(s, gfp, node); | |
816 | + if (obj) { | |
817 | + mem_reserve_kmem_cache_charge(res, s, -1); | |
818 | + goto out; | |
819 | + } | |
820 | + } | |
821 | + | |
822 | + obj = kmem_cache_alloc_node(s, flags, node); | |
823 | + WARN_ON(!obj); | |
824 | + if (emerg) | |
825 | + *emerg = 1; | |
826 | + | |
827 | +out: | |
828 | + return obj; | |
829 | +} | |
830 | + | |
831 | +void __kmem_cache_free_reserve(struct kmem_cache *s, void *obj, | |
832 | + struct mem_reserve *res, int emerg) | |
833 | +{ | |
834 | + kmem_cache_free(s, obj); | |
835 | + mem_reserve_kmem_cache_charge(res, s, -1); | |
836 | +} | |
837 | + | |
838 | +/* | |
839 | + * alloc_pages/free_pages | |
840 | + */ | |
841 | + | |
842 | +struct page *__alloc_pages_reserve(int node, gfp_t flags, int order, | |
843 | + struct mem_reserve *res, int *emerg) | |
844 | +{ | |
845 | + struct page *page; | |
846 | + gfp_t gfp; | |
847 | + long pages = 1 << order; | |
848 | + | |
849 | + gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN; | |
850 | + page = alloc_pages_node(node, gfp, order); | |
851 | + | |
852 | + if (page || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS)) | |
853 | + goto out; | |
854 | + | |
855 | + if (res && !mem_reserve_pages_charge(res, pages)) { | |
856 | + if (!(flags & __GFP_WAIT)) | |
857 | + goto out; | |
858 | + | |
859 | + wait_event(res->waitqueue, | |
860 | + mem_reserve_pages_charge(res, pages)); | |
861 | + | |
862 | + page = alloc_pages_node(node, gfp, order); | |
863 | + if (page) { | |
864 | + mem_reserve_pages_charge(res, -pages); | |
865 | + goto out; | |
866 | + } | |
867 | + } | |
868 | + | |
869 | + page = alloc_pages_node(node, flags, order); | |
870 | + WARN_ON(!page); | |
871 | + if (emerg) | |
872 | + *emerg = 1; | |
873 | + | |
874 | +out: | |
875 | + return page; | |
876 | +} | |
877 | + | |
878 | +void __free_pages_reserve(struct page *page, int order, | |
879 | + struct mem_reserve *res, int emerg) | |
880 | +{ | |
881 | + __free_pages(page, order); | |
882 | + mem_reserve_pages_charge(res, -(1 << order)); | |
883 | +} | |
884 | Index: linux-2.6.27/include/linux/slab.h | |
885 | =================================================================== | |
886 | --- linux-2.6.27.orig/include/linux/slab.h | |
887 | +++ linux-2.6.27/include/linux/slab.h | |
888 | @@ -230,13 +230,14 @@ static inline void *kmem_cache_alloc_nod | |
889 | */ | |
890 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) | |
891 | extern void *__kmalloc_track_caller(size_t, gfp_t, void*); | |
892 | -#define kmalloc_track_caller(size, flags) \ | |
893 | - __kmalloc_track_caller(size, flags, __builtin_return_address(0)) | |
894 | #else | |
895 | -#define kmalloc_track_caller(size, flags) \ | |
896 | +#define __kmalloc_track_caller(size, flags, ip) \ | |
897 | __kmalloc(size, flags) | |
898 | #endif /* DEBUG_SLAB */ | |
899 | ||
900 | +#define kmalloc_track_caller(size, flags) \ | |
901 | + __kmalloc_track_caller(size, flags, __builtin_return_address(0)) | |
902 | + | |
903 | #ifdef CONFIG_NUMA | |
904 | /* | |
905 | * kmalloc_node_track_caller is a special version of kmalloc_node that | |
906 | @@ -248,21 +249,22 @@ extern void *__kmalloc_track_caller(size | |
907 | */ | |
908 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) | |
909 | extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); | |
910 | -#define kmalloc_node_track_caller(size, flags, node) \ | |
911 | - __kmalloc_node_track_caller(size, flags, node, \ | |
912 | - __builtin_return_address(0)) | |
913 | #else | |
914 | -#define kmalloc_node_track_caller(size, flags, node) \ | |
915 | +#define __kmalloc_node_track_caller(size, flags, node, ip) \ | |
916 | __kmalloc_node(size, flags, node) | |
917 | #endif | |
918 | ||
919 | #else /* CONFIG_NUMA */ | |
920 | ||
921 | -#define kmalloc_node_track_caller(size, flags, node) \ | |
922 | - kmalloc_track_caller(size, flags) | |
923 | +#define __kmalloc_node_track_caller(size, flags, node, ip) \ | |
924 | + __kmalloc_track_caller(size, flags, ip) | |
925 | ||
926 | #endif /* DEBUG_SLAB */ | |
927 | ||
928 | +#define kmalloc_node_track_caller(size, flags, node) \ | |
929 | + __kmalloc_node_track_caller(size, flags, node, \ | |
930 | + __builtin_return_address(0)) | |
931 | + | |
932 | /* | |
933 | * Shortcuts | |
934 | */ | |
935 | Index: linux-2.6.27/mm/slub.c | |
936 | =================================================================== | |
937 | --- linux-2.6.27.orig/mm/slub.c | |
938 | +++ linux-2.6.27/mm/slub.c | |
939 | @@ -2726,6 +2726,7 @@ void *__kmalloc(size_t size, gfp_t flags | |
940 | } | |
941 | EXPORT_SYMBOL(__kmalloc); | |
942 | ||
943 | +#ifdef CONFIG_NUMA | |
944 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |
945 | { | |
946 | struct page *page = alloc_pages_node(node, flags | __GFP_COMP, | |
947 | @@ -2737,7 +2738,6 @@ static void *kmalloc_large_node(size_t s | |
948 | return NULL; | |
949 | } | |
950 | ||
951 | -#ifdef CONFIG_NUMA | |
952 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | |
953 | { | |
954 | struct kmem_cache *s; |