1 From: Peter Zijlstra <a.p.zijlstra@chello.nl>
2 Subject: mm: memory reserve management
3 Patch-mainline: No
4 References: FATE#303834
5
6 Generic reserve management code.
7
8 It provides methods to reserve and charge. On top of this, generic alloc/free
9 style reserve pools can be built, which could fully replace mempool_t
10 functionality.
11
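For illustration only (this sketch is not part of the patch; my_reserve,
my_reserve_setup and the budget helpers are made-up names): a user of this
code would create a reserve node, connect it under mem_reserve_root, size it,
and then charge/uncharge usage against it, roughly like so:

    #include <linux/init.h>
    #include <linux/reserve.h>

    static struct mem_reserve my_reserve;   /* hypothetical subsystem reserve */

    static int __init my_reserve_setup(void)
    {
            int err;

            /* Create a named node and hook it below the global root. */
            mem_reserve_init(&my_reserve, "my subsystem reserve", NULL);
            err = mem_reserve_connect(&my_reserve, &mem_reserve_root);
            if (err)
                    return err;

            /* Back the reserve with 16 pages; on failure the old size is kept. */
            err = mem_reserve_pages_set(&my_reserve, 16);
            if (err)
                    mem_reserve_disconnect(&my_reserve);

            return err;
    }

    /* Direct charging; a negative value uncharges and wakes any waiters. */
    static int my_take_page_budget(void)
    {
            return mem_reserve_pages_charge(&my_reserve, 1);
    }

    static void my_give_page_budget(void)
    {
            mem_reserve_pages_charge(&my_reserve, -1);
    }
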
12 It should also allow for a Banker's algorithm replacement of __GFP_NOFAIL.
13
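Continuing the made-up sketch, the alloc/free style wrappers give the
mempool_t-like behaviour: the fast path is a regular allocation, and the
reserve is only charged when that fails and the context is entitled to
emergency memory. (For kmalloc usage the reserve would be sized with
mem_reserve_kmalloc_set() rather than mem_reserve_pages_set(); my_do_work()
is again an invented name.)

    static int my_do_work(size_t size, gfp_t gfp)
    {
            int emerg;
            void *buf;

            /*
             * Falls back to ___kmalloc_reserve() and charges my_reserve only
             * when the regular attempt fails; *emerg records whether this was
             * an emergency allocation.
             */
            buf = kmalloc_reserve(size, gfp, -1, &my_reserve, &emerg);
            if (!buf)
                    return -ENOMEM;

            /* ... use buf ... */

            /* Hand emerg back so an emergency allocation uncharges the reserve. */
            kfree_reserve(buf, &my_reserve, emerg);
            return 0;
    }
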
14 Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
15 Acked-by: Neil Brown <neilb@suse.de>
16 Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
17
18 ---
19 include/linux/reserve.h | 198 ++++++++++++++
20 include/linux/slab.h | 20 -
21 mm/Makefile | 2
22 mm/reserve.c | 637 ++++++++++++++++++++++++++++++++++++++++++++++++
23 mm/slub.c | 2
24 5 files changed, 848 insertions(+), 11 deletions(-)
25
26 --- /dev/null
27 +++ b/include/linux/reserve.h
28 @@ -0,0 +1,198 @@
29 +/*
30 + * Memory reserve management.
31 + *
32 + * Copyright (C) 2007-2008 Red Hat, Inc.,
33 + * Peter Zijlstra <pzijlstr@redhat.com>
34 + *
35 + * This file contains the public data structure and API definitions.
36 + */
37 +
38 +#ifndef _LINUX_RESERVE_H
39 +#define _LINUX_RESERVE_H
40 +
41 +#include <linux/list.h>
42 +#include <linux/spinlock.h>
43 +#include <linux/wait.h>
44 +#include <linux/slab.h>
45 +
46 +struct mem_reserve {
47 + struct mem_reserve *parent;
48 + struct list_head children;
49 + struct list_head siblings;
50 +
51 + const char *name;
52 +
53 + long pages;
54 + long limit;
55 + long usage;
56 + spinlock_t lock; /* protects limit and usage */
57 +
58 + wait_queue_head_t waitqueue;
59 +};
60 +
61 +extern struct mem_reserve mem_reserve_root;
62 +
63 +void mem_reserve_init(struct mem_reserve *res, const char *name,
64 + struct mem_reserve *parent);
65 +int mem_reserve_connect(struct mem_reserve *new_child,
66 + struct mem_reserve *node);
67 +void mem_reserve_disconnect(struct mem_reserve *node);
68 +
69 +int mem_reserve_pages_set(struct mem_reserve *res, long pages);
70 +int mem_reserve_pages_add(struct mem_reserve *res, long pages);
71 +int mem_reserve_pages_charge(struct mem_reserve *res, long pages);
72 +
73 +int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes);
74 +int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes);
75 +
76 +struct kmem_cache;
77 +
78 +int mem_reserve_kmem_cache_set(struct mem_reserve *res,
79 + struct kmem_cache *s,
80 + int objects);
81 +int mem_reserve_kmem_cache_charge(struct mem_reserve *res,
82 + struct kmem_cache *s, long objs);
83 +
84 +void *___kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
85 + struct mem_reserve *res, int *emerg);
86 +
87 +static inline
88 +void *__kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
89 + struct mem_reserve *res, int *emerg)
90 +{
91 + void *obj;
92 +
93 + obj = __kmalloc_node_track_caller(size,
94 + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node, ip);
95 + if (!obj)
96 + obj = ___kmalloc_reserve(size, flags, node, ip, res, emerg);
97 +
98 + return obj;
99 +}
100 +
101 +/**
102 + * kmalloc_reserve() - kmalloc() and charge against @res for @emerg allocations
103 + * @size - size of the requested memory region
104 + * @gfp - allocation flags to use for this allocation
105 + * @node - preferred memory node for this allocation
106 + * @res - reserve to charge emergency allocations against
107 + * @emerg - bit 0 is set when the allocation was an emergency allocation
108 + *
109 + * Returns NULL on failure
110 + */
111 +#define kmalloc_reserve(size, gfp, node, res, emerg) \
112 + __kmalloc_reserve(size, gfp, node, \
113 + __builtin_return_address(0), res, emerg)
114 +
115 +void __kfree_reserve(void *obj, struct mem_reserve *res, int emerg);
116 +
117 +/**
118 + * kfree_reserve() - kfree() and uncharge against @res for @emerg allocations
119 + * @obj - memory to free
120 + * @res - reserve to uncharge emergency allocations from
121 + * @emerg - was this an emergency allocation
122 + */
123 +static inline
124 +void kfree_reserve(void *obj, struct mem_reserve *res, int emerg)
125 +{
126 + if (unlikely(obj && res && emerg))
127 + __kfree_reserve(obj, res, emerg);
128 + else
129 + kfree(obj);
130 +}
131 +
132 +void *__kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
133 + struct mem_reserve *res, int *emerg);
134 +
135 +/**
136 + * kmem_cache_alloc_reserve() - kmem_cache_alloc() and charge against @res
137 + * @s - kmem_cache to allocate from
138 + * @gfp - allocation flags to use for this allocation
139 + * @node - preferred memory node for this allocation
140 + * @res - reserve to charge emergency allocations against
141 + * @emerg - bit 0 is set when the allocation was an emergency allocation
142 + *
143 + * Returns NULL on failure
144 + */
145 +static inline
146 +void *kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
147 + struct mem_reserve *res, int *emerg)
148 +{
149 + void *obj;
150 +
151 + obj = kmem_cache_alloc_node(s,
152 + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
153 + if (!obj)
154 + obj = __kmem_cache_alloc_reserve(s, flags, node, res, emerg);
155 +
156 + return obj;
157 +}
158 +
159 +void __kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
160 + struct mem_reserve *res, int emerg);
161 +
162 +/**
163 + * kmem_cache_free_reserve() - kmem_cache_free() and uncharge against @res
164 + * @s - kmem_cache to free to
165 + * @obj - memory to free
166 + * @res - reserve to uncharge emergency allocations from
167 + * @emerg - was this an emergency allocation
168 + */
169 +static inline
170 +void kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
171 + struct mem_reserve *res, int emerg)
172 +{
173 + if (unlikely(obj && res && emerg))
174 + __kmem_cache_free_reserve(s, obj, res, emerg);
175 + else
176 + kmem_cache_free(s, obj);
177 +}
178 +
179 +struct page *__alloc_pages_reserve(int node, gfp_t flags, int order,
180 + struct mem_reserve *res, int *emerg);
181 +
182 +/**
183 + * alloc_pages_reserve() - alloc_pages() and charge against @res
184 + * @node - preferred memory node for this allocation
185 + * @gfp - allocation flags to use for this allocation
186 + * @order - page order
187 + * @res - reserve to charge emergency allocations against
188 + * @emerg - bit 0 is set when the allocation was an emergency allocation
189 + *
190 + * Returns NULL on failure
191 + */
192 +static inline
193 +struct page *alloc_pages_reserve(int node, gfp_t flags, int order,
194 + struct mem_reserve *res, int *emerg)
195 +{
196 + struct page *page;
197 +
198 + page = alloc_pages_node(node,
199 + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, order);
200 + if (!page)
201 + page = __alloc_pages_reserve(node, flags, order, res, emerg);
202 +
203 + return page;
204 +}
205 +
206 +void __free_pages_reserve(struct page *page, int order,
207 + struct mem_reserve *res, int emerg);
208 +
209 +/**
210 + * free_pages_reserve() - __free_pages() and uncharge against @res
211 + * @page - page to free
212 + * @order - page order
213 + * @res - reserve to uncharge emergency allocations from
214 + * @emerg - was this an emergency allocation
215 + */
216 +static inline
217 +void free_pages_reserve(struct page *page, int order,
218 + struct mem_reserve *res, int emerg)
219 +{
220 + if (unlikely(page && res && emerg))
221 + __free_pages_reserve(page, order, res, emerg);
222 + else
223 + __free_pages(page, order);
224 +}
225 +
226 +#endif /* _LINUX_RESERVE_H */
227 --- a/include/linux/slab.h
228 +++ b/include/linux/slab.h
229 @@ -230,13 +230,14 @@ static inline void *kmem_cache_alloc_nod
230 */
231 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
232 extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
233 -#define kmalloc_track_caller(size, flags) \
234 - __kmalloc_track_caller(size, flags, __builtin_return_address(0))
235 #else
236 -#define kmalloc_track_caller(size, flags) \
237 +#define __kmalloc_track_caller(size, flags, ip) \
238 __kmalloc(size, flags)
239 #endif /* DEBUG_SLAB */
240
241 +#define kmalloc_track_caller(size, flags) \
242 + __kmalloc_track_caller(size, flags, __builtin_return_address(0))
243 +
244 #ifdef CONFIG_NUMA
245 /*
246 * kmalloc_node_track_caller is a special version of kmalloc_node that
247 @@ -248,21 +249,22 @@ extern void *__kmalloc_track_caller(size
248 */
249 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
250 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
251 -#define kmalloc_node_track_caller(size, flags, node) \
252 - __kmalloc_node_track_caller(size, flags, node, \
253 - __builtin_return_address(0))
254 #else
255 -#define kmalloc_node_track_caller(size, flags, node) \
256 +#define __kmalloc_node_track_caller(size, flags, node, ip) \
257 __kmalloc_node(size, flags, node)
258 #endif
259
260 #else /* CONFIG_NUMA */
261
262 -#define kmalloc_node_track_caller(size, flags, node) \
263 - kmalloc_track_caller(size, flags)
264 +#define __kmalloc_node_track_caller(size, flags, node, ip) \
265 + __kmalloc_track_caller(size, flags, ip)
266
267 #endif /* DEBUG_SLAB */
268
269 +#define kmalloc_node_track_caller(size, flags, node) \
270 + __kmalloc_node_track_caller(size, flags, node, \
271 + __builtin_return_address(0))
272 +
273 /*
274 * Shortcuts
275 */
276 --- a/mm/Makefile
277 +++ b/mm/Makefile
278 @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o
279 maccess.o page_alloc.o page-writeback.o pdflush.o \
280 readahead.o swap.o truncate.o vmscan.o \
281 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
282 - page_isolation.o mm_init.o $(mmu-y)
283 + page_isolation.o mm_init.o reserve.o $(mmu-y)
284
285 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
286 obj-$(CONFIG_BOUNCE) += bounce.o
287 --- /dev/null
288 +++ b/mm/reserve.c
289 @@ -0,0 +1,637 @@
290 +/*
291 + * Memory reserve management.
292 + *
293 + * Copyright (C) 2007-2008, Red Hat, Inc.,
294 + * Peter Zijlstra <pzijlstr@redhat.com>
295 + *
296 + * Description:
297 + *
298 + * Manage a set of memory reserves.
299 + *
300 + * A memory reserve is a reserve for a specified number of objects of a
301 + * specified size. Since memory is managed in pages, this reserve demand is
302 + * then translated into a number of pages.
303 + *
304 + * So each reserve has a specified object limit, an object usage count and a
305 + * number of pages required to back these objects.
306 + *
307 + * Usage is charged against a reserve; if the charge fails, the resource must
308 + * not be allocated/used.
309 + *
310 + * The reserves are managed in a tree, and the resource demands (pages and
311 + * limit) are propagated up the tree. Obviously the object limit becomes
312 + * meaningless as soon as units start to mix, but the required page reserve
313 + * (pages being the one common unit) is still valid at the root.
314 + *
315 + * It is the page demand of the root node that is used to set the global
316 + * reserve (adjust_memalloc_reserve() which sets zone->pages_emerg).
317 + *
318 + * As long as a subtree uses the same usage unit, an aggregate node can be
319 + * charged against instead of the leaf nodes. However, be consistent about
320 + * which node is charged, since resource usage is not propagated up the tree
321 + * (for performance reasons).
322 + */
323 +
324 +#include <linux/reserve.h>
325 +#include <linux/mutex.h>
326 +#include <linux/mmzone.h>
327 +#include <linux/log2.h>
328 +#include <linux/proc_fs.h>
329 +#include <linux/seq_file.h>
330 +#include <linux/module.h>
331 +#include <linux/slab.h>
332 +#include <linux/sched.h>
333 +#include "internal.h"
334 +
335 +static DEFINE_MUTEX(mem_reserve_mutex);
336 +
337 +/**
338 + * @mem_reserve_root - the global reserve root
339 + *
340 + * The global reserve is empty and has no limit or unit of its own; it
341 + * merely acts as an aggregation point for reserves and an interface to
342 + * adjust_memalloc_reserve().
343 + */
344 +struct mem_reserve mem_reserve_root = {
345 + .children = LIST_HEAD_INIT(mem_reserve_root.children),
346 + .siblings = LIST_HEAD_INIT(mem_reserve_root.siblings),
347 + .name = "total reserve",
348 + .lock = __SPIN_LOCK_UNLOCKED(mem_reserve_root.lock),
349 + .waitqueue = __WAIT_QUEUE_HEAD_INITIALIZER(mem_reserve_root.waitqueue),
350 +};
351 +EXPORT_SYMBOL_GPL(mem_reserve_root);
352 +
353 +/**
354 + * mem_reserve_init() - initialize a memory reserve object
355 + * @res - the new reserve object
356 + * @name - a name for this reserve
357 + * @parent - when non-NULL, the parent to connect to.
358 + */
359 +void mem_reserve_init(struct mem_reserve *res, const char *name,
360 + struct mem_reserve *parent)
361 +{
362 + memset(res, 0, sizeof(*res));
363 + INIT_LIST_HEAD(&res->children);
364 + INIT_LIST_HEAD(&res->siblings);
365 + res->name = name;
366 + spin_lock_init(&res->lock);
367 + init_waitqueue_head(&res->waitqueue);
368 +
369 + if (parent)
370 + mem_reserve_connect(res, parent);
371 +}
372 +EXPORT_SYMBOL_GPL(mem_reserve_init);
373 +
374 +/*
375 + * propagate the pages and limit changes up the (sub)tree.
376 + */
377 +static void __calc_reserve(struct mem_reserve *res, long pages, long limit)
378 +{
379 + unsigned long flags;
380 +
381 + for ( ; res; res = res->parent) {
382 + res->pages += pages;
383 +
384 + if (limit) {
385 + spin_lock_irqsave(&res->lock, flags);
386 + res->limit += limit;
387 + spin_unlock_irqrestore(&res->lock, flags);
388 + }
389 + }
390 +}
391 +
392 +/**
393 + * __mem_reserve_add() - primitive to change the size of a reserve
394 + * @res - reserve to change
395 + * @pages - page delta
396 + * @limit - usage limit delta
397 + *
398 + * Returns -ENOMEM when a size increase is not currently possible.
399 + */
400 +static int __mem_reserve_add(struct mem_reserve *res, long pages, long limit)
401 +{
402 + int ret = 0;
403 + long reserve;
404 +
405 + /*
406 + * This looks more complex than it needs to be because we handle
407 + * the case where @res isn't actually connected to mem_reserve_root.
408 + *
409 + * So, by propagating the new pages up the (sub)tree and computing
410 + * the difference in mem_reserve_root.pages we find if this action
411 + * affects the actual reserve.
412 + *
413 + * The (partial) propagation also means that mem_reserve_connect()
414 + * need only look at the direct child, since each disconnected
415 + * sub-tree is fully up-to-date.
416 + */
417 + reserve = mem_reserve_root.pages;
418 + __calc_reserve(res, pages, 0);
419 + reserve = mem_reserve_root.pages - reserve;
420 +
421 + if (reserve) {
422 + ret = adjust_memalloc_reserve(reserve);
423 + if (ret)
424 + __calc_reserve(res, -pages, 0);
425 + }
426 +
427 + /*
428 + * Delay updating the limits until we've acquired the resources to
429 + * back it.
430 + */
431 + if (!ret)
432 + __calc_reserve(res, 0, limit);
433 +
434 + return ret;
435 +}
436 +
437 +/**
438 + * __mem_reserve_charge() - primitive to charge object usage of a reserve
439 + * @res - reserve to charge
440 + * @charge - size of the charge
441 + *
442 + * Returns non-zero on success, zero on failure.
443 + */
444 +static
445 +int __mem_reserve_charge(struct mem_reserve *res, long charge)
446 +{
447 + unsigned long flags;
448 + int ret = 0;
449 +
450 + spin_lock_irqsave(&res->lock, flags);
451 + if (charge < 0 || res->usage + charge < res->limit) {
452 + res->usage += charge;
453 + if (unlikely(res->usage < 0))
454 + res->usage = 0;
455 + ret = 1;
456 + }
457 + if (charge < 0)
458 + wake_up_all(&res->waitqueue);
459 + spin_unlock_irqrestore(&res->lock, flags);
460 +
461 + return ret;
462 +}
463 +
464 +/**
465 + * mem_reserve_connect() - connect a reserve to another in a child-parent relation
466 + * @new_child - the reserve node to connect (child)
467 + * @node - the reserve node to connect to (parent)
468 + *
469 + * Connecting a node results in an increase of the reserve by the number of
470 + * pages in @new_child->pages if @node has a connection to mem_reserve_root.
471 + *
472 + * Returns -ENOMEM when the new connection would increase the reserve (parent
473 + * is connected to mem_reserve_root) and there is no memory to do so.
474 + *
475 + * On error, the child is _NOT_ connected.
476 + */
477 +int mem_reserve_connect(struct mem_reserve *new_child, struct mem_reserve *node)
478 +{
479 + int ret;
480 +
481 + WARN_ON(!new_child->name);
482 +
483 + mutex_lock(&mem_reserve_mutex);
484 + if (new_child->parent) {
485 + ret = -EEXIST;
486 + goto unlock;
487 + }
488 + new_child->parent = node;
489 + list_add(&new_child->siblings, &node->children);
490 + ret = __mem_reserve_add(node, new_child->pages, new_child->limit);
491 + if (ret) {
492 + new_child->parent = NULL;
493 + list_del_init(&new_child->siblings);
494 + }
495 +unlock:
496 + mutex_unlock(&mem_reserve_mutex);
497 +
498 + return ret;
499 +}
500 +EXPORT_SYMBOL_GPL(mem_reserve_connect);
501 +
502 +/**
503 + * mem_reserve_disconnect() - sever a node's connection to the reserve tree
504 + * @node - the node to disconnect
505 + *
506 + * Disconnecting a node results in a reduction of the reserve by @node->pages
507 + * if @node had a connection to mem_reserve_root.
508 + */
509 +void mem_reserve_disconnect(struct mem_reserve *node)
510 +{
511 + int ret;
512 +
513 + BUG_ON(!node->parent);
514 +
515 + mutex_lock(&mem_reserve_mutex);
516 + if (!node->parent) {
517 + ret = -ENOENT;
518 + goto unlock;
519 + }
520 + ret = __mem_reserve_add(node->parent, -node->pages, -node->limit);
521 + if (!ret) {
522 + node->parent = NULL;
523 + list_del_init(&node->siblings);
524 + }
525 +unlock:
526 + mutex_unlock(&mem_reserve_mutex);
527 +
528 + /*
529 + * We cannot fail to shrink the reserves, can we?
530 + */
531 + WARN_ON(ret);
532 +}
533 +EXPORT_SYMBOL_GPL(mem_reserve_disconnect);
534 +
535 +#ifdef CONFIG_PROC_FS
536 +
537 +/*
538 + * Simple output of the reserve tree in: /proc/reserve_info
539 + * Example:
540 + *
541 + * localhost ~ # cat /proc/reserve_info
542 + * 1:0 "total reserve" 6232K 0/278581
543 + * 2:1 "total network reserve" 6232K 0/278581
544 + * 3:2 "network TX reserve" 212K 0/53
545 + * 4:3 "protocol TX pages" 212K 0/53
546 + * 5:2 "network RX reserve" 6020K 0/278528
547 + * 6:5 "IPv4 route cache" 5508K 0/16384
548 + * 7:5 "SKB data reserve" 512K 0/262144
549 + * 8:7 "IPv4 fragment cache" 512K 0/262144
550 + */
551 +
552 +static void mem_reserve_show_item(struct seq_file *m, struct mem_reserve *res,
553 + unsigned int parent, unsigned int *id)
554 +{
555 + struct mem_reserve *child;
556 + unsigned int my_id = ++*id;
557 +
558 + seq_printf(m, "%d:%d \"%s\" %ldK %ld/%ld\n",
559 + my_id, parent, res->name,
560 + res->pages << (PAGE_SHIFT - 10),
561 + res->usage, res->limit);
562 +
563 + list_for_each_entry(child, &res->children, siblings)
564 + mem_reserve_show_item(m, child, my_id, id);
565 +}
566 +
567 +static int mem_reserve_show(struct seq_file *m, void *v)
568 +{
569 + unsigned int ident = 0;
570 +
571 + mutex_lock(&mem_reserve_mutex);
572 + mem_reserve_show_item(m, &mem_reserve_root, ident, &ident);
573 + mutex_unlock(&mem_reserve_mutex);
574 +
575 + return 0;
576 +}
577 +
578 +static int mem_reserve_open(struct inode *inode, struct file *file)
579 +{
580 + return single_open(file, mem_reserve_show, NULL);
581 +}
582 +
583 +static const struct file_operations mem_reserve_opterations = {
584 + .open = mem_reserve_open,
585 + .read = seq_read,
586 + .llseek = seq_lseek,
587 + .release = single_release,
588 +};
589 +
590 +static __init int mem_reserve_proc_init(void)
591 +{
592 + proc_create("reserve_info", S_IRUSR, NULL, &mem_reserve_opterations);
593 + return 0;
594 +}
595 +
596 +module_init(mem_reserve_proc_init);
597 +
598 +#endif
599 +
600 +/*
601 + * alloc_page helpers
602 + */
603 +
604 +/**
605 + * mem_reserve_pages_set() - set a reserve's size in pages
606 + * @res - reserve to set
607 + * @pages - size in pages to set it to
608 + *
609 + * Returns -ENOMEM when it fails to set the reserve. On failure the old size
610 + * is preserved.
611 + */
612 +int mem_reserve_pages_set(struct mem_reserve *res, long pages)
613 +{
614 + int ret;
615 +
616 + mutex_lock(&mem_reserve_mutex);
617 + pages -= res->pages;
618 + ret = __mem_reserve_add(res, pages, pages * PAGE_SIZE);
619 + mutex_unlock(&mem_reserve_mutex);
620 +
621 + return ret;
622 +}
623 +EXPORT_SYMBOL_GPL(mem_reserve_pages_set);
624 +
625 +/**
626 + * mem_reserve_pages_add() - change the size in a relative way
627 + * @res - reserve to change
628 + * @pages - number of pages to add (or subtract when negative)
629 + *
630 + * Similar to mem_reserve_pages_set, except that the argument is relative
631 + * instead of absolute.
632 + *
633 + * Returns -ENOMEM when it fails to increase.
634 + */
635 +int mem_reserve_pages_add(struct mem_reserve *res, long pages)
636 +{
637 + int ret;
638 +
639 + mutex_lock(&mem_reserve_mutex);
640 + ret = __mem_reserve_add(res, pages, pages * PAGE_SIZE);
641 + mutex_unlock(&mem_reserve_mutex);
642 +
643 + return ret;
644 +}
645 +
646 +/**
647 + * mem_reserve_pages_charge() - charge page usage to a reserve
648 + * @res - reserve to charge
649 + * @pages - size to charge
650 + *
651 + * Returns non-zero on success.
652 + */
653 +int mem_reserve_pages_charge(struct mem_reserve *res, long pages)
654 +{
655 + return __mem_reserve_charge(res, pages * PAGE_SIZE);
656 +}
657 +EXPORT_SYMBOL_GPL(mem_reserve_pages_charge);
658 +
659 +/*
660 + * kmalloc helpers
661 + */
662 +
663 +/**
664 + * mem_reserve_kmalloc_set() - set this reserve to @bytes worth of kmalloc
665 + * @res - reserve to change
666 + * @bytes - size in bytes to reserve
667 + *
668 + * Returns -ENOMEM on failure.
669 + */
670 +int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes)
671 +{
672 + int ret;
673 + long pages;
674 +
675 + mutex_lock(&mem_reserve_mutex);
676 + pages = kmalloc_estimate_bytes(GFP_ATOMIC, bytes);
677 + pages -= res->pages;
678 + bytes -= res->limit;
679 + ret = __mem_reserve_add(res, pages, bytes);
680 + mutex_unlock(&mem_reserve_mutex);
681 +
682 + return ret;
683 +}
684 +EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_set);
685 +
686 +/**
687 + * mem_reserve_kmalloc_charge() - charge bytes to a reserve
688 + * @res - reserve to charge
689 + * @bytes - bytes to charge
690 + *
691 + * Returns non-zero on success.
692 + */
693 +int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes)
694 +{
695 + if (bytes < 0)
696 + bytes = -roundup_pow_of_two(-bytes);
697 + else
698 + bytes = roundup_pow_of_two(bytes);
699 +
700 + return __mem_reserve_charge(res, bytes);
701 +}
702 +EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_charge);
703 +
704 +/*
705 + * kmem_cache helpers
706 + */
707 +
708 +/**
709 + * mem_reserve_kmem_cache_set() - set reserve to @objects worth of kmem_cache_alloc of @s
710 + * @res - reserve to set
711 + * @s - kmem_cache to reserve from
712 + * @objects - number of objects to reserve
713 + *
714 + * Returns -ENOMEM on failure.
715 + */
716 +int mem_reserve_kmem_cache_set(struct mem_reserve *res, struct kmem_cache *s,
717 + int objects)
718 +{
719 + int ret;
720 + long pages, bytes;
721 +
722 + mutex_lock(&mem_reserve_mutex);
723 + pages = kmem_alloc_estimate(s, GFP_ATOMIC, objects);
724 + pages -= res->pages;
725 + bytes = objects * kmem_cache_size(s) - res->limit;
726 + ret = __mem_reserve_add(res, pages, bytes);
727 + mutex_unlock(&mem_reserve_mutex);
728 +
729 + return ret;
730 +}
731 +EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_set);
732 +
733 +/**
734 + * mem_reserve_kmem_cache_charge() - charge (or uncharge) usage of objs
735 + * @res - reserve to charge
736 + * @objs - objects to charge for
737 + *
738 + * Returns non-zero on success.
739 + */
740 +int mem_reserve_kmem_cache_charge(struct mem_reserve *res, struct kmem_cache *s,
741 + long objs)
742 +{
743 + return __mem_reserve_charge(res, objs * kmem_cache_size(s));
744 +}
745 +EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_charge);
746 +
747 +/*
748 + * Alloc wrappers.
749 + *
750 + * Actual usage is commented in linux/reserve.h where the interface functions
751 + * live. Furthermore, the code is three instances of the same pattern, hence
752 + * only the first instance contains extensive comments.
753 + */
754 +
755 +/*
756 + * kmalloc/kfree
757 + */
758 +
759 +void *___kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
760 + struct mem_reserve *res, int *emerg)
761 +{
762 + void *obj;
763 + gfp_t gfp;
764 +
765 + /*
766 + * Try a regular allocation; when that fails and we're not entitled
767 + * to the reserves, fail.
768 + */
769 + gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
770 + obj = __kmalloc_node_track_caller(size, gfp, node, ip);
771 +
772 + if (obj || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
773 + goto out;
774 +
775 + /*
776 + * If we were given a reserve to charge against, try that.
777 + */
778 + if (res && !mem_reserve_kmalloc_charge(res, size)) {
779 + /*
780 + * If we failed to charge and we're not allowed to wait for
781 + * it to succeed, bail.
782 + */
783 + if (!(flags & __GFP_WAIT))
784 + goto out;
785 +
786 + /*
787 + * Wait for a successful charge against the reserve. All
788 + * uncharge operations against this reserve will wake us up.
789 + */
790 + wait_event(res->waitqueue,
791 + mem_reserve_kmalloc_charge(res, size));
792 +
793 + /*
794 + * After waiting for it, again try a regular allocation.
795 + * Pressure could have lifted during our sleep. If this
796 + * succeeds, uncharge the reserve.
797 + */
798 + obj = __kmalloc_node_track_caller(size, gfp, node, ip);
799 + if (obj) {
800 + mem_reserve_kmalloc_charge(res, -size);
801 + goto out;
802 + }
803 + }
804 +
805 + /*
806 + * Regular allocation failed, and we've successfully charged our
807 + * requested usage against the reserve. Do the emergency allocation.
808 + */
809 + obj = __kmalloc_node_track_caller(size, flags, node, ip);
810 + WARN_ON(!obj);
811 + if (emerg)
812 + *emerg = 1;
813 +
814 +out:
815 + return obj;
816 +}
817 +
818 +void __kfree_reserve(void *obj, struct mem_reserve *res, int emerg)
819 +{
820 + /*
821 + * ksize gives the full allocated size vs the requested size we used to
822 + * charge; however since we round up to the nearest power of two, this
823 + * should all work nicely.
824 + */
825 + size_t size = ksize(obj);
826 +
827 + kfree(obj);
828 + /*
829 + * Free before uncharging; this ensures memory is actually present when
830 + * a subsequent charge succeeds.
831 + */
832 + mem_reserve_kmalloc_charge(res, -size);
833 +}
834 +
835 +/*
836 + * kmem_cache_alloc/kmem_cache_free
837 + */
838 +
839 +void *__kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
840 + struct mem_reserve *res, int *emerg)
841 +{
842 + void *obj;
843 + gfp_t gfp;
844 +
845 + gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
846 + obj = kmem_cache_alloc_node(s, gfp, node);
847 +
848 + if (obj || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
849 + goto out;
850 +
851 + if (res && !mem_reserve_kmem_cache_charge(res, s, 1)) {
852 + if (!(flags & __GFP_WAIT))
853 + goto out;
854 +
855 + wait_event(res->waitqueue,
856 + mem_reserve_kmem_cache_charge(res, s, 1));
857 +
858 + obj = kmem_cache_alloc_node(s, gfp, node);
859 + if (obj) {
860 + mem_reserve_kmem_cache_charge(res, s, -1);
861 + goto out;
862 + }
863 + }
864 +
865 + obj = kmem_cache_alloc_node(s, flags, node);
866 + WARN_ON(!obj);
867 + if (emerg)
868 + *emerg = 1;
869 +
870 +out:
871 + return obj;
872 +}
873 +
874 +void __kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
875 + struct mem_reserve *res, int emerg)
876 +{
877 + kmem_cache_free(s, obj);
878 + mem_reserve_kmem_cache_charge(res, s, -1);
879 +}
880 +
881 +/*
882 + * alloc_pages/free_pages
883 + */
884 +
885 +struct page *__alloc_pages_reserve(int node, gfp_t flags, int order,
886 + struct mem_reserve *res, int *emerg)
887 +{
888 + struct page *page;
889 + gfp_t gfp;
890 + long pages = 1 << order;
891 +
892 + gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
893 + page = alloc_pages_node(node, gfp, order);
894 +
895 + if (page || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
896 + goto out;
897 +
898 + if (res && !mem_reserve_pages_charge(res, pages)) {
899 + if (!(flags & __GFP_WAIT))
900 + goto out;
901 +
902 + wait_event(res->waitqueue,
903 + mem_reserve_pages_charge(res, pages));
904 +
905 + page = alloc_pages_node(node, gfp, order);
906 + if (page) {
907 + mem_reserve_pages_charge(res, -pages);
908 + goto out;
909 + }
910 + }
911 +
912 + page = alloc_pages_node(node, flags, order);
913 + WARN_ON(!page);
914 + if (emerg)
915 + *emerg = 1;
916 +
917 +out:
918 + return page;
919 +}
920 +
921 +void __free_pages_reserve(struct page *page, int order,
922 + struct mem_reserve *res, int emerg)
923 +{
924 + __free_pages(page, order);
925 + mem_reserve_pages_charge(res, -(1 << order));
926 +}
927 --- a/mm/slub.c
928 +++ b/mm/slub.c
929 @@ -2728,6 +2728,7 @@ void *__kmalloc(size_t size, gfp_t flags
930 }
931 EXPORT_SYMBOL(__kmalloc);
932
933 +#ifdef CONFIG_NUMA
934 static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
935 {
936 struct page *page = alloc_pages_node(node, flags | __GFP_COMP,
937 @@ -2739,7 +2740,6 @@ static void *kmalloc_large_node(size_t s
938 return NULL;
939 }
940
941 -#ifdef CONFIG_NUMA
942 void *__kmalloc_node(size_t size, gfp_t flags, int node)
943 {
944 struct kmem_cache *s;