1From: Peter Zijlstra <a.p.zijlstra@chello.nl>
2Subject: mm: memory reserve management
3Patch-mainline: No
4References: FATE#303834
5
6Generic reserve management code.
7
8It provides methods to reserve and charge. On top of this, generic alloc/free-style
9reserve pools can be built, which could fully replace mempool_t
10functionality.
11
12It should also allow for a Banker's algorithm replacement of __GFP_NOFAIL.
13
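An illustrative sketch of the intended call pattern (not part of the diff below):
a subsystem creates a reserve node, hangs it off a parent (mem_reserve_root in
this sketch), sizes it, and then allocates through the *_reserve wrappers added
in include/linux/reserve.h. The names my_reserve and my_subsys_init are made up
for the example.

#include <linux/init.h>
#include <linux/reserve.h>

static struct mem_reserve my_reserve;	/* hypothetical example node */

static int __init my_subsys_init(void)
{
	int err;

	/* create the node and connect it below the global reserve root */
	mem_reserve_init(&my_reserve, "my subsystem reserve",
			 &mem_reserve_root);

	/* back it with 16 pages worth of emergency memory */
	err = mem_reserve_pages_set(&my_reserve, 16);
	if (err)
		mem_reserve_disconnect(&my_reserve);

	return err;
}
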
14Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
15Acked-by: Neil Brown <neilb@suse.de>
16Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
17
18---
19 include/linux/reserve.h | 198 ++++++++++++++
20 include/linux/slab.h | 20 -
21 mm/Makefile | 2
22 mm/reserve.c | 637 ++++++++++++++++++++++++++++++++++++++++++++++++
23 mm/slub.c | 2
24 5 files changed, 848 insertions(+), 11 deletions(-)
25
26--- /dev/null
27+++ b/include/linux/reserve.h
28@@ -0,0 +1,198 @@
29+/*
30+ * Memory reserve management.
31+ *
32+ * Copyright (C) 2007-2008 Red Hat, Inc.,
33+ * Peter Zijlstra <pzijlstr@redhat.com>
34+ *
35+ * This file contains the public data structure and API definitions.
36+ */
37+
38+#ifndef _LINUX_RESERVE_H
39+#define _LINUX_RESERVE_H
40+
41+#include <linux/list.h>
42+#include <linux/spinlock.h>
43+#include <linux/wait.h>
44+#include <linux/slab.h>
45+
46+struct mem_reserve {
47+ struct mem_reserve *parent;
48+ struct list_head children;
49+ struct list_head siblings;
50+
51+ const char *name;
52+
53+ long pages;
54+ long limit;
55+ long usage;
56+ spinlock_t lock; /* protects limit and usage */
57+
58+ wait_queue_head_t waitqueue;
59+};
60+
61+extern struct mem_reserve mem_reserve_root;
62+
63+void mem_reserve_init(struct mem_reserve *res, const char *name,
64+ struct mem_reserve *parent);
65+int mem_reserve_connect(struct mem_reserve *new_child,
66+ struct mem_reserve *node);
67+void mem_reserve_disconnect(struct mem_reserve *node);
68+
69+int mem_reserve_pages_set(struct mem_reserve *res, long pages);
70+int mem_reserve_pages_add(struct mem_reserve *res, long pages);
71+int mem_reserve_pages_charge(struct mem_reserve *res, long pages);
72+
73+int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes);
74+int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes);
75+
76+struct kmem_cache;
77+
78+int mem_reserve_kmem_cache_set(struct mem_reserve *res,
79+ struct kmem_cache *s,
80+ int objects);
81+int mem_reserve_kmem_cache_charge(struct mem_reserve *res,
82+ struct kmem_cache *s, long objs);
83+
84+void *___kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
85+ struct mem_reserve *res, int *emerg);
86+
87+static inline
88+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
89+ struct mem_reserve *res, int *emerg)
90+{
91+ void *obj;
92+
93+ obj = __kmalloc_node_track_caller(size,
94+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node, ip);
95+ if (!obj)
96+ obj = ___kmalloc_reserve(size, flags, node, ip, res, emerg);
97+
98+ return obj;
99+}
100+
101+/**
102+ * kmalloc_reserve() - kmalloc() and charge against @res for @emerg allocations
103+ * @size - size of the requested memory region
104+ * @gfp - allocation flags to use for this allocation
105+ * @node - preferred memory node for this allocation
106+ * @res - reserve to charge emergency allocations against
107+ * @emerg - bit 0 is set when the allocation was an emergency allocation
108+ *
109+ * Returns NULL on failure
110+ */
111+#define kmalloc_reserve(size, gfp, node, res, emerg) \
112+ __kmalloc_reserve(size, gfp, node, \
113+ __builtin_return_address(0), res, emerg)
114+
115+void __kfree_reserve(void *obj, struct mem_reserve *res, int emerg);
116+
117+/**
118+ * kfree_reserve() - kfree() and uncharge against @res for @emerg allocations
119+ * @obj - memory to free
120+ * @res - reserve to uncharge emergency allocations from
121+ * @emerg - was this an emergency allocation
122+ */
123+static inline
124+void kfree_reserve(void *obj, struct mem_reserve *res, int emerg)
125+{
126+ if (unlikely(obj && res && emerg))
127+ __kfree_reserve(obj, res, emerg);
128+ else
129+ kfree(obj);
130+}
131+
132+void *__kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
133+ struct mem_reserve *res, int *emerg);
134+
135+/**
136+ * kmem_cache_alloc_reserve() - kmem_cache_alloc() and charge against @res
137+ * @s - kmem_cache to allocate from
138+ * @gfp - allocation flags to use for this allocation
139+ * @node - preferred memory node for this allocation
140+ * @res - reserve to charge emergency allocations against
141+ * @emerg - bit 0 is set when the allocation was an emergency allocation
142+ *
143+ * Returns NULL on failure
144+ */
145+static inline
146+void *kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
147+ struct mem_reserve *res, int *emerg)
148+{
149+ void *obj;
150+
151+ obj = kmem_cache_alloc_node(s,
152+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
153+ if (!obj)
154+ obj = __kmem_cache_alloc_reserve(s, flags, node, res, emerg);
155+
156+ return obj;
157+}
158+
159+void __kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
160+ struct mem_reserve *res, int emerg);
161+
162+/**
163+ * kmem_cache_free_reserve() - kmem_cache_free() and uncharge against @res
164+ * @s - kmem_cache to free to
165+ * @obj - memory to free
166+ * @res - reserve to uncharge emergency allocations from
167+ * @emerg - was this an emergency allocation
168+ */
169+static inline
170+void kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
171+ struct mem_reserve *res, int emerg)
172+{
173+ if (unlikely(obj && res && emerg))
174+ __kmem_cache_free_reserve(s, obj, res, emerg);
175+ else
176+ kmem_cache_free(s, obj);
177+}
178+
179+struct page *__alloc_pages_reserve(int node, gfp_t flags, int order,
180+ struct mem_reserve *res, int *emerg);
181+
182+/**
183+ * alloc_pages_reserve() - alloc_pages() and charge against @res
184+ * @node - preferred memory node for this allocation
185+ * @gfp - allocation flags to use for this allocation
186+ * @order - page order
187+ * @res - reserve to charge emergency allocations against
188+ * @emerg - bit 0 is set when the allocation was an emergency allocation
189+ *
190+ * Returns NULL on failure
191+ */
192+static inline
193+struct page *alloc_pages_reserve(int node, gfp_t flags, int order,
194+ struct mem_reserve *res, int *emerg)
195+{
196+ struct page *page;
197+
198+ page = alloc_pages_node(node,
199+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN, order);
200+ if (!page)
201+ page = __alloc_pages_reserve(node, flags, order, res, emerg);
202+
203+ return page;
204+}
205+
206+void __free_pages_reserve(struct page *page, int order,
207+ struct mem_reserve *res, int emerg);
208+
209+/**
210+ * free_pages_reserve() - __free_pages() and uncharge against @res
211+ * @page - page to free
212+ * @order - page order
213+ * @res - reserve to uncharge emergency allocations from
214+ * @emerg - was this an emergency allocation
215+ */
216+static inline
217+void free_pages_reserve(struct page *page, int order,
218+ struct mem_reserve *res, int emerg)
219+{
220+ if (unlikely(page && res && emerg))
221+ __free_pages_reserve(page, order, res, emerg);
222+ else
223+ __free_pages(page, order);
224+}
225+
226+#endif /* _LINUX_RESERVE_H */
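
To make the round trip of the @emerg flag concrete, here is a minimal sketch
(not part of the patch) assuming a reserve that was initialized and sized
elsewhere; my_obj, my_obj_alloc and my_obj_free are invented for illustration.
The flag filled in by kmalloc_reserve() is stashed with the object so the
matching kfree_reserve() can uncharge the reserve. Note the emergency path only
engages for contexts entitled to ALLOC_NO_WATERMARKS (e.g. PF_MEMALLOC).

#include <linux/reserve.h>
#include <linux/slab.h>

extern struct mem_reserve my_reserve;	/* assumed set up and sized elsewhere */

struct my_obj {				/* hypothetical wrapper object */
	int emerg;			/* set if allocated from the reserve */
	char payload[128];
};

static struct my_obj *my_obj_alloc(void)
{
	int emerg = 0;
	struct my_obj *obj;

	/* -1: no node preference */
	obj = kmalloc_reserve(sizeof(*obj), GFP_ATOMIC, -1,
			      &my_reserve, &emerg);
	if (obj)
		obj->emerg = emerg;	/* remember for the free path */
	return obj;
}

static void my_obj_free(struct my_obj *obj)
{
	if (!obj)
		return;
	kfree_reserve(obj, &my_reserve, obj->emerg);
}
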
227--- a/include/linux/slab.h
228+++ b/include/linux/slab.h
229@@ -230,13 +230,14 @@ static inline void *kmem_cache_alloc_nod
230 */
231 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
232 extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
233-#define kmalloc_track_caller(size, flags) \
234- __kmalloc_track_caller(size, flags, __builtin_return_address(0))
235 #else
236-#define kmalloc_track_caller(size, flags) \
237+#define __kmalloc_track_caller(size, flags, ip) \
238 __kmalloc(size, flags)
239 #endif /* DEBUG_SLAB */
240
241+#define kmalloc_track_caller(size, flags) \
242+ __kmalloc_track_caller(size, flags, __builtin_return_address(0))
243+
244 #ifdef CONFIG_NUMA
245 /*
246 * kmalloc_node_track_caller is a special version of kmalloc_node that
247@@ -248,21 +249,22 @@ extern void *__kmalloc_track_caller(size
248 */
249 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
250 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
251-#define kmalloc_node_track_caller(size, flags, node) \
252- __kmalloc_node_track_caller(size, flags, node, \
253- __builtin_return_address(0))
254 #else
255-#define kmalloc_node_track_caller(size, flags, node) \
256+#define __kmalloc_node_track_caller(size, flags, node, ip) \
257 __kmalloc_node(size, flags, node)
258 #endif
259
260 #else /* CONFIG_NUMA */
261
262-#define kmalloc_node_track_caller(size, flags, node) \
263- kmalloc_track_caller(size, flags)
264+#define __kmalloc_node_track_caller(size, flags, node, ip) \
265+ __kmalloc_track_caller(size, flags, ip)
266
267 #endif /* DEBUG_SLAB */
268
269+#define kmalloc_node_track_caller(size, flags, node) \
270+ __kmalloc_node_track_caller(size, flags, node, \
271+ __builtin_return_address(0))
272+
273 /*
274 * Shortcuts
275 */
276--- a/mm/Makefile
277+++ b/mm/Makefile
278@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o
279 maccess.o page_alloc.o page-writeback.o pdflush.o \
280 readahead.o swap.o truncate.o vmscan.o \
281 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
282- page_isolation.o mm_init.o $(mmu-y)
283+ page_isolation.o mm_init.o reserve.o $(mmu-y)
284
285 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
286 obj-$(CONFIG_BOUNCE) += bounce.o
287--- /dev/null
288+++ b/mm/reserve.c
289@@ -0,0 +1,637 @@
290+/*
291+ * Memory reserve management.
292+ *
293+ * Copyright (C) 2007-2008, Red Hat, Inc.,
294+ * Peter Zijlstra <pzijlstr@redhat.com>
295+ *
296+ * Description:
297+ *
298+ * Manage a set of memory reserves.
299+ *
300+ * A memory reserve is a reserve for a specified number of objects of a specified
301+ * size. Since memory is managed in pages, this reserve demand is then
302+ * translated into a page unit.
303+ *
304+ * So each reserve has a specified object limit, an object usage count and a
305+ * number of pages required to back these objects.
306+ *
307+ * Usage is charged against a reserve; if the charge fails, the resource must
308+ * not be allocated/used.
309+ *
310+ * The reserves are managed in a tree, and the resource demands (pages and
311+ * limit) are propagated up the tree. Obviously the object limit will be
312+ * meaningless as soon as units start mixing, but the required page reserve
313+ * (being of one unit) is still valid at the root.
314+ *
315+ * It is the page demand of the root node that is used to set the global
316+ * reserve (adjust_memalloc_reserve() which sets zone->pages_emerg).
317+ *
318+ * As long as a subtree has the same usage unit, an aggregate node can be used
319+ * to charge against instead of the leaf nodes. However, be consistent about
320+ * which node is charged; resource usage is not propagated up the tree (for
321+ * performance reasons).
322+ */
323+
324+#include <linux/reserve.h>
325+#include <linux/mutex.h>
326+#include <linux/mmzone.h>
327+#include <linux/log2.h>
328+#include <linux/proc_fs.h>
329+#include <linux/seq_file.h>
330+#include <linux/module.h>
331+#include <linux/slab.h>
332+#include <linux/sched.h>
333+#include "internal.h"
334+
335+static DEFINE_MUTEX(mem_reserve_mutex);
336+
337+/**
338+ * @mem_reserve_root - the global reserve root
339+ *
340+ * The global reserve is empty and has no limit unit; it merely
341+ * acts as an aggregation point for reserves and an interface to
342+ * adjust_memalloc_reserve().
343+ */
344+struct mem_reserve mem_reserve_root = {
345+ .children = LIST_HEAD_INIT(mem_reserve_root.children),
346+ .siblings = LIST_HEAD_INIT(mem_reserve_root.siblings),
347+ .name = "total reserve",
348+ .lock = __SPIN_LOCK_UNLOCKED(mem_reserve_root.lock),
349+ .waitqueue = __WAIT_QUEUE_HEAD_INITIALIZER(mem_reserve_root.waitqueue),
350+};
351+EXPORT_SYMBOL_GPL(mem_reserve_root);
352+
353+/**
354+ * mem_reserve_init() - initialize a memory reserve object
355+ * @res - the new reserve object
356+ * @name - a name for this reserve
357+ * @parent - when non NULL, the parent to connect to.
358+ */
359+void mem_reserve_init(struct mem_reserve *res, const char *name,
360+ struct mem_reserve *parent)
361+{
362+ memset(res, 0, sizeof(*res));
363+ INIT_LIST_HEAD(&res->children);
364+ INIT_LIST_HEAD(&res->siblings);
365+ res->name = name;
366+ spin_lock_init(&res->lock);
367+ init_waitqueue_head(&res->waitqueue);
368+
369+ if (parent)
370+ mem_reserve_connect(res, parent);
371+}
372+EXPORT_SYMBOL_GPL(mem_reserve_init);
373+
374+/*
375+ * propagate the pages and limit changes up the (sub)tree.
376+ */
377+static void __calc_reserve(struct mem_reserve *res, long pages, long limit)
378+{
379+ unsigned long flags;
380+
381+ for ( ; res; res = res->parent) {
382+ res->pages += pages;
383+
384+ if (limit) {
385+ spin_lock_irqsave(&res->lock, flags);
386+ res->limit += limit;
387+ spin_unlock_irqrestore(&res->lock, flags);
388+ }
389+ }
390+}
391+
392+/**
393+ * __mem_reserve_add() - primitive to change the size of a reserve
394+ * @res - reserve to change
395+ * @pages - page delta
396+ * @limit - usage limit delta
397+ *
398+ * Returns -ENOMEM when a size increase is not possible at the moment.
399+ */
400+static int __mem_reserve_add(struct mem_reserve *res, long pages, long limit)
401+{
402+ int ret = 0;
403+ long reserve;
404+
405+ /*
406+ * This looks more complex than it needs to be because we handle
407+ * the case where @res isn't actually connected to mem_reserve_root.
408+ *
409+ * So, by propagating the new pages up the (sub)tree and computing
410+ * the difference in mem_reserve_root.pages we find if this action
411+ * affects the actual reserve.
412+ *
413+ * The (partial) propagation also means that mem_reserve_connect()
414+ * need only look at the direct child, since each disconnected
415+ * sub-tree is fully up-to-date.
416+ */
417+ reserve = mem_reserve_root.pages;
418+ __calc_reserve(res, pages, 0);
419+ reserve = mem_reserve_root.pages - reserve;
420+
421+ if (reserve) {
422+ ret = adjust_memalloc_reserve(reserve);
423+ if (ret)
424+ __calc_reserve(res, -pages, 0);
425+ }
426+
427+ /*
428+ * Delay updating the limits until we've acquired the resources to
429+ * back it.
430+ */
431+ if (!ret)
432+ __calc_reserve(res, 0, limit);
433+
434+ return ret;
435+}
436+
437+/**
438+ * __mem_reserve_charge() - primitive to charge object usage of a reserve
439+ * @res - reserve to charge
440+ * @charge - size of the charge
441+ *
442+ * Returns non-zero on success, zero on failure.
443+ */
444+static
445+int __mem_reserve_charge(struct mem_reserve *res, long charge)
446+{
447+ unsigned long flags;
448+ int ret = 0;
449+
450+ spin_lock_irqsave(&res->lock, flags);
451+ if (charge < 0 || res->usage + charge < res->limit) {
452+ res->usage += charge;
453+ if (unlikely(res->usage < 0))
454+ res->usage = 0;
455+ ret = 1;
456+ }
457+ if (charge < 0)
458+ wake_up_all(&res->waitqueue);
459+ spin_unlock_irqrestore(&res->lock, flags);
460+
461+ return ret;
462+}
463+
464+/**
465+ * mem_reserve_connect() - connect a reserve to another in a child-parent relation
466+ * @new_child - the reserve node to connect (child)
467+ * @node - the reserve node to connect to (parent)
468+ *
469+ * Connecting a node results in an increase of the reserve by the amount of
470+ * pages in @new_child->pages if @node has a connection to mem_reserve_root.
471+ *
472+ * Returns -ENOMEM when the new connection would increase the reserve (parent
473+ * is connected to mem_reserve_root) and there is no memory to do so.
474+ *
475+ * On error, the child is _NOT_ connected.
476+ */
477+int mem_reserve_connect(struct mem_reserve *new_child, struct mem_reserve *node)
478+{
479+ int ret;
480+
481+ WARN_ON(!new_child->name);
482+
483+ mutex_lock(&mem_reserve_mutex);
484+ if (new_child->parent) {
485+ ret = -EEXIST;
486+ goto unlock;
487+ }
488+ new_child->parent = node;
489+ list_add(&new_child->siblings, &node->children);
490+ ret = __mem_reserve_add(node, new_child->pages, new_child->limit);
491+ if (ret) {
492+ new_child->parent = NULL;
493+ list_del_init(&new_child->siblings);
494+ }
495+unlock:
496+ mutex_unlock(&mem_reserve_mutex);
497+
498+ return ret;
499+}
500+EXPORT_SYMBOL_GPL(mem_reserve_connect);
501+
502+/**
503+ * mem_reserve_disconnect() - sever a node's connection to the reserve tree
504+ * @node - the node to disconnect
505+ *
506+ * Disconnecting a node results in a reduction of the reserve by @node->pages
507+ * if node had a connection to mem_reserve_root.
508+ */
509+void mem_reserve_disconnect(struct mem_reserve *node)
510+{
511+ int ret;
512+
513+ BUG_ON(!node->parent);
514+
515+ mutex_lock(&mem_reserve_mutex);
516+ if (!node->parent) {
517+ ret = -ENOENT;
518+ goto unlock;
519+ }
520+ ret = __mem_reserve_add(node->parent, -node->pages, -node->limit);
521+ if (!ret) {
522+ node->parent = NULL;
523+ list_del_init(&node->siblings);
524+ }
525+unlock:
526+ mutex_unlock(&mem_reserve_mutex);
527+
528+ /*
529+ * We cannot fail to shrink the reserves, can we?
530+ */
531+ WARN_ON(ret);
532+}
533+EXPORT_SYMBOL_GPL(mem_reserve_disconnect);
534+
535+#ifdef CONFIG_PROC_FS
536+
537+/*
538+ * Simple output of the reserve tree in: /proc/reserve_info
539+ * Example:
540+ *
541+ * localhost ~ # cat /proc/reserve_info
542+ * 1:0 "total reserve" 6232K 0/278581
543+ * 2:1 "total network reserve" 6232K 0/278581
544+ * 3:2 "network TX reserve" 212K 0/53
545+ * 4:3 "protocol TX pages" 212K 0/53
546+ * 5:2 "network RX reserve" 6020K 0/278528
547+ * 6:5 "IPv4 route cache" 5508K 0/16384
548+ * 7:5 "SKB data reserve" 512K 0/262144
549+ * 8:7 "IPv4 fragment cache" 512K 0/262144
550+ */
551+
552+static void mem_reserve_show_item(struct seq_file *m, struct mem_reserve *res,
553+ unsigned int parent, unsigned int *id)
554+{
555+ struct mem_reserve *child;
556+ unsigned int my_id = ++*id;
557+
558+ seq_printf(m, "%d:%d \"%s\" %ldK %ld/%ld\n",
559+ my_id, parent, res->name,
560+ res->pages << (PAGE_SHIFT - 10),
561+ res->usage, res->limit);
562+
563+ list_for_each_entry(child, &res->children, siblings)
564+ mem_reserve_show_item(m, child, my_id, id);
565+}
566+
567+static int mem_reserve_show(struct seq_file *m, void *v)
568+{
569+ unsigned int ident = 0;
570+
571+ mutex_lock(&mem_reserve_mutex);
572+ mem_reserve_show_item(m, &mem_reserve_root, ident, &ident);
573+ mutex_unlock(&mem_reserve_mutex);
574+
575+ return 0;
576+}
577+
578+static int mem_reserve_open(struct inode *inode, struct file *file)
579+{
580+ return single_open(file, mem_reserve_show, NULL);
581+}
582+
583+static const struct file_operations mem_reserve_opterations = {
584+ .open = mem_reserve_open,
585+ .read = seq_read,
586+ .llseek = seq_lseek,
587+ .release = single_release,
588+};
589+
590+static __init int mem_reserve_proc_init(void)
591+{
592+ proc_create("reserve_info", S_IRUSR, NULL, &mem_reserve_opterations);
593+ return 0;
594+}
595+
596+module_init(mem_reserve_proc_init);
597+
598+#endif
599+
600+/*
601+ * alloc_page helpers
602+ */
603+
604+/**
605+ * mem_reserve_pages_set() - set a reserve's size in pages
606+ * @res - reserve to set
607+ * @pages - size in pages to set it to
608+ *
609+ * Returns -ENOMEM when it fails to set the reserve. On failure the old size
610+ * is preserved.
611+ */
612+int mem_reserve_pages_set(struct mem_reserve *res, long pages)
613+{
614+ int ret;
615+
616+ mutex_lock(&mem_reserve_mutex);
617+ pages -= res->pages;
618+ ret = __mem_reserve_add(res, pages, pages * PAGE_SIZE);
619+ mutex_unlock(&mem_reserve_mutex);
620+
621+ return ret;
622+}
623+EXPORT_SYMBOL_GPL(mem_reserve_pages_set);
624+
625+/**
626+ * mem_reserve_pages_add() - change the size in a relative way
627+ * @res - reserve to change
628+ * @pages - number of pages to add (or subtract when negative)
629+ *
630+ * Similar to mem_reserve_pages_set, except that the argument is relative
631+ * instead of absolute.
632+ *
633+ * Returns -ENOMEM when it fails to increase.
634+ */
635+int mem_reserve_pages_add(struct mem_reserve *res, long pages)
636+{
637+ int ret;
638+
639+ mutex_lock(&mem_reserve_mutex);
640+ ret = __mem_reserve_add(res, pages, pages * PAGE_SIZE);
641+ mutex_unlock(&mem_reserve_mutex);
642+
643+ return ret;
644+}
645+
646+/**
647+ * mem_reserve_pages_charge() - charge page usage to a reserve
648+ * @res - reserve to charge
649+ * @pages - size to charge
650+ *
651+ * Returns non-zero on success.
652+ */
653+int mem_reserve_pages_charge(struct mem_reserve *res, long pages)
654+{
655+ return __mem_reserve_charge(res, pages * PAGE_SIZE);
656+}
657+EXPORT_SYMBOL_GPL(mem_reserve_pages_charge);
658+
659+/*
660+ * kmalloc helpers
661+ */
662+
663+/**
664+ * mem_reserve_kmalloc_set() - set this reserve to @bytes worth of kmalloc
665+ * @res - reserve to change
666+ * @bytes - size in bytes to reserve
667+ *
668+ * Returns -ENOMEM on failure.
669+ */
670+int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes)
671+{
672+ int ret;
673+ long pages;
674+
675+ mutex_lock(&mem_reserve_mutex);
676+ pages = kmalloc_estimate_bytes(GFP_ATOMIC, bytes);
677+ pages -= res->pages;
678+ bytes -= res->limit;
679+ ret = __mem_reserve_add(res, pages, bytes);
680+ mutex_unlock(&mem_reserve_mutex);
681+
682+ return ret;
683+}
684+EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_set);
685+
686+/**
687+ * mem_reserve_kmalloc_charge() - charge bytes to a reserve
688+ * @res - reserve to charge
689+ * @bytes - bytes to charge
690+ *
691+ * Returns non-zero on success.
692+ */
693+int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes)
694+{
695+ if (bytes < 0)
696+ bytes = -roundup_pow_of_two(-bytes);
697+ else
698+ bytes = roundup_pow_of_two(bytes);
699+
700+ return __mem_reserve_charge(res, bytes);
701+}
702+EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_charge);
703+
704+/*
705+ * kmem_cache helpers
706+ */
707+
708+/**
709+ * mem_reserve_kmem_cache_set() - set reserve to @objects worth of kmem_cache_alloc of @s
710+ * @res - reserve to set
711+ * @s - kmem_cache to reserve from
712+ * @objects - number of objects to reserve
713+ *
714+ * Returns -ENOMEM on failure.
715+ */
716+int mem_reserve_kmem_cache_set(struct mem_reserve *res, struct kmem_cache *s,
717+ int objects)
718+{
719+ int ret;
720+ long pages, bytes;
721+
722+ mutex_lock(&mem_reserve_mutex);
723+ pages = kmem_alloc_estimate(s, GFP_ATOMIC, objects);
724+ pages -= res->pages;
725+ bytes = objects * kmem_cache_size(s) - res->limit;
726+ ret = __mem_reserve_add(res, pages, bytes);
727+ mutex_unlock(&mem_reserve_mutex);
728+
729+ return ret;
730+}
731+EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_set);
732+
733+/**
734+ * mem_reserve_kmem_cache_charge() - charge (or uncharge) usage of objs
735+ * @res - reserve to charge
736+ * @objs - objects to charge for
737+ *
738+ * Returns non-zero on success.
739+ */
740+int mem_reserve_kmem_cache_charge(struct mem_reserve *res, struct kmem_cache *s,
741+ long objs)
742+{
743+ return __mem_reserve_charge(res, objs * kmem_cache_size(s));
744+}
745+EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_charge);
746+
747+/*
748+ * Alloc wrappers.
749+ *
750+ * Actual usage is commented in linux/reserve.h where the interface functions
751+ * live. Furthermore, the code is 3 instances of the same paradigm, hence only
752+ * the first contains extensive comments.
753+ */
754+
755+/*
756+ * kmalloc/kfree
757+ */
758+
759+void *___kmalloc_reserve(size_t size, gfp_t flags, int node, void *ip,
760+ struct mem_reserve *res, int *emerg)
761+{
762+ void *obj;
763+ gfp_t gfp;
764+
765+ /*
766+ * Try a regular allocation; when that fails and we're not entitled
767+ * to the reserves, fail.
768+ */
769+ gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
770+ obj = __kmalloc_node_track_caller(size, gfp, node, ip);
771+
772+ if (obj || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
773+ goto out;
774+
775+ /*
776+ * If we were given a reserve to charge against, try that.
777+ */
778+ if (res && !mem_reserve_kmalloc_charge(res, size)) {
779+ /*
780+ * If we failed to charge and we're not allowed to wait for
781+ * it to succeed, bail.
782+ */
783+ if (!(flags & __GFP_WAIT))
784+ goto out;
785+
786+ /*
787+ * Wait for a successful charge against the reserve. All
788+ * uncharge operations against this reserve will wake us up.
789+ */
790+ wait_event(res->waitqueue,
791+ mem_reserve_kmalloc_charge(res, size));
792+
793+ /*
794+ * After waiting for it, again try a regular allocation.
795+ * Pressure could have lifted during our sleep. If this
796+ * succeeds, uncharge the reserve.
797+ */
798+ obj = __kmalloc_node_track_caller(size, gfp, node, ip);
799+ if (obj) {
800+ mem_reserve_kmalloc_charge(res, -size);
801+ goto out;
802+ }
803+ }
804+
805+ /*
806+ * Regular allocation failed, and we've successfully charged our
807+ * requested usage against the reserve. Do the emergency allocation.
808+ */
809+ obj = __kmalloc_node_track_caller(size, flags, node, ip);
810+ WARN_ON(!obj);
811+ if (emerg)
812+ *emerg = 1;
813+
814+out:
815+ return obj;
816+}
817+
818+void __kfree_reserve(void *obj, struct mem_reserve *res, int emerg)
819+{
820+ /*
821+ * ksize gives the full allocated size vs the requested size we used to
822+ * charge; however since we round up to the nearest power of two, this
823+ * should all work nicely.
824+ */
825+ size_t size = ksize(obj);
826+
827+ kfree(obj);
828+ /*
829+ * Free before uncharge; this ensures memory is actually present when
830+ * a subsequent charge succeeds.
831+ */
832+ mem_reserve_kmalloc_charge(res, -size);
833+}
834+
835+/*
836+ * kmem_cache_alloc/kmem_cache_free
837+ */
838+
839+void *__kmem_cache_alloc_reserve(struct kmem_cache *s, gfp_t flags, int node,
840+ struct mem_reserve *res, int *emerg)
841+{
842+ void *obj;
843+ gfp_t gfp;
844+
845+ gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
846+ obj = kmem_cache_alloc_node(s, gfp, node);
847+
848+ if (obj || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
849+ goto out;
850+
851+ if (res && !mem_reserve_kmem_cache_charge(res, s, 1)) {
852+ if (!(flags & __GFP_WAIT))
853+ goto out;
854+
855+ wait_event(res->waitqueue,
856+ mem_reserve_kmem_cache_charge(res, s, 1));
857+
858+ obj = kmem_cache_alloc_node(s, gfp, node);
859+ if (obj) {
860+ mem_reserve_kmem_cache_charge(res, s, -1);
861+ goto out;
862+ }
863+ }
864+
865+ obj = kmem_cache_alloc_node(s, flags, node);
866+ WARN_ON(!obj);
867+ if (emerg)
868+ *emerg = 1;
869+
870+out:
871+ return obj;
872+}
873+
874+void __kmem_cache_free_reserve(struct kmem_cache *s, void *obj,
875+ struct mem_reserve *res, int emerg)
876+{
877+ kmem_cache_free(s, obj);
878+ mem_reserve_kmem_cache_charge(res, s, -1);
879+}
880+
881+/*
882+ * alloc_pages/free_pages
883+ */
884+
885+struct page *__alloc_pages_reserve(int node, gfp_t flags, int order,
886+ struct mem_reserve *res, int *emerg)
887+{
888+ struct page *page;
889+ gfp_t gfp;
890+ long pages = 1 << order;
891+
892+ gfp = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
893+ page = alloc_pages_node(node, gfp, order);
894+
895+ if (page || !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
896+ goto out;
897+
898+ if (res && !mem_reserve_pages_charge(res, pages)) {
899+ if (!(flags & __GFP_WAIT))
900+ goto out;
901+
902+ wait_event(res->waitqueue,
903+ mem_reserve_pages_charge(res, pages));
904+
905+ page = alloc_pages_node(node, gfp, order);
906+ if (page) {
907+ mem_reserve_pages_charge(res, -pages);
908+ goto out;
909+ }
910+ }
911+
912+ page = alloc_pages_node(node, flags, order);
913+ WARN_ON(!page);
914+ if (emerg)
915+ *emerg = 1;
916+
917+out:
918+ return page;
919+}
920+
921+void __free_pages_reserve(struct page *page, int order,
922+ struct mem_reserve *res, int emerg)
923+{
924+ __free_pages(page, order);
925+ mem_reserve_pages_charge(res, -(1 << order));
926+}
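
As with the slab wrappers, a brief hypothetical sketch of the page-level
pattern (my_reserve, my_get_buffer and my_put_buffer are invented for
illustration; the reserve is assumed to have been sized with
mem_reserve_pages_set()). The emergency path only engages for contexts
entitled to ALLOC_NO_WATERMARKS, and with __GFP_WAIT it may sleep on the
reserve's waitqueue until an uncharge elsewhere lets the charge succeed.

#include <linux/mm.h>
#include <linux/reserve.h>

extern struct mem_reserve my_reserve;	/* hypothetical, sized elsewhere */

static void *my_get_buffer(int *emerg)
{
	struct page *page;

	*emerg = 0;
	/* -1: no node preference; GFP_KERNEL includes __GFP_WAIT */
	page = alloc_pages_reserve(-1, GFP_KERNEL, 0, &my_reserve, emerg);
	if (!page)
		return NULL;
	return page_address(page);
}

static void my_put_buffer(void *buf, int emerg)
{
	if (!buf)
		return;
	free_pages_reserve(virt_to_page(buf), 0, &my_reserve, emerg);
}
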
927--- a/mm/slub.c
928+++ b/mm/slub.c
929@@ -2728,6 +2728,7 @@ void *__kmalloc(size_t size, gfp_t flags
930 }
931 EXPORT_SYMBOL(__kmalloc);
932
933+#ifdef CONFIG_NUMA
934 static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
935 {
936 struct page *page = alloc_pages_node(node, flags | __GFP_COMP,
937@@ -2739,7 +2740,6 @@ static void *kmalloc_large_node(size_t s
938 return NULL;
939 }
940
941-#ifdef CONFIG_NUMA
942 void *__kmalloc_node(size_t size, gfp_t flags, int node)
943 {
944 struct kmem_cache *s;