From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: mm: sl[au]b: add knowledge of reserve pages
References: FATE#303834

Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
contexts that are entitled to them. This is done to ensure reserve pages
don't leak out and get consumed.

The basic pattern used for all three allocators is the following: for each
active slab page we store whether it came from an emergency allocation. When
we find that it did, make sure the current allocation context would have been
able to allocate a page from the emergency reserves as well. In that case,
allow the allocation. If not, force a new slab allocation. When that succeeds,
the memory pressure has lifted enough to allow this context to get an object;
otherwise fail the allocation.
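
In pseudo-code, the check each allocator gains looks like this (an
illustrative sketch only, not patch code; gfp_to_alloc_flags() and
ALLOC_NO_WATERMARKS are exposed to the slab allocators via mm/internal.h,
and page->reserve is the page allocator's "came from the reserves" marker
set up earlier in this series):

	/* Sketch: may this context consume objects from a reserve slab? */
	static inline int may_use_reserve_slab(int slab_from_reserve, gfp_t gfp)
	{
		if (!slab_from_reserve)
			return 1;	/* ordinary slab, no restriction */
		/* only contexts allowed below the watermarks may continue */
		return !!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS);
	}

A failed check does not fail the allocation outright; the allocator first
tries to grow the cache with a fresh slab, and only fails the allocation
when that fresh allocation fails as well.
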
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Suresh Jayaraman <sjayaraman@suse.de>

---
 include/linux/slub_def.h |    1 
 mm/slab.c                |   60 +++++++++++++++++++++++++++++++++++++++--------
 mm/slob.c                |   16 +++++++++++-
 mm/slub.c                |   42 +++++++++++++++++++++++++++-----
 4 files changed, 102 insertions(+), 17 deletions(-)
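
All three allocators get at gfp_to_alloc_flags() through the new
#include "internal.h" lines below. For reference, the entitlement it tests
amounts to roughly the following (a sketch of the 2.6.26-era page allocator
logic; the authoritative version lives in mm/page_alloc.c):

	/* Sketch: when the page allocator grants ALLOC_NO_WATERMARKS */
	if (!(gfp_mask & __GFP_NOMEMALLOC) && !in_interrupt() &&
	    ((current->flags & PF_MEMALLOC) ||
	     unlikely(test_thread_flag(TIF_MEMDIE))))
		alloc_flags |= ALLOC_NO_WATERMARKS;
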
Index: linux-2.6.26/mm/slub.c
===================================================================
--- linux-2.6.26.orig/mm/slub.c
+++ linux-2.6.26/mm/slub.c
 #include <linux/kallsyms.h>
 #include <linux/memory.h>
 #include <linux/math64.h>
+#include "internal.h"
@@ -1106,7 +1107,8 @@ static void setup_object(struct kmem_cac
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static
+struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
@@ -1120,6 +1122,8 @@ static struct page *new_slab(struct kmem
+	*reserve = page->reserve;
+
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
@@ -1503,10 +1507,20 @@ static void *__slab_alloc(struct kmem_ca
+	int reserve;
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
 
+	if (unlikely(c->reserve)) {
+		/*
+		 * If the current slab is a reserve slab and the current
+		 * allocation context does not allow access to the reserves we
+		 * must force an allocation to test the current levels.
+		 */
+		if (!(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
+			goto grow_slab;
+	}
@@ -1520,8 +1534,8 @@ load_freelist:
 	object = c->page->freelist;
 	if (unlikely(!object))
 		goto another_slab;
-	if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
-		goto debug;
+	if (unlikely(PageSlubDebug(c->page) || c->reserve))
+		goto slow_path;
 
 	c->freelist = object[c->offset];
 	c->page->inuse = c->page->objects;
@@ -1543,16 +1557,18 @@ new_slab:
+grow_slab:
 	if (gfpflags & __GFP_WAIT)
 		local_irq_enable();
 
-	new = new_slab(s, gfpflags, node);
+	new = new_slab(s, gfpflags, node, &reserve);
 
 	if (gfpflags & __GFP_WAIT)
 		local_irq_disable();
 
 	if (new) {
 		c = get_cpu_slab(s, smp_processor_id());
+		c->reserve = reserve;
@@ -1562,10 +1578,21 @@ new_slab:
-debug:
-	if (!alloc_debug_processing(s, c->page, object, addr))
+slow_path:
+	if (PageSlubDebug(c->page) &&
+	    !alloc_debug_processing(s, c->page, object, addr))
 		goto another_slab;
 
+	/*
+	 * Avoid the slub fast path in slab_alloc() by not setting
+	 * c->freelist and the fast path in slab_free() by making
+	 * node_match() fail by setting c->node to -1.
+	 *
+	 * We use this for debug and reserve checks which need
+	 * to be done for each allocation.
+	 */
 	c->page->inuse++;
 	c->page->freelist = object[c->offset];
 	c->node = -1;
@@ -2078,10 +2105,11 @@ static struct kmem_cache_node *early_kme
 	struct kmem_cache_node *n;
+	int reserve;
 
 	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
 
-	page = new_slab(kmalloc_caches, gfpflags, node);
+	page = new_slab(kmalloc_caches, gfpflags, node, &reserve);
 
 	BUG_ON(!page);
 	if (page_to_nid(page) != node) {
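
A note on the slow-path trick used above: SLUB's allocation fast path is
gated on a cached per-cpu freelist, so by never setting c->freelist for
reserve (and debug) slabs, and by setting c->node to -1 to keep the free
fast path off as well, every allocation from such a slab is forced through
__slab_alloc(), where the entitlement check runs. Simplified shape of the
fast path (a sketch, not the exact 2.6.26 code):

	/* slab_alloc(), condensed: with c->freelist left NULL the fast
	 * path is never taken and __slab_alloc() re-checks the reserve
	 * entitlement on every allocation. */
	object = c->freelist;
	if (unlikely(!object || !node_match(c, node)))
		object = __slab_alloc(s, gfpflags, node, addr, c);
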
Index: linux-2.6.26/include/linux/slub_def.h
===================================================================
--- linux-2.6.26.orig/include/linux/slub_def.h
+++ linux-2.6.26/include/linux/slub_def.h
@@ -38,6 +38,7 @@ struct kmem_cache_cpu {
 	int node;		/* The node of the page (or -1 for debug) */
 	unsigned int offset;	/* Freepointer offset (in word units) */
 	unsigned int objsize;	/* Size of an object (from kmem_cache) */
+	int reserve;		/* Did the current page come from the reserve */
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif
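
The new field caches the provenance of the current cpu slab, so the check in
__slab_alloc() never has to touch struct page. Its life cycle, condensed from
the hunks above (the && form below is an equivalent compression of the
patch's nested ifs):

	/* set when a fresh slab is attached to the cpu cache */
	new = new_slab(s, gfpflags, node, &reserve);
	...
	c->reserve = reserve;

	/* read on every slow-path allocation */
	if (unlikely(c->reserve) &&
	    !(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
		...	/* force a new slab to re-test the watermarks */
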
Index: linux-2.6.26/mm/slab.c
===================================================================
--- linux-2.6.26.orig/mm/slab.c
+++ linux-2.6.26/mm/slab.c
 #include <asm/tlbflush.h>
 #include <asm/page.h>
 
+#include "internal.h"
+
 /*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
  *		  0 for faster, smaller code (especially in the critical paths).
@@ -264,7 +266,8 @@ struct array_cache {
 	unsigned int avail;
 	unsigned int limit;
 	unsigned int batchcount;
-	unsigned int touched;
+	unsigned int touched:1,
+		     reserve:1;
 	spinlock_t lock;
 	void *entry[];	/*
 			 * Must have this definition in here for the proper
@@ -760,6 +763,27 @@ static inline struct array_cache *cpu_ca
 	return cachep->array[smp_processor_id()];
 }
 
+/*
+ * If the last page came from the reserves, and the current allocation context
+ * does not have access to them, force an allocation to test the watermarks.
+ */
+static inline int slab_force_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	if (unlikely(cpu_cache_get(cachep)->reserve) &&
+	    !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
+		return 1;
+
+	return 0;
+}
+
+static inline void slab_set_reserve(struct kmem_cache *cachep, int reserve)
+{
+	struct array_cache *ac = cpu_cache_get(cachep);
+
+	if (unlikely(ac->reserve != reserve))
+		ac->reserve = reserve;
+}
+
 static inline struct kmem_cache *__find_general_cachep(size_t size,
@@ -959,6 +983,7 @@ static struct array_cache *alloc_arrayca
 		nc->limit = entries;
 		nc->batchcount = batchcount;
 		nc->touched = 0;
+		nc->reserve = 0;
 		spin_lock_init(&nc->lock);
@@ -1661,7 +1686,8 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+		int *reserve)
@@ -1683,6 +1709,7 @@ static void *kmem_getpages(struct kmem_c
 	if (!page)
 		return NULL;
 
+	*reserve = page->reserve;
 	nr_pages = (1 << cachep->gfporder);
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 		add_zone_page_state(page_zone(page),
@@ -2103,6 +2130,7 @@ static int __init_refok setup_cpu_cache(
 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
 	cpu_cache_get(cachep)->batchcount = 1;
 	cpu_cache_get(cachep)->touched = 0;
+	cpu_cache_get(cachep)->reserve = 0;
 	cachep->batchcount = 1;
 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
@@ -2757,6 +2785,7 @@ static int cache_grow(struct kmem_cache
 	struct kmem_list3 *l3;
+	int reserve;
 
 	/*
 	 * Be lazy and only check for valid flags here, keeping it out of the
@@ -2795,7 +2824,7 @@ static int cache_grow(struct kmem_cache
-	objp = kmem_getpages(cachep, local_flags, nodeid);
+	objp = kmem_getpages(cachep, local_flags, nodeid, &reserve);
@@ -2812,6 +2841,7 @@ static int cache_grow(struct kmem_cache
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	check_irq_off();
+	slab_set_reserve(cachep, reserve);
 	spin_lock(&l3->list_lock);
 
 	/* Make slab active. */
@@ -2946,7 +2976,8 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep,
+		gfp_t flags, int must_refill)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
@@ -2956,6 +2987,8 @@ static void *cache_alloc_refill(struct k
 	check_irq_off();
 	node = numa_node_id();
+	if (unlikely(must_refill))
+		goto force_grow;
 	ac = cpu_cache_get(cachep);
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3023,11 +3056,14 @@ alloc_done:
 	if (unlikely(!ac->avail)) {
 		int x;
+force_grow:
 		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
-		if (!x && ac->avail == 0)	/* no objects in sight? abort */
-			return NULL;
+		/* no objects in sight? abort */
+		if (!x && (ac->avail == 0 || must_refill))
+			return NULL;
 
 		if (!ac->avail)		/* objects refilled by interrupt? */
@@ -3182,17 +3218,18 @@ static inline void *____cache_alloc(stru
 	void *objp;
 	struct array_cache *ac;
+	int must_refill = slab_force_alloc(cachep, flags);
 
 	check_irq_off();
 
 	ac = cpu_cache_get(cachep);
-	if (likely(ac->avail)) {
+	if (likely(ac->avail && !must_refill)) {
 		STATS_INC_ALLOCHIT(cachep);
 		ac->touched = 1;
 		objp = ac->entry[--ac->avail];
 	} else {
 		STATS_INC_ALLOCMISS(cachep);
-		objp = cache_alloc_refill(cachep, flags);
+		objp = cache_alloc_refill(cachep, flags, must_refill);
 	}
@@ -3236,7 +3273,7 @@ static void *fallback_alloc(struct kmem_
 	enum zone_type high_zoneidx = gfp_zone(flags);
-	int nid;
+	int nid, reserve;
 
 	if (flags & __GFP_THISNODE)
@@ -3272,10 +3309,11 @@ retry:
 		if (local_flags & __GFP_WAIT)
 			local_irq_enable();
 		kmem_flagcheck(cache, flags);
-		obj = kmem_getpages(cache, local_flags, -1);
+		obj = kmem_getpages(cache, local_flags, -1, &reserve);
 		if (local_flags & __GFP_WAIT)
 			local_irq_disable();
 		if (obj) {
+			slab_set_reserve(cache, reserve);
 			/*
 			 * Insert into the appropriate per node queues
@@ -3314,6 +3352,9 @@ static void *____cache_alloc_node(struct
 	l3 = cachep->nodelists[nodeid];
 	BUG_ON(!l3);
 
+	if (unlikely(slab_force_alloc(cachep, flags)))
+		goto force_grow;
+
 retry:
 	check_irq_off();
 	spin_lock(&l3->list_lock);
@@ -3351,6 +3392,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
+force_grow:
 	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
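
One SLAB-specific detail: slab_set_reserve() only writes when the state
actually flips. The patch does not spell out why, but a plausible reading is
that it avoids dirtying the hot per-cpu array_cache line on every
cache_grow(); a sketch restating the helper with that assumption made
explicit:

	static inline void slab_set_reserve(struct kmem_cache *cachep, int reserve)
	{
		struct array_cache *ac = cpu_cache_get(cachep);

		/* common case: state unchanged, skip the store so the
		 * per-cpu array_cache cacheline stays clean */
		if (unlikely(ac->reserve != reserve))
			ac->reserve = reserve;
	}
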
Index: linux-2.6.26/mm/slob.c
===================================================================
--- linux-2.6.26.orig/mm/slob.c
+++ linux-2.6.26/mm/slob.c
 #include <linux/rcupdate.h>
 #include <linux/list.h>
 #include <asm/atomic.h>
+#include "internal.h"
 
 /*
  * slob_block has a field 'units', which indicates size of block if +ve,
@@ -183,6 +184,11 @@ struct slob_rcu {
 static DEFINE_SPINLOCK(slob_lock);
 
+/*
+ * tracks the reserve state for the allocator.
+ */
+static int slob_reserve;
+
 /*
  * Encode the given size and next info into a free slob block s.
  */
 static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
@@ -232,7 +238,7 @@ static int slob_last(slob_t *s)
 static void *slob_new_page(gfp_t gfp, int order, int node)
 {
-	void *page;
+	struct page *page;
@@ -244,6 +250,8 @@ static void *slob_new_page(gfp_t gfp, in
 	if (!page)
 		return NULL;
 
+	slob_reserve = page->reserve;
+
 	return page_address(page);
@@ -309,6 +317,11 @@ static void *slob_alloc(size_t size, gfp
 	slob_t *b = NULL;
 	unsigned long flags;
 
+	if (unlikely(slob_reserve)) {
+		if (!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS))
+			goto grow;
+	}
+
 	if (size < SLOB_BREAK1)
 		slob_list = &free_slob_small;
 	else if (size < SLOB_BREAK2)
@@ -347,6 +360,7 @@ static void *slob_alloc(size_t size, gfp
 	}
 	spin_unlock_irqrestore(&slob_lock, flags);
 
+grow:
 	/* Not enough space: must allocate a new page */
 	if (!b) {
 		b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);
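
Note the asymmetry with SLAB and SLUB: SLOB keeps a single global
slob_reserve instead of per-cache or per-cpu state, matching its global free
lists protected by slob_lock. The effect of a failed entitlement check is
correspondingly simple (a condensed sketch of the slob_alloc() flow above):

	/* slob_alloc(), condensed: reserve state without entitlement
	 * skips the free-list scan and goes straight to the page
	 * allocator, which re-tests the watermarks in slob_new_page(). */
	if (unlikely(slob_reserve) &&
	    !(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS))
		goto grow;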