1 From: Peter Zijlstra <a.p.zijlstra@chello.nl>
2 Subject: mm: sl[au]b: add knowledge of reserve pages
3 Patch-mainline: No
4 References: FATE#303834
5
6 Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
7 contexts that are entitled to it. This is done to ensure reserve pages don't
8 leak out and get consumed.
9
10 The basic pattern used for all three allocators (SLAB, SLOB and SLUB) is the
11 following: for each active slab page we record whether it came from an
12 emergency allocation. When it did, make sure the current allocation context
13 would have been able to allocate a page from the emergency reserves as well,
14 and in that case allow the allocation. If not, force a new slab allocation.
15 When that succeeds the memory pressure has lifted enough to let this context
16 get an object; otherwise fail the allocation.
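
As an illustration (not part of the patch itself), the per-allocator logic
added below boils down to roughly the sketch that follows. The page->reserve
flag and the gfp_to_alloc_flags()/ALLOC_NO_WATERMARKS helpers come from
earlier patches in this series; cache_page_is_reserve(),
alloc_from_current_page() and alloc_from_new_slab() are placeholders for each
allocator's own code in the hunks below.

    /* Sketch only; the helper names are placeholders for the hunks below. */
    static void *reserve_aware_alloc(struct kmem_cache *s, gfp_t gfpflags)
    {
            if (cache_page_is_reserve(s) &&
                !(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS)) {
                    /*
                     * The active page came from the emergency reserves but
                     * this context may not use them: force a fresh slab
                     * allocation so the watermarks are tested again.
                     */
                    return alloc_from_new_slab(s, gfpflags);
            }
            /* No reserve page, or the context may use the reserves. */
            return alloc_from_current_page(s, gfpflags);
    }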
17
18 Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
19 Acked-by: Neil Brown <neilb@suse.de>
20 Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
21
22 ---
23 include/linux/slub_def.h | 1
24 mm/slab.c | 60 +++++++++++++++++++++++++++++++++++++++--------
25 mm/slob.c | 16 +++++++++++-
26 mm/slub.c | 42 +++++++++++++++++++++++++++-----
27 4 files changed, 102 insertions(+), 17 deletions(-)
28
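For context, and again only as an illustration rather than part of the patch,
the effect visible to callers is roughly the following; cachep stands for any
already-created kmem_cache, and toggling PF_MEMALLOC by hand assumes the flag
was not already set:

    void *obj;

    /*
     * Ordinary context: if only a reserve slab is available, a fresh slab
     * allocation is forced first, and the call returns NULL when that
     * cannot be satisfied within the watermarks.
     */
    obj = kmem_cache_alloc(cachep, GFP_KERNEL);

    /*
     * Reclaim-style context (e.g. the swap-out path this series targets):
     * may still take objects from the reserve slab; the gfp flags are the
     * same, only the task flag differs.
     */
    current->flags |= PF_MEMALLOC;
    obj = kmem_cache_alloc(cachep, GFP_KERNEL);
    current->flags &= ~PF_MEMALLOC;
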
29 Index: linux-2.6.26/mm/slub.c
30 ===================================================================
31 --- linux-2.6.26.orig/mm/slub.c
32 +++ linux-2.6.26/mm/slub.c
33 @@ -23,6 +23,7 @@
34 #include <linux/kallsyms.h>
35 #include <linux/memory.h>
36 #include <linux/math64.h>
37 +#include "internal.h"
38
39 /*
40 * Lock order:
41 @@ -1106,7 +1107,8 @@ static void setup_object(struct kmem_cac
42 s->ctor(object);
43 }
44
45 -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
46 +static
47 +struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
48 {
49 struct page *page;
50 void *start;
51 @@ -1120,6 +1122,8 @@ static struct page *new_slab(struct kmem
52 if (!page)
53 goto out;
54
55 + *reserve = page->reserve;
56 +
57 inc_slabs_node(s, page_to_nid(page), page->objects);
58 page->slab = s;
59 page->flags |= 1 << PG_slab;
60 @@ -1503,10 +1507,20 @@ static void *__slab_alloc(struct kmem_ca
61 {
62 void **object;
63 struct page *new;
64 + int reserve;
65
66 /* We handle __GFP_ZERO in the caller */
67 gfpflags &= ~__GFP_ZERO;
68
69 + if (unlikely(c->reserve)) {
70 + /*
71 + * If the current slab is a reserve slab and the current
72 + * allocation context does not allow access to the reserves we
73 + * must force an allocation to test the current levels.
74 + */
75 + if (!(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
76 + goto grow_slab;
77 + }
78 if (!c->page)
79 goto new_slab;
80
81 @@ -1520,8 +1534,8 @@ load_freelist:
82 object = c->page->freelist;
83 if (unlikely(!object))
84 goto another_slab;
85 - if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
86 - goto debug;
87 + if (unlikely(PageSlubDebug(c->page) || c->reserve))
88 + goto slow_path;
89
90 c->freelist = object[c->offset];
91 c->page->inuse = c->page->objects;
92 @@ -1543,16 +1557,18 @@ new_slab:
93 goto load_freelist;
94 }
95
96 +grow_slab:
97 if (gfpflags & __GFP_WAIT)
98 local_irq_enable();
99
100 - new = new_slab(s, gfpflags, node);
101 + new = new_slab(s, gfpflags, node, &reserve);
102
103 if (gfpflags & __GFP_WAIT)
104 local_irq_disable();
105
106 if (new) {
107 c = get_cpu_slab(s, smp_processor_id());
108 + c->reserve = reserve;
109 stat(c, ALLOC_SLAB);
110 if (c->page)
111 flush_slab(s, c);
112 @@ -1562,10 +1578,21 @@ new_slab:
113 goto load_freelist;
114 }
115 return NULL;
116 -debug:
117 - if (!alloc_debug_processing(s, c->page, object, addr))
118 +
119 +slow_path:
120 + if (PageSlubDebug(c->page) &&
121 + !alloc_debug_processing(s, c->page, object, addr))
122 goto another_slab;
123
124 + /*
125 + * Avoid the slub fast path in slab_alloc() by not setting
126 + * c->freelist and the fast path in slab_free() by making
127 + * node_match() fail by setting c->node to -1.
128 + *
129 + * We use this for debug and reserve checks which need
130 + * to be done for each allocation.
131 + */
132 +
133 c->page->inuse++;
134 c->page->freelist = object[c->offset];
135 c->node = -1;
136 @@ -2078,10 +2105,11 @@ static struct kmem_cache_node *early_kme
137 struct page *page;
138 struct kmem_cache_node *n;
139 unsigned long flags;
140 + int reserve;
141
142 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
143
144 - page = new_slab(kmalloc_caches, gfpflags, node);
145 + page = new_slab(kmalloc_caches, gfpflags, node, &reserve);
146
147 BUG_ON(!page);
148 if (page_to_nid(page) != node) {
149 Index: linux-2.6.26/include/linux/slub_def.h
150 ===================================================================
151 --- linux-2.6.26.orig/include/linux/slub_def.h
152 +++ linux-2.6.26/include/linux/slub_def.h
153 @@ -38,6 +38,7 @@ struct kmem_cache_cpu {
154 int node; /* The node of the page (or -1 for debug) */
155 unsigned int offset; /* Freepointer offset (in word units) */
156 unsigned int objsize; /* Size of an object (from kmem_cache) */
157 + int reserve; /* Did the current page come from the reserve */
158 #ifdef CONFIG_SLUB_STATS
159 unsigned stat[NR_SLUB_STAT_ITEMS];
160 #endif
161 Index: linux-2.6.26/mm/slab.c
162 ===================================================================
163 --- linux-2.6.26.orig/mm/slab.c
164 +++ linux-2.6.26/mm/slab.c
165 @@ -116,6 +116,8 @@
166 #include <asm/tlbflush.h>
167 #include <asm/page.h>
168
169 +#include "internal.h"
170 +
171 /*
172 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
173 * 0 for faster, smaller code (especially in the critical paths).
174 @@ -264,7 +266,8 @@ struct array_cache {
175 unsigned int avail;
176 unsigned int limit;
177 unsigned int batchcount;
178 - unsigned int touched;
179 + unsigned int touched:1,
180 + reserve:1;
181 spinlock_t lock;
182 void *entry[]; /*
183 * Must have this definition in here for the proper
184 @@ -760,6 +763,27 @@ static inline struct array_cache *cpu_ca
185 return cachep->array[smp_processor_id()];
186 }
187
188 +/*
189 + * If the last page came from the reserves, and the current allocation context
190 + * does not have access to them, force an allocation to test the watermarks.
191 + */
192 +static inline int slab_force_alloc(struct kmem_cache *cachep, gfp_t flags)
193 +{
194 + if (unlikely(cpu_cache_get(cachep)->reserve) &&
195 + !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
196 + return 1;
197 +
198 + return 0;
199 +}
200 +
201 +static inline void slab_set_reserve(struct kmem_cache *cachep, int reserve)
202 +{
203 + struct array_cache *ac = cpu_cache_get(cachep);
204 +
205 + if (unlikely(ac->reserve != reserve))
206 + ac->reserve = reserve;
207 +}
208 +
209 static inline struct kmem_cache *__find_general_cachep(size_t size,
210 gfp_t gfpflags)
211 {
212 @@ -959,6 +983,7 @@ static struct array_cache *alloc_arrayca
213 nc->limit = entries;
214 nc->batchcount = batchcount;
215 nc->touched = 0;
216 + nc->reserve = 0;
217 spin_lock_init(&nc->lock);
218 }
219 return nc;
220 @@ -1661,7 +1686,8 @@ __initcall(cpucache_init);
221 * did not request dmaable memory, we might get it, but that
222 * would be relatively rare and ignorable.
223 */
224 -static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
225 +static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid,
226 + int *reserve)
227 {
228 struct page *page;
229 int nr_pages;
230 @@ -1683,6 +1709,7 @@ static void *kmem_getpages(struct kmem_c
231 if (!page)
232 return NULL;
233
234 + *reserve = page->reserve;
235 nr_pages = (1 << cachep->gfporder);
236 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
237 add_zone_page_state(page_zone(page),
238 @@ -2103,6 +2130,7 @@ static int __init_refok setup_cpu_cache(
239 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
240 cpu_cache_get(cachep)->batchcount = 1;
241 cpu_cache_get(cachep)->touched = 0;
242 + cpu_cache_get(cachep)->reserve = 0;
243 cachep->batchcount = 1;
244 cachep->limit = BOOT_CPUCACHE_ENTRIES;
245 return 0;
246 @@ -2757,6 +2785,7 @@ static int cache_grow(struct kmem_cache
247 size_t offset;
248 gfp_t local_flags;
249 struct kmem_list3 *l3;
250 + int reserve;
251
252 /*
253 * Be lazy and only check for valid flags here, keeping it out of the
254 @@ -2795,7 +2824,7 @@ static int cache_grow(struct kmem_cache
255 * 'nodeid'.
256 */
257 if (!objp)
258 - objp = kmem_getpages(cachep, local_flags, nodeid);
259 + objp = kmem_getpages(cachep, local_flags, nodeid, &reserve);
260 if (!objp)
261 goto failed;
262
263 @@ -2812,6 +2841,7 @@ static int cache_grow(struct kmem_cache
264 if (local_flags & __GFP_WAIT)
265 local_irq_disable();
266 check_irq_off();
267 + slab_set_reserve(cachep, reserve);
268 spin_lock(&l3->list_lock);
269
270 /* Make slab active. */
271 @@ -2946,7 +2976,8 @@ bad:
272 #define check_slabp(x,y) do { } while(0)
273 #endif
274
275 -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
276 +static void *cache_alloc_refill(struct kmem_cache *cachep,
277 + gfp_t flags, int must_refill)
278 {
279 int batchcount;
280 struct kmem_list3 *l3;
281 @@ -2956,6 +2987,8 @@ static void *cache_alloc_refill(struct k
282 retry:
283 check_irq_off();
284 node = numa_node_id();
285 + if (unlikely(must_refill))
286 + goto force_grow;
287 ac = cpu_cache_get(cachep);
288 batchcount = ac->batchcount;
289 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
290 @@ -3023,11 +3056,14 @@ alloc_done:
291
292 if (unlikely(!ac->avail)) {
293 int x;
294 +force_grow:
295 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
296
297 /* cache_grow can reenable interrupts, then ac could change. */
298 ac = cpu_cache_get(cachep);
299 - if (!x && ac->avail == 0) /* no objects in sight? abort */
300 +
301 + /* no objects in sight? abort */
302 + if (!x && (ac->avail == 0 || must_refill))
303 return NULL;
304
305 if (!ac->avail) /* objects refilled by interrupt? */
306 @@ -3182,17 +3218,18 @@ static inline void *____cache_alloc(stru
307 {
308 void *objp;
309 struct array_cache *ac;
310 + int must_refill = slab_force_alloc(cachep, flags);
311
312 check_irq_off();
313
314 ac = cpu_cache_get(cachep);
315 - if (likely(ac->avail)) {
316 + if (likely(ac->avail && !must_refill)) {
317 STATS_INC_ALLOCHIT(cachep);
318 ac->touched = 1;
319 objp = ac->entry[--ac->avail];
320 } else {
321 STATS_INC_ALLOCMISS(cachep);
322 - objp = cache_alloc_refill(cachep, flags);
323 + objp = cache_alloc_refill(cachep, flags, must_refill);
324 }
325 return objp;
326 }
327 @@ -3236,7 +3273,7 @@ static void *fallback_alloc(struct kmem_
328 struct zone *zone;
329 enum zone_type high_zoneidx = gfp_zone(flags);
330 void *obj = NULL;
331 - int nid;
332 + int nid, reserve;
333
334 if (flags & __GFP_THISNODE)
335 return NULL;
336 @@ -3272,10 +3309,11 @@ retry:
337 if (local_flags & __GFP_WAIT)
338 local_irq_enable();
339 kmem_flagcheck(cache, flags);
340 - obj = kmem_getpages(cache, local_flags, -1);
341 + obj = kmem_getpages(cache, local_flags, -1, &reserve);
342 if (local_flags & __GFP_WAIT)
343 local_irq_disable();
344 if (obj) {
345 + slab_set_reserve(cache, reserve);
346 /*
347 * Insert into the appropriate per node queues
348 */
349 @@ -3314,6 +3352,9 @@ static void *____cache_alloc_node(struct
350 l3 = cachep->nodelists[nodeid];
351 BUG_ON(!l3);
352
353 + if (unlikely(slab_force_alloc(cachep, flags)))
354 + goto force_grow;
355 +
356 retry:
357 check_irq_off();
358 spin_lock(&l3->list_lock);
359 @@ -3351,6 +3392,7 @@ retry:
360
361 must_grow:
362 spin_unlock(&l3->list_lock);
363 +force_grow:
364 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
365 if (x)
366 goto retry;
367 Index: linux-2.6.26/mm/slob.c
368 ===================================================================
369 --- linux-2.6.26.orig/mm/slob.c
370 +++ linux-2.6.26/mm/slob.c
371 @@ -66,6 +66,7 @@
372 #include <linux/rcupdate.h>
373 #include <linux/list.h>
374 #include <asm/atomic.h>
375 +#include "internal.h"
376
377 /*
378 * slob_block has a field 'units', which indicates size of block if +ve,
379 @@ -183,6 +184,11 @@ struct slob_rcu {
380 static DEFINE_SPINLOCK(slob_lock);
381
382 /*
383 + * tracks the reserve state for the allocator.
384 + */
385 +static int slob_reserve;
386 +
387 +/*
388 * Encode the given size and next info into a free slob block s.
389 */
390 static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
391 @@ -232,7 +238,7 @@ static int slob_last(slob_t *s)
392
393 static void *slob_new_page(gfp_t gfp, int order, int node)
394 {
395 - void *page;
396 + struct page *page;
397
398 #ifdef CONFIG_NUMA
399 if (node != -1)
400 @@ -244,6 +250,8 @@ static void *slob_new_page(gfp_t gfp, in
401 if (!page)
402 return NULL;
403
404 + slob_reserve = page->reserve;
405 +
406 return page_address(page);
407 }
408
409 @@ -309,6 +317,11 @@ static void *slob_alloc(size_t size, gfp
410 slob_t *b = NULL;
411 unsigned long flags;
412
413 + if (unlikely(slob_reserve)) {
414 + if (!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS))
415 + goto grow;
416 + }
417 +
418 if (size < SLOB_BREAK1)
419 slob_list = &free_slob_small;
420 else if (size < SLOB_BREAK2)
421 @@ -347,6 +360,7 @@ static void *slob_alloc(size_t size, gfp
422 }
423 spin_unlock_irqrestore(&slob_lock, flags);
424
425 +grow:
426 /* Not enough space: must allocate a new page */
427 if (!b) {
428 b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);