From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: mm: sl[au]b: add knowledge of reserve pages
References: FATE#303834

Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
contexts that are entitled to them. This is done to ensure reserve pages
don't leak out and get consumed.

The basic pattern used for all three allocators is the following: for each
active slab page we store whether it came from an emergency allocation. When
we find that it did, make sure the current allocation context would have been
able to allocate a page from the emergency reserves as well. In that case,
allow the allocation. If not, force a new slab allocation. When that succeeds,
the memory pressure has lifted enough to allow this context to get an object;
otherwise fail the allocation.
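
In pseudo-code, the check each allocator gains looks like this (an
illustrative sketch only, not patch code; gfp_to_alloc_flags() and
ALLOC_NO_WATERMARKS are exposed to the slab allocators via mm/internal.h,
and page->reserve is the page allocator's "came from the reserves" marker
set up earlier in this series):

	/* Sketch: may this context consume objects from a reserve slab? */
	static inline int may_use_reserve_slab(int slab_from_reserve, gfp_t gfp)
	{
		if (!slab_from_reserve)
			return 1;	/* ordinary slab, no restriction */
		/* only contexts allowed below the watermarks may continue */
		return !!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS);
	}

A failed check does not fail the allocation outright; the allocator first
tries to grow the cache with a fresh slab, and only fails the allocation
when that fresh allocation fails as well.
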
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Suresh Jayaraman <sjayaraman@suse.de>

---
 include/linux/slub_def.h |    1 
 mm/slab.c                |   60 +++++++++++++++++++++++++++++++++++++++--------
 mm/slob.c                |   16 +++++++++++-
 mm/slub.c                |   42 +++++++++++++++++++++++++++-----
 4 files changed, 102 insertions(+), 17 deletions(-)
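
All three allocators get at gfp_to_alloc_flags() through the new
#include "internal.h" lines below. For reference, the entitlement it tests
amounts to roughly the following (a sketch of the 2.6.26-era page allocator
logic; the authoritative version lives in mm/page_alloc.c):

	/* Sketch: when the page allocator grants ALLOC_NO_WATERMARKS */
	if (!(gfp_mask & __GFP_NOMEMALLOC) && !in_interrupt() &&
	    ((current->flags & PF_MEMALLOC) ||
	     unlikely(test_thread_flag(TIF_MEMDIE))))
		alloc_flags |= ALLOC_NO_WATERMARKS;
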
Index: linux-2.6.26/mm/slub.c
===================================================================
--- linux-2.6.26.orig/mm/slub.c
+++ linux-2.6.26/mm/slub.c
 #include <linux/kallsyms.h>
 #include <linux/memory.h>
 #include <linux/math64.h>
+#include "internal.h"
@@ -1106,7 +1107,8 @@ static void setup_object(struct kmem_cac
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static
+struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
@@ -1120,6 +1122,8 @@ static struct page *new_slab(struct kmem
+	*reserve = page->reserve;
+
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
@@ -1503,10 +1507,20 @@ static void *__slab_alloc(struct kmem_ca
+	int reserve;
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
 
+	if (unlikely(c->reserve)) {
+		/*
+		 * If the current slab is a reserve slab and the current
+		 * allocation context does not allow access to the reserves we
+		 * must force an allocation to test the current levels.
+		 */
+		if (!(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
+			goto grow_slab;
+	}
@@ -1520,8 +1534,8 @@ load_freelist:
 	object = c->page->freelist;
 	if (unlikely(!object))
 		goto another_slab;
-	if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
-		goto debug;
+	if (unlikely(PageSlubDebug(c->page) || c->reserve))
+		goto slow_path;
 
 	c->freelist = object[c->offset];
 	c->page->inuse = c->page->objects;
@@ -1543,16 +1557,18 @@ new_slab:
+grow_slab:
 	if (gfpflags & __GFP_WAIT)
 		local_irq_enable();
 
-	new = new_slab(s, gfpflags, node);
+	new = new_slab(s, gfpflags, node, &reserve);
 
 	if (gfpflags & __GFP_WAIT)
 		local_irq_disable();
 
 	if (new) {
 		c = get_cpu_slab(s, smp_processor_id());
+		c->reserve = reserve;
@@ -1562,10 +1578,21 @@ new_slab:
-debug:
-	if (!alloc_debug_processing(s, c->page, object, addr))
+slow_path:
+	if (PageSlubDebug(c->page) &&
+	    !alloc_debug_processing(s, c->page, object, addr))
 		goto another_slab;
 
+	/*
+	 * Avoid the slub fast path in slab_alloc() by not setting
+	 * c->freelist and the fast path in slab_free() by making
+	 * node_match() fail by setting c->node to -1.
+	 *
+	 * We use this for debug and reserve checks which need
+	 * to be done for each allocation.
+	 */
 	c->page->inuse++;
 	c->page->freelist = object[c->offset];
 	c->node = -1;
@@ -2078,10 +2105,11 @@ static struct kmem_cache_node *early_kme
 	struct kmem_cache_node *n;
+	int reserve;
 
 	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
 
-	page = new_slab(kmalloc_caches, gfpflags, node);
+	page = new_slab(kmalloc_caches, gfpflags, node, &reserve);
 
 	BUG_ON(!page);
 	if (page_to_nid(page) != node) {
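
A note on the slow-path trick used above: SLUB's allocation fast path is
gated on a cached per-cpu freelist, so by never setting c->freelist for
reserve (and debug) slabs, and by setting c->node to -1 to keep the free
fast path off as well, every allocation from such a slab is forced through
__slab_alloc(), where the entitlement check runs. Simplified shape of the
fast path (a sketch, not the exact 2.6.26 code):

	/* slab_alloc(), condensed: with c->freelist left NULL the fast
	 * path is never taken and __slab_alloc() re-checks the reserve
	 * entitlement on every allocation. */
	object = c->freelist;
	if (unlikely(!object || !node_match(c, node)))
		object = __slab_alloc(s, gfpflags, node, addr, c);
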
Index: linux-2.6.26/include/linux/slub_def.h
===================================================================
--- linux-2.6.26.orig/include/linux/slub_def.h
+++ linux-2.6.26/include/linux/slub_def.h
@@ -38,6 +38,7 @@ struct kmem_cache_cpu {
 	int node;		/* The node of the page (or -1 for debug) */
 	unsigned int offset;	/* Freepointer offset (in word units) */
 	unsigned int objsize;	/* Size of an object (from kmem_cache) */
+	int reserve;		/* Did the current page come from the reserve */
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif
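
The new field caches the provenance of the current cpu slab, so the check in
__slab_alloc() never has to touch struct page. Its life cycle, condensed from
the hunks above (the && form below is an equivalent compression of the
patch's nested ifs):

	/* set when a fresh slab is attached to the cpu cache */
	new = new_slab(s, gfpflags, node, &reserve);
	...
	c->reserve = reserve;

	/* read on every slow-path allocation */
	if (unlikely(c->reserve) &&
	    !(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
		...	/* force a new slab to re-test the watermarks */
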
Index: linux-2.6.26/mm/slab.c
===================================================================
--- linux-2.6.26.orig/mm/slab.c
+++ linux-2.6.26/mm/slab.c
 #include <asm/tlbflush.h>
 #include <asm/page.h>
 
+#include "internal.h"
+
 /*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
  *		  0 for faster, smaller code (especially in the critical paths).
@@ -264,7 +266,8 @@ struct array_cache {
 	unsigned int avail;
 	unsigned int limit;
 	unsigned int batchcount;
-	unsigned int touched;
+	unsigned int touched:1,
+		     reserve:1;
 	spinlock_t lock;
 	void *entry[];	/*
 			 * Must have this definition in here for the proper
@@ -760,6 +763,27 @@ static inline struct array_cache *cpu_ca
 	return cachep->array[smp_processor_id()];
 }
 
+/*
+ * If the last page came from the reserves, and the current allocation context
+ * does not have access to them, force an allocation to test the watermarks.
+ */
+static inline int slab_force_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	if (unlikely(cpu_cache_get(cachep)->reserve) &&
+	    !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
+		return 1;
+
+	return 0;
+}
+
+static inline void slab_set_reserve(struct kmem_cache *cachep, int reserve)
+{
+	struct array_cache *ac = cpu_cache_get(cachep);
+
+	if (unlikely(ac->reserve != reserve))
+		ac->reserve = reserve;
+}
+
 static inline struct kmem_cache *__find_general_cachep(size_t size,
@@ -959,6 +983,7 @@ static struct array_cache *alloc_arrayca
 		nc->limit = entries;
 		nc->batchcount = batchcount;
 		nc->touched = 0;
+		nc->reserve = 0;
 		spin_lock_init(&nc->lock);
@@ -1661,7 +1686,8 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+		int *reserve)
@@ -1683,6 +1709,7 @@ static void *kmem_getpages(struct kmem_c
 	if (!page)
 		return NULL;
 
+	*reserve = page->reserve;
 	nr_pages = (1 << cachep->gfporder);
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 		add_zone_page_state(page_zone(page),
@@ -2103,6 +2130,7 @@ static int __init_refok setup_cpu_cache(
 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
 	cpu_cache_get(cachep)->batchcount = 1;
 	cpu_cache_get(cachep)->touched = 0;
+	cpu_cache_get(cachep)->reserve = 0;
 	cachep->batchcount = 1;
 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
@@ -2757,6 +2785,7 @@ static int cache_grow(struct kmem_cache
 	struct kmem_list3 *l3;
+	int reserve;
 
 	/*
 	 * Be lazy and only check for valid flags here, keeping it out of the
@@ -2795,7 +2824,7 @@ static int cache_grow(struct kmem_cache
-	objp = kmem_getpages(cachep, local_flags, nodeid);
+	objp = kmem_getpages(cachep, local_flags, nodeid, &reserve);
@@ -2812,6 +2841,7 @@ static int cache_grow(struct kmem_cache
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	check_irq_off();
+	slab_set_reserve(cachep, reserve);
 	spin_lock(&l3->list_lock);
 
 	/* Make slab active. */
@@ -2946,7 +2976,8 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep,
+		gfp_t flags, int must_refill)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
@@ -2956,6 +2987,8 @@ static void *cache_alloc_refill(struct k
 	check_irq_off();
 	node = numa_node_id();
+	if (unlikely(must_refill))
+		goto force_grow;
 	ac = cpu_cache_get(cachep);
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3023,11 +3056,14 @@ alloc_done:
 	if (unlikely(!ac->avail)) {
 		int x;
+force_grow:
 		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
-		if (!x && ac->avail == 0)	/* no objects in sight? abort */
-			return NULL;
+		/* no objects in sight? abort */
+		if (!x && (ac->avail == 0 || must_refill))
+			return NULL;
 
 		if (!ac->avail)		/* objects refilled by interrupt? */
@@ -3182,17 +3218,18 @@ static inline void *____cache_alloc(stru
 	void *objp;
 	struct array_cache *ac;
+	int must_refill = slab_force_alloc(cachep, flags);
 
 	check_irq_off();
 
 	ac = cpu_cache_get(cachep);
-	if (likely(ac->avail)) {
+	if (likely(ac->avail && !must_refill)) {
 		STATS_INC_ALLOCHIT(cachep);
 		ac->touched = 1;
 		objp = ac->entry[--ac->avail];
 	} else {
 		STATS_INC_ALLOCMISS(cachep);
-		objp = cache_alloc_refill(cachep, flags);
+		objp = cache_alloc_refill(cachep, flags, must_refill);
 	}
@@ -3236,7 +3273,7 @@ static void *fallback_alloc(struct kmem_
 	enum zone_type high_zoneidx = gfp_zone(flags);
-	int nid;
+	int nid, reserve;
 
 	if (flags & __GFP_THISNODE)
@@ -3272,10 +3309,11 @@ retry:
 		if (local_flags & __GFP_WAIT)
 			local_irq_enable();
 		kmem_flagcheck(cache, flags);
-		obj = kmem_getpages(cache, local_flags, -1);
+		obj = kmem_getpages(cache, local_flags, -1, &reserve);
 		if (local_flags & __GFP_WAIT)
 			local_irq_disable();
 		if (obj) {
+			slab_set_reserve(cache, reserve);
 			/*
 			 * Insert into the appropriate per node queues
@@ -3314,6 +3352,9 @@ static void *____cache_alloc_node(struct
 	l3 = cachep->nodelists[nodeid];
 	BUG_ON(!l3);
 
+	if (unlikely(slab_force_alloc(cachep, flags)))
+		goto force_grow;
+
 retry:
 	check_irq_off();
 	spin_lock(&l3->list_lock);
@@ -3351,6 +3392,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
+force_grow:
 	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
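
One SLAB-specific detail: slab_set_reserve() only writes when the state
actually flips. The patch does not spell out why, but a plausible reading is
that it avoids dirtying the hot per-cpu array_cache line on every
cache_grow(); a sketch restating the helper with that assumption made
explicit:

	static inline void slab_set_reserve(struct kmem_cache *cachep, int reserve)
	{
		struct array_cache *ac = cpu_cache_get(cachep);

		/* common case: state unchanged, skip the store so the
		 * per-cpu array_cache cacheline stays clean */
		if (unlikely(ac->reserve != reserve))
			ac->reserve = reserve;
	}
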
Index: linux-2.6.26/mm/slob.c
===================================================================
--- linux-2.6.26.orig/mm/slob.c
+++ linux-2.6.26/mm/slob.c
 #include <linux/rcupdate.h>
 #include <linux/list.h>
 #include <asm/atomic.h>
+#include "internal.h"
 
 /*
  * slob_block has a field 'units', which indicates size of block if +ve,
@@ -183,6 +184,11 @@ struct slob_rcu {
 static DEFINE_SPINLOCK(slob_lock);
 
+/*
+ * tracks the reserve state for the allocator.
+ */
+static int slob_reserve;
+
 /*
  * Encode the given size and next info into a free slob block s.
  */
 static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
@@ -232,7 +238,7 @@ static int slob_last(slob_t *s)
 static void *slob_new_page(gfp_t gfp, int order, int node)
 {
-	void *page;
+	struct page *page;
@@ -244,6 +250,8 @@ static void *slob_new_page(gfp_t gfp, in
 	if (!page)
 		return NULL;
 
+	slob_reserve = page->reserve;
+
 	return page_address(page);
@@ -309,6 +317,11 @@ static void *slob_alloc(size_t size, gfp
 	slob_t *b = NULL;
 	unsigned long flags;
 
+	if (unlikely(slob_reserve)) {
+		if (!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS))
+			goto grow;
+	}
+
 	if (size < SLOB_BREAK1)
 		slob_list = &free_slob_small;
 	else if (size < SLOB_BREAK2)
@@ -347,6 +360,7 @@ static void *slob_alloc(size_t size, gfp
 	}
 	spin_unlock_irqrestore(&slob_lock, flags);
 
+grow:
 	/* Not enough space: must allocate a new page */
 	if (!b) {
 		b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);
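
Note the asymmetry with SLAB and SLUB: SLOB keeps a single global
slob_reserve instead of per-cache or per-cpu state, matching its global free
lists protected by slob_lock. The effect of a failed entitlement check is
correspondingly simple (a condensed sketch of the slob_alloc() flow above):

	/* slob_alloc(), condensed: reserve state without entitlement
	 * skips the free-list scan and goes straight to the page
	 * allocator, which re-tests the watermarks in slob_new_page(). */
	if (unlikely(slob_reserve) &&
	    !(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS))
		goto grow;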