Move xen patchset to new version's subdir.

[people/pmueller/ipfire-2.x.git] / src / patches / suse-2.6.27.31 / patches.suse / SoN-08-reserve-slub.patch
diff --git a/src/patches/suse-2.6.27.31/patches.suse/SoN-08-reserve-slub.patch b/src/patches/suse-2.6.27.31/patches.suse/SoN-08-reserve-slub.patch

new file mode 100644 (file)

index 0000000..45cc13c
--- /dev/null
+++ b/src/patches/suse-2.6.27.31/patches.suse/SoN-08-reserve-slub.patch
@@ -0,0 +1,428 @@
+From: Peter Zijlstra <a.p.zijlstra@chello.nl> 
+Subject: mm: sl[au]b: add knowledge of reserve pages
+Patch-mainline: No
+References: FATE#303834
+
+Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
+contexts that are entitled to them. This is done to ensure reserve pages don't
+leak out and get consumed.
+
+The basic pattern used for all three allocators is the following: for each
+active slab page we store whether it came from an emergency allocation. When
+it did, make sure the current allocation context would have been able to
+allocate a page from the emergency reserves as well. In that case allow the
+allocation. If not, force a new slab allocation. When that works, the memory
+pressure has lifted enough to allow this context to get an object; otherwise
+fail the allocation.
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Acked-by: Neil Brown <neilb@suse.de>
+Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
+
+---
+ include/linux/slub_def.h |    1 
+ mm/slab.c                |   60 +++++++++++++++++++++++++++++++++++++++--------
+ mm/slob.c                |   16 +++++++++++-
+ mm/slub.c                |   42 +++++++++++++++++++++++++++-----
+ 4 files changed, 102 insertions(+), 17 deletions(-)
+
+Index: linux-2.6.26/mm/slub.c
+===================================================================
+--- linux-2.6.26.orig/mm/slub.c
++++ linux-2.6.26/mm/slub.c
+@@ -23,6 +23,7 @@
+ #include <linux/kallsyms.h>
+ #include <linux/memory.h>
+ #include <linux/math64.h>
++#include "internal.h"
+ 
+ /*
+  * Lock order:
+@@ -1106,7 +1107,8 @@ static void setup_object(struct kmem_cac
+               s->ctor(object);
+ }
+ 
+-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
++static
++struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
+ {
+       struct page *page;
+       void *start;
+@@ -1120,6 +1122,8 @@ static struct page *new_slab(struct kmem
+       if (!page)
+               goto out;
+ 
++      *reserve = page->reserve;
++
+       inc_slabs_node(s, page_to_nid(page), page->objects);
+       page->slab = s;
+       page->flags |= 1 << PG_slab;
+@@ -1503,10 +1507,20 @@ static void *__slab_alloc(struct kmem_ca
+ {
+       void **object;
+       struct page *new;
++      int reserve;
+ 
+       /* We handle __GFP_ZERO in the caller */
+       gfpflags &= ~__GFP_ZERO;
+ 
++      if (unlikely(c->reserve)) {
++              /*
++               * If the current slab is a reserve slab and the current
++               * allocation context does not allow access to the reserves we
++               * must force an allocation to test the current levels.
++               */
++              if (!(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
++                      goto grow_slab;
++      }
+       if (!c->page)
+               goto new_slab;
+ 
+@@ -1520,8 +1534,8 @@ load_freelist:
+       object = c->page->freelist;
+       if (unlikely(!object))
+               goto another_slab;
+-      if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
+-              goto debug;
++      if (unlikely(PageSlubDebug(c->page) || c->reserve))
++              goto slow_path;
+ 
+       c->freelist = object[c->offset];
+       c->page->inuse = c->page->objects;
+@@ -1543,16 +1557,18 @@ new_slab:
+               goto load_freelist;
+       }
+ 
++grow_slab:
+       if (gfpflags & __GFP_WAIT)
+               local_irq_enable();
+ 
+-      new = new_slab(s, gfpflags, node);
++      new = new_slab(s, gfpflags, node, &reserve);
+ 
+       if (gfpflags & __GFP_WAIT)
+               local_irq_disable();
+ 
+       if (new) {
+               c = get_cpu_slab(s, smp_processor_id());
++              c->reserve = reserve;
+               stat(c, ALLOC_SLAB);
+               if (c->page)
+                       flush_slab(s, c);
+@@ -1562,10 +1578,21 @@ new_slab:
+               goto load_freelist;
+       }
+       return NULL;
+-debug:
+-      if (!alloc_debug_processing(s, c->page, object, addr))
++
++slow_path:
++      if (PageSlubDebug(c->page) &&
++                      !alloc_debug_processing(s, c->page, object, addr))
+               goto another_slab;
+ 
++      /*
++       * Avoid the slub fast path in slab_alloc() by not setting
++       * c->freelist and the fast path in slab_free() by making
++       * node_match() fail by setting c->node to -1.
++       *
++       * We use this for debug and reserve checks which need
++       * to be done for each allocation.
++       */
++
+       c->page->inuse++;
+       c->page->freelist = object[c->offset];
+       c->node = -1;
+@@ -2078,10 +2105,11 @@ static struct kmem_cache_node *early_kme
+       struct page *page;
+       struct kmem_cache_node *n;
+       unsigned long flags;
++      int reserve;
+ 
+       BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
+ 
+-      page = new_slab(kmalloc_caches, gfpflags, node);
++      page = new_slab(kmalloc_caches, gfpflags, node, &reserve);
+ 
+       BUG_ON(!page);
+       if (page_to_nid(page) != node) {
+Index: linux-2.6.26/include/linux/slub_def.h
+===================================================================
+--- linux-2.6.26.orig/include/linux/slub_def.h
++++ linux-2.6.26/include/linux/slub_def.h
+@@ -38,6 +38,7 @@ struct kmem_cache_cpu {
+       int node;               /* The node of the page (or -1 for debug) */
+       unsigned int offset;    /* Freepointer offset (in word units) */
+       unsigned int objsize;   /* Size of an object (from kmem_cache) */
++      int reserve;            /* Did the current page come from the reserve */
+ #ifdef CONFIG_SLUB_STATS
+       unsigned stat[NR_SLUB_STAT_ITEMS];
+ #endif
+Index: linux-2.6.26/mm/slab.c
+===================================================================
+--- linux-2.6.26.orig/mm/slab.c
++++ linux-2.6.26/mm/slab.c
+@@ -116,6 +116,8 @@
+ #include      <asm/tlbflush.h>
+ #include      <asm/page.h>
+ 
++#include      "internal.h"
++
+ /*
+  * DEBUG      - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
+  *              0 for faster, smaller code (especially in the critical paths).
+@@ -264,7 +266,8 @@ struct array_cache {
+       unsigned int avail;
+       unsigned int limit;
+       unsigned int batchcount;
+-      unsigned int touched;
++      unsigned int touched:1,
++                   reserve:1;
+       spinlock_t lock;
+       void *entry[];  /*
+                        * Must have this definition in here for the proper
+@@ -760,6 +763,27 @@ static inline struct array_cache *cpu_ca
+       return cachep->array[smp_processor_id()];
+ }
+ 
++/*
++ * If the last page came from the reserves, and the current allocation context
++ * does not have access to them, force an allocation to test the watermarks.
++ */
++static inline int slab_force_alloc(struct kmem_cache *cachep, gfp_t flags)
++{
++      if (unlikely(cpu_cache_get(cachep)->reserve) &&
++                      !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
++              return 1;
++
++      return 0;
++}
++
++static inline void slab_set_reserve(struct kmem_cache *cachep, int reserve)
++{
++      struct array_cache *ac = cpu_cache_get(cachep);
++
++      if (unlikely(ac->reserve != reserve))
++              ac->reserve = reserve;
++}
++
+ static inline struct kmem_cache *__find_general_cachep(size_t size,
+                                                       gfp_t gfpflags)
+ {
+@@ -959,6 +983,7 @@ static struct array_cache *alloc_arrayca
+               nc->limit = entries;
+               nc->batchcount = batchcount;
+               nc->touched = 0;
++              nc->reserve = 0;
+               spin_lock_init(&nc->lock);
+       }
+       return nc;
+@@ -1661,7 +1686,8 @@ __initcall(cpucache_init);
+  * did not request dmaable memory, we might get it, but that
+  * would be relatively rare and ignorable.
+  */
+-static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
++static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid,
++              int *reserve)
+ {
+       struct page *page;
+       int nr_pages;
+@@ -1683,6 +1709,7 @@ static void *kmem_getpages(struct kmem_c
+       if (!page)
+               return NULL;
+ 
++      *reserve = page->reserve;
+       nr_pages = (1 << cachep->gfporder);
+       if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+               add_zone_page_state(page_zone(page),
+@@ -2103,6 +2130,7 @@ static int __init_refok setup_cpu_cache(
+       cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+       cpu_cache_get(cachep)->batchcount = 1;
+       cpu_cache_get(cachep)->touched = 0;
++      cpu_cache_get(cachep)->reserve = 0;
+       cachep->batchcount = 1;
+       cachep->limit = BOOT_CPUCACHE_ENTRIES;
+       return 0;
+@@ -2757,6 +2785,7 @@ static int cache_grow(struct kmem_cache 
+       size_t offset;
+       gfp_t local_flags;
+       struct kmem_list3 *l3;
++      int reserve;
+ 
+       /*
+        * Be lazy and only check for valid flags here,  keeping it out of the
+@@ -2795,7 +2824,7 @@ static int cache_grow(struct kmem_cache 
+        * 'nodeid'.
+        */
+       if (!objp)
+-              objp = kmem_getpages(cachep, local_flags, nodeid);
++              objp = kmem_getpages(cachep, local_flags, nodeid, &reserve);
+       if (!objp)
+               goto failed;
+ 
+@@ -2812,6 +2841,7 @@ static int cache_grow(struct kmem_cache 
+       if (local_flags & __GFP_WAIT)
+               local_irq_disable();
+       check_irq_off();
++      slab_set_reserve(cachep, reserve);
+       spin_lock(&l3->list_lock);
+ 
+       /* Make slab active. */
+@@ -2946,7 +2976,8 @@ bad:
+ #define check_slabp(x,y) do { } while(0)
+ #endif
+ 
+-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
++static void *cache_alloc_refill(struct kmem_cache *cachep,
++              gfp_t flags, int must_refill)
+ {
+       int batchcount;
+       struct kmem_list3 *l3;
+@@ -2956,6 +2987,8 @@ static void *cache_alloc_refill(struct k
+ retry:
+       check_irq_off();
+       node = numa_node_id();
++      if (unlikely(must_refill))
++              goto force_grow;
+       ac = cpu_cache_get(cachep);
+       batchcount = ac->batchcount;
+       if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
+@@ -3023,11 +3056,14 @@ alloc_done:
+ 
+       if (unlikely(!ac->avail)) {
+               int x;
++force_grow:
+               x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+ 
+               /* cache_grow can reenable interrupts, then ac could change. */
+               ac = cpu_cache_get(cachep);
+-              if (!x && ac->avail == 0)       /* no objects in sight? abort */
++
++              /* no objects in sight? abort */
++              if (!x && (ac->avail == 0 || must_refill))
+                       return NULL;
+ 
+               if (!ac->avail)         /* objects refilled by interrupt? */
+@@ -3182,17 +3218,18 @@ static inline void *____cache_alloc(stru
+ {
+       void *objp;
+       struct array_cache *ac;
++      int must_refill = slab_force_alloc(cachep, flags);
+ 
+       check_irq_off();
+ 
+       ac = cpu_cache_get(cachep);
+-      if (likely(ac->avail)) {
++      if (likely(ac->avail && !must_refill)) {
+               STATS_INC_ALLOCHIT(cachep);
+               ac->touched = 1;
+               objp = ac->entry[--ac->avail];
+       } else {
+               STATS_INC_ALLOCMISS(cachep);
+-              objp = cache_alloc_refill(cachep, flags);
++              objp = cache_alloc_refill(cachep, flags, must_refill);
+       }
+       return objp;
+ }
+@@ -3236,7 +3273,7 @@ static void *fallback_alloc(struct kmem_
+       struct zone *zone;
+       enum zone_type high_zoneidx = gfp_zone(flags);
+       void *obj = NULL;
+-      int nid;
++      int nid, reserve;
+ 
+       if (flags & __GFP_THISNODE)
+               return NULL;
+@@ -3272,10 +3309,11 @@ retry:
+               if (local_flags & __GFP_WAIT)
+                       local_irq_enable();
+               kmem_flagcheck(cache, flags);
+-              obj = kmem_getpages(cache, local_flags, -1);
++              obj = kmem_getpages(cache, local_flags, -1, &reserve);
+               if (local_flags & __GFP_WAIT)
+                       local_irq_disable();
+               if (obj) {
++                      slab_set_reserve(cache, reserve);
+                       /*
+                        * Insert into the appropriate per node queues
+                        */
+@@ -3314,6 +3352,9 @@ static void *____cache_alloc_node(struct
+       l3 = cachep->nodelists[nodeid];
+       BUG_ON(!l3);
+ 
++      if (unlikely(slab_force_alloc(cachep, flags)))
++              goto force_grow;
++
+ retry:
+       check_irq_off();
+       spin_lock(&l3->list_lock);
+@@ -3351,6 +3392,7 @@ retry:
+ 
+ must_grow:
+       spin_unlock(&l3->list_lock);
++force_grow:
+       x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+       if (x)
+               goto retry;
+Index: linux-2.6.26/mm/slob.c
+===================================================================
+--- linux-2.6.26.orig/mm/slob.c
++++ linux-2.6.26/mm/slob.c
+@@ -66,6 +66,7 @@
+ #include <linux/rcupdate.h>
+ #include <linux/list.h>
+ #include <asm/atomic.h>
++#include "internal.h"
+ 
+ /*
+  * slob_block has a field 'units', which indicates size of block if +ve,
+@@ -183,6 +184,11 @@ struct slob_rcu {
+ static DEFINE_SPINLOCK(slob_lock);
+ 
+ /*
++ * tracks the reserve state for the allocator.
++ */
++static int slob_reserve;
++
++/*
+  * Encode the given size and next info into a free slob block s.
+  */
+ static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
+@@ -232,7 +238,7 @@ static int slob_last(slob_t *s)
+ 
+ static void *slob_new_page(gfp_t gfp, int order, int node)
+ {
+-      void *page;
++      struct page *page;
+ 
+ #ifdef CONFIG_NUMA
+       if (node != -1)
+@@ -244,6 +250,8 @@ static void *slob_new_page(gfp_t gfp, in
+       if (!page)
+               return NULL;
+ 
++      slob_reserve = page->reserve;
++
+       return page_address(page);
+ }
+ 
+@@ -309,6 +317,11 @@ static void *slob_alloc(size_t size, gfp
+       slob_t *b = NULL;
+       unsigned long flags;
+ 
++      if (unlikely(slob_reserve)) {
++              if (!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS))
++                      goto grow;
++      }
++
+       if (size < SLOB_BREAK1)
+               slob_list = &free_slob_small;
+       else if (size < SLOB_BREAK2)
+@@ -347,6 +360,7 @@ static void *slob_alloc(size_t size, gfp
+       }
+       spin_unlock_irqrestore(&slob_lock, flags);
+ 
++grow:
+       /* Not enough space: must allocate a new page */
+       if (!b) {
+               b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);