void *object;
};
+/* Structure holding parameters for get_partial_node_bulk() */
+struct partial_bulk_context {
+ gfp_t flags;
+ unsigned int min_objects;
+ unsigned int max_objects;
+ struct list_head slabs;
+};
+
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
if (slab->freelist == old->freelist &&
slab->counters == old->counters) {
slab->freelist = new->freelist;
- slab->counters = new->counters;
+ /* prevent tearing for the read in get_partial_node_bulk() */
+ WRITE_ONCE(slab->counters, new->counters);
ret = true;
}
slab_unlock(slab);
stat(s, SHEAF_FREE);
}
-static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p);
-
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max);
static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
gfp_t gfp)
if (!to_fill)
return 0;
- filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
- &sheaf->objects[sheaf->size]);
+ filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
+ to_fill, to_fill);
sheaf->size += filled;
#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
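+/*
+ * Bulk variant of get_partial_node(): detach slabs from the node's partial
+ * list until they collectively appear to hold at least pc->min_objects free
+ * objects (the counts are read racily), trying not to exceed pc->max_objects.
+ * Detached slabs are collected on pc->slabs for the caller to drain.
+ */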
+static bool get_partial_node_bulk(struct kmem_cache *s,
+ struct kmem_cache_node *n,
+ struct partial_bulk_context *pc)
+{
+ struct slab *slab, *slab2;
+ unsigned int total_free = 0;
+ unsigned long flags;
+
+ /* Racy check to avoid taking the lock unnecessarily. */
+ if (!n || data_race(!n->nr_partial))
+ return false;
+
+ INIT_LIST_HEAD(&pc->slabs);
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+ struct freelist_counters flc;
+ unsigned int slab_free;
+
+ if (!pfmemalloc_match(slab, pc->flags))
+ continue;
+
+ /*
+ * Racily determine the number of free objects in the slab.
+ *
+ * slab_free is a lower bound due to possible subsequent
+ * concurrent freeing, so the caller may get more objects than
+ * requested and must handle that.
+ */
+ flc.counters = data_race(READ_ONCE(slab->counters));
+ slab_free = flc.objects - flc.inuse;
+
+ /* we already have min and this slab would get us over the max */
+ if (total_free >= pc->min_objects &&
+     total_free + slab_free > pc->max_objects)
+ break;
+
+ remove_partial(n, slab);
+
+ list_add(&slab->slab_list, &pc->slabs);
+
+ total_free += slab_free;
+ if (total_free >= pc->max_objects)
+ break;
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return total_free > 0;
+}
+
/*
* Try to allocate a partial slab from a specific node.
*/
return old.freelist;
}
+/*
+ * Get the slab's freelist and do not freeze the slab.
+ *
+ * Assumes the slab is isolated from the node's partial list and not frozen.
+ *
+ * Assumes this is performed only for caches without debugging, so we
+ * don't need to worry about adding the slab to the full list.
+ */
+static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
+{
+ struct freelist_counters old, new;
+
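+ /*
+ * Atomically take the whole freelist: replace it with NULL and mark
+ * all objects in use, retrying if a concurrent free changes the
+ * freelist or counters.
+ */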
+ do {
+ old.freelist = slab->freelist;
+ old.counters = slab->counters;
+
+ new.freelist = NULL;
+ new.counters = old.counters;
+ VM_WARN_ON_ONCE(new.frozen);
+
+ new.inuse = old.objects;
+
+ } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
+
+ return old.freelist;
+}
+
/*
* Freeze the partial slab and return the pointer to the freelist.
*/
return old.freelist;
}
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ *
+ * Note that we also wipe custom freelist pointers.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
+{
+ if (unlikely(slab_want_init_on_free(s)) && obj &&
+ !freeptr_outside_object(s))
+ memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+ 0, sizeof(void *));
+}
+
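+/*
+ * Grab up to @count objects from the freelist of a newly allocated slab and
+ * store them in @p. If free objects remain afterwards, the slab is added to
+ * the node's partial list; when spinning is not allowed and the list_lock
+ * cannot be taken by trylock, the slab is deferred for deactivation and 0 is
+ * returned.
+ */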
+static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
+ void **p, unsigned int count, bool allow_spin)
+{
+ unsigned int allocated = 0;
+ struct kmem_cache_node *n;
+ bool needs_add_partial;
+ unsigned long flags;
+ void *object;
+
+ /*
+ * Are we going to put the slab on the partial list?
+ * Note slab->inuse is 0 on a new slab.
+ */
+ needs_add_partial = (slab->objects > count);
+
+ if (!allow_spin && needs_add_partial) {
+
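+ /*
+ * We may not spin on the list_lock, so try to take it now and
+ * hold it until the slab has been added to the partial list.
+ */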
+ n = get_node(s, slab_nid(slab));
+
+ if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+ /* Unlucky, discard newly allocated slab */
+ defer_deactivate_slab(slab, NULL);
+ return 0;
+ }
+ }
+
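+ /* Hand out objects from the slab's freelist one by one. */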
+ object = slab->freelist;
+ while (object && allocated < count) {
+ p[allocated] = object;
+ object = get_freepointer(s, object);
+ maybe_wipe_obj_freeptr(s, p[allocated]);
+
+ slab->inuse++;
+ allocated++;
+ }
+ slab->freelist = object;
+
+ if (needs_add_partial) {
+
+ if (allow_spin) {
+ n = get_node(s, slab_nid(slab));
+ spin_lock_irqsave(&n->list_lock, flags);
+ }
+ add_partial(n, slab, DEACTIVATE_TO_HEAD);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
+ return allocated;
+}
+
/*
* Slow path. The lockless freelist is empty or we need to perform
* debugging duties.
return object;
}
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- *
- * Note that we also wipe custom freelist pointers.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
- void *obj)
-{
- if (unlikely(slab_want_init_on_free(s)) && obj &&
- !freeptr_outside_object(s))
- memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
- 0, sizeof(void *));
-}
-
static __fastpath_inline
struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{
return ret;
}
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p);
+
/*
* returns a sheaf that has at least the requested size
* when prefilling is needed, do so with given gfp flags
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
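+/*
+ * Fill the array @p with at least @min and at most @max objects (best
+ * effort), first by draining slabs detached in bulk from the local node's
+ * partial list, then by allocating new slabs. Returns the number of objects
+ * provided, which may be below @min if allocations fail.
+ */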
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max)
+{
+ struct partial_bulk_context pc;
+ struct slab *slab, *slab2;
+ unsigned int refilled = 0;
+ unsigned long flags;
+ void *object;
+ int node;
+
+ pc.flags = gfp;
+ pc.min_objects = min;
+ pc.max_objects = max;
+
+ node = numa_mem_id();
+
+ if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+ return 0;
+
+ /* TODO: consider also other nodes? */
+ if (!get_partial_node_bulk(s, get_node(s, node), &pc))
+ goto new_slab;
+
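+ /* Drain the freelists of the detached slabs into @p. */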
+ list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+ list_del(&slab->slab_list);
+
+ object = get_freelist_nofreeze(s, slab);
+
+ while (object && refilled < max) {
+ p[refilled] = object;
+ object = get_freepointer(s, object);
+ maybe_wipe_obj_freeptr(s, p[refilled]);
+
+ refilled++;
+ }
+
+ /*
+ * The freelist had more objects than we can accommodate, so free
+ * the rest back. We can treat it like a detached freelist; we just
+ * need to find the tail object.
+ */
+ if (unlikely(object)) {
+ void *head = object;
+ void *tail;
+ int cnt = 0;
+
+ do {
+ tail = object;
+ cnt++;
+ object = get_freepointer(s, object);
+ } while (object);
+ do_slab_free(s, slab, head, tail, cnt, _RET_IP_);
+ }
+
+ if (refilled >= max)
+ break;
+ }
+
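+ /*
+ * We stopped draining early. Return the unprocessed slabs to the
+ * partial list, except for completely free ones when the list is
+ * already long enough.
+ */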
+ if (unlikely(!list_empty(&pc.slabs))) {
+ struct kmem_cache_node *n = get_node(s, node);
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+ if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
+ continue;
+
+ list_del(&slab->slab_list);
+ add_partial(n, slab, DEACTIVATE_TO_HEAD);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* any slabs left are completely free and can be discarded */
+ list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+ list_del(&slab->slab_list);
+ discard_slab(s, slab);
+ }
+ }
+
+ if (likely(refilled >= min))
+ goto out;
+
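+ /*
+ * Not enough objects obtained from partial slabs; allocate new slabs
+ * until we reach the minimum or the allocation fails.
+ */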
+new_slab:
+
+ slab = new_slab(s, pc.flags, node);
+ if (!slab)
+ goto out;
+
+ stat(s, ALLOC_SLAB);
+
+ /*
+ * TODO: possible optimization - if we know we will consume the whole
+ * slab we might skip creating the freelist?
+ */
+ refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
+ /* allow_spin = */ true);
+
+ if (refilled < min)
+ goto new_slab;
+out:
+
+ return refilled;
+}
+
static inline
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)