slab: Introduce kmalloc_nolock() and kfree_nolock().

author Alexei Starovoitov <ast@kernel.org>

Tue, 9 Sep 2025 01:00:07 +0000 (18:00 -0700)

committer Vlastimil Babka <vbabka@suse.cz>

Mon, 29 Sep 2025 07:42:36 +0000 (09:42 +0200)
author Alexei Starovoitov <ast@kernel.org>
Tue, 9 Sep 2025 01:00:07 +0000 (18:00 -0700)
committer Vlastimil Babka <vbabka@suse.cz>
Mon, 29 Sep 2025 07:42:36 +0000 (09:42 +0200)
diff --git a/include/linux/kasan.h b/include/linux/kasan.h

index 890011071f2b142b77ecf7eb0d08dd17546a2dbf..acdc8cb0152e6ff2d850c4a2bdda1ae70b276664 100644 (file)
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
  }
  
  bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
-                      bool still_accessible);
+                      bool still_accessible, bool no_quarantine);
  /**
   * kasan_slab_free - Poison, initialize, and quarantine a slab object.
   * @object: Object to be freed.
@@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
   * @Return true if KASAN took ownership of the object; false otherwise.
   */
  static __always_inline bool kasan_slab_free(struct kmem_cache *s,
-                                               void *object, bool init,
-                                               bool still_accessible)
+                                           void *object, bool init,
+                                           bool still_accessible,
+                                           bool no_quarantine)
  {
         if (kasan_enabled())
-               return __kasan_slab_free(s, object, init, still_accessible);
+               return __kasan_slab_free(s, object, init, still_accessible,
+                                        no_quarantine);
         return false;
  }
  
@@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object)
  }
  
  static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
-                                  bool init, bool still_accessible)
+                                  bool init, bool still_accessible,
+                                  bool no_quarantine)
  {
         return false;
  }
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index d254c0b96d0d50e623551aa8cc3f0f3535ffcafd..82563236f35c7b5af2e2b89f0c8d67d6536f693e 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -358,6 +358,8 @@ enum objext_flags {
          * MEMCG_DATA_OBJEXTS.
          */
         OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL,
+       /* slabobj_ext vector allocated with kmalloc_nolock() */
+       OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG,
         /* the next bit after the last actual flag */
         __NR_OBJEXTS_FLAGS  = (__FIRST_OBJEXT_FLAG << 1),
  };
diff --git a/include/linux/slab.h b/include/linux/slab.h

index 680193356ac7a22f9df5cd9b71ff8b81e26404ad..561597dd216496467d758515986a450b11952261 100644 (file)
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -501,6 +501,7 @@ void * __must_check krealloc_noprof(const void *objp, size_t new_size,
  #define krealloc(...)                          alloc_hooks(krealloc_noprof(__VA_ARGS__))
  
  void kfree(const void *objp);
+void kfree_nolock(const void *objp);
  void kfree_sensitive(const void *objp);
  size_t __ksize(const void *objp);
  
@@ -957,6 +958,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f
  }
  #define kmalloc(...)                           alloc_hooks(kmalloc_noprof(__VA_ARGS__))
  
+void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node);
+#define kmalloc_nolock(...)                    alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__))
+
  #define kmem_buckets_alloc(_b, _size, _flags)  \
         alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))
  
diff --git a/mm/Kconfig b/mm/Kconfig

index e443fe8cd6cf256ca20b8a58b95d18b6f5448f47..202e044f2b4d7236f7b24c09e876d84ad3d437b1 100644 (file)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -194,6 +194,7 @@ menu "Slab allocator options"
  
  config SLUB
         def_bool y
+       select IRQ_WORK
  
  config KVFREE_RCU_BATCHED
         def_bool y
diff --git a/mm/kasan/common.c b/mm/kasan/common.c

index 9142964ab9c9572fe6bc1779b73005ca4e7854d4..3264900b942f91d268a4d00869861266012c5b9c 100644 (file)
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -252,7 +252,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object,
  }
  
  bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
-                      bool still_accessible)
+                      bool still_accessible, bool no_quarantine)
  {
         if (!kasan_arch_is_ready() || is_kfence_address(object))
                 return false;
@@ -274,6 +274,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
  
         poison_slab_object(cache, object, init);
  
+       if (no_quarantine)
+               return false;
+
         /*
          * If the object is put into quarantine, do not let slab put the object
          * onto the freelist for now. The object's metadata is kept until the
diff --git a/mm/slab.h b/mm/slab.h

index 43245d9207b6d58e42bf576ad6ac2c066845b350..35e533e59b077c795803b77f382adebfff4c4138 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -57,6 +57,10 @@ struct slab {
                 struct {
                         union {
                                 struct list_head slab_list;
+                               struct { /* For deferred deactivate_slab() */
+                                       struct llist_node llnode;
+                                       void *flush_freelist;
+                               };
  #ifdef CONFIG_SLUB_CPU_PARTIAL
                                 struct {
                                         struct slab *next;
@@ -662,6 +666,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
  void __check_heap_object(const void *ptr, unsigned long n,
                          const struct slab *slab, bool to_user);
  
+void defer_free_barrier(void);
+
  static inline bool slub_debug_orig_size(struct kmem_cache *s)
  {
         return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
diff --git a/mm/slab_common.c b/mm/slab_common.c

index b6601e0fe598e24bd8d456dce4fc82c65b342bfd..932d13ada36c0d0c97931f5a2108566922f29236 100644 (file)
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -510,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
                 rcu_barrier();
         }
  
+       /* Wait for deferred work from kmalloc/kfree_nolock() */
+       defer_free_barrier();
+
         cpus_read_lock();
         mutex_lock(&slab_mutex);
  
diff --git a/mm/slub.c b/mm/slub.c

index 189cd5aa4ac462ecec582957e5058c43dfc51dd1..f9f7f3942074fd856ce25ef37df3b1cbad80f121 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -44,7 +44,8 @@
  #include <kunit/test.h>
  #include <kunit/test-bug.h>
  #include <linux/sort.h>
-
+#include <linux/irq_work.h>
+#include <linux/kprobes.h>
  #include <linux/debugfs.h>
  #include <trace/events/kmem.h>
  
@@ -426,7 +427,7 @@ struct kmem_cache_cpu {
  #ifdef CONFIG_SLUB_CPU_PARTIAL
         struct slab *partial;   /* Partially allocated slabs */
  #endif
-       local_lock_t lock;      /* Protects the fields above */
+       local_trylock_t lock;   /* Protects the fields above */
  #ifdef CONFIG_SLUB_STATS
         unsigned int stat[NR_SLUB_STAT_ITEMS];
  #endif
@@ -2079,6 +2080,7 @@ static inline void init_slab_obj_exts(struct slab *slab)
  int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
                         gfp_t gfp, bool new_slab)
  {
+       bool allow_spin = gfpflags_allow_spinning(gfp);
         unsigned int objects = objs_per_slab(s, slab);
         unsigned long new_exts;
         unsigned long old_exts;
@@ -2087,8 +2089,22 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
         gfp &= ~OBJCGS_CLEAR_MASK;
         /* Prevent recursive extension vector allocation */
         gfp |= __GFP_NO_OBJ_EXT;
-       vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
-                          slab_nid(slab));
+
+       /*
+        * Note that allow_spin may be false during early boot and its
+        * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting
+        * architectures with cmpxchg16b, early obj_exts will be missing for
+        * very early allocations on architectures without it.
+        */
+       if (unlikely(!allow_spin)) {
+               size_t sz = objects * sizeof(struct slabobj_ext);
+
+               vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
+                                    slab_nid(slab));
+       } else {
+               vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
+                                  slab_nid(slab));
+       }
         if (!vec) {
                 /* Mark vectors which failed to allocate */
                 if (new_slab)
@@ -2098,6 +2114,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
         }
  
         new_exts = (unsigned long)vec;
+       if (unlikely(!allow_spin))
+               new_exts |= OBJEXTS_NOSPIN_ALLOC;
  #ifdef CONFIG_MEMCG
         new_exts |= MEMCG_DATA_OBJEXTS;
  #endif
@@ -2118,7 +2136,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
                  * objcg vector should be reused.
                  */
                 mark_objexts_empty(vec);
-               kfree(vec);
+               if (unlikely(!allow_spin))
+                       kfree_nolock(vec);
+               else
+                       kfree(vec);
                 return 0;
         }
  
@@ -2142,7 +2163,10 @@ static inline void free_slab_obj_exts(struct slab *slab)
          * the extension for obj_exts is expected to be NULL.
          */
         mark_objexts_empty(obj_exts);
-       kfree(obj_exts);
+       if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC))
+               kfree_nolock(obj_exts);
+       else
+               kfree(obj_exts);
         slab->obj_exts = 0;
  }
  
@@ -2476,7 +2500,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
  
         }
         /* KASAN might put x into memory quarantine, delaying its reuse. */
-       return !kasan_slab_free(s, x, init, still_accessible);
+       return !kasan_slab_free(s, x, init, still_accessible, false);
  }
  
  static __fastpath_inline
@@ -2981,13 +3005,17 @@ static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
   * Slab allocation and freeing
   */
  static inline struct slab *alloc_slab_page(gfp_t flags, int node,
-               struct kmem_cache_order_objects oo)
+                                          struct kmem_cache_order_objects oo,
+                                          bool allow_spin)
  {
         struct folio *folio;
         struct slab *slab;
         unsigned int order = oo_order(oo);
  
-       if (node == NUMA_NO_NODE)
+       if (unlikely(!allow_spin))
+               folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
+                                                                 node, order);
+       else if (node == NUMA_NO_NODE)
                 folio = (struct folio *)alloc_frozen_pages(flags, order);
         else
                 folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
@@ -3137,6 +3165,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
  
  static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
  {
+       bool allow_spin = gfpflags_allow_spinning(flags);
         struct slab *slab;
         struct kmem_cache_order_objects oo = s->oo;
         gfp_t alloc_gfp;
@@ -3156,7 +3185,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
         if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
                 alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
  
-       slab = alloc_slab_page(alloc_gfp, node, oo);
+       /*
+        * __GFP_RECLAIM could be cleared on the first allocation attempt,
+        * so pass allow_spin flag directly.
+        */
+       slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
         if (unlikely(!slab)) {
                 oo = s->min;
                 alloc_gfp = flags;
@@ -3164,7 +3197,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
                  * Allocation may have failed due to fragmentation.
                  * Try a lower order alloc if possible
                  */
-               slab = alloc_slab_page(alloc_gfp, node, oo);
+               slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
                 if (unlikely(!slab))
                         return NULL;
                 stat(s, ORDER_FALLBACK);
@@ -3333,33 +3366,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
         return object;
  }
  
+static void defer_deactivate_slab(struct slab *slab, void *flush_freelist);
+
  /*
   * Called only for kmem_cache_debug() caches to allocate from a freshly
   * allocated slab. Allocate a single object instead of whole freelist
   * and put the slab to the partial (or full) list.
   */
-static void *alloc_single_from_new_slab(struct kmem_cache *s,
-                                       struct slab *slab, int orig_size)
+static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
+                                       int orig_size, gfp_t gfpflags)
  {
+       bool allow_spin = gfpflags_allow_spinning(gfpflags);
         int nid = slab_nid(slab);
         struct kmem_cache_node *n = get_node(s, nid);
         unsigned long flags;
         void *object;
  
+       if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
+               /* Unlucky, discard newly allocated slab */
+               slab->frozen = 1;
+               defer_deactivate_slab(slab, NULL);
+               return NULL;
+       }
  
         object = slab->freelist;
         slab->freelist = get_freepointer(s, object);
         slab->inuse = 1;
  
-       if (!alloc_debug_processing(s, slab, object, orig_size))
+       if (!alloc_debug_processing(s, slab, object, orig_size)) {
                 /*
                  * It's not really expected that this would fail on a
                  * freshly allocated slab, but a concurrent memory
                  * corruption in theory could cause that.
+                * Leak memory of allocated slab.
                  */
+               if (!allow_spin)
+                       spin_unlock_irqrestore(&n->list_lock, flags);
                 return NULL;
+       }
  
-       spin_lock_irqsave(&n->list_lock, flags);
+       if (allow_spin)
+               spin_lock_irqsave(&n->list_lock, flags);
  
         if (slab->inuse == slab->objects)
                 add_full(s, n, slab);
@@ -3400,7 +3447,10 @@ static struct slab *get_partial_node(struct kmem_cache *s,
         if (!n || !n->nr_partial)
                 return NULL;
  
-       spin_lock_irqsave(&n->list_lock, flags);
+       if (gfpflags_allow_spinning(pc->flags))
+               spin_lock_irqsave(&n->list_lock, flags);
+       else if (!spin_trylock_irqsave(&n->list_lock, flags))
+               return NULL;
         list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
                 if (!pfmemalloc_match(slab, pc->flags))
                         continue;
@@ -3606,7 +3656,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
                 lockdep_register_key(&s->lock_key);
         for_each_possible_cpu(cpu) {
                 c = per_cpu_ptr(s->cpu_slab, cpu);
-               local_lock_init(&c->lock);
+               local_trylock_init(&c->lock);
                 if (finegrain_lockdep)
                         lockdep_set_class(&c->lock, &s->lock_key);
                 c->tid = init_tid(cpu);
@@ -3699,6 +3749,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
         }
  }
  
+/*
+ * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
+ * can be acquired without a deadlock before invoking the function.
+ *
+ * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
+ * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
+ * and kmalloc() is not used in an unsupported context.
+ *
+ * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
+ * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
+ * lockdep_assert() will catch a bug in case:
+ * #1
+ * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
+ * or
+ * #2
+ * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
+ *
+ * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
+ * disabled context. The lock will always be acquired and if needed it
+ * will block and sleep until the lock is available.
+ * #1 is possible in !PREEMPT_RT only.
+ * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
+ * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
+ *    tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
+ *
+ * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
+ */
+#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
+#define local_lock_cpu_slab(s, flags)  \
+       local_lock_irqsave(&(s)->cpu_slab->lock, flags)
+#else
+#define local_lock_cpu_slab(s, flags)                                         \
+       do {                                                                   \
+               bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
+               lockdep_assert(__l);                                           \
+       } while (0)
+#endif
+
+#define local_unlock_cpu_slab(s, flags)        \
+       local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
+
  #ifdef CONFIG_SLUB_CPU_PARTIAL
  static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
  {
@@ -3783,7 +3874,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
         unsigned long flags;
         int slabs = 0;
  
-       local_lock_irqsave(&s->cpu_slab->lock, flags);
+       local_lock_cpu_slab(s, flags);
  
         oldslab = this_cpu_read(s->cpu_slab->partial);
  
@@ -3808,7 +3899,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
  
         this_cpu_write(s->cpu_slab->partial, slab);
  
-       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+       local_unlock_cpu_slab(s, flags);
  
         if (slab_to_put) {
                 __put_partials(s, slab_to_put);
@@ -4323,6 +4414,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                           unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
  {
+       bool allow_spin = gfpflags_allow_spinning(gfpflags);
         void *freelist;
         struct slab *slab;
         unsigned long flags;
@@ -4348,9 +4440,21 @@ reread_slab:
         if (unlikely(!node_match(slab, node))) {
                 /*
                  * same as above but node_match() being false already
-                * implies node != NUMA_NO_NODE
+                * implies node != NUMA_NO_NODE.
+                *
+                * We don't strictly honor pfmemalloc and NUMA preferences
+                * when !allow_spin because:
+                *
+                * 1. Most kmalloc() users allocate objects on the local node,
+                *    so kmalloc_nolock() tries not to interfere with them by
+                *    deactivating the cpu slab.
+                *
+                * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
+                *    unnecessary slab allocations even when n->partial list
+                *    is not empty.
                  */
-               if (!node_isset(node, slab_nodes)) {
+               if (!node_isset(node, slab_nodes) ||
+                   !allow_spin) {
                         node = NUMA_NO_NODE;
                 } else {
                         stat(s, ALLOC_NODE_MISMATCH);
@@ -4363,13 +4467,14 @@ reread_slab:
          * PFMEMALLOC but right now, we are losing the pfmemalloc
          * information when the page leaves the per-cpu allocator
          */
-       if (unlikely(!pfmemalloc_match(slab, gfpflags)))
+       if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
                 goto deactivate_slab;
  
         /* must check again c->slab in case we got preempted and it changed */
-       local_lock_irqsave(&s->cpu_slab->lock, flags);
+       local_lock_cpu_slab(s, flags);
+
         if (unlikely(slab != c->slab)) {
-               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+               local_unlock_cpu_slab(s, flags);
                 goto reread_slab;
         }
         freelist = c->freelist;
@@ -4381,7 +4486,7 @@ reread_slab:
         if (!freelist) {
                 c->slab = NULL;
                 c->tid = next_tid(c->tid);
-               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+               local_unlock_cpu_slab(s, flags);
                 stat(s, DEACTIVATE_BYPASS);
                 goto new_slab;
         }
@@ -4400,34 +4505,34 @@ load_freelist:
         VM_BUG_ON(!c->slab->frozen);
         c->freelist = get_freepointer(s, freelist);
         c->tid = next_tid(c->tid);
-       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+       local_unlock_cpu_slab(s, flags);
         return freelist;
  
  deactivate_slab:
  
-       local_lock_irqsave(&s->cpu_slab->lock, flags);
+       local_lock_cpu_slab(s, flags);
         if (slab != c->slab) {
-               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+               local_unlock_cpu_slab(s, flags);
                 goto reread_slab;
         }
         freelist = c->freelist;
         c->slab = NULL;
         c->freelist = NULL;
         c->tid = next_tid(c->tid);
-       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+       local_unlock_cpu_slab(s, flags);
         deactivate_slab(s, slab, freelist);
  
  new_slab:
  
  #ifdef CONFIG_SLUB_CPU_PARTIAL
         while (slub_percpu_partial(c)) {
-               local_lock_irqsave(&s->cpu_slab->lock, flags);
+               local_lock_cpu_slab(s, flags);
                 if (unlikely(c->slab)) {
-                       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+                       local_unlock_cpu_slab(s, flags);
                         goto reread_slab;
                 }
                 if (unlikely(!slub_percpu_partial(c))) {
-                       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+                       local_unlock_cpu_slab(s, flags);
                         /* we were preempted and partial list got empty */
                         goto new_objects;
                 }
@@ -4436,7 +4541,8 @@ new_slab:
                 slub_set_percpu_partial(c, slab);
  
                 if (likely(node_match(slab, node) &&
-                          pfmemalloc_match(slab, gfpflags))) {
+                          pfmemalloc_match(slab, gfpflags)) ||
+                   !allow_spin) {
                         c->slab = slab;
                         freelist = get_freelist(s, slab);
                         VM_BUG_ON(!freelist);
@@ -4444,7 +4550,7 @@ new_slab:
                         goto load_freelist;
                 }
  
-               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+               local_unlock_cpu_slab(s, flags);
  
                 slab->next = NULL;
                 __put_partials(s, slab);
@@ -4466,8 +4572,13 @@ new_objects:
          *    allocating new page from other nodes
          */
         if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
-                    && try_thisnode))
-               pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+                    && try_thisnode)) {
+               if (unlikely(!allow_spin))
+                       /* Do not upgrade gfp to NOWAIT from more restrictive mode */
+                       pc.flags = gfpflags | __GFP_THISNODE;
+               else
+                       pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+       }
  
         pc.orig_size = orig_size;
         slab = get_partial(s, node, &pc);
@@ -4506,7 +4617,7 @@ new_objects:
         stat(s, ALLOC_SLAB);
  
         if (kmem_cache_debug(s)) {
-               freelist = alloc_single_from_new_slab(s, slab, orig_size);
+               freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
  
                 if (unlikely(!freelist))
                         goto new_objects;
@@ -4528,7 +4639,7 @@ new_objects:
  
         inc_slabs_node(s, slab_nid(slab), slab->objects);
  
-       if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
+       if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
                 /*
                  * For !pfmemalloc_match() case we don't load freelist so that
                  * we don't make further mismatched allocations easier.
@@ -4539,7 +4650,7 @@ new_objects:
  
  retry_load_slab:
  
-       local_lock_irqsave(&s->cpu_slab->lock, flags);
+       local_lock_cpu_slab(s, flags);
         if (unlikely(c->slab)) {
                 void *flush_freelist = c->freelist;
                 struct slab *flush_slab = c->slab;
@@ -4548,9 +4659,14 @@ retry_load_slab:
                 c->freelist = NULL;
                 c->tid = next_tid(c->tid);
  
-               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+               local_unlock_cpu_slab(s, flags);
  
-               deactivate_slab(s, flush_slab, flush_freelist);
+               if (unlikely(!allow_spin)) {
+                       /* Reentrant slub cannot take locks, defer */
+                       defer_deactivate_slab(flush_slab, flush_freelist);
+               } else {
+                       deactivate_slab(s, flush_slab, flush_freelist);
+               }
  
                 stat(s, CPUSLAB_FLUSH);
  
@@ -4560,6 +4676,19 @@ retry_load_slab:
  
         goto load_freelist;
  }
+/*
+ * We disallow kprobes in ___slab_alloc() to prevent this reentrance:
+ *
+ * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
+ * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
+ * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
+ * manipulating c->freelist without lock.
+ *
+ * This does not prevent kprobe in functions called from ___slab_alloc() such as
+ * local_lock_irqsave() itself, and that is fine, we only need to protect the
+ * c->freelist manipulation in ___slab_alloc() itself.
+ */
+NOKPROBE_SYMBOL(___slab_alloc);
  
  /*
   * A wrapper for ___slab_alloc() for contexts where preemption is not yet
@@ -4579,8 +4708,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
          */
         c = slub_get_cpu_ptr(s->cpu_slab);
  #endif
-
+       if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
+               if (local_lock_is_locked(&s->cpu_slab->lock)) {
+                       /*
+                        * EBUSY is an internal signal to kmalloc_nolock() to
+                        * retry a different bucket. It's not propagated
+                        * to the caller.
+                        */
+                       p = ERR_PTR(-EBUSY);
+                       goto out;
+               }
+       }
         p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
+out:
  #ifdef CONFIG_PREEMPT_COUNT
         slub_put_cpu_ptr(s->cpu_slab);
  #endif
@@ -4704,7 +4844,7 @@ static void *__slab_alloc_node(struct kmem_cache *s,
                 return NULL;
         }
  
-       object = alloc_single_from_new_slab(s, slab, orig_size);
+       object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
  
         return object;
  }
@@ -4783,8 +4923,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
                 if (p[i] && init && (!kasan_init ||
                                      !kasan_has_integrated_init()))
                         memset(p[i], 0, zero_size);
-               kmemleak_alloc_recursive(p[i], s->object_size, 1,
-                                        s->flags, init_flags);
+               if (gfpflags_allow_spinning(flags))
+                       kmemleak_alloc_recursive(p[i], s->object_size, 1,
+                                                s->flags, init_flags);
                 kmsan_slab_alloc(s, p[i], init_flags);
                 alloc_tagging_slab_alloc_hook(s, p[i], flags);
         }
@@ -5451,6 +5592,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags)
  }
  EXPORT_SYMBOL(__kmalloc_noprof);
  
+/**
+ * kmalloc_nolock - Allocate an object of given size from any context.
+ * @size: size to allocate
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO and __GFP_NO_OBJ_EXT
+ * are allowed.
+ * @node: node number of the target node.
+ *
+ * Return: pointer to the new object or NULL in case of error.
+ * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
+ * There is no reason to call it again and expect !NULL.
+ */
+void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
+{
+       gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
+       struct kmem_cache *s;
+       bool can_retry = true;
+       void *ret = ERR_PTR(-EBUSY);
+
+       VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
+                                     __GFP_NO_OBJ_EXT));
+
+       if (unlikely(!size))
+               return ZERO_SIZE_PTR;
+
+       if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+               /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */
+               return NULL;
+retry:
+       if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+               return NULL;
+       s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);
+
+       if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
+               /*
+                * kmalloc_nolock() is not supported on architectures that
+                * don't implement cmpxchg16b, but debug caches don't use
+                * per-cpu slab and per-cpu partial slabs. They rely on
+                * kmem_cache_node->list_lock, so kmalloc_nolock() can
+                * attempt to allocate from debug caches by
+                * spin_trylock_irqsave(&n->list_lock, ...)
+                */
+               return NULL;
+
+       /*
+        * Do not call slab_alloc_node(), since trylock mode isn't
+        * compatible with slab_pre_alloc_hook/should_failslab and
+        * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
+        * and slab_post_alloc_hook() directly.
+        *
+        * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
+        * in irq saved region. It assumes that the same cpu will not
+        * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
+        * Therefore use in_nmi() to check whether particular bucket is in
+        * irq protected section.
+        *
+        * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
+        * this cpu was interrupted somewhere inside ___slab_alloc() after
+        * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
+        * In this case fast path with __update_cpu_freelist_fast() is not safe.
+        */
+#ifndef CONFIG_SLUB_TINY
+       if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
+#endif
+               ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
+
+       if (PTR_ERR(ret) == -EBUSY) {
+               if (can_retry) {
+                       /* pick the next kmalloc bucket */
+                       size = s->object_size + 1;
+                       /*
+                        * Another alternative is to
+                        * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
+                        * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
+                        * to retry from bucket of the same size.
+                        */
+                       can_retry = false;
+                       goto retry;
+               }
+               ret = NULL;
+       }
+
+       maybe_wipe_obj_freeptr(s, ret);
+       slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
+                            slab_want_init_on_alloc(alloc_gfp, s), size);
+
+       ret = kasan_kmalloc(s, ret, size, alloc_gfp);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
+
  void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
                                          int node, unsigned long caller)
  {
@@ -6108,6 +6339,93 @@ flush_remote:
         }
  }
  
+/*
+ * Bookkeeping for frees that have to be postponed to irq_work context:
+ * two lockless lists of pending work plus the irq_work item that drains
+ * them (see free_deferred_objects()).
+ */
+struct defer_free {
+       /* objects queued by defer_free(), linked through their freepointer slot */
+       struct llist_head objects;
+       /* slabs queued by defer_deactivate_slab(), linked through slab->llnode */
+       struct llist_head slabs;
+       /* irq_work that processes both lists */
+       struct irq_work work;
+};
+
+/* Forward declaration: the per-CPU initializer below refers to the handler. */
+static void free_deferred_objects(struct irq_work *work);
+
+/*
+ * One deferred-free context per CPU. Both lists start out empty and the
+ * irq_work is bound to free_deferred_objects().
+ */
+static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
+       .objects = LLIST_HEAD_INIT(objects),
+       .slabs = LLIST_HEAD_INIT(slabs),
+       .work = IRQ_WORK_INIT(free_deferred_objects),
+};
+
+/*
+ * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
+ * to take sleeping spin_locks from __slab_free() and deactivate_slab().
+ * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
+ */
+static void free_deferred_objects(struct irq_work *work)
+{
+       struct defer_free *df = container_of(work, struct defer_free, work);
+       struct llist_head *objs = &df->objects;
+       struct llist_head *slabs = &df->slabs;
+       struct llist_node *llnode, *pos, *t;
+
+       if (llist_empty(objs) && llist_empty(slabs))
+               return;
+
+       llnode = llist_del_all(objs);
+       llist_for_each_safe(pos, t, llnode) {
+               struct kmem_cache *s;
+               struct slab *slab;
+               void *x = pos;
+
+               slab = virt_to_slab(x);
+               s = slab->slab_cache;
+
+               /*
+                * We used freepointer in 'x' to link 'x' into df->objects.
+                * Clear it to NULL to avoid false positive detection
+                * of "Freepointer corruption".
+                */
+               *(void **)x = NULL;
+
+               /* Point 'x' back to the beginning of allocated object */
+               x -= s->offset;
+               __slab_free(s, slab, x, x, 1, _THIS_IP_);
+       }
+
+       llnode = llist_del_all(slabs);
+       llist_for_each_safe(pos, t, llnode) {
+               struct slab *slab = container_of(pos, struct slab, llnode);
+
+#ifdef CONFIG_SLUB_TINY
+               discard_slab(slab->slab_cache, slab);
+#else
+               deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
+#endif
+       }
+}
+
+/*
+ * Queue the object at @head on this CPU's deferred-objects list.
+ * The list node lives at @head + s->offset, i.e. in the object's
+ * freepointer slot. The irq_work is queued only when llist_add()
+ * returns true (per the llist API: the list was empty before the add).
+ */
+static void defer_free(struct kmem_cache *s, void *head)
+{
+       struct defer_free *pending = this_cpu_ptr(&defer_free_objects);
+       struct llist_node *node = head + s->offset;
+
+       if (!llist_add(node, &pending->objects))
+               return;
+
+       irq_work_queue(&pending->work);
+}
+
+/*
+ * Queue @slab on this CPU's deferred-slabs list. @flush_freelist is
+ * stashed in the slab before it is published on the list, so that
+ * free_deferred_objects() can pass it on to deactivate_slab().
+ */
+static void defer_deactivate_slab(struct slab *slab, void *flush_freelist)
+{
+       struct defer_free *pending = this_cpu_ptr(&defer_free_objects);
+
+       /* Must be set before the slab becomes visible on the list. */
+       slab->flush_freelist = flush_freelist;
+
+       if (!llist_add(&slab->llnode, &pending->slabs))
+               return;
+
+       irq_work_queue(&pending->work);
+}
+
+/*
+ * Call irq_work_sync() on the deferred-free irq_work of every possible CPU.
+ */
+void defer_free_barrier(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct defer_free *df = per_cpu_ptr(&defer_free_objects, cpu);
+
+               irq_work_sync(&df->work);
+       }
+}
+
  #ifndef CONFIG_SLUB_TINY
  /*
   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
@@ -6128,6 +6446,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
                                 struct slab *slab, void *head, void *tail,
                                 int cnt, unsigned long addr)
  {
+       /* cnt == 0 signals that it's called from kfree_nolock() */
+       bool allow_spin = cnt;
         struct kmem_cache_cpu *c;
         unsigned long tid;
         void **freelist;
@@ -6146,10 +6466,29 @@ redo:
         barrier();
  
         if (unlikely(slab != c->slab)) {
-               __slab_free(s, slab, head, tail, cnt, addr);
+               if (unlikely(!allow_spin)) {
+                       /*
+                        * __slab_free() can locklessly cmpxchg16 into a slab,
+                        * but then it might need to take spin_lock or local_lock
+                        * in put_cpu_partial() for further processing.
+                        * Avoid the complexity and simply add to a deferred list.
+                        */
+                       defer_free(s, head);
+               } else {
+                       __slab_free(s, slab, head, tail, cnt, addr);
+               }
                 return;
         }
  
+       if (unlikely(!allow_spin)) {
+               if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
+                   local_lock_is_locked(&s->cpu_slab->lock)) {
+                       defer_free(s, head);
+                       return;
+               }
+               cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */
+       }
+
         if (USE_LOCKLESS_FAST_PATH()) {
                 freelist = READ_ONCE(c->freelist);
  
@@ -6160,11 +6499,13 @@ redo:
                         goto redo;
                 }
         } else {
+               __maybe_unused unsigned long flags = 0;
+
                 /* Update the free list under the local lock */
-               local_lock(&s->cpu_slab->lock);
+               local_lock_cpu_slab(s, flags);
                 c = this_cpu_ptr(s->cpu_slab);
                 if (unlikely(slab != c->slab)) {
-                       local_unlock(&s->cpu_slab->lock);
+                       local_unlock_cpu_slab(s, flags);
                         goto redo;
                 }
                 tid = c->tid;
@@ -6174,7 +6515,7 @@ redo:
                 c->freelist = head;
                 c->tid = next_tid(tid);
  
-               local_unlock(&s->cpu_slab->lock);
+               local_unlock_cpu_slab(s, flags);
         }
         stat_add(s, FREE_FASTPATH, cnt);
  }
@@ -6405,6 +6746,71 @@ void kfree(const void *object)
  }
  EXPORT_SYMBOL(kfree);
  
+/*
+ * kfree_nolock - free an object that was allocated by kmalloc_nolock().
+ * @object: pointer to free; pointers matching ZERO_OR_NULL_PTR() are ignored.
+ *
+ * Can be called while holding raw_spinlock_t or from IRQ and NMI,
+ * but ONLY for objects allocated by kmalloc_nolock().
+ * Debug checks (like kmemleak and kfence) were skipped on allocation,
+ * hence
+ * obj = kmalloc(); kfree_nolock(obj);
+ * will miss kmemleak/kfence bookkeeping and will cause false positives.
+ * large_kmalloc is not supported either: a non-slab folio only triggers
+ * a one-time warning and the object is not freed.
+ */
+void kfree_nolock(const void *object)
+{
+       struct folio *folio;
+       struct slab *slab;
+       struct kmem_cache *s;
+       void *x = (void *)object;
+
+       if (unlikely(ZERO_OR_NULL_PTR(object)))
+               return;
+
+       folio = virt_to_folio(object);
+       /* Only slab-backed objects can be handled here. */
+       if (unlikely(!folio_test_slab(folio))) {
+               WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
+               return;
+       }
+
+       slab = folio_slab(folio);
+       s = slab->slab_cache;
+
+       memcg_slab_free_hook(s, slab, &x, 1);
+       alloc_tagging_slab_free_hook(s, slab, &x, 1);
+       /*
+        * Unlike slab_free() do NOT call the following:
+        * kmemleak_free_recursive(x, s->flags);
+        * debug_check_no_locks_freed(x, s->object_size);
+        * debug_check_no_obj_freed(x, s->object_size);
+        * __kcsan_check_access(x, s->object_size, ..);
+        * kfence_free(x);
+        * since they take spinlocks or are not safe from every context.
+        */
+       kmsan_slab_free(s, x);
+       /*
+        * If KASAN finds a kernel bug it will do kasan_report_invalid_free()
+        * which will call raw_spin_lock_irqsave() which is technically
+        * unsafe from NMI, but take the chance and report the kernel bug.
+        * The sequence of
+        * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI
+        *  -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU
+        * is doubly buggy and deserves to deadlock.
+        */
+       if (kasan_slab_pre_free(s, x))
+               return;
+       /*
+        * memcg, kasan_slab_pre_free are done for 'x'.
+        * The only thing left is kasan_poison without quarantine,
+        * since kasan quarantine takes locks and is not supported from NMI.
+        */
+       kasan_slab_free(s, x, false, false, /* skip quarantine */true);
+#ifndef CONFIG_SLUB_TINY
+       /* cnt == 0 tells do_slab_free() that it is called from kfree_nolock() */
+       do_slab_free(s, slab, x, x, 0, _RET_IP_);
+#else
+       /* SLUB_TINY: always hand the object over to the deferred-free list */
+       defer_free(s, x);
+#endif
+}
+EXPORT_SYMBOL_GPL(kfree_nolock);
+
  static __always_inline __realloc_size(2) void *
  __do_krealloc(const void *p, size_t new_size, gfp_t flags)
  {
author	Alexei Starovoitov <ast@kernel.org>
	Tue, 9 Sep 2025 01:00:07 +0000 (18:00 -0700)
committer	Vlastimil Babka <vbabka@suse.cz>
	Mon, 29 Sep 2025 07:42:36 +0000 (09:42 +0200)
include/linux/kasan.h		patch \| blob \| blame \| history
include/linux/memcontrol.h		patch \| blob \| blame \| history
include/linux/slab.h		patch \| blob \| blame \| history
mm/Kconfig		patch \| blob \| blame \| history
mm/kasan/common.c		patch \| blob \| blame \| history
mm/slab.h		patch \| blob \| blame \| history
mm/slab_common.c		patch \| blob \| blame \| history
mm/slub.c		patch \| blob \| blame \| history