// SPDX-License-Identifier: GPL-2.0
/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* mm_account_reclaimed_pages() */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/stackdepot.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/kfence.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <linux/random.h>
#include <kunit/test.h>
#include <kunit/test-bug.h>
#include <linux/sort.h>

#include <linux/debugfs.h>
#include <trace/events/kmem.h>
/*
 * Lock order:
 *   1. slab_mutex (Global Mutex)
 *   2. node->list_lock (Spinlock)
 *   3. kmem_cache->cpu_slab->lock (Local lock)
 *   4. slab_lock(slab) (Only on some arches)
 *   5. object_map_lock (Only for debugging)
 *
 *   slab_mutex
 *
 *   The role of the slab_mutex is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *   Also synchronizes memory hotplug callbacks.
 *
 *   slab_lock
 *
 *   The slab_lock is a wrapper around the page lock, thus it is a bit
 *   spinlock.
 *
 *   The slab_lock is only used on arches that do not have the ability
 *   to do a cmpxchg_double. It only protects:
 *
 *	A. slab->freelist	-> List of free objects in a slab
 *	B. slab->inuse		-> Number of objects in use
 *	C. slab->objects	-> Number of objects in slab
 *	D. slab->frozen		-> frozen state
 *
 *   Frozen slabs
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list except the per cpu partial list. The processor that froze the
 *   slab is the one who can perform list operations on the slab. Other
 *   processors may put objects onto the freelist but the processor that
 *   froze the slab is the only one that can retrieve the objects from the
 *   slab's freelist.
 *
 *   list_lock
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no slabs may be added to or
 *   removed from the lists, nor may the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. F.e.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *
 *   For debug caches, all allocations are forced to go through a list_lock
 *   protected region to serialize against concurrent validation.
 *
 *   cpu_slab->lock local lock
 *
 *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
 *   except the stat counters. This is a percpu structure manipulated only by
 *   the local cpu, so the lock protects against being preempted or interrupted
 *   by an irq. Fast path operations rely on lockless operations instead.
 *
 *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption
 *   which means the lockless fastpath cannot be used as it might interfere with
 *   an in-progress slow path operation. In this case the local lock is always
 *   taken but it still utilizes the freelist for the common operations.
 *
 *   lockless fastpaths
 *
 *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
 *   are fully lockless when satisfied from the percpu slab (and when
 *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
 *   They also don't disable preemption or migration or irqs. They rely on
 *   the transaction id (tid) field to detect being preempted or moved to
 *   a different cpu.
 *
 *   irq, preemption, migration considerations
 *
 *   Interrupts are disabled as part of list_lock or local_lock operations, or
 *   around the slab_lock operation, in order to make the slab allocator safe
 *   to use in the context of an irq.
 *
 *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
 *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
 *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
 *   doesn't have to be revalidated in each section protected by the local lock.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * slab->frozen		The slab is frozen and exempt from list processing.
 *			This means that the slab is dedicated to a purpose
 *			such as satisfying allocations for a specific
 *			processor. Objects may be freed in the slab while
 *			it is frozen but slab_free will then skip the usual
 *			list operations. It is up to the processor holding
 *			the slab to integrate the slab into the slab lists
 *			when the slab is no longer needed.
 *
 *			One use of this flag is to mark slabs that are
 *			used for allocations. Then such a slab becomes a cpu
 *			slab. The cpu slab may be equipped with an additional
 *			freelist that allows lockless access to
 *			free objects in addition to the regular freelist
 *			that requires the slab lock.
 *
 * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path and disables lockless freelists.
 */
/*
 * We could simply use migrate_disable()/enable() but as long as it's a
 * function call even on !PREEMPT_RT, use inline preempt_disable() there.
 */
#ifndef CONFIG_PREEMPT_RT
#define slub_get_cpu_ptr(var)		get_cpu_ptr(var)
#define slub_put_cpu_ptr(var)		put_cpu_ptr(var)
#define USE_LOCKLESS_FAST_PATH()	(true)
#else
#define slub_get_cpu_ptr(var)		\
({					\
	migrate_disable();		\
	this_cpu_ptr(var);		\
})
#define slub_put_cpu_ptr(var)		\
do {					\
	(void)(var);			\
	migrate_enable();		\
} while (0)
#define USE_LOCKLESS_FAST_PATH()	(false)
#endif
#ifndef CONFIG_SLUB_TINY
#define __fastpath_inline __always_inline
#else
#define __fastpath_inline
#endif
#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif		/* CONFIG_SLUB_DEBUG */
/* Structure holding parameters for get_partial() call chain */
struct partial_context {
	struct slab **slab;
	gfp_t flags;
	unsigned int orig_size;
};
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
static inline bool slub_debug_orig_size(struct kmem_cache *s)
{
	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
			(s->flags & SLAB_KMALLOC));
}
void *fixup_red_left(struct kmem_cache *s, void *p)
{
	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
		p += s->red_left_pad;

	return p;
}
static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	return !kmem_cache_debug(s);
#else
	return false;
#endif
}
/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */
/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

#ifndef CONFIG_SLUB_TINY
/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10
#else
#define MIN_PARTIAL 0
#define MAX_PARTIAL 0
#endif
#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * These debug flags cannot use CMPXCHG because there might be consistency
 * issues when checking or reading debug information
 */
#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
				SLAB_TRACE)

/*
 * Debugging flags that require metadata to be stored in the slab. These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since slab.objects is u15 */

/* Internal SLUB flags */
/* Poison object */
#define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
#ifdef system_has_freelist_aba
#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)
#else
#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0U)
#endif
/*
 * Tracking user of a slab.
 */
#define TRACK_ADDRS_COUNT 16
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKDEPOT
	depot_stack_handle_t handle;
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef SLAB_SUPPORTS_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
#endif
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
static void debugfs_slab_add(struct kmem_cache *);
#else
static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	/*
	 * The rmw is racy on a preemptible kernel but this is acceptable, so
	 * avoid this_cpu_add()'s irq-disable overhead.
	 */
	raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
/*
 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
 * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
 * differ during memory hotplug/hotremove operations.
 * Protected by slab_mutex.
 */
static nodemask_t slab_nodes;
#ifndef CONFIG_SLUB_TINY
/*
 * Workqueue used for flush_cpu_slab().
 */
static struct workqueue_struct *flushwq;
#endif
/********************************************************************
 *			Core slab cache functions
 *******************************************************************/

/*
 * freeptr_t represents a SLUB freelist pointer, which might be encoded
 * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
 */
typedef struct { unsigned long v; } freeptr_t;
/*
 * Returns freelist pointer (ptr). With hardening, this is obfuscated
 * with an XOR of the address where the pointer is held and a per-cache
 * random number.
 */
static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
					    void *ptr, unsigned long ptr_addr)
{
	unsigned long encoded;

#ifdef CONFIG_SLAB_FREELIST_HARDENED
	encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
#else
	encoded = (unsigned long)ptr;
#endif
	return (freeptr_t){.v = encoded};
}
static inline void *freelist_ptr_decode(const struct kmem_cache *s,
					freeptr_t ptr, unsigned long ptr_addr)
{
	void *decoded;

#ifdef CONFIG_SLAB_FREELIST_HARDENED
	decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
#else
	decoded = (void *)ptr.v;
#endif
	return decoded;
}
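/*
 * Illustrative sketch (standalone userspace C, not part of this file): how
 * the hardened freelist pointer encoding above behaves. The cache random
 * value and addresses are invented; swab64() stands in for the kernel's
 * swab(). Decoding XORs with the same two values, so decode(encode(p)) == p,
 * and a plain pointer value never sits in the slab itself.
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *
 *	static uint64_t swab64(uint64_t x) { return __builtin_bswap64(x); }
 *
 *	int main(void)
 *	{
 *		uint64_t random   = 0x1234567890abcdefULL;	// s->random
 *		uint64_t ptr      = 0xffff888012345678ULL;	// next free object
 *		uint64_t ptr_addr = 0xffff888012345000ULL;	// where it is stored
 *
 *		uint64_t encoded = ptr ^ random ^ swab64(ptr_addr);
 *		uint64_t decoded = encoded ^ random ^ swab64(ptr_addr);
 *
 *		assert(decoded == ptr);
 *		return 0;
 *	}
 */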
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	unsigned long ptr_addr;
	freeptr_t p;

	object = kasan_reset_tag(object);
	ptr_addr = (unsigned long)object + s->offset;
	p = *(freeptr_t *)(ptr_addr);
	return freelist_ptr_decode(s, p, ptr_addr);
}
#ifndef CONFIG_SLUB_TINY
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
	prefetchw(object + s->offset);
}
#endif
/*
 * When running under KMSAN, get_freepointer_safe() may return an uninitialized
 * pointer value in the case the current thread loses the race for the next
 * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
 * slab_alloc_node() will fail, so the uninitialized value won't be used, but
 * KMSAN will still check all arguments of cmpxchg because of imperfect
 * handling of inline assembly.
 * To work around this problem, we apply __no_kmsan_checks to ensure that
 * get_freepointer_safe() returns initialized memory.
 */
__no_kmsan_checks
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	unsigned long freepointer_addr;
	freeptr_t p;

	if (!debug_pagealloc_enabled_static())
		return get_freepointer(s, object);

	object = kasan_reset_tag(object);
	freepointer_addr = (unsigned long)object + s->offset;
	copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
	return freelist_ptr_decode(s, p, freepointer_addr);
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	unsigned long freeptr_addr = (unsigned long)object + s->offset;

#ifdef CONFIG_SLAB_FREELIST_HARDENED
	BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif

	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
	*(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
}
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = fixup_red_left(__s, __addr); \
		__p < (__addr) + (__objects) * (__s)->size; \
		__p += (__s)->size)
static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
	return ((unsigned int)PAGE_SIZE << order) / size;
}

static inline struct kmem_cache_order_objects oo_make(unsigned int order,
						      unsigned int size)
{
	struct kmem_cache_order_objects x = {
		(order << OO_SHIFT) + order_objects(order, size)
	};

	return x;
}

static inline unsigned int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}
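/*
 * Illustrative sketch (standalone userspace C, not kernel code): the packing
 * used by oo_make() and friends above. Order and object count share one
 * unsigned int, the order above OO_SHIFT and the count below it. The order,
 * object size and page size are invented for the example.
 *
 *	#include <assert.h>
 *
 *	#define OO_SHIFT 16
 *	#define OO_MASK  ((1 << OO_SHIFT) - 1)
 *
 *	int main(void)
 *	{
 *		unsigned int order = 1, size = 128, page_size = 4096;
 *		unsigned int objects = (page_size << order) / size;	// 64
 *		unsigned int x = (order << OO_SHIFT) + objects;		// oo_make()
 *
 *		assert((x >> OO_SHIFT) == order);	// oo_order()
 *		assert((x & OO_MASK) == objects);	// oo_objects()
 *		return 0;
 *	}
 */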
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
{
	unsigned int nr_slabs;

	s->cpu_partial = nr_objects;

	/*
	 * We take the number of objects but actually limit the number of
	 * slabs on the per cpu partial list, in order to limit excessive
	 * growth of the list. For simplicity we assume that the slabs will
	 * be half-full.
	 */
	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
	s->cpu_partial_slabs = nr_slabs;
}
#else
static inline void
slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
{
}
#endif /* CONFIG_SLUB_CPU_PARTIAL */
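/*
 * Illustrative sketch (standalone userspace C, not kernel code) of the
 * conversion done by slub_set_cpu_partial() above, with invented numbers:
 * 16 objects per slab and a request to keep 30 objects on the per cpu
 * partial list, assuming half-full slabs as the comment states.
 *
 *	#include <assert.h>
 *
 *	#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
 *
 *	int main(void)
 *	{
 *		unsigned int objects_per_slab = 16;	// oo_objects(s->oo)
 *		unsigned int nr_objects = 30;		// requested cpu_partial
 *		unsigned int nr_slabs =
 *			DIV_ROUND_UP(nr_objects * 2, objects_per_slab);
 *
 *		assert(nr_slabs == 4);			// 60 / 16, rounded up
 *		return 0;
 *	}
 */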
/*
 * Per slab locking using the pagelock
 */
static __always_inline void slab_lock(struct slab *slab)
{
	struct page *page = slab_page(slab);

	VM_BUG_ON_PAGE(PageTail(page), page);
	bit_spin_lock(PG_locked, &page->flags);
}

static __always_inline void slab_unlock(struct slab *slab)
{
	struct page *page = slab_page(slab);

	VM_BUG_ON_PAGE(PageTail(page), page);
	__bit_spin_unlock(PG_locked, &page->flags);
}
static inline bool
__update_freelist_fast(struct slab *slab,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new)
{
#ifdef system_has_freelist_aba
	freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
	freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };

	return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
#else
	return false;
#endif
}
static inline bool
__update_freelist_slow(struct slab *slab,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new)
{
	bool ret = false;

	slab_lock(slab);
	if (slab->freelist == freelist_old &&
	    slab->counters == counters_old) {
		slab->freelist = freelist_new;
		slab->counters = counters_new;
		ret = true;
	}
	slab_unlock(slab);

	return ret;
}
/*
 * Interrupts must be disabled (for the fallback code to work right), typically
 * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
 * part of bit_spin_lock(), is sufficient because the policy is not to allow any
 * allocation/free operation in hardirq context. Therefore nothing can
 * interrupt the operation.
 */
static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	bool ret;

	if (USE_LOCKLESS_FAST_PATH())
		lockdep_assert_irqs_disabled();

	if (s->flags & __CMPXCHG_DOUBLE) {
		ret = __update_freelist_fast(slab, freelist_old, counters_old,
					     freelist_new, counters_new);
	} else {
		ret = __update_freelist_slow(slab, freelist_old, counters_old,
					     freelist_new, counters_new);
	}
	if (likely(ret))
		return true;

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}
static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	bool ret;

	if (s->flags & __CMPXCHG_DOUBLE) {
		ret = __update_freelist_fast(slab, freelist_old, counters_old,
					     freelist_new, counters_new);
	} else {
		unsigned long flags;

		local_irq_save(flags);
		ret = __update_freelist_slow(slab, freelist_old, counters_old,
					     freelist_new, counters_new);
		local_irq_restore(flags);
	}
	if (likely(ret))
		return true;

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}
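/*
 * Illustrative sketch (standalone C11, not kernel code) of the idea behind
 * the "fast" update path above: freelist and counters are compared and
 * exchanged as one unit, so neither can change in between. Here both halves
 * are squeezed into a single 64-bit word purely for illustration; the values
 * are invented.
 *
 *	#include <assert.h>
 *	#include <stdatomic.h>
 *	#include <stdint.h>
 *
 *	int main(void)
 *	{
 *		// high 32 bits: "freelist", low 32 bits: "counters"
 *		_Atomic uint64_t slot = ((uint64_t)100 << 32) | 5;
 *		uint64_t old = ((uint64_t)100 << 32) | 5;
 *		uint64_t new = ((uint64_t)200 << 32) | 6;
 *
 *		// succeeds only if both halves still match the expected pair
 *		assert(atomic_compare_exchange_strong(&slot, &old, new));
 *		assert(atomic_load(&slot) == new);
 *		return 0;
 *	}
 */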
630 #ifdef CONFIG_SLUB_DEBUG
631 static unsigned long object_map
[BITS_TO_LONGS(MAX_OBJS_PER_PAGE
)];
632 static DEFINE_SPINLOCK(object_map_lock
);
634 static void __fill_map(unsigned long *obj_map
, struct kmem_cache
*s
,
637 void *addr
= slab_address(slab
);
640 bitmap_zero(obj_map
, slab
->objects
);
642 for (p
= slab
->freelist
; p
; p
= get_freepointer(s
, p
))
643 set_bit(__obj_to_index(s
, addr
, p
), obj_map
);
646 #if IS_ENABLED(CONFIG_KUNIT)
647 static bool slab_add_kunit_errors(void)
649 struct kunit_resource
*resource
;
651 if (!kunit_get_current_test())
654 resource
= kunit_find_named_resource(current
->kunit_test
, "slab_errors");
658 (*(int *)resource
->data
)++;
659 kunit_put_resource(resource
);
663 static inline bool slab_add_kunit_errors(void) { return false; }
static inline unsigned int size_from_object(struct kmem_cache *s)
{
	if (s->flags & SLAB_RED_ZONE)
		return s->size - s->red_left_pad;

	return s->size;
}

static inline void *restore_red_left(struct kmem_cache *s, void *p)
{
	if (s->flags & SLAB_RED_ZONE)
		p -= s->red_left_pad;

	return p;
}
#if defined(CONFIG_SLUB_DEBUG_ON)
static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static slab_flags_t slub_debug;
#endif

static char *slub_debug_string;
static int disable_higher_order_debug;

/*
 * slub is about to manipulate internal object metadata. This memory lies
 * outside the range of the allocated object, so accessing it would normally
 * be reported by kasan as a bounds error. metadata_access_enable() is used
 * to tell kasan that these accesses are OK.
 */
static inline void metadata_access_enable(void)
{
	kasan_disable_current();
}

static inline void metadata_access_disable(void)
{
	kasan_enable_current();
}
714 /* Verify that a pointer has an address that is valid within a slab page */
715 static inline int check_valid_pointer(struct kmem_cache
*s
,
716 struct slab
*slab
, void *object
)
723 base
= slab_address(slab
);
724 object
= kasan_reset_tag(object
);
725 object
= restore_red_left(s
, object
);
726 if (object
< base
|| object
>= base
+ slab
->objects
* s
->size
||
727 (object
- base
) % s
->size
) {
734 static void print_section(char *level
, char *text
, u8
*addr
,
737 metadata_access_enable();
738 print_hex_dump(level
, text
, DUMP_PREFIX_ADDRESS
,
739 16, 1, kasan_reset_tag((void *)addr
), length
, 1);
740 metadata_access_disable();
/*
 * See comment in calculate_sizes().
 */
static inline bool freeptr_outside_object(struct kmem_cache *s)
{
	return s->offset >= s->inuse;
}

/*
 * Return offset of the end of info block which is inuse + free pointer if
 * not overlapping with object.
 */
static inline unsigned int get_info_end(struct kmem_cache *s)
{
	if (freeptr_outside_object(s))
		return s->inuse + sizeof(void *);
	else
		return s->inuse;
}
763 static struct track
*get_track(struct kmem_cache
*s
, void *object
,
764 enum track_item alloc
)
768 p
= object
+ get_info_end(s
);
770 return kasan_reset_tag(p
+ alloc
);
773 #ifdef CONFIG_STACKDEPOT
774 static noinline depot_stack_handle_t
set_track_prepare(void)
776 depot_stack_handle_t handle
;
777 unsigned long entries
[TRACK_ADDRS_COUNT
];
778 unsigned int nr_entries
;
780 nr_entries
= stack_trace_save(entries
, ARRAY_SIZE(entries
), 3);
781 handle
= stack_depot_save(entries
, nr_entries
, GFP_NOWAIT
);
786 static inline depot_stack_handle_t
set_track_prepare(void)
792 static void set_track_update(struct kmem_cache
*s
, void *object
,
793 enum track_item alloc
, unsigned long addr
,
794 depot_stack_handle_t handle
)
796 struct track
*p
= get_track(s
, object
, alloc
);
798 #ifdef CONFIG_STACKDEPOT
802 p
->cpu
= smp_processor_id();
803 p
->pid
= current
->pid
;
807 static __always_inline
void set_track(struct kmem_cache
*s
, void *object
,
808 enum track_item alloc
, unsigned long addr
)
810 depot_stack_handle_t handle
= set_track_prepare();
812 set_track_update(s
, object
, alloc
, addr
, handle
);
815 static void init_tracking(struct kmem_cache
*s
, void *object
)
819 if (!(s
->flags
& SLAB_STORE_USER
))
822 p
= get_track(s
, object
, TRACK_ALLOC
);
823 memset(p
, 0, 2*sizeof(struct track
));
826 static void print_track(const char *s
, struct track
*t
, unsigned long pr_time
)
828 depot_stack_handle_t handle __maybe_unused
;
833 pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
834 s
, (void *)t
->addr
, pr_time
- t
->when
, t
->cpu
, t
->pid
);
835 #ifdef CONFIG_STACKDEPOT
836 handle
= READ_ONCE(t
->handle
);
838 stack_depot_print(handle
);
840 pr_err("object allocation/free stack trace missing\n");
844 void print_tracking(struct kmem_cache
*s
, void *object
)
846 unsigned long pr_time
= jiffies
;
847 if (!(s
->flags
& SLAB_STORE_USER
))
850 print_track("Allocated", get_track(s
, object
, TRACK_ALLOC
), pr_time
);
851 print_track("Freed", get_track(s
, object
, TRACK_FREE
), pr_time
);
854 static void print_slab_info(const struct slab
*slab
)
856 struct folio
*folio
= (struct folio
*)slab_folio(slab
);
858 pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
859 slab
, slab
->objects
, slab
->inuse
, slab
->freelist
,
860 folio_flags(folio
, 0));
 * kmalloc caches have fixed sizes (mostly powers of 2), and the kmalloc() API
 * family will round up the real request size to these fixed ones, so
 * there can be an extra area beyond what is requested. Save the original
 * request size in the metadata area, for better debug and sanity checks.
869 static inline void set_orig_size(struct kmem_cache
*s
,
870 void *object
, unsigned int orig_size
)
872 void *p
= kasan_reset_tag(object
);
874 if (!slub_debug_orig_size(s
))
877 #ifdef CONFIG_KASAN_GENERIC
 * KASAN could save its free meta data in the object's data area at
 * offset 0. If that size is larger than 'orig_size', it will
 * overlap the data redzone in [orig_size+1, object_size], and
 * the check should be skipped.
884 if (kasan_metadata_size(s
, true) > orig_size
)
885 orig_size
= s
->object_size
;
888 p
+= get_info_end(s
);
889 p
+= sizeof(struct track
) * 2;
891 *(unsigned int *)p
= orig_size
;
894 static inline unsigned int get_orig_size(struct kmem_cache
*s
, void *object
)
896 void *p
= kasan_reset_tag(object
);
898 if (!slub_debug_orig_size(s
))
899 return s
->object_size
;
901 p
+= get_info_end(s
);
902 p
+= sizeof(struct track
) * 2;
904 return *(unsigned int *)p
;
907 void skip_orig_size_check(struct kmem_cache
*s
, const void *object
)
909 set_orig_size(s
, (void *)object
, s
->object_size
);
912 static void slab_bug(struct kmem_cache
*s
, char *fmt
, ...)
914 struct va_format vaf
;
920 pr_err("=============================================================================\n");
921 pr_err("BUG %s (%s): %pV\n", s
->name
, print_tainted(), &vaf
);
922 pr_err("-----------------------------------------------------------------------------\n\n");
927 static void slab_fix(struct kmem_cache
*s
, char *fmt
, ...)
929 struct va_format vaf
;
932 if (slab_add_kunit_errors())
938 pr_err("FIX %s: %pV\n", s
->name
, &vaf
);
942 static void print_trailer(struct kmem_cache
*s
, struct slab
*slab
, u8
*p
)
944 unsigned int off
; /* Offset of last byte */
945 u8
*addr
= slab_address(slab
);
947 print_tracking(s
, p
);
949 print_slab_info(slab
);
951 pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
952 p
, p
- addr
, get_freepointer(s
, p
));
954 if (s
->flags
& SLAB_RED_ZONE
)
955 print_section(KERN_ERR
, "Redzone ", p
- s
->red_left_pad
,
957 else if (p
> addr
+ 16)
958 print_section(KERN_ERR
, "Bytes b4 ", p
- 16, 16);
960 print_section(KERN_ERR
, "Object ", p
,
961 min_t(unsigned int, s
->object_size
, PAGE_SIZE
));
962 if (s
->flags
& SLAB_RED_ZONE
)
963 print_section(KERN_ERR
, "Redzone ", p
+ s
->object_size
,
964 s
->inuse
- s
->object_size
);
966 off
= get_info_end(s
);
968 if (s
->flags
& SLAB_STORE_USER
)
969 off
+= 2 * sizeof(struct track
);
971 if (slub_debug_orig_size(s
))
972 off
+= sizeof(unsigned int);
974 off
+= kasan_metadata_size(s
, false);
976 if (off
!= size_from_object(s
))
977 /* Beginning of the filler is the free pointer */
978 print_section(KERN_ERR
, "Padding ", p
+ off
,
979 size_from_object(s
) - off
);
984 static void object_err(struct kmem_cache
*s
, struct slab
*slab
,
985 u8
*object
, char *reason
)
987 if (slab_add_kunit_errors())
990 slab_bug(s
, "%s", reason
);
991 print_trailer(s
, slab
, object
);
992 add_taint(TAINT_BAD_PAGE
, LOCKDEP_NOW_UNRELIABLE
);
995 static bool freelist_corrupted(struct kmem_cache
*s
, struct slab
*slab
,
996 void **freelist
, void *nextfree
)
998 if ((s
->flags
& SLAB_CONSISTENCY_CHECKS
) &&
999 !check_valid_pointer(s
, slab
, nextfree
) && freelist
) {
1000 object_err(s
, slab
, *freelist
, "Freechain corrupt");
1002 slab_fix(s
, "Isolate corrupted freechain");
1009 static __printf(3, 4) void slab_err(struct kmem_cache
*s
, struct slab
*slab
,
1010 const char *fmt
, ...)
1015 if (slab_add_kunit_errors())
1018 va_start(args
, fmt
);
1019 vsnprintf(buf
, sizeof(buf
), fmt
, args
);
1021 slab_bug(s
, "%s", buf
);
1022 print_slab_info(slab
);
1024 add_taint(TAINT_BAD_PAGE
, LOCKDEP_NOW_UNRELIABLE
);
1027 static void init_object(struct kmem_cache
*s
, void *object
, u8 val
)
1029 u8
*p
= kasan_reset_tag(object
);
1030 unsigned int poison_size
= s
->object_size
;
1032 if (s
->flags
& SLAB_RED_ZONE
) {
1033 memset(p
- s
->red_left_pad
, val
, s
->red_left_pad
);
1035 if (slub_debug_orig_size(s
) && val
== SLUB_RED_ACTIVE
) {
1037 * Redzone the extra allocated space by kmalloc than
1038 * requested, and the poison size will be limited to
1039 * the original request size accordingly.
1041 poison_size
= get_orig_size(s
, object
);
1045 if (s
->flags
& __OBJECT_POISON
) {
1046 memset(p
, POISON_FREE
, poison_size
- 1);
1047 p
[poison_size
- 1] = POISON_END
;
1050 if (s
->flags
& SLAB_RED_ZONE
)
1051 memset(p
+ poison_size
, val
, s
->inuse
- poison_size
);
1054 static void restore_bytes(struct kmem_cache
*s
, char *message
, u8 data
,
1055 void *from
, void *to
)
1057 slab_fix(s
, "Restoring %s 0x%p-0x%p=0x%x", message
, from
, to
- 1, data
);
1058 memset(from
, data
, to
- from
);
1061 static int check_bytes_and_report(struct kmem_cache
*s
, struct slab
*slab
,
1062 u8
*object
, char *what
,
1063 u8
*start
, unsigned int value
, unsigned int bytes
)
1067 u8
*addr
= slab_address(slab
);
1069 metadata_access_enable();
1070 fault
= memchr_inv(kasan_reset_tag(start
), value
, bytes
);
1071 metadata_access_disable();
1075 end
= start
+ bytes
;
1076 while (end
> fault
&& end
[-1] == value
)
1079 if (slab_add_kunit_errors())
1080 goto skip_bug_print
;
1082 slab_bug(s
, "%s overwritten", what
);
1083 pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
1084 fault
, end
- 1, fault
- addr
,
1086 print_trailer(s
, slab
, object
);
1087 add_taint(TAINT_BAD_PAGE
, LOCKDEP_NOW_UNRELIABLE
);
1090 restore_bytes(s
, what
, value
, fault
, end
);
/*
 * Object layout:
 *
 * object address
 * 	Bytes of the object to be managed.
 * 	If the freepointer may overlay the object then the free
 * 	pointer is at the middle of the object.
 *
 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 * 	0xa5 (POISON_END)
 *
 * object + s->object_size
 * 	Padding to reach word boundary. This is also used for Redzoning.
 * 	Padding is extended by another word if Redzoning is enabled and
 * 	object_size == inuse.
 *
 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 * 	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 * 	Meta data starts here.
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 * 	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
 * 	D. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
 * Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 * 	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */
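/*
 * Illustrative sketch (standalone userspace C, not kernel code): the byte
 * patterns described above for a freed, poisoned object with red zoning.
 * The sizes are invented; the values mirror POISON_FREE (0x6b),
 * POISON_END (0xa5) and RED_INACTIVE (0xbb) from the layout above.
 *
 *	#include <assert.h>
 *	#include <string.h>
 *
 *	int main(void)
 *	{
 *		unsigned char obj[32];			// s->inuse == 32
 *		unsigned int object_size = 24;		// s->object_size
 *
 *		memset(obj, 0x6b, object_size - 1);	// POISON_FREE
 *		obj[object_size - 1] = 0xa5;		// POISON_END
 *		memset(obj + object_size, 0xbb,		// inactive redzone
 *		       sizeof(obj) - object_size);
 *
 *		assert(obj[0] == 0x6b && obj[23] == 0xa5 && obj[24] == 0xbb);
 *		return 0;
 *	}
 */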
1133 static int check_pad_bytes(struct kmem_cache
*s
, struct slab
*slab
, u8
*p
)
1135 unsigned long off
= get_info_end(s
); /* The end of info */
1137 if (s
->flags
& SLAB_STORE_USER
) {
1138 /* We also have user information there */
1139 off
+= 2 * sizeof(struct track
);
1141 if (s
->flags
& SLAB_KMALLOC
)
1142 off
+= sizeof(unsigned int);
1145 off
+= kasan_metadata_size(s
, false);
1147 if (size_from_object(s
) == off
)
1150 return check_bytes_and_report(s
, slab
, p
, "Object padding",
1151 p
+ off
, POISON_INUSE
, size_from_object(s
) - off
);
1154 /* Check the pad bytes at the end of a slab page */
1155 static void slab_pad_check(struct kmem_cache
*s
, struct slab
*slab
)
1164 if (!(s
->flags
& SLAB_POISON
))
1167 start
= slab_address(slab
);
1168 length
= slab_size(slab
);
1169 end
= start
+ length
;
1170 remainder
= length
% s
->size
;
1174 pad
= end
- remainder
;
1175 metadata_access_enable();
1176 fault
= memchr_inv(kasan_reset_tag(pad
), POISON_INUSE
, remainder
);
1177 metadata_access_disable();
1180 while (end
> fault
&& end
[-1] == POISON_INUSE
)
1183 slab_err(s
, slab
, "Padding overwritten. 0x%p-0x%p @offset=%tu",
1184 fault
, end
- 1, fault
- start
);
1185 print_section(KERN_ERR
, "Padding ", pad
, remainder
);
1187 restore_bytes(s
, "slab padding", POISON_INUSE
, fault
, end
);
1190 static int check_object(struct kmem_cache
*s
, struct slab
*slab
,
1191 void *object
, u8 val
)
1194 u8
*endobject
= object
+ s
->object_size
;
1195 unsigned int orig_size
;
1197 if (s
->flags
& SLAB_RED_ZONE
) {
1198 if (!check_bytes_and_report(s
, slab
, object
, "Left Redzone",
1199 object
- s
->red_left_pad
, val
, s
->red_left_pad
))
1202 if (!check_bytes_and_report(s
, slab
, object
, "Right Redzone",
1203 endobject
, val
, s
->inuse
- s
->object_size
))
1206 if (slub_debug_orig_size(s
) && val
== SLUB_RED_ACTIVE
) {
1207 orig_size
= get_orig_size(s
, object
);
1209 if (s
->object_size
> orig_size
&&
1210 !check_bytes_and_report(s
, slab
, object
,
1211 "kmalloc Redzone", p
+ orig_size
,
1212 val
, s
->object_size
- orig_size
)) {
1217 if ((s
->flags
& SLAB_POISON
) && s
->object_size
< s
->inuse
) {
1218 check_bytes_and_report(s
, slab
, p
, "Alignment padding",
1219 endobject
, POISON_INUSE
,
1220 s
->inuse
- s
->object_size
);
1224 if (s
->flags
& SLAB_POISON
) {
1225 if (val
!= SLUB_RED_ACTIVE
&& (s
->flags
& __OBJECT_POISON
) &&
1226 (!check_bytes_and_report(s
, slab
, p
, "Poison", p
,
1227 POISON_FREE
, s
->object_size
- 1) ||
1228 !check_bytes_and_report(s
, slab
, p
, "End Poison",
1229 p
+ s
->object_size
- 1, POISON_END
, 1)))
1232 * check_pad_bytes cleans up on its own.
1234 check_pad_bytes(s
, slab
, p
);
1237 if (!freeptr_outside_object(s
) && val
== SLUB_RED_ACTIVE
)
1239 * Object and freepointer overlap. Cannot check
1240 * freepointer while object is allocated.
1244 /* Check free pointer validity */
1245 if (!check_valid_pointer(s
, slab
, get_freepointer(s
, p
))) {
1246 object_err(s
, slab
, p
, "Freepointer corrupt");
1248 * No choice but to zap it and thus lose the remainder
1249 * of the free objects in this slab. May cause
1250 * another error because the object count is now wrong.
1252 set_freepointer(s
, p
, NULL
);
1258 static int check_slab(struct kmem_cache
*s
, struct slab
*slab
)
1262 if (!folio_test_slab(slab_folio(slab
))) {
1263 slab_err(s
, slab
, "Not a valid slab page");
1267 maxobj
= order_objects(slab_order(slab
), s
->size
);
1268 if (slab
->objects
> maxobj
) {
1269 slab_err(s
, slab
, "objects %u > max %u",
1270 slab
->objects
, maxobj
);
1273 if (slab
->inuse
> slab
->objects
) {
1274 slab_err(s
, slab
, "inuse %u > max %u",
1275 slab
->inuse
, slab
->objects
);
1278 /* Slab_pad_check fixes things up after itself */
1279 slab_pad_check(s
, slab
);
1284 * Determine if a certain object in a slab is on the freelist. Must hold the
1285 * slab lock to guarantee that the chains are in a consistent state.
1287 static int on_freelist(struct kmem_cache
*s
, struct slab
*slab
, void *search
)
1291 void *object
= NULL
;
1294 fp
= slab
->freelist
;
1295 while (fp
&& nr
<= slab
->objects
) {
1298 if (!check_valid_pointer(s
, slab
, fp
)) {
1300 object_err(s
, slab
, object
,
1301 "Freechain corrupt");
1302 set_freepointer(s
, object
, NULL
);
1304 slab_err(s
, slab
, "Freepointer corrupt");
1305 slab
->freelist
= NULL
;
1306 slab
->inuse
= slab
->objects
;
1307 slab_fix(s
, "Freelist cleared");
1313 fp
= get_freepointer(s
, object
);
1317 max_objects
= order_objects(slab_order(slab
), s
->size
);
1318 if (max_objects
> MAX_OBJS_PER_PAGE
)
1319 max_objects
= MAX_OBJS_PER_PAGE
;
1321 if (slab
->objects
!= max_objects
) {
1322 slab_err(s
, slab
, "Wrong number of objects. Found %d but should be %d",
1323 slab
->objects
, max_objects
);
1324 slab
->objects
= max_objects
;
1325 slab_fix(s
, "Number of objects adjusted");
1327 if (slab
->inuse
!= slab
->objects
- nr
) {
1328 slab_err(s
, slab
, "Wrong object count. Counter is %d but counted were %d",
1329 slab
->inuse
, slab
->objects
- nr
);
1330 slab
->inuse
= slab
->objects
- nr
;
1331 slab_fix(s
, "Object count adjusted");
1333 return search
== NULL
;
1336 static void trace(struct kmem_cache
*s
, struct slab
*slab
, void *object
,
1339 if (s
->flags
& SLAB_TRACE
) {
1340 pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
1342 alloc
? "alloc" : "free",
1343 object
, slab
->inuse
,
1347 print_section(KERN_INFO
, "Object ", (void *)object
,
1355 * Tracking of fully allocated slabs for debugging purposes.
1357 static void add_full(struct kmem_cache
*s
,
1358 struct kmem_cache_node
*n
, struct slab
*slab
)
1360 if (!(s
->flags
& SLAB_STORE_USER
))
1363 lockdep_assert_held(&n
->list_lock
);
1364 list_add(&slab
->slab_list
, &n
->full
);
1367 static void remove_full(struct kmem_cache
*s
, struct kmem_cache_node
*n
, struct slab
*slab
)
1369 if (!(s
->flags
& SLAB_STORE_USER
))
1372 lockdep_assert_held(&n
->list_lock
);
1373 list_del(&slab
->slab_list
);
1376 static inline unsigned long node_nr_slabs(struct kmem_cache_node
*n
)
1378 return atomic_long_read(&n
->nr_slabs
);
1381 static inline void inc_slabs_node(struct kmem_cache
*s
, int node
, int objects
)
1383 struct kmem_cache_node
*n
= get_node(s
, node
);
1386 * May be called early in order to allocate a slab for the
1387 * kmem_cache_node structure. Solve the chicken-egg
1388 * dilemma by deferring the increment of the count during
1389 * bootstrap (see early_kmem_cache_node_alloc).
1392 atomic_long_inc(&n
->nr_slabs
);
1393 atomic_long_add(objects
, &n
->total_objects
);
1396 static inline void dec_slabs_node(struct kmem_cache
*s
, int node
, int objects
)
1398 struct kmem_cache_node
*n
= get_node(s
, node
);
1400 atomic_long_dec(&n
->nr_slabs
);
1401 atomic_long_sub(objects
, &n
->total_objects
);
1404 /* Object debug checks for alloc/free paths */
1405 static void setup_object_debug(struct kmem_cache
*s
, void *object
)
1407 if (!kmem_cache_debug_flags(s
, SLAB_STORE_USER
|SLAB_RED_ZONE
|__OBJECT_POISON
))
1410 init_object(s
, object
, SLUB_RED_INACTIVE
);
1411 init_tracking(s
, object
);
1415 void setup_slab_debug(struct kmem_cache
*s
, struct slab
*slab
, void *addr
)
1417 if (!kmem_cache_debug_flags(s
, SLAB_POISON
))
1420 metadata_access_enable();
1421 memset(kasan_reset_tag(addr
), POISON_INUSE
, slab_size(slab
));
1422 metadata_access_disable();
1425 static inline int alloc_consistency_checks(struct kmem_cache
*s
,
1426 struct slab
*slab
, void *object
)
1428 if (!check_slab(s
, slab
))
1431 if (!check_valid_pointer(s
, slab
, object
)) {
1432 object_err(s
, slab
, object
, "Freelist Pointer check fails");
1436 if (!check_object(s
, slab
, object
, SLUB_RED_INACTIVE
))
1442 static noinline
bool alloc_debug_processing(struct kmem_cache
*s
,
1443 struct slab
*slab
, void *object
, int orig_size
)
1445 if (s
->flags
& SLAB_CONSISTENCY_CHECKS
) {
1446 if (!alloc_consistency_checks(s
, slab
, object
))
1450 /* Success. Perform special debug activities for allocs */
1451 trace(s
, slab
, object
, 1);
1452 set_orig_size(s
, object
, orig_size
);
1453 init_object(s
, object
, SLUB_RED_ACTIVE
);
1457 if (folio_test_slab(slab_folio(slab
))) {
 * If this is a slab page then let's do the best we can
 * to avoid issues in the future. Marking all objects
 * as used avoids touching the remaining objects.
1463 slab_fix(s
, "Marking all objects used");
1464 slab
->inuse
= slab
->objects
;
1465 slab
->freelist
= NULL
;
1470 static inline int free_consistency_checks(struct kmem_cache
*s
,
1471 struct slab
*slab
, void *object
, unsigned long addr
)
1473 if (!check_valid_pointer(s
, slab
, object
)) {
1474 slab_err(s
, slab
, "Invalid object pointer 0x%p", object
);
1478 if (on_freelist(s
, slab
, object
)) {
1479 object_err(s
, slab
, object
, "Object already free");
1483 if (!check_object(s
, slab
, object
, SLUB_RED_ACTIVE
))
1486 if (unlikely(s
!= slab
->slab_cache
)) {
1487 if (!folio_test_slab(slab_folio(slab
))) {
1488 slab_err(s
, slab
, "Attempt to free object(0x%p) outside of slab",
1490 } else if (!slab
->slab_cache
) {
1491 pr_err("SLUB <none>: no slab for object 0x%p.\n",
1495 object_err(s
, slab
, object
,
1496 "page slab pointer corrupt.");
/*
 * Parse a block of slub_debug options. Blocks are delimited by ';'
 *
 * @str: start of block
 * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
 * @slabs: return start of list of slabs, or NULL when there's no list
 * @init: assume this is initial parsing and not per-kmem-create parsing
 *
 * returns the start of next block if there's any, or NULL
 */
1513 parse_slub_debug_flags(char *str
, slab_flags_t
*flags
, char **slabs
, bool init
)
1515 bool higher_order_disable
= false;
1517 /* Skip any completely empty blocks */
1518 while (*str
&& *str
== ';')
1523 * No options but restriction on slabs. This means full
1524 * debugging for slabs matching a pattern.
1526 *flags
= DEBUG_DEFAULT_FLAGS
;
1531 /* Determine which debug features should be switched on */
1532 for (; *str
&& *str
!= ',' && *str
!= ';'; str
++) {
1533 switch (tolower(*str
)) {
1538 *flags
|= SLAB_CONSISTENCY_CHECKS
;
1541 *flags
|= SLAB_RED_ZONE
;
1544 *flags
|= SLAB_POISON
;
1547 *flags
|= SLAB_STORE_USER
;
1550 *flags
|= SLAB_TRACE
;
1553 *flags
|= SLAB_FAILSLAB
;
1557 * Avoid enabling debugging on caches if its minimum
1558 * order would increase as a result.
1560 higher_order_disable
= true;
1564 pr_err("slub_debug option '%c' unknown. skipped\n", *str
);
1573 /* Skip over the slab list */
1574 while (*str
&& *str
!= ';')
1577 /* Skip any completely empty blocks */
1578 while (*str
&& *str
== ';')
1581 if (init
&& higher_order_disable
)
1582 disable_higher_order_debug
= 1;
1590 static int __init
setup_slub_debug(char *str
)
1593 slab_flags_t global_flags
;
1596 bool global_slub_debug_changed
= false;
1597 bool slab_list_specified
= false;
1599 global_flags
= DEBUG_DEFAULT_FLAGS
;
1600 if (*str
++ != '=' || !*str
)
1602 * No options specified. Switch on full debugging.
1608 str
= parse_slub_debug_flags(str
, &flags
, &slab_list
, true);
1611 global_flags
= flags
;
1612 global_slub_debug_changed
= true;
1614 slab_list_specified
= true;
1615 if (flags
& SLAB_STORE_USER
)
1616 stack_depot_request_early_init();
 * For backwards compatibility, a single list of flags with a list of
 * slabs means debugging is only changed for those slabs, so the global
 * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
 * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
 * long as there is no option specifying flags without a slab list.
1627 if (slab_list_specified
) {
1628 if (!global_slub_debug_changed
)
1629 global_flags
= slub_debug
;
1630 slub_debug_string
= saved_str
;
1633 slub_debug
= global_flags
;
1634 if (slub_debug
& SLAB_STORE_USER
)
1635 stack_depot_request_early_init();
1636 if (slub_debug
!= 0 || slub_debug_string
)
1637 static_branch_enable(&slub_debug_enabled
);
1639 static_branch_disable(&slub_debug_enabled
);
1640 if ((static_branch_unlikely(&init_on_alloc
) ||
1641 static_branch_unlikely(&init_on_free
)) &&
1642 (slub_debug
& SLAB_POISON
))
1643 pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
1647 __setup("slub_debug", setup_slub_debug
);
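/*
 * Usage sketch (examples only, inferred from the option letters handled in
 * parse_slub_debug_flags() above; see the slub documentation,
 * Documentation/mm/slub.rst, for the definitive list): boot-time forms of
 * the slub_debug= parameter parsed by setup_slub_debug().
 *
 *	slub_debug			full debugging for all caches
 *	slub_debug=P			poisoning only, all caches
 *	slub_debug=U,kmalloc-64		user tracking for kmalloc-64 only
 *	slub_debug=F,dentry;Z,kmalloc-*	consistency checks for dentry,
 *					red zoning for the kmalloc caches
 *	slub_debug=O			switch debugging off for caches that
 *					would otherwise need a higher minimum
 *					order for the metadata
 */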
/*
 * kmem_cache_flags - apply debugging options to the cache
 * @object_size:	the size of an object without meta data
 * @flags:		flags to set
 * @name:		name of the cache
 *
 * Debug option(s) are applied to @flags. In addition to the debug
 * option(s), if a slab name (or multiple) is specified i.e.
 * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
 * then only the selected slabs will receive the debug option(s).
 */
1660 slab_flags_t
kmem_cache_flags(unsigned int object_size
,
1661 slab_flags_t flags
, const char *name
)
1666 slab_flags_t block_flags
;
1667 slab_flags_t slub_debug_local
= slub_debug
;
1669 if (flags
& SLAB_NO_USER_FLAGS
)
1673 * If the slab cache is for debugging (e.g. kmemleak) then
1674 * don't store user (stack trace) information by default,
1675 * but let the user enable it via the command line below.
1677 if (flags
& SLAB_NOLEAKTRACE
)
1678 slub_debug_local
&= ~SLAB_STORE_USER
;
1681 next_block
= slub_debug_string
;
1682 /* Go through all blocks of debug options, see if any matches our slab's name */
1683 while (next_block
) {
1684 next_block
= parse_slub_debug_flags(next_block
, &block_flags
, &iter
, false);
1687 /* Found a block that has a slab list, search it */
1692 end
= strchrnul(iter
, ',');
1693 if (next_block
&& next_block
< end
)
1694 end
= next_block
- 1;
1696 glob
= strnchr(iter
, end
- iter
, '*');
1698 cmplen
= glob
- iter
;
1700 cmplen
= max_t(size_t, len
, (end
- iter
));
1702 if (!strncmp(name
, iter
, cmplen
)) {
1703 flags
|= block_flags
;
1707 if (!*end
|| *end
== ';')
1713 return flags
| slub_debug_local
;
1715 #else /* !CONFIG_SLUB_DEBUG */
1716 static inline void setup_object_debug(struct kmem_cache
*s
, void *object
) {}
1718 void setup_slab_debug(struct kmem_cache
*s
, struct slab
*slab
, void *addr
) {}
1720 static inline bool alloc_debug_processing(struct kmem_cache
*s
,
1721 struct slab
*slab
, void *object
, int orig_size
) { return true; }
1723 static inline bool free_debug_processing(struct kmem_cache
*s
,
1724 struct slab
*slab
, void *head
, void *tail
, int *bulk_cnt
,
1725 unsigned long addr
, depot_stack_handle_t handle
) { return true; }
1727 static inline void slab_pad_check(struct kmem_cache
*s
, struct slab
*slab
) {}
1728 static inline int check_object(struct kmem_cache
*s
, struct slab
*slab
,
1729 void *object
, u8 val
) { return 1; }
1730 static inline depot_stack_handle_t
set_track_prepare(void) { return 0; }
1731 static inline void set_track(struct kmem_cache
*s
, void *object
,
1732 enum track_item alloc
, unsigned long addr
) {}
1733 static inline void add_full(struct kmem_cache
*s
, struct kmem_cache_node
*n
,
1734 struct slab
*slab
) {}
1735 static inline void remove_full(struct kmem_cache
*s
, struct kmem_cache_node
*n
,
1736 struct slab
*slab
) {}
1737 slab_flags_t
kmem_cache_flags(unsigned int object_size
,
1738 slab_flags_t flags
, const char *name
)
1742 #define slub_debug 0
1744 #define disable_higher_order_debug 0
1746 static inline unsigned long node_nr_slabs(struct kmem_cache_node
*n
)
1748 static inline void inc_slabs_node(struct kmem_cache
*s
, int node
,
1750 static inline void dec_slabs_node(struct kmem_cache
*s
, int node
,
1753 #ifndef CONFIG_SLUB_TINY
1754 static bool freelist_corrupted(struct kmem_cache
*s
, struct slab
*slab
,
1755 void **freelist
, void *nextfree
)
1760 #endif /* CONFIG_SLUB_DEBUG */
1763 * Hooks for other subsystems that check memory allocations. In a typical
1764 * production configuration these hooks all should produce no code at all.
1766 static __always_inline
bool slab_free_hook(struct kmem_cache
*s
,
1769 kmemleak_free_recursive(x
, s
->flags
);
1770 kmsan_slab_free(s
, x
);
1772 debug_check_no_locks_freed(x
, s
->object_size
);
1774 if (!(s
->flags
& SLAB_DEBUG_OBJECTS
))
1775 debug_check_no_obj_freed(x
, s
->object_size
);
1777 /* Use KCSAN to help debug racy use-after-free. */
1778 if (!(s
->flags
& SLAB_TYPESAFE_BY_RCU
))
1779 __kcsan_check_access(x
, s
->object_size
,
1780 KCSAN_ACCESS_WRITE
| KCSAN_ACCESS_ASSERT
);
1783 * As memory initialization might be integrated into KASAN,
1784 * kasan_slab_free and initialization memset's must be
1785 * kept together to avoid discrepancies in behavior.
1787 * The initialization memset's clear the object and the metadata,
1788 * but don't touch the SLAB redzone.
1793 if (!kasan_has_integrated_init())
1794 memset(kasan_reset_tag(x
), 0, s
->object_size
);
1795 rsize
= (s
->flags
& SLAB_RED_ZONE
) ? s
->red_left_pad
: 0;
1796 memset((char *)kasan_reset_tag(x
) + s
->inuse
, 0,
1797 s
->size
- s
->inuse
- rsize
);
1799 /* KASAN might put x into memory quarantine, delaying its reuse. */
1800 return kasan_slab_free(s
, x
, init
);
1803 static inline bool slab_free_freelist_hook(struct kmem_cache
*s
,
1804 void **head
, void **tail
,
1810 void *old_tail
= *tail
? *tail
: *head
;
1812 if (is_kfence_address(next
)) {
1813 slab_free_hook(s
, next
, false);
1817 /* Head and tail of the reconstructed freelist */
1823 next
= get_freepointer(s
, object
);
1825 /* If object's reuse doesn't have to be delayed */
1826 if (!slab_free_hook(s
, object
, slab_want_init_on_free(s
))) {
1827 /* Move object to the new freelist */
1828 set_freepointer(s
, object
, *head
);
1834 * Adjust the reconstructed freelist depth
1835 * accordingly if object's reuse is delayed.
1839 } while (object
!= old_tail
);
1844 return *head
!= NULL
;
1847 static void *setup_object(struct kmem_cache
*s
, void *object
)
1849 setup_object_debug(s
, object
);
1850 object
= kasan_init_slab_obj(s
, object
);
1851 if (unlikely(s
->ctor
)) {
1852 kasan_unpoison_object_data(s
, object
);
1854 kasan_poison_object_data(s
, object
);
1860 * Slab allocation and freeing
1862 static inline struct slab
*alloc_slab_page(gfp_t flags
, int node
,
1863 struct kmem_cache_order_objects oo
)
1865 struct folio
*folio
;
1867 unsigned int order
= oo_order(oo
);
1869 if (node
== NUMA_NO_NODE
)
1870 folio
= (struct folio
*)alloc_pages(flags
, order
);
1872 folio
= (struct folio
*)__alloc_pages_node(node
, flags
, order
);
1877 slab
= folio_slab(folio
);
1878 __folio_set_slab(folio
);
1879 /* Make the flag visible before any changes to folio->mapping */
1881 if (folio_is_pfmemalloc(folio
))
1882 slab_set_pfmemalloc(slab
);
1887 #ifdef CONFIG_SLAB_FREELIST_RANDOM
1888 /* Pre-initialize the random sequence cache */
1889 static int init_cache_random_seq(struct kmem_cache
*s
)
1891 unsigned int count
= oo_objects(s
->oo
);
1894 /* Bailout if already initialised */
1898 err
= cache_random_seq_create(s
, count
, GFP_KERNEL
);
1900 pr_err("SLUB: Unable to initialize free list for %s\n",
1905 /* Transform to an offset on the set of pages */
1906 if (s
->random_seq
) {
1909 for (i
= 0; i
< count
; i
++)
1910 s
->random_seq
[i
] *= s
->size
;
1915 /* Initialize each random sequence freelist per cache */
1916 static void __init
init_freelist_randomization(void)
1918 struct kmem_cache
*s
;
1920 mutex_lock(&slab_mutex
);
1922 list_for_each_entry(s
, &slab_caches
, list
)
1923 init_cache_random_seq(s
);
1925 mutex_unlock(&slab_mutex
);
/* Get the next entry from the pre-computed randomized freelist sequence */
1929 static void *next_freelist_entry(struct kmem_cache
*s
, struct slab
*slab
,
1930 unsigned long *pos
, void *start
,
1931 unsigned long page_limit
,
1932 unsigned long freelist_count
)
1937 * If the target page allocation failed, the number of objects on the
1938 * page might be smaller than the usual size defined by the cache.
1941 idx
= s
->random_seq
[*pos
];
1943 if (*pos
>= freelist_count
)
1945 } while (unlikely(idx
>= page_limit
));
1947 return (char *)start
+ idx
;
/* Shuffle the singly linked freelist based on a random pre-computed sequence */
1951 static bool shuffle_freelist(struct kmem_cache
*s
, struct slab
*slab
)
1956 unsigned long idx
, pos
, page_limit
, freelist_count
;
1958 if (slab
->objects
< 2 || !s
->random_seq
)
1961 freelist_count
= oo_objects(s
->oo
);
1962 pos
= get_random_u32_below(freelist_count
);
1964 page_limit
= slab
->objects
* s
->size
;
1965 start
= fixup_red_left(s
, slab_address(slab
));
1967 /* First entry is used as the base of the freelist */
1968 cur
= next_freelist_entry(s
, slab
, &pos
, start
, page_limit
,
1970 cur
= setup_object(s
, cur
);
1971 slab
->freelist
= cur
;
1973 for (idx
= 1; idx
< slab
->objects
; idx
++) {
1974 next
= next_freelist_entry(s
, slab
, &pos
, start
, page_limit
,
1976 next
= setup_object(s
, next
);
1977 set_freepointer(s
, cur
, next
);
1980 set_freepointer(s
, cur
, NULL
);
1985 static inline int init_cache_random_seq(struct kmem_cache
*s
)
1989 static inline void init_freelist_randomization(void) { }
1990 static inline bool shuffle_freelist(struct kmem_cache
*s
, struct slab
*slab
)
1994 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
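/*
 * Illustrative sketch (standalone C, not kernel code): linking objects in
 * the order given by a precomputed random sequence, the way
 * shuffle_freelist()/next_freelist_entry() above do. The sequence holds
 * offsets pre-scaled by the object size; the sizes and the permutation are
 * invented for the example.
 *
 *	#include <assert.h>
 *	#include <stdalign.h>
 *	#include <stddef.h>
 *
 *	int main(void)
 *	{
 *		enum { SIZE = 64, NR = 4 };
 *		alignas(void *) unsigned char slab[NR * SIZE];
 *		unsigned int seq[NR] = { 2 * SIZE, 0 * SIZE, 3 * SIZE, 1 * SIZE };
 *		unsigned char *freelist = NULL, *prev = NULL;
 *
 *		for (unsigned int pos = 0; pos < NR; pos++) {
 *			unsigned char *obj = slab + seq[pos];
 *
 *			if (!prev)
 *				freelist = obj;		// slab->freelist
 *			else
 *				*(void **)prev = obj;	// set_freepointer()
 *			prev = obj;
 *		}
 *		*(void **)prev = NULL;			// terminate the chain
 *
 *		assert(freelist == slab + 2 * SIZE);
 *		return 0;
 *	}
 */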
1996 static struct slab
*allocate_slab(struct kmem_cache
*s
, gfp_t flags
, int node
)
1999 struct kmem_cache_order_objects oo
= s
->oo
;
2001 void *start
, *p
, *next
;
2005 flags
&= gfp_allowed_mask
;
2007 flags
|= s
->allocflags
;
 * Let the initial higher-order allocation fail under memory pressure
 * so we fall back to the minimum order allocation.
2013 alloc_gfp
= (flags
| __GFP_NOWARN
| __GFP_NORETRY
) & ~__GFP_NOFAIL
;
2014 if ((alloc_gfp
& __GFP_DIRECT_RECLAIM
) && oo_order(oo
) > oo_order(s
->min
))
2015 alloc_gfp
= (alloc_gfp
| __GFP_NOMEMALLOC
) & ~__GFP_RECLAIM
;
2017 slab
= alloc_slab_page(alloc_gfp
, node
, oo
);
2018 if (unlikely(!slab
)) {
2022 * Allocation may have failed due to fragmentation.
2023 * Try a lower order alloc if possible
2025 slab
= alloc_slab_page(alloc_gfp
, node
, oo
);
2026 if (unlikely(!slab
))
2028 stat(s
, ORDER_FALLBACK
);
2031 slab
->objects
= oo_objects(oo
);
2035 account_slab(slab
, oo_order(oo
), s
, flags
);
2037 slab
->slab_cache
= s
;
2039 kasan_poison_slab(slab
);
2041 start
= slab_address(slab
);
2043 setup_slab_debug(s
, slab
, start
);
2045 shuffle
= shuffle_freelist(s
, slab
);
2048 start
= fixup_red_left(s
, start
);
2049 start
= setup_object(s
, start
);
2050 slab
->freelist
= start
;
2051 for (idx
= 0, p
= start
; idx
< slab
->objects
- 1; idx
++) {
2053 next
= setup_object(s
, next
);
2054 set_freepointer(s
, p
, next
);
2057 set_freepointer(s
, p
, NULL
);
2063 static struct slab
*new_slab(struct kmem_cache
*s
, gfp_t flags
, int node
)
2065 if (unlikely(flags
& GFP_SLAB_BUG_MASK
))
2066 flags
= kmalloc_fix_flags(flags
);
2068 WARN_ON_ONCE(s
->ctor
&& (flags
& __GFP_ZERO
));
2070 return allocate_slab(s
,
2071 flags
& (GFP_RECLAIM_MASK
| GFP_CONSTRAINT_MASK
), node
);
2074 static void __free_slab(struct kmem_cache
*s
, struct slab
*slab
)
2076 struct folio
*folio
= slab_folio(slab
);
2077 int order
= folio_order(folio
);
2078 int pages
= 1 << order
;
2080 __slab_clear_pfmemalloc(slab
);
2081 folio
->mapping
= NULL
;
2082 /* Make the mapping reset visible before clearing the flag */
2084 __folio_clear_slab(folio
);
2085 mm_account_reclaimed_pages(pages
);
2086 unaccount_slab(slab
, order
, s
);
2087 __free_pages(&folio
->page
, order
);
2090 static void rcu_free_slab(struct rcu_head
*h
)
2092 struct slab
*slab
= container_of(h
, struct slab
, rcu_head
);
2094 __free_slab(slab
->slab_cache
, slab
);
2097 static void free_slab(struct kmem_cache
*s
, struct slab
*slab
)
2099 if (kmem_cache_debug_flags(s
, SLAB_CONSISTENCY_CHECKS
)) {
2102 slab_pad_check(s
, slab
);
2103 for_each_object(p
, s
, slab_address(slab
), slab
->objects
)
2104 check_object(s
, slab
, p
, SLUB_RED_INACTIVE
);
2107 if (unlikely(s
->flags
& SLAB_TYPESAFE_BY_RCU
))
2108 call_rcu(&slab
->rcu_head
, rcu_free_slab
);
2110 __free_slab(s
, slab
);
2113 static void discard_slab(struct kmem_cache
*s
, struct slab
*slab
)
2115 dec_slabs_node(s
, slab_nid(slab
), slab
->objects
);
2120 * Management of partially allocated slabs.
2123 __add_partial(struct kmem_cache_node
*n
, struct slab
*slab
, int tail
)
2126 if (tail
== DEACTIVATE_TO_TAIL
)
2127 list_add_tail(&slab
->slab_list
, &n
->partial
);
2129 list_add(&slab
->slab_list
, &n
->partial
);
2132 static inline void add_partial(struct kmem_cache_node
*n
,
2133 struct slab
*slab
, int tail
)
2135 lockdep_assert_held(&n
->list_lock
);
2136 __add_partial(n
, slab
, tail
);
2139 static inline void remove_partial(struct kmem_cache_node
*n
,
2142 lockdep_assert_held(&n
->list_lock
);
2143 list_del(&slab
->slab_list
);
/*
 * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
 * slab from the n->partial list. Remove only a single object from the slab, do
 * the alloc_debug_processing() checks and leave the slab on the list, or move
 * it to full list if it was the last free object.
 */
static void *alloc_single_from_partial(struct kmem_cache *s,
		struct kmem_cache_node *n, struct slab *slab, int orig_size)
{
	void *object;

	lockdep_assert_held(&n->list_lock);

	object = slab->freelist;
	slab->freelist = get_freepointer(s, object);
	slab->inuse++;

	if (!alloc_debug_processing(s, slab, object, orig_size)) {
		remove_partial(n, slab);
		return NULL;
	}

	if (slab->inuse == slab->objects) {
		remove_partial(n, slab);
		add_full(s, n, slab);
	}

	return object;
}

/*
 * Called only for kmem_cache_debug() caches to allocate from a freshly
 * allocated slab. Allocate a single object instead of whole freelist
 * and put the slab to the partial (or full) list.
 */
static void *alloc_single_from_new_slab(struct kmem_cache *s,
					struct slab *slab, int orig_size)
{
	int nid = slab_nid(slab);
	struct kmem_cache_node *n = get_node(s, nid);
	unsigned long flags;
	void *object;

	object = slab->freelist;
	slab->freelist = get_freepointer(s, object);
	slab->inuse = 1;

	if (!alloc_debug_processing(s, slab, object, orig_size))
		/*
		 * It's not really expected that this would fail on a
		 * freshly allocated slab, but a concurrent memory
		 * corruption in theory could cause that.
		 */
		return NULL;

	spin_lock_irqsave(&n->list_lock, flags);

	if (slab->inuse == slab->objects)
		add_full(s, n, slab);
	else
		add_partial(n, slab, DEACTIVATE_TO_HEAD);

	inc_slabs_node(s, nid, slab->objects);
	spin_unlock_irqrestore(&n->list_lock, flags);

	return object;
}

/*
 * Remove slab from the partial list, freeze it and
 * return the pointer to the freelist.
 *
 * Returns a list of objects or NULL if it fails.
 */
static inline void *acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct slab *slab,
		int mode)
{
	void *freelist;
	unsigned long counters;
	struct slab new;

	lockdep_assert_held(&n->list_lock);

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	freelist = slab->freelist;
	counters = slab->counters;
	new.counters = counters;
	if (mode) {
		new.inuse = slab->objects;
		new.freelist = NULL;
	} else {
		new.freelist = freelist;
	}

	VM_BUG_ON(new.frozen);
	new.frozen = 1;

	if (!__slab_update_freelist(s, slab,
			freelist, counters,
			new.freelist, new.counters,
			"acquire_slab"))
		return NULL;

	remove_partial(n, slab);
	WARN_ON(!freelist);
	return freelist;
}
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
#else
static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
				   int drain) { }
#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);

/*
 * Try to allocate a partial slab from a specific node.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
			      struct partial_context *pc)
{
	struct slab *slab, *slab2;
	void *object = NULL;
	unsigned long flags;
	unsigned int partial_slabs = 0;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partial()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
		void *t;

		if (!pfmemalloc_match(slab, pc->flags))
			continue;

		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
			object = alloc_single_from_partial(s, n, slab,
							pc->orig_size);
			if (object)
				break;
			continue;
		}

		t = acquire_slab(s, n, slab, object == NULL);
		if (!t)
			break;

		if (!object) {
			*pc->slab = slab;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
			put_cpu_partial(s, slab, 0);
			stat(s, CPU_PARTIAL_NODE);
			partial_slabs++;
		}
#ifdef CONFIG_SLUB_CPU_PARTIAL
		if (!kmem_cache_has_cpu_partial(s)
			|| partial_slabs > s->cpu_partial_slabs / 2)
			break;
#else
		break;
#endif

	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return object;
}
/*
 * Get a slab from somewhere. Search in increasing NUMA distances.
 */
static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
	void *object;
	unsigned int cpuset_mems_cookie;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
	 * (which makes defrag_ratio = 1000) then every (well almost)
	 * allocation will first attempt to defrag slab caches on other nodes.
	 * This means scanning over all nodes to look for partial slabs which
	 * may be expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			if (n && cpuset_zone_allowed(zone, pc->flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, pc);
				if (object) {
					/*
					 * Don't check read_mems_allowed_retry()
					 * here - if mems_allowed was updated in
					 * parallel, that was a harmless race
					 * between allocation and the cpuset
					 * update
					 */
					return object;
				}
			}
		}
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif	/* CONFIG_NUMA */
	return NULL;
}

/*
 * Get a partial slab, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
	void *object;
	int searchnode = node;

	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();

	object = get_partial_node(s, get_node(s, searchnode), pc);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, pc);
}
#ifndef CONFIG_SLUB_TINY

#ifdef CONFIG_PREEMPTION
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by CONFIG_NR_CPUS.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported therefore also no need to check for
 * different cpus.
 */
#define TID_STEP 1
#endif /* CONFIG_PREEMPTION */

static inline unsigned long next_tid(unsigned long tid)
{
	return tid + TID_STEP;
}

#ifdef SLUB_DEBUG_CMPXCHG
static inline unsigned int tid_to_cpu(unsigned long tid)
{
	return tid % TID_STEP;
}

static inline unsigned long tid_to_event(unsigned long tid)
{
	return tid / TID_STEP;
}
#endif

static inline unsigned int init_tid(int cpu)
{
	return cpu;
}
static inline void note_cmpxchg_failure(const char *n,
		const struct kmem_cache *s, unsigned long tid)
{
#ifdef SLUB_DEBUG_CMPXCHG
	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);

	pr_info("%s %s: cmpxchg redo ", n, s->name);

#ifdef CONFIG_PREEMPTION
	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
		pr_warn("due to cpu change %d -> %d\n",
			tid_to_cpu(tid), tid_to_cpu(actual_tid));
	else
#endif
	if (tid_to_event(tid) != tid_to_event(actual_tid))
		pr_warn("due to cpu running other code. Event %ld->%ld\n",
			tid_to_event(tid), tid_to_event(actual_tid));
	else
		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
			actual_tid, tid, next_tid(tid));
#endif
	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}

static void init_kmem_cache_cpus(struct kmem_cache *s)
{
	int cpu;
	struct kmem_cache_cpu *c;

	for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(s->cpu_slab, cpu);
		local_lock_init(&c->lock);
		c->tid = init_tid(cpu);
	}
}
/*
 * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
 * unfreezes the slabs and puts it on the proper list.
 * Assumes the slab has been already safely taken away from kmem_cache_cpu
 * by the caller.
 */
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
			    void *freelist)
{
	enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST };
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	int free_delta = 0;
	enum slab_modes mode = M_NONE;
	void *nextfree, *freelist_iter, *freelist_tail;
	int tail = DEACTIVATE_TO_HEAD;
	unsigned long flags = 0;
	struct slab new;
	struct slab old;

	if (slab->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Stage one: Count the objects on cpu's freelist as free_delta and
	 * remember the last object in freelist_tail for later splicing.
	 */
	freelist_tail = NULL;
	freelist_iter = freelist;
	while (freelist_iter) {
		nextfree = get_freepointer(s, freelist_iter);

		/*
		 * If 'nextfree' is invalid, it is possible that the object at
		 * 'freelist_iter' is already corrupted. So isolate all objects
		 * starting at 'freelist_iter' by skipping them.
		 */
		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
			break;

		freelist_tail = freelist_iter;
		free_delta++;

		freelist_iter = nextfree;
	}

	/*
	 * Stage two: Unfreeze the slab while splicing the per-cpu
	 * freelist to the head of slab's freelist.
	 *
	 * Ensure that the slab is unfrozen while the list presence
	 * reflects the actual number of objects during unfreeze.
	 *
	 * We first perform cmpxchg holding lock and insert to list
	 * when it succeed. If there is mismatch then the slab is not
	 * unfrozen and number of objects in the slab may have changed.
	 * Then release lock and retry cmpxchg again.
	 */
redo:

	old.freelist = READ_ONCE(slab->freelist);
	old.counters = READ_ONCE(slab->counters);
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist_tail) {
		new.inuse -= free_delta;
		set_freepointer(s, freelist_tail, old.freelist);
		new.freelist = freelist;
	} else {
		new.freelist = old.freelist;
	}

	new.frozen = 0;

	if (!new.inuse && n->nr_partial >= s->min_partial) {
		mode = M_FREE;
	} else if (new.freelist) {
		mode = M_PARTIAL;
		/*
		 * Taking the spinlock removes the possibility that
		 * acquire_slab() will see a slab that is frozen
		 */
		spin_lock_irqsave(&n->list_lock, flags);
	} else {
		mode = M_FULL_NOLIST;
	}


	if (!slab_update_freelist(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab")) {
		if (mode == M_PARTIAL)
			spin_unlock_irqrestore(&n->list_lock, flags);
		goto redo;
	}


	if (mode == M_PARTIAL) {
		add_partial(n, slab, tail);
		spin_unlock_irqrestore(&n->list_lock, flags);
		stat(s, tail);
	} else if (mode == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	} else if (mode == M_FULL_NOLIST) {
		stat(s, DEACTIVATE_FULL);
	}
}
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
{
	struct kmem_cache_node *n = NULL, *n2 = NULL;
	struct slab *slab, *slab_to_discard = NULL;
	unsigned long flags = 0;

	while (partial_slab) {
		struct slab new;
		struct slab old;

		slab = partial_slab;
		partial_slab = slab->next;

		n2 = get_node(s, slab_nid(slab));
		if (n != n2) {
			if (n)
				spin_unlock_irqrestore(&n->list_lock, flags);

			n = n2;
			spin_lock_irqsave(&n->list_lock, flags);
		}

		do {

			old.freelist = slab->freelist;
			old.counters = slab->counters;
			VM_BUG_ON(!old.frozen);

			new.counters = old.counters;
			new.freelist = old.freelist;

			new.frozen = 0;

		} while (!__slab_update_freelist(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"));

		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
			slab->next = slab_to_discard;
			slab_to_discard = slab;
		} else {
			add_partial(n, slab, DEACTIVATE_TO_TAIL);
			stat(s, FREE_ADD_PARTIAL);
		}
	}

	if (n)
		spin_unlock_irqrestore(&n->list_lock, flags);

	while (slab_to_discard) {
		slab = slab_to_discard;
		slab_to_discard = slab_to_discard->next;

		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	}
}

/*
 * Unfreeze all the cpu partial slabs.
 */
static void unfreeze_partials(struct kmem_cache *s)
{
	struct slab *partial_slab;
	unsigned long flags;

	local_lock_irqsave(&s->cpu_slab->lock, flags);
	partial_slab = this_cpu_read(s->cpu_slab->partial);
	this_cpu_write(s->cpu_slab->partial, NULL);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);

	if (partial_slab)
		__unfreeze_partials(s, partial_slab);
}

static void unfreeze_partials_cpu(struct kmem_cache *s,
				  struct kmem_cache_cpu *c)
{
	struct slab *partial_slab;

	partial_slab = slub_percpu_partial(c);
	c->partial = NULL;

	if (partial_slab)
		__unfreeze_partials(s, partial_slab);
}

/*
 * Put a slab that was just frozen (in __slab_free|get_partial_node) into a
 * partial slab slot if available.
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 */
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
{
	struct slab *oldslab;
	struct slab *slab_to_unfreeze = NULL;
	unsigned long flags;
	int slabs = 0;

	local_lock_irqsave(&s->cpu_slab->lock, flags);

	oldslab = this_cpu_read(s->cpu_slab->partial);

	if (oldslab) {
		if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
			/*
			 * Partial array is full. Move the existing set to the
			 * per node partial list. Postpone the actual unfreezing
			 * outside of the critical section.
			 */
			slab_to_unfreeze = oldslab;
			oldslab = NULL;
		} else {
			slabs = oldslab->slabs;
		}
	}

	slabs++;

	slab->slabs = slabs;
	slab->next = oldslab;

	this_cpu_write(s->cpu_slab->partial, slab);

	local_unlock_irqrestore(&s->cpu_slab->lock, flags);

	if (slab_to_unfreeze) {
		__unfreeze_partials(s, slab_to_unfreeze);
		stat(s, CPU_PARTIAL_DRAIN);
	}
}

#else	/* CONFIG_SLUB_CPU_PARTIAL */

static inline void unfreeze_partials(struct kmem_cache *s) { }
static inline void unfreeze_partials_cpu(struct kmem_cache *s,
				  struct kmem_cache_cpu *c) { }

#endif	/* CONFIG_SLUB_CPU_PARTIAL */
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	unsigned long flags;
	struct slab *slab;
	void *freelist;

	local_lock_irqsave(&s->cpu_slab->lock, flags);

	slab = c->slab;
	freelist = c->freelist;

	c->slab = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);

	local_unlock_irqrestore(&s->cpu_slab->lock, flags);

	if (slab) {
		deactivate_slab(s, slab, freelist);
		stat(s, CPUSLAB_FLUSH);
	}
}

static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
	void *freelist = c->freelist;
	struct slab *slab = c->slab;

	c->slab = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);

	if (slab) {
		deactivate_slab(s, slab, freelist);
		stat(s, CPUSLAB_FLUSH);
	}

	unfreeze_partials_cpu(s, c);
}

struct slub_flush_work {
	struct work_struct work;
	struct kmem_cache *s;
	bool skip;
};

/*
 * Flush cpu slab.
 *
 * Called from CPU work handler with migration disabled.
 */
static void flush_cpu_slab(struct work_struct *w)
{
	struct kmem_cache *s;
	struct kmem_cache_cpu *c;
	struct slub_flush_work *sfw;

	sfw = container_of(w, struct slub_flush_work, work);

	s = sfw->s;
	c = this_cpu_ptr(s->cpu_slab);

	if (c->slab)
		flush_slab(s, c);

	unfreeze_partials(s);
}

static bool has_cpu_slab(int cpu, struct kmem_cache *s)
{
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

	return c->slab || slub_percpu_partial(c);
}

static DEFINE_MUTEX(flush_lock);
static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);

static void flush_all_cpus_locked(struct kmem_cache *s)
{
	struct slub_flush_work *sfw;
	unsigned int cpu;

	lockdep_assert_cpus_held();
	mutex_lock(&flush_lock);

	for_each_online_cpu(cpu) {
		sfw = &per_cpu(slub_flush, cpu);
		if (!has_cpu_slab(cpu, s)) {
			sfw->skip = true;
			continue;
		}
		INIT_WORK(&sfw->work, flush_cpu_slab);
		sfw->skip = false;
		sfw->s = s;
		queue_work_on(cpu, flushwq, &sfw->work);
	}

	for_each_online_cpu(cpu) {
		sfw = &per_cpu(slub_flush, cpu);
		if (sfw->skip)
			continue;
		flush_work(&sfw->work);
	}

	mutex_unlock(&flush_lock);
}

static void flush_all(struct kmem_cache *s)
{
	cpus_read_lock();
	flush_all_cpus_locked(s);
	cpus_read_unlock();
}

/*
 * Use the cpu notifier to insure that the cpu slabs are flushed when
 * necessary.
 */
static int slub_cpu_dead(unsigned int cpu)
{
	struct kmem_cache *s;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list)
		__flush_cpu_slab(s, cpu);
	mutex_unlock(&slab_mutex);
	return 0;
}

#else /* CONFIG_SLUB_TINY */
static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
static inline void flush_all(struct kmem_cache *s) { }
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
#endif /* CONFIG_SLUB_TINY */
/*
 * Check if the objects in a per cpu structure fit numa
 * locality expectations.
 */
static inline int node_match(struct slab *slab, int node)
{
#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE && slab_nid(slab) != node)
		return 0;
#endif
	return 1;
}

#ifdef CONFIG_SLUB_DEBUG
static int count_free(struct slab *slab)
{
	return slab->objects - slab->inuse;
}

static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->total_objects);
}

/* Supports checking bulk free of a constructed freelist */
static inline bool free_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *head, void *tail, int *bulk_cnt,
	unsigned long addr, depot_stack_handle_t handle)
{
	bool checks_ok = false;
	void *object = head;
	int cnt = 0;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!check_slab(s, slab))
			goto out;
	}

	if (slab->inuse < *bulk_cnt) {
		slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
			 slab->inuse, *bulk_cnt);
		goto out;
	}

next_object:

	if (++cnt > *bulk_cnt)
		goto out_cnt;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!free_consistency_checks(s, slab, object, addr))
			goto out;
	}

	if (s->flags & SLAB_STORE_USER)
		set_track_update(s, object, TRACK_FREE, addr, handle);
	trace(s, slab, object, 0);
	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
	init_object(s, object, SLUB_RED_INACTIVE);

	/* Reached end of constructed freelist yet? */
	if (object != tail) {
		object = get_freepointer(s, object);
		goto next_object;
	}
	checks_ok = true;

out_cnt:
	if (cnt != *bulk_cnt) {
		slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
			 *bulk_cnt, cnt);
		*bulk_cnt = cnt;
	}

out:

	if (!checks_ok)
		slab_fix(s, "Object at 0x%p not freed", object);

	return checks_ok;
}
#endif /* CONFIG_SLUB_DEBUG */
#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
static unsigned long count_partial(struct kmem_cache_node *n,
					int (*get_count)(struct slab *))
{
	unsigned long flags;
	unsigned long x = 0;
	struct slab *slab;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry(slab, &n->partial, slab_list)
		x += get_count(slab);
	spin_unlock_irqrestore(&n->list_lock, flags);
	return x;
}
#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */

#ifdef CONFIG_SLUB_DEBUG
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	int node;
	struct kmem_cache_node *n;

	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
		return;

	pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
		nid, gfpflags, &gfpflags);
	pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
		s->name, s->object_size, s->size, oo_order(s->oo),
		oo_order(s->min));

	if (oo_order(s->min) > get_order(s->object_size))
		pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
			s->name);

	for_each_kmem_cache_node(s, node, n) {
		unsigned long nr_slabs;
		unsigned long nr_objs;
		unsigned long nr_free;

		nr_free  = count_partial(n, count_free);
		nr_slabs = node_nr_slabs(n);
		nr_objs  = node_nr_objs(n);

		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
			node, nr_slabs, nr_objs, nr_free);
	}
}
#else /* CONFIG_SLUB_DEBUG */
static inline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
#endif

static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
{
	if (unlikely(slab_test_pfmemalloc(slab)))
		return gfp_pfmemalloc_allowed(gfpflags);

	return true;
}

#ifndef CONFIG_SLUB_TINY
static inline bool
__update_cpu_freelist_fast(struct kmem_cache *s,
			   void *freelist_old, void *freelist_new,
			   unsigned long tid)
{
	freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
	freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };

	return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
					     &old.full, new.full);
}

/*
 * Check the slab->freelist and either transfer the freelist to the
 * per cpu freelist or deactivate the slab.
 *
 * The slab is still frozen if the return value is not NULL.
 *
 * If this function returns NULL then the slab has been unfrozen.
 */
static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
	struct slab new;
	unsigned long counters;
	void *freelist;

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	do {
		freelist = slab->freelist;
		counters = slab->counters;

		new.counters = counters;
		VM_BUG_ON(!new.frozen);

		new.inuse = slab->objects;
		new.frozen = freelist != NULL;

	} while (!__slab_update_freelist(s, slab,
		freelist, counters,
		NULL, new.counters,
		"get_freelist"));

	return freelist;
}
/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
 *
 * Version of __slab_alloc to use when we know that preemption is
 * already disabled (which is the case for bulk allocation).
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *freelist;
	struct slab *slab;
	unsigned long flags;
	struct partial_context pc;

	stat(s, ALLOC_SLOWPATH);

reread_slab:

	slab = READ_ONCE(c->slab);
	if (!slab) {
		/*
		 * if the node is not online or has no normal memory, just
		 * ignore the node constraint
		 */
		if (unlikely(node != NUMA_NO_NODE &&
			     !node_isset(node, slab_nodes)))
			node = NUMA_NO_NODE;
		goto new_slab;
	}
redo:

	if (unlikely(!node_match(slab, node))) {
		/*
		 * same as above but node_match() being false already
		 * implies node != NUMA_NO_NODE
		 */
		if (!node_isset(node, slab_nodes)) {
			node = NUMA_NO_NODE;
		} else {
			stat(s, ALLOC_NODE_MISMATCH);
			goto deactivate_slab;
		}
	}

	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
		goto deactivate_slab;

	/* must check again c->slab in case we got preempted and it changed */
	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (unlikely(slab != c->slab)) {
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		goto reread_slab;
	}
	freelist = c->freelist;
	if (freelist)
		goto load_freelist;

	freelist = get_freelist(s, slab);

	if (!freelist) {
		c->slab = NULL;
		c->tid = next_tid(c->tid);
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}

	stat(s, ALLOC_REFILL);

load_freelist:

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	/*
	 * freelist is pointing to the list of objects to be used.
	 * slab is pointing to the slab from which the objects are obtained.
	 * That slab must be frozen for per cpu allocations to work.
	 */
	VM_BUG_ON(!c->slab->frozen);
	c->freelist = get_freepointer(s, freelist);
	c->tid = next_tid(c->tid);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
	return freelist;

deactivate_slab:

	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (slab != c->slab) {
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		goto reread_slab;
	}
	freelist = c->freelist;
	c->slab = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
	deactivate_slab(s, slab, freelist);

new_slab:

	if (slub_percpu_partial(c)) {
		local_lock_irqsave(&s->cpu_slab->lock, flags);
		if (unlikely(c->slab)) {
			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
			goto reread_slab;
		}
		if (unlikely(!slub_percpu_partial(c))) {
			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
			/* we were preempted and partial list got empty */
			goto new_objects;
		}

		slab = c->slab = slub_percpu_partial(c);
		slub_set_percpu_partial(c, slab);
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		stat(s, CPU_PARTIAL_ALLOC);
		goto redo;
	}

new_objects:

	pc.flags = gfpflags;
	pc.slab = &slab;
	pc.orig_size = orig_size;
	freelist = get_partial(s, node, &pc);
	if (freelist)
		goto check_new_slab;

	slub_put_cpu_ptr(s->cpu_slab);
	slab = new_slab(s, gfpflags, node);
	c = slub_get_cpu_ptr(s->cpu_slab);

	if (unlikely(!slab)) {
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	stat(s, ALLOC_SLAB);

	if (kmem_cache_debug(s)) {
		freelist = alloc_single_from_new_slab(s, slab, orig_size);

		if (unlikely(!freelist))
			goto new_objects;

		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr);

		return freelist;
	}

	/*
	 * No other reference to the slab yet so we can
	 * muck around with it freely without cmpxchg
	 */
	freelist = slab->freelist;
	slab->freelist = NULL;
	slab->inuse = slab->objects;
	slab->frozen = 1;

	inc_slabs_node(s, slab_nid(slab), slab->objects);

check_new_slab:

	if (kmem_cache_debug(s)) {
		/*
		 * For debug caches here we had to go through
		 * alloc_single_from_partial() so just store the tracking info
		 * and return the object
		 */
		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr);

		return freelist;
	}

	if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
		/*
		 * For !pfmemalloc_match() case we don't load freelist so that
		 * we don't make further mismatched allocations easier.
		 */
		deactivate_slab(s, slab, get_freepointer(s, freelist));
		return freelist;
	}

retry_load_slab:

	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (unlikely(c->slab)) {
		void *flush_freelist = c->freelist;
		struct slab *flush_slab = c->slab;

		c->slab = NULL;
		c->freelist = NULL;
		c->tid = next_tid(c->tid);

		local_unlock_irqrestore(&s->cpu_slab->lock, flags);

		deactivate_slab(s, flush_slab, flush_freelist);

		stat(s, CPUSLAB_FLUSH);

		goto retry_load_slab;
	}
	c->slab = slab;

	goto load_freelist;
}
/*
 * A wrapper for ___slab_alloc() for contexts where preemption is not yet
 * disabled. Compensates for possible cpu changes by refetching the per cpu area
 * pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *p;

#ifdef CONFIG_PREEMPT_COUNT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling preemption. Need to reload cpu area
	 * pointer.
	 */
	c = slub_get_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
	slub_put_cpu_ptr(s->cpu_slab);
#endif
	return p;
}

static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	struct kmem_cache_cpu *c;
	struct slab *slab;
	unsigned long tid;
	void *object;

redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
	 *
	 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
	 * same cpu. We read first the kmem_cache_cpu pointer and use it to read
	 * the tid. If we are preempted and switched to another cpu between the
	 * two reads, it's OK as the two are still associated with the same cpu
	 * and cmpxchg later will validate the cpu.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/*
	 * Irqless object alloc/free algorithm used here depends on sequence
	 * of fetching cpu_slab's data. tid should be fetched before anything
	 * on c to guarantee that object and slab associated with previous tid
	 * won't be used with current tid. If we fetch tid first, object and
	 * slab could be one associated with next tid and our alloc/free
	 * request will be failed. In this case, we will retry. So, no problem.
	 */
	barrier();

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */

	object = c->freelist;
	slab = c->slab;

	if (!USE_LOCKLESS_FAST_PATH() ||
	    unlikely(!object || !slab || !node_match(slab, node))) {
		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
	} else {
		void *next_object = get_freepointer_safe(s, object);

		/*
		 * The cmpxchg will only match if there was no additional
		 * operation and if we are on the right processor.
		 *
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
		 */
		if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}

	return object;
}
#else /* CONFIG_SLUB_TINY */
static void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	struct partial_context pc;
	struct slab *slab;
	void *object;

	pc.flags = gfpflags;
	pc.slab = &slab;
	pc.orig_size = orig_size;
	object = get_partial(s, node, &pc);

	if (object)
		return object;

	slab = new_slab(s, gfpflags, node);
	if (unlikely(!slab)) {
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	object = alloc_single_from_new_slab(s, slab, orig_size);

	return object;
}
#endif /* CONFIG_SLUB_TINY */

/*
 * If the object has been wiped upon free, make sure it's fully initialized by
 * zeroing out freelist pointer.
 */
static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
						   void *obj)
{
	if (unlikely(slab_want_init_on_free(s)) && obj)
		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
			0, sizeof(void *));
}
/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	void *object;
	struct obj_cgroup *objcg = NULL;
	bool init = false;

	s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
	if (!s)
		return NULL;

	object = kfence_alloc(s, orig_size, gfpflags);
	if (unlikely(object))
		goto out;

	object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);

	maybe_wipe_obj_freeptr(s, object);
	init = slab_want_init_on_alloc(gfpflags, s);

out:
	/*
	 * When init equals 'true', like for kzalloc() family, only
	 * @orig_size bytes might be zeroed instead of s->object_size
	 */
	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);

	return object;
}

static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
		gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
	return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size);
}

static __fastpath_inline
void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
			     gfp_t gfpflags)
{
	void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);

	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);

	return ret;
}

void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
	return __kmem_cache_alloc_lru(s, NULL, gfpflags);
}
EXPORT_SYMBOL(kmem_cache_alloc);
void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
			   gfp_t gfpflags)
{
	return __kmem_cache_alloc_lru(s, lru, gfpflags);
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);

void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
			      int node, size_t orig_size,
			      unsigned long caller)
{
	return slab_alloc_node(s, NULL, gfpflags, node,
			       caller, orig_size);
}

void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);

	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);

	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
static noinline void free_to_partial_list(
	struct kmem_cache *s, struct slab *slab,
	void *head, void *tail, int bulk_cnt,
	unsigned long addr)
{
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	struct slab *slab_free = NULL;
	int cnt = bulk_cnt;
	unsigned long flags;
	depot_stack_handle_t handle = 0;

	if (s->flags & SLAB_STORE_USER)
		handle = set_track_prepare();

	spin_lock_irqsave(&n->list_lock, flags);

	if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
		void *prior = slab->freelist;

		/* Perform the actual freeing while we still hold the locks */
		slab->inuse -= cnt;
		set_freepointer(s, tail, prior);
		slab->freelist = head;

		/*
		 * If the slab is empty, and node's partial list is full,
		 * it should be discarded anyway no matter it's on full or
		 * partial list.
		 */
		if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
			slab_free = slab;

		if (!prior) {
			/* was on full list */
			remove_full(s, n, slab);
			if (!slab_free) {
				add_partial(n, slab, DEACTIVATE_TO_TAIL);
				stat(s, FREE_ADD_PARTIAL);
			}
		} else if (slab_free) {
			remove_partial(n, slab);
			stat(s, FREE_REMOVE_PARTIAL);
		}
	}

	if (slab_free) {
		/*
		 * Update the counters while still holding n->list_lock to
		 * prevent spurious validation warnings
		 */
		dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);

	if (slab_free) {
		stat(s, FREE_SLAB);
		free_slab(s, slab_free);
	}
}

/*
 * Slow path handling. This may still be called frequently since objects
 * have a longer lifetime than the cpu slabs in most processing loads.
 *
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial slab
 * handling required then we can return immediately.
 */
static void __slab_free(struct kmem_cache *s, struct slab *slab,
			void *head, void *tail, int cnt,
			unsigned long addr)

{
	void *prior;
	int was_frozen;
	struct slab new;
	unsigned long counters;
	struct kmem_cache_node *n = NULL;
	unsigned long flags;

	stat(s, FREE_SLOWPATH);

	if (kfence_free(head))
		return;

	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
		free_to_partial_list(s, slab, head, tail, cnt, addr);
		return;
	}

	do {
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
		prior = slab->freelist;
		counters = slab->counters;
		set_freepointer(s, tail, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse -= cnt;
		if ((!new.inuse || !prior) && !was_frozen) {

			if (kmem_cache_has_cpu_partial(s) && !prior) {

				/*
				 * Slab was on no list before and will be
				 * partially empty
				 * We can defer the list move and instead
				 * freeze it.
				 */
				new.frozen = 1;

			} else { /* Needs to be taken off a list */

				n = get_node(s, slab_nid(slab));
				/*
				 * Speculatively acquire the list_lock.
				 * If the cmpxchg does not succeed then we may
				 * drop the list_lock without any processing.
				 *
				 * Otherwise the list_lock will synchronize with
				 * other processors updating the list of slabs.
				 */
				spin_lock_irqsave(&n->list_lock, flags);

			}
		}

	} while (!slab_update_freelist(s, slab,
		prior, counters,
		head, new.counters,
		"__slab_free"));

	if (likely(!n)) {

		if (likely(was_frozen)) {
			/*
			 * The list lock was not taken therefore no list
			 * activity can be necessary.
			 */
			stat(s, FREE_FROZEN);
		} else if (new.frozen) {
			/*
			 * If we just froze the slab then put it onto the
			 * per cpu partial list.
			 */
			put_cpu_partial(s, slab, 1);
			stat(s, CPU_PARTIAL_FREE);
		}

		return;
	}

	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;

	/*
	 * Objects left in the slab. If it was not on the partial list before
	 * then add it.
	 */
	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
		remove_full(s, n, slab);
		add_partial(n, slab, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return;

slab_empty:
	if (prior) {
		/*
		 * Slab on the partial list.
		 */
		remove_partial(n, slab);
		stat(s, FREE_REMOVE_PARTIAL);
	} else {
		/* Slab must be on the full list */
		remove_full(s, n, slab);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, slab);
}
#ifndef CONFIG_SLUB_TINY
/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 *
 * Bulk free of a freelist with several objects (all pointing to the
 * same slab) possible by specifying head and tail ptr, plus objects
 * count (cnt). Bulk free indicated by tail pointer being set.
 */
static __always_inline void do_slab_free(struct kmem_cache *s,
				struct slab *slab, void *head, void *tail,
				int cnt, unsigned long addr)
{
	void *tail_obj = tail ? : head;
	struct kmem_cache_cpu *c;
	unsigned long tid;
	void **freelist;

redo:
	/*
	 * Determine the currently cpus per cpu slab.
	 * The cpu may change afterward. However that does not matter since
	 * data is retrieved via this pointer. If we are on the same cpu
	 * during the cmpxchg then the free will succeed.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/* Same with comment on barrier() in slab_alloc_node() */
	barrier();

	if (unlikely(slab != c->slab)) {
		__slab_free(s, slab, head, tail_obj, cnt, addr);
		return;
	}

	if (USE_LOCKLESS_FAST_PATH()) {
		freelist = READ_ONCE(c->freelist);

		set_freepointer(s, tail_obj, freelist);

		if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
			note_cmpxchg_failure("slab_free", s, tid);
			goto redo;
		}
	} else {
		/* Update the free list under the local lock */
		local_lock(&s->cpu_slab->lock);
		c = this_cpu_ptr(s->cpu_slab);
		if (unlikely(slab != c->slab)) {
			local_unlock(&s->cpu_slab->lock);
			goto redo;
		}
		tid = c->tid;
		freelist = c->freelist;

		set_freepointer(s, tail_obj, freelist);
		c->freelist = head;
		c->tid = next_tid(tid);

		local_unlock(&s->cpu_slab->lock);
	}
	stat(s, FREE_FASTPATH);
}
#else /* CONFIG_SLUB_TINY */
static void do_slab_free(struct kmem_cache *s,
				struct slab *slab, void *head, void *tail,
				int cnt, unsigned long addr)
{
	void *tail_obj = tail ? : head;

	__slab_free(s, slab, head, tail_obj, cnt, addr);
}
#endif /* CONFIG_SLUB_TINY */

static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
				      void *head, void *tail, void **p, int cnt,
				      unsigned long addr)
{
	memcg_slab_free_hook(s, slab, p, cnt);
	/*
	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
	 * to remove objects, whose reuse must be delayed.
	 */
	if (slab_free_freelist_hook(s, &head, &tail, &cnt))
		do_slab_free(s, slab, head, tail, cnt, addr);
}

#ifdef CONFIG_KASAN_GENERIC
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
	do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr);
}
#endif

void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller)
{
	slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller);
}

void kmem_cache_free(struct kmem_cache *s, void *x)
{
	s = cache_from_obj(s, x);
	if (!s)
		return;
	trace_kmem_cache_free(_RET_IP_, x, s);
	slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
struct detached_freelist {
	struct slab *slab;
	void *tail;
	void *freelist;
	int cnt;
	struct kmem_cache *s;
};

/*
 * This function progressively scans the array with free objects (with
 * a limited look ahead) and extract objects belonging to the same
 * slab. It builds a detached freelist directly within the given
 * slab/objects. This can happen without any need for
 * synchronization, because the objects are owned by running process.
 * The freelist is build up as a single linked list in the objects.
 * The idea is, that this detached freelist can then be bulk
 * transferred to the real freelist(s), but only requiring a single
 * synchronization primitive. Look ahead in the array is limited due
 * to performance reasons.
 */
static inline
int build_detached_freelist(struct kmem_cache *s, size_t size,
			    void **p, struct detached_freelist *df)
{
	int lookahead = 3;
	void *object;
	struct folio *folio;
	size_t same;

	object = p[--size];
	folio = virt_to_folio(object);
	if (!s) {
		/* Handle kalloc'ed objects */
		if (unlikely(!folio_test_slab(folio))) {
			free_large_kmalloc(folio, object);
			df->slab = NULL;
			return size;
		}
		/* Derive kmem_cache from object */
		df->slab = folio_slab(folio);
		df->s = df->slab->slab_cache;
	} else {
		df->slab = folio_slab(folio);
		df->s = cache_from_obj(s, object); /* Support for memcg */
	}

	/* Start new detached freelist */
	df->tail = object;
	df->freelist = object;
	df->cnt = 1;

	if (is_kfence_address(object))
		return size;

	set_freepointer(df->s, object, NULL);

	same = size;
	while (size) {
		object = p[--size];
		/* df->slab is always set at this point */
		if (df->slab == virt_to_slab(object)) {
			/* Opportunity build freelist */
			set_freepointer(df->s, object, df->freelist);
			df->freelist = object;
			df->cnt++;
			same--;
			if (size != same)
				swap(p[size], p[same]);
			continue;
		}

		/* Limit look ahead search */
		if (!--lookahead)
			break;
	}

	return same;
}

/* Note that interrupts must be enabled when calling this function. */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
	if (!size)
		return;

	do {
		struct detached_freelist df;

		size = build_detached_freelist(s, size, p, &df);
		if (!df.slab)
			continue;

		slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt,
			  _RET_IP_);
	} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
#ifndef CONFIG_SLUB_TINY
static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
			size_t size, void **p, struct obj_cgroup *objcg)
{
	struct kmem_cache_cpu *c;
	unsigned long irqflags;
	int i;

	/*
	 * Drain objects in the per cpu slab, while disabling local
	 * IRQs, which protects against PREEMPT and interrupts
	 * handlers invoking normal fastpath.
	 */
	c = slub_get_cpu_ptr(s->cpu_slab);
	local_lock_irqsave(&s->cpu_slab->lock, irqflags);

	for (i = 0; i < size; i++) {
		void *object = kfence_alloc(s, s->object_size, flags);

		if (unlikely(object)) {
			p[i] = object;
			continue;
		}

		object = c->freelist;
		if (unlikely(!object)) {
			/*
			 * We may have removed an object from c->freelist using
			 * the fastpath in the previous iteration; in that case,
			 * c->tid has not been bumped yet.
			 * Since ___slab_alloc() may reenable interrupts while
			 * allocating memory, we should bump c->tid now.
			 */
			c->tid = next_tid(c->tid);

			local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);

			/*
			 * Invoking slow path likely have side-effect
			 * of re-populating per CPU c->freelist
			 */
			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
					    _RET_IP_, c, s->object_size);
			if (unlikely(!p[i]))
				goto error;

			c = this_cpu_ptr(s->cpu_slab);
			maybe_wipe_obj_freeptr(s, p[i]);

			local_lock_irqsave(&s->cpu_slab->lock, irqflags);

			continue; /* goto for-loop */
		}
		c->freelist = get_freepointer(s, object);
		p[i] = object;
		maybe_wipe_obj_freeptr(s, p[i]);
	}
	c->tid = next_tid(c->tid);
	local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
	slub_put_cpu_ptr(s->cpu_slab);

	return i;

error:
	slub_put_cpu_ptr(s->cpu_slab);
	slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
	kmem_cache_free_bulk(s, i, p);
	return 0;
}
#else /* CONFIG_SLUB_TINY */
static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
			size_t size, void **p, struct obj_cgroup *objcg)
{
	int i;

	for (i = 0; i < size; i++) {
		void *object = kfence_alloc(s, s->object_size, flags);

		if (unlikely(object)) {
			p[i] = object;
			continue;
		}

		p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
					 _RET_IP_, s->object_size);
		if (unlikely(!p[i]))
			goto error;

		maybe_wipe_obj_freeptr(s, p[i]);
	}

	return i;

error:
	slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
	kmem_cache_free_bulk(s, i, p);
	return 0;
}
#endif /* CONFIG_SLUB_TINY */

/* Note that interrupts must be enabled when calling this function. */
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
			  void **p)
{
	int i;
	struct obj_cgroup *objcg = NULL;

	if (!size)
		return 0;

	/* memcg and kmem_cache debug support */
	s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
	if (unlikely(!s))
		return 0;

	i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg);

	/*
	 * memcg and kmem_cache debug support and memory initialization.
	 * Done outside of the IRQ disabled fastpath loop.
	 */
	if (i != 0)
		slab_post_alloc_hook(s, objcg, flags, size, p,
			slab_want_init_on_alloc(flags, s), s->object_size);
	return i;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
/*
 * Object placement in a slab is made very easy because we always start at
 * offset 0. If we tune the size of the object to the alignment then we can
 * get the required alignment by putting one properly sized object after
 * another.
 *
 * Notice that the allocation order determines the sizes of the per cpu
 * caches. Each processor has always one slab available for allocations.
 * Increasing the allocation order reduces the number of times that slabs
 * must be moved on and off the partial lists and is therefore a factor in
 * locking overhead.
 */

/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
static unsigned int slub_min_order;
static unsigned int slub_max_order =
	IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
static unsigned int slub_min_objects;

/*
 * Calculate the order of allocation given an slab object size.
 *
 * The order of allocation has significant impact on performance and other
 * system components. Generally order 0 allocations should be preferred since
 * order 0 does not cause fragmentation in the page allocator. Larger objects
 * be problematic to put into order 0 slabs because there may be too much
 * unused space left. We go to a higher order if more than 1/16th of the slab
 * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
 * number of objects is in one slab. Otherwise we may generate too much
 * activity on the partial lists which requires taking the list_lock. This is
 * less a concern for large slabs though which are rarely used.
 *
 * slub_max_order specifies the order where we begin to stop considering the
 * number of objects in a slab as critical. If we reach slub_max_order then
 * we try to keep the page order as low as possible. So we accept more waste
 * of space in favor of a small page order.
 *
 * Higher order allocations also allow the placement of more objects in a
 * slab and thereby reduce object handling overhead. If the user has
 * requested a higher minimum order then we start with that one instead of
 * the smallest order which will fit the object.
 */
static inline unsigned int calc_slab_order(unsigned int size,
		unsigned int min_objects, unsigned int max_order,
		unsigned int fract_leftover)
{
	unsigned int min_order = slub_min_order;
	unsigned int order;

	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
		return get_order(size * MAX_OBJS_PER_PAGE) - 1;

	for (order = max(min_order, (unsigned int)get_order(min_objects * size));
			order <= max_order; order++) {

		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
		unsigned int rem;

		rem = slab_size % size;

		if (rem <= slab_size / fract_leftover)
			break;
	}

	return order;
}

static inline int calculate_order(unsigned int size)
{
	unsigned int order;
	unsigned int min_objects;
	unsigned int max_objects;
	unsigned int nr_cpus;

	/*
	 * Attempt to find best configuration for a slab. This
	 * works by first attempting to generate a layout with
	 * the best configuration and backing off gradually.
	 *
	 * First we increase the acceptable waste in a slab. Then
	 * we reduce the minimum objects required in a slab.
	 */
	min_objects = slub_min_objects;
	if (!min_objects) {
		/*
		 * Some architectures will only update present cpus when
		 * onlining them, so don't trust the number if it's just 1. But
		 * we also don't want to use nr_cpu_ids always, as on some other
		 * architectures, there can be many possible cpus, but never
		 * onlined. Here we compromise between trying to avoid too high
		 * order on systems that appear larger than they are, and too
		 * low order on systems that appear smaller than they are.
		 */
		nr_cpus = num_present_cpus();
		if (nr_cpus <= 1)
			nr_cpus = nr_cpu_ids;
		min_objects = 4 * (fls(nr_cpus) + 1);
	}
	max_objects = order_objects(slub_max_order, size);
	min_objects = min(min_objects, max_objects);

	while (min_objects > 1) {
		unsigned int fraction;

		fraction = 16;
		while (fraction >= 4) {
			order = calc_slab_order(size, min_objects,
					slub_max_order, fraction);
			if (order <= slub_max_order)
				return order;
			fraction /= 2;
		}
		min_objects--;
	}

	/*
	 * We were unable to place multiple objects in a slab. Now
	 * lets see if we can place a single object there.
	 */
	order = calc_slab_order(size, 1, slub_max_order, 1);
	if (order <= slub_max_order)
		return order;

	/*
	 * Doh this slab cannot be placed using slub_max_order.
	 */
	order = calc_slab_order(size, 1, MAX_ORDER, 1);
	if (order <= MAX_ORDER)
		return order;
	return -ENOSYS;
}
static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
	n->nr_partial = 0;
	spin_lock_init(&n->list_lock);
	INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_set(&n->nr_slabs, 0);
	atomic_long_set(&n->total_objects, 0);
	INIT_LIST_HEAD(&n->full);
#endif
}

#ifndef CONFIG_SLUB_TINY
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
			NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
			sizeof(struct kmem_cache_cpu));

	/*
	 * Must align to double word boundary for the double cmpxchg
	 * instructions to work; see __pcpu_double_call_return_bool().
	 */
	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
				     2 * sizeof(void *));

	if (!s->cpu_slab)
		return 0;

	init_kmem_cache_cpus(s);

	return 1;
}
#else
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
	return 1;
}
#endif /* CONFIG_SLUB_TINY */

static struct kmem_cache *kmem_cache_node;

/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmem_cache_node
 * when allocating for the kmem_cache_node. This is used for bootstrapping
 * memory on a fresh node that has no slab structures yet.
 */
static void early_kmem_cache_node_alloc(int node)
{
	struct slab *slab;
	struct kmem_cache_node *n;

	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));

	slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);

	BUG_ON(!slab);
	inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects);
	if (slab_nid(slab) != node) {
		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
	}

	n = slab->freelist;
	BUG_ON(!n);
#ifdef CONFIG_SLUB_DEBUG
	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
	init_tracking(kmem_cache_node, n);
#endif
	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
	slab->freelist = get_freepointer(kmem_cache_node, n);
	slab->inuse = 1;
	kmem_cache_node->node[node] = n;
	init_kmem_cache_node(n);
	inc_slabs_node(kmem_cache_node, node, slab->objects);

	/*
	 * No locks need to be taken here as it has just been
	 * initialized and there is no concurrent access.
	 */
	__add_partial(n, slab, DEACTIVATE_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n) {
		s->node[node] = NULL;
		kmem_cache_free(kmem_cache_node, n);
	}
}

void __kmem_cache_release(struct kmem_cache *s)
{
	cache_random_seq_destroy(s);
#ifndef CONFIG_SLUB_TINY
	free_percpu(s->cpu_slab);
#endif
	free_kmem_cache_nodes(s);
}

static int init_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;

	for_each_node_mask(node, slab_nodes) {
		struct kmem_cache_node *n;

		if (slab_state == DOWN) {
			early_kmem_cache_node_alloc(node);
			continue;
		}
		n = kmem_cache_alloc_node(kmem_cache_node,
						GFP_KERNEL, node);
		if (!n) {
			free_kmem_cache_nodes(s);
			return 0;
		}

		init_kmem_cache_node(n);
		s->node[node] = n;
	}
	return 1;
}

static void set_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	unsigned int nr_objects;

	/*
	 * cpu_partial determined the maximum number of objects kept in the
	 * per cpu partial lists of a processor.
	 *
	 * Per cpu partial lists mainly contain slabs that just have one
	 * object freed. If they are used for allocation then they can be
	 * filled up again with minimal effort. The slab will never hit the
	 * per node partial lists and therefore no locking will be required.
	 *
	 * For backwards compatibility reasons, this is determined as number
	 * of objects, even though we now limit maximum number of pages, see
	 * slub_set_cpu_partial()
	 */
	if (!kmem_cache_has_cpu_partial(s))
		nr_objects = 0;
	else if (s->size >= PAGE_SIZE)
		nr_objects = 6;
	else if (s->size >= 1024)
		nr_objects = 24;
	else if (s->size >= 256)
		nr_objects = 52;
	else
		nr_objects = 120;

	slub_set_cpu_partial(s, nr_objects);
#endif
}
/*
 * calculate_sizes() determines the order and the distribution of data within
 * a slab object.
 */
static int calculate_sizes(struct kmem_cache *s)
{
	slab_flags_t flags = s->flags;
	unsigned int size = s->object_size;
	unsigned int order;

	/*
	 * Round up object size to the next word boundary. We can only
	 * place the free pointer at word boundaries and this determines
	 * the possible location of the free pointer.
	 */
	size = ALIGN(size, sizeof(void *));

#ifdef CONFIG_SLUB_DEBUG
	/*
	 * Determine if we can poison the object itself. If the user of
	 * the slab may touch the object after free or before allocation
	 * then we should never poison the object itself.
	 */
	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
			!s->ctor)
		s->flags |= __OBJECT_POISON;
	else
		s->flags &= ~__OBJECT_POISON;


	/*
	 * If we are Redzoning then check if there is some space between the
	 * end of the object and the free pointer. If not then add an
	 * additional word to have some bytes to store Redzone information.
	 */
	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
		size += sizeof(void *);
#endif

	/*
	 * With that we have determined the number of bytes in actual use
	 * by the object and redzoning.
	 */
	s->inuse = size;

	if (slub_debug_orig_size(s) ||
	    (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
	    ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
	    s->ctor) {
		/*
		 * Relocate free pointer after the object if it is not
		 * permitted to overwrite the first word of the object on
		 * kmem_cache_free.
		 *
		 * This is the case if we do RCU, have a constructor or
		 * destructor, are poisoning the objects, or are
		 * redzoning an object smaller than sizeof(void *).
		 *
		 * The assumption that s->offset >= s->inuse means free
		 * pointer is outside of the object is used in the
		 * freeptr_outside_object() function. If that is no
		 * longer true, the function needs to be modified.
		 */
		s->offset = size;
		size += sizeof(void *);
	} else {
		/*
		 * Store freelist pointer near middle of object to keep
		 * it away from the edges of the object to avoid small
		 * sized over/underflows from neighboring allocations.
		 */
		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
	}

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_STORE_USER) {
		/*
		 * Need to store information about allocs and frees after
		 * the object.
		 */
		size += 2 * sizeof(struct track);

		/* Save the original kmalloc request size */
		if (flags & SLAB_KMALLOC)
			size += sizeof(unsigned int);
	}
#endif

	kasan_cache_create(s, &size, &s->flags);
#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_RED_ZONE) {
		/*
		 * Add some empty padding so that we can catch
		 * overwrites from earlier objects rather than let
		 * tracking information or the free pointer be
		 * corrupted if a user writes before the start
		 * of the object.
		 */
		size += sizeof(void *);

		s->red_left_pad = sizeof(void *);
		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
		size += s->red_left_pad;
	}
#endif

	/*
	 * SLUB stores one object immediately after another beginning from
	 * offset 0. In order to align the objects we have to simply size
	 * each object to conform to the alignment.
	 */
	size = ALIGN(size, s->align);
	s->size = size;
	s->reciprocal_size = reciprocal_value(size);
	order = calculate_order(size);

	if ((int)order < 0)
		return 0;

	s->allocflags = 0;
	if (order)
		s->allocflags |= __GFP_COMP;

	if (s->flags & SLAB_CACHE_DMA)
		s->allocflags |= GFP_DMA;

	if (s->flags & SLAB_CACHE_DMA32)
		s->allocflags |= GFP_DMA32;

	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		s->allocflags |= __GFP_RECLAIMABLE;

	/*
	 * Determine the number of objects per slab
	 */
	s->oo = oo_make(order, size);
	s->min = oo_make(get_order(size), size);

	return !!oo_objects(s->oo);
}
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
	s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	s->random = get_random_long();
#endif

	if (!calculate_sizes(s))
		goto error;
	if (disable_higher_order_debug) {
		/*
		 * Disable debugging flags that store metadata if the min slab
		 * order increased.
		 */
		if (get_order(s->size) > get_order(s->object_size)) {
			s->flags &= ~DEBUG_METADATA_FLAGS;
			s->offset = 0;
			if (!calculate_sizes(s))
				goto error;
		}
	}

#ifdef system_has_freelist_aba
	if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
		/* Enable fast mode */
		s->flags |= __CMPXCHG_DOUBLE;
	}
#endif

	/*
	 * The larger the object size is, the more slabs we want on the partial
	 * list to avoid pounding the page allocator excessively.
	 */
	s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
	s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
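	/*
	 * Illustrative numbers (editorial, not part of the original source):
	 * for a 4096-byte cache, ilog2(4096) / 2 == 6, so min_partial becomes
	 * 6; for a 32-byte cache, ilog2(32) / 2 == 2, which the max_t() clamp
	 * above raises back up to MIN_PARTIAL.
	 */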
#ifdef CONFIG_NUMA
	s->remote_node_defrag_ratio = 1000;
#endif

	/* Initialize the pre-computed randomized freelist if slab is up */
	if (slab_state >= UP) {
		if (init_cache_random_seq(s))
			goto error;
	}

	if (!init_kmem_cache_nodes(s))
		goto error;

	if (alloc_kmem_cache_cpus(s))
		return 0;

error:
	__kmem_cache_release(s);
	return -EINVAL;
}
static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
			      const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
	void *addr = slab_address(slab);
	void *p;

	slab_err(s, slab, text, s->name);

	spin_lock(&object_map_lock);
	__fill_map(object_map, s, slab);

	for_each_object(p, s, addr, slab->objects) {
		if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
			pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
			print_tracking(s, p);
		}
	}
	spin_unlock(&object_map_lock);
#endif
}
/*
 * Attempt to free all partial slabs on a node.
 * This is called from __kmem_cache_shutdown(). We must take list_lock
 * because sysfs files might still access the partial list after shutdown
 * has started.
 */
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
	LIST_HEAD(discard);
	struct slab *slab, *h;

	BUG_ON(irqs_disabled());
	spin_lock_irq(&n->list_lock);
	list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
		if (!slab->inuse) {
			remove_partial(n, slab);
			list_add(&slab->slab_list, &discard);
		} else {
			list_slab_objects(s, slab,
			  "Objects remaining in %s on __kmem_cache_shutdown()");
		}
	}
	spin_unlock_irq(&n->list_lock);

	list_for_each_entry_safe(slab, h, &discard, slab_list)
		discard_slab(s, slab);
}
bool __kmem_cache_empty(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n)
		if (n->nr_partial || node_nr_slabs(n))
			return false;
	return true;
}

/*
 * Release all resources used by a slab cache.
 */
int __kmem_cache_shutdown(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	flush_all_cpus_locked(s);
	/* Attempt to free all objects */
	for_each_kmem_cache_node(s, node, n) {
		free_partial(s, n);
		if (n->nr_partial || node_nr_slabs(n))
			return 1;
	}
	return 0;
}
#ifdef CONFIG_PRINTK
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
	void *base;
	int __maybe_unused i;
	unsigned int objnr;
	void *objp;
	void *objp0;
	struct kmem_cache *s = slab->slab_cache;
	struct track __maybe_unused *trackp;

	kpp->kp_ptr = object;
	kpp->kp_slab = slab;
	kpp->kp_slab_cache = s;
	base = slab_address(slab);
	objp0 = kasan_reset_tag(object);
#ifdef CONFIG_SLUB_DEBUG
	objp = restore_red_left(s, objp0);
#else
	objp = objp0;
#endif
	objnr = obj_to_index(s, slab, objp);
	kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
	objp = base + s->size * objnr;
	kpp->kp_objp = objp;
	if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
			 || (objp - base) % s->size) ||
	    !(s->flags & SLAB_STORE_USER))
		return;
#ifdef CONFIG_SLUB_DEBUG
	objp = fixup_red_left(s, objp);
	trackp = get_track(s, objp, TRACK_ALLOC);
	kpp->kp_ret = (void *)trackp->addr;
#ifdef CONFIG_STACKDEPOT
	{
		depot_stack_handle_t handle;
		unsigned long *entries;
		unsigned int nr_entries;

		handle = READ_ONCE(trackp->handle);
		if (handle) {
			nr_entries = stack_depot_fetch(handle, &entries);
			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
				kpp->kp_stack[i] = (void *)entries[i];
		}

		trackp = get_track(s, objp, TRACK_FREE);
		handle = READ_ONCE(trackp->handle);
		if (handle) {
			nr_entries = stack_depot_fetch(handle, &entries);
			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
				kpp->kp_free_stack[i] = (void *)entries[i];
		}
	}
#endif
#endif
}
#endif
/********************************************************************
 *			Kmalloc subsystem
 *******************************************************************/

static int __init setup_slub_min_order(char *str)
{
	get_option(&str, (int *)&slub_min_order);

	return 1;
}

__setup("slub_min_order=", setup_slub_min_order);

static int __init setup_slub_max_order(char *str)
{
	get_option(&str, (int *)&slub_max_order);
	slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);

	return 1;
}

__setup("slub_max_order=", setup_slub_max_order);

static int __init setup_slub_min_objects(char *str)
{
	get_option(&str, (int *)&slub_min_objects);

	return 1;
}

__setup("slub_min_objects=", setup_slub_min_objects);
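/*
 * Usage example (editorial, not part of the original source): booting with
 * "slub_min_order=3 slub_max_order=5 slub_min_objects=16" on the kernel
 * command line constrains the page order SLUB may choose for its slabs and
 * the minimum number of objects packed into each slab.
 */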
#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Rejects incorrectly sized objects and objects that are to be copied
 * to/from userspace but do not fall entirely within the containing slab
 * cache's usercopy region.
 *
 * On failure the copy is aborted via usercopy_abort(), which reports the
 * offending cache name and offset.
 */
void __check_heap_object(const void *ptr, unsigned long n,
			 const struct slab *slab, bool to_user)
{
	struct kmem_cache *s;
	unsigned int offset;
	bool is_kfence = is_kfence_address(ptr);

	ptr = kasan_reset_tag(ptr);

	/* Find object and usable object size. */
	s = slab->slab_cache;

	/* Reject impossible pointers. */
	if (ptr < slab_address(slab))
		usercopy_abort("SLUB object not in SLUB page?!", NULL,
			       to_user, 0, n);

	/* Find offset within object. */
	if (is_kfence)
		offset = ptr - kfence_object_start(ptr);
	else
		offset = (ptr - slab_address(slab)) % s->size;

	/* Adjust for redzone and reject if within the redzone. */
	if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
		if (offset < s->red_left_pad)
			usercopy_abort("SLUB object in left red zone",
				       s->name, to_user, offset, n);
		offset -= s->red_left_pad;
	}

	/* Allow address range falling entirely within usercopy region. */
	if (offset >= s->useroffset &&
	    offset - s->useroffset <= s->usersize &&
	    n <= s->useroffset - offset + s->usersize)
		return;

	usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
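/*
 * Worked example (editorial, not part of the original source): with
 * useroffset == 16 and usersize == 32, a copy of length 20 at offset 24
 * passes all three checks above (24 >= 16, 24 - 16 <= 32 and
 * 20 <= 16 - 24 + 32 == 24), while a copy of length 25 at the same offset
 * is rejected via usercopy_abort().
 */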
#define SHRINK_PROMOTE_MAX 32

/*
 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
 * up most to the head of the partial lists. New allocations will then
 * fill those up and thus they can be removed from the partial lists.
 *
 * The slabs with the least items are placed last. This results in them
 * being allocated from last, increasing the chance that the last objects
 * are freed in them.
 */
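/*
 * Worked example (editorial, not part of the original source): three partial
 * slabs with 1, 7 and 30 free objects land in promote[0], promote[6] and
 * promote[29] respectively; splicing the buckets back from index
 * SHRINK_PROMOTE_MAX - 1 down to 0 leaves the fullest slab (1 free object)
 * at the head of n->partial.
 */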
static int __kmem_cache_do_shrink(struct kmem_cache *s)
{
	int node;
	int i;
	struct kmem_cache_node *n;
	struct slab *slab;
	struct slab *t;
	struct list_head discard;
	struct list_head promote[SHRINK_PROMOTE_MAX];
	unsigned long flags;
	int ret = 0;

	for_each_kmem_cache_node(s, node, n) {
		INIT_LIST_HEAD(&discard);
		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
			INIT_LIST_HEAD(promote + i);

		spin_lock_irqsave(&n->list_lock, flags);

		/*
		 * Build lists of slabs to discard or promote.
		 *
		 * Note that concurrent frees may occur while we hold the
		 * list_lock. slab->inuse here is the upper limit.
		 */
		list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
			int free = slab->objects - slab->inuse;

			/* Do not reread slab->inuse */
			barrier();

			/* We do not keep full slabs on the list */
			BUG_ON(free <= 0);

			if (free == slab->objects) {
				list_move(&slab->slab_list, &discard);
				n->nr_partial--;
				dec_slabs_node(s, node, slab->objects);
			} else if (free <= SHRINK_PROMOTE_MAX)
				list_move(&slab->slab_list, promote + free - 1);
		}

		/*
		 * Promote the slabs filled up most to the head of the
		 * partial list.
		 */
		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
			list_splice(promote + i, &n->partial);

		spin_unlock_irqrestore(&n->list_lock, flags);

		/* Release empty slabs */
		list_for_each_entry_safe(slab, t, &discard, slab_list)
			free_slab(s, slab);

		if (node_nr_slabs(n))
			ret = 1;
	}

	return ret;
}

int __kmem_cache_shrink(struct kmem_cache *s)
{
	flush_all(s);
	return __kmem_cache_do_shrink(s);
}
static int slab_mem_going_offline_callback(void *arg)
{
	struct kmem_cache *s;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list) {
		flush_all_cpus_locked(s);
		__kmem_cache_do_shrink(s);
	}
	mutex_unlock(&slab_mutex);

	return 0;
}

static void slab_mem_offline_callback(void *arg)
{
	struct memory_notify *marg = arg;
	int offline_node;

	offline_node = marg->status_change_nid_normal;

	/*
	 * If the node still has available memory, we still need its
	 * kmem_cache_node; there is nothing to do.
	 */
	if (offline_node < 0)
		return;

	mutex_lock(&slab_mutex);
	node_clear(offline_node, slab_nodes);
	/*
	 * We no longer free kmem_cache_node structures here, as it would be
	 * racy with all get_node() users, and infeasible to protect them with
	 * slab_mutex.
	 */
	mutex_unlock(&slab_mutex);
}

static int slab_mem_going_online_callback(void *arg)
{
	struct kmem_cache_node *n;
	struct kmem_cache *s;
	struct memory_notify *marg = arg;
	int nid = marg->status_change_nid_normal;
	int ret = 0;

	/*
	 * If the node's memory is already available, then kmem_cache_node is
	 * already created. Nothing to do.
	 */
	if (nid < 0)
		return 0;

	/*
	 * We are bringing a node online. No memory is available yet. We must
	 * allocate a kmem_cache_node structure in order to bring the node
	 * online.
	 */
	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list) {
		/*
		 * The structure may already exist if the node was previously
		 * onlined and offlined.
		 */
		if (get_node(s, nid))
			continue;
		/*
		 * XXX: kmem_cache_alloc_node will fallback to other nodes
		 *      since memory is not yet available from the node that
		 *      is brought up.
		 */
		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
		if (!n) {
			ret = -ENOMEM;
			goto out;
		}
		init_kmem_cache_node(n);
		s->node[nid] = n;
	}
	/*
	 * Any cache created after this point will also have kmem_cache_node
	 * initialized for the new node.
	 */
	node_set(nid, slab_nodes);
out:
	mutex_unlock(&slab_mutex);
	return ret;
}

static int slab_memory_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = slab_mem_going_online_callback(arg);
		break;
	case MEM_GOING_OFFLINE:
		ret = slab_mem_going_offline_callback(arg);
		break;
	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		slab_mem_offline_callback(arg);
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	ret = notifier_from_errno(ret);
	return ret;
}
/********************************************************************
 *			Basic setup of slabs
 *******************************************************************/

/*
 * Used for early kmem_cache structures that were allocated using
 * the page allocator. Allocate them properly then fix up the pointers
 * that may be pointing to the wrong kmem_cache structure.
 */
static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
	int node;
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	struct kmem_cache_node *n;

	memcpy(s, static_cache, kmem_cache->object_size);

	/*
	 * This runs very early, and only the boot processor is supposed to be
	 * up. Even if it weren't true, IRQs are not up so we couldn't fire
	 * IPIs around anyway.
	 */
	__flush_cpu_slab(s, smp_processor_id());
	for_each_kmem_cache_node(s, node, n) {
		struct slab *p;

		list_for_each_entry(p, &n->partial, slab_list)
			p->slab_cache = s;

#ifdef CONFIG_SLUB_DEBUG
		list_for_each_entry(p, &n->full, slab_list)
			p->slab_cache = s;
#endif
	}
	list_add(&s->list, &slab_caches);
	return s;
}

void __init kmem_cache_init(void)
{
	static __initdata struct kmem_cache boot_kmem_cache,
		boot_kmem_cache_node;
	int node;

	if (debug_guardpage_minorder())
		slub_max_order = 0;

	/* Print slub debugging pointers without hashing */
	if (__slub_debug_enabled())
		no_hash_pointers_enable(NULL);

	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;

	/*
	 * Initialize the nodemask for which we will allocate per node
	 * structures. Here we don't need taking slab_mutex yet.
	 */
	for_each_node_state(node, N_NORMAL_MEMORY)
		node_set(node, slab_nodes);

	create_boot_cache(kmem_cache_node, "kmem_cache_node",
		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);

	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);

	/* Able to allocate the per node structures */
	slab_state = PARTIAL;

	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
			SLAB_HWCACHE_ALIGN, 0, 0);

	kmem_cache = bootstrap(&boot_kmem_cache);
	kmem_cache_node = bootstrap(&boot_kmem_cache_node);

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
	setup_kmalloc_cache_index_table();
	create_kmalloc_caches(0);

	/* Setup random freelists for each cache */
	init_freelist_randomization();

	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				  slub_cpu_dead);

	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
		cache_line_size(),
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}

void __init kmem_cache_init_late(void)
{
#ifndef CONFIG_SLUB_TINY
	flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
	WARN_ON(!flushwq);
#endif
}
struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
		   slab_flags_t flags, void (*ctor)(void *))
{
	struct kmem_cache *s;

	s = find_mergeable(size, align, flags, name, ctor);
	if (s) {
		if (sysfs_slab_alias(s, name))
			return NULL;

		s->refcount++;

		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
		s->object_size = max(s->object_size, size);
		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
	}

	return s;
}

int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
{
	int err;

	err = kmem_cache_open(s, flags);
	if (err)
		return err;

	/* Mutex is not taken during early boot */
	if (slab_state <= UP)
		return 0;

	err = sysfs_slab_add(s);
	if (err) {
		__kmem_cache_release(s);
		return err;
	}

	if (s->flags & SLAB_STORE_USER)
		debugfs_slab_add(s);

	return 0;
}
#ifdef SLAB_SUPPORTS_SYSFS
static int count_inuse(struct slab *slab)
{
	return slab->inuse;
}

static int count_total(struct slab *slab)
{
	return slab->objects;
}
#endif

#ifdef CONFIG_SLUB_DEBUG
static void validate_slab(struct kmem_cache *s, struct slab *slab,
			  unsigned long *obj_map)
{
	void *p;
	void *addr = slab_address(slab);

	if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
		return;

	/* Now we know that a valid freelist exists */
	__fill_map(obj_map, s, slab);
	for_each_object(p, s, addr, slab->objects) {
		u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;

		if (!check_object(s, slab, p, val))
			break;
	}
}
static int validate_slab_node(struct kmem_cache *s,
		struct kmem_cache_node *n, unsigned long *obj_map)
{
	unsigned long count = 0;
	struct slab *slab;
	unsigned long flags;

	spin_lock_irqsave(&n->list_lock, flags);

	list_for_each_entry(slab, &n->partial, slab_list) {
		validate_slab(s, slab, obj_map);
		count++;
	}
	if (count != n->nr_partial) {
		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
		       s->name, count, n->nr_partial);
		slab_add_kunit_errors();
	}

	if (!(s->flags & SLAB_STORE_USER))
		goto out;

	list_for_each_entry(slab, &n->full, slab_list) {
		validate_slab(s, slab, obj_map);
		count++;
	}
	if (count != node_nr_slabs(n)) {
		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
		       s->name, count, node_nr_slabs(n));
		slab_add_kunit_errors();
	}

out:
	spin_unlock_irqrestore(&n->list_lock, flags);
	return count;
}

long validate_slab_cache(struct kmem_cache *s)
{
	int node;
	unsigned long count = 0;
	struct kmem_cache_node *n;
	unsigned long *obj_map;

	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
	if (!obj_map)
		return -ENOMEM;

	flush_all(s);
	for_each_kmem_cache_node(s, node, n)
		count += validate_slab_node(s, n, obj_map);

	bitmap_free(obj_map);

	return count;
}
EXPORT_SYMBOL(validate_slab_cache);
#ifdef CONFIG_DEBUG_FS
/*
 * Generate lists of code addresses where slabcache objects are allocated
 * and freed.
 */

struct location {
	depot_stack_handle_t handle;
	unsigned long count;
	unsigned long addr;
	unsigned long waste;
	long long sum_time;
	long min_time;
	long max_time;
	long min_pid;
	long max_pid;
	DECLARE_BITMAP(cpus, NR_CPUS);
	nodemask_t nodes;
};

struct loc_track {
	unsigned long max;
	unsigned long count;
	struct location *loc;
	loff_t idx;
};

static struct dentry *slab_debugfs_root;

static void free_loc_track(struct loc_track *t)
{
	if (t->max)
		free_pages((unsigned long)t->loc,
			get_order(sizeof(struct location) * t->max));
}

static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
{
	struct location *l;
	int order;

	order = get_order(sizeof(struct location) * max);

	l = (void *)__get_free_pages(flags, order);
	if (!l)
		return 0;

	if (t->count) {
		memcpy(l, t->loc, sizeof(struct location) * t->count);
		free_loc_track(t);
	}
	t->max = max;
	t->loc = l;
	return 1;
}
static int add_location(struct loc_track *t, struct kmem_cache *s,
			const struct track *track,
			unsigned int orig_size)
{
	long start, end, pos;
	struct location *l;
	unsigned long caddr, chandle, cwaste;
	unsigned long age = jiffies - track->when;
	depot_stack_handle_t handle = 0;
	unsigned int waste = s->object_size - orig_size;

#ifdef CONFIG_STACKDEPOT
	handle = READ_ONCE(track->handle);
#endif
	start = -1;
	end = t->count;

	for ( ; ; ) {
		pos = start + (end - start + 1) / 2;

		/*
		 * There is nothing at "end". If we end up there
		 * we need to add something to before end.
		 */
		if (pos == end)
			break;

		l = &t->loc[pos];
		caddr = l->addr;
		chandle = l->handle;
		cwaste = l->waste;
		if ((track->addr == caddr) && (handle == chandle) &&
		    (waste == cwaste)) {

			l->count++;
			if (track->when) {
				l->sum_time += age;
				if (age < l->min_time)
					l->min_time = age;
				if (age > l->max_time)
					l->max_time = age;

				if (track->pid < l->min_pid)
					l->min_pid = track->pid;
				if (track->pid > l->max_pid)
					l->max_pid = track->pid;

				cpumask_set_cpu(track->cpu,
						to_cpumask(l->cpus));
			}
			node_set(page_to_nid(virt_to_page(track)), l->nodes);
			return 1;
		}

		if (track->addr < caddr)
			end = pos;
		else if (track->addr == caddr && handle < chandle)
			end = pos;
		else if (track->addr == caddr && handle == chandle &&
			 waste < cwaste)
			end = pos;
		else
			start = pos;
	}

	/*
	 * Not found. Insert new tracking element.
	 */
	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
		return 0;

	l = t->loc + pos;
	if (pos < t->count)
		memmove(l + 1, l,
			(t->count - pos) * sizeof(struct location));
	t->count++;
	l->count = 1;
	l->addr = track->addr;
	l->sum_time = age;
	l->min_time = age;
	l->max_time = age;
	l->min_pid = track->pid;
	l->max_pid = track->pid;
	l->handle = handle;
	l->waste = waste;
	cpumask_clear(to_cpumask(l->cpus));
	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
	nodes_clear(l->nodes);
	node_set(page_to_nid(virt_to_page(track)), l->nodes);
	return 1;
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
		struct slab *slab, enum track_item alloc,
		unsigned long *obj_map)
{
	void *addr = slab_address(slab);
	bool is_alloc = (alloc == TRACK_ALLOC);
	void *p;

	__fill_map(obj_map, s, slab);

	for_each_object(p, s, addr, slab->objects)
		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
			add_location(t, s, get_track(s, p, alloc),
				     is_alloc ? get_orig_size(s, p) :
						s->object_size);
}
#endif  /* CONFIG_DEBUG_FS   */
#endif	/* CONFIG_SLUB_DEBUG */
#ifdef SLAB_SUPPORTS_SYSFS
enum slab_stat_type {
	SL_ALL,			/* All slabs */
	SL_PARTIAL,		/* Only partially allocated slabs */
	SL_CPU,			/* Only slabs used for cpu caches */
	SL_OBJECTS,		/* Determine allocated objects not slabs */
	SL_TOTAL		/* Determine object capacity not slabs */
};

#define SO_ALL		(1 << SL_ALL)
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
#define SO_TOTAL	(1 << SL_TOTAL)
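/*
 * Example (editorial, not part of the original source): the sysfs "objects"
 * file below reads show_slab_objects(s, buf, SO_ALL | SO_OBJECTS), i.e.
 * allocated objects across all slabs, while "partial" passes SO_PARTIAL
 * alone and therefore counts partial slabs themselves.
 */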
static ssize_t show_slab_objects(struct kmem_cache *s,
				 char *buf, unsigned long flags)
{
	unsigned long total = 0;
	int node;
	int x;
	unsigned long *nodes;
	int len = 0;

	nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	if (flags & SO_CPU) {
		int cpu;

		for_each_possible_cpu(cpu) {
			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
							       cpu);
			int node;
			struct slab *slab;

			slab = READ_ONCE(c->slab);
			if (!slab)
				continue;

			node = slab_nid(slab);
			if (flags & SO_TOTAL)
				x = slab->objects;
			else if (flags & SO_OBJECTS)
				x = slab->inuse;
			else
				x = 1;

			total += x;
			nodes[node] += x;

#ifdef CONFIG_SLUB_CPU_PARTIAL
			slab = slub_percpu_partial_read_once(c);
			if (slab) {
				node = slab_nid(slab);
				if (flags & SO_TOTAL)
					WARN_ON_ONCE(1);
				else if (flags & SO_OBJECTS)
					WARN_ON_ONCE(1);
				else
					x = slab->slabs;
				total += x;
				nodes[node] += x;
			}
#endif
		}
	}

	/*
	 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
	 * already held which will conflict with an existing lock order:
	 *
	 * mem_hotplug_lock->slab_mutex->kernfs_mutex
	 *
	 * We don't really need mem_hotplug_lock (to hold off
	 * slab_mem_going_offline_callback) here because slab's memory hot
	 * unplug code doesn't destroy the kmem_cache->node[] data.
	 */

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SO_ALL) {
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {

			if (flags & SO_TOTAL)
				x = node_nr_objs(n);
			else if (flags & SO_OBJECTS)
				x = node_nr_objs(n) - count_partial(n, count_free);
			else
				x = node_nr_slabs(n);
			total += x;
			nodes[node] += x;
		}

	} else
#endif
	if (flags & SO_PARTIAL) {
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {
			if (flags & SO_TOTAL)
				x = count_partial(n, count_total);
			else if (flags & SO_OBJECTS)
				x = count_partial(n, count_inuse);
			else
				x = n->nr_partial;
			total += x;
			nodes[node] += x;
		}
	}

	len += sysfs_emit_at(buf, len, "%lu", total);
	for (node = 0; node < nr_node_ids; node++) {
		if (nodes[node])
			len += sysfs_emit_at(buf, len, " N%d=%lu",
					     node, nodes[node]);
	}
	len += sysfs_emit_at(buf, len, "\n");
	kfree(nodes);

	return len;
}
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)

struct slab_attribute {
	struct attribute attr;
	ssize_t (*show)(struct kmem_cache *s, char *buf);
	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};

#define SLAB_ATTR_RO(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)

#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->size);
}
SLAB_ATTR_RO(slab_size);

static ssize_t align_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->align);
}
SLAB_ATTR_RO(align);

static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->object_size);
}
SLAB_ATTR_RO(object_size);

static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);

static ssize_t order_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", oo_order(s->oo));
}
SLAB_ATTR_RO(order);

static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%lu\n", s->min_partial);
}

static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
				 size_t length)
{
	unsigned long min;
	int err;

	err = kstrtoul(buf, 10, &min);
	if (err)
		return err;

	s->min_partial = min;
	return length;
}
SLAB_ATTR(min_partial);

static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
	unsigned int nr_partial = 0;
#ifdef CONFIG_SLUB_CPU_PARTIAL
	nr_partial = s->cpu_partial;
#endif

	return sysfs_emit(buf, "%u\n", nr_partial);
}

static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
				 size_t length)
{
	unsigned int objects;
	int err;

	err = kstrtouint(buf, 10, &objects);
	if (err)
		return err;
	if (objects && !kmem_cache_has_cpu_partial(s))
		return -EINVAL;

	slub_set_cpu_partial(s, objects);
	flush_all(s);
	return length;
}
SLAB_ATTR(cpu_partial);

static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
	if (!s->ctor)
		return 0;
	return sysfs_emit(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);

static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
}
SLAB_ATTR_RO(aliases);

static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_PARTIAL);
}
SLAB_ATTR_RO(partial);

static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_CPU);
}
SLAB_ATTR_RO(cpu_slabs);

static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects_partial);
static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
{
	int objects = 0;
	int slabs = 0;
	int cpu __maybe_unused;
	int len = 0;

#ifdef CONFIG_SLUB_CPU_PARTIAL
	for_each_online_cpu(cpu) {
		struct slab *slab;

		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));

		if (slab)
			slabs += slab->slabs;
	}
#endif

	/* Approximate half-full slabs, see slub_set_cpu_partial() */
	objects = (slabs * oo_objects(s->oo)) / 2;
	len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);

#ifdef CONFIG_SLUB_CPU_PARTIAL
	for_each_online_cpu(cpu) {
		struct slab *slab;

		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
		if (slab) {
			slabs = READ_ONCE(slab->slabs);
			objects = (slabs * oo_objects(s->oo)) / 2;
			len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
					     cpu, objects, slabs);
		}
	}
#endif
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}
SLAB_ATTR_RO(slabs_cpu_partial);

static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
SLAB_ATTR_RO(reclaim_account);

static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
}
SLAB_ATTR_RO(hwcache_align);

#ifdef CONFIG_ZONE_DMA
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
}
SLAB_ATTR_RO(cache_dma);
#endif

#ifdef CONFIG_HARDENED_USERCOPY
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->usersize);
}
SLAB_ATTR_RO(usersize);
#endif

static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
}
SLAB_ATTR_RO(destroy_by_rcu);
#ifdef CONFIG_SLUB_DEBUG
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL);
}
SLAB_ATTR_RO(slabs);

static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
}
SLAB_ATTR_RO(total_objects);

static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects);

static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
SLAB_ATTR_RO(sanity_checks);

static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
SLAB_ATTR_RO(trace);

static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}
SLAB_ATTR_RO(red_zone);

static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
}
SLAB_ATTR_RO(poison);

static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}
SLAB_ATTR_RO(store_user);

static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

static ssize_t validate_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	int ret = -EINVAL;

	if (buf[0] == '1' && kmem_cache_debug(s)) {
		ret = validate_slab_cache(s);
		if (ret >= 0)
			ret = length;
	}
	return ret;
}
SLAB_ATTR(validate);

#endif /* CONFIG_SLUB_DEBUG */
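/*
 * Usage example (editorial, not part of the original source): on a cache
 * booted with debugging enabled, "echo 1 > /sys/kernel/slab/<cache>/validate"
 * runs validate_slab_cache() and reports inconsistencies to the kernel log;
 * the read-only flag files above (red_zone, poison, store_user, ...) simply
 * print 0 or 1.
 */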
#ifdef CONFIG_FAILSLAB
static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}

static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
			      size_t length)
{
	if (s->refcount > 1)
		return -EINVAL;

	if (buf[0] == '1')
		WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
	else
		WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);

	return length;
}
SLAB_ATTR(failslab);
#endif

static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

static ssize_t shrink_store(struct kmem_cache *s,
			    const char *buf, size_t length)
{
	if (buf[0] == '1')
		kmem_cache_shrink(s);
	else
		return -EINVAL;
	return length;
}
SLAB_ATTR(shrink);
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
}

static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
					      const char *buf, size_t length)
{
	unsigned int ratio;
	int err;

	err = kstrtouint(buf, 10, &ratio);
	if (err)
		return err;
	if (ratio > 100)
		return -ERANGE;

	s->remote_node_defrag_ratio = ratio * 10;

	return length;
}
SLAB_ATTR(remote_node_defrag_ratio);
#endif
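/*
 * Example (editorial, not part of the original source): writing 30 to
 * remote_node_defrag_ratio stores 300 internally (the ratio is kept in
 * tenths of a percent), and reading the file divides by 10 and prints 30
 * again.
 */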
#ifdef CONFIG_SLUB_STATS
static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
{
	unsigned long sum = 0;
	int cpu;
	int len = 0;
	int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	for_each_online_cpu(cpu) {
		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];

		data[cpu] = x;
		sum += x;
	}

	len += sysfs_emit_at(buf, len, "%lu", sum);

	for_each_online_cpu(cpu) {
		if (data[cpu])
			len += sysfs_emit_at(buf, len, " C%d=%u",
					     cpu, data[cpu]);
	}
	kfree(data);
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}

static void clear_stat(struct kmem_cache *s, enum stat_item si)
{
	int cpu;

	for_each_online_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
}

#define STAT_ATTR(si, text)					\
static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
{								\
	return show_stat(s, buf, si);				\
}								\
static ssize_t text##_store(struct kmem_cache *s,		\
				const char *buf, size_t length)	\
{								\
	if (buf[0] != '0')					\
		return -EINVAL;					\
	clear_stat(s, si);					\
	return length;						\
}								\
SLAB_ATTR(text)
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif	/* CONFIG_SLUB_STATS */
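/*
 * Usage example (editorial, not part of the original source,
 * CONFIG_SLUB_STATS only): reading /sys/kernel/slab/<cache>/alloc_fastpath
 * prints the summed counter followed by non-zero per-CPU values, e.g.
 * "12345 C0=7000 C1=5345"; writing "0" to the same file clears the counter
 * on every online CPU via clear_stat().
 */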
#ifdef CONFIG_KFENCE
static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
}

static ssize_t skip_kfence_store(struct kmem_cache *s,
				 const char *buf, size_t length)
{
	int ret = length;

	if (buf[0] == '0')
		s->flags &= ~SLAB_SKIP_KFENCE;
	else if (buf[0] == '1')
		s->flags |= SLAB_SKIP_KFENCE;
	else
		ret = -EINVAL;

	return ret;
}
SLAB_ATTR(skip_kfence);
#endif
static struct attribute *slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&min_partial_attr.attr,
	&cpu_partial_attr.attr,
	&objects_partial_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&shrink_attr.attr,
	&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
	&total_objects_attr.attr,
	&objects_attr.attr,
	&slabs_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
	&alloc_fastpath_attr.attr,
	&alloc_slowpath_attr.attr,
	&free_fastpath_attr.attr,
	&free_slowpath_attr.attr,
	&free_frozen_attr.attr,
	&free_add_partial_attr.attr,
	&free_remove_partial_attr.attr,
	&alloc_from_partial_attr.attr,
	&alloc_slab_attr.attr,
	&alloc_refill_attr.attr,
	&alloc_node_mismatch_attr.attr,
	&free_slab_attr.attr,
	&cpuslab_flush_attr.attr,
	&deactivate_full_attr.attr,
	&deactivate_empty_attr.attr,
	&deactivate_to_head_attr.attr,
	&deactivate_to_tail_attr.attr,
	&deactivate_remote_frees_attr.attr,
	&deactivate_bypass_attr.attr,
	&order_fallback_attr.attr,
	&cmpxchg_double_fail_attr.attr,
	&cmpxchg_double_cpu_fail_attr.attr,
	&cpu_partial_alloc_attr.attr,
	&cpu_partial_free_attr.attr,
	&cpu_partial_node_attr.attr,
	&cpu_partial_drain_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
	&failslab_attr.attr,
#endif
#ifdef CONFIG_HARDENED_USERCOPY
	&usersize_attr.attr,
#endif
#ifdef CONFIG_KFENCE
	&skip_kfence_attr.attr,
#endif

	NULL
};
static const struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};

static ssize_t slab_attr_show(struct kobject *kobj,
				struct attribute *attr,
				char *buf)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->show)
		return -EIO;

	return attribute->show(s, buf);
}

static ssize_t slab_attr_store(struct kobject *kobj,
				struct attribute *attr,
				const char *buf, size_t len)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->store)
		return -EIO;

	return attribute->store(s, buf, len);
}

static void kmem_cache_release(struct kobject *k)
{
	slab_kmem_cache_release(to_slab(k));
}

static const struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};

static const struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
	.release = kmem_cache_release,
};

static struct kset *slab_kset;

static inline struct kset *cache_kset(struct kmem_cache *s)
{
	return slab_kset;
}

#define ID_STR_LENGTH 32
/*
 * Create a unique string id for a slab cache:
 *
 * Format	:[flags-]size
 */
static char *create_unique_id(struct kmem_cache *s)
{
	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
	char *p = name;

	if (!name)
		return ERR_PTR(-ENOMEM);

	*p++ = ':';
	/*
	 * First flags affecting slabcache operations. We will only
	 * get here for aliasable slabs so we do not need to support
	 * too many flags. The flags here must cover all flags that
	 * are matched during merging to guarantee that the id is
	 * unique.
	 */
	if (s->flags & SLAB_CACHE_DMA)
		*p++ = 'd';
	if (s->flags & SLAB_CACHE_DMA32)
		*p++ = 'D';
	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		*p++ = 'a';
	if (s->flags & SLAB_CONSISTENCY_CHECKS)
		*p++ = 'F';
	if (s->flags & SLAB_ACCOUNT)
		*p++ = 'A';
	if (p != name + 1)
		*p++ = '-';
	p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);

	if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
		kfree(name);
		return ERR_PTR(-EINVAL);
	}
	kmsan_unpoison_memory(name, p - name);
	return name;
}
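/*
 * Example (editorial, not part of the original source): a mergeable 192-byte
 * cache with SLAB_RECLAIM_ACCOUNT and SLAB_ACCOUNT set would get an id such
 * as ":aA-0000192", while a cache with no merge-relevant flags is simply
 * ":0000192".
 */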
static int sysfs_slab_add(struct kmem_cache *s)
{
	int err;
	const char *name;
	struct kset *kset = cache_kset(s);
	int unmergeable = slab_unmergeable(s);

	if (!unmergeable && disable_higher_order_debug &&
			(slub_debug & DEBUG_METADATA_FLAGS))
		unmergeable = 1;

	if (unmergeable) {
		/*
		 * Slabcache can never be merged so we can use the name proper.
		 * This is typically the case for debug situations. In that
		 * case we can catch duplicate names easily.
		 */
		sysfs_remove_link(&slab_kset->kobj, s->name);
		name = s->name;
	} else {
		/*
		 * Create a unique name for the slab as a target
		 * for the symlinks.
		 */
		name = create_unique_id(s);
		if (IS_ERR(name))
			return PTR_ERR(name);
	}

	s->kobj.kset = kset;
	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
	if (err)
		goto out;

	err = sysfs_create_group(&s->kobj, &slab_attr_group);
	if (err)
		goto out_del_kobj;

	if (!unmergeable) {
		/* Setup first alias */
		sysfs_slab_alias(s, s->name);
	}
out:
	if (!unmergeable)
		kfree(name);
	return err;
out_del_kobj:
	kobject_del(&s->kobj);
	goto out;
}
void sysfs_slab_unlink(struct kmem_cache *s)
{
	if (slab_state >= FULL)
		kobject_del(&s->kobj);
}

void sysfs_slab_release(struct kmem_cache *s)
{
	if (slab_state >= FULL)
		kobject_put(&s->kobj);
}
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
	struct kmem_cache *s;
	const char *name;
	struct saved_alias *next;
};

static struct saved_alias *alias_list;

static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
	struct saved_alias *al;

	if (slab_state == FULL) {
		/*
		 * If we have a leftover link then remove it.
		 */
		sysfs_remove_link(&slab_kset->kobj, name);
		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
	}

	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
	if (!al)
		return -ENOMEM;

	al->s = s;
	al->name = name;
	al->next = alias_list;
	alias_list = al;
	kmsan_unpoison_memory(al, sizeof(*al));
	return 0;
}
static int __init slab_sysfs_init(void)
{
	struct kmem_cache *s;
	int err;

	mutex_lock(&slab_mutex);

	slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
	if (!slab_kset) {
		mutex_unlock(&slab_mutex);
		pr_err("Cannot register slab subsystem.\n");
		return -ENOMEM;
	}

	slab_state = FULL;

	list_for_each_entry(s, &slab_caches, list) {
		err = sysfs_slab_add(s);
		if (err)
			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
			       s->name);
	}

	while (alias_list) {
		struct saved_alias *al = alias_list;

		alias_list = alias_list->next;
		err = sysfs_slab_alias(al->s, al->name);
		if (err)
			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
			       al->name);
		kfree(al);
	}

	mutex_unlock(&slab_mutex);
	return 0;
}
late_initcall(slab_sysfs_init);
#endif /* SLAB_SUPPORTS_SYSFS */
#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
static int slab_debugfs_show(struct seq_file *seq, void *v)
{
	struct loc_track *t = seq->private;
	struct location *l;
	unsigned long idx;

	idx = (unsigned long) t->idx;
	if (idx < t->count) {
		l = &t->loc[idx];

		seq_printf(seq, "%7ld ", l->count);

		if (l->addr)
			seq_printf(seq, "%pS", (void *)l->addr);
		else
			seq_puts(seq, "<not-available>");

		if (l->waste)
			seq_printf(seq, " waste=%lu/%lu",
				l->count * l->waste, l->waste);

		if (l->sum_time != l->min_time) {
			seq_printf(seq, " age=%ld/%llu/%ld",
				l->min_time, div_u64(l->sum_time, l->count),
				l->max_time);
		} else
			seq_printf(seq, " age=%ld", l->min_time);

		if (l->min_pid != l->max_pid)
			seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
		else
			seq_printf(seq, " pid=%ld",
				l->min_pid);

		if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
			seq_printf(seq, " cpus=%*pbl",
				 cpumask_pr_args(to_cpumask(l->cpus)));

		if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
			seq_printf(seq, " nodes=%*pbl",
				 nodemask_pr_args(&l->nodes));

#ifdef CONFIG_STACKDEPOT
		{
			depot_stack_handle_t handle;
			unsigned long *entries;
			unsigned int nr_entries, j;

			handle = READ_ONCE(l->handle);
			if (handle) {
				nr_entries = stack_depot_fetch(handle, &entries);
				seq_puts(seq, "\n");
				for (j = 0; j < nr_entries; j++)
					seq_printf(seq, "        %pS\n",
						   (void *)entries[j]);
			} else {
				seq_puts(seq, "\n");
			}
		}
#else
		seq_puts(seq, "\n");
#endif
	}

	if (!idx && !t->count)
		seq_puts(seq, "No data\n");

	return 0;
}
*seq
, void *v
)
6349 static void *slab_debugfs_next(struct seq_file
*seq
, void *v
, loff_t
*ppos
)
6351 struct loc_track
*t
= seq
->private;
6354 if (*ppos
<= t
->count
)
6360 static int cmp_loc_by_count(const void *a
, const void *b
, const void *data
)
6362 struct location
*loc1
= (struct location
*)a
;
6363 struct location
*loc2
= (struct location
*)b
;
6365 if (loc1
->count
> loc2
->count
)
6371 static void *slab_debugfs_start(struct seq_file
*seq
, loff_t
*ppos
)
6373 struct loc_track
*t
= seq
->private;
6379 static const struct seq_operations slab_debugfs_sops
= {
6380 .start
= slab_debugfs_start
,
6381 .next
= slab_debugfs_next
,
6382 .stop
= slab_debugfs_stop
,
6383 .show
= slab_debugfs_show
,
static int slab_debug_trace_open(struct inode *inode, struct file *filep)
{
	int node;
	struct kmem_cache_node *n;
	enum track_item alloc;

	struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
						sizeof(struct loc_track));
	struct kmem_cache *s = file_inode(filep)->i_private;
	unsigned long *obj_map;

	if (!t)
		return -ENOMEM;

	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
	if (!obj_map) {
		seq_release_private(inode, filep);
		return -ENOMEM;
	}

	if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
		alloc = TRACK_ALLOC;
	else
		alloc = TRACK_FREE;

	if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
		bitmap_free(obj_map);
		seq_release_private(inode, filep);
		return -ENOMEM;
	}

	for_each_kmem_cache_node(s, node, n) {
		unsigned long flags;
		struct slab *slab;

		if (!node_nr_slabs(n))
			continue;

		spin_lock_irqsave(&n->list_lock, flags);
		list_for_each_entry(slab, &n->partial, slab_list)
			process_slab(t, s, slab, alloc, obj_map);
		list_for_each_entry(slab, &n->full, slab_list)
			process_slab(t, s, slab, alloc, obj_map);
		spin_unlock_irqrestore(&n->list_lock, flags);
	}

	/* Sort locations by count */
	sort_r(t->loc, t->count, sizeof(struct location),
		cmp_loc_by_count, NULL, NULL);

	bitmap_free(obj_map);
	return 0;
}
static int slab_debug_trace_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct loc_track *t = seq->private;

	free_loc_track(t);
	return seq_release_private(inode, file);
}

static const struct file_operations slab_debugfs_fops = {
	.open    = slab_debug_trace_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = slab_debug_trace_release,
};

static void debugfs_slab_add(struct kmem_cache *s)
{
	struct dentry *slab_cache_dir;

	if (unlikely(!slab_debugfs_root))
		return;

	slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);

	debugfs_create_file("alloc_traces", 0400,
		slab_cache_dir, s, &slab_debugfs_fops);

	debugfs_create_file("free_traces", 0400,
		slab_cache_dir, s, &slab_debugfs_fops);
}

void debugfs_slab_release(struct kmem_cache *s)
{
	debugfs_lookup_and_remove(s->name, slab_debugfs_root);
}
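/*
 * Usage example (editorial, not part of the original source): for a cache
 * created with slub_debug=U, reading
 * /sys/kernel/debug/slab/<cache>/alloc_traces prints one line per unique
 * allocation call site with its count, age statistics, pid range, cpus and
 * nodes, sorted by count via cmp_loc_by_count().
 */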
static int __init slab_debugfs_init(void)
{
	struct kmem_cache *s;

	slab_debugfs_root = debugfs_create_dir("slab", NULL);

	list_for_each_entry(s, &slab_caches, list)
		if (s->flags & SLAB_STORE_USER)
			debugfs_slab_add(s);

	return 0;
}
__initcall(slab_debugfs_init);
#endif /* CONFIG_SLUB_DEBUG && CONFIG_DEBUG_FS */

/*
 * The /proc/slabinfo ABI
 */
#ifdef CONFIG_SLUB_DEBUG
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
	unsigned long nr_slabs = 0;
	unsigned long nr_objs = 0;
	unsigned long nr_free = 0;
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n) {
		nr_slabs += node_nr_slabs(n);
		nr_objs += node_nr_objs(n);
		nr_free += count_partial(n, count_free);
	}

	sinfo->active_objs = nr_objs - nr_free;
	sinfo->num_objs = nr_objs;
	sinfo->active_slabs = nr_slabs;
	sinfo->num_slabs = nr_slabs;
	sinfo->objects_per_slab = oo_objects(s->oo);
	sinfo->cache_order = oo_order(s->oo);
}

void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{
}

ssize_t slabinfo_write(struct file *file, const char __user *buffer,
		       size_t count, loff_t *ppos)
{
	return -EIO;
}
#endif /* CONFIG_SLUB_DEBUG */
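/*
 * Example (editorial, not part of the original source): the values filled in
 * by get_slabinfo() surface as the "active_objs num_objs objsize objperslab
 * pagesperslab" columns of /proc/slabinfo, so a cache with 300 objects of
 * which 40 sit free on partial slabs reports active_objs == 260 and
 * num_objs == 300.
 */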