// SPDX-License-Identifier: GPL-2.0
/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */
#include <linux/mm.h>
#include <linux/swap.h> /* mm_account_reclaimed_pages() */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/stackdepot.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/kfence.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <linux/random.h>
#include <kunit/test.h>
#include <kunit/test-bug.h>
#include <linux/sort.h>

#include <linux/debugfs.h>
#include <trace/events/kmem.h>

#include "internal.h"
/*
 * Lock order:
 *   1. slab_mutex (Global Mutex)
 *   2. node->list_lock (Spinlock)
 *   3. kmem_cache->cpu_slab->lock (Local lock)
 *   4. slab_lock(slab) (Only on some arches)
 *   5. object_map_lock (Only for debugging)
 *
 *   slab_mutex
 *
 *   The role of the slab_mutex is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *   Also synchronizes memory hotplug callbacks.
 *
 *   slab_lock
 *
 *   The slab_lock is a wrapper around the page lock, thus it is a bit
 *   spinlock.
 *
 *   The slab_lock is only used on arches that do not have the ability
 *   to do a cmpxchg_double. It only protects:
 *
 *	A. slab->freelist	-> List of free objects in a slab
 *	B. slab->inuse		-> Number of objects in use
 *	C. slab->objects	-> Number of objects in slab
 *	D. slab->frozen		-> frozen state
 *
 *   Frozen slabs
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list except the per cpu partial list. The processor that froze the
 *   slab is the one who can perform list operations on the slab. Other
 *   processors may put objects onto the freelist but the processor that
 *   froze the slab is the only one that can retrieve the objects from the
 *   slab's freelist.
 *
 *   list_lock
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added or
 *   removed from the lists nor may the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. F.e.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *
 *   For debug caches, all allocations are forced to go through a list_lock
 *   protected region to serialize against concurrent validation.
 *
 *   cpu_slab->lock local lock
 *
 *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
 *   except the stat counters. This is a percpu structure manipulated only by
 *   the local cpu, so the lock protects against being preempted or interrupted
 *   by an irq. Fast path operations rely on lockless operations instead.
 *
 *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption,
 *   which means the lockless fastpath cannot be used as it might interfere
 *   with in-progress slow path operations. In this case the local lock is
 *   always taken but it still utilizes the freelist for the common operations.
 *
 *   lockless fastpaths
 *
 *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
 *   are fully lockless when satisfied from the percpu slab (and when
 *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
 *   They also don't disable preemption or migration or irqs. They rely on
 *   the transaction id (tid) field to detect being preempted or moved to
 *   another cpu.
 *
 *   irq, preemption, migration considerations
 *
 *   Interrupts are disabled as part of list_lock or local_lock operations, or
 *   around the slab_lock operation, in order to make the slab allocator safe
 *   to use in the context of an irq.
 *
 *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
 *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
 *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
 *   doesn't have to be revalidated in each section protected by the local lock.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * slab->frozen		The slab is frozen and exempt from list processing.
 *			This means that the slab is dedicated to a purpose
 *			such as satisfying allocations for a specific
 *			processor. Objects may be freed in the slab while
 *			it is frozen but slab_free will then skip the usual
 *			list operations. It is up to the processor holding
 *			the slab to integrate the slab into the slab lists
 *			when the slab is no longer needed.
 *
 *			One use of this flag is to mark slabs that are
 *			used for allocations. Then such a slab becomes a cpu
 *			slab. The cpu slab may be equipped with an additional
 *			freelist that allows lockless access to
 *			free objects in addition to the regular freelist
 *			that requires the slab lock.
 *
 * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path and disables lockless freelists.
 */
/*
 * We could simply use migrate_disable()/enable() but as long as it's a
 * function call even on !PREEMPT_RT, use inline preempt_disable() there.
 */
#ifndef CONFIG_PREEMPT_RT
#define slub_get_cpu_ptr(var)		get_cpu_ptr(var)
#define slub_put_cpu_ptr(var)		put_cpu_ptr(var)
#define USE_LOCKLESS_FAST_PATH()	(true)
#else
#define slub_get_cpu_ptr(var)		\
({					\
	migrate_disable();		\
	this_cpu_ptr(var);		\
})
#define slub_put_cpu_ptr(var)		\
do {					\
	(void)(var);			\
	migrate_enable();		\
} while (0)
#define USE_LOCKLESS_FAST_PATH()	(false)
#endif

#ifndef CONFIG_SLUB_TINY
#define __fastpath_inline __always_inline
#else
#define __fastpath_inline
#endif

#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif /* CONFIG_SLUB_DEBUG */
/* Structure holding parameters for get_partial() call chain */
struct partial_context {
	struct slab **slab;
	gfp_t flags;
	unsigned int orig_size;
};

static inline bool kmem_cache_debug(struct kmem_cache *s)
{
	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}

static inline bool slub_debug_orig_size(struct kmem_cache *s)
{
	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
			(s->flags & SLAB_KMALLOC));
}

void *fixup_red_left(struct kmem_cache *s, void *p)
{
	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
		p += s->red_left_pad;

	return p;
}

static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	return !kmem_cache_debug(s);
#else
	return false;
#endif
}
/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

#ifndef CONFIG_SLUB_TINY
/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10
#else
#define MIN_PARTIAL 0
#define MAX_PARTIAL 0
#endif

#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * These debug flags cannot use CMPXCHG because there might be consistency
 * issues when checking or reading debug information
 */
#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
				SLAB_TRACE)

/*
 * Debugging flags that require metadata to be stored in the slab. These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since slab.objects is u15 */
/* Internal SLUB flags */
/* Poison object */
#define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */

#ifdef system_has_freelist_aba
#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)
#else
#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0U)
#endif

/*
 * Tracking user of a slab.
 */
#define TRACK_ADDRS_COUNT 16
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKDEPOT
	depot_stack_handle_t handle;
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef SLAB_SUPPORTS_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
#endif

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
static void debugfs_slab_add(struct kmem_cache *);
#else
static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	/*
	 * The rmw is racy on a preemptible kernel but this is acceptable, so
	 * avoid this_cpu_add()'s irq-disable overhead.
	 */
	raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}

/*
 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
 * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
 * differ during memory hotplug/hotremove operations.
 * Protected by slab_mutex.
 */
static nodemask_t slab_nodes;

#ifndef CONFIG_SLUB_TINY
/*
 * Workqueue used for flush_cpu_slab().
 */
static struct workqueue_struct *flushwq;
#endif

/********************************************************************
 *			Core slab cache functions
 *******************************************************************/
/*
 * Returns freelist pointer (ptr). With hardening, this is obfuscated
 * with an XOR of the address where the pointer is held and a per-cache
 * random number.
 */
static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
				 unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	/*
	 * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
	 * Normally, this doesn't cause any issues, as both set_freepointer()
	 * and get_freepointer() are called with a pointer with the same tag.
	 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
	 * example, when __free_slub() iterates over objects in a cache, it
	 * passes untagged pointers to check_object(). check_object() in turn
	 * calls get_freepointer() with an untagged pointer, which causes the
	 * freepointer to be restored incorrectly.
	 */
	return (void *)((unsigned long)ptr ^ s->random ^
			swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
#else
	return ptr;
#endif
}
/* Returns the freelist pointer recorded at location ptr_addr. */
static inline void *freelist_dereference(const struct kmem_cache *s,
					 void *ptr_addr)
{
	return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
			    (unsigned long)ptr_addr);
}

static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	object = kasan_reset_tag(object);
	return freelist_dereference(s, object + s->offset);
}

#ifndef CONFIG_SLUB_TINY
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
	prefetchw(object + s->offset);
}
#endif
/*
 * When running under KMSAN, get_freepointer_safe() may return an uninitialized
 * pointer value in the case the current thread loses the race for the next
 * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
 * slab_alloc_node() will fail, so the uninitialized value won't be used, but
 * KMSAN will still check all arguments of cmpxchg because of imperfect
 * handling of inline assembly.
 * To work around this problem, we apply __no_kmsan_checks to ensure that
 * get_freepointer_safe() returns initialized memory.
 */
__no_kmsan_checks
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	unsigned long freepointer_addr;
	void *p;

	if (!debug_pagealloc_enabled_static())
		return get_freepointer(s, object);

	object = kasan_reset_tag(object);
	freepointer_addr = (unsigned long)object + s->offset;
	copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
	return freelist_ptr(s, p, freepointer_addr);
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	unsigned long freeptr_addr = (unsigned long)object + s->offset;

#ifdef CONFIG_SLAB_FREELIST_HARDENED
	BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif

	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
	*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = fixup_red_left(__s, __addr); \
		__p < (__addr) + (__objects) * (__s)->size; \
		__p += (__s)->size)

static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
	return ((unsigned int)PAGE_SIZE << order) / size;
}
static inline struct kmem_cache_order_objects oo_make(unsigned int order,
						      unsigned int size)
{
	struct kmem_cache_order_objects x = {
		(order << OO_SHIFT) + order_objects(order, size)
	};

	return x;
}

static inline unsigned int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}
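/*
 * Illustrative example (not from the original source): with 4 KiB pages,
 * oo_make(1, 256) packs both halves of the encoding into one word:
 *
 *	order_objects(1, 256) = (4096 << 1) / 256 = 32 objects
 *	x.x = (1 << OO_SHIFT) + 32 = 0x10020
 *
 * so oo_order() recovers 1 from the high bits and oo_objects() recovers
 * 32 from the low OO_MASK bits.
 */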
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
{
	unsigned int nr_slabs;

	s->cpu_partial = nr_objects;

	/*
	 * We take the number of objects but actually limit the number of
	 * slabs on the per cpu partial list, in order to limit excessive
	 * growth of the list. For simplicity we assume that the slabs will
	 * be half-full.
	 */
	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
	s->cpu_partial_slabs = nr_slabs;
}
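/*
 * Illustrative example (not from the original source): for a cache whose
 * slabs hold oo_objects(s->oo) == 16 objects, slub_set_cpu_partial(s, 30)
 * yields nr_slabs = DIV_ROUND_UP(30 * 2, 16) = 4, i.e. the half-full
 * assumption converts a budget of 30 objects into a cap of 4 slabs on
 * the per cpu partial list.
 */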
#else
static inline void
slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
{
}
#endif /* CONFIG_SLUB_CPU_PARTIAL */
/*
 * Per slab locking using the pagelock
 */
static __always_inline void slab_lock(struct slab *slab)
{
	struct page *page = slab_page(slab);

	VM_BUG_ON_PAGE(PageTail(page), page);
	bit_spin_lock(PG_locked, &page->flags);
}

static __always_inline void slab_unlock(struct slab *slab)
{
	struct page *page = slab_page(slab);

	VM_BUG_ON_PAGE(PageTail(page), page);
	__bit_spin_unlock(PG_locked, &page->flags);
}
static inline bool
__update_freelist_fast(struct slab *slab,
		      void *freelist_old, unsigned long counters_old,
		      void *freelist_new, unsigned long counters_new)
{
#ifdef system_has_freelist_aba
	freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
	freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };

	return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
#else
	return false;
#endif
}

static inline bool
__update_freelist_slow(struct slab *slab,
		      void *freelist_old, unsigned long counters_old,
		      void *freelist_new, unsigned long counters_new)
{
	bool ret = false;

	slab_lock(slab);
	if (slab->freelist == freelist_old &&
	    slab->counters == counters_old) {
		slab->freelist = freelist_new;
		slab->counters = counters_new;
		ret = true;
	}
	slab_unlock(slab);

	return ret;
}
/*
 * Interrupts must be disabled (for the fallback code to work right), typically
 * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
 * part of bit_spin_lock(), is sufficient because the policy is not to allow any
 * allocation/free operation in hardirq context. Therefore nothing can
 * interrupt the operation.
 */
static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	bool ret;

	if (USE_LOCKLESS_FAST_PATH())
		lockdep_assert_irqs_disabled();

	if (s->flags & __CMPXCHG_DOUBLE) {
		ret = __update_freelist_fast(slab, freelist_old, counters_old,
					     freelist_new, counters_new);
	} else {
		ret = __update_freelist_slow(slab, freelist_old, counters_old,
					     freelist_new, counters_new);
	}
	if (likely(ret))
		return true;

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}
static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	bool ret;

	if (s->flags & __CMPXCHG_DOUBLE) {
		ret = __update_freelist_fast(slab, freelist_old, counters_old,
					     freelist_new, counters_new);
	} else {
		unsigned long flags;

		local_irq_save(flags);
		ret = __update_freelist_slow(slab, freelist_old, counters_old,
					     freelist_new, counters_new);
		local_irq_restore(flags);
	}
	if (likely(ret))
		return true;

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}
#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);

static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
		       struct slab *slab)
{
	void *addr = slab_address(slab);
	void *p;

	bitmap_zero(obj_map, slab->objects);

	for (p = slab->freelist; p; p = get_freepointer(s, p))
		set_bit(__obj_to_index(s, addr, p), obj_map);
}
#if IS_ENABLED(CONFIG_KUNIT)
static bool slab_add_kunit_errors(void)
{
	struct kunit_resource *resource;

	if (!kunit_get_current_test())
		return false;

	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
	if (!resource)
		return false;

	(*(int *)resource->data)++;
	kunit_put_resource(resource);
	return true;
}
#else
static inline bool slab_add_kunit_errors(void) { return false; }
#endif
static inline unsigned int size_from_object(struct kmem_cache *s)
{
	if (s->flags & SLAB_RED_ZONE)
		return s->size - s->red_left_pad;

	return s->size;
}

static inline void *restore_red_left(struct kmem_cache *s, void *p)
{
	if (s->flags & SLAB_RED_ZONE)
		p -= s->red_left_pad;

	return p;
}
/*
 * Debug settings:
 */
#if defined(CONFIG_SLUB_DEBUG_ON)
static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static slab_flags_t slub_debug;
#endif

static char *slub_debug_string;
static int disable_higher_order_debug;

/*
 * slub is about to manipulate internal object metadata. This memory lies
 * outside the range of the allocated object, so accessing it would normally
 * be reported by kasan as a bounds error. metadata_access_enable() is used
 * to tell kasan that these accesses are OK.
 */
static inline void metadata_access_enable(void)
{
	kasan_disable_current();
}

static inline void metadata_access_disable(void)
{
	kasan_enable_current();
}
/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
				struct slab *slab, void *object)
{
	void *base;

	if (!object)
		return 1;

	base = slab_address(slab);
	object = kasan_reset_tag(object);
	object = restore_red_left(s, object);
	if (object < base || object >= base + slab->objects * s->size ||
		(object - base) % s->size) {
		return 0;
	}

	return 1;
}
static void print_section(char *level, char *text, u8 *addr,
			  unsigned int length)
{
	metadata_access_enable();
	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
			16, 1, kasan_reset_tag((void *)addr), length, 1);
	metadata_access_disable();
}

/*
 * See comment in calculate_sizes().
 */
static inline bool freeptr_outside_object(struct kmem_cache *s)
{
	return s->offset >= s->inuse;
}

/*
 * Return offset of the end of info block which is inuse + free pointer if
 * not overlapping with object.
 */
static inline unsigned int get_info_end(struct kmem_cache *s)
{
	if (freeptr_outside_object(s))
		return s->inuse + sizeof(void *);
	else
		return s->inuse;
}
static struct track *get_track(struct kmem_cache *s, void *object,
	enum track_item alloc)
{
	struct track *p;

	p = object + get_info_end(s);

	return kasan_reset_tag(p + alloc);
}

#ifdef CONFIG_STACKDEPOT
static noinline depot_stack_handle_t set_track_prepare(void)
{
	depot_stack_handle_t handle;
	unsigned long entries[TRACK_ADDRS_COUNT];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
	handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);

	return handle;
}
#else
static inline depot_stack_handle_t set_track_prepare(void)
{
	return 0;
}
#endif
static void set_track_update(struct kmem_cache *s, void *object,
			     enum track_item alloc, unsigned long addr,
			     depot_stack_handle_t handle)
{
	struct track *p = get_track(s, object, alloc);

#ifdef CONFIG_STACKDEPOT
	p->handle = handle;
#endif
	p->addr = addr;
	p->cpu = smp_processor_id();
	p->pid = current->pid;
	p->when = jiffies;
}

static __always_inline void set_track(struct kmem_cache *s, void *object,
				      enum track_item alloc, unsigned long addr)
{
	depot_stack_handle_t handle = set_track_prepare();

	set_track_update(s, object, alloc, addr, handle);
}
static void init_tracking(struct kmem_cache *s, void *object)
{
	struct track *p;

	if (!(s->flags & SLAB_STORE_USER))
		return;

	p = get_track(s, object, TRACK_ALLOC);
	memset(p, 0, 2*sizeof(struct track));
}
static void print_track(const char *s, struct track *t, unsigned long pr_time)
{
	depot_stack_handle_t handle __maybe_unused;

	if (!t->addr)
		return;

	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKDEPOT
	handle = READ_ONCE(t->handle);
	if (handle)
		stack_depot_print(handle);
	else
		pr_err("object allocation/free stack trace missing\n");
#endif
}

void print_tracking(struct kmem_cache *s, void *object)
{
	unsigned long pr_time = jiffies;

	if (!(s->flags & SLAB_STORE_USER))
		return;

	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
}
static void print_slab_info(const struct slab *slab)
{
	struct folio *folio = (struct folio *)slab_folio(slab);

	pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
	       slab, slab->objects, slab->inuse, slab->freelist,
	       folio_flags(folio, 0));
}
/*
 * kmalloc caches have fixed sizes (mostly power of 2), and the kmalloc() API
 * family will round up the real request size to these fixed ones, so there
 * can be extra space beyond what is requested. Save the original request
 * size in the metadata area, for better debugging and sanity checks.
 */
static inline void set_orig_size(struct kmem_cache *s,
				void *object, unsigned int orig_size)
{
	void *p = kasan_reset_tag(object);

	if (!slub_debug_orig_size(s))
		return;

#ifdef CONFIG_KASAN_GENERIC
	/*
	 * KASAN can save its free meta data in the object's data area at
	 * offset 0. If that meta data size is larger than 'orig_size', it
	 * will overlap the data redzone in [orig_size+1, object_size], and
	 * the check should be skipped.
	 */
	if (kasan_metadata_size(s, true) > orig_size)
		orig_size = s->object_size;
#endif

	p += get_info_end(s);
	p += sizeof(struct track) * 2;

	*(unsigned int *)p = orig_size;
}
static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
{
	void *p = kasan_reset_tag(object);

	if (!slub_debug_orig_size(s))
		return s->object_size;

	p += get_info_end(s);
	p += sizeof(struct track) * 2;

	return *(unsigned int *)p;
}

void skip_orig_size_check(struct kmem_cache *s, const void *object)
{
	set_orig_size(s, (void *)object, s->object_size);
}
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("=============================================================================\n");
	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
	pr_err("-----------------------------------------------------------------------------\n\n");
	va_end(args);
}

static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (slab_add_kunit_errors())
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("FIX %s: %pV\n", s->name, &vaf);
	va_end(args);
}
static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = slab_address(slab);

	print_tracking(s, p);

	print_slab_info(slab);

	pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
	       p, p - addr, get_freepointer(s, p));

	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
			      s->red_left_pad);
	else if (p > addr + 16)
		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);

	print_section(KERN_ERR,         "Object   ", p,
		      min_t(unsigned int, s->object_size, PAGE_SIZE));
	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone  ", p + s->object_size,
			s->inuse - s->object_size);

	off = get_info_end(s);

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	if (slub_debug_orig_size(s))
		off += sizeof(unsigned int);

	off += kasan_metadata_size(s, false);

	if (off != size_from_object(s))
		/* Beginning of the filler is the free pointer */
		print_section(KERN_ERR, "Padding  ", p + off,
			      size_from_object(s) - off);

	dump_stack();
}
static void object_err(struct kmem_cache *s, struct slab *slab,
			u8 *object, char *reason)
{
	if (slab_add_kunit_errors())
		return;

	slab_bug(s, "%s", reason);
	print_trailer(s, slab, object);
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
			       void **freelist, void *nextfree)
{
	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
	    !check_valid_pointer(s, slab, nextfree) && freelist) {
		object_err(s, slab, *freelist, "Freechain corrupt");
		*freelist = NULL;
		slab_fix(s, "Isolate corrupted freechain");
		return true;
	}

	return false;
}
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
			const char *fmt, ...)
{
	va_list args;
	char buf[100];

	if (slab_add_kunit_errors())
		return;

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	slab_bug(s, "%s", buf);
	print_slab_info(slab);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
	u8 *p = kasan_reset_tag(object);
	unsigned int poison_size = s->object_size;

	if (s->flags & SLAB_RED_ZONE) {
		memset(p - s->red_left_pad, val, s->red_left_pad);

		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
			/*
			 * Redzone the extra space kmalloc allocated beyond
			 * the requested size, and limit the poison size to
			 * the original request size accordingly.
			 */
			poison_size = get_orig_size(s, object);
		}
	}

	if (s->flags & __OBJECT_POISON) {
		memset(p, POISON_FREE, poison_size - 1);
		p[poison_size - 1] = POISON_END;
	}

	if (s->flags & SLAB_RED_ZONE)
		memset(p + poison_size, val, s->inuse - poison_size);
}
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
						void *from, void *to)
{
	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
	memset(from, data, to - from);
}
static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
				  u8 *object, char *what,
				  u8 *start, unsigned int value, unsigned int bytes)
{
	u8 *fault;
	u8 *end;
	u8 *addr = slab_address(slab);

	metadata_access_enable();
	fault = memchr_inv(kasan_reset_tag(start), value, bytes);
	metadata_access_disable();
	if (!fault)
		return 1;

	end = start + bytes;
	while (end > fault && end[-1] == value)
		end--;

	if (slab_add_kunit_errors())
		goto skip_bug_print;

	slab_bug(s, "%s overwritten", what);
	pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault - addr,
					fault[0], value);
	print_trailer(s, slab, object);
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

skip_bug_print:
	restore_bytes(s, what, value, fault, end);
	return 0;
}
/*
 * Object layout:
 *
 * object address
 *	Bytes of the object to be managed.
 *	If the freepointer may overlay the object then the free
 *	pointer is at the middle of the object.
 *
 *	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 *	0xa5 (POISON_END)
 *
 * object + s->object_size
 *	Padding to reach word boundary. This is also used for Redzoning.
 *	Padding is extended by another word if Redzoning is enabled and
 *	object_size == inuse.
 *
 *	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 *	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 *	Meta data starts here.
 *
 *	A. Free pointer (if we cannot overwrite object on free)
 *	B. Tracking data for SLAB_STORE_USER
 *	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
 *	D. Padding to reach required alignment boundary or at minimum
 *		one word if debugging is on to be able to detect writes
 *		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 *	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */
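/*
 * Illustrative example (not from the original source): for a hypothetical
 * 64-bit cache with object_size 24 and SLAB_RED_ZONE, SLAB_POISON and
 * SLAB_STORE_USER set, one object consists of: the left redzone
 * (red_left_pad), 24 object bytes poisoned with POISON_FREE and ending in
 * POISON_END, a right redzone filling up to the word boundary at s->inuse,
 * an out-of-object free pointer, two struct track records (alloc and
 * free), and POISON_INUSE padding up to s->size.
 */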
static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
	unsigned long off = get_info_end(s);	/* The end of info */

	if (s->flags & SLAB_STORE_USER) {
		/* We also have user information there */
		off += 2 * sizeof(struct track);

		if (s->flags & SLAB_KMALLOC)
			off += sizeof(unsigned int);
	}

	off += kasan_metadata_size(s, false);

	if (size_from_object(s) == off)
		return 1;

	return check_bytes_and_report(s, slab, p, "Object padding",
			p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
static void slab_pad_check(struct kmem_cache *s, struct slab *slab)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	u8 *pad;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return;

	start = slab_address(slab);
	length = slab_size(slab);
	end = start + length;
	remainder = length % s->size;
	if (!remainder)
		return;

	pad = end - remainder;
	metadata_access_enable();
	fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
	metadata_access_disable();
	if (!fault)
		return;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
			fault, end - 1, fault - start);
	print_section(KERN_ERR, "Padding ", pad, remainder);

	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
}
static int check_object(struct kmem_cache *s, struct slab *slab,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->object_size;
	unsigned int orig_size;

	if (s->flags & SLAB_RED_ZONE) {
		if (!check_bytes_and_report(s, slab, object, "Left Redzone",
			object - s->red_left_pad, val, s->red_left_pad))
			return 0;

		if (!check_bytes_and_report(s, slab, object, "Right Redzone",
			endobject, val, s->inuse - s->object_size))
			return 0;

		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
			orig_size = get_orig_size(s, object);

			if (s->object_size > orig_size &&
				!check_bytes_and_report(s, slab, object,
					"kmalloc Redzone", p + orig_size,
					val, s->object_size - orig_size)) {
				return 0;
			}
		}
	} else {
		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
			check_bytes_and_report(s, slab, p, "Alignment padding",
				endobject, POISON_INUSE,
				s->inuse - s->object_size);
		}
	}

	if (s->flags & SLAB_POISON) {
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
			(!check_bytes_and_report(s, slab, p, "Poison", p,
					POISON_FREE, s->object_size - 1) ||
			 !check_bytes_and_report(s, slab, p, "End Poison",
				p + s->object_size - 1, POISON_END, 1)))
			return 0;
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, slab, p);
	}

	if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, slab, get_freepointer(s, p))) {
		object_err(s, slab, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}
static int check_slab(struct kmem_cache *s, struct slab *slab)
{
	int maxobj;

	if (!folio_test_slab(slab_folio(slab))) {
		slab_err(s, slab, "Not a valid slab page");
		return 0;
	}

	maxobj = order_objects(slab_order(slab), s->size);
	if (slab->objects > maxobj) {
		slab_err(s, slab, "objects %u > max %u",
			slab->objects, maxobj);
		return 0;
	}
	if (slab->inuse > slab->objects) {
		slab_err(s, slab, "inuse %u > max %u",
			slab->inuse, slab->objects);
		return 0;
	}
	/* Slab_pad_check fixes things up after itself */
	slab_pad_check(s, slab);
	return 1;
}
/*
 * Determine if a certain object in a slab is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 */
static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
	int nr = 0;
	void *fp;
	void *object = NULL;
	int max_objects;

	fp = slab->freelist;
	while (fp && nr <= slab->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, slab, fp)) {
			if (object) {
				object_err(s, slab, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
			} else {
				slab_err(s, slab, "Freepointer corrupt");
				slab->freelist = NULL;
				slab->inuse = slab->objects;
				slab_fix(s, "Freelist cleared");
				return 0;
			}
			break;
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	max_objects = order_objects(slab_order(slab), s->size);
	if (max_objects > MAX_OBJS_PER_PAGE)
		max_objects = MAX_OBJS_PER_PAGE;

	if (slab->objects != max_objects) {
		slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
			 slab->objects, max_objects);
		slab->objects = max_objects;
		slab_fix(s, "Number of objects adjusted");
	}
	if (slab->inuse != slab->objects - nr) {
		slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
			 slab->inuse, slab->objects - nr);
		slab->inuse = slab->objects - nr;
		slab_fix(s, "Object count adjusted");
	}
	return search == NULL;
}
static void trace(struct kmem_cache *s, struct slab *slab, void *object,
								int alloc)
{
	if (s->flags & SLAB_TRACE) {
		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
			s->name,
			alloc ? "alloc" : "free",
			object, slab->inuse,
			slab->freelist);

		if (!alloc)
			print_section(KERN_INFO, "Object ", (void *)object,
					s->object_size);

		dump_stack();
	}
}
/*
 * Tracking of fully allocated slabs for debugging purposes.
 */
static void add_full(struct kmem_cache *s,
	struct kmem_cache_node *n, struct slab *slab)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	lockdep_assert_held(&n->list_lock);
	list_add(&slab->slab_list, &n->full);
}

static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	lockdep_assert_held(&n->list_lock);
	list_del(&slab->slab_list);
}
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->nr_slabs);
}

static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	/*
	 * May be called early in order to allocate a slab for the
	 * kmem_cache_node structure. Solve the chicken-egg
	 * dilemma by deferring the increment of the count during
	 * bootstrap (see early_kmem_cache_node_alloc).
	 */
	if (likely(n)) {
		atomic_long_inc(&n->nr_slabs);
		atomic_long_add(objects, &n->total_objects);
	}
}
static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	atomic_long_dec(&n->nr_slabs);
	atomic_long_sub(objects, &n->total_objects);
}
/* Object debug checks for alloc/free paths */
static void setup_object_debug(struct kmem_cache *s, void *object)
{
	if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
		return;

	init_object(s, object, SLUB_RED_INACTIVE);
	init_tracking(s, object);
}

static
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
{
	if (!kmem_cache_debug_flags(s, SLAB_POISON))
		return;

	metadata_access_enable();
	memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
	metadata_access_disable();
}
static inline int alloc_consistency_checks(struct kmem_cache *s,
					struct slab *slab, void *object)
{
	if (!check_slab(s, slab))
		return 0;

	if (!check_valid_pointer(s, slab, object)) {
		object_err(s, slab, object, "Freelist Pointer check fails");
		return 0;
	}

	if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
		return 0;

	return 1;
}
static noinline bool alloc_debug_processing(struct kmem_cache *s,
			struct slab *slab, void *object, int orig_size)
{
	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!alloc_consistency_checks(s, slab, object))
			goto bad;
	}

	/* Success. Perform special debug activities for allocs */
	trace(s, slab, object, 1);
	set_orig_size(s, object, orig_size);
	init_object(s, object, SLUB_RED_ACTIVE);
	return true;

bad:
	if (folio_test_slab(slab_folio(slab))) {
		/*
		 * If this is a slab page then lets do the best we can
		 * to avoid issues in the future. Marking all objects
		 * as used avoids touching the remaining objects.
		 */
		slab_fix(s, "Marking all objects used");
		slab->inuse = slab->objects;
		slab->freelist = NULL;
	}
	return false;
}
static inline int free_consistency_checks(struct kmem_cache *s,
		struct slab *slab, void *object, unsigned long addr)
{
	if (!check_valid_pointer(s, slab, object)) {
		slab_err(s, slab, "Invalid object pointer 0x%p", object);
		return 0;
	}

	if (on_freelist(s, slab, object)) {
		object_err(s, slab, object, "Object already free");
		return 0;
	}

	if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
		return 0;

	if (unlikely(s != slab->slab_cache)) {
		if (!folio_test_slab(slab_folio(slab))) {
			slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
				 object);
		} else if (!slab->slab_cache) {
			pr_err("SLUB <none>: no slab for object 0x%p.\n",
			       object);
			dump_stack();
		} else
			object_err(s, slab, object,
					"page slab pointer corrupt.");
		return 0;
	}
	return 1;
}
/*
 * Parse a block of slub_debug options. Blocks are delimited by ';'
 *
 * @str:    start of block
 * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
 * @slabs:  return start of list of slabs, or NULL when there's no list
 * @init:   assume this is initial parsing and not per-kmem-create parsing
 *
 * returns the start of next block if there's any, or NULL
 */
static char *
parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
{
	bool higher_order_disable = false;

	/* Skip any completely empty blocks */
	while (*str && *str == ';')
		str++;

	if (*str == ',') {
		/*
		 * No options but restriction on slabs. This means full
		 * debugging for slabs matching a pattern.
		 */
		*flags = DEBUG_DEFAULT_FLAGS;
		goto check_slabs;
	}
	*flags = 0;

	/* Determine which debug features should be switched on */
	for (; *str && *str != ',' && *str != ';'; str++) {
		switch (tolower(*str)) {
		case '-':
			*flags = 0;
			break;
		case 'f':
			*flags |= SLAB_CONSISTENCY_CHECKS;
			break;
		case 'z':
			*flags |= SLAB_RED_ZONE;
			break;
		case 'p':
			*flags |= SLAB_POISON;
			break;
		case 'u':
			*flags |= SLAB_STORE_USER;
			break;
		case 't':
			*flags |= SLAB_TRACE;
			break;
		case 'a':
			*flags |= SLAB_FAILSLAB;
			break;
		case 'o':
			/*
			 * Avoid enabling debugging on caches if its minimum
			 * order would increase as a result.
			 */
			higher_order_disable = true;
			break;
		default:
			pr_err("slub_debug option '%c' unknown. skipped\n", *str);
		}
	}
check_slabs:
	if (*str == ',')
		*slabs = ++str;
	else
		*slabs = NULL;

	/* Skip over the slab list */
	while (*str && *str != ';')
		str++;

	/* Skip any completely empty blocks */
	while (*str && *str == ';')
		str++;

	if (init && higher_order_disable)
		disable_higher_order_debug = 1;

	if (*str)
		return str;
	else
		return NULL;
}
static int __init setup_slub_debug(char *str)
{
	slab_flags_t flags;
	slab_flags_t global_flags;
	char *saved_str;
	char *slab_list;
	bool global_slub_debug_changed = false;
	bool slab_list_specified = false;

	global_flags = DEBUG_DEFAULT_FLAGS;
	if (*str++ != '=' || !*str)
		/*
		 * No options specified. Switch on full debugging.
		 */
		goto out;

	saved_str = str;
	while (str) {
		str = parse_slub_debug_flags(str, &flags, &slab_list, true);

		if (!slab_list) {
			global_flags = flags;
			global_slub_debug_changed = true;
		} else {
			slab_list_specified = true;
			if (flags & SLAB_STORE_USER)
				stack_depot_request_early_init();
		}
	}

	/*
	 * For backwards compatibility, a single list of flags with list of
	 * slabs means debugging is only changed for those slabs, so the global
	 * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
	 * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
	 * long as there is no option specifying flags without a slab list.
	 */
	if (slab_list_specified) {
		if (!global_slub_debug_changed)
			global_flags = slub_debug;
		slub_debug_string = saved_str;
	}
out:
	slub_debug = global_flags;
	if (slub_debug & SLAB_STORE_USER)
		stack_depot_request_early_init();
	if (slub_debug != 0 || slub_debug_string)
		static_branch_enable(&slub_debug_enabled);
	else
		static_branch_disable(&slub_debug_enabled);
	if ((static_branch_unlikely(&init_on_alloc) ||
	     static_branch_unlikely(&init_on_free)) &&
	    (slub_debug & SLAB_POISON))
		pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
	return 1;
}

__setup("slub_debug", setup_slub_debug);
/*
 * kmem_cache_flags - apply debugging options to the cache
 * @object_size:	the size of an object without meta data
 * @flags:		flags to set
 * @name:		name of the cache
 *
 * Debug option(s) are applied to @flags. In addition to the debug
 * option(s), if a slab name (or multiple) is specified i.e.
 * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
 * then only the select slabs will receive the debug option(s).
 */
slab_flags_t kmem_cache_flags(unsigned int object_size,
	slab_flags_t flags, const char *name)
{
	char *iter;
	size_t len;
	char *next_block;
	slab_flags_t block_flags;
	slab_flags_t slub_debug_local = slub_debug;

	if (flags & SLAB_NO_USER_FLAGS)
		return flags;

	/*
	 * If the slab cache is for debugging (e.g. kmemleak) then
	 * don't store user (stack trace) information by default,
	 * but let the user enable it via the command line below.
	 */
	if (flags & SLAB_NOLEAKTRACE)
		slub_debug_local &= ~SLAB_STORE_USER;

	len = strlen(name);
	next_block = slub_debug_string;
	/* Go through all blocks of debug options, see if any matches our slab's name */
	while (next_block) {
		next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
		if (!iter)
			continue;
		/* Found a block that has a slab list, search it */
		while (*iter) {
			char *end, *glob;
			size_t cmplen;

			end = strchrnul(iter, ',');
			if (next_block && next_block < end)
				end = next_block - 1;

			glob = strnchr(iter, end - iter, '*');
			if (glob)
				cmplen = glob - iter;
			else
				cmplen = max_t(size_t, len, (end - iter));

			if (!strncmp(name, iter, cmplen)) {
				flags |= block_flags;
				return flags;
			}

			if (!*end || *end == ';')
				break;
			iter = end + 1;
		}
	}

	return flags | slub_debug_local;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}

static inline bool alloc_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *object, int orig_size) { return true; }

static inline bool free_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *head, void *tail, int *bulk_cnt,
	unsigned long addr, depot_stack_handle_t handle) { return true; }

static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
			void *object, u8 val) { return 1; }
static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
static inline void set_track(struct kmem_cache *s, void *object,
			     enum track_item alloc, unsigned long addr) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct slab *slab) {}
slab_flags_t kmem_cache_flags(unsigned int object_size,
	slab_flags_t flags, const char *name)
{
	return flags;
}
#define slub_debug 0

#define disable_higher_order_debug 0

static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
							{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
							int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
							int objects) {}

#ifndef CONFIG_SLUB_TINY
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
			       void **freelist, void *nextfree)
{
	return false;
}
#endif
#endif /* CONFIG_SLUB_DEBUG */
/*
 * Hooks for other subsystems that check memory allocations. In a typical
 * production configuration these hooks all should produce no code at all.
 */
static __always_inline bool slab_free_hook(struct kmem_cache *s,
						void *x, bool init)
{
	kmemleak_free_recursive(x, s->flags);
	kmsan_slab_free(s, x);

	debug_check_no_locks_freed(x, s->object_size);

	if (!(s->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(x, s->object_size);

	/* Use KCSAN to help debug racy use-after-free. */
	if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
		__kcsan_check_access(x, s->object_size,
				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);

	/*
	 * As memory initialization might be integrated into KASAN,
	 * kasan_slab_free and initialization memset's must be
	 * kept together to avoid discrepancies in behavior.
	 *
	 * The initialization memset's clear the object and the metadata,
	 * but don't touch the SLAB redzone.
	 */
	if (init) {
		int rsize;

		if (!kasan_has_integrated_init())
			memset(kasan_reset_tag(x), 0, s->object_size);
		rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
		memset((char *)kasan_reset_tag(x) + s->inuse, 0,
		       s->size - s->inuse - rsize);
	}
	/* KASAN might put x into memory quarantine, delaying its reuse. */
	return kasan_slab_free(s, x, init);
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
					   void **head, void **tail,
					   int *cnt)
{
	void *object;
	void *next = *head;
	void *old_tail = *tail ? *tail : *head;

	if (is_kfence_address(next)) {
		slab_free_hook(s, next, false);
		return true;
	}

	/* Head and tail of the reconstructed freelist */
	*head = NULL;
	*tail = NULL;

	do {
		object = next;
		next = get_freepointer(s, object);

		/* If object's reuse doesn't have to be delayed */
		if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
			/* Move object to the new freelist */
			set_freepointer(s, object, *head);
			*head = object;
			if (!*tail)
				*tail = object;
		} else {
			/*
			 * Adjust the reconstructed freelist depth
			 * accordingly if object's reuse is delayed.
			 */
			--(*cnt);
		}
	} while (object != old_tail);

	if (*head == *tail)
		*tail = NULL;

	return *head != NULL;
}
static void *setup_object(struct kmem_cache *s, void *object)
{
	setup_object_debug(s, object);
	object = kasan_init_slab_obj(s, object);
	if (unlikely(s->ctor)) {
		kasan_unpoison_object_data(s, object);
		s->ctor(object);
		kasan_poison_object_data(s, object);
	}
	return object;
}
/*
 * Slab allocation and freeing
 */
static inline struct slab *alloc_slab_page(gfp_t flags, int node,
		struct kmem_cache_order_objects oo)
{
	struct folio *folio;
	struct slab *slab;
	unsigned int order = oo_order(oo);

	if (node == NUMA_NO_NODE)
		folio = (struct folio *)alloc_pages(flags, order);
	else
		folio = (struct folio *)__alloc_pages_node(node, flags, order);

	if (!folio)
		return NULL;

	slab = folio_slab(folio);
	__folio_set_slab(folio);
	/* Make the flag visible before any changes to folio->mapping */
	smp_wmb();
	if (folio_is_pfmemalloc(folio))
		slab_set_pfmemalloc(slab);

	return slab;
}
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Pre-initialize the random sequence cache */
static int init_cache_random_seq(struct kmem_cache *s)
{
	unsigned int count = oo_objects(s->oo);
	int err;

	/* Bailout if already initialised */
	if (s->random_seq)
		return 0;

	err = cache_random_seq_create(s, count, GFP_KERNEL);
	if (err) {
		pr_err("SLUB: Unable to initialize free list for %s\n",
			s->name);
		return err;
	}

	/* Transform to an offset on the set of pages */
	if (s->random_seq) {
		unsigned int i;

		for (i = 0; i < count; i++)
			s->random_seq[i] *= s->size;
	}
	return 0;
}

/* Initialize each random sequence freelist per cache */
static void __init init_freelist_randomization(void)
{
	struct kmem_cache *s;

	mutex_lock(&slab_mutex);

	list_for_each_entry(s, &slab_caches, list)
		init_cache_random_seq(s);

	mutex_unlock(&slab_mutex);
}
/* Get the next entry on the pre-computed freelist randomized */
static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab,
				unsigned long *pos, void *start,
				unsigned long page_limit,
				unsigned long freelist_count)
{
	unsigned int idx;

	/*
	 * If the target page allocation failed, the number of objects on the
	 * page might be smaller than the usual size defined by the cache.
	 */
	do {
		idx = s->random_seq[*pos];
		*pos += 1;
		if (*pos >= freelist_count)
			*pos = 0;
	} while (unlikely(idx >= page_limit));

	return (char *)start + idx;
}
/* Shuffle the single linked freelist based on a random pre-computed sequence */
static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
{
	void *start;
	void *cur;
	void *next;
	unsigned long idx, pos, page_limit, freelist_count;

	if (slab->objects < 2 || !s->random_seq)
		return false;

	freelist_count = oo_objects(s->oo);
	pos = get_random_u32_below(freelist_count);

	page_limit = slab->objects * s->size;
	start = fixup_red_left(s, slab_address(slab));

	/* First entry is used as the base of the freelist */
	cur = next_freelist_entry(s, slab, &pos, start, page_limit,
				freelist_count);
	cur = setup_object(s, cur);
	slab->freelist = cur;

	for (idx = 1; idx < slab->objects; idx++) {
		next = next_freelist_entry(s, slab, &pos, start, page_limit,
			freelist_count);
		next = setup_object(s, next);
		set_freepointer(s, cur, next);
		cur = next;
	}
	set_freepointer(s, cur, NULL);

	return true;
}
#else
static inline int init_cache_random_seq(struct kmem_cache *s)
{
	return 0;
}
static inline void init_freelist_randomization(void) { }
static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
{
	return false;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct slab *slab;
	struct kmem_cache_order_objects oo = s->oo;
	gfp_t alloc_gfp;
	void *start, *p, *next;
	int idx;
	bool shuffle;

	flags &= gfp_allowed_mask;

	flags |= s->allocflags;

	/*
	 * Let the initial higher-order allocation fail under memory pressure
	 * so we fall-back to the minimum order allocation.
	 */
	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;

	slab = alloc_slab_page(alloc_gfp, node, oo);
	if (unlikely(!slab)) {
		oo = s->min;
		alloc_gfp = flags;
		/*
		 * Allocation may have failed due to fragmentation.
		 * Try a lower order alloc if possible
		 */
		slab = alloc_slab_page(alloc_gfp, node, oo);
		if (unlikely(!slab))
			return NULL;
		stat(s, ORDER_FALLBACK);
	}

	slab->objects = oo_objects(oo);
	slab->inuse = 0;
	slab->frozen = 0;

	account_slab(slab, oo_order(oo), s, flags);

	slab->slab_cache = s;

	kasan_poison_slab(slab);

	start = slab_address(slab);

	setup_slab_debug(s, slab, start);

	shuffle = shuffle_freelist(s, slab);

	if (!shuffle) {
		start = fixup_red_left(s, start);
		start = setup_object(s, start);
		slab->freelist = start;
		for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
			next = p + s->size;
			next = setup_object(s, next);
			set_freepointer(s, p, next);
			p = next;
		}
		set_freepointer(s, p, NULL);
	}

	return slab;
}
static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	if (unlikely(flags & GFP_SLAB_BUG_MASK))
		flags = kmalloc_fix_flags(flags);

	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));

	return allocate_slab(s,
		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}
static void __free_slab(struct kmem_cache *s, struct slab *slab)
{
	struct folio *folio = slab_folio(slab);
	int order = folio_order(folio);
	int pages = 1 << order;

	__slab_clear_pfmemalloc(slab);
	folio->mapping = NULL;
	/* Make the mapping reset visible before clearing the flag */
	smp_wmb();
	__folio_clear_slab(folio);
	mm_account_reclaimed_pages(pages);
	unaccount_slab(slab, order, s);
	__free_pages(&folio->page, order);
}

static void rcu_free_slab(struct rcu_head *h)
{
	struct slab *slab = container_of(h, struct slab, rcu_head);

	__free_slab(slab->slab_cache, slab);
}
static void free_slab(struct kmem_cache *s, struct slab *slab)
{
	if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
		void *p;

		slab_pad_check(s, slab);
		for_each_object(p, s, slab_address(slab), slab->objects)
			check_object(s, slab, p, SLUB_RED_INACTIVE);
	}

	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
		call_rcu(&slab->rcu_head, rcu_free_slab);
	else
		__free_slab(s, slab);
}

static void discard_slab(struct kmem_cache *s, struct slab *slab)
{
	dec_slabs_node(s, slab_nid(slab), slab->objects);
	free_slab(s, slab);
}
/*
 * Management of partially allocated slabs.
 */
static inline void
__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
{
	n->nr_partial++;
	if (tail == DEACTIVATE_TO_TAIL)
		list_add_tail(&slab->slab_list, &n->partial);
	else
		list_add(&slab->slab_list, &n->partial);
}

static inline void add_partial(struct kmem_cache_node *n,
				struct slab *slab, int tail)
{
	lockdep_assert_held(&n->list_lock);
	__add_partial(n, slab, tail);
}

static inline void remove_partial(struct kmem_cache_node *n,
					struct slab *slab)
{
	lockdep_assert_held(&n->list_lock);
	list_del(&slab->slab_list);
	n->nr_partial--;
}
/*
 * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
 * slab from the n->partial list. Remove only a single object from the slab, do
 * the alloc_debug_processing() checks and leave the slab on the list, or move
 * it to full list if it was the last free object.
 */
static void *alloc_single_from_partial(struct kmem_cache *s,
		struct kmem_cache_node *n, struct slab *slab, int orig_size)
{
	void *object;

	lockdep_assert_held(&n->list_lock);

	object = slab->freelist;
	slab->freelist = get_freepointer(s, object);
	slab->inuse++;

	if (!alloc_debug_processing(s, slab, object, orig_size)) {
		remove_partial(n, slab);
		return NULL;
	}

	if (slab->inuse == slab->objects) {
		remove_partial(n, slab);
		add_full(s, n, slab);
	}

	return object;
}
/*
 * Called only for kmem_cache_debug() caches to allocate from a freshly
 * allocated slab. Allocate a single object instead of whole freelist
 * and put the slab to the partial (or full) list.
 */
static void *alloc_single_from_new_slab(struct kmem_cache *s,
					struct slab *slab, int orig_size)
{
	int nid = slab_nid(slab);
	struct kmem_cache_node *n = get_node(s, nid);
	unsigned long flags;
	void *object;

	object = slab->freelist;
	slab->freelist = get_freepointer(s, object);
	slab->inuse = 1;

	if (!alloc_debug_processing(s, slab, object, orig_size))
		/*
		 * It's not really expected that this would fail on a
		 * freshly allocated slab, but a concurrent memory
		 * corruption in theory could cause that.
		 */
		return NULL;

	spin_lock_irqsave(&n->list_lock, flags);

	if (slab->inuse == slab->objects)
		add_full(s, n, slab);
	else
		add_partial(n, slab, DEACTIVATE_TO_HEAD);

	inc_slabs_node(s, nid, slab->objects);
	spin_unlock_irqrestore(&n->list_lock, flags);

	return object;
}
/*
 * Remove slab from the partial list, freeze it and
 * return the pointer to the freelist.
 *
 * Returns a list of objects or NULL if it fails.
 */
static inline void *acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct slab *slab,
		int mode)
{
	void *freelist;
	unsigned long counters;
	struct slab new;

	lockdep_assert_held(&n->list_lock);

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	freelist = slab->freelist;
	counters = slab->counters;
	new.counters = counters;
	if (mode) {
		new.inuse = slab->objects;
		new.freelist = NULL;
	} else {
		new.freelist = freelist;
	}

	VM_BUG_ON(new.frozen);
	new.frozen = 1;

	if (!__slab_update_freelist(s, slab,
			freelist, counters,
			new.freelist, new.counters,
			"acquire_slab"))
		return NULL;

	remove_partial(n, slab);
	WARN_ON(!freelist);
	return freelist;
}
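/*
 * A minimal sketch of the transition acquire_slab() makes with the single
 * cmpxchg above, for the mode != 0 (allocation) case:
 *
 *	before: slab->freelist = A -> B -> C, inuse = k,             frozen = 0
 *	after:  slab->freelist = NULL,        inuse = slab->objects, frozen = 1
 *
 * The detached chain A -> B -> C becomes the per cpu freelist, while
 * concurrent remote frees continue to target slab->freelist.
 */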
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
#else
static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
				   int drain) { }
#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
/*
 * Try to allocate a partial slab from a specific node.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
			      struct partial_context *pc)
{
	struct slab *slab, *slab2;
	void *object = NULL;
	unsigned long flags;
	unsigned int partial_slabs = 0;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partial()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
		void *t;

		if (!pfmemalloc_match(slab, pc->flags))
			continue;

		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
			object = alloc_single_from_partial(s, n, slab,
							pc->orig_size);
			if (object)
				break;
			continue;
		}

		t = acquire_slab(s, n, slab, object == NULL);
		if (!t)
			break;

		if (!object) {
			*pc->slab = slab;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
			put_cpu_partial(s, slab, 0);
			stat(s, CPU_PARTIAL_NODE);
			partial_slabs++;
		}
#ifdef CONFIG_SLUB_CPU_PARTIAL
		if (!kmem_cache_has_cpu_partial(s)
			|| partial_slabs > s->cpu_partial_slabs / 2)
			break;
#else
		break;
#endif

	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return object;
}
/*
 * Get a slab from somewhere. Search in increasing NUMA distances.
 */
static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
	void *object;
	unsigned int cpuset_mems_cookie;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
	 * (which makes defrag_ratio = 1000) then every (well almost)
	 * allocation will first attempt to defrag slab caches on other nodes.
	 * This means scanning over all nodes to look for partial slabs which
	 * may be expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			if (n && cpuset_zone_allowed(zone, pc->flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, pc);
				if (object) {
					/*
					 * Don't check read_mems_allowed_retry()
					 * here - if mems_allowed was updated in
					 * parallel, that was a harmless race
					 * between allocation and the cpuset
					 * update
					 */
					return object;
				}
			}
		}
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif	/* CONFIG_NUMA */
	return NULL;
}

/*
 * Get a partial slab, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
	void *object;
	int searchnode = node;

	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();

	object = get_partial_node(s, get_node(s, searchnode), pc);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, pc);
}
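/*
 * Allocation therefore falls back in this order: the requested node (or
 * the local node for NUMA_NO_NODE), then increasingly distant nodes via
 * get_any_partial(), and only then do the callers allocate a fresh slab
 * from the page allocator.
 */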
#ifndef CONFIG_SLUB_TINY

#ifdef CONFIG_PREEMPTION
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by CONFIG_NR_CPUS.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported therefore also no need to check for
 * different cpus.
 */
#define TID_STEP 1
#endif /* CONFIG_PREEMPTION */

static inline unsigned long next_tid(unsigned long tid)
{
	return tid + TID_STEP;
}
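/*
 * Example of the tid encoding, assuming CONFIG_NR_CPUS = 64 and thus
 * TID_STEP = 64 on a preemptible kernel: cpu 3 uses tids 3, 67, 131, ...
 * so tid % TID_STEP recovers the cpu and tid / TID_STEP counts the
 * operations, which tid_to_cpu() and tid_to_event() below rely on.
 */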
#ifdef SLUB_DEBUG_CMPXCHG
static inline unsigned int tid_to_cpu(unsigned long tid)
{
	return tid % TID_STEP;
}

static inline unsigned long tid_to_event(unsigned long tid)
{
	return tid / TID_STEP;
}
#endif

static inline unsigned int init_tid(int cpu)
{
	return cpu;
}

static inline void note_cmpxchg_failure(const char *n,
		const struct kmem_cache *s, unsigned long tid)
{
#ifdef SLUB_DEBUG_CMPXCHG
	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);

	pr_info("%s %s: cmpxchg redo ", n, s->name);

#ifdef CONFIG_PREEMPTION
	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
		pr_warn("due to cpu change %d -> %d\n",
			tid_to_cpu(tid), tid_to_cpu(actual_tid));
	else
#endif
	if (tid_to_event(tid) != tid_to_event(actual_tid))
		pr_warn("due to cpu running other code. Event %ld->%ld\n",
			tid_to_event(tid), tid_to_event(actual_tid));
	else
		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
			actual_tid, tid, next_tid(tid));
#endif
	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}

static void init_kmem_cache_cpus(struct kmem_cache *s)
{
	int cpu;
	struct kmem_cache_cpu *c;

	for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(s->cpu_slab, cpu);
		local_lock_init(&c->lock);
		c->tid = init_tid(cpu);
	}
}
/*
 * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
 * unfreezes the slab and puts it on the proper list.
 * Assumes the slab has been already safely taken away from kmem_cache_cpu
 * by the caller.
 */
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
			    void *freelist)
{
	enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST };
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	int free_delta = 0;
	enum slab_modes mode = M_NONE;
	void *nextfree, *freelist_iter, *freelist_tail;
	int tail = DEACTIVATE_TO_HEAD;
	unsigned long flags = 0;
	struct slab new;
	struct slab old;

	if (slab->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Stage one: Count the objects on cpu's freelist as free_delta and
	 * remember the last object in freelist_tail for later splicing.
	 */
	freelist_tail = NULL;
	freelist_iter = freelist;
	while (freelist_iter) {
		nextfree = get_freepointer(s, freelist_iter);

		/*
		 * If 'nextfree' is invalid, it is possible that the object at
		 * 'freelist_iter' is already corrupted. So isolate all objects
		 * starting at 'freelist_iter' by skipping them.
		 */
		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
			break;

		freelist_tail = freelist_iter;
		free_delta++;

		freelist_iter = nextfree;
	}

	/*
	 * Stage two: Unfreeze the slab while splicing the per-cpu
	 * freelist to the head of slab's freelist.
	 *
	 * Ensure that the slab is unfrozen while the list presence
	 * reflects the actual number of objects during unfreeze.
	 *
	 * We first perform cmpxchg holding lock and insert to list
	 * when it succeeds. If there is a mismatch then the slab is not
	 * unfrozen and the number of objects in the slab may have changed.
	 * Then release lock and retry cmpxchg again.
	 */
redo:

	old.freelist = READ_ONCE(slab->freelist);
	old.counters = READ_ONCE(slab->counters);
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist_tail) {
		new.inuse -= free_delta;
		set_freepointer(s, freelist_tail, old.freelist);
		new.freelist = freelist;
	} else {
		new.freelist = old.freelist;
	}

	new.frozen = 0;

	if (!new.inuse && n->nr_partial >= s->min_partial) {
		mode = M_FREE;
	} else if (new.freelist) {
		mode = M_PARTIAL;
		/*
		 * Taking the spinlock removes the possibility that
		 * acquire_slab() will see a slab that is frozen
		 */
		spin_lock_irqsave(&n->list_lock, flags);
	} else {
		mode = M_FULL_NOLIST;
	}


	if (!slab_update_freelist(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab")) {
		if (mode == M_PARTIAL)
			spin_unlock_irqrestore(&n->list_lock, flags);
		goto redo;
	}


	if (mode == M_PARTIAL) {
		add_partial(n, slab, tail);
		spin_unlock_irqrestore(&n->list_lock, flags);
		stat(s, tail);
	} else if (mode == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	} else if (mode == M_FULL_NOLIST) {
		stat(s, DEACTIVATE_FULL);
	}
}
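/*
 * Summary of the target states computed above:
 *
 *	M_FREE:        slab became empty and the node already holds
 *	               enough partial slabs -> give the pages back.
 *	M_PARTIAL:     free objects remain -> put the slab on n->partial,
 *	               taking list_lock before the cmpxchg so a concurrent
 *	               acquire_slab() never sees it both unfrozen and
 *	               absent from the list.
 *	M_FULL_NOLIST: no free objects -> no list placement needed (the
 *	               full list is only maintained for debug caches).
 */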
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
{
	struct kmem_cache_node *n = NULL, *n2 = NULL;
	struct slab *slab, *slab_to_discard = NULL;
	unsigned long flags = 0;

	while (partial_slab) {
		struct slab new;
		struct slab old;

		slab = partial_slab;
		partial_slab = slab->next;

		n2 = get_node(s, slab_nid(slab));
		if (n != n2) {
			if (n)
				spin_unlock_irqrestore(&n->list_lock, flags);

			n = n2;
			spin_lock_irqsave(&n->list_lock, flags);
		}

		do {

			old.freelist = slab->freelist;
			old.counters = slab->counters;
			VM_BUG_ON(!old.frozen);

			new.counters = old.counters;
			new.freelist = old.freelist;

			new.frozen = 0;

		} while (!__slab_update_freelist(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"));

		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
			slab->next = slab_to_discard;
			slab_to_discard = slab;
		} else {
			add_partial(n, slab, DEACTIVATE_TO_TAIL);
			stat(s, FREE_ADD_PARTIAL);
		}
	}

	if (n)
		spin_unlock_irqrestore(&n->list_lock, flags);

	while (slab_to_discard) {
		slab = slab_to_discard;
		slab_to_discard = slab_to_discard->next;

		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	}
}

/*
 * Unfreeze all the cpu partial slabs.
 */
static void unfreeze_partials(struct kmem_cache *s)
{
	struct slab *partial_slab;
	unsigned long flags;

	local_lock_irqsave(&s->cpu_slab->lock, flags);
	partial_slab = this_cpu_read(s->cpu_slab->partial);
	this_cpu_write(s->cpu_slab->partial, NULL);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);

	if (partial_slab)
		__unfreeze_partials(s, partial_slab);
}

static void unfreeze_partials_cpu(struct kmem_cache *s,
				  struct kmem_cache_cpu *c)
{
	struct slab *partial_slab;

	partial_slab = slub_percpu_partial(c);
	c->partial = NULL;

	if (partial_slab)
		__unfreeze_partials(s, partial_slab);
}
/*
 * Put a slab that was just frozen (in __slab_free|get_partial_node) into a
 * partial slab slot if available.
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 */
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
{
	struct slab *oldslab;
	struct slab *slab_to_unfreeze = NULL;
	unsigned long flags;
	int slabs = 0;

	local_lock_irqsave(&s->cpu_slab->lock, flags);

	oldslab = this_cpu_read(s->cpu_slab->partial);

	if (oldslab) {
		if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
			/*
			 * Partial array is full. Move the existing set to the
			 * per node partial list. Postpone the actual unfreezing
			 * outside of the critical section.
			 */
			slab_to_unfreeze = oldslab;
			oldslab = NULL;
		} else {
			slabs = oldslab->slabs;
		}
	}

	slabs++;

	slab->slabs = slabs;
	slab->next = oldslab;

	this_cpu_write(s->cpu_slab->partial, slab);

	local_unlock_irqrestore(&s->cpu_slab->lock, flags);

	if (slab_to_unfreeze) {
		__unfreeze_partials(s, slab_to_unfreeze);
		stat(s, CPU_PARTIAL_DRAIN);
	}
}
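/*
 * The per cpu partial list is a singly linked stack chained through
 * slab->next, with the head's slab->slabs caching the length. A sketch
 * of the push performed above while cpu_slab->lock is held:
 *
 *	slab->slabs = (oldslab ? oldslab->slabs : 0) + 1;
 *	slab->next  = oldslab;
 *	this_cpu_write(s->cpu_slab->partial, slab);
 */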
#else	/* CONFIG_SLUB_CPU_PARTIAL */

static inline void unfreeze_partials(struct kmem_cache *s) { }
static inline void unfreeze_partials_cpu(struct kmem_cache *s,
				  struct kmem_cache_cpu *c) { }

#endif	/* CONFIG_SLUB_CPU_PARTIAL */

static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	unsigned long flags;
	struct slab *slab;
	void *freelist;

	local_lock_irqsave(&s->cpu_slab->lock, flags);

	slab = c->slab;
	freelist = c->freelist;

	c->slab = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);

	local_unlock_irqrestore(&s->cpu_slab->lock, flags);

	if (slab) {
		deactivate_slab(s, slab, freelist);
		stat(s, CPUSLAB_FLUSH);
	}
}

static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
	void *freelist = c->freelist;
	struct slab *slab = c->slab;

	c->slab = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);

	if (slab) {
		deactivate_slab(s, slab, freelist);
		stat(s, CPUSLAB_FLUSH);
	}

	unfreeze_partials_cpu(s, c);
}
struct slub_flush_work {
	struct work_struct work;
	struct kmem_cache *s;
	bool skip;
};

/*
 * Flush cpu slab.
 *
 * Called from CPU work handler with migration disabled.
 */
static void flush_cpu_slab(struct work_struct *w)
{
	struct kmem_cache *s;
	struct kmem_cache_cpu *c;
	struct slub_flush_work *sfw;

	sfw = container_of(w, struct slub_flush_work, work);

	s = sfw->s;
	c = this_cpu_ptr(s->cpu_slab);

	if (c->slab)
		flush_slab(s, c);

	unfreeze_partials(s);
}

static bool has_cpu_slab(int cpu, struct kmem_cache *s)
{
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

	return c->slab || slub_percpu_partial(c);
}

static DEFINE_MUTEX(flush_lock);
static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);

static void flush_all_cpus_locked(struct kmem_cache *s)
{
	struct slub_flush_work *sfw;
	unsigned int cpu;

	lockdep_assert_cpus_held();
	mutex_lock(&flush_lock);

	for_each_online_cpu(cpu) {
		sfw = &per_cpu(slub_flush, cpu);
		if (!has_cpu_slab(cpu, s)) {
			sfw->skip = true;
			continue;
		}
		INIT_WORK(&sfw->work, flush_cpu_slab);
		sfw->skip = false;
		sfw->s = s;
		queue_work_on(cpu, flushwq, &sfw->work);
	}

	for_each_online_cpu(cpu) {
		sfw = &per_cpu(slub_flush, cpu);
		if (sfw->skip)
			continue;
		flush_work(&sfw->work);
	}

	mutex_unlock(&flush_lock);
}

static void flush_all(struct kmem_cache *s)
{
	cpus_read_lock();
	flush_all_cpus_locked(s);
	cpus_read_unlock();
}
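/*
 * flush_all_cpus_locked() is for callers that already hold
 * cpus_read_lock() (e.g. the cache shutdown path); flush_all() wraps it
 * for everyone else.
 */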
/*
 * Use the cpu notifier to ensure that the cpu slabs are flushed when
 * necessary.
 */
static int slub_cpu_dead(unsigned int cpu)
{
	struct kmem_cache *s;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list)
		__flush_cpu_slab(s, cpu);
	mutex_unlock(&slab_mutex);
	return 0;
}

#else /* CONFIG_SLUB_TINY */
static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
static inline void flush_all(struct kmem_cache *s) { }
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
#endif /* CONFIG_SLUB_TINY */

/*
 * Check if the objects in a per cpu structure fit numa
 * locality expectations.
 */
static inline int node_match(struct slab *slab, int node)
{
#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE && slab_nid(slab) != node)
		return 0;
#endif
	return 1;
}
#ifdef CONFIG_SLUB_DEBUG
static int count_free(struct slab *slab)
{
	return slab->objects - slab->inuse;
}

static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->total_objects);
}

/* Supports checking bulk free of a constructed freelist */
static inline bool free_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *head, void *tail, int *bulk_cnt,
	unsigned long addr, depot_stack_handle_t handle)
{
	bool checks_ok = false;
	void *object = head;
	int cnt = 0;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!check_slab(s, slab))
			goto out;
	}

	if (slab->inuse < *bulk_cnt) {
		slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
			 slab->inuse, *bulk_cnt);
		goto out;
	}

next_object:

	if (++cnt > *bulk_cnt)
		goto out_cnt;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!free_consistency_checks(s, slab, object, addr))
			goto out;
	}

	if (s->flags & SLAB_STORE_USER)
		set_track_update(s, object, TRACK_FREE, addr, handle);
	trace(s, slab, object, 0);
	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
	init_object(s, object, SLUB_RED_INACTIVE);

	/* Reached end of constructed freelist yet? */
	if (object != tail) {
		object = get_freepointer(s, object);
		goto next_object;
	}
	checks_ok = true;

out_cnt:
	if (cnt != *bulk_cnt) {
		slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
			 *bulk_cnt, cnt);
		*bulk_cnt = cnt;
	}

out:

	if (!checks_ok)
		slab_fix(s, "Object at 0x%p not freed", object);

	return checks_ok;
}
#endif /* CONFIG_SLUB_DEBUG */
#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
static unsigned long count_partial(struct kmem_cache_node *n,
					int (*get_count)(struct slab *))
{
	unsigned long flags;
	unsigned long x = 0;
	struct slab *slab;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry(slab, &n->partial, slab_list)
		x += get_count(slab);
	spin_unlock_irqrestore(&n->list_lock, flags);
	return x;
}
#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
#ifdef CONFIG_SLUB_DEBUG
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	int node;
	struct kmem_cache_node *n;

	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
		return;

	pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
		nid, gfpflags, &gfpflags);
	pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
		s->name, s->object_size, s->size, oo_order(s->oo),
		oo_order(s->min));

	if (oo_order(s->min) > get_order(s->object_size))
		pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
			s->name);

	for_each_kmem_cache_node(s, node, n) {
		unsigned long nr_slabs;
		unsigned long nr_objs;
		unsigned long nr_free;

		nr_free  = count_partial(n, count_free);
		nr_slabs = node_nr_slabs(n);
		nr_objs  = node_nr_objs(n);

		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
			node, nr_slabs, nr_objs, nr_free);
	}
}
#else /* CONFIG_SLUB_DEBUG */
static inline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
#endif

static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
{
	if (unlikely(slab_test_pfmemalloc(slab)))
		return gfp_pfmemalloc_allowed(gfpflags);

	return true;
}
#ifndef CONFIG_SLUB_TINY
static inline bool
__update_cpu_freelist_fast(struct kmem_cache *s,
			   void *freelist_old, void *freelist_new,
			   unsigned long tid)
{
	freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
	freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };

	return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
					     &old.full, new.full);
}
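/*
 * Conceptually, the single double-width cmpxchg above performs the
 * following, atomically with respect to this cpu (a sketch, not the
 * emitted code):
 *
 *	if (c->freelist == freelist_old && c->tid == tid) {
 *		c->freelist = freelist_new;
 *		c->tid = next_tid(tid);
 *		return true;
 *	}
 *	return false;	/- cpu migration or interleaved op: caller retries -/
 *
 * Pairing the freelist head with the tid is what defeats the classic
 * ABA problem on the lockless fast paths.
 */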
/*
 * Check the slab->freelist and either transfer the freelist to the
 * per cpu freelist or deactivate the slab.
 *
 * The slab is still frozen if the return value is not NULL.
 *
 * If this function returns NULL then the slab has been unfrozen.
 */
static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
	struct slab new;
	unsigned long counters;
	void *freelist;

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	do {
		freelist = slab->freelist;
		counters = slab->counters;

		new.counters = counters;
		VM_BUG_ON(!new.frozen);

		new.inuse = slab->objects;
		new.frozen = freelist != NULL;

	} while (!__slab_update_freelist(s, slab,
		freelist, counters,
		NULL, new.counters,
		"get_freelist"));

	return freelist;
}
/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
 *
 * Version of __slab_alloc to use when we know that preemption is
 * already disabled (which is the case for bulk allocation).
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *freelist;
	struct slab *slab;
	unsigned long flags;
	struct partial_context pc;

	stat(s, ALLOC_SLOWPATH);

reread_slab:

	slab = READ_ONCE(c->slab);
	if (!slab) {
		/*
		 * if the node is not online or has no normal memory, just
		 * ignore the node constraint
		 */
		if (unlikely(node != NUMA_NO_NODE &&
			     !node_isset(node, slab_nodes)))
			node = NUMA_NO_NODE;
		goto new_slab;
	}
redo:

	if (unlikely(!node_match(slab, node))) {
		/*
		 * same as above but node_match() being false already
		 * implies node != NUMA_NO_NODE
		 */
		if (!node_isset(node, slab_nodes)) {
			node = NUMA_NO_NODE;
		} else {
			stat(s, ALLOC_NODE_MISMATCH);
			goto deactivate_slab;
		}
	}

	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
		goto deactivate_slab;

	/* must check again c->slab in case we got preempted and it changed */
	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (unlikely(slab != c->slab)) {
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		goto reread_slab;
	}
	freelist = c->freelist;
	if (freelist)
		goto load_freelist;

	freelist = get_freelist(s, slab);

	if (!freelist) {
		c->slab = NULL;
		c->tid = next_tid(c->tid);
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}

	stat(s, ALLOC_REFILL);

load_freelist:

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	/*
	 * freelist is pointing to the list of objects to be used.
	 * slab is pointing to the slab from which the objects are obtained.
	 * That slab must be frozen for per cpu allocations to work.
	 */
	VM_BUG_ON(!c->slab->frozen);
	c->freelist = get_freepointer(s, freelist);
	c->tid = next_tid(c->tid);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
	return freelist;

deactivate_slab:

	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (slab != c->slab) {
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		goto reread_slab;
	}
	freelist = c->freelist;
	c->slab = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
	deactivate_slab(s, slab, freelist);

new_slab:

	if (slub_percpu_partial(c)) {
		local_lock_irqsave(&s->cpu_slab->lock, flags);
		if (unlikely(c->slab)) {
			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
			goto reread_slab;
		}
		if (unlikely(!slub_percpu_partial(c))) {
			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
			/* we were preempted and partial list got empty */
			goto new_objects;
		}

		slab = c->slab = slub_percpu_partial(c);
		slub_set_percpu_partial(c, slab);
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		stat(s, CPU_PARTIAL_ALLOC);
		goto redo;
	}

new_objects:

	pc.flags = gfpflags;
	pc.slab = &slab;
	pc.orig_size = orig_size;
	freelist = get_partial(s, node, &pc);
	if (freelist)
		goto check_new_slab;

	slub_put_cpu_ptr(s->cpu_slab);
	slab = new_slab(s, gfpflags, node);
	c = slub_get_cpu_ptr(s->cpu_slab);

	if (unlikely(!slab)) {
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	stat(s, ALLOC_SLAB);

	if (kmem_cache_debug(s)) {
		freelist = alloc_single_from_new_slab(s, slab, orig_size);

		if (unlikely(!freelist))
			goto new_objects;

		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr);

		return freelist;
	}

	/*
	 * No other reference to the slab yet so we can
	 * muck around with it freely without cmpxchg
	 */
	freelist = slab->freelist;
	slab->freelist = NULL;
	slab->inuse = slab->objects;
	slab->frozen = 1;

	inc_slabs_node(s, slab_nid(slab), slab->objects);

check_new_slab:

	if (kmem_cache_debug(s)) {
		/*
		 * For debug caches here we had to go through
		 * alloc_single_from_partial() so just store the tracking info
		 * and return the object
		 */
		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr);

		return freelist;
	}

	if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
		/*
		 * For !pfmemalloc_match() case we don't load freelist so that
		 * we don't make further mismatched allocations easier.
		 */
		deactivate_slab(s, slab, get_freepointer(s, freelist));
		return freelist;
	}

retry_load_slab:

	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (unlikely(c->slab)) {
		void *flush_freelist = c->freelist;
		struct slab *flush_slab = c->slab;

		c->slab = NULL;
		c->freelist = NULL;
		c->tid = next_tid(c->tid);

		local_unlock_irqrestore(&s->cpu_slab->lock, flags);

		deactivate_slab(s, flush_slab, flush_freelist);

		stat(s, CPUSLAB_FLUSH);

		goto retry_load_slab;
	}
	c->slab = slab;

	goto load_freelist;
}
/*
 * A wrapper for ___slab_alloc() for contexts where preemption is not yet
 * disabled. Compensates for possible cpu changes by refetching the per cpu area
 * pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *p;

#ifdef CONFIG_PREEMPT_COUNT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling preemption. Need to reload cpu area
	 * pointer.
	 */
	c = slub_get_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
	slub_put_cpu_ptr(s->cpu_slab);
#endif
	return p;
}
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	struct kmem_cache_cpu *c;
	struct slab *slab;
	unsigned long tid;
	void *object;

redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
	 *
	 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
	 * same cpu. We read first the kmem_cache_cpu pointer and use it to read
	 * the tid. If we are preempted and switched to another cpu between the
	 * two reads, it's OK as the two are still associated with the same cpu
	 * and cmpxchg later will validate the cpu.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/*
	 * Irqless object alloc/free algorithm used here depends on sequence
	 * of fetching cpu_slab's data. tid should be fetched before anything
	 * on c to guarantee that object and slab associated with previous tid
	 * won't be used with current tid. If we fetch tid first, object and
	 * slab could be one associated with next tid and our alloc/free
	 * request will be failed. In this case, we will retry. So, no problem.
	 */
	barrier();

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */

	object = c->freelist;
	slab = c->slab;

	if (!USE_LOCKLESS_FAST_PATH() ||
	    unlikely(!object || !slab || !node_match(slab, node))) {
		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
	} else {
		void *next_object = get_freepointer_safe(s, object);

		/*
		 * The cmpxchg will only match if there was no additional
		 * operation and if we are on the right processor.
		 *
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
		 */
		if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}

	return object;
}
#else /* CONFIG_SLUB_TINY */
static void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	struct partial_context pc;
	struct slab *slab;
	void *object;

	pc.flags = gfpflags;
	pc.slab = &slab;
	pc.orig_size = orig_size;
	object = get_partial(s, node, &pc);

	if (object)
		return object;

	slab = new_slab(s, gfpflags, node);
	if (unlikely(!slab)) {
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	object = alloc_single_from_new_slab(s, slab, orig_size);

	return object;
}
#endif /* CONFIG_SLUB_TINY */

/*
 * If the object has been wiped upon free, make sure it's fully initialized by
 * zeroing out freelist pointer.
 */
static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
						   void *obj)
{
	if (unlikely(slab_want_init_on_free(s)) && obj)
		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
			0, sizeof(void *));
}
/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	void *object;
	struct obj_cgroup *objcg = NULL;
	bool init = false;

	s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
	if (!s)
		return NULL;

	object = kfence_alloc(s, orig_size, gfpflags);
	if (unlikely(object))
		goto out;

	object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);

	maybe_wipe_obj_freeptr(s, object);
	init = slab_want_init_on_alloc(gfpflags, s);

out:
	/*
	 * When init equals 'true', like for kzalloc() family, only
	 * @orig_size bytes might be zeroed instead of s->object_size
	 */
	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);

	return object;
}

static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
		gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
	return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size);
}
static __fastpath_inline
void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
			     gfp_t gfpflags)
{
	void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);

	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);

	return ret;
}

void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
	return __kmem_cache_alloc_lru(s, NULL, gfpflags);
}
EXPORT_SYMBOL(kmem_cache_alloc);

void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
			   gfp_t gfpflags)
{
	return __kmem_cache_alloc_lru(s, lru, gfpflags);
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);

void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
			      int node, size_t orig_size,
			      unsigned long caller)
{
	return slab_alloc_node(s, NULL, gfpflags, node,
			       caller, orig_size);
}

void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);

	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);

	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
static noinline void free_to_partial_list(
	struct kmem_cache *s, struct slab *slab,
	void *head, void *tail, int bulk_cnt,
	unsigned long addr)
{
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	struct slab *slab_free = NULL;
	int cnt = bulk_cnt;
	unsigned long flags;
	depot_stack_handle_t handle = 0;

	if (s->flags & SLAB_STORE_USER)
		handle = set_track_prepare();

	spin_lock_irqsave(&n->list_lock, flags);

	if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
		void *prior = slab->freelist;

		/* Perform the actual freeing while we still hold the locks */
		slab->inuse -= cnt;
		set_freepointer(s, tail, prior);
		slab->freelist = head;

		/*
		 * If the slab is empty, and node's partial list is full,
		 * it should be discarded anyway no matter it's on full or
		 * partial list.
		 */
		if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
			slab_free = slab;

		if (!prior) {
			/* was on full list */
			remove_full(s, n, slab);
			if (!slab_free) {
				add_partial(n, slab, DEACTIVATE_TO_TAIL);
				stat(s, FREE_ADD_PARTIAL);
			}
		} else if (slab_free) {
			remove_partial(n, slab);
			stat(s, FREE_REMOVE_PARTIAL);
		}
	}

	if (slab_free) {
		/*
		 * Update the counters while still holding n->list_lock to
		 * prevent spurious validation warnings
		 */
		dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);

	if (slab_free) {
		stat(s, FREE_SLAB);
		free_slab(s, slab_free);
	}
}
/*
 * Slow path handling. This may still be called frequently since objects
 * have a longer lifetime than the cpu slabs in most processing loads.
 *
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial slab
 * handling required then we can return immediately.
 */
static void __slab_free(struct kmem_cache *s, struct slab *slab,
			void *head, void *tail, int cnt,
			unsigned long addr)

{
	void *prior;
	int was_frozen;
	struct slab new;
	unsigned long counters;
	struct kmem_cache_node *n = NULL;
	unsigned long flags;

	stat(s, FREE_SLOWPATH);

	if (kfence_free(head))
		return;

	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
		free_to_partial_list(s, slab, head, tail, cnt, addr);
		return;
	}

	do {
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
		prior = slab->freelist;
		counters = slab->counters;
		set_freepointer(s, tail, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse -= cnt;
		if ((!new.inuse || !prior) && !was_frozen) {

			if (kmem_cache_has_cpu_partial(s) && !prior) {

				/*
				 * Slab was on no list before and will be
				 * partially empty
				 * We can defer the list move and instead
				 * freeze it.
				 */
				new.frozen = 1;

			} else { /* Needs to be taken off a list */

				n = get_node(s, slab_nid(slab));
				/*
				 * Speculatively acquire the list_lock.
				 * If the cmpxchg does not succeed then we may
				 * drop the list_lock without any processing.
				 *
				 * Otherwise the list_lock will synchronize with
				 * other processors updating the list of slabs.
				 */
				spin_lock_irqsave(&n->list_lock, flags);

			}
		}

	} while (!slab_update_freelist(s, slab,
		prior, counters,
		head, new.counters,
		"__slab_free"));

	if (likely(!n)) {

		if (likely(was_frozen)) {
			/*
			 * The list lock was not taken therefore no list
			 * activity can be necessary.
			 */
			stat(s, FREE_FROZEN);
		} else if (new.frozen) {
			/*
			 * If we just froze the slab then put it onto the
			 * per cpu partial list.
			 */
			put_cpu_partial(s, slab, 1);
			stat(s, CPU_PARTIAL_FREE);
		}

		return;
	}

	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;

	/*
	 * Objects left in the slab. If it was not on the partial list before
	 * then add it.
	 */
	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
		remove_full(s, n, slab);
		add_partial(n, slab, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return;

slab_empty:
	if (prior) {
		/*
		 * Slab on the partial list.
		 */
		remove_partial(n, slab);
		stat(s, FREE_REMOVE_PARTIAL);
	} else {
		/* Slab must be on the full list */
		remove_full(s, n, slab);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, slab);
}
#ifndef CONFIG_SLUB_TINY
/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This is typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 *
 * Bulk free of a freelist with several objects (all pointing to the
 * same slab) possible by specifying head and tail ptr, plus objects
 * count (cnt). Bulk free indicated by tail pointer being set.
 */
static __always_inline void do_slab_free(struct kmem_cache *s,
				struct slab *slab, void *head, void *tail,
				int cnt, unsigned long addr)
{
	void *tail_obj = tail ? : head;
	struct kmem_cache_cpu *c;
	unsigned long tid;
	void **freelist;

redo:
	/*
	 * Determine the currently cpus per cpu slab.
	 * The cpu may change afterward. However that does not matter since
	 * data is retrieved via this pointer. If we are on the same cpu
	 * during the cmpxchg then the free will succeed.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/* Same with comment on barrier() in slab_alloc_node() */
	barrier();

	if (unlikely(slab != c->slab)) {
		__slab_free(s, slab, head, tail_obj, cnt, addr);
		return;
	}

	if (USE_LOCKLESS_FAST_PATH()) {
		freelist = READ_ONCE(c->freelist);

		set_freepointer(s, tail_obj, freelist);

		if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
			note_cmpxchg_failure("slab_free", s, tid);
			goto redo;
		}
	} else {
		/* Update the free list under the local lock */
		local_lock(&s->cpu_slab->lock);
		c = this_cpu_ptr(s->cpu_slab);
		if (unlikely(slab != c->slab)) {
			local_unlock(&s->cpu_slab->lock);
			goto redo;
		}
		tid = c->tid;
		freelist = c->freelist;

		set_freepointer(s, tail_obj, freelist);
		c->freelist = head;
		c->tid = next_tid(tid);

		local_unlock(&s->cpu_slab->lock);
	}
	stat(s, FREE_FASTPATH);
}
#else /* CONFIG_SLUB_TINY */
static void do_slab_free(struct kmem_cache *s,
				struct slab *slab, void *head, void *tail,
				int cnt, unsigned long addr)
{
	void *tail_obj = tail ? : head;

	__slab_free(s, slab, head, tail_obj, cnt, addr);
}
#endif /* CONFIG_SLUB_TINY */
static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
					void *head, void *tail, void **p, int cnt,
					unsigned long addr)
{
	memcg_slab_free_hook(s, slab, p, cnt);
	/*
	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
	 * to remove objects, whose reuse must be delayed.
	 */
	if (slab_free_freelist_hook(s, &head, &tail, &cnt))
		do_slab_free(s, slab, head, tail, cnt, addr);
}

#ifdef CONFIG_KASAN_GENERIC
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
	do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr);
}
#endif

void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller)
{
	slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller);
}

void kmem_cache_free(struct kmem_cache *s, void *x)
{
	s = cache_from_obj(s, x);
	if (!s)
		return;
	trace_kmem_cache_free(_RET_IP_, x, s);
	slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
struct detached_freelist {
	struct slab *slab;
	void *tail;
	void *freelist;
	int cnt;
	struct kmem_cache *s;
};

/*
 * This function progressively scans the array with free objects (with
 * a limited look ahead) and extracts objects belonging to the same
 * slab. It builds a detached freelist directly within the given
 * slab/objects. This can happen without any need for
 * synchronization, because the objects are owned by the running process.
 * The freelist is built up as a single linked list in the objects.
 * The idea is that this detached freelist can then be bulk
 * transferred to the real freelist(s), but only requiring a single
 * synchronization primitive. Look ahead in the array is limited due
 * to performance reasons.
 */
static inline
int build_detached_freelist(struct kmem_cache *s, size_t size,
			    void **p, struct detached_freelist *df)
{
	int lookahead = 3;
	void *object;
	struct folio *folio;
	size_t same;

	object = p[--size];
	folio = virt_to_folio(object);
	if (!s) {
		/* Handle kalloc'ed objects */
		if (unlikely(!folio_test_slab(folio))) {
			free_large_kmalloc(folio, object);
			df->slab = NULL;
			return size;
		}
		/* Derive kmem_cache from object */
		df->slab = folio_slab(folio);
		df->s = df->slab->slab_cache;
	} else {
		df->slab = folio_slab(folio);
		df->s = cache_from_obj(s, object); /* Support for memcg */
	}

	/* Start new detached freelist */
	df->tail = object;
	df->freelist = object;
	df->cnt = 1;

	if (is_kfence_address(object))
		return size;

	set_freepointer(df->s, object, NULL);

	same = size;
	while (size) {
		object = p[--size];
		/* df->slab is always set at this point */
		if (df->slab == virt_to_slab(object)) {
			/* Opportunity build freelist */
			set_freepointer(df->s, object, df->freelist);
			df->freelist = object;
			df->cnt++;
			same--;
			if (size != same)
				swap(p[size], p[same]);
			continue;
		}

		/* Limit look ahead search */
		if (!--lookahead)
			break;
	}

	return same;
}
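/*
 * Worked example with hypothetical pointers: for p = {A1, B1, A2, A3},
 * where the A objects share a slab and B1 does not, the scan starts at
 * A3 and builds df->freelist = A1 -> A2 -> A3 (df->cnt = 3, df->tail =
 * A3), compacts the leftover by swapping so that p[0] = B1, and returns
 * 1; the next call then starts over with B1.
 */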
/* Note that interrupts must be enabled when calling this function. */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
	if (!size)
		return;

	do {
		struct detached_freelist df;

		size = build_detached_freelist(s, size, p, &df);
		if (!df.slab)
			continue;

		slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt,
			  _RET_IP_);
	} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
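/*
 * Usage sketch for the bulk API (the cache, gfp flags and count are
 * hypothetical); kmem_cache_alloc_bulk() is defined below and returns
 * the full count on success or 0 on failure:
 *
 *	void *objs[16];
 *
 *	if (kmem_cache_alloc_bulk(cachep, GFP_KERNEL, ARRAY_SIZE(objs), objs))
 *		kmem_cache_free_bulk(cachep, ARRAY_SIZE(objs), objs);
 *
 * Both calls require interrupts to be enabled.
 */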
#ifndef CONFIG_SLUB_TINY
static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
			size_t size, void **p, struct obj_cgroup *objcg)
{
	struct kmem_cache_cpu *c;
	unsigned long irqflags;
	int i;

	/*
	 * Drain objects in the per cpu slab, while disabling local
	 * IRQs, which protects against PREEMPT and interrupts
	 * handlers invoking normal fastpath.
	 */
	c = slub_get_cpu_ptr(s->cpu_slab);
	local_lock_irqsave(&s->cpu_slab->lock, irqflags);

	for (i = 0; i < size; i++) {
		void *object = kfence_alloc(s, s->object_size, flags);

		if (unlikely(object)) {
			p[i] = object;
			continue;
		}

		object = c->freelist;
		if (unlikely(!object)) {
			/*
			 * We may have removed an object from c->freelist using
			 * the fastpath in the previous iteration; in that case,
			 * c->tid has not been bumped yet.
			 * Since ___slab_alloc() may reenable interrupts while
			 * allocating memory, we should bump c->tid now.
			 */
			c->tid = next_tid(c->tid);

			local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);

			/*
			 * Invoking the slow path likely has the side-effect
			 * of re-populating per CPU c->freelist
			 */
			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
					    _RET_IP_, c, s->object_size);
			if (unlikely(!p[i]))
				goto error;

			c = this_cpu_ptr(s->cpu_slab);
			maybe_wipe_obj_freeptr(s, p[i]);

			local_lock_irqsave(&s->cpu_slab->lock, irqflags);

			continue; /* goto for-loop */
		}
		c->freelist = get_freepointer(s, object);
		p[i] = object;
		maybe_wipe_obj_freeptr(s, p[i]);
	}
	c->tid = next_tid(c->tid);
	local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
	slub_put_cpu_ptr(s->cpu_slab);

	return i;

error:
	slub_put_cpu_ptr(s->cpu_slab);
	slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
	kmem_cache_free_bulk(s, i, p);
	return 0;
}
#else /* CONFIG_SLUB_TINY */
static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
			size_t size, void **p, struct obj_cgroup *objcg)
{
	int i;

	for (i = 0; i < size; i++) {
		void *object = kfence_alloc(s, s->object_size, flags);

		if (unlikely(object)) {
			p[i] = object;
			continue;
		}

		p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
					 _RET_IP_, s->object_size);
		if (unlikely(!p[i]))
			goto error;

		maybe_wipe_obj_freeptr(s, p[i]);
	}

	return i;

error:
	slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
	kmem_cache_free_bulk(s, i, p);
	return 0;
}
#endif /* CONFIG_SLUB_TINY */
/* Note that interrupts must be enabled when calling this function. */
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
			  void **p)
{
	int i;
	struct obj_cgroup *objcg = NULL;

	if (!size)
		return 0;

	/* memcg and kmem_cache debug support */
	s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
	if (unlikely(!s))
		return 0;

	i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg);

	/*
	 * memcg and kmem_cache debug support and memory initialization.
	 * Done outside of the IRQ disabled fastpath loop.
	 */
	if (i != 0)
		slab_post_alloc_hook(s, objcg, flags, size, p,
			slab_want_init_on_alloc(flags, s), s->object_size);
	return i;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
/*
 * Object placement in a slab is made very easy because we always start at
 * offset 0. If we tune the size of the object to the alignment then we can
 * get the required alignment by putting one properly sized object after
 * another.
 *
 * Notice that the allocation order determines the sizes of the per cpu
 * caches. Each processor has always one slab available for allocations.
 * Increasing the allocation order reduces the number of times that slabs
 * must be moved on and off the partial lists and is therefore a factor in
 * locking overhead.
 */

/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
static unsigned int slub_min_order;
static unsigned int slub_max_order =
	IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
static unsigned int slub_min_objects;

/*
 * Calculate the order of allocation given a slab object size.
 *
 * The order of allocation has significant impact on performance and other
 * system components. Generally order 0 allocations should be preferred since
 * order 0 does not cause fragmentation in the page allocator. Larger objects
 * can be problematic to put into order 0 slabs because there may be too much
 * unused space left. We go to a higher order if more than 1/16th of the slab
 * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
 * number of objects is in one slab. Otherwise we may generate too much
 * activity on the partial lists which requires taking the list_lock. This is
 * less a concern for large slabs though which are rarely used.
 *
 * slub_max_order specifies the order where we begin to stop considering the
 * number of objects in a slab as critical. If we reach slub_max_order then
 * we try to keep the page order as low as possible. So we accept more waste
 * of space in favor of a small page order.
 *
 * Higher order allocations also allow the placement of more objects in a
 * slab and thereby reduce object handling overhead. If the user has
 * requested a higher minimum order then we start with that one instead of
 * the smallest order which will fit the object.
 */
static inline unsigned int calc_slab_order(unsigned int size,
		unsigned int min_objects, unsigned int max_order,
		unsigned int fract_leftover)
{
	unsigned int min_order = slub_min_order;
	unsigned int order;

	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
		return get_order(size * MAX_OBJS_PER_PAGE) - 1;

	for (order = max(min_order, (unsigned int)get_order(min_objects * size));
			order <= max_order; order++) {

		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
		unsigned int rem;

		rem = slab_size % size;

		if (rem <= slab_size / fract_leftover)
			break;
	}

	return order;
}
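/*
 * Worked example, assuming 4K pages: for size = 700, min_objects = 8 and
 * fract_leftover = 16 the loop starts at get_order(8 * 700) = 1. An
 * order-1 slab (8192 bytes) holds 11 objects and wastes 8192 % 700 = 492
 * bytes, and 492 <= 8192 / 16 = 512, so order 1 is accepted.
 */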
static inline int calculate_order(unsigned int size)
{
	unsigned int order;
	unsigned int min_objects;
	unsigned int max_objects;
	unsigned int nr_cpus;

	/*
	 * Attempt to find best configuration for a slab. This
	 * works by first attempting to generate a layout with
	 * the best configuration and backing off gradually.
	 *
	 * First we increase the acceptable waste in a slab. Then
	 * we reduce the minimum objects required in a slab.
	 */
	min_objects = slub_min_objects;
	if (!min_objects) {
		/*
		 * Some architectures will only update present cpus when
		 * onlining them, so don't trust the number if it's just 1. But
		 * we also don't want to use nr_cpu_ids always, as on some other
		 * architectures, there can be many possible cpus, but never
		 * onlined. Here we compromise between trying to avoid too high
		 * order on systems that appear larger than they are, and too
		 * low order on systems that appear smaller than they are.
		 */
		nr_cpus = num_present_cpus();
		if (nr_cpus <= 1)
			nr_cpus = nr_cpu_ids;
		min_objects = 4 * (fls(nr_cpus) + 1);
	}
	max_objects = order_objects(slub_max_order, size);
	min_objects = min(min_objects, max_objects);

	while (min_objects > 1) {
		unsigned int fraction;

		fraction = 16;
		while (fraction >= 4) {
			order = calc_slab_order(size, min_objects,
					slub_max_order, fraction);
			if (order <= slub_max_order)
				return order;
			fraction /= 2;
		}
		min_objects--;
	}

	/*
	 * We were unable to place multiple objects in a slab. Now
	 * lets see if we can place a single object there.
	 */
	order = calc_slab_order(size, 1, slub_max_order, 1);
	if (order <= slub_max_order)
		return order;

	/*
	 * Doh this slab cannot be placed using slub_max_order.
	 */
	order = calc_slab_order(size, 1, MAX_ORDER, 1);
	if (order <= MAX_ORDER)
		return order;
	return -ENOSYS;
}
static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
	n->nr_partial = 0;
	spin_lock_init(&n->list_lock);
	INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_set(&n->nr_slabs, 0);
	atomic_long_set(&n->total_objects, 0);
	INIT_LIST_HEAD(&n->full);
#endif
}

#ifndef CONFIG_SLUB_TINY
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
			NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
			sizeof(struct kmem_cache_cpu));

	/*
	 * Must align to double word boundary for the double cmpxchg
	 * instructions to work; see __pcpu_double_call_return_bool().
	 */
	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
				     2 * sizeof(void *));

	if (!s->cpu_slab)
		return 0;

	init_kmem_cache_cpus(s);

	return 1;
}
#else
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
	return 1;
}
#endif /* CONFIG_SLUB_TINY */

static struct kmem_cache *kmem_cache_node;
/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmem_cache_node
 * when allocating for the kmem_cache_node. This is used for bootstrapping
 * memory on a fresh node that has no slab structures yet.
 */
static void early_kmem_cache_node_alloc(int node)
{
	struct slab *slab;
	struct kmem_cache_node *n;

	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));

	slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);

	BUG_ON(!slab);
	inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects);
	if (slab_nid(slab) != node) {
		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
	}

	n = slab->freelist;
	BUG_ON(!n);
#ifdef CONFIG_SLUB_DEBUG
	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
	init_tracking(kmem_cache_node, n);
#endif
	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
	slab->freelist = get_freepointer(kmem_cache_node, n);
	slab->inuse = 1;
	kmem_cache_node->node[node] = n;
	init_kmem_cache_node(n);
	inc_slabs_node(kmem_cache_node, node, slab->objects);

	/*
	 * No locks need to be taken here as it has just been
	 * initialized and there is no concurrent access.
	 */
	__add_partial(n, slab, DEACTIVATE_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n) {
		s->node[node] = NULL;
		kmem_cache_free(kmem_cache_node, n);
	}
}

void __kmem_cache_release(struct kmem_cache *s)
{
	cache_random_seq_destroy(s);
#ifndef CONFIG_SLUB_TINY
	free_percpu(s->cpu_slab);
#endif
	free_kmem_cache_nodes(s);
}

static int init_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;

	for_each_node_mask(node, slab_nodes) {
		struct kmem_cache_node *n;

		if (slab_state == DOWN) {
			early_kmem_cache_node_alloc(node);
			continue;
		}
		n = kmem_cache_alloc_node(kmem_cache_node,
						GFP_KERNEL, node);

		if (!n) {
			free_kmem_cache_nodes(s);
			return 0;
		}

		init_kmem_cache_node(n);
		s->node[node] = n;
	}
	return 1;
}
static void set_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	unsigned int nr_objects;

	/*
	 * cpu_partial determines the maximum number of objects kept in the
	 * per cpu partial lists of a processor.
	 *
	 * Per cpu partial lists mainly contain slabs that just have one
	 * object freed. If they are used for allocation then they can be
	 * filled up again with minimal effort. The slab will never hit the
	 * per node partial lists and therefore no locking will be required.
	 *
	 * For backwards compatibility reasons, this is determined as number
	 * of objects, even though we now limit maximum number of pages, see
	 * slub_set_cpu_partial()
	 */
	if (!kmem_cache_has_cpu_partial(s))
		nr_objects = 0;
	else if (s->size >= PAGE_SIZE)
		nr_objects = 6;
	else if (s->size >= 1024)
		nr_objects = 24;
	else if (s->size >= 256)
		nr_objects = 52;
	else
		nr_objects = 120;

	slub_set_cpu_partial(s, nr_objects);
#endif
}
/*
 * calculate_sizes() determines the order and the distribution of data within
 * a slab object.
 */
static int calculate_sizes(struct kmem_cache *s)
{
	slab_flags_t flags = s->flags;
	unsigned int size = s->object_size;
	unsigned int order;

	/*
	 * Round up object size to the next word boundary. We can only
	 * place the free pointer at word boundaries and this determines
	 * the possible location of the free pointer.
	 */
	size = ALIGN(size, sizeof(void *));

#ifdef CONFIG_SLUB_DEBUG
	/*
	 * Determine if we can poison the object itself. If the user of
	 * the slab may touch the object after free or before allocation
	 * then we should never poison the object itself.
	 */
	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
			!s->ctor)
		s->flags |= __OBJECT_POISON;
	else
		s->flags &= ~__OBJECT_POISON;

	/*
	 * If we are Redzoning then check if there is some space between the
	 * end of the object and the free pointer. If not then add an
	 * additional word to have some bytes to store Redzone information.
	 */
	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
		size += sizeof(void *);
#endif

	/*
	 * With that we have determined the number of bytes in actual use
	 * by the object and redzoning.
	 */
	s->inuse = size;

	if (slub_debug_orig_size(s) ||
	    (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
	    ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
	    s->ctor) {
		/*
		 * Relocate free pointer after the object if it is not
		 * permitted to overwrite the first word of the object on
		 * kmem_cache_free.
		 *
		 * This is the case if we do RCU, have a constructor or
		 * destructor, are poisoning the objects, or are
		 * redzoning an object smaller than sizeof(void *).
		 *
		 * The assumption that s->offset >= s->inuse means free
		 * pointer is outside of the object is used in the
		 * freeptr_outside_object() function. If that is no
		 * longer true, the function needs to be modified.
		 */
		s->offset = size;
		size += sizeof(void *);
	} else {
		/*
		 * Store freelist pointer near middle of object to keep
		 * it away from the edges of the object to avoid small
		 * sized over/underflows from neighboring allocations.
		 */
		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
	}

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_STORE_USER) {
		/*
		 * Need to store information about allocs and frees after
		 * the object.
		 */
		size += 2 * sizeof(struct track);

		/* Save the original kmalloc request size */
		if (flags & SLAB_KMALLOC)
			size += sizeof(unsigned int);
	}
#endif

	kasan_cache_create(s, &size, &s->flags);
#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_RED_ZONE) {
		/*
		 * Add some empty padding so that we can catch
		 * overwrites from earlier objects rather than let
		 * tracking information or the free pointer be
		 * corrupted if a user writes before the start
		 * of the object.
		 */
		size += sizeof(void *);

		s->red_left_pad = sizeof(void *);
		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
		size += s->red_left_pad;
	}
#endif

	/*
	 * SLUB stores one object immediately after another beginning from
	 * offset 0. In order to align the objects we have to simply size
	 * each object to conform to the alignment.
	 */
	size = ALIGN(size, s->align);
	s->size = size;
	s->reciprocal_size = reciprocal_value(size);
	order = calculate_order(size);

	if ((int)order < 0)
		return 0;

	s->allocflags = 0;
	if (order)
		s->allocflags |= __GFP_COMP;

	if (s->flags & SLAB_CACHE_DMA)
		s->allocflags |= GFP_DMA;

	if (s->flags & SLAB_CACHE_DMA32)
		s->allocflags |= GFP_DMA32;

	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		s->allocflags |= __GFP_RECLAIMABLE;

	/*
	 * Determine the number of objects per slab
	 */
	s->oo = oo_make(order, size);
	s->min = oo_make(get_order(size), size);

	return !!oo_objects(s->oo);
}
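/*
 * Worked layout example, assuming a 64-bit kernel, no debug flags, no
 * constructor and a (hypothetical) cache alignment of 64: object_size =
 * 40 is already word aligned, the free pointer lands inside the object
 * at ALIGN_DOWN(40 / 2, 8) = 16, and the final size is ALIGN(40, 64) =
 * 64, giving 64 objects per order-0 slab.
 */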
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
	s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	s->random = get_random_long();
#endif

	if (!calculate_sizes(s))
		goto error;
	if (disable_higher_order_debug) {
		/*
		 * Disable debugging flags that store metadata if the min slab
		 * order increased.
		 */
		if (get_order(s->size) > get_order(s->object_size)) {
			s->flags &= ~DEBUG_METADATA_FLAGS;
			s->offset = 0;
			if (!calculate_sizes(s))
				goto error;
		}
	}

#ifdef system_has_freelist_aba
	if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
		/* Enable fast mode */
		s->flags |= __CMPXCHG_DOUBLE;
	}
#endif

	/*
	 * The larger the object size is, the more slabs we want on the partial
	 * list to avoid pounding the page allocator excessively.
	 */
	s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
	s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);

	set_cpu_partial(s);

#ifdef CONFIG_NUMA
	s->remote_node_defrag_ratio = 1000;
#endif

	/* Initialize the pre-computed randomized freelist if slab is up */
	if (slab_state >= UP) {
		if (init_cache_random_seq(s))
			goto error;
	}

	if (!init_kmem_cache_nodes(s))
		goto error;

	if (alloc_kmem_cache_cpus(s))
		return 0;

error:
	__kmem_cache_release(s);
	return -EINVAL;
}
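/*
 * Worked example (illustrative only): with MIN_PARTIAL == 5 and
 * MAX_PARTIAL == 10, a cache with s->size == 4096 gets
 * min_partial = clamp(ilog2(4096) / 2, 5, 10) == 6, while a 64-byte
 * cache computes ilog2(64) / 2 == 3 and is clamped up to MIN_PARTIAL,
 * i.e. 5 slabs kept per node.
 */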
static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
			      const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
	void *addr = slab_address(slab);
	void *p;

	slab_err(s, slab, text, s->name);

	spin_lock(&object_map_lock);
	__fill_map(object_map, s, slab);

	for_each_object(p, s, addr, slab->objects) {
		if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
			pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
			print_tracking(s, p);
		}
	}
	spin_unlock(&object_map_lock);
#endif
}
/*
 * Attempt to free all partial slabs on a node.
 * This is called from __kmem_cache_shutdown(). We must take list_lock
 * because sysfs files might still access the partial list after the shutdown.
 */
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
	LIST_HEAD(discard);
	struct slab *slab, *h;

	BUG_ON(irqs_disabled());
	spin_lock_irq(&n->list_lock);
	list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
		if (!slab->inuse) {
			remove_partial(n, slab);
			list_add(&slab->slab_list, &discard);
		} else {
			list_slab_objects(s, slab,
			  "Objects remaining in %s on __kmem_cache_shutdown()");
		}
	}
	spin_unlock_irq(&n->list_lock);

	list_for_each_entry_safe(slab, h, &discard, slab_list)
		discard_slab(s, slab);
}
bool __kmem_cache_empty(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n)
		if (n->nr_partial || node_nr_slabs(n))
			return false;
	return true;
}
/*
 * Release all resources used by a slab cache.
 */
int __kmem_cache_shutdown(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	flush_all_cpus_locked(s);
	/* Attempt to free all objects */
	for_each_kmem_cache_node(s, node, n) {
		free_partial(s, n);
		if (n->nr_partial || node_nr_slabs(n))
			return 1;
	}
	return 0;
}
#ifdef CONFIG_PRINTK
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
	void *base;
	int __maybe_unused i;
	unsigned int objnr;
	void *objp;
	void *objp0;
	struct kmem_cache *s = slab->slab_cache;
	struct track __maybe_unused *trackp;

	kpp->kp_ptr = object;
	kpp->kp_slab = slab;
	kpp->kp_slab_cache = s;
	base = slab_address(slab);
	objp0 = kasan_reset_tag(object);
#ifdef CONFIG_SLUB_DEBUG
	objp = restore_red_left(s, objp0);
#else
	objp = objp0;
#endif
	objnr = obj_to_index(s, slab, objp);
	kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
	objp = base + s->size * objnr;
	kpp->kp_objp = objp;
	if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
			 || (objp - base) % s->size) ||
	    !(s->flags & SLAB_STORE_USER))
		return;
#ifdef CONFIG_SLUB_DEBUG
	objp = fixup_red_left(s, objp);
	trackp = get_track(s, objp, TRACK_ALLOC);
	kpp->kp_ret = (void *)trackp->addr;
#ifdef CONFIG_STACKDEPOT
	{
		depot_stack_handle_t handle;
		unsigned long *entries;
		unsigned int nr_entries;

		handle = READ_ONCE(trackp->handle);
		if (handle) {
			nr_entries = stack_depot_fetch(handle, &entries);
			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
				kpp->kp_stack[i] = (void *)entries[i];
		}

		trackp = get_track(s, objp, TRACK_FREE);
		handle = READ_ONCE(trackp->handle);
		if (handle) {
			nr_entries = stack_depot_fetch(handle, &entries);
			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
				kpp->kp_free_stack[i] = (void *)entries[i];
		}
	}
#endif
#endif
}
#endif
/********************************************************************
 *			Kmalloc subsystem
 *******************************************************************/

static int __init setup_slub_min_order(char *str)
{
	get_option(&str, (int *)&slub_min_order);

	return 1;
}

__setup("slub_min_order=", setup_slub_min_order);

static int __init setup_slub_max_order(char *str)
{
	get_option(&str, (int *)&slub_max_order);
	slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);

	return 1;
}

__setup("slub_max_order=", setup_slub_max_order);

static int __init setup_slub_min_objects(char *str)
{
	get_option(&str, (int *)&slub_min_objects);

	return 1;
}

__setup("slub_min_objects=", setup_slub_min_objects);
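/*
 * Example usage (illustrative, kernel command line):
 *
 *	slub_min_order=3 slub_max_order=4 slub_min_objects=16
 *
 * asks for slabs of at least order 3 while capping them at order 4; a
 * slub_max_order above MAX_ORDER is silently clamped by
 * setup_slub_max_order() above.
 */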
#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Rejects incorrectly sized objects and objects that are to be copied
 * to/from userspace but do not fall entirely within the containing slab
 * cache's usercopy region.
 *
 * Returns normally if the checks pass; otherwise aborts via
 * usercopy_abort(), naming the offending cache.
 */
void __check_heap_object(const void *ptr, unsigned long n,
			 const struct slab *slab, bool to_user)
{
	struct kmem_cache *s;
	unsigned int offset;
	bool is_kfence = is_kfence_address(ptr);

	ptr = kasan_reset_tag(ptr);

	/* Find object and usable object size. */
	s = slab->slab_cache;

	/* Reject impossible pointers. */
	if (ptr < slab_address(slab))
		usercopy_abort("SLUB object not in SLUB page?!", NULL,
			       to_user, 0, n);

	/* Find offset within object. */
	if (is_kfence)
		offset = ptr - kfence_object_start(ptr);
	else
		offset = (ptr - slab_address(slab)) % s->size;

	/* Adjust for redzone and reject if within the redzone. */
	if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
		if (offset < s->red_left_pad)
			usercopy_abort("SLUB object in left red zone",
				       s->name, to_user, offset, n);
		offset -= s->red_left_pad;
	}

	/* Allow address range falling entirely within usercopy region. */
	if (offset >= s->useroffset &&
	    offset - s->useroffset <= s->usersize &&
	    n <= s->useroffset - offset + s->usersize)
		return;

	usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
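/*
 * Worked example (illustrative, not from the original source): for a
 * cache with useroffset = 16, usersize = 32 and no redzoning, a copy at
 * offset = 24 of n = 20 bytes passes all three checks above
 * (24 >= 16, 24 - 16 <= 32, and 20 <= 16 - 24 + 32 == 24), while the
 * same copy with n = 32 exceeds the usercopy window and ends in
 * usercopy_abort().
 */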
#define SHRINK_PROMOTE_MAX 32

/*
 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
 * up most to the head of the partial lists. New allocations will then
 * fill those up and thus they can be removed from the partial lists.
 *
 * The slabs with the least items are placed last. This results in them
 * being allocated from last, increasing the chance that the last objects
 * in them are freed as well.
 */
static int __kmem_cache_do_shrink(struct kmem_cache *s)
{
	int node;
	int i;
	struct kmem_cache_node *n;
	struct slab *slab;
	struct slab *t;
	struct list_head discard;
	struct list_head promote[SHRINK_PROMOTE_MAX];
	unsigned long flags;
	int ret = 0;

	for_each_kmem_cache_node(s, node, n) {
		INIT_LIST_HEAD(&discard);
		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
			INIT_LIST_HEAD(promote + i);

		spin_lock_irqsave(&n->list_lock, flags);

		/*
		 * Build lists of slabs to discard or promote.
		 *
		 * Note that concurrent frees may occur while we hold the
		 * list_lock. slab->inuse here is the upper limit.
		 */
		list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
			int free = slab->objects - slab->inuse;

			/* Do not reread slab->inuse */
			barrier();

			/* We do not keep full slabs on the list */
			BUG_ON(free <= 0);

			if (free == slab->objects) {
				list_move(&slab->slab_list, &discard);
				n->nr_partial--;
				dec_slabs_node(s, node, slab->objects);
			} else if (free <= SHRINK_PROMOTE_MAX)
				list_move(&slab->slab_list, promote + free - 1);
		}

		/*
		 * Promote the slabs filled up most to the head of the
		 * partial list.
		 */
		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
			list_splice(promote + i, &n->partial);

		spin_unlock_irqrestore(&n->list_lock, flags);

		/* Release empty slabs */
		list_for_each_entry_safe(slab, t, &discard, slab_list)
			free_slab(s, slab);

		if (node_nr_slabs(n))
			ret = 1;
	}

	return ret;
}

int __kmem_cache_shrink(struct kmem_cache *s)
{
	flush_all(s);
	return __kmem_cache_do_shrink(s);
}
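/*
 * Illustration of the promotion buckets above: a partial slab with
 * objects = 32 and inuse = 30 has free = 2 and moves to promote[1]; one
 * with inuse = 0 is fully free and goes to the discard list. The final
 * splice walks promote[31] down to promote[0], so the fullest slabs
 * (free == 1) land at the head of n->partial and are refilled first.
 */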
static int slab_mem_going_offline_callback(void *arg)
{
	struct kmem_cache *s;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list) {
		flush_all_cpus_locked(s);
		__kmem_cache_do_shrink(s);
	}
	mutex_unlock(&slab_mutex);

	return 0;
}

static void slab_mem_offline_callback(void *arg)
{
	struct memory_notify *marg = arg;
	int offline_node;

	offline_node = marg->status_change_nid_normal;

	/*
	 * If the node still has available memory, we still need its
	 * kmem_cache_node structures, so there is nothing to do here.
	 */
	if (offline_node < 0)
		return;

	mutex_lock(&slab_mutex);
	node_clear(offline_node, slab_nodes);
	/*
	 * We no longer free kmem_cache_node structures here, as it would be
	 * racy with all get_node() users, and infeasible to protect them with
	 * slab_mutex.
	 */
	mutex_unlock(&slab_mutex);
}

static int slab_mem_going_online_callback(void *arg)
{
	struct kmem_cache_node *n;
	struct kmem_cache *s;
	struct memory_notify *marg = arg;
	int nid = marg->status_change_nid_normal;
	int ret = 0;

	/*
	 * If the node's memory is already available, then kmem_cache_node is
	 * already created. Nothing to do.
	 */
	if (nid < 0)
		return 0;

	/*
	 * We are bringing a node online. No memory is available yet. We must
	 * allocate a kmem_cache_node structure in order to bring the node
	 * online.
	 */
	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list) {
		/*
		 * The structure may already exist if the node was previously
		 * onlined and offlined.
		 */
		if (get_node(s, nid))
			continue;
		/*
		 * XXX: kmem_cache_alloc_node will fallback to other nodes
		 *      since memory is not yet available from the node that
		 *      is brought up.
		 */
		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
		if (!n) {
			ret = -ENOMEM;
			goto out;
		}
		init_kmem_cache_node(n);
		s->node[nid] = n;
	}
	/*
	 * Any cache created after this point will also have kmem_cache_node
	 * initialized for the new node.
	 */
	node_set(nid, slab_nodes);
out:
	mutex_unlock(&slab_mutex);
	return ret;
}

static int slab_memory_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = slab_mem_going_online_callback(arg);
		break;
	case MEM_GOING_OFFLINE:
		ret = slab_mem_going_offline_callback(arg);
		break;
	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		slab_mem_offline_callback(arg);
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}
	if (ret)
		ret = notifier_from_errno(ret);
	else
		ret = NOTIFY_OK;
	return ret;
}
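/*
 * Example (illustrative): offlining a memory block from userspace,
 *
 *	# echo offline > /sys/devices/system/memory/memory42/state
 *
 * raises MEM_GOING_OFFLINE, so every cache is flushed and shrunk via
 * slab_mem_going_offline_callback() before the memory is removed.
 */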
/********************************************************************
 *			Basic setup of slabs
 *******************************************************************/

/*
 * Used for early kmem_cache structures that were allocated using
 * the page allocator. Allocate them properly then fix up the pointers
 * that may be pointing to the wrong kmem_cache structure.
 */

static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
	int node;
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	struct kmem_cache_node *n;

	memcpy(s, static_cache, kmem_cache->object_size);

	/*
	 * This runs very early, and only the boot processor is supposed to be
	 * up. Even if it weren't true, IRQs are not up so we couldn't fire
	 * IPIs around.
	 */
	__flush_cpu_slab(s, smp_processor_id());
	for_each_kmem_cache_node(s, node, n) {
		struct slab *p;

		list_for_each_entry(p, &n->partial, slab_list)
			p->slab_cache = s;

#ifdef CONFIG_SLUB_DEBUG
		list_for_each_entry(p, &n->full, slab_list)
			p->slab_cache = s;
#endif
	}
	list_add(&s->list, &slab_caches);
	return s;
}
void __init kmem_cache_init(void)
{
	static __initdata struct kmem_cache boot_kmem_cache,
		boot_kmem_cache_node;
	int node;

	if (debug_guardpage_minorder())
		slub_max_order = 0;

	/* Print slub debugging pointers without hashing */
	if (__slub_debug_enabled())
		no_hash_pointers_enable(NULL);

	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;

	/*
	 * Initialize the nodemask for which we will allocate per node
	 * structures. We don't need to take slab_mutex here yet.
	 */
	for_each_node_state(node, N_NORMAL_MEMORY)
		node_set(node, slab_nodes);

	create_boot_cache(kmem_cache_node, "kmem_cache_node",
		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);

	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);

	/* Able to allocate the per node structures */
	slab_state = PARTIAL;

	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
			SLAB_HWCACHE_ALIGN, 0, 0);

	kmem_cache = bootstrap(&boot_kmem_cache);
	kmem_cache_node = bootstrap(&boot_kmem_cache_node);

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
	setup_kmalloc_cache_index_table();
	create_kmalloc_caches(0);

	/* Setup random freelists for each cache */
	init_freelist_randomization();

	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				  slub_cpu_dead);

	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
		cache_line_size(),
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}
void __init kmem_cache_init_late(void)
{
#ifndef CONFIG_SLUB_TINY
	flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
	WARN_ON(!flushwq);
#endif
}
struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
		   slab_flags_t flags, void (*ctor)(void *))
{
	struct kmem_cache *s;

	s = find_mergeable(size, align, flags, name, ctor);
	if (s) {
		if (sysfs_slab_alias(s, name))
			return NULL;

		s->refcount++;

		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
		s->object_size = max(s->object_size, size);
		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
	}

	return s;
}
int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
{
	int err;

	err = kmem_cache_open(s, flags);
	if (err)
		return err;

	/* Mutex is not taken during early boot */
	if (slab_state <= UP)
		return 0;

	err = sysfs_slab_add(s);
	if (err) {
		__kmem_cache_release(s);
		return err;
	}

	if (s->flags & SLAB_STORE_USER)
		debugfs_slab_add(s);

	return 0;
}
#ifdef SLAB_SUPPORTS_SYSFS
static int count_inuse(struct slab *slab)
{
	return slab->inuse;
}

static int count_total(struct slab *slab)
{
	return slab->objects;
}
#endif
#ifdef CONFIG_SLUB_DEBUG
static void validate_slab(struct kmem_cache *s, struct slab *slab,
			  unsigned long *obj_map)
{
	void *p;
	void *addr = slab_address(slab);

	if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
		return;

	/* Now we know that a valid freelist exists */
	__fill_map(obj_map, s, slab);
	for_each_object(p, s, addr, slab->objects) {
		u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;

		if (!check_object(s, slab, p, val))
			break;
	}
}
static int validate_slab_node(struct kmem_cache *s,
		struct kmem_cache_node *n, unsigned long *obj_map)
{
	unsigned long count = 0;
	struct slab *slab;
	unsigned long flags;

	spin_lock_irqsave(&n->list_lock, flags);

	list_for_each_entry(slab, &n->partial, slab_list) {
		validate_slab(s, slab, obj_map);
		count++;
	}
	if (count != n->nr_partial) {
		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
		       s->name, count, n->nr_partial);
		slab_add_kunit_errors();
	}

	if (!(s->flags & SLAB_STORE_USER))
		goto out;

	list_for_each_entry(slab, &n->full, slab_list) {
		validate_slab(s, slab, obj_map);
		count++;
	}
	if (count != node_nr_slabs(n)) {
		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
		       s->name, count, node_nr_slabs(n));
		slab_add_kunit_errors();
	}

out:
	spin_unlock_irqrestore(&n->list_lock, flags);
	return count;
}
long validate_slab_cache(struct kmem_cache *s)
{
	int node;
	unsigned long count = 0;
	struct kmem_cache_node *n;
	unsigned long *obj_map;

	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
	if (!obj_map)
		return -ENOMEM;

	flush_all(s);
	for_each_kmem_cache_node(s, node, n)
		count += validate_slab_node(s, n, obj_map);

	bitmap_free(obj_map);

	return count;
}
EXPORT_SYMBOL(validate_slab_cache);
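/*
 * Example usage (illustrative): with slub_debug enabled for a cache, a
 * validation pass can be triggered from userspace through sysfs:
 *
 *	# echo 1 > /sys/kernel/slab/kmalloc-64/validate
 *
 * validate_store() below then calls validate_slab_cache(), and any
 * inconsistencies are reported to the kernel log.
 */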
#ifdef CONFIG_DEBUG_FS
/*
 * Generate lists of code addresses where slabcache objects are allocated
 * and freed.
 */

struct location {
	depot_stack_handle_t handle;
	unsigned long count;
	unsigned long addr;
	unsigned long waste;
	long long sum_time;
	long min_time;
	long max_time;
	long min_pid;
	long max_pid;
	DECLARE_BITMAP(cpus, NR_CPUS);
	nodemask_t nodes;
};

struct loc_track {
	unsigned long max;
	unsigned long count;
	struct location *loc;
	loff_t idx;
};

static struct dentry *slab_debugfs_root;

static void free_loc_track(struct loc_track *t)
{
	if (t->max)
		free_pages((unsigned long)t->loc,
			get_order(sizeof(struct location) * t->max));
}

static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
{
	struct location *l;
	int order;

	order = get_order(sizeof(struct location) * max);

	l = (void *)__get_free_pages(flags, order);
	if (!l)
		return 0;

	if (t->count) {
		memcpy(l, t->loc, sizeof(struct location) * t->count);
		free_loc_track(t);
	}
	t->max = max;
	t->loc = l;
	return 1;
}
static int add_location(struct loc_track *t, struct kmem_cache *s,
				const struct track *track,
				unsigned int orig_size)
{
	long start, end, pos;
	struct location *l;
	unsigned long caddr, chandle, cwaste;
	unsigned long age = jiffies - track->when;
	depot_stack_handle_t handle = 0;
	unsigned int waste = s->object_size - orig_size;

#ifdef CONFIG_STACKDEPOT
	handle = READ_ONCE(track->handle);
#endif
	start = -1;
	end = t->count;

	for ( ; ; ) {
		pos = start + (end - start + 1) / 2;

		/*
		 * There is nothing at "end". If we end up there
		 * we need to add something to before end.
		 */
		if (pos == end)
			break;

		l = &t->loc[pos];
		caddr = l->addr;
		chandle = l->handle;
		cwaste = l->waste;
		if ((track->addr == caddr) && (handle == chandle) &&
		    (waste == cwaste)) {

			l->count++;
			if (track->when) {
				l->sum_time += age;
				if (age < l->min_time)
					l->min_time = age;
				if (age > l->max_time)
					l->max_time = age;

				if (track->pid < l->min_pid)
					l->min_pid = track->pid;
				if (track->pid > l->max_pid)
					l->max_pid = track->pid;

				cpumask_set_cpu(track->cpu,
						to_cpumask(l->cpus));
			}
			node_set(page_to_nid(virt_to_page(track)), l->nodes);
			return 1;
		}

		if (track->addr < caddr)
			end = pos;
		else if (track->addr == caddr && handle < chandle)
			end = pos;
		else if (track->addr == caddr && handle == chandle &&
				waste < cwaste)
			end = pos;
		else
			start = pos;
	}

	/*
	 * Not found. Insert new tracking element.
	 */
	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
		return 0;

	l = t->loc + pos;
	if (pos < t->count)
		memmove(l + 1, l,
			(t->count - pos) * sizeof(struct location));
	t->count++;
	l->count = 1;
	l->addr = track->addr;
	l->sum_time = age;
	l->min_time = age;
	l->max_time = age;
	l->min_pid = track->pid;
	l->max_pid = track->pid;
	l->handle = handle;
	l->waste = waste;
	cpumask_clear(to_cpumask(l->cpus));
	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
	nodes_clear(l->nodes);
	node_set(page_to_nid(virt_to_page(track)), l->nodes);
	return 1;
}
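/*
 * Note on the search above (illustrative): entries are kept sorted by
 * the triple (addr, handle, waste), so two allocations from the same
 * code address but with different stack traces, i.e. distinct depot
 * handles, are tracked as separate locations rather than merged into a
 * single count.
 */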
static void process_slab(struct loc_track *t, struct kmem_cache *s,
		struct slab *slab, enum track_item alloc,
		unsigned long *obj_map)
{
	void *addr = slab_address(slab);
	bool is_alloc = (alloc == TRACK_ALLOC);
	void *p;

	__fill_map(obj_map, s, slab);

	for_each_object(p, s, addr, slab->objects)
		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
			add_location(t, s, get_track(s, p, alloc),
				     is_alloc ? get_orig_size(s, p) :
						s->object_size);
}
#endif  /* CONFIG_DEBUG_FS */
#endif	/* CONFIG_SLUB_DEBUG */
#ifdef SLAB_SUPPORTS_SYSFS
enum slab_stat_type {
	SL_ALL,			/* All slabs */
	SL_PARTIAL,		/* Only partially allocated slabs */
	SL_CPU,			/* Only slabs used for cpu caches */
	SL_OBJECTS,		/* Determine allocated objects not slabs */
	SL_TOTAL		/* Determine object capacity not slabs */
};

#define SO_ALL		(1 << SL_ALL)
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
#define SO_TOTAL	(1 << SL_TOTAL)
static ssize_t show_slab_objects(struct kmem_cache *s,
				 char *buf, unsigned long flags)
{
	unsigned long total = 0;
	int node;
	int x;
	unsigned long *nodes;
	int len = 0;

	nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	if (flags & SO_CPU) {
		int cpu;

		for_each_possible_cpu(cpu) {
			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
							       cpu);
			int node;
			struct slab *slab;

			slab = READ_ONCE(c->slab);
			if (!slab)
				continue;

			node = slab_nid(slab);
			if (flags & SO_TOTAL)
				x = slab->objects;
			else if (flags & SO_OBJECTS)
				x = slab->inuse;
			else
				x = 1;

			total += x;
			nodes[node] += x;

#ifdef CONFIG_SLUB_CPU_PARTIAL
			slab = slub_percpu_partial_read_once(c);
			if (slab) {
				node = slab_nid(slab);
				if (flags & SO_TOTAL)
					WARN_ON_ONCE(1);
				else if (flags & SO_OBJECTS)
					WARN_ON_ONCE(1);
				else
					x = slab->slabs;
				total += x;
				nodes[node] += x;
			}
#endif
		}
	}

	/*
	 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
	 * already held which will conflict with an existing lock order:
	 *
	 * mem_hotplug_lock->slab_mutex->kernfs_mutex
	 *
	 * We don't really need mem_hotplug_lock (to hold off
	 * slab_mem_going_offline_callback) here because slab's memory hot
	 * unplug code doesn't destroy the kmem_cache->node[] data.
	 */

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SO_ALL) {
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {

			if (flags & SO_TOTAL)
				x = node_nr_objs(n);
			else if (flags & SO_OBJECTS)
				x = node_nr_objs(n) - count_partial(n, count_free);
			else
				x = node_nr_slabs(n);
			total += x;
			nodes[node] += x;
		}

	} else
#endif
	if (flags & SO_PARTIAL) {
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {
			if (flags & SO_TOTAL)
				x = count_partial(n, count_total);
			else if (flags & SO_OBJECTS)
				x = count_partial(n, count_inuse);
			else
				x = n->nr_partial;
			total += x;
			nodes[node] += x;
		}
	}

	len += sysfs_emit_at(buf, len, "%lu", total);
#ifdef CONFIG_NUMA
	for (node = 0; node < nr_node_ids; node++) {
		if (nodes[node])
			len += sysfs_emit_at(buf, len, " N%d=%lu",
					     node, nodes[node]);
	}
#endif
	len += sysfs_emit_at(buf, len, "\n");
	kfree(nodes);

	return len;
}
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)

struct slab_attribute {
	struct attribute attr;
	ssize_t (*show)(struct kmem_cache *s, char *buf);
	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};

#define SLAB_ATTR_RO(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)

#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
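/*
 * For illustration: SLAB_ATTR_RO(order) expands to a read-only
 * "order_attr" wired to order_show() with mode 0400, while
 * SLAB_ATTR(min_partial) additionally hooks up min_partial_store() and
 * uses mode 0600.
 */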
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->size);
}
SLAB_ATTR_RO(slab_size);

static ssize_t align_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->align);
}
SLAB_ATTR_RO(align);

static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->object_size);
}
SLAB_ATTR_RO(object_size);

static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);

static ssize_t order_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", oo_order(s->oo));
}
SLAB_ATTR_RO(order);
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%lu\n", s->min_partial);
}

static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
				 size_t length)
{
	unsigned long min;
	int err;

	err = kstrtoul(buf, 10, &min);
	if (err)
		return err;

	s->min_partial = min;
	return length;
}
SLAB_ATTR(min_partial);
static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
	unsigned int nr_partial = 0;
#ifdef CONFIG_SLUB_CPU_PARTIAL
	nr_partial = s->cpu_partial;
#endif

	return sysfs_emit(buf, "%u\n", nr_partial);
}

static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
				 size_t length)
{
	unsigned int objects;
	int err;

	err = kstrtouint(buf, 10, &objects);
	if (err)
		return err;
	if (objects && !kmem_cache_has_cpu_partial(s))
		return -EINVAL;

	slub_set_cpu_partial(s, objects);
	flush_all(s);
	return length;
}
SLAB_ATTR(cpu_partial);
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
	if (!s->ctor)
		return 0;
	return sysfs_emit(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);

static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
}
SLAB_ATTR_RO(aliases);
static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_PARTIAL);
}
SLAB_ATTR_RO(partial);

static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_CPU);
}
SLAB_ATTR_RO(cpu_slabs);

static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects_partial);
static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
{
	int objects = 0;
	int slabs = 0;
	int cpu __maybe_unused;
	int len = 0;

#ifdef CONFIG_SLUB_CPU_PARTIAL
	for_each_online_cpu(cpu) {
		struct slab *slab;

		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
		if (slab)
			slabs += slab->slabs;
	}
#endif

	/* Approximate half-full slabs, see slub_set_cpu_partial() */
	objects = (slabs * oo_objects(s->oo)) / 2;
	len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);

#ifdef CONFIG_SLUB_CPU_PARTIAL
	for_each_online_cpu(cpu) {
		struct slab *slab;

		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
		if (slab) {
			slabs = READ_ONCE(slab->slabs);
			objects = (slabs * oo_objects(s->oo)) / 2;
			len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
					     cpu, objects, slabs);
		}
	}
#endif
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}
SLAB_ATTR_RO(slabs_cpu_partial);
static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
SLAB_ATTR_RO(reclaim_account);

static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
}
SLAB_ATTR_RO(hwcache_align);

#ifdef CONFIG_ZONE_DMA
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
}
SLAB_ATTR_RO(cache_dma);
#endif
#ifdef CONFIG_HARDENED_USERCOPY
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->usersize);
}
SLAB_ATTR_RO(usersize);
#endif

static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
}
SLAB_ATTR_RO(destroy_by_rcu);
#ifdef CONFIG_SLUB_DEBUG
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL);
}
SLAB_ATTR_RO(slabs);

static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
}
SLAB_ATTR_RO(total_objects);

static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects);

static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
SLAB_ATTR_RO(sanity_checks);

static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
SLAB_ATTR_RO(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}
SLAB_ATTR_RO(red_zone);

static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
}
SLAB_ATTR_RO(poison);

static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}
SLAB_ATTR_RO(store_user);
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

static ssize_t validate_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	int ret = -EINVAL;

	if (buf[0] == '1' && kmem_cache_debug(s)) {
		ret = validate_slab_cache(s);
		if (ret >= 0)
			ret = length;
	}
	return ret;
}
SLAB_ATTR(validate);

#endif /* CONFIG_SLUB_DEBUG */
#ifdef CONFIG_FAILSLAB
static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}

static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
			      size_t length)
{
	if (s->refcount > 1)
		return -EINVAL;

	if (buf[0] == '1')
		WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
	else
		WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);

	return length;
}
SLAB_ATTR(failslab);
#endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

static ssize_t shrink_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	if (buf[0] == '1')
		kmem_cache_shrink(s);
	else
		return -EINVAL;
	return length;
}
SLAB_ATTR(shrink);
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
}

static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	unsigned int ratio;
	int err;

	err = kstrtouint(buf, 10, &ratio);
	if (err)
		return err;
	if (ratio > 100)
		return -ERANGE;

	s->remote_node_defrag_ratio = ratio * 10;

	return length;
}
SLAB_ATTR(remote_node_defrag_ratio);
#endif
#ifdef CONFIG_SLUB_STATS
static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
{
	unsigned long sum = 0;
	int cpu;
	int len = 0;
	int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	for_each_online_cpu(cpu) {
		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];

		data[cpu] = x;
		sum += x;
	}

	len += sysfs_emit_at(buf, len, "%lu", sum);

#ifdef CONFIG_SMP
	for_each_online_cpu(cpu) {
		if (data[cpu])
			len += sysfs_emit_at(buf, len, " C%d=%u",
					     cpu, data[cpu]);
	}
#endif
	kfree(data);
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}

static void clear_stat(struct kmem_cache *s, enum stat_item si)
{
	int cpu;

	for_each_online_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
}

#define STAT_ATTR(si, text)					\
static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
{								\
	return show_stat(s, buf, si);				\
}								\
static ssize_t text##_store(struct kmem_cache *s,		\
				const char *buf, size_t length)	\
{								\
	if (buf[0] != '0')					\
		return -EINVAL;					\
	clear_stat(s, si);					\
	return length;						\
}								\
SLAB_ATTR(text)
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif	/* CONFIG_SLUB_STATS */
#ifdef CONFIG_KFENCE
static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
}

static ssize_t skip_kfence_store(struct kmem_cache *s,
				 const char *buf, size_t length)
{
	int ret = length;

	if (buf[0] == '0')
		s->flags &= ~SLAB_SKIP_KFENCE;
	else if (buf[0] == '1')
		s->flags |= SLAB_SKIP_KFENCE;
	else
		ret = -EINVAL;

	return ret;
}
SLAB_ATTR(skip_kfence);
#endif
static struct attribute *slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&min_partial_attr.attr,
	&cpu_partial_attr.attr,
	&objects_partial_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&shrink_attr.attr,
	&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
	&total_objects_attr.attr,
	&objects_attr.attr,
	&slabs_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
	&alloc_fastpath_attr.attr,
	&alloc_slowpath_attr.attr,
	&free_fastpath_attr.attr,
	&free_slowpath_attr.attr,
	&free_frozen_attr.attr,
	&free_add_partial_attr.attr,
	&free_remove_partial_attr.attr,
	&alloc_from_partial_attr.attr,
	&alloc_slab_attr.attr,
	&alloc_refill_attr.attr,
	&alloc_node_mismatch_attr.attr,
	&free_slab_attr.attr,
	&cpuslab_flush_attr.attr,
	&deactivate_full_attr.attr,
	&deactivate_empty_attr.attr,
	&deactivate_to_head_attr.attr,
	&deactivate_to_tail_attr.attr,
	&deactivate_remote_frees_attr.attr,
	&deactivate_bypass_attr.attr,
	&order_fallback_attr.attr,
	&cmpxchg_double_fail_attr.attr,
	&cmpxchg_double_cpu_fail_attr.attr,
	&cpu_partial_alloc_attr.attr,
	&cpu_partial_free_attr.attr,
	&cpu_partial_node_attr.attr,
	&cpu_partial_drain_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
	&failslab_attr.attr,
#endif
#ifdef CONFIG_HARDENED_USERCOPY
	&usersize_attr.attr,
#endif
#ifdef CONFIG_KFENCE
	&skip_kfence_attr.attr,
#endif

	NULL
};

static const struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};
static ssize_t slab_attr_show(struct kobject *kobj,
				struct attribute *attr,
				char *buf)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->show)
		return -EIO;

	return attribute->show(s, buf);
}

static ssize_t slab_attr_store(struct kobject *kobj,
				struct attribute *attr,
				const char *buf, size_t len)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->store)
		return -EIO;

	return attribute->store(s, buf, len);
}

static void kmem_cache_release(struct kobject *k)
{
	slab_kmem_cache_release(to_slab(k));
}

static const struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};

static const struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
	.release = kmem_cache_release,
};
static struct kset *slab_kset;

static inline struct kset *cache_kset(struct kmem_cache *s)
{
	return slab_kset;
}
#define ID_STR_LENGTH 32

/* Create a unique string id for a slab cache:
 *
 * Format	:[flags-]size
 */
static char *create_unique_id(struct kmem_cache *s)
{
	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
	char *p = name;

	if (!name)
		return ERR_PTR(-ENOMEM);

	*p++ = ':';
	/*
	 * First flags affecting slabcache operations. We will only
	 * get here for aliasable slabs so we do not need to support
	 * too many flags. The flags here must cover all flags that
	 * are matched during merging to guarantee that the id is
	 * unique.
	 */
	if (s->flags & SLAB_CACHE_DMA)
		*p++ = 'd';
	if (s->flags & SLAB_CACHE_DMA32)
		*p++ = 'D';
	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		*p++ = 'a';
	if (s->flags & SLAB_CONSISTENCY_CHECKS)
		*p++ = 'F';
	if (s->flags & SLAB_ACCOUNT)
		*p++ = 'A';
	if (p != name + 1)
		*p++ = '-';
	p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);

	if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
		kfree(name);
		return ERR_PTR(-EINVAL);
	}
	kmsan_unpoison_memory(name, p - name);
	return name;
}
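/*
 * Example (illustrative): a mergeable cache of size 192 with
 * SLAB_CACHE_DMA and SLAB_RECLAIM_ACCOUNT set gets the id
 * ":da-0000192"; with no flags set the '-' is omitted and the id is
 * simply ":0000192".
 */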
static int sysfs_slab_add(struct kmem_cache *s)
{
	int err;
	const char *name;
	struct kset *kset = cache_kset(s);
	int unmergeable = slab_unmergeable(s);

	if (!unmergeable && disable_higher_order_debug &&
			(slub_debug & DEBUG_METADATA_FLAGS))
		unmergeable = 1;

	if (unmergeable) {
		/*
		 * Slabcache can never be merged so we can use the name proper.
		 * This is typically the case for debug situations. In that
		 * case we can catch duplicate names easily.
		 */
		sysfs_remove_link(&slab_kset->kobj, s->name);
		name = s->name;
	} else {
		/*
		 * Create a unique name for the slab as a target
		 * for the symlinks.
		 */
		name = create_unique_id(s);
		if (IS_ERR(name))
			return PTR_ERR(name);
	}

	s->kobj.kset = kset;
	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
	if (err)
		goto out;

	err = sysfs_create_group(&s->kobj, &slab_attr_group);
	if (err)
		goto out_del_kobj;

	if (!unmergeable) {
		/* Setup first alias */
		sysfs_slab_alias(s, s->name);
	}
out:
	if (!unmergeable)
		kfree(name);
	return err;
out_del_kobj:
	kobject_del(&s->kobj);
	goto out;
}
void sysfs_slab_unlink(struct kmem_cache *s)
{
	if (slab_state >= FULL)
		kobject_del(&s->kobj);
}

void sysfs_slab_release(struct kmem_cache *s)
{
	if (slab_state >= FULL)
		kobject_put(&s->kobj);
}
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
	struct kmem_cache *s;
	const char *name;
	struct saved_alias *next;
};

static struct saved_alias *alias_list;

static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
	struct saved_alias *al;

	if (slab_state == FULL) {
		/*
		 * If we have a leftover link then remove it.
		 */
		sysfs_remove_link(&slab_kset->kobj, name);
		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
	}

	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
	if (!al)
		return -ENOMEM;

	al->s = s;
	al->name = name;
	al->next = alias_list;
	alias_list = al;
	kmsan_unpoison_memory(al, sizeof(*al));
	return 0;
}
static int __init slab_sysfs_init(void)
{
	struct kmem_cache *s;
	int err;

	mutex_lock(&slab_mutex);

	slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
	if (!slab_kset) {
		mutex_unlock(&slab_mutex);
		pr_err("Cannot register slab subsystem.\n");
		return -ENOMEM;
	}

	slab_state = FULL;

	list_for_each_entry(s, &slab_caches, list) {
		err = sysfs_slab_add(s);
		if (err)
			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
			       s->name);
	}

	while (alias_list) {
		struct saved_alias *al = alias_list;

		alias_list = alias_list->next;
		err = sysfs_slab_alias(al->s, al->name);
		if (err)
			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
			       al->name);
		kfree(al);
	}

	mutex_unlock(&slab_mutex);
	return 0;
}
late_initcall(slab_sysfs_init);
#endif /* SLAB_SUPPORTS_SYSFS */
#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
static int slab_debugfs_show(struct seq_file *seq, void *v)
{
	struct loc_track *t = seq->private;
	struct location *l;
	unsigned long idx;

	idx = (unsigned long) t->idx;
	if (idx < t->count) {
		l = &t->loc[idx];

		seq_printf(seq, "%7ld ", l->count);

		if (l->addr)
			seq_printf(seq, "%pS", (void *)l->addr);
		else
			seq_puts(seq, "<not-available>");

		if (l->waste)
			seq_printf(seq, " waste=%lu/%lu",
				l->count * l->waste, l->waste);

		if (l->sum_time != l->min_time) {
			seq_printf(seq, " age=%ld/%llu/%ld",
				l->min_time, div_u64(l->sum_time, l->count),
				l->max_time);
		} else
			seq_printf(seq, " age=%ld", l->min_time);

		if (l->min_pid != l->max_pid)
			seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
		else
			seq_printf(seq, " pid=%ld",
				l->min_pid);

		if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
			seq_printf(seq, " cpus=%*pbl",
				 cpumask_pr_args(to_cpumask(l->cpus)));

		if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
			seq_printf(seq, " nodes=%*pbl",
				 nodemask_pr_args(&l->nodes));

#ifdef CONFIG_STACKDEPOT
		{
			depot_stack_handle_t handle;
			unsigned long *entries;
			unsigned int nr_entries, j;

			handle = READ_ONCE(l->handle);
			if (handle) {
				nr_entries = stack_depot_fetch(handle, &entries);
				seq_puts(seq, "\n");
				for (j = 0; j < nr_entries; j++)
					seq_printf(seq, "        %pS\n", (void *)entries[j]);
			}
		}
#endif
		seq_puts(seq, "\n");
	}

	if (!idx && !t->count)
		seq_puts(seq, "No data\n");

	return 0;
}
*seq
, void *v
)
6341 static void *slab_debugfs_next(struct seq_file
*seq
, void *v
, loff_t
*ppos
)
6343 struct loc_track
*t
= seq
->private;
6346 if (*ppos
<= t
->count
)
static int cmp_loc_by_count(const void *a, const void *b, const void *data)
{
	struct location *loc1 = (struct location *)a;
	struct location *loc2 = (struct location *)b;

	if (loc1->count > loc2->count)
		return -1;
	else
		return 1;
}

static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
{
	struct loc_track *t = seq->private;

	t->idx = *ppos;
	return ppos;
}
= {
6372 .start
= slab_debugfs_start
,
6373 .next
= slab_debugfs_next
,
6374 .stop
= slab_debugfs_stop
,
6375 .show
= slab_debugfs_show
,
static int slab_debug_trace_open(struct inode *inode, struct file *filep)
{
	int node;
	struct kmem_cache_node *n;
	enum track_item alloc;
	struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
						sizeof(struct loc_track));
	struct kmem_cache *s = file_inode(filep)->i_private;
	unsigned long *obj_map;

	if (!t)
		return -ENOMEM;

	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
	if (!obj_map) {
		seq_release_private(inode, filep);
		return -ENOMEM;
	}

	if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
		alloc = TRACK_ALLOC;
	else
		alloc = TRACK_FREE;

	if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
		bitmap_free(obj_map);
		seq_release_private(inode, filep);
		return -ENOMEM;
	}

	for_each_kmem_cache_node(s, node, n) {
		unsigned long flags;
		struct slab *slab;

		if (!node_nr_slabs(n))
			continue;

		spin_lock_irqsave(&n->list_lock, flags);
		list_for_each_entry(slab, &n->partial, slab_list)
			process_slab(t, s, slab, alloc, obj_map);
		list_for_each_entry(slab, &n->full, slab_list)
			process_slab(t, s, slab, alloc, obj_map);
		spin_unlock_irqrestore(&n->list_lock, flags);
	}

	/* Sort locations by count */
	sort_r(t->loc, t->count, sizeof(struct location),
		cmp_loc_by_count, NULL, NULL);

	bitmap_free(obj_map);
	return 0;
}
static int slab_debug_trace_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct loc_track *t = seq->private;

	free_loc_track(t);
	return seq_release_private(inode, file);
}

static const struct file_operations slab_debugfs_fops = {
	.open    = slab_debug_trace_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = slab_debug_trace_release,
};
static void debugfs_slab_add(struct kmem_cache *s)
{
	struct dentry *slab_cache_dir;

	if (unlikely(!slab_debugfs_root))
		return;

	slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);

	debugfs_create_file("alloc_traces", 0400,
		slab_cache_dir, s, &slab_debugfs_fops);

	debugfs_create_file("free_traces", 0400,
		slab_cache_dir, s, &slab_debugfs_fops);
}

void debugfs_slab_release(struct kmem_cache *s)
{
	debugfs_lookup_and_remove(s->name, slab_debugfs_root);
}
static int __init slab_debugfs_init(void)
{
	struct kmem_cache *s;

	slab_debugfs_root = debugfs_create_dir("slab", NULL);

	list_for_each_entry(s, &slab_caches, list)
		if (s->flags & SLAB_STORE_USER)
			debugfs_slab_add(s);

	return 0;
}
__initcall(slab_debugfs_init);
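/*
 * Example usage (illustrative): for caches created with SLAB_STORE_USER,
 * the allocation sites can be inspected with e.g.
 *
 *	# cat /sys/kernel/debug/slab/kmalloc-64/alloc_traces
 *
 * which prints one line per unique (address, stack) location, sorted by
 * count, in the format produced by slab_debugfs_show() above.
 */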
/*
 * The /proc/slabinfo ABI
 */
#ifdef CONFIG_SLUB_DEBUG
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
	unsigned long nr_slabs = 0;
	unsigned long nr_objs = 0;
	unsigned long nr_free = 0;
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n) {
		nr_slabs += node_nr_slabs(n);
		nr_objs += node_nr_objs(n);
		nr_free += count_partial(n, count_free);
	}

	sinfo->active_objs = nr_objs - nr_free;
	sinfo->num_objs = nr_objs;
	sinfo->active_slabs = nr_slabs;
	sinfo->num_slabs = nr_slabs;
	sinfo->objects_per_slab = oo_objects(s->oo);
	sinfo->cache_order = oo_order(s->oo);
}
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{
}

ssize_t slabinfo_write(struct file *file, const char __user *buffer,
		       size_t count, loff_t *ppos)
{
	return -EIO;
}
#endif /* CONFIG_SLUB_DEBUG */
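/*
 * Worked example for get_slabinfo() (illustrative): with 3 slabs of 64
 * objects each and 20 free objects counted across the partial lists,
 * nr_objs = 192 and active_objs = 192 - 20 = 172, which is what
 * /proc/slabinfo reports in its <active_objs> and <num_objs> columns.
 */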