From: Vlastimil Babka (SUSE)
Date: Wed, 11 Mar 2026 08:25:56 +0000 (+0100)
Subject: slab: create barns for online memoryless nodes
X-Git-Tag: v7.1-rc1~165^2~4
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7f693882f00963fd7808e333b86a87e0f9b9873b;p=thirdparty%2Fkernel%2Fstable.git

slab: create barns for online memoryless nodes

Ming Lei has reported [1] a performance regression due to replacing cpu
(partial) slabs with sheaves. With slub stats enabled, a large number of
slowpath allocations was observed. The affected system has 8 online NUMA
nodes, but only 2 of them have memory.

For sheaves to work effectively on a given cpu, its NUMA node has to have
a struct node_barn allocated. Those are currently allocated only on nodes
with memory (N_MEMORY), where kmem_cache_node also exists, as the goal is
to cache only node-local objects. But in order to have good performance
on a memoryless node, we need its barn to exist and use sheaves to cache
non-local objects (as no local objects can exist anyway).

Therefore change the implementation to allocate barns on all online
nodes, tracked in a new nodemask slab_barn_nodes. Also add a cpu hotplug
callback, as that is when a memoryless node can become online. Change
both get_barn() and the rcu_sheaf->node assignment to use numa_node_id()
so that a sheaf is returned to the barn of the local cpu's (potentially
memoryless) node, and no longer to the nearest node with memory.

On systems with CONFIG_HAVE_MEMORYLESS_NODES=y (which are not the main
target of this change) barns did not exist on memoryless nodes, but
get_barn() using numa_mem_id() meant a barn was returned from the nearest
node with memory. This works, but the barn lock contention increases with
every such memoryless node. With this change, a barn will also be
allocated on each memoryless node, reducing this contention in exchange
for increased memory consumption.

Reported-by: Ming Lei
Link: https://lore.kernel.org/all/aZ0SbIqaIkwoW2mB@fedora/ [1]
Link: https://patch.msgid.link/20260311-b4-slab-memoryless-barns-v1-2-70ab850be4ce@kernel.org
Signed-off-by: Vlastimil Babka (SUSE)
Reviewed-by: Harry Yoo
Reviewed-by: Hao Li
---

diff --git a/mm/slub.c b/mm/slub.c
index 6f65cc1361084..73fe34014c6ea 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -453,7 +453,7 @@ static inline struct node_barn *get_barn_node(struct kmem_cache *s, int node)
  */
 static inline struct node_barn *get_barn(struct kmem_cache *s)
 {
-	return get_barn_node(s, numa_mem_id());
+	return get_barn_node(s, numa_node_id());
 }
 
 /*
@@ -472,6 +472,12 @@ static inline struct node_barn *get_barn(struct kmem_cache *s)
  */
 static nodemask_t slab_nodes;
 
+/*
+ * Similar to slab_nodes but for where we have node_barn allocated.
+ * Corresponds to N_ONLINE nodes.
+ */
+static nodemask_t slab_barn_nodes;
+
 /*
  * Workqueue used for flushing cpu and kfree_rcu sheaves.
 */
@@ -4062,6 +4068,51 @@ void flush_all_rcu_sheaves(void)
 	rcu_barrier();
 }
 
+static int slub_cpu_setup(unsigned int cpu)
+{
+	int nid = cpu_to_node(cpu);
+	struct kmem_cache *s;
+	int ret = 0;
+
+	/*
+	 * we never clear a nid so it's safe to do a quick check before taking
+	 * the mutex, and then recheck to handle parallel cpu hotplug safely
+	 */
+	if (node_isset(nid, slab_barn_nodes))
+		return 0;
+
+	mutex_lock(&slab_mutex);
+
+	if (node_isset(nid, slab_barn_nodes))
+		goto out;
+
+	list_for_each_entry(s, &slab_caches, list) {
+		struct node_barn *barn;
+
+		/*
+		 * barn might already exist if a previous callback failed midway
+		 */
+		if (!cache_has_sheaves(s) || get_barn_node(s, nid))
+			continue;
+
+		barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
+
+		if (!barn) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		barn_init(barn);
+		s->per_node[nid].barn = barn;
+	}
+	node_set(nid, slab_barn_nodes);
+
+out:
+	mutex_unlock(&slab_mutex);
+
+	return ret;
+}
+
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
@@ -5916,7 +5967,7 @@ do_free:
 		rcu_sheaf = NULL;
 	} else {
 		pcs->rcu_free = NULL;
-		rcu_sheaf->node = numa_mem_id();
+		rcu_sheaf->node = numa_node_id();
 	}
 
 	/*
@@ -7577,7 +7628,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
 	if (slab_state == DOWN || !cache_has_sheaves(s))
 		return 1;
 
-	for_each_node_mask(node, slab_nodes) {
+	for_each_node_mask(node, slab_barn_nodes) {
 		struct node_barn *barn;
 
 		barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
@@ -8230,6 +8281,7 @@ static int slab_mem_going_online_callback(int nid)
 	 * and barn initialized for the new node.
 	 */
 	node_set(nid, slab_nodes);
+	node_set(nid, slab_barn_nodes);
 out:
 	mutex_unlock(&slab_mutex);
 	return ret;
@@ -8308,7 +8360,7 @@ static void __init bootstrap_cache_sheaves(struct kmem_cache *s)
 	if (!capacity)
 		return;
 
-	for_each_node_mask(node, slab_nodes) {
+	for_each_node_mask(node, slab_barn_nodes) {
 		struct node_barn *barn;
 
 		barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
@@ -8380,6 +8432,9 @@ void __init kmem_cache_init(void)
 	for_each_node_state(node, N_MEMORY)
 		node_set(node, slab_nodes);
 
+	for_each_online_node(node)
+		node_set(node, slab_barn_nodes);
+
 	create_boot_cache(kmem_cache_node, "kmem_cache_node",
 			sizeof(struct kmem_cache_node),
 			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
@@ -8406,7 +8461,7 @@ void __init kmem_cache_init(void)
 	/* Setup random freelists for each cache */
 	init_freelist_randomization();
 
-	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
+	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", slub_cpu_setup,
 				  slub_cpu_dead);
 
 	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
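
For readers without the full mm/slub.c context: get_barn_node() is used
above but its body is not part of this diff. A minimal sketch follows,
assuming it is a plain per-node array lookup consistent with the
s->per_node[nid].barn assignment in slub_cpu_setup(); the actual
definition in mm/slub.c may differ.

	static inline struct node_barn *get_barn_node(struct kmem_cache *s,
						      int node)
	{
		/* assumed layout: one barn pointer per node, per cache */
		return s->per_node[node].barn;
	}

Under that assumption, once barns exist for every node in
slab_barn_nodes, get_barn(s) == get_barn_node(s, numa_node_id())
succeeds on a memoryless node as well, instead of contending on the
barn of the nearest node with memory via numa_mem_id().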