fork: stop ignoring NUMA while handling cached thread stacks
author    Mateusz Guzik <mjguzik@gmail.com>
          Thu, 20 Nov 2025 05:40:15 +0000 (06:40 +0100)
committer Andrew Morton <akpm@linux-foundation.org>
          Thu, 27 Nov 2025 22:24:31 +0000 (14:24 -0800)
Two problems with the current handling of cached thread stacks:

1. The numa parameter was ignored outright.
2. Nothing was done to check whether the to-be-cached/allocated stack
   matches the local node.

The node id remains ignored on free in the case of memoryless nodes.
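
For reference, the pre-patch per-CPU lookup (visible as removed lines in the
diff below) illustrates both points: it pops whichever slot happens to be
populated, never consulting the node argument or checking which node backs
the cached pages:

        /* pre-patch allocation path, excerpted: node is never consulted */
        for (i = 0; i < NR_CACHED_STACKS; i++) {
                vm_area = this_cpu_xchg(cached_stacks[i], NULL);
                if (!vm_area)
                        continue;
                /* ... reuse vm_area regardless of which node backs its pages ... */
        }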

Note the current caching is already in bad shape, as the cache keeps
overflowing; a different solution is needed for the long run, to be worked
out(tm).
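
For context, the free path (paraphrased here; its structure is unchanged by
this patch) only has NR_CACHED_STACKS per-CPU slots to put a stack into. A
stack that finds all slots occupied, or that fails the new locality check,
goes straight back to vmalloc, which is what "dropped" means in the stats
below:

        /* free path, roughly: a stack that cannot be cached is freed */
        if (!try_release_thread_stack_to_cache(vm_area))
                vfree(vm_area->addr);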

Stats collected over a kernel build with the patch applied, on the following
topology:
  NUMA node(s):              2
  NUMA node0 CPU(s):         0-11
  NUMA node1 CPU(s):         12-23

caller's node vs stack backing pages on free:
matching: 50083 (70%)
mismatched: 21492 (30%)

caching efficiency:
cached: 32651 (65.2%)
dropped: 17432 (34.8%)
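
The stats above are not produced by the patch itself. Instrumentation along
the following lines could be used to collect the matching/mismatched split;
the counter names and the helper are hypothetical, for illustration only
(the cached/dropped split follows directly from the return value of
try_release_thread_stack_to_cache()):

        /* hypothetical debug counters, not part of this patch */
        static DEFINE_PER_CPU(unsigned long, stack_nid_matched);
        static DEFINE_PER_CPU(unsigned long, stack_nid_mismatched);

        static void count_stack_locality(struct vm_struct *vm_area)
        {
                unsigned int i;

                /* stay on one CPU so numa_node_id() is stable while sampling */
                guard(preempt)();

                for (i = 0; i < vm_area->nr_pages; i++) {
                        if (page_to_nid(vm_area->pages[i]) != numa_node_id()) {
                                this_cpu_inc(stack_nid_mismatched);
                                return;
                        }
                }
                this_cpu_inc(stack_nid_matched);
        }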

Link: https://lkml.kernel.org/r/20251120054015.3019419-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a95eea55373a3f266fbfddd6fc9943..17fcb75ca5d57f13dded5fdb1c44a18f7a41a935 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -208,15 +208,62 @@ struct vm_stack {
        struct vm_struct *stack_vm_area;
 };
 
+static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node)
+{
+       struct vm_struct *vm_area;
+       unsigned int i;
+
+       /*
+        * If the node has memory, we are guaranteed the stacks are backed by local pages.
+        * Otherwise the pages are arbitrary.
+        *
+        * Note that depending on cpuset it is possible we will get migrated to a different
+        * node immediately after allocating here, so this does *not* guarantee locality for
+        * arbitrary callers.
+        */
+       scoped_guard(preempt) {
+               if (node != NUMA_NO_NODE && numa_node_id() != node)
+                       return NULL;
+
+               for (i = 0; i < NR_CACHED_STACKS; i++) {
+                       vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+                       if (vm_area)
+                               return vm_area;
+               }
+       }
+
+       return NULL;
+}
+
 static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
 {
        unsigned int i;
+       int nid;
 
-       for (i = 0; i < NR_CACHED_STACKS; i++) {
-               struct vm_struct *tmp = NULL;
+       /*
+        * Don't cache stacks if any of the pages don't match the local domain, unless
+        * there is no local memory to begin with.
+        *
+        * Note that lack of local memory does not automatically mean it makes no difference
+        * performance-wise which other domain backs the stack. In this case we are merely
+        * trying to avoid constantly going to vmalloc.
+        */
+       scoped_guard(preempt) {
+               nid = numa_node_id();
+               if (node_state(nid, N_MEMORY)) {
+                       for (i = 0; i < vm_area->nr_pages; i++) {
+                               struct page *page = vm_area->pages[i];
+                               if (page_to_nid(page) != nid)
+                                       return false;
+                       }
+               }
+
+               for (i = 0; i < NR_CACHED_STACKS; i++) {
+                       struct vm_struct *tmp = NULL;
 
-               if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
-                       return true;
+                       if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
+                               return true;
+               }
        }
        return false;
 }
@@ -283,13 +330,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
        struct vm_struct *vm_area;
        void *stack;
-       int i;
-
-       for (i = 0; i < NR_CACHED_STACKS; i++) {
-               vm_area = this_cpu_xchg(cached_stacks[i], NULL);
-               if (!vm_area)
-                       continue;
 
+       vm_area = alloc_thread_stack_node_from_cache(tsk, node);
+       if (vm_area) {
                if (memcg_charge_kernel_stack(vm_area)) {
                        vfree(vm_area->addr);
                        return -ENOMEM;