sched/cache: Limit the scan number of CPUs when calculating task occupancy

author Chen Yu <yu.c.chen@intel.com>

Wed, 1 Apr 2026 21:52:14 +0000 (14:52 -0700)

committer Peter Zijlstra <peterz@infradead.org>

Thu, 9 Apr 2026 13:49:47 +0000 (15:49 +0200)
author Chen Yu <yu.c.chen@intel.com>
Wed, 1 Apr 2026 21:52:14 +0000 (14:52 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 9 Apr 2026 13:49:47 +0000 (15:49 +0200)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index c9cd064223e51f6e943055ab61394a0b5f2ebbae..a55ada22e40c7d67f4c8d6f80a80db8ea8d2e7a9 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1522,6 +1522,51 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
         }
  }
  
+/*
+ * get_scan_cpumasks - select the CPUs to scan for @p's cache occupancy
+ * @cpus: output mask; fully overwritten, its previous contents are ignored
+ * @p:    task whose process (p->mm) is being scanned
+ *
+ * With NUMA balancing enabled and a valid preferred node for @p, @cpus is
+ * set to the union of the CPUs of:
+ *   - @p's NUMA preferred node,
+ *   - the node of the process's preferred LLC CPU (p->mm->sc_stat.cpu),
+ *     if one is set,
+ *   - the node of the CPU @p is currently on.
+ *
+ * Otherwise (CONFIG_NUMA_BALANCING off, sched_numa_balancing disabled, or
+ * no preferred node) @cpus is set to cpu_online_mask.
+ */
+static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p)
+{
+#ifdef CONFIG_NUMA_BALANCING
+       int cpu, curr_cpu, pref_nid;
+       /* Stays NUMA_NO_NODE unless the process has a preferred LLC CPU. */
+       int nid = NUMA_NO_NODE;
+
+       if (!static_branch_likely(&sched_numa_balancing))
+               goto out;
+
+       cpu = p->mm->sc_stat.cpu;
+       if (cpu != -1)
+               nid = cpu_to_node(cpu);
+       curr_cpu = task_cpu(p);
+
+       /*
+        * Scanning in the preferred NUMA node is ideal. However, the NUMA
+        * preferred node is per-task rather than per-process. It is possible
+        * for different threads of the process to have distinct preferred
+        * nodes; consequently, the process-wide preferred LLC may bounce
+        * between different nodes. As a workaround, maintain the scan
+        * CPU mask to also cover the process's current preferred LLC and the
+        * current running node to mitigate the bouncing risk.
+        * TBD: numa_group should be considered during task aggregation.
+        */
+       pref_nid = p->numa_preferred_nid;
+       /* honor the task's preferred node */
+       if (pref_nid == NUMA_NO_NODE)
+               goto out;
+
+       /*
+        * Seed the mask with a copy rather than OR-ing into @cpus, so the
+        * result does not depend on the caller having cleared the mask.
+        */
+       cpumask_copy(cpus, cpumask_of_node(pref_nid));
+
+       /* honor the task's preferred LLC CPU */
+       if (cpu != -1 && nid != NUMA_NO_NODE && !cpumask_test_cpu(cpu, cpus))
+               cpumask_or(cpus, cpus, cpumask_of_node(nid));
+
+       /* make sure the task's current running node is included */
+       if (!cpumask_test_cpu(curr_cpu, cpus))
+               cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu)));
+
+       return;
+
+out:
+#endif
+       /* No usable NUMA hint: fall back to scanning every online CPU. */
+       cpumask_copy(cpus, cpu_online_mask);
+}
+
  static void task_cache_work(struct callback_head *work)
  {
         struct task_struct *p = current;
@@ -1544,7 +1589,7 @@ static void task_cache_work(struct callback_head *work)
         scoped_guard (cpus_read_lock) {
                 guard(rcu)();
  
-               cpumask_copy(cpus, cpu_online_mask);
+               get_scan_cpumasks(cpus, p);
  
                 for_each_cpu(cpu, cpus) {
                         /* XXX sched_cluster_active */
author	Chen Yu <yu.c.chen@intel.com>
	Wed, 1 Apr 2026 21:52:14 +0000 (14:52 -0700)
committer	Peter Zijlstra <peterz@infradead.org>
	Thu, 9 Apr 2026 13:49:47 +0000 (15:49 +0200)