sched/cache: Introduce infrastructure for cache-aware load balancing

author Peter Zijlstra (Intel) <peterz@infradead.org>

Wed, 1 Apr 2026 21:52:13 +0000 (14:52 -0700)

committer Peter Zijlstra <peterz@infradead.org>

Thu, 9 Apr 2026 13:49:47 +0000 (15:49 +0200)
author Peter Zijlstra (Intel) <peterz@infradead.org>
Wed, 1 Apr 2026 21:52:13 +0000 (14:52 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 9 Apr 2026 13:49:47 +0000 (15:49 +0200)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index 3cc8ae722886012594076f41a5a82581fe420956..67b2dfcc71ea44a7ae3ef7aaaf152b451884660a 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1173,6 +1173,8 @@ struct mm_struct {
                 /* MM CID related storage */
                 struct mm_mm_cid mm_cid;
  
+               /* sched_cache related statistics */
+               struct sched_cache_stat sc_stat;
  #ifdef CONFIG_MMU
                 atomic_long_t pgtables_bytes;   /* size of all page tables */
  #endif
@@ -1575,6 +1577,36 @@ static inline unsigned int mm_cid_size(void)
  # define MM_CID_STATIC_SIZE    0
  #endif /* CONFIG_SCHED_MM_CID */
  
+#ifdef CONFIG_SCHED_CACHE
+void mm_init_sched(struct mm_struct *mm,
+                  struct sched_cache_time __percpu *pcpu_sched);
+
+static inline int mm_alloc_sched_noprof(struct mm_struct *mm)  /* returns 0 or -ENOMEM */
+{
+       struct sched_cache_time __percpu *pcpu_sched =
+               alloc_percpu_noprof(struct sched_cache_time);   /* one sched_cache_time per CPU */
+
+       if (!pcpu_sched)
+               return -ENOMEM; /* per-CPU allocation failed; mm is left untouched */
+
+       mm_init_sched(mm, pcpu_sched);  /* initializes mm->sc_stat and stores pcpu_sched in it */
+       return 0;
+}
+
+#define mm_alloc_sched(...)    alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
+
+static inline void mm_destroy_sched(struct mm_struct *mm)      /* undo mm_alloc_sched() */
+{
+       free_percpu(mm->sc_stat.pcpu_sched);    /* release the per-CPU sched_cache_time storage */
+       mm->sc_stat.pcpu_sched = NULL;          /* the fair.c users bail out on a NULL pcpu_sched */
+}
+#else /* !CONFIG_SCHED_CACHE */
+
+static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_sched(struct mm_struct *mm) { }
+
+#endif /* CONFIG_SCHED_CACHE */
+
  struct mmu_gather;
  extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
  extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 8ec3b6d7d718b61e6ce964b073dd69d2a7dce9f8..2bf261bcd7b6fa844068b3602e3d8e58bb7fc2ce 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1407,6 +1407,10 @@ struct task_struct {
         unsigned long                   numa_pages_migrated;
  #endif /* CONFIG_NUMA_BALANCING */
  
+#ifdef CONFIG_SCHED_CACHE
+       struct callback_head            cache_work;
+#endif
+
         struct rseq_data                rseq;
         struct sched_mm_cid             mm_cid;
  
@@ -2407,6 +2411,26 @@ static __always_inline int task_mm_cid(struct task_struct *t)
  }
  #endif
  
+#ifdef CONFIG_SCHED_CACHE
+
+struct sched_cache_time {      /* per-CPU, per-mm runtime sample */
+       u64 runtime;            /* accumulated delta_exec, halved once per elapsed epoch */
+       unsigned long epoch;    /* rq->cpu_epoch value this entry was last synced to */
+};
+
+struct sched_cache_stat {      /* per-mm state embedded in mm_struct as sc_stat */
+       struct sched_cache_time __percpu *pcpu_sched;   /* NULL means "no statistics" to the users in fair.c */
+       raw_spinlock_t lock;    /* held by task_tick_cache() while queuing the task work */
+       unsigned long epoch;    /* rq epoch recorded when the task work was last queued */
+       int cpu;                /* CPU picked by task_cache_work(); -1 when there is none */
+} ____cacheline_aligned_in_smp;
+
+#else
+
+struct sched_cache_stat { };
+
+#endif
+
  #ifndef MODULE
  #ifndef COMPILE_OFFSETS
  
diff --git a/init/Kconfig b/init/Kconfig

index 7484cd703bc1ab263345f2c6ef38a13bda4a046e..2dfd4744d1d4dd02ecb4b658269f101dd0d11c66 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1005,6 +1005,17 @@ config NUMA_BALANCING
  
           This system will be inactive on UMA systems.
  
+config SCHED_CACHE
+       bool "Cache aware load balance"
+       default y
+       depends on SMP
+       help
+         When enabled, the scheduler will attempt to aggregate tasks from
+         the same process onto a single Last Level Cache (LLC) domain when
+         possible. This improves cache locality by keeping tasks that share
+         resources within the same cache domain, reducing cache misses and
+         lowering data access latency.
+
  config NUMA_BALANCING_DEFAULT_ENABLED
         bool "Automatically enable NUMA aware memory/task placement"
         default y
diff --git a/kernel/fork.c b/kernel/fork.c

index 079802cb61002ab43d89e0efee663fc93f1387f9..61042bc3482d2157921074c0947860a5e500ea2a 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -724,6 +724,7 @@ void __mmdrop(struct mm_struct *mm)
         cleanup_lazy_tlbs(mm);
  
         WARN_ON_ONCE(mm == current->active_mm);
+       mm_destroy_sched(mm);
         mm_free_pgd(mm);
         mm_free_id(mm);
         destroy_context(mm);
@@ -1125,6 +1126,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
         if (mm_alloc_cid(mm, p))
                 goto fail_cid;
  
+       if (mm_alloc_sched(mm))
+               goto fail_sched;
+
         if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
                                      NR_MM_COUNTERS))
                 goto fail_pcpu;
@@ -1134,6 +1138,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
         return mm;
  
  fail_pcpu:
+       mm_destroy_sched(mm);
+fail_sched:
         mm_destroy_cid(mm);
  fail_cid:
         destroy_context(mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 49cd5d21716130c51f835d2524aed0ae3cfa59a9..7e0b55e7ef5c83208f680b3d3fbb75ffc0a799e9 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4434,6 +4434,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
         init_numa_balancing(clone_flags, p);
         p->wake_entry.u_flags = CSD_TYPE_TTWU;
         p->migration_pending = NULL;
+       init_sched_mm(p);
  }
  
  DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -8962,6 +8963,11 @@ void __init sched_init(void)
  
                 rq->core_cookie = 0UL;
  #endif
+#ifdef CONFIG_SCHED_CACHE
+               raw_spin_lock_init(&rq->cpu_epoch_lock);
+               rq->cpu_epoch_next = jiffies;
+#endif
+
                 zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
         }
  
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 12890ef16603bff822c5a8160bc2c2e2f9a40427..c9cd064223e51f6e943055ab61394a0b5f2ebbae 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1321,6 +1321,8 @@ void post_init_entity_util_avg(struct task_struct *p)
         sa->runnable_avg = sa->util_avg;
  }
  
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec);
+
  static s64 update_se(struct rq *rq, struct sched_entity *se)
  {
         u64 now = rq_clock_task(rq);
@@ -1343,6 +1345,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
  
                 trace_sched_stat_runtime(running, delta_exec);
                 account_group_exec_runtime(running, delta_exec);
+               account_mm_sched(rq, running, delta_exec);
  
                 /* cgroup time is always accounted against the donor */
                 cgroup_account_cputime(donor, delta_exec);
@@ -1364,6 +1367,267 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
  
  static void set_next_buddy(struct sched_entity *se);
  
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * XXX numbers come from a place the sun don't shine -- probably wants to be SD
+ * tunable or so.
+ *
+ * EPOCH_PERIOD is expressed in jiffies and is used as a divisor in
+ * __update_mm_sched(). A plain HZ / 100 evaluates to 0 when HZ < 100, which
+ * would turn that division into a divide-by-zero, so clamp it to one jiffy.
+ * EPOCH_LLC_AFFINITY_TIMEOUT is a number of epochs.
+ */
+#define EPOCH_PERIOD   max(HZ / 100, 1)        /* ~10 ms, never less than 1 jiffy */
+#define EPOCH_LLC_AFFINITY_TIMEOUT     5       /* epochs, ~50 ms */
+
+static int llc_id(int cpu)     /* LLC id of @cpu, or -1 when @cpu is negative */
+{
+       if (cpu < 0)
+               return -1;      /* callers pass mm->sc_stat.cpu, which is -1 when unset */
+
+       return per_cpu(sd_llc_id, cpu);
+}
+
+void mm_init_sched(struct mm_struct *mm,
+                  struct sched_cache_time __percpu *_pcpu_sched)
+{
+       unsigned long epoch = 0;
+       int i;
+
+       for_each_possible_cpu(i) {
+               struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+               struct rq *rq = cpu_rq(i);
+
+               pcpu_sched->runtime = 0;
+               /*
+                * rq->cpu_epoch is read without rq->cpu_epoch_lock; a
+                * slightly stale cpu epoch is acceptable.
+                */
+               pcpu_sched->epoch = rq->cpu_epoch;
+               epoch = rq->cpu_epoch;  /* ends up holding the last iterated CPU's epoch */
+       }
+
+       raw_spin_lock_init(&mm->sc_stat.lock);
+       mm->sc_stat.epoch = epoch;
+       mm->sc_stat.cpu = -1;   /* no preferred CPU yet */
+
+       /*
+        * Store mm->sc_stat.pcpu_sched last, with release semantics, so
+        * that the initialization of the per-CPU entries and of the other
+        * sc_stat fields above is ordered before it. Readers test
+        * pcpu_sched for NULL before touching sc_stat and must not
+        * observe an uninitialized sc_stat.epoch, etc.
+        */
+       smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched);
+}
+
+/*
+ * because why would C be fully specified
+ *
+ * Right-shift *val by n bits in place. Shifting a 64-bit value by 64 or
+ * more bits is undefined behaviour in C, so that case is handled
+ * explicitly and yields 0.
+ */
+static __always_inline void __shr_u64(u64 *val, unsigned int n)
+{
+       if (n >= 64) {
+               *val = 0;
+               return;
+       }
+       *val >>= n;
+}
+
+static inline void __update_mm_sched(struct rq *rq,
+                                    struct sched_cache_time *pcpu_sched)
+{
+       lockdep_assert_held(&rq->cpu_epoch_lock);       /* caller must hold the rq epoch lock */
+
+       unsigned long n, now = jiffies;
+       long delta = now - rq->cpu_epoch_next;  /* > 0 once the next epoch boundary has passed */
+
+       if (delta > 0) {
+               n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;  /* elapsed epochs, rounded up */
+               rq->cpu_epoch += n;
+               rq->cpu_epoch_next += n * EPOCH_PERIOD;
+               __shr_u64(&rq->cpu_runtime, n); /* halve the rq total once per elapsed epoch */
+       }
+
+       n = rq->cpu_epoch - pcpu_sched->epoch;  /* epochs this per-mm entry lags behind the rq */
+       if (n) {
+               pcpu_sched->epoch += n;
+               __shr_u64(&pcpu_sched->runtime, n);     /* apply the same per-epoch halving */
+       }
+}
+
+static unsigned long fraction_mm_sched(struct rq *rq,
+                                      struct sched_cache_time *pcpu_sched)
+{
+       guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+
+       __update_mm_sched(rq, pcpu_sched);      /* bring both runtimes up to the current epoch */
+
+       /*
+        * Runtime is a geometric series (r=0.5) and as such will sum to twice
+        * the accumulation period, this means the multiplication here should
+        * not overflow.
+        *
+        * The result is this entry's runtime relative to the rq's total
+        * runtime, scaled by NICE_0_LOAD; the +1 keeps the divisor non-zero.
+        */
+       return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
+}
+
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+       struct sched_cache_time *pcpu_sched;
+       struct mm_struct *mm = p->mm;
+       unsigned long epoch;
+
+       if (!sched_cache_enabled())
+               return;
+
+       if (p->sched_class != &fair_sched_class)
+               return; /* only fair-class tasks are accounted */
+       /*
+        * init_task, kthreads and user thread created
+        * by user_mode_thread() don't have mm.
+        */
+       if (!mm || !mm->sc_stat.pcpu_sched)
+               return;
+
+       pcpu_sched = per_cpu_ptr(p->mm->sc_stat.pcpu_sched, cpu_of(rq));       /* this rq's entry for the mm */
+
+       scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+               __update_mm_sched(rq, pcpu_sched);      /* decay before adding the new runtime */
+               pcpu_sched->runtime += delta_exec;
+               rq->cpu_runtime += delta_exec;
+               epoch = rq->cpu_epoch;  /* snapshot taken under the lock, used below */
+       }
+
+       /*
+        * mm->sc_stat.epoch is refreshed by task_tick_cache() whenever it
+        * queues task_cache_work(). If that has not happened for more than
+        * EPOCH_LLC_AFFINITY_TIMEOUT epochs, invalidate the preferred state.
+        */
+       if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT) {
+               if (mm->sc_stat.cpu != -1)
+                       mm->sc_stat.cpu = -1;
+       }
+}
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p)      /* called from task_tick_fair() */
+{
+       struct callback_head *work = &p->cache_work;
+       struct mm_struct *mm = p->mm;
+       unsigned long epoch;
+
+       if (!sched_cache_enabled())
+               return;
+
+       if (!mm || !mm->sc_stat.pcpu_sched)
+               return; /* no mm, or no per-CPU statistics to work with */
+
+       epoch = rq->cpu_epoch;
+       /* avoid moving backwards: only act when this rq's epoch is newer than the mm's */
+       if (time_after_eq(mm->sc_stat.epoch, epoch))
+               return;
+
+       guard(raw_spinlock)(&mm->sc_stat.lock);
+
+       if (work->next == work) {       /* self-linked means "not queued", see init_sched_mm() */
+               task_work_add(p, work, TWA_RESUME);     /* queues task_cache_work() for p */
+               WRITE_ONCE(mm->sc_stat.epoch, epoch);   /* read by account_mm_sched() for its timeout */
+       }
+}
+
+static void task_cache_work(struct callback_head *work)        /* task work queued by task_tick_cache() */
+{
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+       unsigned long m_a_occ = 0;      /* largest accumulated LLC occupancy seen so far */
+       unsigned long curr_m_a_occ = 0; /* accumulated occupancy of the currently preferred LLC */
+       int cpu, m_a_cpu = -1;          /* m_a_cpu: busiest CPU inside the LLC that owns m_a_occ */
+       cpumask_var_t cpus;
+
+       WARN_ON_ONCE(work != &p->cache_work);
+
+       work->next = work;      /* self-link again so task_tick_cache() can re-queue the work */
+
+       if (p->flags & PF_EXITING)
+               return;
+
+       if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
+               return;
+
+       scoped_guard (cpus_read_lock) {
+               guard(rcu)();
+
+               cpumask_copy(cpus, cpu_online_mask);    /* CPUs whose LLC has not been visited yet */
+
+               for_each_cpu(cpu, cpus) {
+                       /* XXX sched_cluster_active */
+                       struct sched_domain *sd = per_cpu(sd_llc, cpu);
+                       unsigned long occ, m_occ = 0, a_occ = 0;
+                       int m_cpu = -1, i;
+
+                       if (!sd)
+                               continue;
+
+                       for_each_cpu(i, sched_domain_span(sd)) {
+                               occ = fraction_mm_sched(cpu_rq(i),
+                                                       per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+                               a_occ += occ;   /* sum over the whole LLC */
+                               if (occ > m_occ) {
+                                       m_occ = occ;
+                                       m_cpu = i;      /* CPU with the highest occupancy in this LLC */
+                               }
+                       }
+
+                       /*
+                        * Compare the accumulated occupancy of each LLC. The
+                        * reason for using accumulated occupancy rather than average
+                        * per CPU occupancy is that it works better in asymmetric LLC
+                        * scenarios.
+                        * For example, if there are 2 threads in a 4CPU LLC and 3
+                        * threads in an 8CPU LLC, it might be better to choose the one
+                        * with 3 threads. However, this would not be the case if the
+                        * occupancy is divided by the number of CPUs in an LLC (i.e.,
+                        * if average per CPU occupancy is used).
+                        * Besides, NUMA balancing fault statistics behave similarly:
+                        * the total number of faults per node is compared rather than
+                        * the average number of faults per CPU. This strategy is also
+                        * followed here.
+                        */
+                       if (a_occ > m_a_occ) {
+                               m_a_occ = a_occ;
+                               m_a_cpu = m_cpu;
+                       }
+
+                       if (llc_id(cpu) == llc_id(mm->sc_stat.cpu))
+                               curr_m_a_occ = a_occ;   /* this is the LLC of the current preference */
+
+                       cpumask_andnot(cpus, cpus, sched_domain_span(sd));      /* LLC done: drop its CPUs from the walk */
+               }
+       }
+
+       if (m_a_occ > (2 * curr_m_a_occ)) {
+               /*
+                * Avoid switching sc_stat.cpu too fast.
+                * The reason to choose 2X is because:
+                * 1. It is better to keep the preferred LLC stable,
+                *    rather than changing it frequently and cause migrations
+                * 2. 2X means the new preferred LLC has at least 1 more
+                *    busy CPU than the old one(200% vs 100%, eg)
+                * 3. 2X is chosen based on test results, as it delivers
+                *    the optimal performance gain so far.
+                */
+               mm->sc_stat.cpu = m_a_cpu;
+       }
+
+       free_cpumask_var(cpus);
+}
+
+void init_sched_mm(struct task_struct *p)      /* called from __sched_fork() */
+{
+       struct callback_head *work = &p->cache_work;
+
+       init_task_work(work, task_cache_work);
+       work->next = work;      /* self-link marks the work as "not queued" for task_tick_cache() */
+}
+
+#else /* !CONFIG_SCHED_CACHE */
+
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
+                                   s64 delta_exec) { } /* no-op when CONFIG_SCHED_CACHE is off */
+
+void init_sched_mm(struct task_struct *p) { }  /* no-op; satisfies the extern declaration in sched.h */
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p) { }  /* no-op when CONFIG_SCHED_CACHE is off */
+
+#endif /* CONFIG_SCHED_CACHE */
+
  /*
   * Used by other classes to account runtime.
   */
@@ -13653,6 +13917,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
         if (static_branch_unlikely(&sched_numa_balancing))
                 task_tick_numa(rq, curr);
  
+       task_tick_cache(rq, curr);
+
         update_misfit_status(curr, rq);
         check_update_overutilized_status(task_rq(curr));
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index c95584191d58f488159fa5b5337f0f58ea2a659c..f939d45fe043625f2505e7ae695e98b60fa6cb04 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1178,6 +1178,12 @@ struct rq {
         struct scx_rq           scx;
         struct sched_dl_entity  ext_server;
  #endif
+#ifdef CONFIG_SCHED_CACHE
+       raw_spinlock_t          cpu_epoch_lock ____cacheline_aligned;
+       u64                     cpu_runtime;
+       unsigned long           cpu_epoch;
+       unsigned long           cpu_epoch_next;
+#endif
  
         struct sched_dl_entity  fair_server;
  
@@ -4041,6 +4047,14 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
  static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
  #endif /* !CONFIG_SCHED_MM_CID */
  
+#ifdef CONFIG_SCHED_CACHE
+static inline bool sched_cache_enabled(void)   /* gate checked by account_mm_sched() and task_tick_cache() */
+{
+       return false;   /* always false here, so both callers return early */
+}
+#endif
+extern void init_sched_mm(struct task_struct *p);
+
  extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
  extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
  static inline
author	Peter Zijlstra (Intel) <peterz@infradead.org>
	Wed, 1 Apr 2026 21:52:13 +0000 (14:52 -0700)
committer	Peter Zijlstra <peterz@infradead.org>
	Thu, 9 Apr 2026 13:49:47 +0000 (15:49 +0200)
include/linux/mm_types.h		patch \| blob \| blame \| history
include/linux/sched.h		patch \| blob \| blame \| history
init/Kconfig		patch \| blob \| blame \| history
kernel/fork.c		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/fair.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history