]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched/cache: Introduce infrastructure for cache-aware load balancing
author Peter Zijlstra (Intel) <peterz@infradead.org>
Wed, 1 Apr 2026 21:52:13 +0000 (14:52 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 9 Apr 2026 13:49:47 +0000 (15:49 +0200)
Adds infrastructure to enable cache-aware load balancing,
which improves cache locality by grouping tasks that share resources
within the same cache domain. This reduces cache misses and improves
overall data access efficiency.

In this initial implementation, threads belonging to the same process
are treated as entities that likely share working sets. The mechanism
tracks per-process CPU occupancy across cache domains and attempts to
migrate threads toward cache-hot domains where their process already
has active threads, thereby enhancing locality.

This provides a basic model for cache affinity. While the current code
targets the last-level cache (LLC), the approach could be extended to
other domain types such as clusters (L2) or node-internal groupings.

At present, the mechanism selects the CPU within an LLC that has the
highest recent runtime. Subsequent patches in this series will use this
information in the load-balancing path to guide task placement toward
preferred LLCs.

In the future, more advanced policies could be integrated through NUMA
balancing -- for example, migrating a task to its preferred LLC when spare
capacity exists, or swapping tasks across LLCs to improve cache affinity.
Grouping of tasks could also be generalized from that of a process
to be that of a NUMA group, or be user configurable.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/6269a53221b9439b9ca00d18a9d1946fb64d8cff.1775065312.git.tim.c.chen@linux.intel.com
include/linux/mm_types.h
include/linux/sched.h
init/Kconfig
kernel/fork.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/sched.h

index 3cc8ae722886012594076f41a5a82581fe420956..67b2dfcc71ea44a7ae3ef7aaaf152b451884660a 100644 (file)
@@ -1173,6 +1173,8 @@ struct mm_struct {
                /* MM CID related storage */
                struct mm_mm_cid mm_cid;
 
+               /* sched_cache related statistics */
+               struct sched_cache_stat sc_stat;
 #ifdef CONFIG_MMU
                atomic_long_t pgtables_bytes;   /* size of all page tables */
 #endif
@@ -1575,6 +1577,36 @@ static inline unsigned int mm_cid_size(void)
 # define MM_CID_STATIC_SIZE    0
 #endif /* CONFIG_SCHED_MM_CID */
 
+#ifdef CONFIG_SCHED_CACHE
+void mm_init_sched(struct mm_struct *mm,
+                  struct sched_cache_time __percpu *pcpu_sched);
+
+/*
+ * Allocate the per-CPU occupancy statistics for @mm and pass them to
+ * mm_init_sched() for initialization.
+ *
+ * Returns 0 on success, or -ENOMEM when the per-CPU allocation fails; in
+ * that case mm_init_sched() is not called.
+ */
+static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
+{
+       struct sched_cache_time __percpu *stats;
+
+       stats = alloc_percpu_noprof(struct sched_cache_time);
+       if (stats) {
+               mm_init_sched(mm, stats);
+               return 0;
+       }
+
+       return -ENOMEM;
+}
+
+#define mm_alloc_sched(...)    alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
+
+/*
+ * Free the per-CPU statistics of @mm and clear the pointer, so that code
+ * testing mm->sc_stat.pcpu_sched afterwards sees NULL.
+ */
+static inline void mm_destroy_sched(struct mm_struct *mm)
+{
+       free_percpu(mm->sc_stat.pcpu_sched);
+       mm->sc_stat.pcpu_sched = NULL;
+}
+#else /* !CONFIG_SCHED_CACHE */
+
+static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_sched(struct mm_struct *mm) { }
+
+#endif /* CONFIG_SCHED_CACHE */
+
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
index 8ec3b6d7d718b61e6ce964b073dd69d2a7dce9f8..2bf261bcd7b6fa844068b3602e3d8e58bb7fc2ce 100644 (file)
@@ -1407,6 +1407,10 @@ struct task_struct {
        unsigned long                   numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_SCHED_CACHE
+       struct callback_head            cache_work;
+#endif
+
        struct rseq_data                rseq;
        struct sched_mm_cid             mm_cid;
 
@@ -2407,6 +2411,26 @@ static __always_inline int task_mm_cid(struct task_struct *t)
 }
 #endif
 
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * Per-CPU runtime sample of one mm; mm->sc_stat.pcpu_sched points to the
+ * per-CPU allocation of these.
+ */
+struct sched_cache_time {
+       u64 runtime;            /* accumulated delta_exec, halved per elapsed epoch */
+       unsigned long epoch;    /* rq->cpu_epoch value @runtime was last decayed to */
+};
+
+/* Per-mm state for cache-aware scheduling, embedded in struct mm_struct. */
+struct sched_cache_stat {
+       struct sched_cache_time __percpu *pcpu_sched;   /* per-CPU runtime samples */
+       raw_spinlock_t lock;    /* taken by task_tick_cache() while queueing the scan work */
+       unsigned long epoch;    /* rq epoch recorded when the scan work was last queued */
+       int cpu;                /* CPU chosen by task_cache_work(), or -1 when there is none */
+} ____cacheline_aligned_in_smp;
+
+#else
+
+struct sched_cache_stat { };
+
+#endif
+
 #ifndef MODULE
 #ifndef COMPILE_OFFSETS
 
index 7484cd703bc1ab263345f2c6ef38a13bda4a046e..2dfd4744d1d4dd02ecb4b658269f101dd0d11c66 100644 (file)
@@ -1005,6 +1005,17 @@ config NUMA_BALANCING
 
          This system will be inactive on UMA systems.
 
+config SCHED_CACHE
+       bool "Cache aware load balance"
+       default y
+       depends on SMP
+       help
+         When enabled, the scheduler will attempt to aggregate tasks from
+         the same process onto a single Last Level Cache (LLC) domain when
+         possible. This improves cache locality by keeping tasks that share
+         resources within the same cache domain, reducing cache misses and
+         lowering data access latency.
+
 config NUMA_BALANCING_DEFAULT_ENABLED
        bool "Automatically enable NUMA aware memory/task placement"
        default y
index 079802cb61002ab43d89e0efee663fc93f1387f9..61042bc3482d2157921074c0947860a5e500ea2a 100644 (file)
@@ -724,6 +724,7 @@ void __mmdrop(struct mm_struct *mm)
        cleanup_lazy_tlbs(mm);
 
        WARN_ON_ONCE(mm == current->active_mm);
+       mm_destroy_sched(mm);
        mm_free_pgd(mm);
        mm_free_id(mm);
        destroy_context(mm);
@@ -1125,6 +1126,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        if (mm_alloc_cid(mm, p))
                goto fail_cid;
 
+       if (mm_alloc_sched(mm))
+               goto fail_sched;
+
        if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
                                     NR_MM_COUNTERS))
                goto fail_pcpu;
@@ -1134,6 +1138,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        return mm;
 
 fail_pcpu:
+       mm_destroy_sched(mm);
+fail_sched:
        mm_destroy_cid(mm);
 fail_cid:
        destroy_context(mm);
index 49cd5d21716130c51f835d2524aed0ae3cfa59a9..7e0b55e7ef5c83208f680b3d3fbb75ffc0a799e9 100644 (file)
@@ -4434,6 +4434,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
        init_numa_balancing(clone_flags, p);
        p->wake_entry.u_flags = CSD_TYPE_TTWU;
        p->migration_pending = NULL;
+       init_sched_mm(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -8962,6 +8963,11 @@ void __init sched_init(void)
 
                rq->core_cookie = 0UL;
 #endif
+#ifdef CONFIG_SCHED_CACHE
+               raw_spin_lock_init(&rq->cpu_epoch_lock);
+               rq->cpu_epoch_next = jiffies;
+#endif
+
                zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
        }
 
index 12890ef16603bff822c5a8160bc2c2e2f9a40427..c9cd064223e51f6e943055ab61394a0b5f2ebbae 100644 (file)
@@ -1321,6 +1321,8 @@ void post_init_entity_util_avg(struct task_struct *p)
        sa->runnable_avg = sa->util_avg;
 }
 
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec);
+
 static s64 update_se(struct rq *rq, struct sched_entity *se)
 {
        u64 now = rq_clock_task(rq);
@@ -1343,6 +1345,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
 
                trace_sched_stat_runtime(running, delta_exec);
                account_group_exec_runtime(running, delta_exec);
+               account_mm_sched(rq, running, delta_exec);
 
                /* cgroup time is always accounted against the donor */
                cgroup_account_cputime(donor, delta_exec);
@@ -1364,6 +1367,267 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
 
 static void set_next_buddy(struct sched_entity *se);
 
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * XXX numbers come from a place the sun don't shine -- probably wants to be SD
+ * tunable or so.
+ *
+ * EPOCH_PERIOD is expressed in jiffies and is used as a divisor in
+ * __update_mm_sched(), so it must never evaluate to 0. HZ / 100 would be 0
+ * for HZ < 100; fall back to a single jiffy in that case.
+ */
+#define EPOCH_PERIOD   (HZ >= 100 ? HZ / 100 : 1)      /* ~10 ms, at least 1 jiffy */
+#define EPOCH_LLC_AFFINITY_TIMEOUT     5       /* in epochs, ~50 ms */
+
+/* Return the per-CPU sd_llc_id of @cpu, or -1 when @cpu is negative. */
+static int llc_id(int cpu)
+{
+       return (cpu < 0) ? -1 : per_cpu(sd_llc_id, cpu);
+}
+
+/*
+ * Initialize the cache-aware scheduling state of @mm.
+ *
+ * @_pcpu_sched is the freshly allocated per-CPU sample array. Each sample
+ * starts with zero runtime and the current epoch of its CPU's runqueue.
+ * The pointer is stored into @mm last.
+ */
+void mm_init_sched(struct mm_struct *mm,
+                  struct sched_cache_time __percpu *_pcpu_sched)
+{
+       unsigned long epoch = 0;
+       int i;
+
+       for_each_possible_cpu(i) {
+               struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+               struct rq *rq = cpu_rq(i);
+
+               pcpu_sched->runtime = 0;
+               /* a slightly stale cpu epoch is acceptable */
+               pcpu_sched->epoch = rq->cpu_epoch;
+               /* after the loop this holds the epoch of the last CPU visited */
+               epoch = rq->cpu_epoch;
+       }
+
+       raw_spin_lock_init(&mm->sc_stat.lock);
+       mm->sc_stat.epoch = epoch;
+       mm->sc_stat.cpu = -1;
+
+       /*
+        * Publish pcpu_sched last and with release semantics: the stores
+        * to mm->sc_stat.epoch, mm->sc_stat.cpu and the per-CPU samples
+        * above must not be reordered after it, otherwise a reader that
+        * sees a non-NULL pcpu_sched could observe uninitialized state.
+        */
+       smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched);
+}
+
+/*
+ * Right shift that is defined for every @n: a shift count of 64 or more
+ * clears *@val, since shifting a u64 by its full width is undefined in C.
+ */
+static __always_inline void __shr_u64(u64 *val, unsigned int n)
+{
+       *val = (n < 64) ? (*val >> n) : 0;
+}
+
+/*
+ * Bring @rq's epoch state and @pcpu_sched up to date with jiffies.
+ *
+ * For every EPOCH_PERIOD (rounded up) that elapsed past rq->cpu_epoch_next,
+ * rq->cpu_epoch advances by one and rq->cpu_runtime is halved. Afterwards
+ * @pcpu_sched->runtime is halved once per epoch it lags behind
+ * rq->cpu_epoch, and its epoch is set to match.
+ *
+ * The caller must hold rq->cpu_epoch_lock.
+ */
+static inline void __update_mm_sched(struct rq *rq,
+                                    struct sched_cache_time *pcpu_sched)
+{
+       lockdep_assert_held(&rq->cpu_epoch_lock);
+
+       unsigned long n, now = jiffies;
+       long delta = now - rq->cpu_epoch_next;
+
+       if (delta > 0) {
+               n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
+               rq->cpu_epoch += n;
+               rq->cpu_epoch_next += n * EPOCH_PERIOD;
+               /*
+                * __shr_u64() takes an unsigned int. Clamp the count so a
+                * very large epoch delta is not truncated into a small
+                * shift on 64-bit; 64 already clears the value.
+                */
+               __shr_u64(&rq->cpu_runtime, n >= 64 ? 64 : n);
+       }
+
+       n = rq->cpu_epoch - pcpu_sched->epoch;
+       if (n) {
+               pcpu_sched->epoch += n;
+               /* same clamping as above */
+               __shr_u64(&pcpu_sched->runtime, n >= 64 ? 64 : n);
+       }
+}
+
+/*
+ * Return the share of @rq's decayed runtime that belongs to @pcpu_sched,
+ * scaled by NICE_0_LOAD.
+ *
+ * Takes rq->cpu_epoch_lock with interrupts disabled for the duration of the
+ * function and decays both counters to the current epoch before dividing.
+ */
+static unsigned long fraction_mm_sched(struct rq *rq,
+                                      struct sched_cache_time *pcpu_sched)
+{
+       guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+
+       __update_mm_sched(rq, pcpu_sched);
+
+       /*
+        * Runtime is a geometric series (r=0.5) and as such will sum to twice
+        * the accumulation period, this means the multiplication here should
+        * not overflow.
+        *
+        * The "+ 1" keeps the divisor non-zero when rq->cpu_runtime is 0.
+        */
+       return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
+}
+
+/*
+ * Charge @delta_exec of execution time on @rq to the mm of @p.
+ *
+ * Nothing is accounted unless sched_cache_enabled() is true, @p is in the
+ * fair class and its mm has per-CPU samples allocated. The per-mm sample of
+ * this CPU and the rq-wide runtime are first decayed to the current epoch
+ * and then incremented, all under rq->cpu_epoch_lock.
+ *
+ * Afterwards the mm's preferred CPU is reset to -1 if the mm's recorded
+ * epoch lags this rq's epoch by more than EPOCH_LLC_AFFINITY_TIMEOUT.
+ */
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+       struct sched_cache_time *pcpu_sched;
+       struct mm_struct *mm = p->mm;
+       unsigned long epoch;
+       long stale;
+
+       if (!sched_cache_enabled())
+               return;
+
+       if (p->sched_class != &fair_sched_class)
+               return;
+       /*
+        * init_task, kthreads and user thread created
+        * by user_mode_thread() don't have mm.
+        */
+       if (!mm || !mm->sc_stat.pcpu_sched)
+               return;
+
+       pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu_of(rq));
+
+       scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+               __update_mm_sched(rq, pcpu_sched);
+               pcpu_sched->runtime += delta_exec;
+               rq->cpu_runtime += delta_exec;
+               epoch = rq->cpu_epoch;
+       }
+
+       /*
+        * If this process hasn't hit task_cache_work() for a while invalidate
+        * its preferred state.
+        *
+        * mm->sc_stat.epoch is copied from the epoch of whichever rq ran
+        * task_tick_cache(), so it can be ahead of this rq's epoch. Compare
+        * the signed difference so that case is not mistaken for a huge
+        * unsigned lag.
+        */
+       stale = epoch - READ_ONCE(mm->sc_stat.epoch);
+       if (stale > EPOCH_LLC_AFFINITY_TIMEOUT) {
+               /* test before writing; presumably avoids a needless store */
+               if (mm->sc_stat.cpu != -1)
+                       mm->sc_stat.cpu = -1;
+       }
+}
+
+/*
+ * Tick hook: queue task_cache_work() on @p once @rq's epoch has moved past
+ * the epoch recorded in @p's mm.
+ *
+ * Does nothing when sched_cache_enabled() is false, when @p has no mm or
+ * when the mm has no per-CPU samples.
+ */
+static void task_tick_cache(struct rq *rq, struct task_struct *p)
+{
+       struct callback_head *work = &p->cache_work;
+       struct mm_struct *mm = p->mm;
+       unsigned long epoch;
+
+       if (!sched_cache_enabled())
+               return;
+
+       if (!mm || !mm->sc_stat.pcpu_sched)
+               return;
+
+       epoch = rq->cpu_epoch;
+       /* avoid moving backwards */
+       if (time_after_eq(mm->sc_stat.epoch, epoch))
+               return;
+
+       guard(raw_spinlock)(&mm->sc_stat.lock);
+
+       /*
+        * work->next pointing at the work itself marks it as not queued;
+        * init_sched_mm() and task_cache_work() set it up that way.
+        * NOTE(review): the return value of task_work_add() is not checked,
+        * the mm epoch is advanced regardless.
+        */
+       if (work->next == work) {
+               task_work_add(p, work, TWA_RESUME);
+               WRITE_ONCE(mm->sc_stat.epoch, epoch);
+       }
+}
+
+/*
+ * Task work queued by task_tick_cache(); operates on current and its mm.
+ *
+ * Walks the LLC domains of all online CPUs, sums this mm's occupancy
+ * (fraction_mm_sched()) over the CPUs of each LLC and remembers, per LLC,
+ * the CPU with the highest occupancy. If the best LLC's accumulated
+ * occupancy is more than twice that of the LLC containing the current
+ * mm->sc_stat.cpu, mm->sc_stat.cpu is switched to the best LLC's busiest CPU.
+ *
+ * Returns early without scanning when the task is exiting or when the
+ * temporary cpumask cannot be allocated.
+ */
+static void task_cache_work(struct callback_head *work)
+{
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+       unsigned long m_a_occ = 0;
+       unsigned long curr_m_a_occ = 0;
+       int cpu, m_a_cpu = -1;
+       cpumask_var_t cpus;
+
+       WARN_ON_ONCE(work != &p->cache_work);
+
+       /* mark the work as not queued so task_tick_cache() can queue it again */
+       work->next = work;
+
+       if (p->flags & PF_EXITING)
+               return;
+
+       if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
+               return;
+
+       scoped_guard (cpus_read_lock) {
+               guard(rcu)();
+
+               /* work list of CPUs whose LLC has not been scanned yet */
+               cpumask_copy(cpus, cpu_online_mask);
+
+               for_each_cpu(cpu, cpus) {
+                       /* XXX sched_cluster_active */
+                       struct sched_domain *sd = per_cpu(sd_llc, cpu);
+                       unsigned long occ, m_occ = 0, a_occ = 0;
+                       int m_cpu = -1, i;
+
+                       if (!sd)
+                               continue;
+
+                       /* a_occ: sum over the LLC; m_occ/m_cpu: its busiest CPU */
+                       for_each_cpu(i, sched_domain_span(sd)) {
+                               occ = fraction_mm_sched(cpu_rq(i),
+                                                       per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+                               a_occ += occ;
+                               if (occ > m_occ) {
+                                       m_occ = occ;
+                                       m_cpu = i;
+                               }
+                       }
+
+                       /*
+                        * Compare the accumulated occupancy of each LLC. The
+                        * reason for using accumulated occupancy rather than average
+                        * per CPU occupancy is that it works better in asymmetric LLC
+                        * scenarios.
+                        * For example, if there are 2 threads in a 4CPU LLC and 3
+                        * threads in an 8CPU LLC, it might be better to choose the one
+                        * with 3 threads. However, this would not be the case if the
+                        * occupancy is divided by the number of CPUs in an LLC (i.e.,
+                        * if average per CPU occupancy is used).
+                        * Besides, NUMA balancing fault statistics behave similarly:
+                        * the total number of faults per node is compared rather than
+                        * the average number of faults per CPU. This strategy is also
+                        * followed here.
+                        */
+                       if (a_occ > m_a_occ) {
+                               m_a_occ = a_occ;
+                               m_a_cpu = m_cpu;
+                       }
+
+                       /* remember the occupancy of the currently preferred LLC */
+                       if (llc_id(cpu) == llc_id(mm->sc_stat.cpu))
+                               curr_m_a_occ = a_occ;
+
+                       /* drop this LLC's CPUs so that each LLC is scanned once */
+                       cpumask_andnot(cpus, cpus, sched_domain_span(sd));
+               }
+       }
+
+       if (m_a_occ > (2 * curr_m_a_occ)) {
+               /*
+                * Avoid switching sc_stat.cpu too fast.
+                * The reason to choose 2X is because:
+                * 1. It is better to keep the preferred LLC stable,
+                *    rather than changing it frequently and cause migrations
+                * 2. 2X means the new preferred LLC has at least 1 more
+                *    busy CPU than the old one(200% vs 100%, eg)
+                * 3. 2X is chosen based on test results, as it delivers
+                *    the optimal performance gain so far.
+                */
+               mm->sc_stat.cpu = m_a_cpu;
+       }
+
+       free_cpumask_var(cpus);
+}
+
+/*
+ * Set up @p's cache_work callback to run task_cache_work() and mark it as
+ * not queued (work->next pointing at itself).
+ */
+void init_sched_mm(struct task_struct *p)
+{
+       struct callback_head *work = &p->cache_work;
+
+       init_task_work(work, task_cache_work);
+       work->next = work;
+}
+
+#else /* CONFIG_SCHED_CACHE */
+
+/* Empty stand-ins used when CONFIG_SCHED_CACHE is not set. */
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
+                                   s64 delta_exec) { }
+
+void init_sched_mm(struct task_struct *p) { }
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_CACHE */
+
 /*
  * Used by other classes to account runtime.
  */
@@ -13653,6 +13917,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
        if (static_branch_unlikely(&sched_numa_balancing))
                task_tick_numa(rq, curr);
 
+       task_tick_cache(rq, curr);
+
        update_misfit_status(curr, rq);
        check_update_overutilized_status(task_rq(curr));
 
index c95584191d58f488159fa5b5337f0f58ea2a659c..f939d45fe043625f2505e7ae695e98b60fa6cb04 100644 (file)
@@ -1178,6 +1178,12 @@ struct rq {
        struct scx_rq           scx;
        struct sched_dl_entity  ext_server;
 #endif
+#ifdef CONFIG_SCHED_CACHE
+       raw_spinlock_t          cpu_epoch_lock ____cacheline_aligned;
+       u64                     cpu_runtime;
+       unsigned long           cpu_epoch;
+       unsigned long           cpu_epoch_next;
+#endif
 
        struct sched_dl_entity  fair_server;
 
@@ -4041,6 +4047,14 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
 static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */
 
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Report whether cache-aware scheduling is active. It always returns false
+ * here, so account_mm_sched() and task_tick_cache() return early.
+ * NOTE(review): presumably replaced by a real switch in a later patch of
+ * the series -- confirm.
+ */
+static inline bool sched_cache_enabled(void)
+{
+       return false;
+}
+#endif
+extern void init_sched_mm(struct task_struct *p);
+
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
 static inline