git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched/cache: Record per LLC utilization to guide cache aware scheduling decisions
author: Chen Yu <yu.c.chen@intel.com>
Wed, 1 Apr 2026 21:52:15 +0000 (14:52 -0700)
committer: Peter Zijlstra <peterz@infradead.org>
Thu, 9 Apr 2026 13:49:48 +0000 (15:49 +0200)
When a system becomes busy and a process's preferred LLC is
saturated with too many threads, tasks within that LLC migrate
frequently. These intra-LLC migrations introduce latency and degrade
performance. To avoid this, task aggregation should be suppressed
when the preferred LLC is overloaded, which requires a metric to
indicate LLC utilization.

Record per-LLC utilization and CPU capacity during periodic load
balancing. These statistics will be used in later patches to decide
whether tasks should be aggregated into their preferred LLC.

Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/a48151b3d57f2a42a5971aaead1b7f81e69229f4.1775065312.git.tim.c.chen@linux.intel.com
include/linux/sched/topology.h
kernel/sched/fair.c

index 36553e14866d0dffdbb57afbfb0e8a7fc80a8e40..159716fa0d3aae8261def11e69d0cbda518baa84 100644 (file)
@@ -68,6 +68,10 @@ struct sched_domain_shared {
        atomic_t        nr_busy_cpus;
        int             has_idle_cores;
        int             nr_idle_scan;
+#ifdef CONFIG_SCHED_CACHE
+       unsigned long   util_avg;
+       unsigned long   capacity;
+#endif
 };
 
 struct sched_domain {
index a55ada22e40c7d67f4c8d6f80a80db8ea8d2e7a9..6647d465b59eee5972d23cdabf1035558de0b00b 100644 (file)
@@ -9992,6 +9992,28 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
        return 0;
 }
 
+#ifdef CONFIG_SCHED_CACHE /* Read back the util/capacity stored for @cpu's LLC */
+static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
+                                        unsigned long *cap)
+{
+       struct sched_domain_shared *sd_share;
+
+       sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+       if (!sd_share) /* nothing to report; *util and *cap stay untouched */
+               return false;
+
+       *util = READ_ONCE(sd_share->util_avg); /* stored by record_sg_llc_stats() */
+       *cap = READ_ONCE(sd_share->capacity); /* read independently of util_avg, not as an atomic pair */
+
+       return true;
+}
+#else /* !CONFIG_SCHED_CACHE */
+static inline bool get_llc_stats(int cpu, unsigned long *util,
+                                unsigned long *cap)
+{
+       return false; /* stub: never fills *util or *cap */
+}
+#endif /* CONFIG_SCHED_CACHE */
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -10948,6 +10970,53 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
        return check_cpu_capacity(rq, sd);
 }
 
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Store @sgs->group_util and @sgs->group_capacity in the shared LLC
+ * data of @group's first CPU, for later use when deciding whether to
+ * aggregate tasks to an LLC. Skipped for newly-idle balancing.
+ */
+static void record_sg_llc_stats(struct lb_env *env,
+                               struct sg_lb_stats *sgs,
+                               struct sched_group *group)
+{
+       struct sched_domain_shared *sd_share;
+       int cpu;
+
+       if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE)
+               return;
+
+       /* Only care about the domain whose child is dst_cpu's LLC domain */
+       if (env->sd->child != rcu_dereference_all(per_cpu(sd_llc, env->dst_cpu)))
+               return;
+
+       /*
+        * At this point we know this group spans an LLC domain.
+        * Record the statistics of this group in its corresponding
+        * shared LLC domain.
+        * Note: sd_share cannot be obtained via sd->child->shared,
+        * because the latter refers to the domain that covers the
+        * local group. Instead, sd_share should be located using
+        * the first CPU of the LLC group.
+        */
+       cpu = cpumask_first(sched_group_span(group));
+       sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+       if (!sd_share)
+               return;
+
+       if (READ_ONCE(sd_share->util_avg) != sgs->group_util) /* store only on change */
+               WRITE_ONCE(sd_share->util_avg, sgs->group_util);
+
+       if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) /* hinted as rarely changing */
+               WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
+}
+#else /* !CONFIG_SCHED_CACHE */
+static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
+                                      struct sched_group *group)
+{
+}
+#endif /* CONFIG_SCHED_CACHE */
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -11035,6 +11104,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
        sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
 
+       record_sg_llc_stats(env, sgs, group);
        /* Computing avg_load makes sense only when group is overloaded */
        if (sgs->group_type == group_overloaded)
                sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /