sched/cache: Calculate the LLC size and store it in sched_domain

author Chen Yu <yu.c.chen@intel.com>

Wed, 13 May 2026 20:39:15 +0000 (13:39 -0700)

committer Peter Zijlstra <peterz@infradead.org>

Mon, 18 May 2026 19:33:15 +0000 (21:33 +0200)
author Chen Yu <yu.c.chen@intel.com>
Wed, 13 May 2026 20:39:15 +0000 (13:39 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Mon, 18 May 2026 19:33:15 +0000 (21:33 +0200)
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c

index 391ac5e3d2f566e077d470e961f511a5fb754d76..70701d3bc81c6fe2dc9752638e12bcb7b7daf99f 100644 (file)
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -17,6 +17,7 @@
  #include <linux/init.h>
  #include <linux/of.h>
  #include <linux/sched.h>
+#include <linux/sched/topology.h>
  #include <linux/slab.h>
  #include <linux/smp.h>
  #include <linux/sysfs.h>
@@ -68,6 +69,24 @@ bool last_level_cache_is_valid(unsigned int cpu)
  
  }
  
+/*
+ * Return the last cache leaf of @cpu as its LLC, or NULL if the LLC is not
+ * valid or not data/unified. Derived from update_per_cpu_data_slice_size_cpu().
+ */
+struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu)
+{
+       struct cacheinfo *llc;
+
+       if (!last_level_cache_is_valid(cpu))
+               return NULL;
+
+       llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+       if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED)
+               return NULL;
+
+       return llc;
+}
+
  bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y)
  {
         struct cacheinfo *llc_x, *llc_y;
@@ -1018,6 +1037,7 @@ static int cacheinfo_cpu_online(unsigned int cpu)
                 goto err;
         if (cpu_map_shared_cache(true, cpu, &cpu_map))
                 update_per_cpu_data_slice_size(true, cpu, cpu_map);
+       sched_update_llc_bytes(cpu);
         return 0;
  err:
         free_cache_attributes(cpu);
@@ -1036,6 +1056,9 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu)
         free_cache_attributes(cpu);
         if (nr_shared > 1)
                 update_per_cpu_data_slice_size(false, cpu, cpu_map);
+
+       sched_update_llc_bytes(cpu);
+
         return 0;
  }
  
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h

index c8f4f0a0b874e2942576b8f2d8f6f092c17d919c..fc879ac4cc4f2843f440b32c2a23167275d0afd1 100644 (file)
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu);
  int cache_setup_acpi(unsigned int cpu);
  bool last_level_cache_is_valid(unsigned int cpu);
  bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y);
+struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu);
  int fetch_cache_info(unsigned int cpu);
  int detect_cache_attributes(unsigned int cpu);
  #ifndef CONFIG_ACPI_PPTT
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h

index 0036d6b4bd67460d6a2907cba70d947a5a8b2a55..fe09d3268bc9ce449b1e3f1349a74200d8cd11ea 100644 (file)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -106,6 +106,7 @@ struct sched_domain {
  #ifdef CONFIG_SCHED_CACHE
         unsigned int llc_max;
         unsigned int *llc_counts __counted_by_ptr(llc_max);
+       unsigned long llc_bytes;
  #endif
  
  #ifdef CONFIG_SCHEDSTATS
@@ -265,4 +266,10 @@ static inline int task_node(const struct task_struct *p)
         return cpu_to_node(task_cpu(p));
  }
  
+#ifdef CONFIG_SCHED_CACHE
+extern void sched_update_llc_bytes(unsigned int cpu);
+#else /* !CONFIG_SCHED_CACHE */
+static inline void sched_update_llc_bytes(unsigned int cpu) { }
+#endif /* CONFIG_SCHED_CACHE */
+
  #endif /* _LINUX_SCHED_TOPOLOGY_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c

index 9fc99346ef4f943c39650e97b07609bed7452027..7248a7279abeba0924fec0a4a660adb51963c53d 100644 (file)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -776,9 +776,11 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
                         /* move buffer to parent as child is being destroyed */
                         sd->llc_counts = tmp->llc_counts;
                         sd->llc_max = tmp->llc_max;
+                       sd->llc_bytes = tmp->llc_bytes;
                         /* make sure destroy_sched_domain() does not free it */
                         tmp->llc_counts = NULL;
                         tmp->llc_max = 0;
+                       tmp->llc_bytes = 0;
  #endif
                         /*
                          * sched groups hold the flags of the child sched
@@ -831,10 +833,42 @@ DEFINE_STATIC_KEY_FALSE(sched_cache_active);
  /* user wants cache aware scheduling [0 or 1] */
  int sysctl_sched_cache_user = 1;
  
+/*
+ * Get the effective LLC size in bytes that @cpu's bottom sched_domain
+ * can use. A CPU within a cpuset partition can only use a proportion
+ * of the physical LLC: ci->size * @sd->span_weight / hw_weight, where
+ * hw_weight is the weight of the LLC's shared_cpu_map. @sd should be
+ * the topmost domain with SD_SHARE_LLC.
+ *
+ * Returns 0 if no usable LLC cacheinfo exists or its shared_cpu_map is
+ * empty. This happens during early boot when build_sched_domains() runs
+ * before the generic cacheinfo framework has been initialized
+ * (cacheinfo_cpu_online() is a device_initcall cpuhp callback). In that
+ * case, cacheinfo_cpu_online() will later call sched_update_llc_bytes()
+ * to fill in the bottom domain's llc_bytes once the cache attributes
+ * are available.
+ */
+static unsigned long get_effective_llc_bytes(int cpu,
+                                            struct sched_domain *sd)
+{
+       struct cacheinfo *ci;
+       unsigned int hw_weight;
+
+       ci = get_cpu_cacheinfo_llc(cpu);
+       if (!ci)
+               return 0;
+
+       hw_weight = cpumask_weight(&ci->shared_cpu_map);
+       if (!hw_weight)
+               return 0;
+
+       return div_u64((u64)ci->size * sd->span_weight, hw_weight);
+}
+
  static bool alloc_sd_llc(const struct cpumask *cpu_map,
                          struct s_data *d)
  {
-       struct sched_domain *sd;
+       struct sched_domain *sd, *top_llc, *parent;
         unsigned int *p;
         int i;
  
@@ -848,8 +882,24 @@ static bool alloc_sd_llc(const struct cpumask *cpu_map,
                 if (!p)
                         goto err;
  
-               sd->llc_max = max_lid + 1;
-               sd->llc_counts = p;
+               top_llc = sd;
+               /*
+                * Find the topmost SD_SHARE_LLC domain.
+                * The domains are not yet attached to the CPU, so
+                * per_cpu(sd_llc, i) cannot be used.
+                */
+               while ((parent = rcu_dereference_protected(top_llc->parent, true)) &&
+                      (parent->flags & SD_SHARE_LLC))
+                       top_llc = parent;
+
+               if (top_llc->flags & SD_SHARE_LLC) {
+                       sd->llc_max = max_lid + 1;
+                       sd->llc_counts = p;
+                       sd->llc_bytes = get_effective_llc_bytes(i, top_llc);
+               } else {
+                       /* avoid memory leak */
+                       kfree(p);
+               }
         }
  
         return true;
@@ -860,6 +910,7 @@ err:
                         kfree(sd->llc_counts);
                         sd->llc_counts = NULL;
                         sd->llc_max = 0;
+                       sd->llc_bytes = 0;
                 }
         }
  
@@ -919,6 +970,47 @@ void sched_cache_active_set_unlocked(void)
  {
         return sched_cache_active_set(false);
  }
+
+/*
+ * Update the bottom sched_domain's llc_bytes for @cpu and all its LLC
+ * siblings; a no-op if @cpu has no sd_llc. Called from cacheinfo_cpu_online()
+ * or cacheinfo_cpu_pre_down() with cpu hotplug lock held.
+ *
+ * Note: get_effective_llc_bytes() returns 0 on PowerPC,
+ * thus cache aware scheduling is disabled on PowerPC for
+ * now. PowerPC does not use the generic cacheinfo framework --
+ * it has its own cacheinfo with a separate struct cache hierarchy
+ * and does not populate the per-CPU struct cpu_cacheinfo array
+ * that get_cpu_cacheinfo_llc() reads.
+ */
+void sched_update_llc_bytes(unsigned int cpu)
+{
+       struct sched_domain *sd, *sdp;
+       unsigned int i;
+
+       sched_domains_mutex_lock();
+
+       sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu));
+       if (!sdp)
+               goto unlock;
+
+       /*
+        * ci->shared_cpu_map is built incrementally as CPUs come
+        * online, so the first CPU in an LLC initially sees
+        * hw_weight == 1 and computes an inflated llc_bytes in
+        * get_effective_llc_bytes().  Re-evaluating every LLC
+        * sibling on each online event corrects this once the full
+        * shared_cpu_map is known.
+        */
+       for_each_cpu(i, sched_domain_span(sdp)) {
+               sd = rcu_dereference_sched_domain(cpu_rq(i)->sd);
+               if (sd)
+                       sd->llc_bytes = get_effective_llc_bytes(i, sdp);
+       }
+
+unlock:
+       sched_domains_mutex_unlock();
+}
  #else
  static bool alloc_sd_llc(const struct cpumask *cpu_map,
                          struct s_data *d)
author	Chen Yu <yu.c.chen@intel.com>
	Wed, 13 May 2026 20:39:15 +0000 (13:39 -0700)
committer	Peter Zijlstra <peterz@infradead.org>
	Mon, 18 May 2026 19:33:15 +0000 (21:33 +0200)
drivers/base/cacheinfo.c		patch \| blob \| blame \| history
include/linux/cacheinfo.h		patch \| blob \| blame \| history
include/linux/sched/topology.h		patch \| blob \| blame \| history
kernel/sched/topology.c		patch \| blob \| blame \| history