git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity
author: K Prateek Nayak <kprateek.nayak@amd.com>
Sat, 16 May 2026 05:58:50 +0000 (07:58 +0200)
committer: Peter Zijlstra <peterz@infradead.org>
Tue, 19 May 2026 10:17:38 +0000 (12:17 +0200)
On asymmetric CPU capacity systems, the wakeup path uses
select_idle_capacity(), which scans the span of sd_asym_cpucapacity
rather than sd_llc.

The has_idle_cores hint however lives on sd_llc->shared, so the
wakeup-time read of has_idle_cores operates on an LLC-scoped blob while
the actual scan/decision spans the asym domain; nr_busy_cpus also lives
in the same shared sched_domain data, but it's never used in the asym
CPU capacity scenario.

Therefore, move the sched_domain_shared object to sd_asym_cpucapacity
whenever the CPU has a SD_ASYM_CPUCAPACITY_FULL ancestor and that
ancestor is non-overlapping (i.e., not built from SD_NUMA). In that case
the scope of has_idle_cores matches the scope of the wakeup scan.

Fall back to attaching the shared object to sd_llc in three cases:

  1) plain symmetric systems (no SD_ASYM_CPUCAPACITY_FULL anywhere);

  2) CPUs in an exclusive cpuset that carves out a symmetric capacity
     island: has_asym is system-wide but those CPUs have no
     SD_ASYM_CPUCAPACITY_FULL ancestor in their hierarchy and follow
     the symmetric LLC path in select_idle_sibling();

  3) exotic topologies where SD_ASYM_CPUCAPACITY_FULL lands on an
     SD_NUMA-built domain. init_sched_domain_shared() keys the shared
     blob off cpumask_first(span), which on overlapping NUMA domains
     would alias unrelated spans onto the same blob. Keep the shared
     object on the LLC there; select_idle_capacity() gracefully skips
     the has_idle_cores preference when sd->shared is NULL.

While at it, also rename the per-CPU sd_llc_shared to sd_balance_shared,
as it is no longer strictly tied to the LLC.

Co-developed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://patch.msgid.link/20260516055850.1345932-1-arighi@nvidia.com
kernel/sched/fair.c
kernel/sched/sched.h
kernel/sched/topology.c

index 03f63b094ff984090a47cf73b51b744e266bfc9f..2637a6fe9a87edd58d61077d9be251d68a8878cd 100644 (file)
@@ -7773,7 +7773,7 @@ static inline void set_idle_cores(int cpu, int val)
 {
        struct sched_domain_shared *sds;
 
-       sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+       sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
        if (sds)
                WRITE_ONCE(sds->has_idle_cores, val);
 }
@@ -7782,7 +7782,7 @@ static inline bool test_idle_cores(int cpu)
 {
        struct sched_domain_shared *sds;
 
-       sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+       sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
        if (sds)
                return READ_ONCE(sds->has_idle_cores);
 
@@ -7791,7 +7791,7 @@ static inline bool test_idle_cores(int cpu)
 
 /*
  * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
+ * information in sd_balance_shared->has_idle_cores.
  *
  * Since SMT siblings share all cache levels, inspecting this limited remote
  * state should be fairly cheap.
@@ -7821,7 +7821,8 @@ unlock:
 /*
  * Scan the entire LLC domain for idle cores; this dynamically switches off if
  * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * sd_balance_shared->has_idle_cores and enabled through update_idle_core()
+ * above.
  */
 static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
 {
@@ -7885,7 +7886,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
        int i, cpu, idle_cpu = -1, nr = INT_MAX;
 
-       if (sched_feat(SIS_UTIL)) {
+       if (sched_feat(SIS_UTIL) && sd->shared) {
                /*
                 * Increment because !--nr is the condition to stop scan.
                 *
@@ -12764,7 +12765,7 @@ static void nohz_balancer_kick(struct rq *rq)
                goto out;
        }
 
-       sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+       sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
        if (sds) {
                /*
                 * If there is an imbalance between LLC domains (IOW we could
@@ -12792,7 +12793,11 @@ static void set_cpu_sd_state_busy(int cpu)
        struct sched_domain *sd;
        sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
 
-       if (!sd || !sd->nohz_idle)
+       /*
+        * sd->nohz_idle only pairs with nr_busy_cpus on sd->shared; if this
+        * domain has no shared object there is nothing to clear or account.
+        */
+       if (!sd || !sd->shared || !sd->nohz_idle)
                return;
        sd->nohz_idle = 0;
 
@@ -12817,7 +12822,8 @@ static void set_cpu_sd_state_idle(int cpu)
        struct sched_domain *sd;
        sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
 
-       if (!sd || sd->nohz_idle)
+       /* See set_cpu_sd_state_busy(): nohz_idle is only used with sd->shared. */
+       if (!sd || !sd->shared || sd->nohz_idle)
                return;
        sd->nohz_idle = 1;
 
index ffe77b2b6296ce1d9cbc11034a7ec4a388b1e78f..bfb4b47c021b2a929dd23503408f148ea8df3868 100644 (file)
@@ -2164,7 +2164,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(int, sd_share_id);
-DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
index a1f46e3f4edea3686c64157fe32a7511c93cf9ec..f96d50131495abf99b80fa5817e2bbf380505da4 100644 (file)
@@ -665,7 +665,7 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(int, sd_share_id);
-DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
@@ -680,20 +680,38 @@ static void update_top_cache_domain(int cpu)
        int id = cpu;
        int size = 1;
 
+       sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+       /*
+        * The shared object is attached to sd_asym_cpucapacity only when the
+        * asym domain is non-overlapping (i.e., not built from SD_NUMA).
+        * On overlapping (NUMA) asym domains we fall back to letting the
+        * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
+        * here.
+        */
+       if (sd && sd->shared)
+               sds = sd->shared;
+
+       rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+
        sd = highest_flag_domain(cpu, SD_SHARE_LLC);
        if (sd) {
                id = cpumask_first(sched_domain_span(sd));
                size = cpumask_weight(sched_domain_span(sd));
 
-               /* If sd_llc exists, sd_llc_shared should exist too. */
-               WARN_ON_ONCE(!sd->shared);
-               sds = sd->shared;
+               /*
+                * If sd_asym_cpucapacity didn't claim the shared object,
+                * sd_llc must have one linked.
+                */
+               if (!sds) {
+                       WARN_ON_ONCE(!sd->shared);
+                       sds = sd->shared;
+               }
        }
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_size, cpu) = size;
        per_cpu(sd_llc_id, cpu) = id;
-       rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+       rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
 
        sd = lowest_flag_domain(cpu, SD_CLUSTER);
        if (sd)
@@ -711,9 +729,6 @@ static void update_top_cache_domain(int cpu)
 
        sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
        rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
-
-       sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
-       rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
 }
 
 /*
@@ -2648,6 +2663,54 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
        }
 }
 
+static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
+{
+       int sd_id = cpumask_first(sched_domain_span(sd));
+
+       sd->shared = *per_cpu_ptr(d->sds, sd_id);
+       /*
+        * nr_busy_cpus is consumed only by the NOHZ kick path via
+        * sd_balance_shared; on the asym-capacity path it is initialized but
+        * never read.
+        */
+       atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+       atomic_inc(&sd->shared->ref);
+}
+
+/*
+ * For asymmetric CPU capacity, attach sched_domain_shared on the innermost
+ * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is
+ * not an overlapping NUMA-built domain (then LLC should claim shared).
+ *
+ * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island),
+ * then LLC must claim shared instead.
+ *
+ * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values
+ * are present in the domain span, so the asym domain we attach to cannot
+ * degenerate into a single-capacity group. The relevant edge cases are instead
+ * covered by the caveats above.
+ *
+ * Return true if this CPU's asym path claimed sd->shared, false otherwise.
+ */
+static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
+{
+       struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu);
+       struct sched_domain *sd_asym;
+
+       if (!sd)
+               return false;
+
+       sd_asym = sd;
+       while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
+               sd_asym = sd_asym->parent;
+
+       if (!sd_asym || (sd_asym->flags & SD_NUMA))
+               return false;
+
+       init_sched_domain_shared(d, sd_asym);
+       return true;
+}
+
 /*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
@@ -2706,20 +2769,26 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
        }
 
        for_each_cpu(i, cpu_map) {
+               bool asym_claimed = false;
+
                sd = *per_cpu_ptr(d.sd, i);
                if (!sd)
                        continue;
 
+               if (has_asym)
+                       asym_claimed = claim_asym_sched_domain_shared(&d, i);
+
                /* First, find the topmost SD_SHARE_LLC domain */
                while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
                        sd = sd->parent;
 
                if (sd->flags & SD_SHARE_LLC) {
-                       int sd_id = cpumask_first(sched_domain_span(sd));
-
-                       sd->shared = *per_cpu_ptr(d.sds, sd_id);
-                       atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
-                       atomic_inc(&sd->shared->ref);
+                       /*
+                        * Initialize the sd->shared for SD_SHARE_LLC unless
+                        * the asym path above already claimed it.
+                        */
+                       if (!asym_claimed)
+                               init_sched_domain_shared(&d, sd);
 
                        /*
                         * In presence of higher domains, adjust the