From: K Prateek Nayak Date: Sat, 16 May 2026 05:58:50 +0000 (+0200) Subject: sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fdfe5a8cd8731dd81840f26abfb6527edd27b0cb;p=thirdparty%2Fkernel%2Flinux.git sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity On asymmetric CPU capacity systems, the wakeup path uses select_idle_capacity(), which scans the span of sd_asym_cpucapacity rather than sd_llc. The has_idle_cores hint however lives on sd_llc->shared, so the wakeup-time read of has_idle_cores operates on an LLC-scoped blob while the actual scan/decision spans the asym domain; nr_busy_cpus also lives in the same shared sched_domain data, but it's never used in the asym CPU capacity scenario. Therefore, move the sched_domain_shared object to sd_asym_cpucapacity whenever the CPU has a SD_ASYM_CPUCAPACITY_FULL ancestor and that ancestor is non-overlapping (i.e., not built from SD_NUMA). In that case the scope of has_idle_cores matches the scope of the wakeup scan. Fall back to attaching the shared object to sd_llc in three cases: 1) plain symmetric systems (no SD_ASYM_CPUCAPACITY_FULL anywhere); 2) CPUs in an exclusive cpuset that carves out a symmetric capacity island: has_asym is system-wide but those CPUs have no SD_ASYM_CPUCAPACITY_FULL ancestor in their hierarchy and follow the symmetric LLC path in select_idle_sibling(); 3) exotic topologies where SD_ASYM_CPUCAPACITY_FULL lands on an SD_NUMA-built domain. init_sched_domain_shared() keys the shared blob off cpumask_first(span), which on overlapping NUMA domains would alias unrelated spans onto the same blob. Keep the shared object on the LLC there; select_idle_capacity() gracefully skips the has_idle_cores preference when sd->shared is NULL. While at it, also rename the per-CPU sd_llc_shared to sd_balance_shared, as it is no longer strictly tied to the LLC. Co-developed-by: Andrea Righi Signed-off-by: Andrea Righi Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Shrikanth Hegde Acked-by: Vincent Guittot Link: https://patch.msgid.link/20260516055850.1345932-1-arighi@nvidia.com --- diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 03f63b094ff98..2637a6fe9a87e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7773,7 +7773,7 @@ static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) WRITE_ONCE(sds->has_idle_cores, val); } @@ -7782,7 +7782,7 @@ static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores); @@ -7791,7 +7791,7 @@ static inline bool test_idle_cores(int cpu) /* * Scans the local SMT mask to see if the entire core is idle, and records this - * information in sd_llc_shared->has_idle_cores. + * information in sd_balance_shared->has_idle_cores. * * Since SMT siblings share all cache levels, inspecting this limited remote * state should be fairly cheap. @@ -7821,7 +7821,8 @@ unlock: /* * Scan the entire LLC domain for idle cores; this dynamically switches off if * there are no idle cores left in the system; tracked through - * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. + * sd_balance_shared->has_idle_cores and enabled through update_idle_core() + * above. */ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { @@ -7885,7 +7886,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - if (sched_feat(SIS_UTIL)) { + if (sched_feat(SIS_UTIL) && sd->shared) { /* * Increment because !--nr is the condition to stop scan. * @@ -12764,7 +12765,7 @@ static void nohz_balancer_kick(struct rq *rq) goto out; } - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could @@ -12792,7 +12793,11 @@ static void set_cpu_sd_state_busy(int cpu) struct sched_domain *sd; sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); - if (!sd || !sd->nohz_idle) + /* + * sd->nohz_idle only pairs with nr_busy_cpus on sd->shared; if this + * domain has no shared object there is nothing to clear or account. + */ + if (!sd || !sd->shared || !sd->nohz_idle) return; sd->nohz_idle = 0; @@ -12817,7 +12822,8 @@ static void set_cpu_sd_state_idle(int cpu) struct sched_domain *sd; sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); - if (!sd || sd->nohz_idle) + /* See set_cpu_sd_state_busy(): nohz_idle is only used with sd->shared. */ + if (!sd || !sd->shared || sd->nohz_idle) return; sd->nohz_idle = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ffe77b2b6296c..bfb4b47c021b2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2164,7 +2164,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(int, sd_share_id); -DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a1f46e3f4edea..f96d50131495a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -665,7 +665,7 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(int, sd_share_id); -DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); @@ -680,20 +680,38 @@ static void update_top_cache_domain(int cpu) int id = cpu; int size = 1; + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); + /* + * The shared object is attached to sd_asym_cpucapacity only when the + * asym domain is non-overlapping (i.e., not built from SD_NUMA). + * On overlapping (NUMA) asym domains we fall back to letting the + * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL + * here. + */ + if (sd && sd->shared) + sds = sd->shared; + + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); + sd = highest_flag_domain(cpu, SD_SHARE_LLC); if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - /* If sd_llc exists, sd_llc_shared should exist too. */ - WARN_ON_ONCE(!sd->shared); - sds = sd->shared; + /* + * If sd_asym_cpucapacity didn't claim the shared object, + * sd_llc must have one linked. + */ + if (!sds) { + WARN_ON_ONCE(!sd->shared); + sds = sd->shared; + } } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; - rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_CLUSTER); if (sd) @@ -711,9 +729,6 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); - - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); - rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } /* @@ -2648,6 +2663,54 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc) } } +static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd) +{ + int sd_id = cpumask_first(sched_domain_span(sd)); + + sd->shared = *per_cpu_ptr(d->sds, sd_id); + /* + * nr_busy_cpus is consumed only by the NOHZ kick path via + * sd_balance_shared; on the asym-capacity path it is initialized but + * never read. + */ + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); + atomic_inc(&sd->shared->ref); +} + +/* + * For asymmetric CPU capacity, attach sched_domain_shared on the innermost + * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is + * not an overlapping NUMA-built domain (then LLC should claim shared). + * + * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island), + * then LLC must claim shared instead. + * + * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values + * are present in the domain span, so the asym domain we attach to cannot + * degenerate into a single-capacity group. The relevant edge cases are instead + * covered by the caveats above. + * + * Return true if this CPU's asym path claimed sd->shared, false otherwise. + */ +static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu); + struct sched_domain *sd_asym; + + if (!sd) + return false; + + sd_asym = sd; + while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL)) + sd_asym = sd_asym->parent; + + if (!sd_asym || (sd_asym->flags & SD_NUMA)) + return false; + + init_sched_domain_shared(d, sd_asym); + return true; +} + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -2706,20 +2769,26 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } for_each_cpu(i, cpu_map) { + bool asym_claimed = false; + sd = *per_cpu_ptr(d.sd, i); if (!sd) continue; + if (has_asym) + asym_claimed = claim_asym_sched_domain_shared(&d, i); + /* First, find the topmost SD_SHARE_LLC domain */ while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) sd = sd->parent; if (sd->flags & SD_SHARE_LLC) { - int sd_id = cpumask_first(sched_domain_span(sd)); - - sd->shared = *per_cpu_ptr(d.sds, sd_id); - atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); - atomic_inc(&sd->shared->ref); + /* + * Initialize the sd->shared for SD_SHARE_LLC unless + * the asym path above already claimed it. + */ + if (!asym_claimed) + init_sched_domain_shared(&d, sd); /* * In presence of higher domains, adjust the