static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+#ifdef CONFIG_SMP
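+/*
+ * Static keys gating the LLC-aware and NUMA-aware paths of the built-in idle
+ * CPU selection policy (see update_selcpu_topology() and
+ * scx_select_cpu_dfl()).
+ */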
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+#endif
+
static struct static_key_false scx_has_op[SCX_OPI_END] =
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
goto retry;
}
-#ifdef CONFIG_SCHED_MC
/*
- * Return the cpumask of CPUs usable by task @p in the same LLC domain of @cpu,
- * or NULL if the LLC domain cannot be determined.
+ * Initialize topology-aware scheduling.
+ *
+ * Detect whether the system has multiple LLC domains or multiple NUMA nodes
+ * and enable cache-aware / NUMA-aware scheduling optimizations in the default
+ * CPU idle selection policy.
*/
-static const struct cpumask *llc_domain(const struct task_struct *p, s32 cpu)
+static void update_selcpu_topology(void)
{
- struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));
- const struct cpumask *llc_cpus = sd ? sched_domain_span(sd) : NULL;
+ bool enable_llc = false, enable_numa = false;
+ struct sched_domain *sd;
+ const struct cpumask *cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
/*
- * Return the LLC domain only if the task is allowed to run on all
- * CPUs.
+ * We only need to check the NUMA node and LLC domain of the first
+ * available CPU to determine whether they cover all CPUs.
+ *
+ * If all CPUs belong to the same NUMA node or share the same LLC
+ * domain, enabling NUMA or LLC optimizations is unnecessary.
+ * Otherwise, these optimizations can be enabled.
*/
- return p->nr_cpus_allowed == nr_cpu_ids ? llc_cpus : NULL;
-}
-#else /* CONFIG_SCHED_MC */
-static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu)
-{
- return NULL;
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (sd) {
+ cpus = sched_domain_span(sd);
+ if (cpumask_weight(cpus) < num_possible_cpus())
+ enable_llc = true;
+ }
+ sd = highest_flag_domain(cpu, SD_NUMA);
+ if (sd) {
+ cpus = sched_group_span(sd->groups);
+ if (cpumask_weight(cpus) < num_possible_cpus())
+ enable_numa = true;
+ }
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ enable_llc ? "enabled" : "disabled");
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ enable_numa ? "enabled" : "disabled");
+
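+ /*
+ * Use the *_cpuslocked static key variants: this function is expected to
+ * be called with the CPU hotplug lock held, both from the scheduler
+ * enable path and from the CPU online/offline callbacks.
+ */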
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
}
-#endif /* CONFIG_SCHED_MC */
/*
- * Built-in cpu idle selection policy.
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize fully idle cores:
+ * - always prioritize CPUs from fully idle cores (all SMT siblings are
+ *   idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * Steps 3 and 4 are performed only if the system has, respectively, multiple
+ * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa).
*
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *found)
{
- const struct cpumask *llc_cpus = llc_domain(p, prev_cpu);
+ const struct cpumask *llc_cpus = NULL;
+ const struct cpumask *numa_cpus = NULL;
s32 cpu;
*found = false;
return prev_cpu;
}
+ /*
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency: it avoids the overhead of
+ * updating a cpumask every time an idle CPU needs to be selected
+ * (which can be costly on large SMP systems). It is also logically
+ * consistent: if user-space restricts a task's scheduling domain
+ * through CPU affinity, the task will simply use the flat scheduling
+ * domain defined by user-space.
+ */
+ if (p->nr_cpus_allowed >= num_possible_cpus()) {
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+ numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
+ if (sd)
+ llc_cpus = sched_domain_span(sd);
+ }
+ }
+
/*
* If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
*/
goto cpu_found;
}
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
/*
* Search for any full idle core usable by the task.
*/
goto cpu_found;
}
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
/*
* Search for any idle CPU usable by the task.
*/
atomic_long_inc(&scx_hotplug_seq);
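+ /*
+ * A CPU going online or offline can change the LLC/NUMA layout, so
+ * re-evaluate the topology optimizations used by the built-in idle
+ * CPU selection policy.
+ */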
+ if (scx_enabled())
+ update_selcpu_topology();
+
if (online && SCX_HAS_OP(cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
else if (!online && SCX_HAS_OP(cpu_offline))
static_branch_enable_cpuslocked(&scx_has_op[i]);
check_hotplug_seq(ops);
+#ifdef CONFIG_SMP
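+ /*
+ * Detect the LLC/NUMA topology up front, so that the built-in idle CPU
+ * selection policy can use it as soon as the scheduler is enabled.
+ */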
+ update_selcpu_topology();
+#endif
cpus_read_unlock();
ret = validate_ops(ops);