static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+#ifdef CONFIG_SMP
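+/*
+ * Static keys gating the LLC-aware and NUMA-aware paths of the built-in idle
+ * CPU selection policy (see update_selcpu_topology() and
+ * scx_select_cpu_dfl()).
+ */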
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+#endif
+
static struct static_key_false scx_has_op[SCX_OPI_END] =
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
goto retry;
}
-#ifdef CONFIG_SCHED_MC
/*
- * Return the cpumask of CPUs usable by task @p in the same LLC domain of @cpu,
- * or NULL if the LLC domain cannot be determined.
+ * Initialize topology-aware scheduling.
+ *
+ * Detect whether the system has multiple LLC domains or multiple NUMA nodes
+ * and enable cache-aware / NUMA-aware scheduling optimizations in the default
+ * CPU idle selection policy.
*/
-static const struct cpumask *llc_domain(const struct task_struct *p, s32 cpu)
+static void update_selcpu_topology(void)
{
- struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));
- const struct cpumask *llc_cpus = sd ? sched_domain_span(sd) : NULL;
+ bool enable_llc = false, enable_numa = false;
+ struct sched_domain *sd;
+ const struct cpumask *cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
/*
- * Return the LLC domain only if the task is allowed to run on all
- * CPUs.
+ * We only need to check the NUMA node and LLC domain of the first
+ * available CPU to determine whether they cover all CPUs.
+ *
+ * If all CPUs belong to the same NUMA node or share the same LLC
+ * domain, enabling NUMA or LLC optimizations is unnecessary.
+ * Otherwise, these optimizations can be enabled.
*/
- return p->nr_cpus_allowed == nr_cpu_ids ? llc_cpus : NULL;
-}
-#else /* CONFIG_SCHED_MC */
-static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu)
-{
- return NULL;
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (sd) {
+ cpus = sched_domain_span(sd);
+ if (cpumask_weight(cpus) < num_possible_cpus())
+ enable_llc = true;
+ }
+ sd = highest_flag_domain(cpu, SD_NUMA);
+ if (sd) {
+ cpus = sched_group_span(sd->groups);
+ if (cpumask_weight(cpus) < num_possible_cpus())
+ enable_numa = true;
+ }
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ enable_llc ? "enabled" : "disabled");
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ enable_numa ? "enabled" : "disabled");
+
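+ /*
+ * Use the *_cpuslocked static key variants: this function is expected to
+ * be called with the CPU hotplug lock held, both from the scheduler
+ * enable path and from the CPU online/offline callbacks.
+ */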
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
}
-#endif /* CONFIG_SCHED_MC */
/*
- * Built-in cpu idle selection policy.
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize fully idle cores:
+ * - always prioritize CPUs from fully idle cores (all SMT siblings are
+ *   idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * Steps 3 and 4 are performed only if the system has, respectively, multiple
+ * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa).
*
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *found)
{
- const struct cpumask *llc_cpus = llc_domain(p, prev_cpu);
+ const struct cpumask *llc_cpus = NULL;
+ const struct cpumask *numa_cpus = NULL;
s32 cpu;
*found = false;
return prev_cpu;
}
+ /*
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency: it avoids the overhead of
+ * updating a cpumask every time an idle CPU needs to be selected
+ * (which can be costly on large SMP systems). It is also logically
+ * consistent: if user-space restricts a task's scheduling domain
+ * through CPU affinity, the task will simply use the flat scheduling
+ * domain defined by user-space.
+ */
+ if (p->nr_cpus_allowed >= num_possible_cpus()) {
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+ numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
+ if (sd)
+ llc_cpus = sched_domain_span(sd);
+ }
+ }
+
/*
* If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
*/
goto cpu_found;
}
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
/*
* Search for any full idle core usable by the task.
*/
goto cpu_found;
}
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
/*
* Search for any idle CPU usable by the task.
*/
atomic_long_inc(&scx_hotplug_seq);
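+ /*
+ * A CPU going online or offline can change the LLC/NUMA layout, so
+ * re-evaluate the topology optimizations used by the built-in idle
+ * CPU selection policy.
+ */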
+ if (scx_enabled())
+ update_selcpu_topology();
+
if (online && SCX_HAS_OP(cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
else if (!online && SCX_HAS_OP(cpu_offline))
static_branch_enable_cpuslocked(&scx_has_op[i]);
check_hotplug_seq(ops);
+#ifdef CONFIG_SMP
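+ /*
+ * Detect the LLC/NUMA topology up front, so that the built-in idle CPU
+ * selection policy can use it as soon as the scheduler is enabled.
+ */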
+ update_selcpu_topology();
+#endif
cpus_read_unlock();
ret = validate_ops(ops);