]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched: Create architecture specific sched domain distances
author Tim Chen <tim.c.chen@linux.intel.com>
Fri, 3 Oct 2025 19:31:27 +0000 (12:31 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 16 Oct 2025 09:13:49 +0000 (11:13 +0200)
Allow architecture specific sched domain NUMA distances that are
modified from actual NUMA node distances for the purpose of building
NUMA sched domains.

Keep the actual NUMA distances separately when modified distances
are used for building sched domains. The actual distances
are still needed because NUMA balancing benefits from finding the
NUMA nodes that are actually closer to a task's numa_group.

Consolidate the recording of unique NUMA distances in an array into
sched_record_numa_dist() so that the function can be reused to record
NUMA distances when the NUMA distance metric is changed.

No functional change, and no additional distance array is
allocated, if no arch-specific NUMA distances are
defined.

Co-developed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
kernel/sched/topology.c

index 444bdfdab731801aac04eb72ab6039e242324af5..711076aa4980184ca88ca28e0a21e635fb79794d 100644 (file)
@@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 #ifdef CONFIG_NUMA
 enum numa_topology_type sched_numa_topology_type;
 
+/*
+ * sched_domains_numa_distance is derived from sched_numa_node_distance
+ * and provides a simplified view of NUMA distances used specifically
+ * for building NUMA scheduling domains.
+ */
 static int                     sched_domains_numa_levels;
+static int                     sched_numa_node_levels;
 
 int                            sched_max_numa_distance;
 static int                     *sched_domains_numa_distance;
+static int                     *sched_numa_node_distance;
 static struct cpumask          ***sched_domains_numa_masks;
 #endif /* CONFIG_NUMA */
 
@@ -1845,10 +1852,10 @@ bool find_numa_distance(int distance)
                return true;
 
        rcu_read_lock();
-       distances = rcu_dereference(sched_domains_numa_distance);
+       distances = rcu_dereference(sched_numa_node_distance);
        if (!distances)
                goto unlock;
-       for (i = 0; i < sched_domains_numa_levels; i++) {
+       for (i = 0; i < sched_numa_node_levels; i++) {
                if (distances[i] == distance) {
                        found = true;
                        break;
@@ -1924,14 +1931,34 @@ static void init_numa_topology_type(int offline_node)
 
 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
 
-void sched_init_numa(int offline_node)
+/*
+ * An architecture could modify its NUMA distance, to change
+ * grouping of NUMA nodes and number of NUMA levels when creating
+ * NUMA level sched domains.
+ *
+ * A NUMA level is created for each unique
+ * arch_sched_node_distance.
+ */
+static int numa_node_dist(int i, int j)
 {
-       struct sched_domain_topology_level *tl;
-       unsigned long *distance_map;
+       return node_distance(i, j);
+}
+
+int arch_sched_node_distance(int from, int to)
+                            __weak __alias(numa_node_dist);
+
+static bool modified_sched_node_distance(void)
+{
+       return numa_node_dist != arch_sched_node_distance;
+}
+
+static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
+                                 int **dist, int *levels)
+{
+       unsigned long *distance_map __free(bitmap) = NULL;
        int nr_levels = 0;
        int i, j;
        int *distances;
-       struct cpumask ***masks;
 
        /*
         * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
@@ -1939,17 +1966,16 @@ void sched_init_numa(int offline_node)
         */
        distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
        if (!distance_map)
-               return;
+               return -ENOMEM;
 
        bitmap_zero(distance_map, NR_DISTANCE_VALUES);
        for_each_cpu_node_but(i, offline_node) {
                for_each_cpu_node_but(j, offline_node) {
-                       int distance = node_distance(i, j);
+                       int distance = n_dist(i, j);
 
                        if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
                                sched_numa_warn("Invalid distance value range");
-                               bitmap_free(distance_map);
-                               return;
+                               return -EINVAL;
                        }
 
                        bitmap_set(distance_map, distance, 1);
@@ -1962,18 +1988,46 @@ void sched_init_numa(int offline_node)
        nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
 
        distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
-       if (!distances) {
-               bitmap_free(distance_map);
-               return;
-       }
+       if (!distances)
+               return -ENOMEM;
 
        for (i = 0, j = 0; i < nr_levels; i++, j++) {
                j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
                distances[i] = j;
        }
-       rcu_assign_pointer(sched_domains_numa_distance, distances);
+       *dist = distances;
+       *levels = nr_levels;
+
+       return 0;
+}
+
+void sched_init_numa(int offline_node)
+{
+       struct sched_domain_topology_level *tl;
+       int nr_levels, nr_node_levels;
+       int i, j;
+       int *distances, *domain_distances;
+       struct cpumask ***masks;
 
-       bitmap_free(distance_map);
+       /* Record the NUMA distances from SLIT table */
+       if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
+                                  &nr_node_levels))
+               return;
+
+       /* Record modified NUMA distances for building sched domains */
+       if (modified_sched_node_distance()) {
+               if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
+                                          &domain_distances, &nr_levels)) {
+                       kfree(distances);
+                       return;
+               }
+       } else {
+               domain_distances = distances;
+               nr_levels = nr_node_levels;
+       }
+       rcu_assign_pointer(sched_numa_node_distance, distances);
+       WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
+       WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
 
        /*
         * 'nr_levels' contains the number of unique distances
@@ -1991,6 +2045,8 @@ void sched_init_numa(int offline_node)
         *
         * We reset it to 'nr_levels' at the end of this function.
         */
+       rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
+
        sched_domains_numa_levels = 0;
 
        masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
@@ -2016,10 +2072,13 @@ void sched_init_numa(int offline_node)
                        masks[i][j] = mask;
 
                        for_each_cpu_node_but(k, offline_node) {
-                               if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+                               if (sched_debug() &&
+                                   (arch_sched_node_distance(j, k) !=
+                                    arch_sched_node_distance(k, j)))
                                        sched_numa_warn("Node-distance not symmetric");
 
-                               if (node_distance(j, k) > sched_domains_numa_distance[i])
+                               if (arch_sched_node_distance(j, k) >
+                                   sched_domains_numa_distance[i])
                                        continue;
 
                                cpumask_or(mask, mask, cpumask_of_node(k));
@@ -2059,7 +2118,6 @@ void sched_init_numa(int offline_node)
        sched_domain_topology = tl;
 
        sched_domains_numa_levels = nr_levels;
-       WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
 
        init_numa_topology_type(offline_node);
 }
@@ -2067,14 +2125,18 @@ void sched_init_numa(int offline_node)
 
 static void sched_reset_numa(void)
 {
-       int nr_levels, *distances;
+       int nr_levels, *distances, *dom_distances = NULL;
        struct cpumask ***masks;
 
        nr_levels = sched_domains_numa_levels;
+       sched_numa_node_levels = 0;
        sched_domains_numa_levels = 0;
        sched_max_numa_distance = 0;
        sched_numa_topology_type = NUMA_DIRECT;
-       distances = sched_domains_numa_distance;
+       distances = sched_numa_node_distance;
+       if (sched_numa_node_distance != sched_domains_numa_distance)
+               dom_distances = sched_domains_numa_distance;
+       rcu_assign_pointer(sched_numa_node_distance, NULL);
        rcu_assign_pointer(sched_domains_numa_distance, NULL);
        masks = sched_domains_numa_masks;
        rcu_assign_pointer(sched_domains_numa_masks, NULL);
@@ -2083,6 +2145,7 @@ static void sched_reset_numa(void)
 
                synchronize_rcu();
                kfree(distances);
+               kfree(dom_distances);
                for (i = 0; i < nr_levels && masks; i++) {
                        if (!masks[i])
                                continue;
@@ -2129,7 +2192,8 @@ void sched_domains_numa_masks_set(unsigned int cpu)
                                continue;
 
                        /* Set ourselves in the remote node's masks */
-                       if (node_distance(j, node) <= sched_domains_numa_distance[i])
+                       if (arch_sched_node_distance(j, node) <=
+                           sched_domains_numa_distance[i])
                                cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
                }
        }