]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched/topology: Extract "imb_numa_nr" calculation into a separate helper
author: K Prateek Nayak <kprateek.nayak@amd.com>
Thu, 12 Mar 2026 04:44:27 +0000 (04:44 +0000)
committer: Peter Zijlstra <peterz@infradead.org>
Wed, 18 Mar 2026 08:06:48 +0000 (09:06 +0100)
Subsequent changes to assign "sd->shared" from "s_data" would
necessitate finding the topmost SD_SHARE_LLC to assign shared object to.

This is very similar to the "imb_numa_nr" computation loop except that
"imb_numa_nr" cares about the first domain without the SD_SHARE_LLC flag
(immediate parent of sd_llc) whereas the "sd->shared" assignment would
require sd_llc itself.

Extract the "imb_numa_nr" calculation into a helper,
adjust_numa_imbalance(), and use the current loop in
build_sched_domains() to find the sd_llc.

While at it, guard the call behind CONFIG_NUMA's status since
"imb_numa_nr" only makes sense on NUMA enabled configs with SD_NUMA
domains.

No functional changes intended.

Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-3-kprateek.nayak@amd.com
kernel/sched/topology.c

index 79bab80af8f2bbec3544b1f36dd89a2643068ecf..6303790a414370d75e894d82930cfd43b8f46906 100644 (file)
@@ -2549,6 +2549,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
        return true;
 }
 
+/*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+static void adjust_numa_imbalance(struct sched_domain *sd_llc)
+{
+       struct sched_domain *parent;
+       unsigned int imb_span = 1;
+       unsigned int imb = 0;
+       unsigned int nr_llcs;
+
+       /* Caller must hand us the topmost SD_SHARE_LLC domain, with a parent above it. */
+       WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+       WARN_ON(!sd_llc->parent);
+
+       /*
+        * For a single LLC per node, allow an
+        * imbalance up to 12.5% of the node. This is
+        * arbitrary cutoff based on two factors -- SMT and
+        * memory channels. For SMT-2, the intent is to
+        * avoid premature sharing of HT resources but
+        * SMT-4 or SMT-8 *may* benefit from a different
+        * cutoff. For memory channels, this is a very
+        * rough estimate of how many channels may be
+        * active and is based on recent CPUs with
+        * many cores.
+        *
+        * For multiple LLCs, allow an imbalance
+        * until multiple tasks would share an LLC
+        * on one node while LLCs on another node
+        * remain idle. This assumes that there are
+        * enough logical CPUs per LLC to avoid SMT
+        * factors and that there is a correlation
+        * between LLCs and memory channels.
+        */
+       nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+       if (nr_llcs == 1)
+               imb = sd_llc->parent->span_weight >> 3;
+       else
+               imb = nr_llcs;
+
+       imb = max(1U, imb);
+       sd_llc->parent->imb_numa_nr = imb;
+
+       /*
+        * Set span based on the first NUMA domain.
+        *
+        * NUMA systems always add a NODE domain before
+        * iterating the NUMA domains. Since this is before
+        * degeneration, start from sd_llc's parent's
+        * parent which is the lowest an SD_NUMA domain can
+        * be relative to sd_llc.
+        */
+       parent = sd_llc->parent->parent;
+       while (parent && !(parent->flags & SD_NUMA))
+               parent = parent->parent;
+
+       /* No SD_NUMA domain above: fall back to the LLC parent's span. */
+       imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
+
+       /* Update the upper remainder of the topology */
+       parent = sd_llc->parent;
+       while (parent) {
+               int factor = max(1U, (parent->span_weight / imb_span));
+
+               parent->imb_numa_nr = imb * factor;
+               parent = parent->parent;
+       }
+}
+
 /*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
@@ -2606,62 +2674,21 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
                }
        }
 
-       /*
-        * Calculate an allowed NUMA imbalance such that LLCs do not get
-        * imbalanced.
-        */
        for_each_cpu(i, cpu_map) {
-               unsigned int imb = 0;
-               unsigned int imb_span = 1;
+               sd = *per_cpu_ptr(d.sd, i);
+               if (!sd)
+                       continue;
 
-               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-                       struct sched_domain *child = sd->child;
-
-                       if (!(sd->flags & SD_SHARE_LLC) && child &&
-                           (child->flags & SD_SHARE_LLC)) {
-                               struct sched_domain __rcu *top_p;
-                               unsigned int nr_llcs;
-
-                               /*
-                                * For a single LLC per node, allow an
-                                * imbalance up to 12.5% of the node. This is
-                                * arbitrary cutoff based two factors -- SMT and
-                                * memory channels. For SMT-2, the intent is to
-                                * avoid premature sharing of HT resources but
-                                * SMT-4 or SMT-8 *may* benefit from a different
-                                * cutoff. For memory channels, this is a very
-                                * rough estimate of how many channels may be
-                                * active and is based on recent CPUs with
-                                * many cores.
-                                *
-                                * For multiple LLCs, allow an imbalance
-                                * until multiple tasks would share an LLC
-                                * on one node while LLCs on another node
-                                * remain idle. This assumes that there are
-                                * enough logical CPUs per LLC to avoid SMT
-                                * factors and that there is a correlation
-                                * between LLCs and memory channels.
-                                */
-                               nr_llcs = sd->span_weight / child->span_weight;
-                               if (nr_llcs == 1)
-                                       imb = sd->span_weight >> 3;
-                               else
-                                       imb = nr_llcs;
-                               imb = max(1U, imb);
-                               sd->imb_numa_nr = imb;
-
-                               /* Set span based on the first NUMA domain. */
-                               top_p = sd->parent;
-                               while (top_p && !(top_p->flags & SD_NUMA)) {
-                                       top_p = top_p->parent;
-                               }
-                               imb_span = top_p ? top_p->span_weight : sd->span_weight;
-                       } else {
-                               int factor = max(1U, (sd->span_weight / imb_span));
+               /* First, find the topmost SD_SHARE_LLC domain */
+               while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+                       sd = sd->parent;
 
-                               sd->imb_numa_nr = imb * factor;
-                       }
-               }
+               /*
+                * In presence of higher domains, adjust the
+                * NUMA imbalance stats for the hierarchy.
+                */
+               if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent)
+                       adjust_numa_imbalance(sd);
        }
 
        /* Calculate CPU capacity for physical packages and nodes */