return ln->ncpus - rn->ncpus;
}
-/*
- * Allocate group number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated number is <= active CPU number of this node
- *
- * The actual allocated total groups may be less than @numgrps when
- * active total CPU number is less than @numgrps.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_groups(unsigned int numgrps,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_groups *node_groups)
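+/*
+ * Distribute @numgrps groups among the @num_nodes entries of
+ * @node_groups, which together cover @numcpus active CPUs. Entries are
+ * handled in ascending order of their CPU count, and every entry that
+ * has CPUs is allocated at least one group. Shared by the per-NUMA-node
+ * and per-cluster allocation paths.
+ */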
+static void alloc_groups_to_nodes(unsigned int numgrps,
+ unsigned int numcpus,
+ struct node_groups *node_groups,
+ unsigned int num_nodes)
{
- unsigned n, remaining_ncpus = 0;
-
- for (n = 0; n < nr_node_ids; n++) {
- node_groups[n].id = n;
- node_groups[n].ncpus = UINT_MAX;
- }
-
- for_each_node_mask(n, nodemsk) {
- unsigned ncpus;
-
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- ncpus = cpumask_weight(nmsk);
-
- if (!ncpus)
- continue;
- remaining_ncpus += ncpus;
- node_groups[n].ncpus = ncpus;
- }
+ unsigned int n, remaining_ncpus = numcpus;
+	unsigned int ngroups, ncpus;
+
- numgrps = min_t(unsigned, remaining_ncpus, numgrps);
-
- sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
+ sort(node_groups, num_nodes, sizeof(node_groups[0]),
ncpus_cmp_func, NULL);
/*
* finally for each node X: grps(X) <= ncpu(X).
*
*/
- for (n = 0; n < nr_node_ids; n++) {
- unsigned ngroups, ncpus;
+ for (n = 0; n < num_nodes; n++) {
if (node_groups[n].ncpus == UINT_MAX)
continue;
}
}
+/*
+ * Allocate the number of groups for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated number is <= the number of active CPUs on this node
+ *
+ * The actual total number of allocated groups may be less than @numgrps
+ * when the total number of active CPUs is less than @numgrps.
+ *
+ * The active CPUs of a node are those in '@cpu_mask AND
+ * @node_to_cpumask[]' for that node.
+ */
+static void alloc_nodes_groups(unsigned int numgrps,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ const nodemask_t nodemsk,
+ struct cpumask *nmsk,
+ struct node_groups *node_groups)
+{
+ unsigned int n, numcpus = 0;
+
+ for (n = 0; n < nr_node_ids; n++) {
+ node_groups[n].id = n;
+ node_groups[n].ncpus = UINT_MAX;
+ }
+
+ for_each_node_mask(n, nodemsk) {
+ unsigned int ncpus;
+
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ ncpus = cpumask_weight(nmsk);
+
+ if (!ncpus)
+ continue;
+ numcpus += ncpus;
+ node_groups[n].ncpus = ncpus;
+ }
+
+ numgrps = min_t(unsigned int, numcpus, numgrps);
+ alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids);
+}
+
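+/*
+ * Spread the @ncpus CPUs in @nmsk across the nv->ngroups groups
+ * allocated to this node or cluster, filling @masks starting at
+ * *@curgrp and wrapping back to group 0 once @last_grp is reached.
+ */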
+static void assign_cpus_to_groups(unsigned int ncpus,
+ struct cpumask *nmsk,
+ struct node_groups *nv,
+ struct cpumask *masks,
+ unsigned int *curgrp,
+ unsigned int last_grp)
+{
+	unsigned int v, cpus_per_grp, extra_grps;
+
+ /* Account for rounding errors */
+ extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
+
+	/* Spread the allocated groups over the CPUs of this node or cluster */
+ for (v = 0; v < nv->ngroups; v++, *curgrp += 1) {
+ cpus_per_grp = ncpus / nv->ngroups;
+
+ /* Account for extra groups to compensate rounding errors */
+ if (extra_grps) {
+ cpus_per_grp++;
+ --extra_grps;
+ }
+
+		/*
+		 * Wrapping has to be considered since the caller's
+		 * 'startgrp' may start anywhere.
+		 */
+ if (*curgrp >= last_grp)
+ *curgrp = 0;
+ grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp);
+ }
+}
+
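+/*
+ * Probe the CPU clusters covering @node_cpumask (@ncpus CPUs in total,
+ * @msk is scratch space) and distribute @ngroups groups among them. On
+ * success, the per-cluster cpumasks and their group counts are returned
+ * through @clusters_ptr and @cluster_groups_ptr (both kcalloc()'ed and
+ * freed by the caller) and the number of clusters is returned. Returns
+ * 0 when the cluster topology cannot be used, when @ngroups is smaller
+ * than the number of clusters, or on allocation failure.
+ */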
+static int alloc_cluster_groups(unsigned int ncpus,
+ unsigned int ngroups,
+ struct cpumask *node_cpumask,
+ cpumask_var_t msk,
+ const struct cpumask ***clusters_ptr,
+ struct node_groups **cluster_groups_ptr)
+{
+ unsigned int ncluster = 0;
+ unsigned int cpu, nc, n;
+ const struct cpumask *cluster_mask;
+ const struct cpumask **clusters;
+ struct node_groups *cluster_groups;
+
+ cpumask_copy(msk, node_cpumask);
+
+	/* Probe how many clusters there are in this node. */
+ while (1) {
+ cpu = cpumask_first(msk);
+ if (cpu >= nr_cpu_ids)
+ break;
+
+ cluster_mask = topology_cluster_cpumask(cpu);
+		if (cpumask_empty(cluster_mask))
+			goto no_cluster;
+		/* Clear the CPUs of this cluster from the probe mask. */
+ cpumask_andnot(msk, msk, cluster_mask);
+ ncluster++;
+ }
+
+	/* If ngroups < ncluster, cross-cluster groups are inevitable, so skip. */
+ if (ncluster == 0 || ncluster > ngroups)
+ goto no_cluster;
+
+	/* Allocate memory based on the number of clusters. */
+	clusters = kcalloc(ncluster, sizeof(*clusters), GFP_KERNEL);
+	if (!clusters)
+		goto no_cluster;
+	cluster_groups = kcalloc(ncluster, sizeof(*cluster_groups), GFP_KERNEL);
+ if (!cluster_groups)
+ goto fail_cluster_groups;
+
+	/* Fill in the cluster info for later processing. */
+ cpumask_copy(msk, node_cpumask);
+ for (n = 0; n < ncluster; n++) {
+ cpu = cpumask_first(msk);
+ cluster_mask = topology_cluster_cpumask(cpu);
+ nc = cpumask_weight_and(cluster_mask, node_cpumask);
+ clusters[n] = cluster_mask;
+ cluster_groups[n].id = n;
+ cluster_groups[n].ncpus = nc;
+ cpumask_andnot(msk, msk, cluster_mask);
+ }
+
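+	/* Distribute @ngroups groups among the probed clusters. */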
+ alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster);
+
+ *clusters_ptr = clusters;
+ *cluster_groups_ptr = cluster_groups;
+ return ncluster;
+
+ fail_cluster_groups:
+ kfree(clusters);
+ no_cluster:
+ return 0;
+}
+
+/*
+ * Try to group CPUs evenly for cluster locality within a NUMA node.
+ *
+ * Return: true on success, false otherwise.
+ */
+static bool __try_group_cluster_cpus(unsigned int ncpus,
+ unsigned int ngroups,
+ struct cpumask *node_cpumask,
+ struct cpumask *masks,
+ unsigned int *curgrp,
+ unsigned int last_grp)
+{
+ struct node_groups *cluster_groups;
+ const struct cpumask **clusters;
+ unsigned int ncluster;
+ bool ret = false;
+ cpumask_var_t nmsk;
+ unsigned int i, nc;
+
+ if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+ goto fail_nmsk_alloc;
+
+ ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk,
+ &clusters, &cluster_groups);
+
+ if (ncluster == 0)
+ goto fail_no_clusters;
+
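+	/* Spread the CPUs of each cluster over its allocated groups. */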
+ for (i = 0; i < ncluster; i++) {
+ struct node_groups *nv = &cluster_groups[i];
+
+ /* Get the cpus on this cluster. */
+ cpumask_and(nmsk, node_cpumask, clusters[nv->id]);
+ nc = cpumask_weight(nmsk);
+ if (!nc)
+ continue;
+ WARN_ON_ONCE(nv->ngroups > nc);
+
+ assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
+ }
+
+ ret = true;
+ kfree(cluster_groups);
+ kfree(clusters);
+ fail_no_clusters:
+ free_cpumask_var(nmsk);
+ fail_nmsk_alloc:
+ return ret;
+}
+
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk, struct cpumask *masks)
{
- unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
+ unsigned int i, n, nodes, done = 0;
unsigned int last_grp = numgrps;
unsigned int curgrp = startgrp;
nodemask_t nodemsk = NODE_MASK_NONE;
alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
nodemsk, nmsk, node_groups);
for (i = 0; i < nr_node_ids; i++) {
- unsigned int ncpus, v;
+ unsigned int ncpus;
struct node_groups *nv = &node_groups[i];
if (nv->ngroups == UINT_MAX)
WARN_ON_ONCE(nv->ngroups > ncpus);
- /* Account for rounding errors */
- extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
-
- /* Spread allocated groups on CPUs of the current node */
- for (v = 0; v < nv->ngroups; v++, curgrp++) {
- cpus_per_grp = ncpus / nv->ngroups;
-
- /* Account for extra groups to compensate rounding errors */
- if (extra_grps) {
- cpus_per_grp++;
- --extra_grps;
- }
-
- /*
- * wrapping has to be considered given 'startgrp'
- * may start anywhere
- */
- if (curgrp >= last_grp)
- curgrp = 0;
- grp_spread_init_one(&masks[curgrp], nmsk,
- cpus_per_grp);
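+		/*
+		 * Prefer cluster-aware grouping within this node; fall back
+		 * to spreading the node's CPUs directly when the cluster
+		 * topology cannot be used.
+		 */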
+ if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk,
+ masks, &curgrp, last_grp)) {
+ done += nv->ngroups;
+ continue;
}
+
+ assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp,
+ last_grp);
done += nv->ngroups;
}
kfree(node_groups);