]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
udev/net: implement IRQAffinityPolicy=spread with topology awareness
authorQuentin Deslandes <qde@naccy.de>
Mon, 16 Feb 2026 19:38:19 +0000 (20:38 +0100)
committerQuentin Deslandes <qde@naccy.de>
Wed, 20 May 2026 09:39:10 +0000 (11:39 +0200)
Implement the spread policy for IRQ affinity distribution using a
topology-aware algorithm. The algorithm:

1. Discovers CPU topology from sysfs (NUMA node, package, die/L3, core)
2. Groups CPUs by L3 cache domain (die) with equidistant ordering
3. Round-robins across dies, spreading IRQs across the system
4. Uses only the first hyperthread of each core, so every IRQ lands on a distinct physical core before any core is reused
5. Applies IRQ affinity via /proc/irq/<n>/smp_affinity

When there are more IRQs than eligible CPUs (first hyperthreads), the assignment wraps around and reuses the same CPUs in round-robin order.

src/basic/sort-util.c
src/basic/sort-util.h
src/shared/numa-util.c
src/shared/numa-util.h
src/udev/net/link-config.c
src/udev/net/link-config.h
test/units/TEST-17-UDEV.irq-affinity.sh

index a76a6c85a8219bbffff834b0051b7a17282a1fd8..8aca5cac769cdda89029a5767ad4904d9aa44c76 100644 (file)
@@ -81,3 +81,12 @@ int cmp_uint16(const uint16_t *a, const uint16_t *b) {
 
         return CMP(*a, *b);
 }
+
+int cmp_unsigned(const unsigned *a, const unsigned *b) {
+        /* Three-way comparator for arrays of unsigned. It runs inside qsort()'s inner loops, and a
+         * correctly implemented qsort() never passes NULL, hence we only mark the pointers with
+         * POINTER_MAY_BE_NULL to suppress the check instead of paying the runtime cost of assert(). */
+        POINTER_MAY_BE_NULL(a);
+        POINTER_MAY_BE_NULL(b);
+
+        const unsigned lhs = *a, rhs = *b;
+
+        return CMP(lhs, rhs);
+}
index 8e7a1991bb692d2ff89161be29f1a0abe2378ed8..37c196158eb388e89442822df6ccbfc1db9a6bfa 100644 (file)
@@ -44,3 +44,4 @@ void qsort_r_safe(void *base, size_t nmemb, size_t size, comparison_userdata_fn_
 
 int cmp_int(const int *a, const int *b);
 int cmp_uint16(const uint16_t *a, const uint16_t *b);
+int cmp_unsigned(const unsigned *a, const unsigned *b);
index 228ea7ad2d14f7f95d8595b14dd3d2114efb9363..9097ccbc313c72b71db88e01b38bd54a085668a1 100644 (file)
@@ -154,6 +154,63 @@ static int numa_max_node(void) {
         return max_node;
 }
 
+/* Determines the NUMA node the given CPU belongs to by scanning the nodeN/cpulist files below
+ * /sys/devices/system/node. Stores the node number in *ret and returns 0; if no node lists the CPU,
+ * node 0 is reported. Returns a negative errno only if the node directory cannot be opened. */
+int numa_get_node_from_cpu(unsigned cpu, unsigned *ret) {
+        _cleanup_closedir_ DIR *d = NULL;
+
+        assert(ret);
+
+        d = opendir("/sys/devices/system/node");
+        if (!d)
+                return -errno;
+
+        FOREACH_DIRENT(de, d, break) {
+                char p[STRLEN("/sys/devices/system/node/node/cpulist") + DECIMAL_STR_MAX(unsigned) + 1];
+                _cleanup_(cpu_set_done) CPUSet cpus = {};
+                _cleanup_free_ char *cpulist = NULL;
+                unsigned node;
+                int r;
+
+                /* Only directories named "node<N>" are of interest */
+                const char *suffix = de->d_type == DT_DIR ? startswith(de->d_name, "node") : NULL;
+                if (!suffix)
+                        continue;
+
+                r = safe_atou(suffix, &node);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to parse node number %s to unsigned, ignoring: %m", suffix);
+                        continue;
+                }
+
+                xsprintf(p, "/sys/devices/system/node/node%u/cpulist", node);
+
+                r = read_virtual_file(p, SIZE_MAX, &cpulist, /* ret_size= */ NULL);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to read %s, ignoring: %m", p);
+                        continue;
+                }
+
+                r = parse_cpu_set(cpulist, &cpus);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to parse cpu set %s, ignoring: %m", cpulist);
+                        continue;
+                }
+
+                if (!CPU_ISSET_S(cpu, cpus.allocated, cpus.set))
+                        continue;
+
+                /* This node lists our CPU, we are done */
+                *ret = node;
+                return 0;
+        }
+
+        /* No node claimed the CPU (or the directory scan ended early), fall back to node 0 */
+        log_debug("CPU %u not found in any NUMA node, assuming node 0.", cpu);
+        *ret = 0;
+
+        return 0;
+}
+
 int numa_mask_add_all(CPUSet *mask) {
         int m;
 
index c684dea803eca366694e2de979ae97925f40093f..6fec7c587baa8de20f5979ef955321e39e16f817 100644 (file)
@@ -31,6 +31,8 @@ static inline void numa_policy_reset(NUMAPolicy *p) {
 int apply_numa_policy(const NUMAPolicy *policy);
 int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret);
 
+int numa_get_node_from_cpu(unsigned cpu, unsigned *ret);
+
 int numa_mask_add_all(CPUSet *mask);
 
 DECLARE_STRING_TABLE_LOOKUP(mpol, int);
index 4a1512365d6b2305e9df6c92ab41ed390d955416..6211081d79a647605202847b04982e978df8a3eb 100644 (file)
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
 #include <linux/netdevice.h>
+#include <net/if.h>
 #include <net/if_arp.h>
 #include <unistd.h>
 
 #include "netif-util.h"
 #include "netlink-util.h"
 #include "network-util.h"
+#include "numa-util.h"
 #include "parse-util.h"
 #include "path-util.h"
 #include "proc-cmdline.h"
 #include "random-util.h"
 #include "socket-util.h"
+#include "sort-util.h"
 #include "specifier.h"
 #include "stat-util.h"
+#include "stdio-util.h"
 #include "string-table.h"
 #include "string-util.h"
 #include "strv.h"
@@ -924,6 +928,456 @@ static int link_apply_sr_iov_config(Link *link) {
         return 0;
 }
 
+/* Per-CPU topology record used by the IRQ affinity spread algorithm. One entry is filled in for each
+ * usable CPU by discover_cpu_topology(). */
+typedef struct CPUTopology {
+        unsigned cpu; /* CPU number, parsed from the "cpu<N>" sysfs name */
+        unsigned numa_node; /* As reported by numa_get_node_from_cpu(), 0 if that fails */
+        unsigned package_id; /* topology/physical_package_id, 0 if unreadable */
+        unsigned die_id; /* Logical L3 cache domain ID, set by assign_sequential_die_ids() */
+        unsigned core_id; /* topology/core_id, falls back to the CPU number if unreadable */
+        bool is_first_thread; /* CPU is the first entry of its thread_siblings_list (or the list is unreadable) */
+} CPUTopology;
+
+/* Per-die (L3 cache domain) state for the spread algorithm, built by build_die_info() */
+typedef struct DieInfo {
+        unsigned die_id; /* Matches CPUTopology.die_id of the member CPUs */
+        unsigned *cpus; /* CPUs in this die (first HT only), in the order they are handed out */
+        size_t cpu_count; /* Number of valid entries in cpus */
+        size_t next_idx; /* Index into cpus of the next CPU to hand out (round-robin within die) */
+} DieInfo;
+
+/* Reads topology/thread_siblings_list of the given CPU device and stores the first CPU number listed
+ * there in *ret. Returns a negative errno if the attribute cannot be read or parsed. */
+static int cpu_topology_get_first_thread(sd_device *cpu_node, unsigned *ret) {
+        _cleanup_free_ char *first = NULL;
+        const char *siblings;
+        int r;
+
+        assert(cpu_node);
+        assert(ret);
+
+        r = sd_device_get_sysattr_value(cpu_node, "topology/thread_siblings_list", &siblings);
+        if (r < 0)
+                return r;
+
+        /* The first entry ends at the first ',' or '-' separator, or at the end of the string */
+        first = strndup(siblings, strcspn(siblings, ",-"));
+        if (!first)
+                return -ENOMEM;
+
+        return safe_atou(first, ret);
+}
+
+/* Ordering used for the final topology array: by die (L3 cache grouping) first, then by core, and
+ * finally by CPU number as tie-breaker. */
+static int cpu_topology_compare(const CPUTopology *a, const CPUTopology *b) {
+        int r;
+
+        assert(a);
+        assert(b);
+
+        r = CMP(a->die_id, b->die_id);
+        if (r == 0)
+                r = CMP(a->core_id, b->core_id);
+        if (r == 0)
+                r = CMP(a->cpu, b->cpu);
+
+        return r;
+}
+
+/* Orders CPUTopology entries purely by ascending CPU number (used before die ID assignment) */
+static int cpu_number_compare(const CPUTopology *a, const CPUTopology *b) {
+        assert(a);
+        assert(b);
+
+        const unsigned lhs = a->cpu, rhs = b->cpu;
+
+        return CMP(lhs, rhs);
+}
+
+/* Assign logical die IDs based on L3 cache sharing topology.
+ *
+ * For IRQ spreading, the goal is to distribute interrupts across CPUs that
+ * don't share cache, minimizing cache line contention when processing packets.
+ * The L3 cache boundary is the key locality domain: CPUs sharing an L3 can
+ * exchange data cheaply, while cross-L3 communication is expensive.
+ *
+ * We use L3 shared_cpu_list rather than the kernel's physical die_id because:
+ * - On AMD EPYC, multiple CCXs on the same physical die have separate L3 caches
+ * - On Intel with Sub-NUMA Clustering, one die may have multiple L3 domains
+ * - L3 sharing reflects actual data locality, not physical packaging
+ *
+ * CPUs are visited in ascending CPU number order, and every distinct group key
+ * gets the next sequential ID (0, 1, 2, ...). The group key is the CPU's L3
+ * shared_cpu_list, or, if that is unavailable, a synthetic "package:<id>" key.
+ * Routing the fallback through the same numbering keeps package-based groups
+ * from colliding with the IDs of unrelated L3 groups.
+ *
+ * Sorts 'cpus' by CPU number as a side effect. Returns 0 on success, negative
+ * errno on failure. */
+static int assign_sequential_die_ids(CPUTopology *cpus, size_t count) {
+        _cleanup_strv_free_ char **groups = NULL;
+        int r;
+
+        assert(cpus);
+
+        /* First, sort CPUs by CPU number for consistent discovery order */
+        typesafe_qsort(cpus, count, cpu_number_compare);
+
+        /* Assign die IDs based on order of group discovery */
+        FOREACH_ARRAY(cpu, cpus, count) {
+                _cleanup_(sd_device_unrefp) sd_device *cpu_node = NULL;
+                char cpu_path[STRLEN("/sys/devices/system/cpu/cpu") + DECIMAL_STR_MAX(unsigned) + 1];
+                char package_key[STRLEN("package:") + DECIMAL_STR_MAX(unsigned) + 1];
+                const char *key;
+                unsigned die_id = 0;
+                bool found = false;
+
+                xsprintf(cpu_path, "/sys/devices/system/cpu/cpu%u", cpu->cpu);
+                r = sd_device_new_from_syspath(&cpu_node, cpu_path);
+                if (r < 0)
+                        return r;
+
+                r = sd_device_get_sysattr_value(cpu_node, "cache/index3/shared_cpu_list", &key);
+                if (r < 0) {
+                        /* No L3 info, group by package instead. The ':' keeps this key distinct from
+                         * any CPU list, which is split on ',' and '-' elsewhere in this file. */
+                        xsprintf(package_key, "package:%u", cpu->package_id);
+                        key = package_key;
+                }
+
+                /* Check if we've seen this group before; its position in the list is its die ID */
+                STRV_FOREACH(g, groups) {
+                        if (streq(*g, key)) {
+                                found = true;
+                                break;
+                        }
+                        die_id++;
+                }
+
+                /* Either the index of the known group, or the next free sequential ID */
+                cpu->die_id = die_id;
+
+                if (found)
+                        continue;
+
+                r = strv_extend(&groups, key);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Enumerates the CPUs below /sys/devices/system/cpu and returns a newly allocated array with one
+ * CPUTopology entry per CPU that is online and has a topology directory. Die IDs are assigned via
+ * assign_sequential_die_ids(), and the result is sorted with cpu_topology_compare(). Returns -ENOENT
+ * if no usable CPU was found, another negative errno on failure, 0 on success. */
+static int discover_cpu_topology(CPUTopology **ret, size_t *ret_count) {
+        _cleanup_(sd_device_unrefp) sd_device *parent_node = NULL;
+        _cleanup_free_ CPUTopology *cpus = NULL;
+        const char *name;
+        size_t count = 0;
+        int r;
+
+        assert(ret);
+        assert(ret_count);
+
+        r = sd_device_new_from_syspath(&parent_node, "/sys/devices/system/cpu");
+        if (r < 0)
+                return r;
+
+        FOREACH_DEVICE_CHILD_WITH_SUFFIX(parent_node, cpu_node, name) {
+                char topo_path[STRLEN("/sys/devices/system/cpu/cpu/topology") + DECIMAL_STR_MAX(unsigned) + 1];
+                unsigned cpu, online, first_thread;
+                CPUTopology *entry;
+
+                /* Only children named "cpu<N>" describe a CPU */
+                const char *n = startswith(name, "cpu");
+                if (!n)
+                        continue;
+
+                r = safe_atou(n, &cpu);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to convert %s to unsigned, skipping: %m", n);
+                        continue;
+                }
+
+                /* A missing 'online' attribute (e.g. CPU 0) is treated as online; any other read
+                 * failure or an explicit 0 makes us skip the CPU. */
+                r = device_get_sysattr_unsigned(cpu_node, "online", &online);
+                if (r != -ENOENT && (r < 0 || online == 0))
+                        continue;
+
+                /* Check if topology directory exists (filters out cpu0 on some systems) */
+                xsprintf(topo_path, "/sys/devices/system/cpu/cpu%u/topology", cpu);
+                if (access(topo_path, F_OK) < 0) {
+                        log_debug_errno(errno, "Failed to access %s, ignoring: %m", topo_path);
+                        continue;
+                }
+
+                if (!GREEDY_REALLOC(cpus, count + 1))
+                        return -ENOMEM;
+
+                entry = &cpus[count];
+                entry->cpu = cpu;
+
+                /* die_id will be assigned later by assign_sequential_die_ids() */
+                entry->die_id = 0;
+
+                r = numa_get_node_from_cpu(cpu, &entry->numa_node);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to get NUMA node for CPU %u, assuming NUMA node 0: %m", cpu);
+                        entry->numa_node = 0;
+                }
+
+                r = device_get_sysattr_unsigned(cpu_node, "topology/physical_package_id", &entry->package_id);
+                if (r < 0) {
+                        log_device_debug_errno(cpu_node, r, "Failed to get physical_package_id, assuming package ID 0: %m");
+                        entry->package_id = 0;
+                }
+
+                r = device_get_sysattr_unsigned(cpu_node, "topology/core_id", &entry->core_id);
+                if (r < 0) {
+                        log_device_debug_errno(cpu_node, r, "Failed to get core_id, assuming core ID %u: %m", cpu);
+                        entry->core_id = cpu;
+                }
+
+                /* If the sibling list is unavailable, treat the CPU as the first thread of its core */
+                r = cpu_topology_get_first_thread(cpu_node, &first_thread);
+                entry->is_first_thread = r < 0 || first_thread == cpu;
+
+                count++;
+        }
+
+        if (count == 0)
+                return -ENOENT;
+
+        /* Assign sequential die IDs based on L3 discovery order */
+        r = assign_sequential_die_ids(cpus, count);
+        if (r < 0)
+                return r;
+
+        /* Sort CPUs by topology for consistent ordering */
+        typesafe_qsort(cpus, count, cpu_topology_compare);
+
+        *ret_count = count;
+        *ret = TAKE_PTR(cpus);
+
+        return 0;
+}
+
+/* Permute 'indices' in place so that consecutive elements are spread far apart.
+ *
+ * The array is split into a front half (which receives the extra element if
+ * the length is odd) and a back half. Both halves are permuted recursively
+ * and then interleaved, starting with the front half.
+ *
+ * Example trace for [0,1,2,3,4,5,6,7]:
+ *   front [0,1,2,3] -> [0,2,1,3]
+ *   back  [4,5,6,7] -> [4,6,5,7]
+ *   interleaved     -> [0,4,2,6,1,5,3,7]
+ *
+ * As a result any prefix of the output is roughly evenly distributed over the
+ * original range. That is what we want when assigning IRQs to CPUs: a NIC with
+ * fewer IRQs than CPUs still gets CPUs from all over the range rather than
+ * only from its beginning.
+ *
+ * Returns 0 on success, or the result of log_oom() if memory runs out. */
+static int equidist_permute(size_t *indices, size_t n_indices) {
+        _cleanup_free_ size_t *front = NULL, *back = NULL;
+        int r;
+
+        assert(indices);
+
+        if (n_indices <= 1)
+                return 0;
+
+        /* For odd lengths the front half is one element longer than the back half */
+        size_t n_back = n_indices / 2, n_front = n_indices - n_back;
+
+        front = newdup(size_t, indices, n_front);
+        back = newdup(size_t, indices + n_front, n_back);
+        if (!front || !back)
+                return log_oom();
+
+        r = equidist_permute(front, n_front);
+        if (r < 0)
+                return r;
+
+        r = equidist_permute(back, n_back);
+        if (r < 0)
+                return r;
+
+        /* Interleave: front[0], back[0], front[1], back[1], ... */
+        for (size_t i = 0; i < n_back; i++) {
+                indices[2 * i] = front[i];
+                indices[2 * i + 1] = back[i];
+        }
+
+        /* With an odd length, the surplus front element goes last */
+        if (n_front > n_back)
+                indices[n_indices - 1] = front[n_front - 1];
+
+        return 0;
+}
+
+/* Frees an array of 'count' DieInfo entries, including the CPU list owned by each entry */
+static void die_info_free(DieInfo *dies, size_t count) {
+        assert(dies || count == 0);
+
+        for (size_t i = 0; i < count; i++)
+                free(dies[i].cpus);
+
+        free(dies);
+}
+
+/* Build die information from topology, grouping CPUs by L3/die and filtering to first HT only.
+ *
+ * Dies appear in the result in the order in which they are first seen in 'topology', and the CPUs of
+ * each die are collected in input order too. No sorting is done here: for a deterministic result the
+ * caller must pass an already sorted array, as produced by discover_cpu_topology(). Afterwards the
+ * CPU list of each die is reordered with equidist_permute().
+ *
+ * On success returns 0 and hands ownership of the array to the caller, who must release it with
+ * die_info_free(). Returns a negative errno on allocation failure. */
+static int build_die_info(const CPUTopology *topology, size_t topology_count, DieInfo **ret, size_t *ret_count) {
+        DieInfo *dies = NULL;
+        size_t die_count = 0;
+        int r;
+
+        assert(topology);
+        assert(ret);
+        assert(ret_count);
+
+        CLEANUP_ARRAY(dies, die_count, die_info_free);
+
+        FOREACH_ARRAY(cpu_topology, topology, topology_count) {
+                DieInfo *die = NULL;
+
+                /* Only consider first hyperthreads for initial spread */
+                if (!cpu_topology->is_first_thread)
+                        continue;
+
+                /* Find or create die entry */
+                for (size_t j = 0; j < die_count; j++)
+                        if (dies[j].die_id == cpu_topology->die_id) {
+                                die = &dies[j];
+                                break;
+                        }
+
+                if (!die) {
+                        if (!GREEDY_REALLOC(dies, die_count + 1))
+                                return log_oom();
+                        die = &dies[die_count++];
+                        *die = (DieInfo) { .die_id = cpu_topology->die_id };
+                }
+
+                if (!GREEDY_REALLOC(die->cpus, die->cpu_count + 1))
+                        return log_oom();
+
+                die->cpus[die->cpu_count++] = cpu_topology->cpu;
+        }
+
+        /* Apply the equidistant permutation to the CPUs within each die */
+        FOREACH_ARRAY(die, dies, die_count) {
+                _cleanup_free_ unsigned *reordered = NULL;
+                _cleanup_free_ size_t *indices = new(size_t, die->cpu_count);
+                if (!indices)
+                        return log_oom();
+
+                for (size_t j = 0; j < die->cpu_count; j++)
+                        indices[j] = j;
+
+                r = equidist_permute(indices, die->cpu_count);
+                if (r < 0)
+                        return r;
+
+                /* Reorder CPUs according to equidist permutation */
+                reordered = new(unsigned, die->cpu_count);
+                if (!reordered)
+                        return log_oom();
+
+                for (size_t j = 0; j < die->cpu_count; j++)
+                        reordered[j] = die->cpus[indices[j]];
+
+                /* Hand the permuted list over to the die instead of copying it back */
+                free(die->cpus);
+                die->cpus = TAKE_PTR(reordered);
+        }
+
+        *ret = TAKE_PTR(dies);
+        *ret_count = die_count;
+
+        return 0;
+}
+
+/* Pick one CPU per IRQ so that the IRQs are spread over the CPU topology.
+ *
+ * How it works:
+ * 1. build_die_info() groups the first hyperthreads by die (L3 cache domain)
+ *    and reorders the CPUs within each die with equidist_permute()
+ * 2. The order in which dies are visited is permuted the same way, so that
+ *    consecutive picks are far apart (e.g., [0,1,2,3] -> [0,2,1,3])
+ * 3. CPUs are handed out round-robin across dies, one CPU per die per round
+ * 4. Once every die has handed out all of its CPUs, all dies are reset and
+ *    the same CPUs are handed out again for the remaining IRQs
+ *
+ * This way every IRQ gets a physical core of its own before any core has to
+ * handle several IRQs. Two IRQs on one physical core time-share but benefit
+ * from a warm cache, whereas spreading across SMT siblings causes resource
+ * contention with no cache benefit.
+ *
+ * On success returns 0 and stores a newly allocated array of CPU numbers in
+ * *ret (one per IRQ, in assignment order) and its length in *ret_count.
+ * Returns -ENOENT if no die could be built, -ENOMEM or another negative errno
+ * on failure. */
+static int select_spread_cpus(
+                const CPUTopology *topology,
+                size_t topology_count,
+                size_t n_irqs,
+                unsigned **ret,
+                size_t *ret_count) {
+
+        _cleanup_free_ unsigned *selected = NULL;
+        _cleanup_free_ size_t *die_order = NULL;
+        DieInfo *dies = NULL;
+        size_t die_count = 0, selected_count = 0, n_exhausted = 0;
+        int r;
+
+        assert(topology);
+        assert(ret);
+        assert(ret_count);
+
+        CLEANUP_ARRAY(dies, die_count, die_info_free);
+
+        selected = new(unsigned, n_irqs);
+        if (!selected)
+                return -ENOMEM;
+
+        /* Collect the first hyperthreads, grouped by die */
+        r = build_die_info(topology, topology_count, &dies, &die_count);
+        if (r < 0)
+                return r;
+
+        if (die_count == 0)
+                return -ENOENT;
+
+        /* Start from the identity order of dies and spread it out */
+        die_order = new(size_t, die_count);
+        if (!die_order)
+                return -ENOMEM;
+
+        for (size_t i = 0; i < die_count; i++)
+                die_order[i] = i;
+
+        r = equidist_permute(die_order, die_count);
+        if (r < 0)
+                return r;
+
+        /* Each pass of the outer loop is one round: every die that still has CPUs left hands out one */
+        while (selected_count < n_irqs) {
+                size_t before = selected_count;
+
+                for (size_t i = 0; i < die_count; i++) {
+                        if (selected_count >= n_irqs)
+                                break;
+
+                        DieInfo *die = dies + die_order[i];
+
+                        if (die->next_idx < die->cpu_count) {
+                                selected[selected_count++] = die->cpus[die->next_idx++];
+
+                                /* Remember when this die just handed out its last CPU */
+                                if (die->next_idx >= die->cpu_count)
+                                        n_exhausted++;
+                        }
+                }
+
+                /* Something was picked in this round, go for the next one */
+                if (selected_count != before)
+                        continue;
+
+                /* Nothing was picked although not every die was counted as exhausted: give up
+                 * rather than loop forever */
+                if (n_exhausted < die_count)
+                        break;
+
+                /* All first HTs are used up, rewind every die and wrap around for the remaining IRQs */
+                for (size_t i = 0; i < die_count; i++)
+                        dies[i].next_idx = 0;
+                n_exhausted = 0;
+        }
+
+        *ret_count = selected_count;
+        *ret = TAKE_PTR(selected);
+
+        return 0;
+}
+
 static int set_irq_affinity(Link *link, unsigned irq, unsigned cpu) {
         _cleanup_free_ char *affinity_path = NULL, *mask_str = NULL;
         unsigned n_groups = cpu / 32;
@@ -954,6 +1408,64 @@ static int set_irq_affinity(Link *link, unsigned irq, unsigned cpu) {
         return 0;
 }
 
+/* Implements IRQAffinityPolicy=spread: collects the IRQ numbers listed in the device's
+ * device/msi_irqs directory, sorts them, discovers the CPU topology, selects one CPU per IRQ with
+ * select_spread_cpus() and pins each IRQ to its CPU via set_irq_affinity().
+ *
+ * Returns 0 if the policy was applied or if the device has no MSI IRQs. Failures of individual
+ * set_irq_affinity() calls are ignored. Returns a negative errno if the IRQ list cannot be read
+ * or parsed, or if topology discovery or CPU selection fails. */
+static int link_apply_irq_affinity_spread(Link *link) {
+        _cleanup_closedir_ DIR *dir = NULL;
+        _cleanup_free_ CPUTopology *topology = NULL;
+        _cleanup_free_ unsigned *irqs = NULL;
+        _cleanup_free_ unsigned *spread_cpus = NULL;
+        size_t topology_count = 0, irq_count = 0, spread_count = 0, n_cpus_used;
+        int r;
+
+        assert(link);
+
+        r = device_opendir(link->event->dev, "device/msi_irqs", &dir);
+        if (r < 0) {
+                if (r != -ENOENT)
+                        return log_link_error_errno(link, r, "Failed to open device/msi_irqs: %m");
+                log_link_debug_errno(link, r, "No MSI IRQs found, skipping IRQ affinity configuration: %m");
+                return 0;
+        }
+
+        FOREACH_DIRENT(de, dir, return log_link_error_errno(link, errno, "Failed to read directory device/msi_irqs: %m")) {
+                unsigned irq;
+
+                /* Each entry is expected to be named after an IRQ number */
+                r = safe_atou(de->d_name, &irq);
+                if (r < 0)
+                        return log_link_error_errno(link, r, "Failed to parse IRQ number '%s': %m", de->d_name);
+
+                if (!GREEDY_REALLOC(irqs, irq_count + 1))
+                        return log_oom();
+
+                irqs[irq_count++] = irq;
+        }
+
+        if (irq_count == 0) {
+                log_link_debug(link, "No IRQs found, skipping spread.");
+                return 0;
+        }
+
+        /* Directory order is arbitrary, sort so that the IRQ -> CPU mapping is reproducible */
+        typesafe_qsort(irqs, irq_count, cmp_unsigned);
+
+        r = discover_cpu_topology(&topology, &topology_count);
+        if (r < 0)
+                return log_link_error_errno(link, r, "Failed to discover CPU topology: %m");
+
+        log_link_debug(link, "Discovered %zu CPUs, spreading %zu IRQs.", topology_count, irq_count);
+
+        /* Select CPUs using maximum distance algorithm */
+        r = select_spread_cpus(topology, topology_count, irq_count, &spread_cpus, &spread_count);
+        if (r < 0)
+                return log_link_error_errno(link, r, "Failed to select spread CPUs: %m");
+
+        for (size_t i = 0; i < spread_count; i++)
+                (void) set_irq_affinity(link, irqs[i], spread_cpus[i]);
+
+        /* Only first hyperthreads are ever selected, hence the number of CPUs in use may be smaller
+         * than the number of discovered CPUs. select_spread_cpus() hands out CPUs in a fixed cyclic
+         * order, so the first reappearance of the initial CPU marks the number of distinct CPUs. */
+        n_cpus_used = spread_count;
+        for (size_t i = 1; i < spread_count; i++)
+                if (spread_cpus[i] == spread_cpus[0]) {
+                        n_cpus_used = i;
+                        break;
+                }
+
+        log_link_info(link, "Applied IRQ affinity policy 'spread' across %zu CPUs for %zu IRQs.",
+                      n_cpus_used, irq_count);
+
+        return 0;
+}
+
 static int link_apply_irq_affinity_single(Link *link) {
         _cleanup_closedir_ DIR *dir = NULL;
         int r;
@@ -1009,6 +1521,8 @@ static int link_apply_irq_affinity(Link *link) {
         switch (link->config->irq_affinity_policy) {
         case IRQ_AFFINITY_POLICY_SINGLE:
                 return link_apply_irq_affinity_single(link);
+        case IRQ_AFFINITY_POLICY_SPREAD:
+                return link_apply_irq_affinity_spread(link);
         default:
                 assert_not_reached();
         }
@@ -1499,6 +2013,7 @@ DEFINE_CONFIG_PARSE_ENUMV(config_parse_alternative_names_policy, alternative_nam
 
 static const char* const irq_affinity_policy_table[_IRQ_AFFINITY_POLICY_MAX] = {
         [IRQ_AFFINITY_POLICY_SINGLE] = "single",
+        [IRQ_AFFINITY_POLICY_SPREAD] = "spread",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(irq_affinity_policy, IRQAffinityPolicy);
index b6853cd922c99ff76c20e95feb87979aa2b2f76d..ff581d8b021eb261a78423348b54b71106e52c84 100644 (file)
@@ -24,6 +24,7 @@ typedef enum MACAddressPolicy {
 
 typedef enum IRQAffinityPolicy {
         IRQ_AFFINITY_POLICY_SINGLE,
+        IRQ_AFFINITY_POLICY_SPREAD,
         _IRQ_AFFINITY_POLICY_MAX,
         _IRQ_AFFINITY_POLICY_INVALID = -EINVAL,
 } IRQAffinityPolicy;
index 5fa8bd331f9d565ca0283119e02b7c2e59c76477..f9dde104fa6a720d7a9dbbf72a619753d0af4c96 100755 (executable)
@@ -85,6 +85,47 @@ EOF
         fi
     done
 
+    # Test 1b: test spread policy on the same interface
+    cat >/run/systemd/network/00-test-irq-affinity.link <<EOF
+[Match]
+MACAddress=$mac
+
+[Link]
+IRQAffinityPolicy=spread
+EOF
+
+    udevadm control --reload
+    udevadm trigger --action=add "/sys/class/net/$iface"
+    udevadm settle --timeout=30
+
+    # Get the number of online CPUs
+    n_cpus=$(nproc)
+    irq_count=$(echo "$irqs" | wc -w)
+
+    echo "System has $n_cpus CPUs, interface has $irq_count IRQs"
+
+    # Verify IRQs are spread (not all on CPU 0)
+    # With spread policy, if we have more than 1 CPU and more than 1 IRQ,
+    # at least some IRQs should be on different CPUs
+    if [[ "$n_cpus" -gt 1 ]] && [[ "$irq_count" -gt 1 ]]; then
+        cpu_set=()
+        for irq in $irqs; do
+            affinity=$(cat "/proc/irq/$irq/smp_affinity_list")
+            echo "IRQ $irq is on CPU(s): $affinity"
+            cpu_set+=("$affinity")
+        done
+
+        # Check that we have at least 2 different CPU assignments
+        unique_cpus=$(printf '%s\n' "${cpu_set[@]}" | sort -u | wc -l)
+        if [[ "$unique_cpus" -lt 2 ]]; then
+            echo "ERROR: spread policy should distribute IRQs across CPUs, but all are on same CPU"
+            exit 1
+        fi
+        echo "IRQ affinity policy 'spread' successfully distributed IRQs across $unique_cpus CPUs"
+    else
+        echo "Skipping spread verification (need >1 CPU and >1 IRQ)"
+    fi
+
     # Cleanup
     rm -f /run/systemd/network/00-test-irq-affinity.link
     udevadm control --reload