/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <linux/netdevice.h>
+#include <net/if.h>
#include <net/if_arp.h>
#include <unistd.h>
#include "netif-util.h"
#include "netlink-util.h"
#include "network-util.h"
+#include "numa-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "proc-cmdline.h"
#include "random-util.h"
#include "socket-util.h"
+#include "sort-util.h"
#include "specifier.h"
#include "stat-util.h"
+#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
return 0;
}
+/* CPU topology information for IRQ affinity spread algorithm.
+ *
+ * One entry describes a single logical CPU found below /sys/devices/system/cpu.
+ * Entries are populated by discover_cpu_topology(), which skips CPUs whose
+ * "online" attribute reads 0. */
+typedef struct CPUTopology {
+ unsigned cpu; /* Logical CPU number, parsed from the "cpuN" directory name */
+ unsigned numa_node; /* NUMA node of the CPU; set to 0 when the lookup fails */
+ unsigned package_id; /* topology/physical_package_id; set to 0 when unreadable */
+ unsigned die_id; /* L3 cache domain / chiplet; a logical ID assigned by assign_sequential_die_ids(), not read from the kernel's die_id */
+ unsigned core_id; /* topology/core_id; set to the CPU number when unreadable */
+ bool is_first_thread; /* First hyperthread of a physical core; also true when the sibling list cannot be read */
+} CPUTopology;
+
+/* Die (L3 cache domain) information for spread algorithm.
+ *
+ * Arrays of DieInfo are built by build_die_info() and released with
+ * die_info_free(), which also frees each entry's cpus array. */
+typedef struct DieInfo {
+ unsigned die_id; /* Same value as CPUTopology.die_id of the member CPUs */
+ unsigned *cpus; /* CPUs in this die (first HT only); left in equidistant-permuted order by build_die_info() */
+ size_t cpu_count; /* Number of valid entries in cpus */
+ size_t next_idx; /* For round-robin within die: index of the next CPU to hand out */
+} DieInfo;
+
+/* Parses the leading CPU number out of a CPU's "topology/thread_siblings_list"
+ * attribute (e.g. "3" from "3,67" or from "3-4") and stores it in *ret.
+ *
+ * Returns the negative errno from the attribute read, -ENOMEM on allocation
+ * failure, or otherwise whatever safe_atou() returns for the extracted prefix. */
+static int cpu_topology_get_first_thread(sd_device *cpu_node, unsigned *ret) {
+        _cleanup_free_ char *head = NULL;
+        const char *siblings;
+        size_t head_len;
+        int r;
+
+        assert(cpu_node);
+        assert(ret);
+
+        r = sd_device_get_sysattr_value(cpu_node, "topology/thread_siblings_list", &siblings);
+        if (r < 0)
+                return r;
+
+        /* The first entry ends at the first list (',') or range ('-') separator, or at the end of the string. */
+        head_len = strcspn(siblings, ",-");
+
+        head = strndup(siblings, head_len);
+        if (!head)
+                return -ENOMEM;
+
+        return safe_atou(head, ret);
+}
+
+/* Ordering used for the final topology array: die (L3 cache domain) is the
+ * primary key, core ID the secondary key, and the CPU number breaks ties. */
+static int cpu_topology_compare(const CPUTopology *a, const CPUTopology *b) {
+        int order;
+
+        assert(a);
+        assert(b);
+
+        order = CMP(a->die_id, b->die_id);
+        if (order == 0)
+                order = CMP(a->core_id, b->core_id);
+        if (order == 0)
+                order = CMP(a->cpu, b->cpu);
+
+        return order;
+}
+
+/* Orders two topology entries by ascending CPU number only. Used to obtain a
+ * stable discovery order before die IDs are assigned. */
+static int cpu_number_compare(const CPUTopology *a, const CPUTopology *b) {
+        assert(a);
+        assert(b);
+
+        if (a->cpu < b->cpu)
+                return -1;
+
+        return a->cpu > b->cpu ? 1 : 0;
+}
+
+/* Assign logical die IDs based on L3 cache sharing topology.
+ *
+ * For IRQ spreading, the goal is to distribute interrupts across CPUs that
+ * don't share cache, minimizing cache line contention when processing packets.
+ * The L3 cache boundary is the key locality domain: CPUs sharing an L3 can
+ * exchange data cheaply, while cross-L3 communication is expensive.
+ *
+ * We use L3 shared_cpu_list rather than the kernel's physical die_id because:
+ * - On AMD EPYC, multiple CCXs on the same physical die have separate L3 caches
+ * - On Intel with Sub-NUMA Clustering, one die may have multiple L3 domains
+ * - L3 sharing reflects actual data locality, not physical packaging
+ *
+ * CPUs whose L3 information cannot be read are grouped by physical package
+ * instead. Those groups are numbered from the same sequential counter as the
+ * L3 groups, so a package-based group can never end up with the same die ID
+ * as an unrelated L3 group.
+ *
+ * Side effect: the array is re-sorted by CPU number. Returns 0 on success or a
+ * negative errno if a CPU device cannot be opened or memory runs out. */
+static int assign_sequential_die_ids(CPUTopology *cpus, size_t count) {
+        _cleanup_strv_free_ char **groups = NULL;
+        int r;
+
+        assert(cpus);
+
+        /* First, sort CPUs by CPU number for consistent discovery order */
+        typesafe_qsort(cpus, count, cpu_number_compare);
+
+        /* Assign die IDs based on the order in which each group key is first seen */
+        FOREACH_ARRAY(cpu, cpus, count) {
+                _cleanup_(sd_device_unrefp) sd_device *cpu_node = NULL;
+                char cpu_path[STRLEN("/sys/devices/system/cpu/cpu") + DECIMAL_STR_MAX(unsigned) + 1];
+                char package_key[STRLEN("package:") + DECIMAL_STR_MAX(unsigned) + 1];
+                const char *key;
+                unsigned die_id = 0;
+                bool found = false;
+
+                xsprintf(cpu_path, "/sys/devices/system/cpu/cpu%u", cpu->cpu);
+                r = sd_device_new_from_syspath(&cpu_node, cpu_path);
+                if (r < 0)
+                        return r;
+
+                r = sd_device_get_sysattr_value(cpu_node, "cache/index3/shared_cpu_list", &key);
+                if (r < 0) {
+                        /* No L3 info, fall back to grouping by package. The "package:" prefix cannot
+                         * occur in a shared_cpu_list, so the two kinds of key never compare equal. */
+                        xsprintf(package_key, "package:%u", cpu->package_id);
+                        key = package_key;
+                }
+
+                /* Check if we've seen this group before; its position in the list is its die ID */
+                STRV_FOREACH(g, groups) {
+                        if (streq(*g, key)) {
+                                found = true;
+                                break;
+                        }
+                        die_id++;
+                }
+
+                /* For a new group die_id now equals the list length, i.e. the next sequential ID */
+                cpu->die_id = die_id;
+
+                if (!found) {
+                        r = strv_extend(&groups, key);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+/* Enumerates the "cpuN" children of /sys/devices/system/cpu and builds one
+ * CPUTopology entry per CPU that is online and has a topology directory.
+ *
+ * On success returns 0 and hands a newly allocated array (to be freed by the
+ * caller) back through *ret, with its length in *ret_count; the array is
+ * sorted with cpu_topology_compare(). Returns -ENOENT when no usable CPU was
+ * found, -ENOMEM on allocation failure, or another negative errno from device
+ * lookup or die ID assignment. */
+static int discover_cpu_topology(CPUTopology **ret, size_t *ret_count) {
+        _cleanup_(sd_device_unrefp) sd_device *cpu_root = NULL;
+        _cleanup_free_ CPUTopology *cpus = NULL;
+        const char *child_name;
+        size_t n_cpus = 0;
+        int r;
+
+        assert(ret);
+        assert(ret_count);
+
+        r = sd_device_new_from_syspath(&cpu_root, "/sys/devices/system/cpu");
+        if (r < 0)
+                return r;
+
+        FOREACH_DEVICE_CHILD_WITH_SUFFIX(cpu_root, cpu_node, child_name) {
+                char topo_path[STRLEN("/sys/devices/system/cpu/cpu/topology") + DECIMAL_STR_MAX(unsigned) + 1];
+                unsigned cpu, online, first_thread;
+                const char *number;
+                CPUTopology *entry;
+
+                /* Only children named "cpu<number>" are of interest */
+                number = startswith(child_name, "cpu");
+                if (!number)
+                        continue;
+
+                r = safe_atou(number, &cpu);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to convert %s to unsigned, skipping: %m", number);
+                        continue;
+                }
+
+                /* A missing 'online' attribute is treated as "online" (CPU 0 lacks the file);
+                 * any other read error, or an explicit 0, skips the CPU. */
+                r = device_get_sysattr_unsigned(cpu_node, "online", &online);
+                if (r < 0 && r != -ENOENT)
+                        continue;
+                if (r >= 0 && online == 0)
+                        continue;
+
+                /* Skip CPUs whose topology directory is not accessible */
+                xsprintf(topo_path, "/sys/devices/system/cpu/cpu%u/topology", cpu);
+                if (access(topo_path, F_OK) < 0) {
+                        log_debug_errno(errno, "Failed to access %s, ignoring: %m", topo_path);
+                        continue;
+                }
+
+                if (!GREEDY_REALLOC(cpus, n_cpus + 1))
+                        return -ENOMEM;
+
+                entry = cpus + n_cpus;
+                entry->cpu = cpu;
+
+                r = numa_get_node_from_cpu(cpu, &entry->numa_node);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to get NUMA node for CPU %u, assuming NUMA node 0: %m", cpu);
+                        entry->numa_node = 0;
+                }
+
+                r = device_get_sysattr_unsigned(cpu_node, "topology/physical_package_id", &entry->package_id);
+                if (r < 0) {
+                        log_device_debug_errno(cpu_node, r, "Failed to get physical_package_id, assuming package ID 0: %m");
+                        entry->package_id = 0;
+                }
+
+                /* Placeholder only; the real value comes from assign_sequential_die_ids() below */
+                entry->die_id = 0;
+
+                r = device_get_sysattr_unsigned(cpu_node, "topology/core_id", &entry->core_id);
+                if (r < 0) {
+                        log_device_debug_errno(cpu_node, r, "Failed to get core_id, assuming core ID %u: %m", cpu);
+                        entry->core_id = cpu;
+                }
+
+                /* Without sibling information every CPU counts as the first thread of its core */
+                r = cpu_topology_get_first_thread(cpu_node, &first_thread);
+                entry->is_first_thread = r < 0 || first_thread == cpu;
+
+                n_cpus++;
+        }
+
+        if (n_cpus == 0)
+                return -ENOENT;
+
+        /* Assign sequential die IDs based on L3 discovery order */
+        r = assign_sequential_die_ids(cpus, n_cpus);
+        if (r < 0)
+                return r;
+
+        /* Final order: die, then core, then CPU number */
+        typesafe_qsort(cpus, n_cpus, cpu_topology_compare);
+
+        *ret = TAKE_PTR(cpus);
+        *ret_count = n_cpus;
+
+        return 0;
+}
+
+/* Permutes 'indices' in place so that neighbouring output elements come from
+ * far-apart positions of the input.
+ *
+ * The array is cut into a front half (the larger one when the length is odd)
+ * and a back half; each half is permuted recursively and the two results are
+ * then interleaved front, back, front, back, ...
+ *
+ * Worked example for [0,1,2,3,4,5,6,7]:
+ *   halves               [0,1,2,3]   [4,5,6,7]
+ *   after recursion      [0,2,1,3]   [4,6,5,7]
+ *   after interleaving   [0,4,2,6,1,5,3,7]
+ *
+ * As a result any prefix of the output is spread roughly evenly over the
+ * original range, which is what we want when a NIC has fewer IRQs than there
+ * are CPUs: the CPUs picked first are not all clustered at the beginning.
+ *
+ * Returns 0 on success, or the result of log_oom() if a scratch buffer cannot
+ * be allocated. */
+static int equidist_permute(size_t *indices, size_t n_indices) {
+        _cleanup_free_ size_t *front = NULL, *back = NULL;
+        size_t n_front, n_back;
+        int r;
+
+        assert(indices);
+
+        /* Zero or one element: nothing to reorder */
+        if (n_indices <= 1)
+                return 0;
+
+        n_front = DIV_ROUND_UP(n_indices, 2);
+        n_back = n_indices - n_front;
+
+        /* Work on private copies of both halves so the interleave can write straight into 'indices' */
+        front = newdup(size_t, indices, n_front);
+        back = newdup(size_t, indices + n_front, n_back);
+        if (!front || !back)
+                return log_oom();
+
+        r = equidist_permute(front, n_front);
+        if (r < 0)
+                return r;
+
+        r = equidist_permute(back, n_back);
+        if (r < 0)
+                return r;
+
+        /* Even output slots take from the front half, odd slots from the back half. Since the front
+         * half holds ceil(n/2) elements and the back half floor(n/2), i / 2 is always in range. */
+        for (size_t i = 0; i < n_indices; i++)
+                indices[i] = i % 2 == 0 ? front[i / 2] : back[i / 2];
+
+        return 0;
+}
+
+/* Releases an array of 'count' DieInfo entries: first each entry's CPU list,
+ * then the array itself. 'dies' may be NULL only when 'count' is zero. */
+static void die_info_free(DieInfo *dies, size_t count) {
+        assert(dies || count == 0);
+
+        for (size_t i = 0; i < count; i++)
+                free(dies[i].cpus);
+
+        free(dies);
+}
+
+/* Groups the first-hyperthread CPUs of 'topology' by die ID (L3 domain).
+ *
+ * Dies appear in the result in the order their die ID is first encountered
+ * while walking 'topology'; no sorting is done here. Within every die the CPU
+ * list is reordered with equidist_permute(), so that consecutive picks from
+ * the list are spread over the die.
+ *
+ * On success returns 0 and passes ownership of the array to the caller via
+ * *ret / *ret_count (release with die_info_free()). On failure returns a
+ * negative errno and frees everything built so far. */
+static int build_die_info(const CPUTopology *topology, size_t topology_count, DieInfo **ret, size_t *ret_count) {
+        DieInfo *dies = NULL;
+        size_t die_count = 0;
+        int r;
+
+        assert(topology);
+        assert(ret);
+        assert(ret_count);
+
+        /* Frees 'dies' on every exit path unless ownership is taken with TAKE_PTR() below */
+        CLEANUP_ARRAY(dies, die_count, die_info_free);
+
+        for (size_t i = 0; i < topology_count; i++) {
+                const CPUTopology *t = topology + i;
+                DieInfo *slot = NULL;
+
+                /* SMT siblings other than the first thread are left out of the spread */
+                if (!t->is_first_thread)
+                        continue;
+
+                /* Look for an existing entry for this die ... */
+                FOREACH_ARRAY(d, dies, die_count)
+                        if (d->die_id == t->die_id) {
+                                slot = d;
+                                break;
+                        }
+
+                /* ... and append a fresh, zero-initialized one if there is none */
+                if (!slot) {
+                        if (!GREEDY_REALLOC(dies, die_count + 1))
+                                return log_oom();
+
+                        slot = dies + die_count;
+                        *slot = (DieInfo) { .die_id = t->die_id };
+                        die_count++;
+                }
+
+                if (!GREEDY_REALLOC(slot->cpus, slot->cpu_count + 1))
+                        return log_oom();
+
+                slot->cpus[slot->cpu_count] = t->cpu;
+                slot->cpu_count++;
+        }
+
+        /* Apply the equidistant permutation to the CPU list of each die */
+        for (size_t d = 0; d < die_count; d++) {
+                _cleanup_free_ unsigned *permuted = NULL;
+                _cleanup_free_ size_t *order = NULL;
+                DieInfo *die = dies + d;
+
+                order = new(size_t, die->cpu_count);
+                if (!order)
+                        return log_oom();
+
+                /* Start from the identity mapping and let equidist_permute() shuffle it */
+                for (size_t j = 0; j < die->cpu_count; j++)
+                        order[j] = j;
+
+                r = equidist_permute(order, die->cpu_count);
+                if (r < 0)
+                        return r;
+
+                permuted = new(unsigned, die->cpu_count);
+                if (!permuted)
+                        return log_oom();
+
+                for (size_t j = 0; j < die->cpu_count; j++)
+                        permuted[j] = die->cpus[order[j]];
+
+                /* Copy back rather than swapping pointers, so die->cpus keeps its original allocation */
+                memcpy(die->cpus, permuted, sizeof(unsigned) * die->cpu_count);
+        }
+
+        *ret = TAKE_PTR(dies);
+        *ret_count = die_count;
+
+        return 0;
+}
+
+/* Chooses the CPUs that IRQs should be pinned to, one CPU per IRQ, spread over
+ * the machine's cache topology.
+ *
+ * How it works:
+ * 1. build_die_info() groups the first hyperthreads by die (L3 cache domain)
+ *    and permutes the CPUs within each die equidistantly
+ * 2. The order in which dies are visited is permuted the same way, so that
+ *    consecutive picks are far apart (e.g. [0,1,2,3] -> [0,2,1,3])
+ * 3. Dies are visited round-robin, taking one CPU from each die per pass
+ * 4. Once every die has handed out all its CPUs and IRQs are still left, all
+ *    dies are rewound and the same CPUs are handed out again
+ *
+ * This gives every IRQ its own physical core before any core has to serve a
+ * second one. Two IRQs on one physical core time-share but benefit from warm
+ * cache, whereas spreading across SMT siblings causes resource contention with
+ * no cache benefit. It also keeps consecutively assigned IRQs physically far
+ * apart, which helps cache distribution even when only a few IRQs exist.
+ *
+ * On success returns 0; *ret receives a newly allocated array (owned by the
+ * caller) and *ret_count the number of entries filled in, which is at most
+ * n_irqs. Returns -ENOENT if no die could be built, -ENOMEM on allocation
+ * failure, or an error propagated from the helpers. */
+static int select_spread_cpus(
+                const CPUTopology *topology,
+                size_t topology_count,
+                size_t n_irqs,
+                unsigned **ret,
+                size_t *ret_count) {
+
+        _cleanup_free_ unsigned *picked = NULL;
+        _cleanup_free_ size_t *visit_order = NULL;
+        DieInfo *dies = NULL;
+        size_t die_count = 0, n_picked = 0, n_drained = 0;
+        int r;
+
+        assert(topology);
+        assert(ret);
+        assert(ret_count);
+
+        CLEANUP_ARRAY(dies, die_count, die_info_free);
+
+        picked = new(unsigned, n_irqs);
+        if (!picked)
+                return -ENOMEM;
+
+        /* Per-die CPU lists, first hyperthreads only */
+        r = build_die_info(topology, topology_count, &dies, &die_count);
+        if (r < 0)
+                return r;
+
+        if (die_count == 0)
+                return -ENOENT;
+
+        /* Identity mapping over the dies, turned into an equidistant visiting order */
+        visit_order = new(size_t, die_count);
+        if (!visit_order)
+                return -ENOMEM;
+
+        for (size_t i = 0; i < die_count; i++)
+                visit_order[i] = i;
+
+        r = equidist_permute(visit_order, die_count);
+        if (r < 0)
+                return r;
+
+        while (n_picked < n_irqs) {
+                size_t n_before = n_picked;
+
+                /* One pass: take at most one CPU from every die that still has some left */
+                for (size_t i = 0; i < die_count; i++) {
+                        DieInfo *die;
+
+                        if (n_picked >= n_irqs)
+                                break;
+
+                        die = dies + visit_order[i];
+                        if (die->next_idx < die->cpu_count) {
+                                picked[n_picked++] = die->cpus[die->next_idx++];
+
+                                /* Count the die as drained the moment its last CPU is handed out */
+                                if (die->next_idx >= die->cpu_count)
+                                        n_drained++;
+                        }
+                }
+
+                /* Something was picked in this pass, go for another one */
+                if (n_picked != n_before)
+                        continue;
+
+                /* No progress although not every die is recorded as drained: give up instead of looping */
+                if (n_drained < die_count)
+                        break;
+
+                /* Every first hyperthread was used once; rewind all dies and reuse them for the remaining IRQs */
+                for (size_t i = 0; i < die_count; i++)
+                        dies[i].next_idx = 0;
+                n_drained = 0;
+        }
+
+        *ret = TAKE_PTR(picked);
+        *ret_count = n_picked;
+
+        return 0;
+}
+
static int set_irq_affinity(Link *link, unsigned irq, unsigned cpu) {
_cleanup_free_ char *affinity_path = NULL, *mask_str = NULL;
unsigned n_groups = cpu / 32;
return 0;
}
+/* Implements the 'spread' IRQ affinity policy for a link.
+ *
+ * Collects the IRQ numbers listed in the device's "device/msi_irqs" directory,
+ * discovers the CPU topology, lets select_spread_cpus() pick one CPU per IRQ,
+ * and pins the IRQs (in ascending IRQ number order) to the picked CPUs.
+ *
+ * Returns 0 on success, and also when the device has no msi_irqs directory or
+ * the directory is empty. Failures to pin an individual IRQ are ignored.
+ * Returns a negative errno if the directory cannot be read, an entry is not a
+ * valid IRQ number, memory runs out, or topology discovery / CPU selection
+ * fails. */
+static int link_apply_irq_affinity_spread(Link *link) {
+        _cleanup_closedir_ DIR *dir = NULL;
+        _cleanup_free_ CPUTopology *topology = NULL;
+        _cleanup_free_ unsigned *irqs = NULL;
+        _cleanup_free_ unsigned *spread_cpus = NULL;
+        size_t topology_count = 0, irq_count = 0, spread_count = 0, core_count = 0;
+        int r;
+
+        assert(link);
+
+        r = device_opendir(link->event->dev, "device/msi_irqs", &dir);
+        if (r < 0) {
+                if (r != -ENOENT)
+                        return log_link_error_errno(link, r, "Failed to open device/msi_irqs: %m");
+                log_link_debug_errno(link, r, "No MSI IRQs found, skipping IRQ affinity configuration: %m");
+                return 0;
+        }
+
+        /* Every directory entry is expected to be named after an IRQ number */
+        FOREACH_DIRENT(de, dir, return log_link_error_errno(link, errno, "Failed to read directory device/msi_irqs: %m")) {
+                unsigned irq;
+
+                r = safe_atou(de->d_name, &irq);
+                if (r < 0)
+                        return log_link_error_errno(link, r, "Failed to parse IRQ number '%s': %m", de->d_name);
+
+                if (!GREEDY_REALLOC(irqs, irq_count + 1))
+                        return log_oom();
+
+                irqs[irq_count++] = irq;
+        }
+
+        if (irq_count == 0) {
+                log_link_debug(link, "No IRQs found, skipping spread.");
+                return 0;
+        }
+
+        /* Directory order is arbitrary; sort so the IRQ -> CPU mapping is reproducible */
+        typesafe_qsort(irqs, irq_count, cmp_unsigned);
+
+        r = discover_cpu_topology(&topology, &topology_count);
+        if (r < 0)
+                return log_link_error_errno(link, r, "Failed to discover CPU topology: %m");
+
+        log_link_debug(link, "Discovered %zu CPUs, spreading %zu IRQs.", topology_count, irq_count);
+
+        /* Select CPUs using maximum distance algorithm */
+        r = select_spread_cpus(topology, topology_count, irq_count, &spread_cpus, &spread_count);
+        if (r < 0)
+                return log_link_error_errno(link, r, "Failed to select spread CPUs: %m");
+
+        /* Best effort: a failure to pin one IRQ must not prevent pinning the others */
+        for (size_t i = 0; i < spread_count; i++)
+                (void) set_irq_affinity(link, irqs[i], spread_cpus[i]);
+
+        /* Only first hyperthreads are ever selected, and each of them is handed out once before any is
+         * reused, so the number of distinct CPUs in use is bounded by the number of first hyperthreads
+         * (not by the number of logical CPUs) and by the number of IRQs that were actually assigned. */
+        FOREACH_ARRAY(t, topology, topology_count)
+                if (t->is_first_thread)
+                        core_count++;
+
+        log_link_info(link, "Applied IRQ affinity policy 'spread' across %zu CPUs for %zu IRQs.",
+                      MIN(core_count, spread_count), spread_count);
+
+        return 0;
+}
+
static int link_apply_irq_affinity_single(Link *link) {
_cleanup_closedir_ DIR *dir = NULL;
int r;
switch (link->config->irq_affinity_policy) {
case IRQ_AFFINITY_POLICY_SINGLE:
return link_apply_irq_affinity_single(link);
+ case IRQ_AFFINITY_POLICY_SPREAD:
+ return link_apply_irq_affinity_spread(link);
default:
assert_not_reached();
}
/* String name of every IRQAffinityPolicy value, indexed by the enum value.
 * NOTE(review): DEFINE_STRING_TABLE_LOOKUP() presumably generates the
 * string <-> enum conversion helpers from this table — confirm against
 * string-table.h. */
static const char* const irq_affinity_policy_table[_IRQ_AFFINITY_POLICY_MAX] = {
 [IRQ_AFFINITY_POLICY_SINGLE] = "single",
+ [IRQ_AFFINITY_POLICY_SPREAD] = "spread",
};
DEFINE_STRING_TABLE_LOOKUP(irq_affinity_policy, IRQAffinityPolicy);