]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
udev/net: add IRQAffinityNUMA= option for NUMA-aware filtering
authorQuentin Deslandes <qde@naccy.de>
Mon, 16 Feb 2026 19:43:37 +0000 (20:43 +0100)
committerQuentin Deslandes <qde@naccy.de>
Wed, 20 May 2026 09:39:11 +0000 (11:39 +0200)
Add support for filtering IRQ affinity to CPUs on a specific NUMA node
via the new IRQAffinityNUMA= option in .link files. The option accepts:
- "local": use the NUMA node local to the NIC's PCIe slot
- Explicit node number (0, 1, 2, ...): use CPUs on the specified node

When both IRQAffinity= and IRQAffinityNUMA= are specified, their
intersection is used. If the intersection is empty, a warning is logged
and IRQ affinity configuration is skipped.

When "local" is specified but the device's NUMA node cannot be
determined (numa_node shows -1), a warning is logged and IRQ affinity
configuration is skipped.

src/shared/numa-util.c
src/shared/numa-util.h
src/udev/net/link-config-gperf.gperf
src/udev/net/link-config.c
src/udev/net/link-config.h
test/units/TEST-17-UDEV.irq-affinity.sh

index 9097ccbc313c72b71db88e01b38bd54a085668a1..34ddc0e547f5a0d023069dd82eebbb97003c07c2 100644 (file)
@@ -89,6 +89,22 @@ int apply_numa_policy(const NUMAPolicy *policy) {
         return 0;
 }
 
+int numa_node_get_cpus(size_t node, CPUSet *ret) {
+        char p[STRLEN("/sys/devices/system/node/node//cpulist") + DECIMAL_STR_MAX(size_t) + 1];
+        _cleanup_free_ char *cpulist = NULL;
+        int r;
+
+        assert(ret);
+
+        xsprintf(p, "/sys/devices/system/node/node%zu/cpulist", node);
+
+        r = read_virtual_file(p, SIZE_MAX, &cpulist, /* ret_size= */ NULL);
+        if (r < 0)
+                return r;
+
+        return parse_cpu_set(cpulist, ret);
+}
+
 int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret) {
         _cleanup_(cpu_set_done) CPUSet s = {};
         int r;
@@ -97,20 +113,11 @@ int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret) {
         assert(ret);
 
         for (size_t i = 0; i < policy->nodes.allocated * 8; i++) {
-                _cleanup_free_ char *l = NULL;
-                char p[STRLEN("/sys/devices/system/node/node//cpulist") + DECIMAL_STR_MAX(size_t) + 1];
-
                 if (!CPU_ISSET_S(i, policy->nodes.allocated, policy->nodes.set))
                         continue;
 
-                xsprintf(p, "/sys/devices/system/node/node%zu/cpulist", i);
-
-                r = read_one_line_file(p, &l);
-                if (r < 0)
-                        return r;
-
                 _cleanup_(cpu_set_done) CPUSet part = {};
-                r = parse_cpu_set(l, &part);
+                r = numa_node_get_cpus(i, &part);
                 if (r < 0)
                         return r;
 
index 6fec7c587baa8de20f5979ef955321e39e16f817..01079351b07ffcad2aaa77326d4e34e8b56f1669 100644 (file)
@@ -29,6 +29,7 @@ static inline void numa_policy_reset(NUMAPolicy *p) {
 }
 
 int apply_numa_policy(const NUMAPolicy *policy);
+int numa_node_get_cpus(size_t node, CPUSet *ret);
 int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret);
 
 int numa_get_node_from_cpu(unsigned cpu, unsigned *ret);
index 90eebe120d1551d94922765baf4e8923e7feb554..a005cadd49a61b44a03a0ea1d6568be7b1a9a324 100644 (file)
@@ -135,6 +135,7 @@ Link.ReceivePacketSteeringCPUMask,         config_parse_rps_cpu_mask,
 /* IRQ affinity settings */
 Link.IRQAffinityPolicy,                    config_parse_irq_affinity_policy,      0,                             offsetof(LinkConfig, irq_affinity_policy)
 Link.IRQAffinity,                          config_parse_cpu_set,                  0,                             offsetof(LinkConfig, irq_affinity_cpus)
+Link.IRQAffinityNUMA,                      config_parse_irq_affinity_numa,        0,                             offsetof(LinkConfig, irq_affinity_numa)
 /* SR-IOV settings */
 Link.SR-IOVVirtualFunctions,               config_parse_sr_iov_num_vfs,           0,                             offsetof(LinkConfig, sr_iov_num_vfs)
 SR-IOV.VirtualFunction,                    config_parse_sr_iov_uint32,            0,                             offsetof(LinkConfig, sr_iov_by_section)
index 5fd59f9453a6664f753c7b7f3015d94b4d5c7dfd..306686845e916beb4f5430307ee4393ceff9f37a 100644 (file)
@@ -277,6 +277,7 @@ int link_load_one(LinkConfigContext *ctx, const char *filename) {
                 .eee_tx_lpi_enabled = -1,
                 .eee_tx_lpi_timer_usec = USEC_INFINITY,
                 .irq_affinity_policy = _IRQ_AFFINITY_POLICY_INVALID,
+                .irq_affinity_numa = IRQ_AFFINITY_NUMA_UNSET,
         };
 
         FOREACH_ELEMENT(feature, config->features)
@@ -929,6 +930,28 @@ static int link_apply_sr_iov_config(Link *link) {
         return 0;
 }
 
+/* Get the local NUMA node for a network device from sysfs.
+ * Fails if device/numa_node cannot be read, or with -ENOENT if it shows -1 (no NUMA). */
+static int link_get_device_numa_node(Link *link, unsigned *ret) {
+        int r, node;
+
+        assert(link);
+        assert(link->event);
+        assert(link->event->dev);
+        assert(ret);
+
+        r = device_get_sysattr_int(link->event->dev, "device/numa_node", &node);
+        if (r < 0)
+                return r;
+
+        /* -1 means no NUMA node (non-NUMA system or device not associated with a node) */
+        if (node < 0)
+                return -ENOENT;
+
+        *ret = (unsigned) node;
+        return 0;
+}
+
 /* CPU topology information for IRQ affinity spread algorithm. */
 typedef struct CPUTopology {
         unsigned cpu;
@@ -1542,6 +1565,7 @@ static int link_apply_irq_affinity_single(Link *link, const CPUSet *allowed_cpus
 static int link_apply_irq_affinity(Link *link) {
         _cleanup_(cpu_set_done) CPUSet effective_cpus = {};
         const char *syspath;
+        unsigned numa_node = IRQ_AFFINITY_NUMA_UNSET;
         int r;
 
         assert(link);
@@ -1560,11 +1584,82 @@ static int link_apply_irq_affinity(Link *link) {
         if (r < 0)
                 return log_link_warning_errno(link, r, "Failed to get syspath: %m");
 
+        /* Compute effective CPU set from IRQAffinity= and IRQAffinityNUMA= */
+        if (link->config->irq_affinity_numa != IRQ_AFFINITY_NUMA_UNSET) {
+                _cleanup_(cpu_set_done) CPUSet numa_cpus = {};
+
+                /* Resolve "local" to the actual NUMA node */
+                if (link->config->irq_affinity_numa == IRQ_AFFINITY_NUMA_LOCAL) {
+                        r = link_get_device_numa_node(link, &numa_node);
+                        if (r < 0) {
+                                log_link_warning_errno(
+                                                link, r,
+                                                "Failed to determine local NUMA node for device, skipping IRQ affinity configuration: %m");
+                                return 0;
+                        }
+                        log_link_debug(link, "Device is on NUMA node %u.", numa_node);
+                } else
+                        numa_node = link->config->irq_affinity_numa;
+
+                /* Get CPUs for the NUMA node */
+                r = numa_node_get_cpus(numa_node, &numa_cpus);
+                if (r < 0) {
+                        log_link_warning_errno(
+                                        link, r,
+                                        "Failed to get CPUs for NUMA node %u, skipping IRQ affinity configuration: %m",
+                                        numa_node);
+                        return 0;
+                }
+
+                /* If IRQAffinity= is also specified, compute intersection */
+                if (link->config->irq_affinity_cpus.set) {
+                        /* Compute intersection of IRQAffinity= and NUMA CPUs */
+                        size_t max_allocated = MAX(numa_cpus.allocated, link->config->irq_affinity_cpus.allocated);
+
+                        r = cpu_set_realloc(&effective_cpus, max_allocated * 8);
+                        if (r < 0)
+                                return log_oom();
+
+                        for (size_t i = 0; i < max_allocated * 8; i++) {
+                                bool in_numa = i < numa_cpus.allocated * 8 &&
+                                               CPU_ISSET_S(i, numa_cpus.allocated, numa_cpus.set);
+                                bool in_affinity = i < link->config->irq_affinity_cpus.allocated * 8 &&
+                                                   CPU_ISSET_S(i, link->config->irq_affinity_cpus.allocated, link->config->irq_affinity_cpus.set);
+
+                                if (in_numa && in_affinity) {
+                                        r = cpu_set_add(&effective_cpus, i);
+                                        if (r < 0)
+                                                return log_oom();
+                                }
+                        }
+
+                        /* Check if intersection is empty */
+                        if (!effective_cpus.set || CPU_COUNT_S(effective_cpus.allocated, effective_cpus.set) == 0) {
+                                log_link_warning(
+                                                link,
+                                                "IRQAffinity= and IRQAffinityNUMA= intersection is empty, skipping IRQ affinity configuration.");
+                                return 0;
+                        }
+
+                        log_link_debug(link, "Using intersection of IRQAffinity= and NUMA node %u CPUs.", numa_node);
+                } else {
+                        /* Only NUMA filtering, use NUMA CPUs directly */
+                        effective_cpus = TAKE_STRUCT(numa_cpus);
+                        log_link_debug(link, "Using CPUs from NUMA node %u.", numa_node);
+                }
+        } else if (link->config->irq_affinity_cpus.set) {
+                /* Only IRQAffinity= specified, copy it */
+                r = cpu_set_add_set(&effective_cpus, &link->config->irq_affinity_cpus);
+                if (r < 0)
+                        return log_oom();
+        }
+        /* else: no filtering, effective_cpus remains empty (meaning use all CPUs) */
+
         switch (link->config->irq_affinity_policy) {
         case IRQ_AFFINITY_POLICY_SINGLE:
-                return link_apply_irq_affinity_single(link, &link->config->irq_affinity_cpus);
+                return link_apply_irq_affinity_single(link, effective_cpus.set ? &effective_cpus : NULL);
         case IRQ_AFFINITY_POLICY_SPREAD:
-                return link_apply_irq_affinity_spread(link, &link->config->irq_affinity_cpus);
+                return link_apply_irq_affinity_spread(link, effective_cpus.set ? &effective_cpus : NULL);
         default:
                 assert_not_reached();
         }
@@ -2053,6 +2148,52 @@ DEFINE_CONFIG_PARSE_ENUMV(config_parse_name_policy, name_policy, NamePolicy,
 DEFINE_CONFIG_PARSE_ENUMV(config_parse_alternative_names_policy, alternative_names_policy, NamePolicy,
                           _NAMEPOLICY_INVALID);
 
+int config_parse_irq_affinity_numa(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        unsigned tmp, *numa = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *numa = IRQ_AFFINITY_NUMA_UNSET;
+                return 0;
+        }
+
+        if (streq(rvalue, "local")) {
+                *numa = IRQ_AFFINITY_NUMA_LOCAL;
+                return 0;
+        }
+
+        /* Parse as NUMA node number */
+        r = safe_atou(rvalue, &tmp);
+        if (r < 0)
+                return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
+
+        /* UINT_MAX and UINT_MAX-1 are used to flag "unset" and "local NUMA node" respectively. */
+        if (tmp >= IRQ_AFFINITY_NUMA_LOCAL) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid NUMA node number %u, ignoring assignment: %s", tmp, rvalue);
+                return 0;
+        }
+
+        *numa = tmp;
+
+        return 0;
+}
+
 static const char* const irq_affinity_policy_table[_IRQ_AFFINITY_POLICY_MAX] = {
         [IRQ_AFFINITY_POLICY_SINGLE] = "single",
         [IRQ_AFFINITY_POLICY_SPREAD] = "spread",
index da28f569807d8f9fd9ab6d4d9e44ce6594bbb1a7..a2b0def4c337249c90926fb8893053cc0ab0fab5 100644 (file)
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 #pragma once
 
+#include <limits.h>
+
 #include "sd-device.h"
 
 #include "cpu-set-util.h"
@@ -29,6 +31,10 @@ typedef enum IRQAffinityPolicy {
         _IRQ_AFFINITY_POLICY_INVALID = -EINVAL,
 } IRQAffinityPolicy;
 
+/* Special values for IRQAffinityNUMA= */
+#define IRQ_AFFINITY_NUMA_UNSET   UINT_MAX
+#define IRQ_AFFINITY_NUMA_LOCAL   (IRQ_AFFINITY_NUMA_UNSET - 1)
+
 typedef struct Link {
         UdevEvent *event;
         LinkConfig *config;
@@ -123,6 +129,7 @@ struct LinkConfig {
         /* IRQ affinity */
         IRQAffinityPolicy irq_affinity_policy;
         CPUSet irq_affinity_cpus;
+        unsigned irq_affinity_numa;
 
         /* SR-IOV */
         uint32_t sr_iov_num_vfs;
@@ -163,3 +170,4 @@ CONFIG_PARSER_PROTOTYPE(config_parse_name_policy);
 CONFIG_PARSER_PROTOTYPE(config_parse_alternative_names_policy);
 CONFIG_PARSER_PROTOTYPE(config_parse_rps_cpu_mask);
 CONFIG_PARSER_PROTOTYPE(config_parse_irq_affinity_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_irq_affinity_numa);
index 7c3662243e2949d8b9590f21da5b8cd6226a60a0..863fc91a39b697f03cfe0cf1c091d1ebc68de8a1 100755 (executable)
@@ -189,6 +189,67 @@ EOF
         echo "Skipping IRQAffinity= spread test (need >=4 CPUs and >1 IRQ)"
     fi
 
+    # Test 1e: Test IRQAffinityNUMA= if NUMA is available
+    if [[ -d /sys/devices/system/node/node0 ]]; then
+        # Get CPUs on NUMA node 0
+        numa0_cpus=$(cat /sys/devices/system/node/node0/cpulist)
+        echo "NUMA node 0 has CPUs: $numa0_cpus"
+
+        cat >/run/systemd/network/00-test-irq-affinity.link <<EOF
+[Match]
+MACAddress=$mac
+
+[Link]
+IRQAffinityPolicy=spread
+IRQAffinityNUMA=0
+EOF
+
+        udevadm control --reload
+        udevadm trigger --action=add "/sys/class/net/$iface"
+        udevadm settle --timeout=30
+
+        # Print each IRQ's affinity next to the NUMA node 0 CPU list
+        # (informational only, nothing is asserted here)
+        for irq in $irqs; do
+            affinity_list=$(cat "/proc/irq/$irq/smp_affinity_list")
+            echo "IRQ $irq is on CPU(s): $affinity_list (NUMA 0 CPUs: $numa0_cpus)"
+        done
+        echo "IRQAffinityNUMA= configuration applied"
+    else
+        echo "Skipping IRQAffinityNUMA= test (no NUMA available)"
+    fi
+
+    # Test 1f: Test empty intersection case
+    # This should log a warning and skip affinity configuration
+    if [[ -d /sys/devices/system/node/node0 ]] && [[ -d /sys/devices/system/node/node1 ]]; then
+        # Get the first CPU listed for node 0 (assumed not to be on node 1)
+        first_numa0_cpu=$(cut -d',' -f1 /sys/devices/system/node/node0/cpulist | cut -d'-' -f1)
+
+        cat >/run/systemd/network/00-test-irq-affinity.link <<EOF
+[Match]
+MACAddress=$mac
+
+[Link]
+IRQAffinityPolicy=spread
+IRQAffinity=$first_numa0_cpu
+IRQAffinityNUMA=1
+EOF
+
+        udevadm control --reload
+        udevadm trigger --action=add "/sys/class/net/$iface"
+        udevadm settle --timeout=30
+
+        # The configuration should be applied but IRQ affinity skipped due to empty intersection
+        # Check journal for the warning message
+        if journalctl -u systemd-udevd --since="1 minute ago" | grep "intersection is empty" >/dev/null; then
+            echo "Empty intersection correctly detected and logged"
+        else
+            echo "Note: Empty intersection test - check journal for error message"
+        fi
+    else
+        echo "Skipping empty intersection test (need 2 NUMA nodes)"
+    fi
+
     # Cleanup
     rm -f /run/systemd/network/00-test-irq-affinity.link
     udevadm control --reload
@@ -287,15 +348,82 @@ assert_in "ID_NET_LINK_FILE=/run/systemd/network/10-test-irq-affinity-cpus.link"
 output=$(udevadm test-builtin --action add net_setup_link /sys/class/net/testirq3 2>&1)
 assert_in "ID_NET_LINK_FILE=/run/systemd/network/10-test-irq-affinity-cpus.link" "$output"
 
+# Test 6: IRQAffinityNUMA= config parsing
+cat >/run/systemd/network/10-test-irq-affinity-numa.link <<EOF
+[Match]
+Kind=dummy
+MACAddress=00:50:56:c0:00:24
+
+[Link]
+Name=testirq4
+IRQAffinityPolicy=spread
+IRQAffinityNUMA=local
+EOF
+
+udevadm control --reload
+
+ip link add address 00:50:56:c0:00:24 type dummy
+udevadm wait --settle --timeout=30 /sys/class/net/testirq4
+
+output=$(udevadm info --query property /sys/class/net/testirq4)
+assert_in "ID_NET_LINK_FILE=/run/systemd/network/10-test-irq-affinity-numa.link" "$output"
+
+# Test 7: IRQAffinityNUMA= with explicit node number
+cat >/run/systemd/network/10-test-irq-affinity-numa-explicit.link <<EOF
+[Match]
+Kind=dummy
+MACAddress=00:50:56:c0:00:25
+
+[Link]
+Name=testirq5
+IRQAffinityPolicy=single
+IRQAffinityNUMA=0
+EOF
+
+udevadm control --reload
+
+ip link add address 00:50:56:c0:00:25 type dummy
+udevadm wait --settle --timeout=30 /sys/class/net/testirq5
+
+output=$(udevadm info --query property /sys/class/net/testirq5)
+assert_in "ID_NET_LINK_FILE=/run/systemd/network/10-test-irq-affinity-numa-explicit.link" "$output"
+
+# Test 8: Combined IRQAffinity= and IRQAffinityNUMA=
+cat >/run/systemd/network/10-test-irq-affinity-combined.link <<EOF
+[Match]
+Kind=dummy
+MACAddress=00:50:56:c0:00:26
+
+[Link]
+Name=testirq6
+IRQAffinityPolicy=spread
+IRQAffinity=0-7
+IRQAffinityNUMA=0
+EOF
+
+udevadm control --reload
+
+ip link add address 00:50:56:c0:00:26 type dummy
+udevadm wait --settle --timeout=30 /sys/class/net/testirq6
+
+output=$(udevadm info --query property /sys/class/net/testirq6)
+assert_in "ID_NET_LINK_FILE=/run/systemd/network/10-test-irq-affinity-combined.link" "$output"
+
 # Cleanup
 ip link del dev testirq0
 ip link del dev testirq1
 ip link del dev testirq2
 ip link del dev testirq3
+ip link del dev testirq4
+ip link del dev testirq5
+ip link del dev testirq6
 
 rm -f /run/systemd/network/10-test-irq.link
 rm -f /run/systemd/network/10-test-irq-invalid.link
 rm -f /run/systemd/network/10-test-irq-empty.link
 rm -f /run/systemd/network/10-test-irq-affinity-cpus.link
+rm -f /run/systemd/network/10-test-irq-affinity-numa.link
+rm -f /run/systemd/network/10-test-irq-affinity-numa-explicit.link
+rm -f /run/systemd/network/10-test-irq-affinity-combined.link
 
 exit 0