From: Quentin Deslandes
Date: Mon, 16 Feb 2026 19:43:37 +0000 (+0100)
Subject: udev/net: add IRQAffinityNUMA= option for NUMA-aware filtering
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7dfbbfb90dd162ce14d94a4a18a5856235c3fac2;p=thirdparty%2Fsystemd.git

udev/net: add IRQAffinityNUMA= option for NUMA-aware filtering

Add support for filtering IRQ affinity to CPUs on a specific NUMA node
via the new IRQAffinityNUMA= option in .link files.

The option accepts:
- "local": use the NUMA node local to the NIC's PCIe slot
- Explicit node number (0, 1, 2, ...): use CPUs on the specified node

When both IRQAffinity= and IRQAffinityNUMA= are specified, their
intersection is used. If the intersection is empty, a warning is logged
and IRQ affinity configuration is skipped.

When "local" is specified but the device's NUMA node cannot be determined
(numa_node shows -1), a warning is logged and IRQ affinity configuration
is skipped.

When the selected NUMA node has no CPUs at all (e.g. a memory-only node),
a warning is logged and IRQ affinity configuration is skipped, instead of
falling back to "all CPUs".
---

diff --git a/src/shared/numa-util.c b/src/shared/numa-util.c
index 9097ccbc313..34ddc0e547f 100644
--- a/src/shared/numa-util.c
+++ b/src/shared/numa-util.c
@@ -89,6 +89,24 @@ int apply_numa_policy(const NUMAPolicy *policy) {
         return 0;
 }
 
+/* Reads /sys/devices/system/node/node<node>/cpulist and parses it into 'ret'. Returns a negative
+ * error if the file cannot be read, otherwise the result of parse_cpu_set(). */
+int numa_node_get_cpus(size_t node, CPUSet *ret) {
+        char p[STRLEN("/sys/devices/system/node/node//cpulist") + DECIMAL_STR_MAX(size_t) + 1];
+        _cleanup_free_ char *cpulist = NULL;
+        int r;
+
+        assert(ret);
+
+        xsprintf(p, "/sys/devices/system/node/node%zu/cpulist", node);
+
+        r = read_virtual_file(p, SIZE_MAX, &cpulist, /* ret_size= */ NULL);
+        if (r < 0)
+                return r;
+
+        return parse_cpu_set(cpulist, ret);
+}
+
 int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret) {
         _cleanup_(cpu_set_done) CPUSet s = {};
         int r;
@@ -97,20 +115,11 @@ int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret) {
         assert(ret);
 
         for (size_t i = 0; i < policy->nodes.allocated * 8; i++) {
-                _cleanup_free_ char *l = NULL;
-                char p[STRLEN("/sys/devices/system/node/node//cpulist") + DECIMAL_STR_MAX(size_t) + 1];
-
                 if (!CPU_ISSET_S(i, policy->nodes.allocated, policy->nodes.set))
                         continue;
 
-                xsprintf(p, "/sys/devices/system/node/node%zu/cpulist", i);
-
-                r = read_one_line_file(p, &l);
-                if (r < 0)
-                        return r;
-
                 _cleanup_(cpu_set_done) CPUSet part = {};
-                r = parse_cpu_set(l, &part);
+                r = numa_node_get_cpus(i, &part);
                 if (r < 0)
                         return r;
 
diff --git a/src/shared/numa-util.h b/src/shared/numa-util.h
index 6fec7c587ba..01079351b07 100644
--- a/src/shared/numa-util.h
+++ b/src/shared/numa-util.h
@@ -29,6 +29,7 @@ static inline void numa_policy_reset(NUMAPolicy *p) {
 }
 
 int apply_numa_policy(const NUMAPolicy *policy);
+int numa_node_get_cpus(size_t node, CPUSet *ret);
 int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret);
 
 int numa_get_node_from_cpu(unsigned cpu, unsigned *ret);
diff --git a/src/udev/net/link-config-gperf.gperf b/src/udev/net/link-config-gperf.gperf
index 90eebe120d1..a005cadd49a 100644
--- a/src/udev/net/link-config-gperf.gperf
+++ b/src/udev/net/link-config-gperf.gperf
@@ -135,6 +135,7 @@ Link.ReceivePacketSteeringCPUMask, config_parse_rps_cpu_mask,
 /* IRQ affinity settings */
 Link.IRQAffinityPolicy, config_parse_irq_affinity_policy, 0, offsetof(LinkConfig, irq_affinity_policy)
 Link.IRQAffinity, config_parse_cpu_set, 0, offsetof(LinkConfig, irq_affinity_cpus)
+Link.IRQAffinityNUMA, config_parse_irq_affinity_numa, 0, offsetof(LinkConfig, irq_affinity_numa)
 /* SR-IOV settings */
 Link.SR-IOVVirtualFunctions, config_parse_sr_iov_num_vfs, 0, offsetof(LinkConfig, sr_iov_num_vfs)
 SR-IOV.VirtualFunction, config_parse_sr_iov_uint32, 0, offsetof(LinkConfig, sr_iov_by_section)
diff --git a/src/udev/net/link-config.c b/src/udev/net/link-config.c
index 5fd59f9453a..306686845e9 100644
--- a/src/udev/net/link-config.c
+++ b/src/udev/net/link-config.c
@@ -277,6 +277,7 @@ int link_load_one(LinkConfigContext *ctx, const char *filename) {
                 .eee_tx_lpi_enabled = -1,
                 .eee_tx_lpi_timer_usec = USEC_INFINITY,
                 .irq_affinity_policy = _IRQ_AFFINITY_POLICY_INVALID,
+                .irq_affinity_numa = IRQ_AFFINITY_NUMA_UNSET,
         };
 
         FOREACH_ELEMENT(feature, config->features)
@@ -929,6 +930,28 @@ static int link_apply_sr_iov_config(Link *link) {
         return 0;
 }
 
+/* Get the local NUMA node for a network device from sysfs.
+ * Returns -ENOENT if numa_node file doesn't exist or shows -1 (no NUMA). */
+static int link_get_device_numa_node(Link *link, unsigned *ret) {
+        int r, node;
+
+        assert(link);
+        assert(link->event);
+        assert(link->event->dev);
+        assert(ret);
+
+        r = device_get_sysattr_int(link->event->dev, "device/numa_node", &node);
+        if (r < 0)
+                return r;
+
+        /* -1 means no NUMA node (non-NUMA system or device not associated with a node) */
+        if (node < 0)
+                return -ENOENT;
+
+        *ret = (unsigned) node;
+        return 0;
+}
+
 /* CPU topology information for IRQ affinity spread algorithm. */
 typedef struct CPUTopology {
         unsigned cpu;
@@ -1542,6 +1565,7 @@ static int link_apply_irq_affinity_single(Link *link, const CPUSet *allowed_cpus
 static int link_apply_irq_affinity(Link *link) {
         _cleanup_(cpu_set_done) CPUSet effective_cpus = {};
         const char *syspath;
+        unsigned numa_node = IRQ_AFFINITY_NUMA_UNSET;
         int r;
 
         assert(link);
@@ -1560,11 +1584,93 @@ static int link_apply_irq_affinity(Link *link) {
         if (r < 0)
                 return log_link_warning_errno(link, r, "Failed to get syspath: %m");
 
+        /* Compute effective CPU set from IRQAffinity= and IRQAffinityNUMA= */
+        if (link->config->irq_affinity_numa != IRQ_AFFINITY_NUMA_UNSET) {
+                _cleanup_(cpu_set_done) CPUSet numa_cpus = {};
+
+                /* Resolve "local" to the actual NUMA node */
+                if (link->config->irq_affinity_numa == IRQ_AFFINITY_NUMA_LOCAL) {
+                        r = link_get_device_numa_node(link, &numa_node);
+                        if (r < 0) {
+                                log_link_warning_errno(
+                                                link, r,
+                                                "Failed to determine local NUMA node for device, skipping IRQ affinity configuration: %m");
+                                return 0;
+                        }
+                        log_link_debug(link, "Device is on NUMA node %u.", numa_node);
+                } else
+                        numa_node = link->config->irq_affinity_numa;
+
+                /* Get CPUs for the NUMA node */
+                r = numa_node_get_cpus(numa_node, &numa_cpus);
+                if (r < 0) {
+                        log_link_warning_errno(
+                                        link, r,
+                                        "Failed to get CPUs for NUMA node %u, skipping IRQ affinity configuration: %m",
+                                        numa_node);
+                        return 0;
+                }
+
+                /* If the node's cpulist yields no CPUs (e.g. a memory-only node), stop here. An empty
+                 * effective CPU set is handed on as "no filter" below, which would silently allow
+                 * all CPUs instead of honouring IRQAffinityNUMA=. */
+                if (!numa_cpus.set || CPU_COUNT_S(numa_cpus.allocated, numa_cpus.set) == 0) {
+                        log_link_warning(
+                                        link,
+                                        "NUMA node %u has no CPUs, skipping IRQ affinity configuration.",
+                                        numa_node);
+                        return 0;
+                }
+
+                /* If IRQAffinity= is also specified, compute intersection */
+                if (link->config->irq_affinity_cpus.set) {
+                        /* Compute intersection of IRQAffinity= and NUMA CPUs */
+                        size_t max_allocated = MAX(numa_cpus.allocated, link->config->irq_affinity_cpus.allocated);
+
+                        r = cpu_set_realloc(&effective_cpus, max_allocated * 8);
+                        if (r < 0)
+                                return log_oom();
+
+                        for (size_t i = 0; i < max_allocated * 8; i++) {
+                                bool in_numa = i < numa_cpus.allocated * 8 &&
+                                        CPU_ISSET_S(i, numa_cpus.allocated, numa_cpus.set);
+                                bool in_affinity = i < link->config->irq_affinity_cpus.allocated * 8 &&
+                                        CPU_ISSET_S(i, link->config->irq_affinity_cpus.allocated, link->config->irq_affinity_cpus.set);
+
+                                if (in_numa && in_affinity) {
+                                        r = cpu_set_add(&effective_cpus, i);
+                                        if (r < 0)
+                                                return log_oom();
+                                }
+                        }
+
+                        /* Check if intersection is empty */
+                        if (!effective_cpus.set || CPU_COUNT_S(effective_cpus.allocated, effective_cpus.set) == 0) {
+                                log_link_warning(
+                                                link,
+                                                "IRQAffinity= and IRQAffinityNUMA= intersection is empty, skipping IRQ affinity configuration.");
+                                return 0;
+                        }
+
+                        log_link_debug(link, "Using intersection of IRQAffinity= and NUMA node %u CPUs.", numa_node);
+                } else {
+                        /* Only NUMA filtering, use NUMA CPUs directly */
+                        effective_cpus = TAKE_STRUCT(numa_cpus);
+                        log_link_debug(link, "Using CPUs from NUMA node %u.", numa_node);
+                }
+        } else if (link->config->irq_affinity_cpus.set) {
+                /* Only IRQAffinity= specified, copy it */
+                r = cpu_set_add_set(&effective_cpus, &link->config->irq_affinity_cpus);
+                if (r < 0)
+                        return log_oom();
+        }
+        /* else: no filtering, effective_cpus remains empty (meaning use all CPUs) */
+
         switch (link->config->irq_affinity_policy) {
         case IRQ_AFFINITY_POLICY_SINGLE:
-                return link_apply_irq_affinity_single(link, &link->config->irq_affinity_cpus);
+                return link_apply_irq_affinity_single(link, effective_cpus.set ? &effective_cpus : NULL);
         case IRQ_AFFINITY_POLICY_SPREAD:
-                return link_apply_irq_affinity_spread(link, &link->config->irq_affinity_cpus);
+                return link_apply_irq_affinity_spread(link, effective_cpus.set ? &effective_cpus : NULL);
         default:
                 assert_not_reached();
         }
@@ -2053,6 +2159,55 @@ DEFINE_CONFIG_PARSE_ENUMV(config_parse_name_policy, name_policy, NamePolicy,
 DEFINE_CONFIG_PARSE_ENUMV(config_parse_alternative_names_policy, alternative_names_policy, NamePolicy,
                           _NAMEPOLICY_INVALID);
 
+/* Parses IRQAffinityNUMA=. An empty value resets the setting to "unset", "local" selects the
+ * device's own NUMA node, and anything else must be a node number below the two reserved
+ * sentinel values. Out-of-range numbers are logged and the assignment is ignored. */
+int config_parse_irq_affinity_numa(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        unsigned tmp, *numa = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *numa = IRQ_AFFINITY_NUMA_UNSET;
+                return 0;
+        }
+
+        if (streq(rvalue, "local")) {
+                *numa = IRQ_AFFINITY_NUMA_LOCAL;
+                return 0;
+        }
+
+        /* Parse as NUMA node number */
+        r = safe_atou(rvalue, &tmp);
+        if (r < 0)
+                return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
+
+        /* UINT_MAX and UINT_MAX-1 are used to flag "unset" and "local NUMA node" respectively. */
+        if (tmp >= IRQ_AFFINITY_NUMA_LOCAL) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid NUMA node number %u, ignoring assignment: %s", tmp, rvalue);
+                return 0;
+        }
+
+        *numa = tmp;
+
+        return 0;
+}
+
 static const char* const irq_affinity_policy_table[_IRQ_AFFINITY_POLICY_MAX] = {
         [IRQ_AFFINITY_POLICY_SINGLE] = "single",
         [IRQ_AFFINITY_POLICY_SPREAD] = "spread",
diff --git a/src/udev/net/link-config.h b/src/udev/net/link-config.h
index da28f569807..a2b0def4c33 100644
--- a/src/udev/net/link-config.h
+++ b/src/udev/net/link-config.h
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 #pragma once
 
+#include <limits.h>
+
 #include "sd-device.h"
 
 #include "cpu-set-util.h"
@@ -29,6 +31,10 @@ typedef enum IRQAffinityPolicy {
         _IRQ_AFFINITY_POLICY_INVALID = -EINVAL,
 } IRQAffinityPolicy;
 
+/* Special values for IRQAffinityNUMA= */
+#define IRQ_AFFINITY_NUMA_UNSET UINT_MAX
+#define IRQ_AFFINITY_NUMA_LOCAL (IRQ_AFFINITY_NUMA_UNSET - 1)
+
 typedef struct Link {
         UdevEvent *event;
         LinkConfig *config;
@@ -123,6 +129,7 @@ struct LinkConfig {
         /* IRQ affinity */
         IRQAffinityPolicy irq_affinity_policy;
         CPUSet irq_affinity_cpus;
+        unsigned irq_affinity_numa;
 
         /* SR-IOV */
         uint32_t sr_iov_num_vfs;
@@ -163,3 +170,4 @@ CONFIG_PARSER_PROTOTYPE(config_parse_name_policy);
 CONFIG_PARSER_PROTOTYPE(config_parse_alternative_names_policy);
 CONFIG_PARSER_PROTOTYPE(config_parse_rps_cpu_mask);
 CONFIG_PARSER_PROTOTYPE(config_parse_irq_affinity_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_irq_affinity_numa);
diff --git a/test/units/TEST-17-UDEV.irq-affinity.sh b/test/units/TEST-17-UDEV.irq-affinity.sh
index 7c3662243e2..863fc91a39b 100755
--- a/test/units/TEST-17-UDEV.irq-affinity.sh
+++ b/test/units/TEST-17-UDEV.irq-affinity.sh
@@ -189,6 +189,67 @@ EOF
         echo "Skipping IRQAffinity= spread test (need >=4 CPUs and >1 IRQ)"
     fi
 
+    # Test 1e: Test IRQAffinityNUMA= if NUMA is available
+    if [[ -d /sys/devices/system/node/node0 ]]; then
+        # Get CPUs on NUMA node 0
+        numa0_cpus=$(cat
/sys/devices/system/node/node0/cpulist) + echo "NUMA node 0 has CPUs: $numa0_cpus" + + cat >/run/systemd/network/00-test-irq-affinity.link </run/systemd/network/00-test-irq-affinity.link </dev/null; then + echo "Empty intersection correctly detected and logged" + else + echo "Note: Empty intersection test - check journal for error message" + fi + else + echo "Skipping empty intersection test (need 2 NUMA nodes)" + fi + # Cleanup rm -f /run/systemd/network/00-test-irq-affinity.link udevadm control --reload @@ -287,15 +348,82 @@ assert_in "ID_NET_LINK_FILE=/run/systemd/network/10-test-irq-affinity-cpus.link" output=$(udevadm test-builtin --action add net_setup_link /sys/class/net/testirq3 2>&1) assert_in "ID_NET_LINK_FILE=/run/systemd/network/10-test-irq-affinity-cpus.link" "$output" +# Test 6: IRQAffinityNUMA= config parsing +cat >/run/systemd/network/10-test-irq-affinity-numa.link </run/systemd/network/10-test-irq-affinity-numa-explicit.link </run/systemd/network/10-test-irq-affinity-combined.link <