From: dongshengyuan <545258830@qq.com> Date: Mon, 22 Jun 2026 02:55:13 +0000 (+0800) Subject: numa: add support for preferred-many and weighted-interleave policies X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c99674a408ffe37923f281f6ac67c2839ce2883f;p=thirdparty%2Fsystemd.git numa: add support for preferred-many and weighted-interleave policies Add support for two newer NUMA memory policies: - MPOL_PREFERRED_MANY (Linux 5.15): like MPOL_PREFERRED but accepts a set of nodes instead of a single node, falling back to all nodes if preferred nodes cannot satisfy the allocation. - MPOL_WEIGHTED_INTERLEAVE (Linux 6.9): like MPOL_INTERLEAVE but distributes pages across nodes proportionally to per-node weights configured via /sys/kernel/mm/mempolicy/weighted_interleave/. On kernels that do not support the requested policy, set_mempolicy() returns EINVAL. We convert EINVAL to EOPNOTSUPP only for the two new policies (MPOL_PREFERRED_MANY, MPOL_WEIGHTED_INTERLEAVE), so that a bad NUMAMask= for already-supported policies still fails the service rather than being silently ignored. The NUMA subsystem being absent (ENOSYS) continues to be handled silently at debug level, as before. Varlink serialization uses json_underscorify() on an owned copy of the policy name string to convert hyphenated names to the underscore form declared in the IDL enum, avoiding mutation of the read-only static string table. Signed-off-by: dongshengyuan --- diff --git a/README b/README index 9a551f76fa1..5553915be00 100644 --- a/README +++ b/README @@ -67,13 +67,14 @@ REQUIREMENTS: of systemd. Taint flag 'old-kernel' will be set. systemd will most likely still function, but upstream support and testing are limited. 
- Linux kernel ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option + Linux kernel ≥ 5.15 for MPOL_PREFERRED_MANY NUMA policy + ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option ≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD, and MOVE_MOUNT_BENEATH ≥ 6.6 for quota support on tmpfs ≥ 6.7 for cgroup2fs memory_hugetlb_accounting option ≥ 6.8 for STATX_MNT_ID_UNIQUE - ≥ 6.9 for pidfs + ≥ 6.9 for pidfs and MPOL_WEIGHTED_INTERLEAVE NUMA policy ≥ 6.10 for fcntl(F_DUPFD_QUERY), unprivileged linkat(AT_EMPTY_PATH), and block device 'partscan' sysfs attribute ≥ 6.12 for AT_HANDLE_MNT_ID_UNIQUE diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 6524ba631a7..0b43bc69603 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1408,12 +1408,16 @@ CapabilityBoundingSet=~CAP_B CAP_C NUMAPolicy= Controls the NUMA memory policy of the executed processes. Takes a policy type, one of: - <option>default</option>, <option>preferred</option>, <option>bind</option>, <option>interleave</option> and - <option>local</option>. A list of NUMA nodes that should be associated with the policy must be specified - in NUMAMask=. For more details on each policy please see, + <option>default</option>, <option>preferred</option>, <option>bind</option>, <option>interleave</option>, + <option>local</option>, <option>preferred-many</option> (requires Linux 5.15 or newer) and + <option>weighted-interleave</option> (requires Linux 6.9 or newer, weights are configured via + /sys/kernel/mm/mempolicy/weighted_interleave/). A list of NUMA nodes that should be + associated with the policy must be specified in NUMAMask=. For more details on each + policy please see, set_mempolicy2. For overall overview of NUMA support in Linux see, numa7. + If the kernel does not support the requested policy, a warning is logged and the setting is ignored. 
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index c45c807198d..2b0ced7b7d5 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -5646,8 +5646,10 @@ int exec_invoke( if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) { r = apply_numa_policy(&context->numa_policy); - if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + if (r == -ENOSYS) log_debug_errno(r, "NUMA support not available, ignoring."); + else if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + log_warning_errno(r, "NUMA policy not supported by kernel, ignoring."); else if (r < 0) { *exit_status = EXIT_NUMA_POLICY; return log_error_errno(r, "Failed to set NUMA memory policy: %m"); diff --git a/src/core/main.c b/src/core/main.c index 6c83953f30e..810da526d78 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -2029,8 +2029,10 @@ static void update_numa_policy(bool skip_setup) { } r = apply_numa_policy(&arg_numa_policy); - if (r == -EOPNOTSUPP) + if (r == -ENOSYS) log_debug_errno(r, "NUMA support not available, ignoring."); + else if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + log_warning_errno(r, "NUMA policy not supported by kernel, ignoring."); else if (r < 0) log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m"); } diff --git a/src/core/varlink-execute.c b/src/core/varlink-execute.c index 7056439a220..a80fa969274 100644 --- a/src/core/varlink-execute.c +++ b/src/core/varlink-execute.c @@ -315,7 +315,11 @@ static int numa_policy_build_json(sd_json_variant **ret, const char *name, void return 0; } - return sd_json_variant_new_string(ret, mpol_to_string(t)); + _cleanup_free_ char *s = strdup(mpol_to_string(t)); + if (!s) + return -ENOMEM; + + return sd_json_variant_new_string(ret, json_underscorify(s)); } static int numa_mask_build_json(sd_json_variant **ret, const char *name, void *userdata) { diff --git a/src/shared/numa-util.c b/src/shared/numa-util.c index 34ddc0e547f..af8770106e3 100644 --- a/src/shared/numa-util.c +++ b/src/shared/numa-util.c @@ -73,7 +73,11 @@ int 
apply_numa_policy(const NUMAPolicy *policy) { assert(policy); if (get_mempolicy(NULL, NULL, 0, NULL, 0) < 0 && errno == ENOSYS) - return -EOPNOTSUPP; + /* NUMA syscall interface not available (kernel compiled without NUMA support). + * Return -ENOSYS so callers can distinguish this from -EOPNOTSUPP, which we + * return below when the syscall interface exists but the requested policy is + * not supported by this kernel version. */ + return -ENOSYS; if (!numa_policy_is_valid(policy)) return -EINVAL; @@ -83,8 +87,14 @@ int apply_numa_policy(const NUMAPolicy *policy) { return r; r = set_mempolicy(numa_policy_get_type(policy), nodes, maxnode); - if (r < 0) + if (r < 0) { + // FIXME: This compatibility code path shall be removed once kernel 6.9 + // becomes the new minimal baseline (MPOL_WEIGHTED_INTERLEAVE). + if (errno == EINVAL && IN_SET(numa_policy_get_type(policy), + MPOL_PREFERRED_MANY, MPOL_WEIGHTED_INTERLEAVE)) + return -EOPNOTSUPP; return -errno; + } return 0; } @@ -241,11 +251,13 @@ int numa_mask_add_all(CPUSet *mask) { } static const char* const mpol_table[] = { - [MPOL_DEFAULT] = "default", - [MPOL_PREFERRED] = "preferred", - [MPOL_BIND] = "bind", - [MPOL_INTERLEAVE] = "interleave", - [MPOL_LOCAL] = "local", + [MPOL_DEFAULT] = "default", + [MPOL_PREFERRED] = "preferred", + [MPOL_BIND] = "bind", + [MPOL_INTERLEAVE] = "interleave", + [MPOL_LOCAL] = "local", + [MPOL_PREFERRED_MANY] = "preferred-many", + [MPOL_WEIGHTED_INTERLEAVE] = "weighted-interleave", }; DEFINE_STRING_TABLE_LOOKUP(mpol, int); diff --git a/src/shared/numa-util.h b/src/shared/numa-util.h index 01079351b07..8b9a81852f7 100644 --- a/src/shared/numa-util.h +++ b/src/shared/numa-util.h @@ -7,7 +7,7 @@ #include "shared-forward.h" static inline bool mpol_is_valid(int t) { - return t >= MPOL_DEFAULT && t <= MPOL_LOCAL; + return t >= MPOL_DEFAULT && t <= MPOL_WEIGHTED_INTERLEAVE; } typedef struct NUMAPolicy { diff --git a/src/shared/varlink-io.systemd.Unit.c b/src/shared/varlink-io.systemd.Unit.c index 
3fc2d6fc794..5a8e7c98331 100644 --- a/src/shared/varlink-io.systemd.Unit.c +++ b/src/shared/varlink-io.systemd.Unit.c @@ -173,7 +173,9 @@ SD_VARLINK_DEFINE_ENUM_TYPE( SD_VARLINK_DEFINE_ENUM_VALUE(preferred), SD_VARLINK_DEFINE_ENUM_VALUE(bind), SD_VARLINK_DEFINE_ENUM_VALUE(interleave), - SD_VARLINK_DEFINE_ENUM_VALUE(local)); + SD_VARLINK_DEFINE_ENUM_VALUE(local), + SD_VARLINK_DEFINE_ENUM_VALUE(preferred_many), + SD_VARLINK_DEFINE_ENUM_VALUE(weighted_interleave)); SD_VARLINK_DEFINE_ENUM_TYPE( MountPropagationFlag, diff --git a/test/units/TEST-36-NUMAPOLICY.sh b/test/units/TEST-36-NUMAPOLICY.sh index 1e2d7ac8ac1..3eb9b2410fb 100755 --- a/test/units/TEST-36-NUMAPOLICY.sh +++ b/test/units/TEST-36-NUMAPOLICY.sh @@ -227,6 +227,26 @@ else pid1ReloadWithStrace grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" "$straceLog" + echo "PID1 NUMAPolicy support - Preferred-many policy w/o mask" + writePID1NUMAPolicy "preferred-many" + pid1ReloadWithJournal + grep "Failed to set NUMA memory policy, ignoring: Invalid argument" "$journalLog" + + echo "PID1 NUMAPolicy support - Preferred-many policy w/ mask" + writePID1NUMAPolicy "preferred-many" "0" + pid1ReloadWithStrace + grep -E "set_mempolicy\((MPOL_PREFERRED_MANY|0x5 [^,]*), \[0x0*1\]" "$straceLog" + + echo "PID1 NUMAPolicy support - Weighted-interleave policy w/o mask" + writePID1NUMAPolicy "weighted-interleave" + pid1ReloadWithJournal + grep "Failed to set NUMA memory policy, ignoring: Invalid argument" "$journalLog" + + echo "PID1 NUMAPolicy support - Weighted-interleave policy w/ mask" + writePID1NUMAPolicy "weighted-interleave" "0" + pid1ReloadWithStrace + grep -E "set_mempolicy\((MPOL_WEIGHTED_INTERLEAVE|0x6 [^,]*), \[0x0*1\]" "$straceLog" + echo "Unit file NUMAPolicy support - Default policy w/o mask" writeTestUnitNUMAPolicy "default" pid1StartUnitWithStrace "$testUnit" @@ -297,6 +317,34 @@ else # Mask must be ignored grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" "$straceLog" + echo "Unit file NUMAPolicy 
support - Preferred-many policy w/o mask" + writeTestUnitNUMAPolicy "preferred-many" + pid1StartUnitWithStrace "$testUnit" + pid1StopUnit "$testUnit" + [[ $(systemctl show "$testUnit" -P ExecMainStatus) == "242" ]] + + echo "Unit file NUMAPolicy support - Preferred-many policy w/ mask" + writeTestUnitNUMAPolicy "preferred-many" "0" + pid1StartUnitWithStrace "$testUnit" + systemctlCheckNUMAProperties "$testUnit" "preferred-many" "0" + varlinkctl call /run/systemd/io.systemd.Manager io.systemd.Unit.List "{\"name\":\"$testUnit\"}" | jq -e '.context.Exec.NUMAPolicy == "preferred_many"' + pid1StopUnit "$testUnit" + grep -E "set_mempolicy\((MPOL_PREFERRED_MANY|0x5 [^,]*), \[0x0*1\]" "$straceLog" + + echo "Unit file NUMAPolicy support - Weighted-interleave policy w/o mask" + writeTestUnitNUMAPolicy "weighted-interleave" + pid1StartUnitWithStrace "$testUnit" + pid1StopUnit "$testUnit" + [[ $(systemctl show "$testUnit" -P ExecMainStatus) == "242" ]] + + echo "Unit file NUMAPolicy support - Weighted-interleave policy w/ mask" + writeTestUnitNUMAPolicy "weighted-interleave" "0" + pid1StartUnitWithStrace "$testUnit" + systemctlCheckNUMAProperties "$testUnit" "weighted-interleave" "0" + varlinkctl call /run/systemd/io.systemd.Manager io.systemd.Unit.List "{\"name\":\"$testUnit\"}" | jq -e '.context.Exec.NUMAPolicy == "weighted_interleave"' + pid1StopUnit "$testUnit" + grep -E "set_mempolicy\((MPOL_WEIGHTED_INTERLEAVE|0x6 [^,]*), \[0x0*1\]" "$straceLog" + echo "Unit file CPUAffinity=NUMA support" writeTestUnitNUMAPolicy "bind" "0" echo "CPUAffinity=numa" >>"$testUnitNUMAConf" @@ -343,6 +391,14 @@ else systemctlCheckNUMAProperties "$runUnit" "local" "" systemctl cat "$runUnit" | grep 'CPUAffinity=numa' >/dev/null pid1StopUnit "$runUnit" + + systemd-run -p NUMAPolicy=preferred-many -p NUMAMask=0 --unit "$runUnit" sleep 1000 + systemctlCheckNUMAProperties "$runUnit" "preferred-many" "0" + pid1StopUnit "$runUnit" + + systemd-run -p NUMAPolicy=weighted-interleave -p NUMAMask=0 --unit 
"$runUnit" sleep 1000 + systemctlCheckNUMAProperties "$runUnit" "weighted-interleave" "0" + pid1StopUnit "$runUnit" fi # Cleanup