git.ipfire.org Git - thirdparty/systemd.git/commitdiff
numa: add support for preferred-many and weighted-interleave policies
author: dongshengyuan <545258830@qq.com>
Mon, 22 Jun 2026 02:55:13 +0000 (10:55 +0800)
committer: Yu Watanabe <watanabe.yu+github@gmail.com>
Sun, 28 Jun 2026 04:40:07 +0000 (13:40 +0900)
Add support for two newer NUMA memory policies:

- MPOL_PREFERRED_MANY (Linux 5.15): like MPOL_PREFERRED but accepts
  a set of nodes instead of a single node, falling back to all nodes
  if preferred nodes cannot satisfy the allocation.

- MPOL_WEIGHTED_INTERLEAVE (Linux 6.9): like MPOL_INTERLEAVE but
  distributes pages across nodes proportionally to per-node weights
  configured via /sys/kernel/mm/mempolicy/weighted_interleave/.

On kernels that do not support the requested policy, set_mempolicy()
returns EINVAL. We convert EINVAL to EOPNOTSUPP only for the two new
policies (MPOL_PREFERRED_MANY, MPOL_WEIGHTED_INTERLEAVE), so that a
bad NUMAMask= for already-supported policies still fails the service
rather than being silently ignored.

The NUMA subsystem being absent (ENOSYS) continues to be handled
silently at debug level, as before.

Varlink serialization uses json_underscorify() on an owned copy of
the policy name string to convert hyphenated names to the underscore
form declared in the IDL enum, avoiding mutation of the read-only
static string table.

Signed-off-by: dongshengyuan <dongshengyuan@uniontech.com>
README
man/systemd.exec.xml
src/core/exec-invoke.c
src/core/main.c
src/core/varlink-execute.c
src/shared/numa-util.c
src/shared/numa-util.h
src/shared/varlink-io.systemd.Unit.c
test/units/TEST-36-NUMAPOLICY.sh

diff --git a/README b/README
index 9a551f76fa13777954c23469b61677b500ec9777..5553915be00216ad1e15211d0347e6e42d147b2b 100644 (file)
--- a/README
+++ b/README
@@ -67,13 +67,14 @@ REQUIREMENTS:
            of systemd. Taint flag 'old-kernel' will be set. systemd will most likely
            still function, but upstream support and testing are limited.
 
-        Linux kernel ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
+        Linux kernel ≥ 5.15 for MPOL_PREFERRED_MANY NUMA policy
+                     ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
                      ≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
                                and MOVE_MOUNT_BENEATH
                      ≥ 6.6 for quota support on tmpfs
                      ≥ 6.7 for cgroup2fs memory_hugetlb_accounting option
                      ≥ 6.8 for STATX_MNT_ID_UNIQUE
-                     ≥ 6.9 for pidfs
+                     ≥ 6.9 for pidfs and MPOL_WEIGHTED_INTERLEAVE NUMA policy
                      ≥ 6.10 for fcntl(F_DUPFD_QUERY), unprivileged linkat(AT_EMPTY_PATH),
                                 and block device 'partscan' sysfs attribute
                      ≥ 6.12 for AT_HANDLE_MNT_ID_UNIQUE
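diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 6524ba631a7cc5279ec223910dddc458afd0e1eb..0b43bc69603b89bef9e696609bdcd2466ddaf45c 100644 (file)
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml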
@@ -1408,12 +1408,16 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
         <term><varname>NUMAPolicy=</varname></term>
 
         <listitem><para>Controls the NUMA memory policy of the executed processes. Takes a policy type, one of:
-        <option>default</option>, <option>preferred</option>, <option>bind</option>, <option>interleave</option> and
-        <option>local</option>. A list of NUMA nodes that should be associated with the policy must be specified
-        in <varname>NUMAMask=</varname>. For more details on each policy please see,
+        <option>default</option>, <option>preferred</option>, <option>bind</option>, <option>interleave</option>,
+        <option>local</option>, <option>preferred-many</option> (requires Linux 5.15 or newer) and
+        <option>weighted-interleave</option> (requires Linux 6.9 or newer, weights are configured via
+        <filename>/sys/kernel/mm/mempolicy/weighted_interleave/</filename>). A list of NUMA nodes that should be
+        associated with the policy must be specified in <varname>NUMAMask=</varname>. For more details on each
+        policy please see,
         <citerefentry><refentrytitle>set_mempolicy</refentrytitle><manvolnum>2</manvolnum></citerefentry>. For overall
         overview of NUMA support in Linux see,
         <citerefentry project='man-pages'><refentrytitle>numa</refentrytitle><manvolnum>7</manvolnum></citerefentry>.
+        If the kernel does not support the requested policy, a warning is logged and the setting is ignored.
         </para>
 
         <xi:include href="version-info.xml" xpointer="v243"/></listitem>
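diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index c45c807198d6b9ed1d138d315063cc6fe656a504..2b0ced7b7d51e13b4e7227d3c4b2d8569e11e15b 100644 (file)
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c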
@@ -5646,8 +5646,10 @@ int exec_invoke(
 
         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
                 r = apply_numa_policy(&context->numa_policy);
-                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                if (r == -ENOSYS)
                         log_debug_errno(r, "NUMA support not available, ignoring.");
+                else if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                        log_warning_errno(r, "NUMA policy not supported by kernel, ignoring.");
                 else if (r < 0) {
                         *exit_status = EXIT_NUMA_POLICY;
                         return log_error_errno(r, "Failed to set NUMA memory policy: %m");
diff --git a/src/core/main.c b/src/core/main.c
index 6c83953f30e3ecfe1275e2e4b80801e3fbb3687c..810da526d785520e6cdd1c63c5ebdd9897ee0397 100644 (file)
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -2029,8 +2029,10 @@ static void update_numa_policy(bool skip_setup) {
         }
 
         r = apply_numa_policy(&arg_numa_policy);
-        if (r == -EOPNOTSUPP)
+        if (r == -ENOSYS)
                 log_debug_errno(r, "NUMA support not available, ignoring.");
+        else if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                log_warning_errno(r, "NUMA policy not supported by kernel, ignoring.");
         else if (r < 0)
                 log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m");
 }
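diff --git a/src/core/varlink-execute.c b/src/core/varlink-execute.c
index 7056439a220ae21d82eb20d0eed352fdb8fa4448..a80fa9692747a06e7c40fbff61986f353e6ebc45 100644 (file)
--- a/src/core/varlink-execute.c
+++ b/src/core/varlink-execute.c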
@@ -315,7 +315,11 @@ static int numa_policy_build_json(sd_json_variant **ret, const char *name, void
                 return 0;
         }
 
-        return sd_json_variant_new_string(ret, mpol_to_string(t));
+        _cleanup_free_ char *s = strdup(mpol_to_string(t));
+        if (!s)
+                return -ENOMEM;
+
+        return sd_json_variant_new_string(ret, json_underscorify(s));
 }
 
 static int numa_mask_build_json(sd_json_variant **ret, const char *name, void *userdata) {
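diff --git a/src/shared/numa-util.c b/src/shared/numa-util.c
index 34ddc0e547f5a0d023069dd82eebbb97003c07c2..af8770106e36533a8d61e63e184abe17a6f4e84e 100644 (file)
--- a/src/shared/numa-util.c
+++ b/src/shared/numa-util.c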
@@ -73,7 +73,11 @@ int apply_numa_policy(const NUMAPolicy *policy) {
         assert(policy);
 
         if (get_mempolicy(NULL, NULL, 0, NULL, 0) < 0 && errno == ENOSYS)
-                return -EOPNOTSUPP;
+                /* NUMA syscall interface not available (kernel compiled without NUMA support).
+                 * Return -ENOSYS so callers can distinguish this from -EOPNOTSUPP, which we
+                 * return below when the syscall interface exists but the requested policy is
+                 * not supported by this kernel version. */
+                return -ENOSYS;
 
         if (!numa_policy_is_valid(policy))
                 return -EINVAL;
@@ -83,8 +87,14 @@ int apply_numa_policy(const NUMAPolicy *policy) {
                 return r;
 
         r = set_mempolicy(numa_policy_get_type(policy), nodes, maxnode);
-        if (r < 0)
+        if (r < 0) {
+                // FIXME: This compatibility code path shall be removed once kernel 6.9
+                //        becomes the new minimal baseline (MPOL_WEIGHTED_INTERLEAVE).
+                if (errno == EINVAL && IN_SET(numa_policy_get_type(policy),
+                                             MPOL_PREFERRED_MANY, MPOL_WEIGHTED_INTERLEAVE))
+                        return -EOPNOTSUPP;
                 return -errno;
+        }
 
         return 0;
 }
@@ -241,11 +251,13 @@ int numa_mask_add_all(CPUSet *mask) {
 }
 
 static const char* const mpol_table[] = {
-        [MPOL_DEFAULT]    = "default",
-        [MPOL_PREFERRED]  = "preferred",
-        [MPOL_BIND]       = "bind",
-        [MPOL_INTERLEAVE] = "interleave",
-        [MPOL_LOCAL]      = "local",
+        [MPOL_DEFAULT]             = "default",
+        [MPOL_PREFERRED]           = "preferred",
+        [MPOL_BIND]                = "bind",
+        [MPOL_INTERLEAVE]          = "interleave",
+        [MPOL_LOCAL]               = "local",
+        [MPOL_PREFERRED_MANY]      = "preferred-many",
+        [MPOL_WEIGHTED_INTERLEAVE] = "weighted-interleave",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(mpol, int);
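diff --git a/src/shared/numa-util.h b/src/shared/numa-util.h
index 01079351b07ffcad2aaa77326d4e34e8b56f1669..8b9a81852f769caaac5ce34a2ae3a81ab136f1d8 100644 (file)
--- a/src/shared/numa-util.h
+++ b/src/shared/numa-util.h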
@@ -7,7 +7,7 @@
 #include "shared-forward.h"
 
 static inline bool mpol_is_valid(int t) {
-        return t >= MPOL_DEFAULT && t <= MPOL_LOCAL;
+        return t >= MPOL_DEFAULT && t <= MPOL_WEIGHTED_INTERLEAVE;
 }
 
 typedef struct NUMAPolicy {
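diff --git a/src/shared/varlink-io.systemd.Unit.c b/src/shared/varlink-io.systemd.Unit.c
index 3fc2d6fc7942c9f18bece71835c1f6252fe737f8..5a8e7c98331edfe95bd1b78fdce07c9bf45a53ea 100644 (file)
--- a/src/shared/varlink-io.systemd.Unit.c
+++ b/src/shared/varlink-io.systemd.Unit.c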
@@ -173,7 +173,9 @@ SD_VARLINK_DEFINE_ENUM_TYPE(
                 SD_VARLINK_DEFINE_ENUM_VALUE(preferred),
                 SD_VARLINK_DEFINE_ENUM_VALUE(bind),
                 SD_VARLINK_DEFINE_ENUM_VALUE(interleave),
-                SD_VARLINK_DEFINE_ENUM_VALUE(local));
+                SD_VARLINK_DEFINE_ENUM_VALUE(local),
+                SD_VARLINK_DEFINE_ENUM_VALUE(preferred_many),
+                SD_VARLINK_DEFINE_ENUM_VALUE(weighted_interleave));
 
 SD_VARLINK_DEFINE_ENUM_TYPE(
                 MountPropagationFlag,
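diff --git a/test/units/TEST-36-NUMAPOLICY.sh b/test/units/TEST-36-NUMAPOLICY.sh
index 1e2d7ac8ac133f47c15996c7a243d764a02f5da7..3eb9b2410fb510e8723316a5b2bc853d9e35f4b6 100755 (executable)
--- a/test/units/TEST-36-NUMAPOLICY.sh
+++ b/test/units/TEST-36-NUMAPOLICY.sh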
@@ -227,6 +227,26 @@ else
     pid1ReloadWithStrace
     grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" "$straceLog"
 
+    echo "PID1 NUMAPolicy support - Preferred-many policy w/o mask"
+    writePID1NUMAPolicy "preferred-many"
+    pid1ReloadWithJournal
+    grep "Failed to set NUMA memory policy, ignoring: Invalid argument" "$journalLog"
+
+    echo "PID1 NUMAPolicy support - Preferred-many policy w/ mask"
+    writePID1NUMAPolicy "preferred-many" "0"
+    pid1ReloadWithStrace
+    grep -E "set_mempolicy\((MPOL_PREFERRED_MANY|0x5 [^,]*), \[0x0*1\]" "$straceLog"
+
+    echo "PID1 NUMAPolicy support - Weighted-interleave policy w/o mask"
+    writePID1NUMAPolicy "weighted-interleave"
+    pid1ReloadWithJournal
+    grep "Failed to set NUMA memory policy, ignoring: Invalid argument" "$journalLog"
+
+    echo "PID1 NUMAPolicy support - Weighted-interleave policy w/ mask"
+    writePID1NUMAPolicy "weighted-interleave" "0"
+    pid1ReloadWithStrace
+    grep -E "set_mempolicy\((MPOL_WEIGHTED_INTERLEAVE|0x6 [^,]*), \[0x0*1\]" "$straceLog"
+
     echo "Unit file NUMAPolicy support - Default policy w/o mask"
     writeTestUnitNUMAPolicy "default"
     pid1StartUnitWithStrace "$testUnit"
@@ -297,6 +317,34 @@ else
     # Mask must be ignored
     grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" "$straceLog"
 
+    echo "Unit file NUMAPolicy support - Preferred-many policy w/o mask"
+    writeTestUnitNUMAPolicy "preferred-many"
+    pid1StartUnitWithStrace "$testUnit"
+    pid1StopUnit "$testUnit"
+    [[ $(systemctl show "$testUnit" -P ExecMainStatus) == "242" ]]
+
+    echo "Unit file NUMAPolicy support - Preferred-many policy w/ mask"
+    writeTestUnitNUMAPolicy "preferred-many" "0"
+    pid1StartUnitWithStrace "$testUnit"
+    systemctlCheckNUMAProperties "$testUnit" "preferred-many" "0"
+    varlinkctl call /run/systemd/io.systemd.Manager io.systemd.Unit.List "{\"name\":\"$testUnit\"}" | jq -e '.context.Exec.NUMAPolicy == "preferred_many"'
+    pid1StopUnit "$testUnit"
+    grep -E "set_mempolicy\((MPOL_PREFERRED_MANY|0x5 [^,]*), \[0x0*1\]" "$straceLog"
+
+    echo "Unit file NUMAPolicy support - Weighted-interleave policy w/o mask"
+    writeTestUnitNUMAPolicy "weighted-interleave"
+    pid1StartUnitWithStrace "$testUnit"
+    pid1StopUnit "$testUnit"
+    [[ $(systemctl show "$testUnit" -P ExecMainStatus) == "242" ]]
+
+    echo "Unit file NUMAPolicy support - Weighted-interleave policy w/ mask"
+    writeTestUnitNUMAPolicy "weighted-interleave" "0"
+    pid1StartUnitWithStrace "$testUnit"
+    systemctlCheckNUMAProperties "$testUnit" "weighted-interleave" "0"
+    varlinkctl call /run/systemd/io.systemd.Manager io.systemd.Unit.List "{\"name\":\"$testUnit\"}" | jq -e '.context.Exec.NUMAPolicy == "weighted_interleave"'
+    pid1StopUnit "$testUnit"
+    grep -E "set_mempolicy\((MPOL_WEIGHTED_INTERLEAVE|0x6 [^,]*), \[0x0*1\]" "$straceLog"
+
     echo "Unit file CPUAffinity=NUMA support"
     writeTestUnitNUMAPolicy "bind" "0"
     echo "CPUAffinity=numa" >>"$testUnitNUMAConf"
@@ -343,6 +391,14 @@ else
     systemctlCheckNUMAProperties "$runUnit" "local" ""
     systemctl cat "$runUnit" | grep 'CPUAffinity=numa' >/dev/null
     pid1StopUnit "$runUnit"
+
+    systemd-run -p NUMAPolicy=preferred-many -p NUMAMask=0 --unit "$runUnit" sleep 1000
+    systemctlCheckNUMAProperties "$runUnit" "preferred-many" "0"
+    pid1StopUnit "$runUnit"
+
+    systemd-run -p NUMAPolicy=weighted-interleave -p NUMAMask=0 --unit "$runUnit" sleep 1000
+    systemctlCheckNUMAProperties "$runUnit" "weighted-interleave" "0"
+    pid1StopUnit "$runUnit"
 fi
 
 # Cleanup