]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core/unit: fail earlier before spawning executor when we failed to realize cgroup
authorYu Watanabe <watanabe.yu+github@gmail.com>
Tue, 23 Sep 2025 20:29:22 +0000 (05:29 +0900)
committerYu Watanabe <watanabe.yu+github@gmail.com>
Fri, 26 Sep 2025 00:28:12 +0000 (09:28 +0900)
Before 23ac08115af83e3a0a937fa207fc52511aba2ffa, even if we failed to
create the cgroup for a unit, a cgroup runtime object for the cgroup was
still created with the cgroup path. Hence, when the creation of the cgroup
failed, execution of the unit would fail in posix_spawn_wrapper() and log
something like the following:
```
systemd[1]: testservice.service: Failed to create cgroup /testslice.slice/testservice.service: Cannot allocate memory
systemd[1]: testservice.service: Failed to spawn executor: No such file or directory
systemd[1]: testservice.service: Failed to spawn 'start' task: No such file or directory
systemd[1]: testservice.service: Failed with result 'resources'.
systemd[1]: Failed to start testservice.service.
```

However, after that commit, when we fail to create the cgroup, no cgroup
runtime object is created, hence NULL is assigned to
ExecParameters.cgroup_path in unit_set_exec_params().
As a result, the unit process is invoked in init.scope.
```
systemd[1]: testservice.service: Failed to create cgroup /testslice.slice/testservice.service: Cannot allocate memory
systemd[1]: Starting testservice.service...
cat[1094]: 0::/init.scope
systemd[1]: testservice.service: Deactivated successfully.
systemd[1]: Finished testservice.service.
```
where the test service calls 'cat /proc/self/cgroup'.

To fix the issue, let's fail earlier when we fail to create the cgroup.

Follow-up for 23ac08115af83e3a0a937fa207fc52511aba2ffa (v258).

src/core/unit.c
test/units/TEST-19-CGROUP.abort-on-cgroup-creation-failure.sh [new file with mode: 0755]

index e99f8f2b3e745b1aec1f8facab5fa4610eb0b41b..147bc8f5f29a7175819066817b13d85d7486c50a 100644 (file)
@@ -5559,11 +5559,11 @@ int unit_fork_helper_process(Unit *u, const char *name, bool into_cgroup, PidRef
          * with the child's PID. */
 
         if (into_cgroup) {
-                (void) unit_realize_cgroup(u);
+                r = unit_realize_cgroup(u);
+                if (r < 0)
+                        return r;
 
-                crt = unit_setup_cgroup_runtime(u);
-                if (!crt)
-                        return -ENOMEM;
+                crt = unit_get_cgroup_runtime(u);
         }
 
         r = safe_fork(name, FORK_REOPEN_LOG|FORK_DEATHSIG_SIGTERM, &pid);
@@ -6013,7 +6013,9 @@ int unit_prepare_exec(Unit *u) {
 
         /* Prepares everything so that we can fork of a process for this unit */
 
-        (void) unit_realize_cgroup(u);
+        r = unit_realize_cgroup(u);
+        if (r < 0)
+                return r;
 
         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
         if (crt && crt->reset_accounting) {
diff --git a/test/units/TEST-19-CGROUP.abort-on-cgroup-creation-failure.sh b/test/units/TEST-19-CGROUP.abort-on-cgroup-creation-failure.sh
new file mode 100755 (executable)
index 0000000..41f8b96
--- /dev/null
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+set -ex
+set -o pipefail
+
+# Test that the service is not invoked if the cgroup cannot be created.
+
+# It seems the openSUSE kernel (at least kernel-default-6.16.8-1.1.x86_64.rpm) has a
+# bug in the kernel OOM killer or the clone3 syscall: spawning the executor in a cgroup
+# with a too small MemoryMax= triggers an infinite loop of OOM kills, posix_spawn()
+# never returns, and the service manager gets stuck.
+####
+# [  119.776797] systemd invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0
+# [  119.776859] CPU: 1 UID: 0 PID: 1472 Comm: systemd Not tainted 6.16.8-1-default #1 PREEMPT(voluntary) openSUSE Tumbleweed  6c85865973e4ae641870ed68afe8933a6986c974
+# [  119.776865] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014
+# [  119.776867] Call Trace:
+# (snip)
+# [  119.778126] Out of memory and no killable processes...
+####
+# On other distributions, the OOM killer is triggered, but clone3 immediately
+# fails with ENOMEM, so no such problematic loop occurs.
+. /etc/os-release
+if [[ "$ID" =~ opensuse ]]; then
+    echo "Skipping cgroup test with too small MemoryMax= setting on openSUSE."
+    exit 0
+fi
+
+cat >/run/systemd/system/testslice.slice <<EOF
+[Slice]
+MemoryMax=1
+EOF
+
+cat >/run/systemd/system/testservice.service <<EOF
+[Service]
+Type=oneshot
+ExecStart=cat /proc/self/cgroup
+Slice=testslice.slice
+EOF
+
+systemctl daemon-reload
+(! systemctl start testservice.service)
+
+rm /run/systemd/system/testslice.slice
+rm /run/systemd/system/testservice.service
+
+exit 0