From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Fri, 25 Apr 2025 10:40:52 +0000 (+0200)
Subject:  core: Make sure we handle DelegateSubgroup= in combo with cgroupns
X-Git-Tag: v258-rc1~379^2
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fpull%2F36815%2Fhead;p=thirdparty%2Fsystemd.git

 core: Make sure we handle DelegateSubgroup= in combo with cgroupns

Currently, if we use a cgroup namespace together with DelegateSubgroup=,
the subgroup becomes the root of the cgroup namespace because we move the
service process to the subgroup before we unshare the cgroup namespace, and
the current cgroup becomes the root of the cgroup namespace when we unshare
the cgroup namespace.

Let's fix the problem by not moving the service process to the subgroup until
we've unshared the cgroup namespace. Note that this doesn't break the primary use
case of CLONE_INTO_CGROUP since we still use it to immediately clone into the service
main cgroup, just not anymore into the subgroup, but this shouldn't matter in practice.

Additionally, we need special handling for control processes, as those *do*
need to get spawned into the subcgroup immediately if delegation is configured to
avoid violating the cgroupsv2 "no inner processes" rule.

Effectively, this leaves us with the following logic:
- In exec_spawn(), spawn into subgroup if we're spawning a control process
  that needs to be spawned into a subgroup immediately. Otherwise, spawn into
  main service cgroup.
- In exec_invoke(), move into subgroup early if we don't need a cgroup namespace.
  Otherwise, move into subgroup after we've unshared the cgroup namespace.
---

diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index a9c8664c4a2..9e7af81f1ef 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -1177,14 +1177,56 @@ static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int fl
 }
 #endif
 
+static int attach_to_subcgroup(
+                const ExecContext *context,
+                const CGroupContext *cgroup_context,
+                const ExecParameters *params,
+                const char *prefix) {
+
+        _cleanup_free_ char *subgroup = NULL;
+        int r;
+
+        assert(context);
+        assert(cgroup_context);
+        assert(params);
+
+        /* If we're a control process that needs a subgroup, we've already been spawned into it as otherwise
+         * we'd violate the "no inner processes" rule, so no need to do anything. */
+        if (exec_params_needs_control_subcgroup(params))
+                return 0;
+
+        r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire cgroup path: %m");
+        /* No subgroup required? Then there's nothing to do. */
+        if (r == 0)
+                return 0;
+
+        r = cg_attach(subgroup, 0);
+        if (r == -EUCLEAN)
+                return log_error_errno(r,
+                                "Failed to attach process " PID_FMT " to cgroup '%s', "
+                                "because the cgroup or one of its parents or "
+                                "siblings is in the threaded mode.",
+                                getpid_cached(), subgroup);
+        if (r < 0)
+                return log_error_errno(r,
+                                "Failed to attach process " PID_FMT " to cgroup %s: %m",
+                                getpid_cached(), subgroup);
+
+        return 0;
+}
+
 static int setup_pam(
                 const ExecContext *context,
+                const CGroupContext *cgroup_context,
                 ExecParameters *params,
                 const char *user,
                 uid_t uid,
                 gid_t gid,
                 char ***env, /* updated on success */
                 const int fds[], size_t n_fds,
+                bool needs_sandboxing,
                 int exec_fd) {
 
 #if HAVE_PAM
@@ -1290,6 +1332,15 @@ static int setup_pam(
         if (r == 0) {
                 int ret = EXIT_PAM;
 
+                if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
+                        /* Move PAM process into subgroup immediately if the main process hasn't been moved
+                         * into the subgroup yet (when cgroup namespacing is enabled) and a subgroup is
+                         * configured. */
+                        r = attach_to_subcgroup(context, cgroup_context, params, params->cgroup_path);
+                        if (r < 0)
+                                return r;
+                }
+
                 /* The child's job is to reset the PAM session on termination */
                 barrier_set_role(&barrier, BARRIER_CHILD);
 
@@ -4911,28 +4962,48 @@ int exec_invoke(
         if (socket_fd >= 0)
                 (void) fd_nonblock(socket_fd, false);
 
+        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+         * from it. */
+        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
         if (params->cgroup_path) {
-                _cleanup_free_ char *p = NULL;
+                _cleanup_free_ char *subcgroup = NULL;
 
-                r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+                r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup);
                 if (r < 0) {
                         *exit_status = EXIT_CGROUP;
                         return log_error_errno(r, "Failed to acquire cgroup path: %m");
                 }
+                if (r > 0) {
+                        /* If there is a subcgroup required, let's make sure to create it now. */
+                        r = cg_create(subcgroup);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create subcgroup '%s': %m", subcgroup);
+                }
+
+                /* If we need a cgroup namespace, we cannot yet move the service to its configured subgroup,
+                 * as unsharing the cgroup namespace later on makes the current cgroup the root of the
+                 * namespace and we want the root of the namespace to be the main service cgroup and not the
+                 * subgroup. One edge case is if we're a control process that needs to be spawned in a
+                 * subgroup, in this case, we have no choice as moving into the main service cgroup might
+                 * violate the no inner processes rule of cgroupv2. */
+                const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context) &&
+                                                           !exec_params_needs_control_subcgroup(params)
+                                                           ? params->cgroup_path : subcgroup;
 
-                r = cg_attach(p, 0);
+                r = cg_attach(cgtarget, 0);
                 if (r == -EUCLEAN) {
                         *exit_status = EXIT_CGROUP;
                         return log_error_errno(r,
                                                "Failed to attach process to cgroup '%s', "
                                                "because the cgroup or one of its parents or "
-                                               "siblings is in the threaded mode.", p);
+                                               "siblings is in the threaded mode.", cgtarget);
                 }
                 if (r < 0) {
                         *exit_status = EXIT_CGROUP;
-                        return log_error_errno(r, "Failed to attach to cgroup %s: %m", p);
+                        return log_error_errno(r, "Failed to attach to cgroup %s: %m", cgtarget);
                 }
         }
 
@@ -5128,10 +5199,6 @@ int exec_invoke(
                 }
         }
 
-        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
-         * from it. */
-        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-
         if (params->cgroup_path) {
                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
@@ -5147,7 +5214,7 @@ int exec_invoke(
                                 return log_error_errno(r, "Failed to adjust control group access: %m");
                         }
 
-                        r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+                        r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
                         if (r < 0) {
                                 *exit_status = EXIT_CGROUP;
                                 return log_error_errno(r, "Failed to acquire cgroup path: %m");
@@ -5328,7 +5395,8 @@ int exec_invoke(
                  * wins here. (See above.) */
 
                 /* All fds passed in the fds array will be closed in the pam child process. */
-                r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd);
+                r = setup_pam(context, cgroup_context, params, username, uid, gid, &accum_env,
+                              params->fds, n_fds, needs_sandboxing, params->exec_fd);
                 if (r < 0) {
                         *exit_status = EXIT_PAM;
                         return log_error_errno(r, "Failed to set up PAM session: %m");
@@ -5453,6 +5521,17 @@ int exec_invoke(
         if (r < 0)
                 return r;
 
+        if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
+                /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
+                 * ensures the root of the cgroup namespace is the top level service cgroup and not the
+                 * subcgroup. Adjust the prefix accordingly since we're in a cgroup namespace now. */
+                r = attach_to_subcgroup(context, cgroup_context, params, /* prefix= */ NULL);
+                if (r < 0) {
+                        *exit_status = EXIT_CGROUP;
+                        return r;
+                }
+        }
+
         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
          * shall execute. */
 
diff --git a/src/core/execute.c b/src/core/execute.c
index 31b5e13fe4a..1fdb3190427 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -405,20 +405,24 @@ bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType typ
         return true;
 }
 
+int exec_params_needs_control_subcgroup(const ExecParameters *params) {
+        /* Keep this in sync with exec_params_get_cgroup_path(). */
+        return FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE|EXEC_CONTROL_CGROUP|EXEC_IS_CONTROL);
+}
+
 int exec_params_get_cgroup_path(
                 const ExecParameters *params,
                 const CGroupContext *c,
+                const char *prefix,
                 char **ret) {
 
         const char *subgroup = NULL;
         char *p;
 
         assert(params);
+        assert(c);
         assert(ret);
 
-        if (!params->cgroup_path)
-                return -EINVAL;
-
         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
@@ -428,6 +432,7 @@ int exec_params_get_cgroup_path(
          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
          * flag, which is only passed for the former statements, not for the latter. */
 
+        /* Keep this in sync with exec_params_needs_control_subcgroup(). */
         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
                         subgroup = ".control";
@@ -436,9 +441,9 @@ int exec_params_get_cgroup_path(
         }
 
         if (subgroup)
-                p = path_join(params->cgroup_path, subgroup);
+                p = path_join(prefix, subgroup);
         else
-                p = strdup(params->cgroup_path);
+                p = strdup(strempty(prefix));
         if (!p)
                 return -ENOMEM;
 
@@ -506,8 +511,16 @@ int exec_spawn(
            want to log from the parent, so we use the possibly inaccurate path here. */
         log_command_line(unit, "About to execute", command->path, command->argv);
 
-        if (params->cgroup_path) {
-                r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
+        /* We cannot spawn the main service process into the subcgroup as it might need to unshare the cgroup
+         * namespace first if one is configured to make sure the root of the cgroup namespace is the service
+         * cgroup and not the subcgroup. However, when running control commands on a live service, the
+         * commands have to be spawned inside a subcgroup, otherwise we violate the no inner processes rule
+         * of cgroupv2 as the main service process might already have enabled controllers by writing to
+         * cgroup.subtree_control. */
+
+        const char *cgtarget;
+        if (exec_params_needs_control_subcgroup(params)) {
+                r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup_path);
                 if (r < 0)
                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
                 if (r > 0) {
@@ -518,7 +531,10 @@ int exec_spawn(
                         if (r < 0)
                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
                 }
-        }
+
+                cgtarget = subcgroup_path;
+        } else
+                cgtarget = params->cgroup_path;
 
         /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
          * child's memory.max, serialize all the state needed to start the unit, and pass it to the
@@ -582,24 +598,24 @@ int exec_spawn(
                                   "--log-level", max_log_levels,
                                   "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
                         environ,
-                        subcgroup_path,
+                        cgtarget,
                         &pidref);
 
         /* Drop the ambient set again, so no processes other than sd-executore spawned from the manager inherit it. */
         (void) capability_ambient_set_apply(0, /* also_inherit= */ false);
 
-        if (r == -EUCLEAN && subcgroup_path)
+        if (r == -EUCLEAN && cgtarget)
                 return log_unit_error_errno(unit, r,
                                             "Failed to spawn process into cgroup '%s', because the cgroup "
                                             "or one of its parents or siblings is in the threaded mode.",
-                                            subcgroup_path);
+                                            cgtarget);
         if (r < 0)
                 return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
          * process will be killed too). */
-        if (r == 0 && subcgroup_path)
-                (void) cg_attach(subcgroup_path, pidref.pid);
+        if (r == 0 && cgtarget)
+                (void) cg_attach(cgtarget, pidref.pid);
         /* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */
 
         log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",
diff --git a/src/core/execute.h b/src/core/execute.h
index fba08627391..e8f4b3eea0e 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -551,7 +551,8 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
 void exec_runtime_clear(ExecRuntime *rt);
 
-int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret);
+int exec_params_needs_control_subcgroup(const ExecParameters *params);
+int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, const char *prefix, char **ret);
 void exec_params_shallow_clear(ExecParameters *p);
 void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
 void exec_params_deep_clear(ExecParameters *p);
diff --git a/test/units/TEST-07-PID1.protect-control-groups.sh b/test/units/TEST-07-PID1.protect-control-groups.sh
index e7752ffb4b1..0220e1b6922 100755
--- a/test/units/TEST-07-PID1.protect-control-groups.sh
+++ b/test/units/TEST-07-PID1.protect-control-groups.sh
@@ -104,4 +104,53 @@ testcase_basic_strict() {
     test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
 }
 
+testcase_delegate_subgroup() {
+    # Make sure the service cgroup is the root of the cgroup namespace when we use DelegateSubgroup.
+    systemd-run \
+        -p ProtectControlGroupsEx=private \
+        -p PrivateMounts=yes \
+        -p Delegate=yes \
+        -p DelegateSubgroup=supervisor \
+        --wait \
+        --pipe \
+        ls /sys/fs/cgroup/supervisor
+}
+
+testcase_delegate_subgroup_control() {
+    # Make sure control processes are namespaced, are put in the .control cgroup, have the .control group as
+    # the root of their cgroup namespace and don't violate the no inner processes rule. To ensure we don't
+    # violate the no inner processes rule, we make sure to enable a cgroup controller so that
+    # cgroup.subtree_control for the main service cgroup is not empty which will make any attempt to spawn
+    # processes into that cgroup fail with EBUSY.
+    assert_eq "$(
+        systemd-run \
+            --service-type=notify \
+            -p ProtectControlGroupsEx=private \
+            -p PrivateMounts=yes \
+            -p Delegate=yes \
+            -p DelegateSubgroup=supervisor \
+            -p ExecStartPost='sh -c "cat /proc/self/cgroup; kill $MAINPID"' \
+            --unit delegate-subgroup-control \
+            --wait \
+            --pipe \
+            sh -c 'echo +pids >/sys/fs/cgroup/cgroup.subtree_control; systemd-notify --ready; sleep infinity'
+    )" "0::/"
+}
+
+testcase_delegate_subgroup_pam() {
+    # Make sure any pam processes we spawn don't violate the no inner processes rule.
+    systemd-run \
+        --service-type=oneshot \
+        -p ProtectControlGroupsEx=private \
+        -p PrivateMounts=yes \
+        -p Delegate=yes \
+        -p DelegateSubgroup=supervisor \
+        -p User=testuser \
+        -p PAMName=systemd-user \
+        --unit delegate-subgroup-pam \
+        --wait \
+        --pipe \
+        sh -c 'echo +pids >/sys/fs/cgroup/cgroup.subtree_control'
+}
+
 run_testcases