core: Make sure we handle DelegateSubgroup= in combo with cgroupns

author Daan De Meyer <daan.j.demeyer@gmail.com>

Fri, 25 Apr 2025 10:40:52 +0000 (12:40 +0200)

committer Daan De Meyer <daan.j.demeyer@gmail.com>

Thu, 5 Jun 2025 10:37:02 +0000 (12:37 +0200)
author Daan De Meyer <daan.j.demeyer@gmail.com>
Fri, 25 Apr 2025 10:40:52 +0000 (12:40 +0200)
committer Daan De Meyer <daan.j.demeyer@gmail.com>
Thu, 5 Jun 2025 10:37:02 +0000 (12:37 +0200)
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c

index a9c8664c4a2ec10d672d8347dff6ec65a853c377..9e7af81f1ef1259af29f99e1fe77a7d2e7247468 100644 (file)
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -1177,14 +1177,56 @@ static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int fl
  }
  #endif
  
+static int attach_to_subcgroup(
+                const ExecContext *context,
+                const CGroupContext *cgroup_context,
+                const ExecParameters *params,
+                const char *prefix) {
+
+        _cleanup_free_ char *subgroup = NULL;
+        int r;
+
+        assert(context);
+        assert(cgroup_context);
+        assert(params);
+
+        /* If we're a control process that needs a subgroup, we've already been spawned into it as otherwise
+         * we'd violate the "no inner processes" rule, so no need to do anything. */
+        if (exec_params_needs_control_subcgroup(params))
+                return 0;
+
+        r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire cgroup path: %m");
+        /* No subgroup required? Then there's nothing to do. */
+        if (r == 0)
+                return 0;
+
+        r = cg_attach(subgroup, 0);
+        if (r == -EUCLEAN)
+                return log_error_errno(r,
+                                "Failed to attach process " PID_FMT " to cgroup '%s', "
+                                "because the cgroup or one of its parents or "
+                                "siblings is in the threaded mode.",
+                                getpid_cached(), subgroup);
+        if (r < 0)
+                return log_error_errno(r,
+                                "Failed to attach process " PID_FMT " to cgroup %s: %m",
+                                getpid_cached(), subgroup);
+
+        return 0;
+}
+
  static int setup_pam(
                  const ExecContext *context,
+                const CGroupContext *cgroup_context,
                  ExecParameters *params,
                  const char *user,
                  uid_t uid,
                  gid_t gid,
                  char ***env, /* updated on success */
                  const int fds[], size_t n_fds,
+                bool needs_sandboxing,
                  int exec_fd) {
  
  #if HAVE_PAM
@@ -1290,6 +1332,15 @@ static int setup_pam(
          if (r == 0) {
                  int ret = EXIT_PAM;
  
+                if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
+                        /* Move PAM process into subgroup immediately if the main process hasn't been moved
+                         * into the subgroup yet (when cgroup namespacing is enabled) and a subgroup is
+                         * configured. */
+                        r = attach_to_subcgroup(context, cgroup_context, params, params->cgroup_path);
+                        if (r < 0)
+                                return r;
+                }
+
                  /* The child's job is to reset the PAM session on termination */
                  barrier_set_role(&barrier, BARRIER_CHILD);
  
@@ -4911,28 +4962,48 @@ int exec_invoke(
          if (socket_fd >= 0)
                  (void) fd_nonblock(socket_fd, false);
  
+        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+         * from it. */
+        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
          /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
           * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
          if (params->cgroup_path) {
-                _cleanup_free_ char *p = NULL;
+                _cleanup_free_ char *subcgroup = NULL;
  
-                r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+                r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup);
                  if (r < 0) {
                          *exit_status = EXIT_CGROUP;
                          return log_error_errno(r, "Failed to acquire cgroup path: %m");
                  }
+                if (r > 0) {
+                        /* If there is a subcgroup required, let's make sure to create it now. */
+                        r = cg_create(subcgroup);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create subcgroup '%s': %m", subcgroup);
+                }
+
+                /* If we need a cgroup namespace, we cannot yet move the service to its configured subgroup,
+                 * as unsharing the cgroup namespace later on makes the current cgroup the root of the
+                 * namespace and we want the root of the namespace to be the main service cgroup and not the
+                 * subgroup. One edge case is if we're a control process that needs to be spawned in a
+                 * subgroup, in this case, we have no choice as moving into the main service cgroup might
+                 * violate the no inner processes rule of cgroupv2. */
+                const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context) &&
+                                                           !exec_params_needs_control_subcgroup(params)
+                                                           ? params->cgroup_path : subcgroup;
  
-                r = cg_attach(p, 0);
+                r = cg_attach(cgtarget, 0);
                  if (r == -EUCLEAN) {
                          *exit_status = EXIT_CGROUP;
                          return log_error_errno(r,
                                                 "Failed to attach process to cgroup '%s', "
                                                 "because the cgroup or one of its parents or "
-                                               "siblings is in the threaded mode.", p);
+                                               "siblings is in the threaded mode.", cgtarget);
                  }
                  if (r < 0) {
                          *exit_status = EXIT_CGROUP;
-                        return log_error_errno(r, "Failed to attach to cgroup %s: %m", p);
+                        return log_error_errno(r, "Failed to attach to cgroup %s: %m", cgtarget);
                  }
          }
  
@@ -5128,10 +5199,6 @@ int exec_invoke(
                  }
          }
  
-        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
-         * from it. */
-        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-
          if (params->cgroup_path) {
                  /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
                   * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
@@ -5147,7 +5214,7 @@ int exec_invoke(
                                  return log_error_errno(r, "Failed to adjust control group access: %m");
                          }
  
-                        r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+                        r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
                          if (r < 0) {
                                  *exit_status = EXIT_CGROUP;
                                  return log_error_errno(r, "Failed to acquire cgroup path: %m");
@@ -5328,7 +5395,8 @@ int exec_invoke(
                   * wins here. (See above.) */
  
                  /* All fds passed in the fds array will be closed in the pam child process. */
-                r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd);
+                r = setup_pam(context, cgroup_context, params, username, uid, gid, &accum_env,
+                              params->fds, n_fds, needs_sandboxing, params->exec_fd);
                  if (r < 0) {
                          *exit_status = EXIT_PAM;
                          return log_error_errno(r, "Failed to set up PAM session: %m");
@@ -5453,6 +5521,17 @@ int exec_invoke(
          if (r < 0)
                  return r;
  
+        if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
+                /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
+                 * ensures the root of the cgroup namespace is the top level service cgroup and not the
+                 * subcgroup. Adjust the prefix accordingly since we're in a cgroup namespace now. */
+                r = attach_to_subcgroup(context, cgroup_context, params, /* prefix= */ NULL);
+                if (r < 0) {
+                        *exit_status = EXIT_CGROUP;
+                        return r;
+                }
+        }
+
          /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
           * shall execute. */
  
diff --git a/src/core/execute.c b/src/core/execute.c

index 31b5e13fe4a788b2b780fdc7a844c2ba9ae2a3bb..1fdb3190427944f16043dc97e5c42e74657dd18f 100644 (file)
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -405,20 +405,24 @@ bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType typ
          return true;
  }
  
+int exec_params_needs_control_subcgroup(const ExecParameters *params) {
+        /* Keep this in sync with exec_params_get_cgroup_path(). */
+        return FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE|EXEC_CONTROL_CGROUP|EXEC_IS_CONTROL);
+}
+
  int exec_params_get_cgroup_path(
                  const ExecParameters *params,
                  const CGroupContext *c,
+                const char *prefix,
                  char **ret) {
  
          const char *subgroup = NULL;
          char *p;
  
          assert(params);
+        assert(c);
          assert(ret);
  
-        if (!params->cgroup_path)
-                return -EINVAL;
-
          /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
           * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
           * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
@@ -428,6 +432,7 @@ int exec_params_get_cgroup_path(
           * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
           * flag, which is only passed for the former statements, not for the latter. */
  
+        /* Keep this in sync with exec_params_needs_control_subcgroup(). */
          if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
                  if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
                          subgroup = ".control";
@@ -436,9 +441,9 @@ int exec_params_get_cgroup_path(
          }
  
          if (subgroup)
-                p = path_join(params->cgroup_path, subgroup);
+                p = path_join(prefix, subgroup);
          else
-                p = strdup(params->cgroup_path);
+                p = strdup(strempty(prefix));
          if (!p)
                  return -ENOMEM;
  
@@ -506,8 +511,16 @@ int exec_spawn(
             want to log from the parent, so we use the possibly inaccurate path here. */
          log_command_line(unit, "About to execute", command->path, command->argv);
  
-        if (params->cgroup_path) {
-                r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
+        /* We cannot spawn the main service process into the subcgroup as it might need to unshare the cgroup
+         * namespace first if one is configured to make sure the root of the cgroup namespace is the service
+         * cgroup and not the subcgroup. However, when running control commands on a live service, the
+         * commands have to be spawned inside a subcgroup, otherwise we violate the no inner processes rule
+         * of cgroupv2 as the main service process might already have enabled controllers by writing to
+         * cgroup.subtree_control. */
+
+        const char *cgtarget;
+        if (exec_params_needs_control_subcgroup(params)) {
+                r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup_path);
                  if (r < 0)
                          return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
                  if (r > 0) {
@@ -518,7 +531,10 @@ int exec_spawn(
                          if (r < 0)
                                  return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
                  }
-        }
+
+                cgtarget = subcgroup_path;
+        } else
+                cgtarget = params->cgroup_path;
  
          /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
           * child's memory.max, serialize all the state needed to start the unit, and pass it to the
@@ -582,24 +598,24 @@ int exec_spawn(
                                    "--log-level", max_log_levels,
                                    "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
                          environ,
-                        subcgroup_path,
+                        cgtarget,
                          &pidref);
  
          /* Drop the ambient set again, so no processes other than sd-executore spawned from the manager inherit it. */
          (void) capability_ambient_set_apply(0, /* also_inherit= */ false);
  
-        if (r == -EUCLEAN && subcgroup_path)
+        if (r == -EUCLEAN && cgtarget)
                  return log_unit_error_errno(unit, r,
                                              "Failed to spawn process into cgroup '%s', because the cgroup "
                                              "or one of its parents or siblings is in the threaded mode.",
-                                            subcgroup_path);
+                                            cgtarget);
          if (r < 0)
                  return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
          /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
           * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
           * process will be killed too). */
-        if (r == 0 && subcgroup_path)
-                (void) cg_attach(subcgroup_path, pidref.pid);
+        if (r == 0 && cgtarget)
+                (void) cg_attach(cgtarget, pidref.pid);
          /* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */
  
          log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",
diff --git a/src/core/execute.h b/src/core/execute.h

index fba08627391af291507159b5938c293af34903f1..e8f4b3eea0ea605751c5292a4edf5d3c4e9892b2 100644 (file)
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -551,7 +551,8 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
  ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
  void exec_runtime_clear(ExecRuntime *rt);
  
-int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret);
+int exec_params_needs_control_subcgroup(const ExecParameters *params);
+int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, const char *prefix, char **ret);
  void exec_params_shallow_clear(ExecParameters *p);
  void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
  void exec_params_deep_clear(ExecParameters *p);
diff --git a/test/units/TEST-07-PID1.protect-control-groups.sh b/test/units/TEST-07-PID1.protect-control-groups.sh

index e7752ffb4b14d1176b180739610039ad9bf1b9e3..0220e1b6922dd0fa0ecc87fd5517881569dcd028 100755 (executable)
--- a/test/units/TEST-07-PID1.protect-control-groups.sh
+++ b/test/units/TEST-07-PID1.protect-control-groups.sh
@@ -104,4 +104,53 @@ testcase_basic_strict() {
      test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
  }
  
+testcase_delegate_subgroup() {
+    # Make sure the service cgroup is the root of the cgroup namespace when we use DelegateSubgroup.
+    systemd-run \
+        -p ProtectControlGroupsEx=private \
+        -p PrivateMounts=yes \
+        -p Delegate=yes \
+        -p DelegateSubgroup=supervisor \
+        --wait \
+        --pipe \
+        ls /sys/fs/cgroup/supervisor
+}
+
+testcase_delegate_subgroup_control() {
+    # Make sure control processes are namespaced, are put in the .control cgroup, have the .control group as
+    # the root of their cgroup namespace and don't violate the no inner processes rule. To ensure we don't
+    # violate the no inner processes rule, we make sure to enable a cgroup controller so that
+    # cgroup.subtree_control for the main service cgroup is not empty which will make any attempt to spawn
+    # processes into that cgroup fail with EBUSY.
+    assert_eq "$(
+        systemd-run \
+            --service-type=notify \
+            -p ProtectControlGroupsEx=private \
+            -p PrivateMounts=yes \
+            -p Delegate=yes \
+            -p DelegateSubgroup=supervisor \
+            -p ExecStartPost='sh -c "cat /proc/self/cgroup; kill $MAINPID"' \
+            --unit delegate-subgroup-control \
+            --wait \
+            --pipe \
+            sh -c 'echo +pids >/sys/fs/cgroup/cgroup.subtree_control; systemd-notify --ready; sleep infinity'
+    )" "0::/"
+}
+
+testcase_delegate_subgroup_pam() {
+    # Make sure any pam processes we spawn don't violate the no inner processes rule.
+    systemd-run \
+        --service-type=oneshot \
+        -p ProtectControlGroupsEx=private \
+        -p PrivateMounts=yes \
+        -p Delegate=yes \
+        -p DelegateSubgroup=supervisor \
+        -p User=testuser \
+        -p PAMName=systemd-user \
+        --unit delegate-subgroup-pam \
+        --wait \
+        --pipe \
+        sh -c 'echo +pids >/sys/fs/cgroup/cgroup.subtree_control'
+}
+
  run_testcases
author	Daan De Meyer <daan.j.demeyer@gmail.com>
	Fri, 25 Apr 2025 10:40:52 +0000 (12:40 +0200)
committer	Daan De Meyer <daan.j.demeyer@gmail.com>
	Thu, 5 Jun 2025 10:37:02 +0000 (12:37 +0200)
src/core/exec-invoke.c		patch \| blob \| blame \| history
src/core/execute.c		patch \| blob \| blame \| history
src/core/execute.h		patch \| blob \| blame \| history
test/units/TEST-07-PID1.protect-control-groups.sh		patch \| blob \| blame \| history