}
#endif
+static int attach_to_subcgroup(
+ const ExecContext *context,
+ const CGroupContext *cgroup_context,
+ const ExecParameters *params,
+ const char *prefix) {
+
+ _cleanup_free_ char *subgroup = NULL;
+ int r;
+
+ assert(context);
+ assert(cgroup_context);
+ assert(params);
+
+ /* If we're a control process that needs a subgroup, we've already been spawned into it as otherwise
+ * we'd violate the "no inner processes" rule, so no need to do anything. */
+ if (exec_params_needs_control_subcgroup(params))
+ return 0;
+
+ r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire cgroup path: %m");
+ /* No subgroup required? Then there's nothing to do. */
+ if (r == 0)
+ return 0;
+
+ r = cg_attach(subgroup, 0);
+ if (r == -EUCLEAN)
+ return log_error_errno(r,
+ "Failed to attach process " PID_FMT " to cgroup '%s', "
+ "because the cgroup or one of its parents or "
+ "siblings is in the threaded mode.",
+ getpid_cached(), subgroup);
+ if (r < 0)
+ return log_error_errno(r,
+ "Failed to attach process " PID_FMT " to cgroup %s: %m",
+ getpid_cached(), subgroup);
+
+ return 0;
+}
+
static int setup_pam(
const ExecContext *context,
+ const CGroupContext *cgroup_context,
ExecParameters *params,
const char *user,
uid_t uid,
gid_t gid,
char ***env, /* updated on success */
const int fds[], size_t n_fds,
+ bool needs_sandboxing,
int exec_fd) {
#if HAVE_PAM
if (r == 0) {
int ret = EXIT_PAM;
+ if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
+ /* Move PAM process into subgroup immediately if the main process hasn't been moved
+ * into the subgroup yet (when cgroup namespacing is enabled) and a subgroup is
+ * configured. */
+ r = attach_to_subcgroup(context, cgroup_context, params, params->cgroup_path);
+ if (r < 0)
+ return r;
+ }
+
/* The child's job is to reset the PAM session on termination */
barrier_set_role(&barrier, BARRIER_CHILD);
if (socket_fd >= 0)
(void) fd_nonblock(socket_fd, false);
+ /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+ * from it. */
+ needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
/* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
* Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
if (params->cgroup_path) {
- _cleanup_free_ char *p = NULL;
+ _cleanup_free_ char *subcgroup = NULL;
- r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+ r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_error_errno(r, "Failed to acquire cgroup path: %m");
}
+ if (r > 0) {
+ /* If there is a subcgroup required, let's make sure to create it now. */
+ r = cg_create(subcgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create subcgroup '%s': %m", subcgroup);
+ }
+
+ /* If we need a cgroup namespace, we cannot yet move the service to its configured subgroup,
+ * as unsharing the cgroup namespace later on makes the current cgroup the root of the
+ * namespace and we want the root of the namespace to be the main service cgroup and not the
+ * subgroup. One edge case is if we're a control process that needs to be spawned in a
+ * subgroup, in this case, we have no choice as moving into the main service cgroup might
+ * violate the no inner processes rule of cgroupv2. */
+ const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context) &&
+ !exec_params_needs_control_subcgroup(params)
+ ? params->cgroup_path : subcgroup;
- r = cg_attach(p, 0);
+ r = cg_attach(cgtarget, 0);
if (r == -EUCLEAN) {
*exit_status = EXIT_CGROUP;
return log_error_errno(r,
"Failed to attach process to cgroup '%s', "
"because the cgroup or one of its parents or "
- "siblings is in the threaded mode.", p);
+ "siblings is in the threaded mode.", cgtarget);
}
if (r < 0) {
*exit_status = EXIT_CGROUP;
- return log_error_errno(r, "Failed to attach to cgroup %s: %m", p);
+ return log_error_errno(r, "Failed to attach to cgroup %s: %m", cgtarget);
}
}
}
}
- /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
- * from it. */
- needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-
if (params->cgroup_path) {
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
return log_error_errno(r, "Failed to adjust control group access: %m");
}
- r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+ r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_error_errno(r, "Failed to acquire cgroup path: %m");
* wins here. (See above.) */
/* All fds passed in the fds array will be closed in the pam child process. */
- r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd);
+ r = setup_pam(context, cgroup_context, params, username, uid, gid, &accum_env,
+ params->fds, n_fds, needs_sandboxing, params->exec_fd);
if (r < 0) {
*exit_status = EXIT_PAM;
return log_error_errno(r, "Failed to set up PAM session: %m");
if (r < 0)
return r;
+ if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
+ /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
+ * ensures the root of the cgroup namespace is the top level service cgroup and not the
+ * subcgroup. Adjust the prefix accordingly since we're in a cgroup namespace now. */
+ r = attach_to_subcgroup(context, cgroup_context, params, /* prefix= */ NULL);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return r;
+ }
+ }
+
/* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
* shall execute. */
return true;
}
+int exec_params_needs_control_subcgroup(const ExecParameters *params) {
+ /* Keep this in sync with exec_params_get_cgroup_path(). */
+ return FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE|EXEC_CONTROL_CGROUP|EXEC_IS_CONTROL);
+}
+
int exec_params_get_cgroup_path(
const ExecParameters *params,
const CGroupContext *c,
+ const char *prefix,
char **ret) {
const char *subgroup = NULL;
char *p;
assert(params);
+ assert(c);
assert(ret);
- if (!params->cgroup_path)
- return -EINVAL;
-
/* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
* subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
* processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
* this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
* flag, which is only passed for the former statements, not for the latter. */
+ /* Keep this in sync with exec_params_needs_control_subcgroup(). */
if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
subgroup = ".control";
}
if (subgroup)
- p = path_join(params->cgroup_path, subgroup);
+ p = path_join(prefix, subgroup);
else
- p = strdup(params->cgroup_path);
+ p = strdup(strempty(prefix));
if (!p)
return -ENOMEM;
want to log from the parent, so we use the possibly inaccurate path here. */
log_command_line(unit, "About to execute", command->path, command->argv);
- if (params->cgroup_path) {
- r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
+ /* We cannot spawn the main service process into the subcgroup as it might need to unshare the cgroup
+ * namespace first if one is configured to make sure the root of the cgroup namespace is the service
+ * cgroup and not the subcgroup. However, when running control commands on a live service, the
+ * commands have to be spawned inside a subcgroup, otherwise we violate the no inner processes rule
+ * of cgroupv2 as the main service process might already have enabled controllers by writing to
+ * cgroup.subtree_control. */
+
+ const char *cgtarget;
+ if (exec_params_needs_control_subcgroup(params)) {
+ r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
if (r > 0) {
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
}
- }
+
+ cgtarget = subcgroup_path;
+ } else
+ cgtarget = params->cgroup_path;
/* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
"--log-level", max_log_levels,
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
environ,
- subcgroup_path,
+ cgtarget,
&pidref);
/* Drop the ambient set again, so no processes other than sd-executore spawned from the manager inherit it. */
(void) capability_ambient_set_apply(0, /* also_inherit= */ false);
- if (r == -EUCLEAN && subcgroup_path)
+ if (r == -EUCLEAN && cgtarget)
return log_unit_error_errno(unit, r,
"Failed to spawn process into cgroup '%s', because the cgroup "
"or one of its parents or siblings is in the threaded mode.",
- subcgroup_path);
+ cgtarget);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
* process will be killed too). */
- if (r == 0 && subcgroup_path)
- (void) cg_attach(subcgroup_path, pidref.pid);
+ if (r == 0 && cgtarget)
+ (void) cg_attach(cgtarget, pidref.pid);
/* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */
log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",
test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
}
+testcase_delegate_subgroup() {
+ # Make sure the service cgroup is the root of the cgroup namespace when we use DelegateSubgroup.
+ systemd-run \
+ -p ProtectControlGroupsEx=private \
+ -p PrivateMounts=yes \
+ -p Delegate=yes \
+ -p DelegateSubgroup=supervisor \
+ --wait \
+ --pipe \
+ ls /sys/fs/cgroup/supervisor
+}
+
+testcase_delegate_subgroup_control() {
+ # Make sure control processes are namespaced, are put in the .control cgroup, have the .control group as
+ # the root of their cgroup namespace and don't violate the no inner processes rule. To ensure we don't
+ # violate the no inner processes rule, we make sure to enable a cgroup controller so that
+ # cgroup.subtree_control for the main service cgroup is not empty which will make any attempt to spawn
+ # processes into that cgroup fail with EBUSY.
+ assert_eq "$(
+ systemd-run \
+ --service-type=notify \
+ -p ProtectControlGroupsEx=private \
+ -p PrivateMounts=yes \
+ -p Delegate=yes \
+ -p DelegateSubgroup=supervisor \
+ -p ExecStartPost='sh -c "cat /proc/self/cgroup; kill $MAINPID"' \
+ --unit delegate-subgroup-control \
+ --wait \
+ --pipe \
+ sh -c 'echo +pids >/sys/fs/cgroup/cgroup.subtree_control; systemd-notify --ready; sleep infinity'
+ )" "0::/"
+}
+
+testcase_delegate_subgroup_pam() {
+ # Make sure any pam processes we spawn don't violate the no inner processes rule.
+ systemd-run \
+ --service-type=oneshot \
+ -p ProtectControlGroupsEx=private \
+ -p PrivateMounts=yes \
+ -p Delegate=yes \
+ -p DelegateSubgroup=supervisor \
+ -p User=testuser \
+ -p PAMName=systemd-user \
+ --unit delegate-subgroup-pam \
+ --wait \
+ --pipe \
+ sh -c 'echo +pids >/sys/fs/cgroup/cgroup.subtree_control'
+}
+
run_testcases