return 0;
}
-int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, PidRef *ret_pidref) {
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t*, posix_spawnattr_destroy, NULL);
+
+int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, const char *cgroup, PidRef *ret_pidref) {
+ short flags = POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF;
posix_spawnattr_t attr;
sigset_t mask;
- pid_t pid;
int r;
/* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
r = posix_spawnattr_init(&attr);
if (r != 0)
return -r; /* These functions return a positive errno on failure */
- /* Set all signals to SIG_DFL */
- r = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF);
+
+ /* Initialization needs to succeed before we can set up a destructor. */
+ _unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr;
+
+#if HAVE_PIDFD_SPAWN
+ _cleanup_close_ int cgroup_fd = -EBADF;
+
+ if (cgroup) {
+ _cleanup_free_ char *resolved_cgroup = NULL;
+
+ r = cg_get_path_and_check(
+ SYSTEMD_CGROUP_CONTROLLER,
+ cgroup,
+ /* suffix= */ NULL,
+ &resolved_cgroup);
+ if (r < 0)
+ return r;
+
+ cgroup_fd = open(resolved_cgroup, O_PATH|O_DIRECTORY|O_CLOEXEC);
+ if (cgroup_fd < 0)
+ return -errno;
+
+ r = posix_spawnattr_setcgroup_np(&attr, cgroup_fd);
+ if (r != 0)
+ return -r;
+
+ flags |= POSIX_SPAWN_SETCGROUP;
+ }
+#endif
+
+ r = posix_spawnattr_setflags(&attr, flags);
if (r != 0)
- goto fail;
+ return -r;
r = posix_spawnattr_setsigmask(&attr, &mask);
if (r != 0)
- goto fail;
+ return -r;
- r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
+#if HAVE_PIDFD_SPAWN
+ _cleanup_close_ int pidfd = -EBADF;
+
+ r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
+ if (r == 0) {
+ r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
+ if (r < 0)
+ return r;
+
+ if (cgroup_fd == -EBADF)
+ return 0; /* We managed to use the new API but we are running under cgroupv1 */
+
+ return 1; /* We managed to use the new API and we are already in the new cgroup */
+ }
+ if (!(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r)))
+ return -r;
+
+ /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but need to change the
+ * flags to remove the cgroup one, which is what redirects to clone3() */
+ flags &= ~POSIX_SPAWN_SETCGROUP;
+ r = posix_spawnattr_setflags(&attr, flags);
if (r != 0)
- goto fail;
+ return -r;
+#endif
- posix_spawnattr_destroy(&attr);
+ pid_t pid;
+ r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
+ if (r != 0)
+ return -r;
- return pidref_set_pid(ret_pidref, pid);
+ r = pidref_set_pid(ret_pidref, pid);
+ if (r < 0)
+ return r;
-fail:
- assert(r > 0);
- posix_spawnattr_destroy(&attr);
- return -r;
+ return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
}
int proc_dir_open(DIR **ret) {
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
_cleanup_fdset_free_ FDSet *fdset = NULL;
_cleanup_fclose_ FILE *f = NULL;
+ bool move_child = true;
int r;
assert(unit);
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
* systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
* and ensure all memory is shared. The child immediately execs the new binary so the delay should
- * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the
- * target cgroup. */
+ * be minimal. If glibc 2.39 is available pidfd_spawn() is used in order to get a race-free pid fd
+ * and to clone directly into the target cgroup (if we booted with cgroupv2). */
r = open_serialization_file("sd-executor-state", &f);
if (r < 0)
"--log-level", log_level,
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
environ,
+ cg_unified() > 0 ? subcgroup_path : NULL,
&pidref);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
+ if (r > 0)
+ move_child = false; /* Already in the right cgroup thanks to CLONE_INTO_CGROUP */
log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pidref.pid);
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
* process will be killed too). */
- if (subcgroup_path)
+ if (move_child && subcgroup_path)
(void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pidref.pid);
exec_status_start(&command->exec_status, pidref.pid);