]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: add support for pidfd_spawn
authorLuca Boccassi <bluca@debian.org>
Fri, 12 Jan 2024 21:32:20 +0000 (21:32 +0000)
committerLuca Boccassi <bluca@debian.org>
Mon, 5 Feb 2024 21:52:36 +0000 (21:52 +0000)
Added in glibc 2.39, allows cloning into a cgroup and to get
a pid fd back instead of a pid. Removes race conditions for
both changing cgroups and getting a reliable reference for the
child process.

Fixes https://github.com/systemd/systemd/pull/18843
Replaces https://github.com/systemd/systemd/pull/16706

meson.build
src/basic/process-util.c
src/basic/process-util.h
src/core/execute.c

index 1001c114a424add7dcb3925656b1ce3526633604..4ebe46dd2342bb9ed99dd41143dc56c10458fa9d 100644 (file)
@@ -640,6 +640,7 @@ foreach ident : [
         ['fsconfig',          '''#include <sys/mount.h>'''],
         ['fsmount',           '''#include <sys/mount.h>'''],
         ['getdents64',        '''#include <dirent.h>'''],
+        ['pidfd_spawn',       '''#include <spawn.h>'''],
 ]
 
         have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')
index 701eea673c5aa38b4bc3ff2e0e917471b58a529e..ebf3344bf1b8b31c8f3165fa6bf73980ededc9d3 100644 (file)
@@ -2034,10 +2034,12 @@ int make_reaper_process(bool b) {
         return 0;
 }
 
-int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, PidRef *ret_pidref) {
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t*, posix_spawnattr_destroy, NULL);
+
+int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, const char *cgroup, PidRef *ret_pidref) {
+        short flags = POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF;
         posix_spawnattr_t attr;
         sigset_t mask;
-        pid_t pid;
         int r;
 
         /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
@@ -2054,26 +2056,78 @@ int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp,
         r = posix_spawnattr_init(&attr);
         if (r != 0)
                 return -r; /* These functions return a positive errno on failure */
-        /* Set all signals to SIG_DFL */
-        r = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF);
+
+        /* Initialization needs to succeed before we can set up a destructor. */
+        _unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr;
+
+#if HAVE_PIDFD_SPAWN
+        _cleanup_close_ int cgroup_fd = -EBADF;
+
+        if (cgroup) {
+                _cleanup_free_ char *resolved_cgroup = NULL;
+
+                r = cg_get_path_and_check(
+                                SYSTEMD_CGROUP_CONTROLLER,
+                                cgroup,
+                                /* suffix= */ NULL,
+                                &resolved_cgroup);
+                if (r < 0)
+                        return r;
+
+                cgroup_fd = open(resolved_cgroup, O_PATH|O_DIRECTORY|O_CLOEXEC);
+                if (cgroup_fd < 0)
+                        return -errno;
+
+                r = posix_spawnattr_setcgroup_np(&attr, cgroup_fd);
+                if (r != 0)
+                        return -r;
+
+                flags |= POSIX_SPAWN_SETCGROUP;
+        }
+#endif
+
+        r = posix_spawnattr_setflags(&attr, flags);
         if (r != 0)
-                goto fail;
+                return -r;
         r = posix_spawnattr_setsigmask(&attr, &mask);
         if (r != 0)
-                goto fail;
+                return -r;
 
-        r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
+#if HAVE_PIDFD_SPAWN
+        _cleanup_close_ int pidfd = -EBADF;
+
+        r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
+        if (r == 0) {
+                r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
+                if (r < 0)
+                        return r;
+
+                if (cgroup_fd == -EBADF)
+                        return 0; /* We managed to use the new API but we are running under cgroupv1 */
+
+                return 1; /* We managed to use the new API and we are already in the new cgroup */
+        }
+        if (!(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r)))
+                return -r;
+
+        /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but need to change the
+         * flags to remove the cgroup one, which is what redirects to clone3() */
+        flags &= ~POSIX_SPAWN_SETCGROUP;
+        r = posix_spawnattr_setflags(&attr, flags);
         if (r != 0)
-                goto fail;
+                return -r;
+#endif
 
-        posix_spawnattr_destroy(&attr);
+        pid_t pid;
+        r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
+        if (r != 0)
+                return -r;
 
-        return pidref_set_pid(ret_pidref, pid);
+        r = pidref_set_pid(ret_pidref, pid);
+        if (r < 0)
+                return r;
 
-fail:
-        assert(r > 0);
-        posix_spawnattr_destroy(&attr);
-        return -r;
+        return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
 }
 
 int proc_dir_open(DIR **ret) {
index bfa43318956f5bec702a2651136198bef708be66..06c7f0a7d862950737e9b4c500ae3390db87c81d 100644 (file)
@@ -238,7 +238,7 @@ int get_process_threads(pid_t pid);
 int is_reaper_process(void);
 int make_reaper_process(bool b);
 
-int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, PidRef *ret_pidref);
+int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, const char *cgroup, PidRef *ret_pidref);
 
 int proc_dir_open(DIR **ret);
 int proc_dir_read(DIR *d, pid_t *ret);
index c9c07714a3b310c951415b35d87b9164d9f6343d..e5062e1b466fc0211012a6d6b49e1b1284de48bb 100644 (file)
@@ -364,6 +364,7 @@ int exec_spawn(Unit *unit,
         _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
         _cleanup_fdset_free_ FDSet *fdset = NULL;
         _cleanup_fclose_ FILE *f = NULL;
+        bool move_child = true;
         int r;
 
         assert(unit);
@@ -404,8 +405,8 @@ int exec_spawn(Unit *unit,
          * child's memory.max, serialize all the state needed to start the unit, and pass it to the
          * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
          * and ensure all memory is shared. The child immediately execs the new binary so the delay should
-         * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the
-         * target cgroup. */
+         * be minimal. If glibc 2.39 is available pidfd_spawn() is used in order to get a race-free pid fd
+         * and to clone directly into the target cgroup (if we booted with cgroupv2). */
 
         r = open_serialization_file("sd-executor-state", &f);
         if (r < 0)
@@ -451,16 +452,19 @@ int exec_spawn(Unit *unit,
                                   "--log-level", log_level,
                                   "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
                         environ,
+                        cg_unified() > 0 ? subcgroup_path : NULL,
                         &pidref);
         if (r < 0)
                 return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
+        if (r > 0)
+                move_child = false; /* Already in the right cgroup thanks to CLONE_INTO_CGROUP */
 
         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pidref.pid);
 
         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
          * process will be killed too). */
-        if (subcgroup_path)
+        if (move_child && subcgroup_path)
                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pidref.pid);
 
         exec_status_start(&command->exec_status, pidref.pid);