git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: add systemd-executor binary
author    Luca Boccassi <bluca@debian.org>
          Thu, 1 Jun 2023 18:51:42 +0000 (19:51 +0100)
committer Luca Boccassi <bluca@debian.org>
          Thu, 12 Oct 2023 14:01:51 +0000 (15:01 +0100)
Currently we spawn services by forking a child process, doing a bunch
of work, and then exec'ing the service executable.

There are some advantages to this approach:

- quick: we immediately have access to all the enormous amount of
  state simply by virtue of sharing the memory with the parent
- easy to refactor and add features
- part of the same binary, will never be out of sync

There are however significant drawbacks:

- doing work after fork and before exec is against glibc's supported
  case for several APIs we call
- copy-on-write trap: anytime any memory is touched in either parent
  or child, a copy of that page will be triggered
- the memory footprint of the child process will be that of PID1, but
  subject to the cgroup memory limits of the unit

The last issue is especially problematic on resource constrained
systems where hard memory caps are enforced and swap is not allowed.
As soon as PID1 is under load, with no page out due to no swap, and a
service with a low MemoryMax= tries to start, hilarity ensues (the
freshly forked child risks being OOM-killed).

Add a new systemd-executor binary that is able to receive all the
required state via memfd, deserialize it, prepare the appropriate
data structures and call exec_invoke (the former exec_child, renamed
and exported by this commit).

Use posix_spawn, which uses CLONE_VM + CLONE_VFORK, to ensure there is
no copy-on-write (the same address space is used, and the parent
process is frozen until exec).
The sd-executor binary is pinned by FD on startup, so that we can
guarantee there will be no incompatibilities during upgrades.

15 files changed:
docs/ARCHITECTURE.md
meson.build
src/basic/cgroup-util.h
src/core/dynamic-user.c
src/core/dynamic-user.h
src/core/execute.c
src/core/execute.h
src/core/executor.c [new file with mode: 0644]
src/core/fuzz-manager-serialize.c
src/core/fuzz-unit-file.c
src/core/manager.c
src/core/manager.h
src/core/meson.build
src/core/unit.c
test/units/testsuite-55.sh

diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index c777fa75de5cf12cfdb1766b60e35b2ffeba0f58..33486203a6553b130a1f2513c1389be4d669f65b 100644 (file)
@@ -201,3 +201,25 @@ can be found under various directories such as `factory/`, `modprobe.d/`, `netwo
 `tools/`, `coccinelle/`, `.github/`, `.semaphore/`, `.mkosi/` host various
 utilities and scripts that are used by maintainers and developers. They are not
 shipped or installed.
+
+# Service Manager Overview
+
+The Service Manager takes configuration in the form of unit files, credentials,
+kernel command line options and D-Bus commands, and based on those manages the
+system and spawns other processes. It runs in system mode as PID1, and in user
+mode with one instance per user session.
+
+When starting a unit requires forking a new process, configuration for the new
+process will be serialized and passed over to the new process, created via a
+posix_spawn() call. This is done in order to avoid excessive processing after
+a fork() but before an exec(), which is against glibc's best practices and can
+also result in a copy-on-write trap. The new process will start as the
+`systemd-executor` binary, which will deserialize the configuration and apply
+all the options (sandboxing, namespacing, cgroup, etc.) before exec'ing the
+configured executable.
+
+```
+ ┌──────┐posix_spawn() ┌───────────┐execve() ┌────────┐
+ │ PID1 ├─────────────►│sd-executor├────────►│program │
+ └──────┘  (memfd)     └───────────┘         └────────┘
+```
diff --git a/meson.build b/meson.build
index 1517065db66e2910c9ea0acf2288a4fbb09f3d61..5b6b928276261379d918c38bfef21bfd6c45ee90 100644 (file)
@@ -225,6 +225,7 @@ conf.set_quoted('SYSCONF_DIR',                                sysconfdir)
 conf.set_quoted('SYSCTL_DIR',                                 sysctldir)
 conf.set_quoted('SYSTEMCTL_BINARY_PATH',                      bindir / 'systemctl')
 conf.set_quoted('SYSTEMD_BINARY_PATH',                        libexecdir / 'systemd')
+conf.set_quoted('SYSTEMD_EXECUTOR_BINARY_PATH',               libexecdir / 'systemd-executor')
 conf.set_quoted('SYSTEMD_CATALOG_DIR',                        catalogdir)
 conf.set_quoted('SYSTEMD_CGROUPS_AGENT_PATH',                 libexecdir / 'systemd-cgroups-agent')
 conf.set_quoted('SYSTEMD_CRYPTSETUP_PATH',                    bindir / 'systemd-cryptsetup')
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index 625816d9cf1de2960f68918681d7115367323d35..80ea7e7ffa80bfdf6379af65e33e714feef1caad 100644 (file)
@@ -36,7 +36,7 @@ typedef enum CGroupController {
         CGROUP_CONTROLLER_BPF_SOCKET_BIND,
         CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES,
         /* The BPF hook implementing RestrictFileSystems= is not defined here.
-         * It's applied as late as possible in exec_child() so we don't block
+         * It's applied as late as possible in exec_invoke() so we don't block
          * our own unit setup code. */
 
         _CGROUP_CONTROLLER_MAX,
diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c
index 2cab0d44acb713b87b89160380904591967f9b54..dc349afecfac0d429d9fd25a870c27d208213bfa 100644 (file)
@@ -28,7 +28,7 @@
 
 DEFINE_TRIVIAL_REF_FUNC(DynamicUser, dynamic_user);
 
-static DynamicUser* dynamic_user_free(DynamicUser *d) {
+DynamicUser* dynamic_user_free(DynamicUser *d) {
         if (!d)
                 return NULL;
 
@@ -850,3 +850,12 @@ DynamicCreds* dynamic_creds_destroy(DynamicCreds *creds) {
 
         return mfree(creds);
 }
+
+void dynamic_creds_done(DynamicCreds *creds) {
+        if (!creds)
+                return;
+
+        if (creds->group != creds->user)
+                dynamic_user_free(creds->group);
+        creds->group = creds->user = dynamic_user_free(creds->user);
+}
diff --git a/src/core/dynamic-user.h b/src/core/dynamic-user.h
index 679c588a76dffae52cb790a3aeb00f86ba0ae9c3..e86ee02796b1282f10f247db08ffa1952cf4ace6 100644 (file)
@@ -28,6 +28,7 @@ struct DynamicUser {
 int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds);
 int dynamic_user_serialize_one(DynamicUser *d, const char *key, FILE *f, FDSet *fds);
 void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, DynamicUser **ret);
+DynamicUser* dynamic_user_free(DynamicUser *d);
 void dynamic_user_vacuum(Manager *m, bool close_user);
 
 int dynamic_user_current(DynamicUser *d, uid_t *ret);
@@ -39,6 +40,7 @@ int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *ui
 
 DynamicCreds *dynamic_creds_unref(DynamicCreds *creds);
 DynamicCreds *dynamic_creds_destroy(DynamicCreds *creds);
+void dynamic_creds_done(DynamicCreds *creds);
 
 DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_unref);
 DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_destroy);
diff --git a/src/core/execute.c b/src/core/execute.c
index c13024b1f8d3d358b8fb05c51d729cf8094144fd..ff9560665cbbff9f158e93ee92c4ba75c58ca13e 100644 (file)
@@ -39,6 +39,7 @@
 #include "argv-util.h"
 #include "async.h"
 #include "barrier.h"
+#include "bpf-dlopen.h"
 #include "bpf-lsm.h"
 #include "btrfs-util.h"
 #include "cap-list.h"
@@ -56,6 +57,7 @@
 #include "escape.h"
 #include "exec-credential.h"
 #include "execute.h"
+#include "execute-serialize.h"
 #include "exit-status.h"
 #include "fd-util.h"
 #include "fileio.h"
@@ -85,6 +87,7 @@
 #include "seccomp-util.h"
 #include "securebits-util.h"
 #include "selinux-util.h"
+#include "serialize.h"
 #include "signal-util.h"
 #include "smack-util.h"
 #include "socket-util.h"
@@ -1789,6 +1792,8 @@ static int apply_lock_personality(const ExecContext *c, const ExecParameters *p)
 
 #if HAVE_LIBBPF
 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
+        int r;
+
         assert(c);
         assert(p);
 
@@ -1801,6 +1806,11 @@ static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters
                 return 0;
         }
 
+        /* We are in a new binary, so dl-open again */
+        r = dlopen_bpf();
+        if (r < 0)
+                return r;
+
         return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list);
 }
 #endif
@@ -4062,7 +4072,7 @@ static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
 
-static int exec_child(
+int exec_invoke(
                 const ExecCommand *command,
                 const ExecContext *context,
                 ExecParameters *params,
@@ -4117,6 +4127,8 @@ static int exec_child(
         assert(command->path);
         assert(!strv_isempty(command->argv));
 
+        LOG_CONTEXT_PUSH_EXEC(context, params);
+
         if (context->std_input == EXEC_INPUT_SOCKET ||
             context->std_output == EXEC_OUTPUT_SOCKET ||
             context->std_error == EXEC_OUTPUT_SOCKET) {
@@ -5283,7 +5295,6 @@ static int exec_child(
         return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
 }
 
-
 int exec_spawn(Unit *unit,
                ExecCommand *command,
                const ExecContext *context,
@@ -5292,12 +5303,16 @@ int exec_spawn(Unit *unit,
                const CGroupContext *cgroup_context,
                pid_t *ret) {
 
-        _cleanup_free_ char *subcgroup_path = NULL;
+        char serialization_fd_number[DECIMAL_STR_MAX(int) + 1];
+        _cleanup_free_ char *subcgroup_path = NULL, *log_level = NULL, *executor_path = NULL;
+        _cleanup_fdset_free_ FDSet *fdset = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
         pid_t pid;
         int r;
 
         assert(unit);
         assert(unit->manager);
+        assert(unit->manager->executor_fd >= 0);
         assert(command);
         assert(context);
         assert(ret);
@@ -5333,35 +5348,56 @@ int exec_spawn(Unit *unit,
                 }
         }
 
-        pid = fork();
-        if (pid < 0)
-                return log_unit_error_errno(unit, errno, "Failed to fork: %m");
+        /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
+         * child's memory.max, serialize all the state needed to start the unit, and pass it to the
+         * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
+         * and ensure all memory is shared. The child immediately execs the new binary so the delay should
+         * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the
+         * target cgroup. */
 
-        if (pid == 0) {
-                int exit_status;
+        r = open_serialization_file("sd-executor-state", &f);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to open serialization stream: %m");
 
-                r = exec_child(command,
-                               context,
-                               params,
-                               runtime,
-                               cgroup_context,
-                               &exit_status);
+        fdset = fdset_new();
+        if (!fdset)
+                return log_oom();
 
-                if (r < 0) {
-                        const char *status = ASSERT_PTR(
-                                        exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
+        r = exec_serialize_invocation(f, fdset, context, command, params, runtime, cgroup_context);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to serialize parameters: %m");
 
-                        log_unit_struct_errno(unit, LOG_ERR, r,
-                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
-                                              LOG_UNIT_INVOCATION_ID(unit),
-                                              LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
-                                                               status, command->path),
-                                              "EXECUTABLE=%s", command->path);
-                } else
-                        assert(exit_status == EXIT_SUCCESS);
+        if (fseeko(f, 0, SEEK_SET) == (off_t) -1)
+                return log_unit_error_errno(unit, errno, "Failed to reseek on serialization stream: %m");
 
-                _exit(exit_status);
-        }
+        r = fd_cloexec(fileno(f), false);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialization fd: %m");
+
+        r = fdset_cloexec(fdset, false);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialized fds: %m");
+
+        r = log_level_to_string_alloc(log_get_max_level(), &log_level);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to convert log level to string: %m");
+
+        r = fd_get_path(unit->manager->executor_fd, &executor_path);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to get executor path from fd: %m");
+
+        xsprintf(serialization_fd_number, "%i", fileno(f));
+
+        /* The executor binary is pinned, to avoid compatibility problems during upgrades. */
+        r = posix_spawn_wrapper(FORMAT_PROC_FD_PATH(unit->manager->executor_fd),
+                        STRV_MAKE(executor_path,
+                                  "--deserialize", serialization_fd_number,
+                                  "--log-level", log_level,
+                                  "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
+                        environ,
+                        &pid);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
 
         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
 
@@ -5563,7 +5599,7 @@ int exec_context_destroy_mount_ns_dir(Unit *u) {
         return 0;
 }
 
-static void exec_command_done(ExecCommand *c) {
+void exec_command_done(ExecCommand *c) {
         assert(c);
 
         c->path = mfree(c->path);
@@ -6679,9 +6715,9 @@ static char *destroy_tree(char *path) {
         return mfree(path);
 }
 
-static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
+void exec_shared_runtime_done(ExecSharedRuntime *rt) {
         if (!rt)
-                return NULL;
+                return;
 
         if (rt->manager)
                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
@@ -6691,6 +6727,11 @@ static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
         safe_close_pair(rt->netns_storage_socket);
         safe_close_pair(rt->ipcns_storage_socket);
+}
+
+static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
+        exec_shared_runtime_done(rt);
+
         return mfree(rt);
 }
 
@@ -7216,6 +7257,14 @@ ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
         return exec_runtime_free(rt);
 }
 
+void exec_runtime_clear(ExecRuntime *rt) {
+        if (!rt)
+                return;
+
+        safe_close_pair(rt->ephemeral_storage_socket);
+        rt->ephemeral_copy = mfree(rt->ephemeral_copy);
+}
+
 void exec_params_clear(ExecParameters *p) {
         if (!p)
                 return;
@@ -7230,6 +7279,37 @@ void exec_params_clear(ExecParameters *p) {
         p->unit_id = mfree(p->unit_id);
         p->invocation_id = SD_ID128_NULL;
         p->invocation_id_string[0] = '\0';
+        p->confirm_spawn = mfree(p->confirm_spawn);
+}
+
+void exec_params_serialized_done(ExecParameters *p) {
+        if (!p)
+                return;
+
+        for (size_t i = 0; p->fds && i < p->n_socket_fds + p->n_storage_fds; i++)
+                p->fds[i] = safe_close(p->fds[i]);
+
+        p->cgroup_path = mfree(p->cgroup_path);
+
+        p->prefix = strv_free(p->prefix);
+        p->received_credentials_directory = mfree(p->received_credentials_directory);
+        p->received_encrypted_credentials_directory = mfree(p->received_encrypted_credentials_directory);
+
+        for (size_t i = 0; p->idle_pipe && i < 4; i++)
+                p->idle_pipe[i] = safe_close(p->idle_pipe[i]);
+        p->idle_pipe = mfree(p->idle_pipe);
+
+        p->stdin_fd = safe_close(p->stdin_fd);
+        p->stdout_fd = safe_close(p->stdout_fd);
+        p->stderr_fd = safe_close(p->stderr_fd);
+
+        p->notify_socket = mfree(p->notify_socket);
+
+        open_file_free_many(&p->open_files);
+
+        p->fallback_smack_process_label = mfree(p->fallback_smack_process_label);
+
+        exec_params_clear(p);
 }
 
 void exec_directory_done(ExecDirectory *d) {
diff --git a/src/core/execute.h b/src/core/execute.h
index 25f8531d44a67f151426e277a35ade74032958c1..f3150445823bd4a82cef91db2b494dce5c4d5ac2 100644 (file)
@@ -471,6 +471,13 @@ struct ExecParameters {
 #include "unit.h"
 #include "dynamic-user.h"
 
+int exec_invoke(const ExecCommand *command,
+                const ExecContext *context,
+                ExecParameters *params,
+                ExecRuntime *runtime,
+                const CGroupContext *cgroup_context,
+                int *exit_status);
+
 int exec_spawn(Unit *unit,
                ExecCommand *command,
                const ExecContext *context,
@@ -479,6 +486,7 @@ int exec_spawn(Unit *unit,
                const CGroupContext *cgroup_context,
                pid_t *ret);
 
+void exec_command_done(ExecCommand *c);
 void exec_command_done_array(ExecCommand *c, size_t n);
 ExecCommand* exec_command_free_list(ExecCommand *c);
 void exec_command_free_array(ExecCommand **c, size_t n);
@@ -524,15 +532,18 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_unref);
 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds);
 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds);
 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
+void exec_shared_runtime_done(ExecSharedRuntime *rt);
 void exec_shared_runtime_vacuum(Manager *m);
 
 int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
 ExecRuntime* exec_runtime_free(ExecRuntime *rt);
 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
+void exec_runtime_clear(ExecRuntime *rt);
 
 void exec_params_clear(ExecParameters *p);
 void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
+void exec_params_serialized_done(ExecParameters *p);
 
 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c);
 
diff --git a/src/core/executor.c b/src/core/executor.c
new file mode 100644 (file)
index 0000000..0f154ea
--- /dev/null
+++ b/src/core/executor.c
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <unistd.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "execute-serialize.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fdset.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "getopt-defs.h"
+#include "parse-util.h"
+#include "pretty-print.h"
+#include "static-destruct.h"
+
+static FILE* arg_serialization = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_serialization, fclosep);
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+
+        r = terminal_urlify_man("systemd", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...]\n\n"
+               "%sSandbox and execute processes.%s\n\n"
+               "  -h --help                Show this help and exit\n"
+               "     --version             Print version string and exit\n"
+               "     --log-target=TARGET   Set log target (console, journal,\n"
+               "                                           journal-or-kmsg,\n"
+               "                                           kmsg, null)\n"
+               "     --log-level=LEVEL     Set log level (debug, info, notice,\n"
+               "                                          warning, err, crit,\n"
+               "                                          alert, emerg)\n"
+               "     --log-color=BOOL      Highlight important messages\n"
+               "     --log-location=BOOL   Include code location in messages\n"
+               "     --log-time=BOOL       Prefix messages with current time\n"
+               "     --deserialize=FD      Deserialize process config from FD\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                COMMON_GETOPT_ARGS,
+                ARG_VERSION,
+                ARG_DESERIALIZE,
+        };
+
+        static const struct option options[] = {
+                { "log-level",      required_argument, NULL, ARG_LOG_LEVEL      },
+                { "log-target",     required_argument, NULL, ARG_LOG_TARGET     },
+                { "log-color",      required_argument, NULL, ARG_LOG_COLOR      },
+                { "log-location",   required_argument, NULL, ARG_LOG_LOCATION   },
+                { "log-time",       required_argument, NULL, ARG_LOG_TIME       },
+                { "help",           no_argument,       NULL, 'h'                },
+                { "version",        no_argument,       NULL, ARG_VERSION        },
+                { "deserialize",    required_argument, NULL, ARG_DESERIALIZE    },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+                switch (c) {
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_LOG_LEVEL:
+                        r = log_set_max_level_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
+
+                        break;
+
+                case ARG_LOG_TARGET:
+                        r = log_set_target_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
+
+                        break;
+
+                case ARG_LOG_COLOR:
+                        r = log_show_color_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(
+                                                r,
+                                                "Failed to parse log color setting \"%s\": %m",
+                                                optarg);
+
+                        break;
+
+                case ARG_LOG_LOCATION:
+                        r = log_show_location_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(
+                                                r,
+                                                "Failed to parse log location setting \"%s\": %m",
+                                                optarg);
+
+                        break;
+
+                case ARG_LOG_TIME:
+                        r = log_show_time_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(
+                                                r,
+                                                "Failed to parse log time setting \"%s\": %m",
+                                                optarg);
+
+                        break;
+
+                case ARG_DESERIALIZE: {
+                        FILE *f;
+                        int fd;
+
+                        fd = parse_fd(optarg);
+                        if (fd < 0)
+                                return log_error_errno(
+                                                fd,
+                                                "Failed to parse serialization fd \"%s\": %m",
+                                                optarg);
+
+                        r = fd_cloexec(fd, /* cloexec= */ true);
+                        if (r < 0)
+                                return log_error_errno(
+                                                r,
+                                                "Failed to set serialization fd \"%s\" to close-on-exec: %m",
+                                                optarg);
+
+                        f = fdopen(fd, "r");
+                        if (!f)
+                                return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
+
+                        safe_fclose(arg_serialization);
+                        arg_serialization = f;
+
+                        break;
+                }
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if (!arg_serialization)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "No serialization fd specified.");
+
+        return 1 /* work to do */;
+}
+
+int main(int argc, char *argv[]) {
+        _cleanup_fdset_free_ FDSet *fdset = NULL;
+        int exit_status = EXIT_SUCCESS, r;
+        _cleanup_(cgroup_context_done) CGroupContext cgroup_context = {};
+        _cleanup_(exec_context_done) ExecContext context = {};
+        _cleanup_(exec_command_done) ExecCommand command = {};
+        _cleanup_(exec_params_serialized_done) ExecParameters params = EXEC_PARAMETERS_INIT(/* flags= */ 0);
+        _cleanup_(exec_shared_runtime_done) ExecSharedRuntime shared = {
+                .netns_storage_socket = PIPE_EBADF,
+                .ipcns_storage_socket = PIPE_EBADF,
+        };
+        _cleanup_(dynamic_creds_done) DynamicCreds dynamic_creds = {};
+        _cleanup_(exec_runtime_clear) ExecRuntime runtime = {
+                .ephemeral_storage_socket = PIPE_EBADF,
+                .shared = &shared,
+                .dynamic_creds = &dynamic_creds,
+        };
+
+        exec_context_init(&context);
+        cgroup_context_init(&cgroup_context);
+
+        /* We might be starting the journal itself, we'll be told by the caller what to do */
+        log_set_always_reopen_console(true);
+        log_set_prohibit_ipc(true);
+        log_setup();
+
+        r = fdset_new_fill(/* filter_cloexec= */ 0, &fdset);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create fd set: %m");
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        /* Now try again if we were told it's fine to use a different target */
+        if (log_get_target() != LOG_TARGET_KMSG) {
+                log_set_prohibit_ipc(false);
+                log_open();
+        }
+
+        r = fdset_remove(fdset, fileno(arg_serialization));
+        if (r < 0)
+                return log_error_errno(r, "Failed to remove serialization fd from fd set: %m");
+
+        r = exec_deserialize_invocation(arg_serialization,
+                                        fdset,
+                                        &context,
+                                        &command,
+                                        &params,
+                                        &runtime,
+                                        &cgroup_context);
+        if (r < 0)
+                return log_error_errno(r, "Failed to deserialize: %m");
+
+        arg_serialization = safe_fclose(arg_serialization);
+        fdset = fdset_free(fdset);
+
+        r = exec_invoke(&command,
+                        &context,
+                        &params,
+                        &runtime,
+                        &cgroup_context,
+                        &exit_status);
+        if (r < 0) {
+                const char *status = ASSERT_PTR(
+                                exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
+
+                log_exec_struct_errno(&context, &params, LOG_ERR, r,
+                                      "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+                                      LOG_EXEC_INVOCATION_ID(&params),
+                                      LOG_EXEC_MESSAGE(&params, "Failed at step %s spawning %s: %m",
+                                                       status, command.path),
+                                      "EXECUTABLE=%s", command.path);
+        } else
+                assert(exit_status == EXIT_SUCCESS); /* When 'skip' is chosen in the confirm spawn prompt */
+
+        return exit_status;
+}
diff --git a/src/core/fuzz-manager-serialize.c b/src/core/fuzz-manager-serialize.c
index cbc89f5737344899371f0f57164bd6547f2747d9..0e4bfa4484918b918e2c417a4f50256071087d66 100644 (file)
@@ -24,7 +24,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
                 log_set_target(LOG_TARGET_NULL);
         }
 
-        assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL, &m) >= 0);
+        assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m) >= 0);
         /* Set log overrides as well to make it harder for a serialization file
          * to switch log levels/targets during fuzzing */
         manager_override_log_level(m, log_get_max_level());
diff --git a/src/core/fuzz-unit-file.c b/src/core/fuzz-unit-file.c
index a11d6b53b5e4181e37f5d75792e9243d713b28da..7b738062b509d83edbdf7e193fd515d77308d176 100644 (file)
@@ -65,7 +65,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
         if (!getenv("SYSTEMD_LOG_LEVEL"))
                 log_set_max_level(LOG_CRIT);
 
-        assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL, &m) >= 0);
+        assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m) >= 0);
 
         name = strjoina("a.", unit_type_to_string(t));
         assert_se(unit_new_for_name(m, unit_vtable[t]->object_size, name, &u) >= 0);
diff --git a/src/core/manager.c b/src/core/manager.c
index 6042bda2394254feea9e127398d2e7a55efb5cc2..9307a13a79ffd71785ea3e22b7c77be86f14833a 100644 (file)
@@ -921,6 +921,8 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
                         .interval = 10 * USEC_PER_MINUTE,
                         .burst = 10,
                 },
+
+                .executor_fd = -EBADF,
         };
 
         unit_defaults_init(&m->defaults, runtime_scope);
@@ -1039,6 +1041,42 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
 
                 if (r < 0 && r != -EEXIST)
                         return r;
+
+                m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH);
+                if (m->executor_fd < 0)
+                        return log_warning_errno(errno,
+                                                 "Failed to open executor binary '%s': %m",
+                                                 SYSTEMD_EXECUTOR_BINARY_PATH);
+        } else if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) {
+                _cleanup_free_ char *self_exe = NULL, *executor_path = NULL;
+                _cleanup_close_ int self_dir_fd = -EBADF;
+                int level = LOG_DEBUG;
+
+                /* Prefer sd-executor from the same directory as the test, e.g.: when running unit tests from the
+                * build directory. Fallback to working directory and then the installation path. */
+                r = readlink_and_make_absolute("/proc/self/exe", &self_exe);
+                if (r < 0)
+                        return r;
+
+                self_dir_fd = open_parent(self_exe, O_CLOEXEC|O_DIRECTORY, 0);
+                if (self_dir_fd < 0)
+                        return -errno;
+
+                m->executor_fd = openat(self_dir_fd, "systemd-executor", O_CLOEXEC|O_PATH);
+                if (m->executor_fd < 0 && errno == ENOENT)
+                        m->executor_fd = openat(AT_FDCWD, "systemd-executor", O_CLOEXEC|O_PATH);
+                if (m->executor_fd < 0 && errno == ENOENT) {
+                        m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH);
+                        level = LOG_WARNING; /* Tests should normally use local builds */
+                }
+                if (m->executor_fd < 0)
+                        return -errno;
+
+                r = fd_get_path(m->executor_fd, &executor_path);
+                if (r < 0)
+                        return r;
+
+                log_full(level, "Using systemd-executor binary from '%s'", executor_path);
         }
 
         /* Note that we do not set up the notify fd here. We do that after deserialization,
@@ -1701,6 +1739,8 @@ Manager* manager_free(Manager *m) {
         lsm_bpf_destroy(m->restrict_fs);
 #endif
 
+        safe_close(m->executor_fd);
+
         return mfree(m);
 }
 
@@ -4956,6 +4996,17 @@ void unit_defaults_done(UnitDefaults *defaults) {
         rlimit_free_all(defaults->rlimit);
 }
 
+LogTarget manager_get_executor_log_target(Manager *m) {
+        assert(m);
+
+        /* sd-executor might be the very process that starts journald, so only hand it our current log
+         * target once the journal is running; until then tell it to log to kmsg instead. */
+        if (!manager_journal_is_running(m))
+                return LOG_TARGET_KMSG;
+
+        return log_get_target();
+}
+
 static const char *const manager_state_table[_MANAGER_STATE_MAX] = {
         [MANAGER_INITIALIZING] = "initializing",
         [MANAGER_STARTING]     = "starting",
index 4595b1b6863bc94c5eb1038b89df15ec5487ca58..6321a353a5a1525ac82d2f7ce724abdb4e98cb67 100644 (file)
@@ -145,6 +145,7 @@ typedef enum ManagerTestRunFlags {
         MANAGER_TEST_RUN_ENV_GENERATORS      = 1 << 2,  /* also run env generators  */
         MANAGER_TEST_RUN_GENERATORS          = 1 << 3,  /* also run unit generators */
         MANAGER_TEST_RUN_IGNORE_DEPENDENCIES = 1 << 4,  /* run while ignoring dependencies */
+        MANAGER_TEST_DONT_OPEN_EXECUTOR      = 1 << 5,  /* avoid trying to load sd-executor */
         MANAGER_TEST_FULL = MANAGER_TEST_RUN_BASIC | MANAGER_TEST_RUN_ENV_GENERATORS | MANAGER_TEST_RUN_GENERATORS,
 } ManagerTestRunFlags;
 
@@ -496,6 +497,10 @@ struct Manager {
 
         /* For NFTSet= */
         FirewallContext *fw_ctx;
+
+        /* Pin the systemd-executor binary, so that it never changes until re-exec, ensuring we don't have
+         * serialization/deserialization compatibility issues during upgrades. */
+        int executor_fd;
 };
 
 static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
@@ -628,6 +633,8 @@ void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout);
 int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor);
 int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor);
 
+LogTarget manager_get_executor_log_target(Manager *m);
+
 const char* oom_policy_to_string(OOMPolicy i) _const_;
 OOMPolicy oom_policy_from_string(const char *s) _pure_;
 
index 8c3fd76922bf049b293e3be719f6125b6804577e..0508254d9ad078ea21186ac3bd2331123e296cf3 100644 (file)
@@ -144,6 +144,10 @@ systemd_sources = files(
         'crash-handler.c',
 )
 
+systemd_executor_sources = files(
+        'executor.c',
+)
+
 executables += [
         libexec_template + {
                 'name' : 'systemd',
@@ -156,6 +160,17 @@ executables += [
                 ],
                 'dependencies' : libseccomp,
         },
+        libexec_template + {
+                'name' : 'systemd-executor',
+                'public' : true,
+                'sources' : systemd_executor_sources,
+                'include_directories' : core_includes,
+                'link_with' : [
+                        libcore,
+                        libshared,
+                ],
+                'dependencies' : libseccomp,
+        },
         fuzz_template + {
                 'sources' : files('fuzz-unit-file.c'),
                 'link_with' : [
index b71c21580ea27d304f8b932cfea6b7ce4a18f72e..aa809843f731d4257f34fedac37ff499675dadee 100644 (file)
@@ -5357,6 +5357,7 @@ int unit_acquire_invocation_id(Unit *u) {
 }
 
 int unit_set_exec_params(Unit *u, ExecParameters *p) {
+        const char *confirm_spawn;
         int r;
 
         assert(u);
@@ -5369,7 +5370,13 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
 
         p->runtime_scope = u->manager->runtime_scope;
 
-        p->confirm_spawn = (char *)manager_get_confirm_spawn(u->manager);
+        confirm_spawn = manager_get_confirm_spawn(u->manager);
+        if (confirm_spawn) {
+                p->confirm_spawn = strdup(confirm_spawn);
+                if (!p->confirm_spawn)
+                        return -ENOMEM;
+        }
+
         p->cgroup_supported = u->manager->cgroup_supported;
         p->prefix = u->manager->prefix;
         SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(u->manager));
index c6258dead63a16c0c84fafa6355f84c1074302cc..ffa9af2f6ac5fa880a0064833111d2842ac7e33e 100755 (executable)
@@ -68,6 +68,12 @@ if systemctl is-active systemd-oomd.service; then
     systemctl restart systemd-oomd.service
 fi
 
+# Ensure that we can start services even with a very low hard memory cap without oom-kills, but skip under
+# sanitizers as they balloon memory usage.
+if ! [[ -v ASAN_OPTIONS || -v UBSAN_OPTIONS ]]; then
+    systemd-run -t -p MemoryMax=10M -p MemorySwapMax=0 -p MemoryZSwapMax=0 /bin/true
+fi
+
 systemctl start testsuite-55-testchill.service
 systemctl start testsuite-55-testbloat.service