]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: Add RootEphemeral= setting
authorDaan De Meyer <daan.j.demeyer@gmail.com>
Tue, 6 Jun 2023 15:44:09 +0000 (17:44 +0200)
committerDaan De Meyer <daan.j.demeyer@gmail.com>
Wed, 21 Jun 2023 10:48:46 +0000 (12:48 +0200)
This setting allows services to run in an ephemeral copy of the root
directory or root image. To make sure the ephemeral copies are always
cleaned up, we add a tmpfiles snippet to unconditionally clean up
/var/lib/systemd/ephemeral-trees. To prevent in-use ephemeral copies from
being cleaned up by tmpfiles, we use the newly added COPY_LOCK_BSD
and BTRFS_SNAPSHOT_LOCK_BSD flags to take a BSD lock on the ephemeral
copies, which instructs tmpfiles to not touch those ephemeral copies as
long as the BSD lock is held.

man/org.freedesktop.systemd1.xml
man/systemd.exec.xml
src/core/dbus-execute.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.in
src/core/service.c
src/core/unit.c
src/shared/bus-unit-util.c
test/fuzz/fuzz-unit-file/directives-all.service
tmpfiles.d/systemd.conf.in

index b50ddb95e96ed7bd99bc93b7b5173b13711bf535..560ae252e35563044dd02d6636f985ca95a0831a 100644 (file)
@@ -2950,6 +2950,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s RootVerity = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly b RootEphemeral = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly as ExtensionDirectories = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly a(sba(ss)) ExtensionImages = [...];
@@ -3547,6 +3549,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--property RootHashSignaturePath is not documented!-->
 
+    <!--property RootEphemeral is not documented!-->
+
     <!--property OOMScoreAdjust is not documented!-->
 
     <!--property CoredumpFilter is not documented!-->
@@ -4189,6 +4193,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -4972,6 +4978,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s RootVerity = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly b RootEphemeral = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly as ExtensionDirectories = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly a(sba(ss)) ExtensionImages = [...];
@@ -5581,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <!--property RootHashSignaturePath is not documented!-->
 
+    <!--property RootEphemeral is not documented!-->
+
     <!--property OOMScoreAdjust is not documented!-->
 
     <!--property CoredumpFilter is not documented!-->
@@ -6203,6 +6213,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -6861,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s RootVerity = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly b RootEphemeral = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly as ExtensionDirectories = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly a(sba(ss)) ExtensionImages = [...];
@@ -7398,6 +7412,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <!--property RootHashSignaturePath is not documented!-->
 
+    <!--property RootEphemeral is not documented!-->
+
     <!--property OOMScoreAdjust is not documented!-->
 
     <!--property CoredumpFilter is not documented!-->
@@ -7938,6 +7954,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -8723,6 +8741,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s RootVerity = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly b RootEphemeral = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly as ExtensionDirectories = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly a(sba(ss)) ExtensionImages = [...];
@@ -9246,6 +9266,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <!--property RootHashSignaturePath is not documented!-->
 
+    <!--property RootEphemeral is not documented!-->
+
     <!--property OOMScoreAdjust is not documented!-->
 
     <!--property CoredumpFilter is not documented!-->
@@ -9772,6 +9794,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
index 938a3c87a9f8f24953ebf8552eda257f8b8c6b4f..84eda5c58471c573810f3aff5dc54bfbf3693195 100644 (file)
         <xi:include href="system-only.xml" xpointer="singular"/></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>RootEphemeral=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If enabled, executed processes will run in an ephemeral
+        copy of the root directory or root image. The ephemeral copy is placed in
+        <filename>/var/lib/systemd/ephemeral-trees/</filename> while the service is active and is cleaned up
+        when the service is stopped or restarted. If <varname>RootDirectory=</varname> is used and the root
+        directory is a subvolume, the ephemeral copy will be created by making a snapshot of the subvolume.
+        </para>
+
+        <para>To make sure ephemeral copies can be made efficiently, the root directory or root image
+        should be located on the same filesystem as <filename>/var/lib/systemd/ephemeral-trees/</filename>.
+        When using <varname>RootEphemeral=</varname> with root directories, btrfs should be used as the
+        filesystem and the root directory should ideally be a subvolume which <command>systemd</command> can
+        snapshot to make the ephemeral copy. For root images, a filesystem with support for reflinks should
+        be used to ensure an efficient ephemeral copy.</para>
+
+        <xi:include href="system-only.xml" xpointer="singular"/></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>RootHash=</varname></term>
 
index 04070a7f1efb7519ca7fe83c7d6ade7ae81ff3e3..80a035ab90f8b9ab1e3667809c391fe3dd14cbfb 100644 (file)
@@ -1231,6 +1231,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1865,6 +1866,9 @@ int bus_exec_context_set_transient_property(
         if (streq(name, "RootDirectory"))
                 return bus_set_transient_path(u, name, &c->root_directory, message, flags, error);
 
+        if (streq(name, "RootEphemeral"))
+                return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error);
+
         if (streq(name, "SyslogIdentifier"))
                 return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error);
 
index 204c5a1f8c7940f109bae9952f7792ee3609e870..b7fe922c7a950c4bb63f56776afe6c9d52c74ded 100644 (file)
@@ -15,6 +15,8 @@
 #include <unistd.h>
 #include <utmpx.h>
 
+#include <linux/fs.h> /* Must be included after <sys/mount.h> */
+
 #if HAVE_PAM
 #include <security/pam_appl.h>
 #endif
 #include "async.h"
 #include "barrier.h"
 #include "bpf-lsm.h"
+#include "btrfs-util.h"
 #include "cap-list.h"
 #include "capability-util.h"
+#include "chattr-util.h"
 #include "cgroup-setup.h"
 #include "chase.h"
 #include "chown-recursive.h"
@@ -66,6 +70,7 @@
 #include "io-util.h"
 #include "ioprio-util.h"
 #include "label-util.h"
+#include "lock-util.h"
 #include "log.h"
 #include "macro.h"
 #include "manager.h"
@@ -2170,6 +2175,10 @@ bool exec_needs_network_namespace(const ExecContext *context) {
         return context->private_network || context->network_namespace_path;
 }
 
+/* Returns true if RootEphemeral= is enabled and there is a root image or root directory configured that
+ * an ephemeral copy could be made of. */
+static bool exec_needs_ephemeral(const ExecContext *context) {
+        assert(context);
+
+        return (context->root_image || context->root_directory) && context->root_ephemeral;
+}
+
 static bool exec_needs_ipc_namespace(const ExecContext *context) {
         assert(context);
 
@@ -3823,6 +3832,63 @@ static bool insist_on_sandboxing(
         return false;
 }
 
+/* Creates the ephemeral copy of the root image or root directory at runtime->ephemeral_copy, unless that
+ * has already been done.
+ *
+ * The file descriptor returned by the copy/snapshot call (requested with COPY_LOCK_BSD respectively
+ * BTRFS_SNAPSHOT_LOCK_BSD) is queued on runtime->ephemeral_storage_socket, so an fd already queued there
+ * doubles as the "already set up" marker. A POSIX lock on the socket is held across the
+ * peek-then-create sequence.
+ *
+ * Returns 0 if there is nothing to do (no runtime, no ephemeral copy requested, or the copy already
+ * exists), 1 if the copy was created by this call, and a negative errno-style value on failure. */
+static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        if (!runtime || !runtime->ephemeral_copy)
+                return 0;
+
+        r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
+
+        CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
+
+        /* Only peek: the fd has to stay queued on the socket so that later invocations see it too. */
+        fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+        if (fd >= 0)
+                /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
+                return 0;
+
+        /* -EAGAIN means nothing is queued yet, i.e. we are the first ones here. Anything else is fatal. */
+        if (fd != -EAGAIN)
+                return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
+
+        log_debug("Making ephemeral snapshot of %s to %s",
+                  context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+        if (context->root_image)
+                fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
+                               COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
+        else
+                fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
+                                              AT_FDCWD, runtime->ephemeral_copy,
+                                              BTRFS_SNAPSHOT_FALLBACK_COPY |
+                                              BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+                                              BTRFS_SNAPSHOT_RECURSIVE |
+                                              BTRFS_SNAPSHOT_LOCK_BSD);
+        if (fd < 0)
+                return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
+                                       context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+        if (context->root_image) {
+                /* A root image might be subject to lots of random writes so let's try to disable COW on it
+                 * which tends to not perform well in combination with lots of random writes.
+                 *
+                 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
+                 * copy, but we at least want to make the intention clear.
+                 */
+                r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
+                if (r < 0)
+                        /* Log the error returned by chattr_fd(), not the (valid) file descriptor. */
+                        log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
+        }
+
+        /* Queue the locked fd on the socket; our local reference is closed automatically on return. */
+        r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
+
+        return 1;
+}
+
 static int verity_settings_prepare(
                 VeritySettings *verity,
                 const char *root_image,
@@ -3884,7 +3950,7 @@ static int apply_mount_namespace(
                 ExecCommandFlags command_flags,
                 const ExecContext *context,
                 const ExecParameters *params,
-                const ExecRuntime *runtime,
+                ExecRuntime *runtime,
                 const char *memory_pressure_path,
                 char **error_path) {
 
@@ -3906,10 +3972,14 @@ static int apply_mount_namespace(
         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
 
         if (params->flags & EXEC_APPLY_CHROOT) {
-                root_image = context->root_image;
+                r = setup_ephemeral(context, runtime);
+                if (r < 0)
+                        return r;
 
-                if (!root_image)
-                        root_dir = context->root_directory;
+                if (context->root_image)
+                        root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
+                else
+                        root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
         }
 
         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
@@ -4090,6 +4160,7 @@ static int apply_mount_namespace(
 static int apply_working_directory(
                 const ExecContext *context,
                 const ExecParameters *params,
+                ExecRuntime *runtime,
                 const char *home,
                 int *exit_status) {
 
@@ -4113,7 +4184,7 @@ static int apply_working_directory(
         if (params->flags & EXEC_APPLY_CHROOT)
                 d = wd;
         else
-                d = prefix_roota(context->root_directory, wd);
+                d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
 
         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
                 *exit_status = EXIT_CHDIR;
@@ -4126,6 +4197,7 @@ static int apply_working_directory(
 static int apply_root_directory(
                 const ExecContext *context,
                 const ExecParameters *params,
+                ExecRuntime *runtime,
                 const bool needs_mount_ns,
                 int *exit_status) {
 
@@ -4134,7 +4206,7 @@ static int apply_root_directory(
 
         if (params->flags & EXEC_APPLY_CHROOT)
                 if (!needs_mount_ns && context->root_directory)
-                        if (chroot(context->root_directory) < 0) {
+                        if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
                                 *exit_status = EXIT_CHROOT;
                                 return -errno;
                         }
@@ -4271,7 +4343,7 @@ static int close_remaining_fds(
                 const int *fds, size_t n_fds) {
 
         size_t n_dont_close = 0;
-        int dont_close[n_fds + 12];
+        int dont_close[n_fds + 14];
 
         assert(params);
 
@@ -4289,6 +4361,9 @@ static int close_remaining_fds(
                 n_dont_close += n_fds;
         }
 
+        if (runtime)
+                append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
+
         if (runtime && runtime->shared) {
                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
@@ -5584,7 +5659,7 @@ static int exec_child(
         }
 
         /* chroot to root directory first, before we lose the ability to chroot */
-        r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
+        r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
         if (r < 0)
                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
 
@@ -5610,7 +5685,7 @@ static int exec_child(
 
         /* Apply working directory here, because the working directory might be on NFS and only the user running
          * this service might have the correct privilege to change to the working directory */
-        r = apply_working_directory(context, params, home, exit_status);
+        r = apply_working_directory(context, params, runtime, home, exit_status);
         if (r < 0)
                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
 
@@ -6422,6 +6497,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                 "%sUMask: %04o\n"
                 "%sWorkingDirectory: %s\n"
                 "%sRootDirectory: %s\n"
+                "%sRootEphemeral: %s\n"
                 "%sNonBlocking: %s\n"
                 "%sPrivateTmp: %s\n"
                 "%sPrivateDevices: %s\n"
@@ -6446,6 +6522,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                 prefix, c->umask,
                 prefix, empty_to_root(c->working_directory),
                 prefix, empty_to_root(c->root_directory),
+                prefix, yes_no(c->root_ephemeral),
                 prefix, yes_no(c->non_blocking),
                 prefix, yes_no(c->private_tmp),
                 prefix, yes_no(c->private_devices),
@@ -7241,13 +7318,30 @@ int exec_command_append(ExecCommand *c, const char *path, ...) {
         return 0;
 }
 
-static void *remove_tmpdir_thread(void *p) {
+static void *rm_rf_thread(void *p) {
         _cleanup_free_ char *path = p;
 
-        (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
+        (void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
         return NULL;
 }
 
+/* Hands *path over to a background thread running rm_rf_thread() for removal. Does nothing if *path is
+ * unset or equals RUN_SYSTEMD_EMPTY. If the job was spawned, *path is reset to NULL because
+ * rm_rf_thread() frees the string it is handed; if spawning failed, *path is left untouched. */
+static void asynchronous_rm_rf(char **path) {
+        int r;
+
+        assert(path);
+
+        if (!*path)
+                return;
+
+        if (streq(*path, RUN_SYSTEMD_EMPTY))
+                return;
+
+        log_debug("Spawning thread to nuke %s", *path);
+
+        r = asynchronous_job(rm_rf_thread, *path);
+        if (r < 0) {
+                /* Spawning failed: the caller keeps ownership of the string. */
+                log_warning_errno(r, "Failed to nuke %s: %m", *path);
+                return;
+        }
+
+        /* Ownership of the string moved to the thread. */
+        *path = NULL;
+}
+
 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
         if (!rt)
                 return NULL;
@@ -7267,8 +7361,6 @@ DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_ru
 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
 
 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
-        int r;
-
         if (!rt)
                 return NULL;
 
@@ -7278,25 +7370,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
         if (rt->n_ref > 0)
                 return NULL;
 
-        if (rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
-                log_debug("Spawning thread to nuke %s", rt->tmp_dir);
-
-                r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
-                if (r < 0)
-                        log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
-                else
-                        rt->tmp_dir = NULL;
-        }
-
-        if (rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
-                log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
-
-                r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
-                if (r < 0)
-                        log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
-                else
-                        rt->var_tmp_dir = NULL;
-        }
+        asynchronous_rm_rf(&rt->tmp_dir);
+        asynchronous_rm_rf(&rt->var_tmp_dir);
 
         return exec_shared_runtime_free(rt);
 }
@@ -7731,16 +7806,39 @@ void exec_shared_runtime_vacuum(Manager *m) {
         }
 }
 
-int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret) {
+int exec_runtime_make(
+                const Unit *unit,
+                const ExecContext *context,
+                ExecSharedRuntime *shared,
+                DynamicCreds *creds,
+                ExecRuntime **ret) {
+        _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
+        _cleanup_free_ char *ephemeral = NULL;
         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
+        int r;
 
+        assert(unit);
+        assert(context);
         assert(ret);
 
-        if (!shared && !creds) {
+        if (!shared && !creds && !exec_needs_ephemeral(context)) {
                 *ret = NULL;
                 return 0;
         }
 
+        if (exec_needs_ephemeral(context)) {
+                r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
+                if (r < 0)
+                        return r;
+
+                r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
+                if (r < 0)
+                        return r;
+
+                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
+                        return -errno;
+        }
+
         rt = new(ExecRuntime, 1);
         if (!rt)
                 return -ENOMEM;
@@ -7748,6 +7846,9 @@ int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntim
         *rt = (ExecRuntime) {
                 .shared = shared,
                 .dynamic_creds = creds,
+                .ephemeral_copy = TAKE_PTR(ephemeral),
+                .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
+                .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
         };
 
         *ret = TAKE_PTR(rt);
@@ -7760,6 +7861,11 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
 
         exec_shared_runtime_unref(rt->shared);
         dynamic_creds_unref(rt->dynamic_creds);
+
+        asynchronous_rm_rf(&rt->ephemeral_copy);
+
+        free(rt->ephemeral_copy);
+        safe_close_pair(rt->ephemeral_storage_socket);
         return mfree(rt);
 }
 
index 953dc9e7f778aef9cac7f5f4faaf047619eb4159..ee73fb6367957f86d04498661d6573084ce828bc 100644 (file)
@@ -129,6 +129,14 @@ struct ExecSharedRuntime {
 struct ExecRuntime {
         ExecSharedRuntime *shared;
         DynamicCreds *dynamic_creds;
+
+        /* The path to the ephemeral snapshot of the root directory or root image if one was requested. */
+        char *ephemeral_copy;
+
+        /* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of
+         * the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot
+         * until we're done using it. */
+        int ephemeral_storage_socket[2];
 };
 
 typedef enum ExecDirectoryType {
@@ -195,6 +203,7 @@ struct ExecContext {
         void *root_hash, *root_hash_sig;
         size_t root_hash_size, root_hash_sig_size;
         LIST_HEAD(MountOptions, root_image_options);
+        bool root_ephemeral;
         bool working_directory_missing_ok:1;
         bool working_directory_home:1;
 
@@ -506,7 +515,7 @@ int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char
 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
 void exec_shared_runtime_vacuum(Manager *m);
 
-int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
+int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
 ExecRuntime* exec_runtime_free(ExecRuntime *rt);
 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
index ae318dae895a3a4c60d32bd9b86ec888cf1a0eb4..382b60ea90a92a25312e578ad0d1ebf0273af2a6 100644 (file)
@@ -10,6 +10,7 @@
 {{type}}.RootHash,                         config_parse_exec_root_hash,                 0,                                  offsetof({{type}}, exec_context)
 {{type}}.RootHashSignature,                config_parse_exec_root_hash_sig,             0,                                  offsetof({{type}}, exec_context)
 {{type}}.RootVerity,                       config_parse_unit_path_printf,               true,                               offsetof({{type}}, exec_context.root_verity)
+{{type}}.RootEphemeral,                    config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.root_ephemeral)
 {{type}}.ExtensionDirectories,             config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.extension_directories)
 {{type}}.ExtensionImages,                  config_parse_extension_images,               0,                                  offsetof({{type}}, exec_context)
 {{type}}.ExtensionImagePolicy,             config_parse_image_policy,                   0,                                  offsetof({{type}}, exec_context.extension_image_policy)
index cecdd3bf50f4ea15b6b7e1b9942acea030fb1923..146b892e460483859793076b57fcac0ea2f60904 100644 (file)
@@ -2029,7 +2029,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
         /* Reset NotifyAccess override */
         s->notify_access_override = _NOTIFY_ACCESS_INVALID;
 
-        /* We want fresh tmpdirs in case service is started again immediately */
+        /* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */
         s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
 
         /* Also, remove the runtime directory */
index 570234c8f41308ffdd7af01d5f318dce3e159493..f51b5687f8d5a7b30179fecf91ee114340ee361c 100644 (file)
@@ -5012,7 +5012,7 @@ int unit_setup_exec_runtime(Unit *u) {
                         return r;
         }
 
-        r = exec_runtime_make(esr, dcreds, rt);
+        r = exec_runtime_make(u, ec, esr, dcreds, rt);
         if (r < 0)
                 return r;
 
index 6e93d0ca434bfb9441bbc9f33273cbb6555b81b3..cc287feb8eb801f4eb165c843df373832c19740e 100644 (file)
@@ -992,7 +992,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
                               "LockPersonality",
                               "ProtectHostname",
                               "MemoryKSM",
-                              "RestrictSUIDSGID"))
+                              "RestrictSUIDSGID",
+                              "RootEphemeral"))
                 return bus_append_parse_boolean(m, field, eq);
 
         if (STR_IN_SET(field, "ReadWriteDirectories",
index 818fb28dbf60c116f3fc99bc9a94baaa73906315..4bdc48a59b823ffbfe6d504d97a40eec5383f968 100644 (file)
@@ -222,6 +222,7 @@ RootImage=
 RootHash=
 RootHashSignature=
 RootVerity=
+RootEphemeral=
 ExtensionDirectories=
 ExtensionImages=
 RuntimeMaxSec=
index fa838d8d0672619f7413d04acf04d4b0eddd2a2a..3781c579e047440165dbb8778898a20bddb626ec 100644 (file)
@@ -60,6 +60,10 @@ a+ /var/log/journal/%m/system.journal - - - - group:wheel:r--
 
 d /var/lib/systemd 0755 root root -
 d /var/lib/systemd/coredump 0755 root root 3d
+# Files and directories in /var/lib/systemd/ephemeral-trees are locked by pid 1 while they are in use to
+# prevent tmpfiles from removing them. tmpfiles is told to unconditionally clean up anything in
+# /var/lib/systemd/ephemeral-trees that isn't locked.
+d /var/lib/systemd/ephemeral-trees 0755 root root 0
 
 d /var/lib/private 0700 root root -
 d /var/log/private 0700 root root -