]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: stage /run/host/os-release with a symlink to avoid possible race condition
authorLuca Boccassi <bluca@debian.org>
Sun, 13 Aug 2023 21:29:25 +0000 (22:29 +0100)
committerLuca Boccassi <luca.boccassi@gmail.com>
Wed, 16 Aug 2023 15:17:41 +0000 (16:17 +0100)
If someone reads /run/host/os-release at the exact same time it is being updated, and it
is large enough, they might read a half-written file. This is very unlikely as
os-release is typically small and very rarely changes, but it is not
impossible.

Bind mount a staging directory instead of the file, and symlink the file
into into, so that we can do atomic file updates and close this gap.
Atomic replacement creates a new inode, so existing bind mounts would
continue to see the old file, and only new services would see the new file.
The indirection via the directory allows to work around this, as the
directory is fixed and never changes so the bind mount is always valid,
and its content is shared with all existing services.

Fixes https://github.com/systemd/systemd/issues/28794

Follow-up for 3f37a82545d461ab

src/core/execute.c
src/core/main.c
src/core/namespace.c
src/core/namespace.h
test/units/testsuite-82.sh

index 854e40ed6df44d341655dcc077fd759d0f946ebc..2356e96628f8d046dc942f362997e7d622f9460c 100644 (file)
@@ -3767,6 +3767,7 @@ static int compile_bind_mounts(
 static int compile_symlinks(
                 const ExecContext *context,
                 const ExecParameters *params,
+                bool setup_os_release_symlink,
                 char ***ret_symlinks) {
 
         _cleanup_strv_free_ char **symlinks = NULL;
@@ -3812,6 +3813,20 @@ static int compile_symlinks(
                 }
         }
 
+        /* We make the host's os-release available via a symlink, so that we can copy it atomically
+         * and readers will never get a half-written version. Note that, while the paths specified here are
+         * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
+         * 'os-release -> .os-release-stage/os-release' is what will be created. */
+        if (setup_os_release_symlink) {
+                r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
+                if (r < 0)
+                        return r;
+
+                r = strv_extend(&symlinks, "/run/host/os-release");
+                if (r < 0)
+                        return r;
+        }
+
         *ret_symlinks = TAKE_PTR(symlinks);
 
         return 0;
@@ -3984,11 +3999,11 @@ static int apply_mount_namespace(
         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
                         **read_write_paths_cleanup = NULL;
         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
-                        *extension_dir = NULL, *host_os_release = NULL;
+                        *extension_dir = NULL, *host_os_release_stage = NULL;
         const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
         char **read_write_paths;
         NamespaceInfo ns_info;
-        bool needs_sandboxing;
+        bool needs_sandboxing, setup_os_release_symlink;
         BindMount *bind_mounts = NULL;
         size_t n_bind_mounts = 0;
         int r;
@@ -4012,11 +4027,6 @@ static int apply_mount_namespace(
         if (r < 0)
                 return r;
 
-        /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
-        r = compile_symlinks(context, params, &symlinks);
-        if (r < 0)
-                return r;
-
         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
          * service will need to write to it in order to start the notifications. */
         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
@@ -4081,6 +4091,12 @@ static int apply_mount_namespace(
         else
                 ns_info = (NamespaceInfo) {};
 
+        /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
+        setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
+        r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
+        if (r < 0)
+                return r;
+
         if (context->mount_propagation_flag == MS_SHARED)
                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
 
@@ -4107,9 +4123,9 @@ static int apply_mount_namespace(
 
                 /* If running under a different root filesystem, propagate the host's os-release. We make a
                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
-                if (root_dir || root_image) {
-                        host_os_release = strdup("/run/systemd/propagate/os-release");
-                        if (!host_os_release)
+                if (setup_os_release_symlink) {
+                        host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
+                        if (!host_os_release_stage)
                                 return -ENOMEM;
                 }
         } else {
@@ -4118,8 +4134,10 @@ static int apply_mount_namespace(
                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
                         return -ENOMEM;
 
-                if (root_dir || root_image) {
-                        if (asprintf(&host_os_release, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
+                if (setup_os_release_symlink) {
+                        if (asprintf(&host_os_release_stage,
+                                     "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
+                                     geteuid()) < 0)
                                 return -ENOMEM;
                 }
         }
@@ -4169,7 +4187,7 @@ static int apply_mount_namespace(
                         incoming_dir,
                         extension_dir,
                         root_dir || root_image ? params->notify_socket : NULL,
-                        host_os_release,
+                        host_os_release_stage,
                         error_path);
 
         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
index e6932784d151e5143383e51acdb5ea77e6d1635f..c09f922700e84ad3d5266c68eeafcc055d1c7c7e 100644 (file)
@@ -1397,11 +1397,11 @@ static int setup_os_release(RuntimeScope scope) {
         }
 
         if (scope == RUNTIME_SCOPE_SYSTEM) {
-                os_release_dst = strdup("/run/systemd/propagate/os-release");
+                os_release_dst = strdup("/run/systemd/propagate/.os-release-stage/os-release");
                 if (!os_release_dst)
                         return log_oom_debug();
         } else {
-                if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
+                if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid()) < 0)
                         return log_oom_debug();
         }
 
@@ -1409,7 +1409,7 @@ static int setup_os_release(RuntimeScope scope) {
         if (r < 0)
                 return log_debug_errno(r, "Failed to create parent directory of %s, ignoring: %m", os_release_dst);
 
-        r = copy_file(os_release_src, os_release_dst, /* open_flags= */ 0, 0644, COPY_MAC_CREATE|COPY_TRUNCATE);
+        r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE);
         if (r < 0)
                 return log_debug_errno(r, "Failed to create %s, ignoring: %m", os_release_dst);
 
index 32f9c45deaa8d66e604f759b0594bc717e3a5d24..51b5aad9c906d592703ea6248ac95a9e177733cb 100644 (file)
@@ -1823,7 +1823,7 @@ static int apply_mounts(
                 const NamespaceInfo *ns_info,
                 MountEntry *mounts,
                 size_t *n_mounts,
-                char **exec_dir_symlinks,
+                char **symlinks,
                 char **error_path) {
 
         _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
@@ -1891,12 +1891,12 @@ static int apply_mounts(
         }
 
         /* Now that all filesystems have been set up, but before the
-         * read-only switches are flipped, create the exec dirs symlinks.
+         * read-only switches are flipped, create the exec dirs and other symlinks.
          * Note that when /var/lib is not empty/tmpfs, these symlinks will already
          * exist, which means this will be a no-op. */
-        r = create_symlinks_from_tuples(root, exec_dir_symlinks);
+        r = create_symlinks_from_tuples(root, symlinks);
         if (r < 0)
-                return log_debug_errno(r, "Failed to set up ExecDirectories symlinks inside mount namespace: %m");
+                return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");
 
         /* Create a deny list we can pass to bind_mount_recursive() */
         deny_list = new(char*, (*n_mounts)+1);
@@ -2006,7 +2006,7 @@ int setup_namespace(
                 char** exec_paths,
                 char** no_exec_paths,
                 char** empty_directories,
-                char** exec_dir_symlinks,
+                char** symlinks,
                 const BindMount *bind_mounts,
                 size_t n_bind_mounts,
                 const TemporaryFileSystem *temporary_filesystems,
@@ -2028,7 +2028,7 @@ int setup_namespace(
                 const char *incoming_dir,
                 const char *extension_dir,
                 const char *notify_socket,
-                const char *host_os_release,
+                const char *host_os_release_stage,
                 char **error_path) {
 
         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
@@ -2156,7 +2156,7 @@ int setup_namespace(
                         log_namespace,
                         setup_propagate,
                         notify_socket,
-                        host_os_release);
+                        host_os_release_stage);
 
         if (n_mounts > 0) {
                 m = mounts = new0(MountEntry, n_mounts);
@@ -2391,10 +2391,10 @@ int setup_namespace(
                                 .read_only = true,
                         };
 
-                if (host_os_release)
+                if (host_os_release_stage)
                         *(m++) = (MountEntry) {
-                                .path_const = "/run/host/os-release",
-                                .source_const = host_os_release,
+                                .path_const = "/run/host/.os-release-stage/",
+                                .source_const = host_os_release_stage,
                                 .mode = BIND_MOUNT,
                                 .read_only = true,
                                 .ignore = true, /* Live copy, don't hard-fail if it goes missing */
@@ -2489,7 +2489,7 @@ int setup_namespace(
                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
 
         /* Now make the magic happen */
-        r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, exec_dir_symlinks, error_path);
+        r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, symlinks, error_path);
         if (r < 0)
                 goto finish;
 
index 44e8f097dac11fe47bba06c404b4744651b10b8e..b6132154c5132ec0c98d83b7b3c3e82e4b1f1ba8 100644 (file)
@@ -111,7 +111,7 @@ int setup_namespace(
                 char **exec_paths,
                 char **no_exec_paths,
                 char **empty_directories,
-                char **exec_dir_symlinks,
+                char **symlinks,
                 const BindMount *bind_mounts,
                 size_t n_bind_mounts,
                 const TemporaryFileSystem *temporary_filesystems,
@@ -133,7 +133,7 @@ int setup_namespace(
                 const char *incoming_dir,
                 const char *extension_dir,
                 const char *notify_socket,
-                const char *host_os_release,
+                const char *host_os_release_stage,
                 char **error_path);
 
 #define RUN_SYSTEMD_EMPTY "/run/systemd/empty"
index 7adee341c1e001e9ea65baa3ca96fec8e3448657..0bbab330f4e3d203e7df6c6c414a3f5fbfcad51d 100755 (executable)
@@ -50,7 +50,7 @@ elif [ -f /run/testsuite82.touch2 ]; then
     # Test that we really are in the new overlayfs root fs
     read -r x </lower
     test "$x" = "miep"
-    cmp /etc/os-release /run/systemd/propagate/os-release
+    cmp /etc/os-release /run/systemd/propagate/.os-release-stage/os-release
     grep -q MARKER=1 /etc/os-release
 
     # Switch back to the original root, away from the overlayfs
@@ -96,7 +96,7 @@ elif [ -f /run/testsuite82.touch ]; then
     mkdir -p /tmp/nextroot-lower/usr/lib
     grep ID /etc/os-release >/tmp/nextroot-lower/usr/lib/os-release
     echo MARKER=1 >>/tmp/nextroot-lower/usr/lib/os-release
-    cmp /etc/os-release /run/systemd/propagate/os-release
+    cmp /etc/os-release /run/systemd/propagate/.os-release-stage/os-release
     (! grep -q MARKER=1 /etc/os-release)
 
     mount -t overlay nextroot /run/nextroot -o lowerdir=/tmp/nextroot-lower:/,ro