If someone reads /run/host/os-release at the exact same time it is being updated, and it
is large enough, they might read a half-written file. This is very unlikely as
os-release is typically small and very rarely changes, but it is not
impossible.
Bind mount a staging directory instead of the file, and symlink the file
into into, so that we can do atomic file updates and close this gap.
Atomic replacement creates a new inode, so existing bind mounts would
continue to see the old file, and only new services would see the new file.
The indirection via the directory allows to work around this, as the
directory is fixed and never changes so the bind mount is always valid,
and its content is shared with all existing services.
Fixes https://github.com/systemd/systemd/issues/28794
Follow-up for
3f37a82545d461ab
static int compile_symlinks(
const ExecContext *context,
const ExecParameters *params,
+ bool setup_os_release_symlink,
char ***ret_symlinks) {
_cleanup_strv_free_ char **symlinks = NULL;
}
}
+ /* We make the host's os-release available via a symlink, so that we can copy it atomically
+ * and readers will never get a half-written version. Note that, while the paths specified here are
+ * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
+ * 'os-release -> .os-release-stage/os-release' is what will be created. */
+ if (setup_os_release_symlink) {
+ r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
+ if (r < 0)
+ return r;
+
+ r = strv_extend(&symlinks, "/run/host/os-release");
+ if (r < 0)
+ return r;
+ }
+
*ret_symlinks = TAKE_PTR(symlinks);
return 0;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
**read_write_paths_cleanup = NULL;
_cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
- *extension_dir = NULL, *host_os_release = NULL;
+ *extension_dir = NULL, *host_os_release_stage = NULL;
const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
char **read_write_paths;
NamespaceInfo ns_info;
- bool needs_sandboxing;
+ bool needs_sandboxing, setup_os_release_symlink;
BindMount *bind_mounts = NULL;
size_t n_bind_mounts = 0;
int r;
if (r < 0)
return r;
- /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
- r = compile_symlinks(context, params, &symlinks);
- if (r < 0)
- return r;
-
/* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
* service will need to write to it in order to start the notifications. */
if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
else
ns_info = (NamespaceInfo) {};
+ /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
+ setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
+ r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
+ if (r < 0)
+ return r;
+
if (context->mount_propagation_flag == MS_SHARED)
log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
/* If running under a different root filesystem, propagate the host's os-release. We make a
* copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
- if (root_dir || root_image) {
- host_os_release = strdup("/run/systemd/propagate/os-release");
- if (!host_os_release)
+ if (setup_os_release_symlink) {
+ host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
+ if (!host_os_release_stage)
return -ENOMEM;
}
} else {
if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
return -ENOMEM;
- if (root_dir || root_image) {
- if (asprintf(&host_os_release, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
+ if (setup_os_release_symlink) {
+ if (asprintf(&host_os_release_stage,
+ "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
+ geteuid()) < 0)
return -ENOMEM;
}
}
incoming_dir,
extension_dir,
root_dir || root_image ? params->notify_socket : NULL,
- host_os_release,
+ host_os_release_stage,
error_path);
/* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
}
if (scope == RUNTIME_SCOPE_SYSTEM) {
- os_release_dst = strdup("/run/systemd/propagate/os-release");
+ os_release_dst = strdup("/run/systemd/propagate/.os-release-stage/os-release");
if (!os_release_dst)
return log_oom_debug();
} else {
- if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
+ if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid()) < 0)
return log_oom_debug();
}
if (r < 0)
return log_debug_errno(r, "Failed to create parent directory of %s, ignoring: %m", os_release_dst);
- r = copy_file(os_release_src, os_release_dst, /* open_flags= */ 0, 0644, COPY_MAC_CREATE|COPY_TRUNCATE);
+ r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE);
if (r < 0)
return log_debug_errno(r, "Failed to create %s, ignoring: %m", os_release_dst);
const NamespaceInfo *ns_info,
MountEntry *mounts,
size_t *n_mounts,
- char **exec_dir_symlinks,
+ char **symlinks,
char **error_path) {
_cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
}
/* Now that all filesystems have been set up, but before the
- * read-only switches are flipped, create the exec dirs symlinks.
+ * read-only switches are flipped, create the exec dirs and other symlinks.
* Note that when /var/lib is not empty/tmpfs, these symlinks will already
* exist, which means this will be a no-op. */
- r = create_symlinks_from_tuples(root, exec_dir_symlinks);
+ r = create_symlinks_from_tuples(root, symlinks);
if (r < 0)
- return log_debug_errno(r, "Failed to set up ExecDirectories symlinks inside mount namespace: %m");
+ return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");
/* Create a deny list we can pass to bind_mount_recursive() */
deny_list = new(char*, (*n_mounts)+1);
char** exec_paths,
char** no_exec_paths,
char** empty_directories,
- char** exec_dir_symlinks,
+ char** symlinks,
const BindMount *bind_mounts,
size_t n_bind_mounts,
const TemporaryFileSystem *temporary_filesystems,
const char *incoming_dir,
const char *extension_dir,
const char *notify_socket,
- const char *host_os_release,
+ const char *host_os_release_stage,
char **error_path) {
_cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
log_namespace,
setup_propagate,
notify_socket,
- host_os_release);
+ host_os_release_stage);
if (n_mounts > 0) {
m = mounts = new0(MountEntry, n_mounts);
.read_only = true,
};
- if (host_os_release)
+ if (host_os_release_stage)
*(m++) = (MountEntry) {
- .path_const = "/run/host/os-release",
- .source_const = host_os_release,
+ .path_const = "/run/host/.os-release-stage/",
+ .source_const = host_os_release_stage,
.mode = BIND_MOUNT,
.read_only = true,
.ignore = true, /* Live copy, don't hard-fail if it goes missing */
(void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
/* Now make the magic happen */
- r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, exec_dir_symlinks, error_path);
+ r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, symlinks, error_path);
if (r < 0)
goto finish;
char **exec_paths,
char **no_exec_paths,
char **empty_directories,
- char **exec_dir_symlinks,
+ char **symlinks,
const BindMount *bind_mounts,
size_t n_bind_mounts,
const TemporaryFileSystem *temporary_filesystems,
const char *incoming_dir,
const char *extension_dir,
const char *notify_socket,
- const char *host_os_release,
+ const char *host_os_release_stage,
char **error_path);
#define RUN_SYSTEMD_EMPTY "/run/systemd/empty"
# Test that we really are in the new overlayfs root fs
read -r x </lower
test "$x" = "miep"
- cmp /etc/os-release /run/systemd/propagate/os-release
+ cmp /etc/os-release /run/systemd/propagate/.os-release-stage/os-release
grep -q MARKER=1 /etc/os-release
# Switch back to the original root, away from the overlayfs
mkdir -p /tmp/nextroot-lower/usr/lib
grep ID /etc/os-release >/tmp/nextroot-lower/usr/lib/os-release
echo MARKER=1 >>/tmp/nextroot-lower/usr/lib/os-release
- cmp /etc/os-release /run/systemd/propagate/os-release
+ cmp /etc/os-release /run/systemd/propagate/.os-release-stage/os-release
(! grep -q MARKER=1 /etc/os-release)
mount -t overlay nextroot /run/nextroot -o lowerdir=/tmp/nextroot-lower:/,ro