]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/core/namespace.c
Merge pull request #29343 from DaanDeMeyer/tmp
[thirdparty/systemd.git] / src / core / namespace.c
index 2fcc096217d8fbbbb9f6ff32ebbc71703fbfebfc..206453f30f9cf45c3f276a09fe14f04e1a4198ac 100644 (file)
@@ -22,7 +22,7 @@
 #include "fd-util.h"
 #include "format-util.h"
 #include "glyph-util.h"
-#include "label.h"
+#include "label-util.h"
 #include "list.h"
 #include "lock-util.h"
 #include "loop-util.h"
@@ -138,9 +138,6 @@ static const MountEntry protect_kernel_tunables_sys_table[] = {
 
 /* ProtectKernelModules= option */
 static const MountEntry protect_kernel_modules_table[] = {
-#if HAVE_SPLIT_USR
-        { "/lib/modules",        INACCESSIBLE, true  },
-#endif
         { "/usr/lib/modules",    INACCESSIBLE, true  },
 };
 
@@ -182,14 +179,6 @@ static const MountEntry protect_system_yes_table[] = {
         { "/usr",                READONLY,     false },
         { "/boot",               READONLY,     true  },
         { "/efi",                READONLY,     true  },
-#if HAVE_SPLIT_USR
-        { "/lib",                READONLY,     true  },
-        { "/lib64",              READONLY,     true  },
-        { "/bin",                READONLY,     true  },
-#  if HAVE_SPLIT_BIN
-        { "/sbin",               READONLY,     true  },
-#  endif
-#endif
 };
 
 /* ProtectSystem=full includes ProtectSystem=yes */
@@ -198,14 +187,6 @@ static const MountEntry protect_system_full_table[] = {
         { "/boot",               READONLY,     true  },
         { "/efi",                READONLY,     true  },
         { "/etc",                READONLY,     false },
-#if HAVE_SPLIT_USR
-        { "/lib",                READONLY,     true  },
-        { "/lib64",              READONLY,     true  },
-        { "/bin",                READONLY,     true  },
-#  if HAVE_SPLIT_BIN
-        { "/sbin",               READONLY,     true  },
-#  endif
-#endif
 };
 
 /*
@@ -227,25 +208,44 @@ static const MountEntry protect_system_strict_table[] = {
 };
 
 static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
-        [INACCESSIBLE]         = "inaccessible",
-        [OVERLAY_MOUNT]        = "overlay",
-        [BIND_MOUNT]           = "bind",
-        [BIND_MOUNT_RECURSIVE] = "rbind",
-        [PRIVATE_TMP]          = "private-tmp",
-        [PRIVATE_DEV]          = "private-dev",
-        [BIND_DEV]             = "bind-dev",
-        [EMPTY_DIR]            = "empty",
-        [PRIVATE_SYSFS]        = "private-sysfs",
-        [BIND_SYSFS]           = "bind-sysfs",
-        [PROCFS]               = "procfs",
-        [READONLY]             = "read-only",
-        [READWRITE]            = "read-write",
-        [TMPFS]                = "tmpfs",
-        [MOUNT_IMAGES]         = "mount-images",
-        [READWRITE_IMPLICIT]   = "rw-implicit",
-        [EXEC]                 = "exec",
-        [NOEXEC]               = "noexec",
-        [MQUEUEFS]             = "mqueuefs",
+        [INACCESSIBLE]          = "inaccessible",
+        [OVERLAY_MOUNT]         = "overlay",
+        [MOUNT_IMAGES]          = "mount-images",
+        [BIND_MOUNT]            = "bind",
+        [BIND_MOUNT_RECURSIVE]  = "rbind",
+        [PRIVATE_TMP]           = "private-tmp",
+        [PRIVATE_TMP_READONLY]  = "private-tmp-read-only",
+        [PRIVATE_DEV]           = "private-dev",
+        [BIND_DEV]              = "bind-dev",
+        [EMPTY_DIR]             = "empty",
+        [PRIVATE_SYSFS]         = "private-sysfs",
+        [BIND_SYSFS]            = "bind-sysfs",
+        [PROCFS]                = "procfs",
+        [READONLY]              = "read-only",
+        [READWRITE]             = "read-write",
+        [NOEXEC]                = "noexec",
+        [EXEC]                  = "exec",
+        [TMPFS]                 = "tmpfs",
+        [RUN]                   = "run",
+        [EXTENSION_DIRECTORIES] = "extension-directories",
+        [EXTENSION_IMAGES]      = "extension-images",
+        [MQUEUEFS]              = "mqueuefs",
+        [READWRITE_IMPLICIT]    = "read-write-implicit",
+};
+
+/* Per image class: the os-release key that holds the extension level, and its prefix for log output */
+static const struct {
+        const char *level_env;
+        const char *level_env_print;
+} image_class_info[_IMAGE_CLASS_MAX] = {
+        [IMAGE_SYSEXT] = {
+                .level_env = "SYSEXT_LEVEL",
+                .level_env_print = " SYSEXT_LEVEL=",
+        },
+        [IMAGE_CONFEXT] = {
+                .level_env = "CONFEXT_LEVEL",
+                .level_env_print = " CONFEXT_LEVEL=",
+        }
 };
 
 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
@@ -536,7 +536,7 @@ static int append_extensions(
 
                 *((*p)++) = (MountEntry) {
                         .path_malloc = TAKE_PTR(mount_point),
-                        .source_const = TAKE_PTR(source),
+                        .source_malloc = TAKE_PTR(source),
                         .mode = EXTENSION_DIRECTORIES,
                         .ignore = ignore_enoent,
                         .has_prefix = true,
@@ -909,7 +909,19 @@ add_symlink:
         return 0;
 }
 
-static int mount_private_dev(MountEntry *m) {
+static char *settle_runtime_dir(RuntimeScope scope) {
+        char *runtime_dir;
+
+        if (scope != RUNTIME_SCOPE_USER)
+                return strdup("/run/");
+
+        if (asprintf(&runtime_dir, "/run/user/" UID_FMT, geteuid()) < 0)
+                return NULL;
+
+        return runtime_dir;
+}
+
+static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
         static const char devnodes[] =
                 "/dev/null\0"
                 "/dev/zero\0"
@@ -918,13 +930,21 @@ static int mount_private_dev(MountEntry *m) {
                 "/dev/urandom\0"
                 "/dev/tty\0";
 
-        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
+        _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL;
         const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
         bool can_mknod = true;
         int r;
 
         assert(m);
 
+        runtime_dir = settle_runtime_dir(scope);
+        if (!runtime_dir)
+                return log_oom_debug();
+
+        temporary_mount = path_join(runtime_dir, "systemd/namespace-dev-XXXXXX");
+        if (!temporary_mount)
+                return log_oom_debug();
+
         if (!mkdtemp(temporary_mount))
                 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
 
@@ -995,6 +1015,11 @@ static int mount_private_dev(MountEntry *m) {
         if (r < 0)
                 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
 
+        /* Make the bind mount read-only. */
+        r = mount_nofollow_verbose(LOG_DEBUG, NULL, dev, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
+        if (r < 0)
+                return r;
+
         /* Create the /dev directory if missing. It is more likely to be missing when the service is started
          * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
         (void) mkdir_p_label(mount_entry_path(m), 0755);
@@ -1049,34 +1074,7 @@ static int mount_bind_dev(const MountEntry *m) {
         if (r > 0) /* make this a NOP if /dev is already a mount point */
                 return 0;
 
-        r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
-        if (r < 0)
-                return r;
-
-        return 1;
-}
-
-static int mount_private_sysfs(const MountEntry *m) {
-        const char *p = mount_entry_path(ASSERT_PTR(m));
-        int r;
-
-        (void) mkdir_p_label(p, 0755);
-
-        r = remount_sysfs(p);
-        if (r < 0 && (ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r))) {
-                /* Running with an unprivileged user (PrivateUsers=yes), or the kernel seems old. Falling
-                 * back to bind mount the host's version so that we get all child mounts of it, too. */
-
-                log_debug_errno(r, "Failed to remount sysfs on %s, falling back to bind mount: %m", p);
-
-                (void) umount_recursive(p, 0);
-
-                r = mount_nofollow_verbose(LOG_DEBUG, "/sys", p, NULL, MS_BIND|MS_REC, NULL);
-        }
-        if (r < 0)
-                return log_debug_errno(r, "Failed to remount sysfs on %s: %m", p);
-
-        return 1;
+        return mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
 }
 
 static int mount_bind_sysfs(const MountEntry *m) {
@@ -1093,11 +1091,34 @@ static int mount_bind_sysfs(const MountEntry *m) {
                 return 0;
 
         /* Bind mount the host's version so that we get all child mounts of it, too. */
-        r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
-        if (r < 0)
+        return mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
+}
+
+static int mount_private_sysfs(const MountEntry *m) {
+        const char *entry_path = mount_entry_path(ASSERT_PTR(m));
+        int r, n;
+
+        (void) mkdir_p_label(entry_path, 0755);
+
+        n = umount_recursive(entry_path, 0);
+
+        r = mount_nofollow_verbose(LOG_DEBUG, "sysfs", entry_path, "sysfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+        if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+                /* When we do not have enough privileges to mount sysfs, fall back to use existing /sys. */
+
+                if (n > 0)
+                        /* /sys or some of sub-mounts are umounted in the above. Refuse incomplete tree.
+                         * Propagate the original error code returned by mount() in the above. */
+                        return r;
+
+                return mount_bind_sysfs(m);
+
+        } else if (r < 0)
                 return r;
 
-        return 1;
+        /* We mounted a new instance now. Let's bind mount the children over now. */
+        (void) bind_mount_submounts("/sys", entry_path);
+        return 0;
 }
 
 static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
@@ -1156,30 +1177,32 @@ static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
                  * means we really don't want to use it, since it would affect our host's /proc
                  * mount. Hence let's gracefully fallback to a classic, unrestricted version. */
                 r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
-        if (r == -EPERM) {
-                /* When we do not have enough privileges to mount /proc, fallback to use existing /proc. */
+        if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+                /* When we do not have enough privileges to mount /proc, fall back to use existing /proc. */
 
                 if (n > 0)
                         /* /proc or some of sub-mounts are umounted in the above. Refuse incomplete tree.
                          * Propagate the original error code returned by mount() in the above. */
-                        return -EPERM;
+                        return r;
 
                 r = path_is_mount_point(entry_path, NULL, 0);
                 if (r < 0)
                         return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
-                if (r == 0) {
-                        /* We lack permissions to mount a new instance of /proc, and it is not already
-                         * mounted. But we can access the host's, so as a final fallback bind-mount it to
-                         * the destination, as most likely we are inside a user manager in an unprivileged
-                         * user namespace. */
-                        r = mount_nofollow_verbose(LOG_DEBUG, "/proc", entry_path, NULL, MS_BIND|MS_REC, NULL);
-                        if (r < 0)
-                                return -EPERM;
-                }
+                if (r > 0)
+                        return 0;
+
+                /* We lack permissions to mount a new instance of /proc, and it is not already mounted. But
+                 * we can access the host's, so as a final fallback bind-mount it to the destination, as most
+                 * likely we are inside a user manager in an unprivileged user namespace. */
+                return mount_nofollow_verbose(LOG_DEBUG, "/proc", entry_path, NULL, MS_BIND|MS_REC, NULL);
+
         } else if (r < 0)
                 return r;
 
-        return 1;
+        /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn
+         * where a bunch of files are overmounted, in particular the boot ID. */
+        (void) bind_mount_submounts("/proc", entry_path);
+        return 0;
 }
 
 static int mount_tmpfs(const MountEntry *m) {
@@ -1205,7 +1228,7 @@ static int mount_tmpfs(const MountEntry *m) {
         if (r < 0)
                 return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
 
-        return 1;
+        return 0;
 }
 
 static int mount_run(const MountEntry *m) {
@@ -1246,17 +1269,32 @@ static int mount_image(
                 const ImagePolicy *image_policy) {
 
         _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
-                            *host_os_release_sysext_level = NULL;
+                            *host_os_release_level = NULL, *extension_name = NULL;
+        _cleanup_strv_free_ char **extension_release = NULL;
+        ImageClass class = IMAGE_SYSEXT;
         int r;
 
         assert(m);
 
+        r = path_extract_filename(mount_entry_source(m), &extension_name);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
+
         if (m->mode == EXTENSION_IMAGES) {
+                r = load_extension_release_pairs(mount_entry_source(m), IMAGE_SYSEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+                if (r == -ENOENT) {
+                        r = load_extension_release_pairs(mount_entry_source(m), IMAGE_CONFEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+                        if (r >= 0)
+                                class = IMAGE_CONFEXT;
+                }
+                if (r == -ENOENT)
+                        return r;
+
                 r = parse_os_release(
                                 empty_to_root(root_directory),
                                 "ID", &host_os_release_id,
                                 "VERSION_ID", &host_os_release_version_id,
-                                "SYSEXT_LEVEL", &host_os_release_sysext_level,
+                                image_class_info[class].level_env, &host_os_release_level,
                                 NULL);
                 if (r < 0)
                         return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
@@ -1272,7 +1310,7 @@ static int mount_image(
                         image_policy,
                         host_os_release_id,
                         host_os_release_version_id,
-                        host_os_release_sysext_level,
+                        host_os_release_level,
                         NULL);
         if (r == -ENOENT && m->ignore)
                 return 0;
@@ -1283,12 +1321,12 @@ static int mount_image(
                                        host_os_release_id,
                                        host_os_release_version_id ? " VERSION_ID=" : "",
                                        strempty(host_os_release_version_id),
-                                       host_os_release_sysext_level ? " SYSEXT_LEVEL=" : "",
-                                       strempty(host_os_release_sysext_level));
+                                       host_os_release_level ? image_class_info[class].level_env_print : "",
+                                       strempty(host_os_release_level));
         if (r < 0)
                 return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
 
-        return 1;
+        return 0;
 }
 
 static int mount_overlay(const MountEntry *m) {
@@ -1304,10 +1342,8 @@ static int mount_overlay(const MountEntry *m) {
         r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options);
         if (r == -ENOENT && m->ignore)
                 return 0;
-        if (r < 0)
-                return r;
 
-        return 1;
+        return r;
 }
 
 static int follow_symlink(
@@ -1348,7 +1384,8 @@ static int apply_one_mount(
                 MountEntry *m,
                 const ImagePolicy *mount_image_policy,
                 const ImagePolicy *extension_image_policy,
-                const NamespaceInfo *ns_info) {
+                const NamespaceInfo *ns_info,
+                RuntimeScope scope) {
 
         _cleanup_free_ char *inaccessible = NULL;
         bool rbind = true, make = false;
@@ -1363,8 +1400,7 @@ static int apply_one_mount(
         switch (m->mode) {
 
         case INACCESSIBLE: {
-                _cleanup_free_ char *tmp = NULL;
-                const char *runtime_dir;
+                _cleanup_free_ char *runtime_dir = NULL;
                 struct stat target;
 
                 /* First, get rid of everything that is below if there
@@ -1380,14 +1416,14 @@ static int apply_one_mount(
                                                mount_entry_path(m));
                 }
 
-                if (geteuid() == 0)
-                        runtime_dir = "/run";
-                else {
-                        if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
-                                return -ENOMEM;
-
-                        runtime_dir = tmp;
-                }
+                /* We don't pass the literal runtime scope through here but one based purely on our UID. This
+                 * means that the root user's --user services will use the host's inaccessible inodes rather
+                 * then root's private ones. This is preferable since it means device nodes that are
+                 * overmounted to make them inaccessible will be overmounted with a device node, rather than
+                 * an AF_UNIX socket inode. */
+                runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
+                if (!runtime_dir)
+                        return log_oom_debug();
 
                 r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
                 if (r < 0)
@@ -1417,25 +1453,35 @@ static int apply_one_mount(
 
         case EXTENSION_DIRECTORIES: {
                 _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
-                                *host_os_release_sysext_level = NULL, *extension_name = NULL;
+                                *host_os_release_level = NULL, *extension_name = NULL;
                 _cleanup_strv_free_ char **extension_release = NULL;
+                ImageClass class = IMAGE_SYSEXT;
 
                 r = path_extract_filename(mount_entry_source(m), &extension_name);
                 if (r < 0)
                         return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
 
+                r = load_extension_release_pairs(mount_entry_source(m), IMAGE_SYSEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+                if (r == -ENOENT) {
+                        r = load_extension_release_pairs(mount_entry_source(m), IMAGE_CONFEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+                        if (r >= 0)
+                                class = IMAGE_CONFEXT;
+                }
+                if (r == -ENOENT)
+                        return r;
+
                 r = parse_os_release(
                                 empty_to_root(root_directory),
                                 "ID", &host_os_release_id,
                                 "VERSION_ID", &host_os_release_version_id,
-                                "SYSEXT_LEVEL", &host_os_release_sysext_level,
+                                image_class_info[class].level_env, &host_os_release_level,
                                 NULL);
                 if (r < 0)
                         return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
                 if (isempty(host_os_release_id))
                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
 
-                r = load_extension_release_pairs(mount_entry_source(m), IMAGE_SYSEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+                r = load_extension_release_pairs(mount_entry_source(m), class, extension_name, /* relax_extension_release_check= */ false, &extension_release);
                 if (r == -ENOENT && m->ignore)
                         return 0;
                 if (r < 0)
@@ -1445,10 +1491,10 @@ static int apply_one_mount(
                                 extension_name,
                                 host_os_release_id,
                                 host_os_release_version_id,
-                                host_os_release_sysext_level,
-                                /* host_sysext_scope */ NULL, /* Leave empty, we need to accept both system and portable */
+                                host_os_release_level,
+                                /* host_extension_scope */ NULL, /* Leave empty, we need to accept both system and portable */
                                 extension_release,
-                                IMAGE_SYSEXT);
+                                class);
                 if (r == 0)
                         return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name);
                 if (r < 0)
@@ -1497,7 +1543,7 @@ static int apply_one_mount(
                 break;
 
         case PRIVATE_DEV:
-                return mount_private_dev(m);
+                return mount_private_dev(m, scope);
 
         case BIND_DEV:
                 return mount_bind_dev(m);
@@ -1697,7 +1743,8 @@ static size_t namespace_calculate_mounts(
                 const char *creds_path,
                 const char* log_namespace,
                 bool setup_propagate,
-                const char* notify_socket) {
+                const char* notify_socket,
+                const char* host_os_release) {
 
         size_t protect_home_cnt;
         size_t protect_system_cnt =
@@ -1742,6 +1789,7 @@ static size_t namespace_calculate_mounts(
                 !!log_namespace +
                 setup_propagate + /* /run/systemd/incoming */
                 !!notify_socket +
+                !!host_os_release +
                 ns_info->private_network + /* /sys */
                 ns_info->private_ipc; /* /dev/mqueue */
 }
@@ -1789,6 +1837,39 @@ static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
         return 0;
 }
 
+static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) {
+        assert(m);
+
+        /* Create a string suitable for debugging logs, stripping the root directory prefix if present.
+         * For example, with a BindPaths=/var/bar that does not exist on the host:
+         *
+         * Before:
+         *  foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
+         * After:
+         *  foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
+         *
+         * Note that this is an error path, so no OOM check is done on purpose. */
+
+        if (!error_path)
+                return;
+
+        if (!mount_entry_path(m)) {
+                *error_path = NULL;
+                return;
+        }
+
+        if (root) {
+                const char *e = startswith(mount_entry_path(m), root);
+                if (e) {
+                        *error_path = strdup(e);
+                        return;
+                }
+        }
+
+        *error_path = strdup(mount_entry_path(m));
+        return;
+}
+
 static int apply_mounts(
                 const char *root,
                 const ImagePolicy *mount_image_policy,
@@ -1796,7 +1877,8 @@ static int apply_mounts(
                 const NamespaceInfo *ns_info,
                 MountEntry *mounts,
                 size_t *n_mounts,
-                char **exec_dir_symlinks,
+                RuntimeScope scope,
+                char **symlinks,
                 char **error_path) {
 
         _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
@@ -1834,8 +1916,7 @@ static int apply_mounts(
                         /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
                         r = follow_symlink(!IN_SET(m->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) ? root : NULL, m);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
                         if (r == 0) {
@@ -1847,10 +1928,9 @@ static int apply_mounts(
                                 break;
                         }
 
-                        r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info);
+                        r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info, scope);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
 
@@ -1864,12 +1944,12 @@ static int apply_mounts(
         }
 
         /* Now that all filesystems have been set up, but before the
-         * read-only switches are flipped, create the exec dirs symlinks.
+         * read-only switches are flipped, create the exec dirs and other symlinks.
          * Note that when /var/lib is not empty/tmpfs, these symlinks will already
          * exist, which means this will be a no-op. */
-        r = create_symlinks_from_tuples(root, exec_dir_symlinks);
+        r = create_symlinks_from_tuples(root, symlinks);
         if (r < 0)
-                return log_debug_errno(r, "Failed to set up ExecDirectories symlinks inside mount namespace: %m");
+                return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");
 
         /* Create a deny list we can pass to bind_mount_recursive() */
         deny_list = new(char*, (*n_mounts)+1);
@@ -1883,8 +1963,7 @@ static int apply_mounts(
         for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                 r = make_read_only(m, deny_list, proc_self_mountinfo);
                 if (r < 0) {
-                        if (error_path && mount_entry_path(m))
-                                *error_path = strdup(mount_entry_path(m));
+                        mount_entry_path_debug_string(root, m, error_path);
                         return r;
                 }
         }
@@ -1898,8 +1977,7 @@ static int apply_mounts(
         for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                 r = make_noexec(m, deny_list, proc_self_mountinfo);
                 if (r < 0) {
-                        if (error_path && mount_entry_path(m))
-                                *error_path = strdup(mount_entry_path(m));
+                        mount_entry_path_debug_string(root, m, error_path);
                         return r;
                 }
         }
@@ -1909,8 +1987,7 @@ static int apply_mounts(
                 for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
                         r = make_nosuid(m, proc_self_mountinfo);
                         if (r < 0) {
-                                if (error_path && mount_entry_path(m))
-                                        *error_path = strdup(mount_entry_path(m));
+                                mount_entry_path_debug_string(root, m, error_path);
                                 return r;
                         }
                 }
@@ -1967,62 +2044,6 @@ static bool home_read_only(
         return false;
 }
 
-static int verity_settings_prepare(
-                VeritySettings *verity,
-                const char *root_image,
-                const void *root_hash,
-                size_t root_hash_size,
-                const char *root_hash_path,
-                const void *root_hash_sig,
-                size_t root_hash_sig_size,
-                const char *root_hash_sig_path,
-                const char *verity_data_path) {
-
-        int r;
-
-        assert(verity);
-
-        if (root_hash) {
-                void *d;
-
-                d = memdup(root_hash, root_hash_size);
-                if (!d)
-                        return -ENOMEM;
-
-                free_and_replace(verity->root_hash, d);
-                verity->root_hash_size = root_hash_size;
-                verity->designator = PARTITION_ROOT;
-        }
-
-        if (root_hash_sig) {
-                void *d;
-
-                d = memdup(root_hash_sig, root_hash_sig_size);
-                if (!d)
-                        return -ENOMEM;
-
-                free_and_replace(verity->root_hash_sig, d);
-                verity->root_hash_sig_size = root_hash_sig_size;
-                verity->designator = PARTITION_ROOT;
-        }
-
-        if (verity_data_path) {
-                r = free_and_strdup(&verity->data_path, verity_data_path);
-                if (r < 0)
-                        return r;
-        }
-
-        r = verity_settings_load(
-                        verity,
-                        root_image,
-                        root_hash_path,
-                        root_hash_sig_path);
-        if (r < 0)
-                return log_debug_errno(r, "Failed to load root hash: %m");
-
-        return 0;
-}
-
 int setup_namespace(
                 const char* root_directory,
                 const char* root_image,
@@ -2035,7 +2056,7 @@ int setup_namespace(
                 char** exec_paths,
                 char** no_exec_paths,
                 char** empty_directories,
-                char** exec_dir_symlinks,
+                char** symlinks,
                 const BindMount *bind_mounts,
                 size_t n_bind_mounts,
                 const TemporaryFileSystem *temporary_filesystems,
@@ -2048,13 +2069,7 @@ int setup_namespace(
                 const char *creds_path,
                 const char *log_namespace,
                 unsigned long mount_propagation_flag,
-                const void *root_hash,
-                size_t root_hash_size,
-                const char *root_hash_path,
-                const void *root_hash_sig,
-                size_t root_hash_sig_size,
-                const char *root_hash_sig_path,
-                const char *verity_data_path,
+                VeritySettings *verity,
                 const MountImage *extension_images,
                 size_t n_extension_images,
                 const ImagePolicy *extension_image_policy,
@@ -2063,11 +2078,12 @@ int setup_namespace(
                 const char *incoming_dir,
                 const char *extension_dir,
                 const char *notify_socket,
+                const char *host_os_release_stage,
+                RuntimeScope scope,
                 char **error_path) {
 
         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
-        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
         _cleanup_strv_free_ char **hierarchies = NULL;
         MountEntry *m = NULL, *mounts = NULL;
         bool require_prefix = false, setup_propagate = false;
@@ -2107,16 +2123,7 @@ int setup_namespace(
                     strv_isempty(read_write_paths))
                         dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
 
-                r = verity_settings_prepare(
-                                &verity,
-                                root_image,
-                                root_hash, root_hash_size, root_hash_path,
-                                root_hash_sig, root_hash_sig_size, root_hash_sig_path,
-                                verity_data_path);
-                if (r < 0)
-                        return r;
-
-                SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path);
+                SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity && verity->data_path);
 
                 r = loop_device_make_by_path(
                                 root_image,
@@ -2130,7 +2137,7 @@ int setup_namespace(
 
                 r = dissect_loop_device(
                                 loop_device,
-                                &verity,
+                                verity,
                                 root_image_mount_options,
                                 root_image_policy,
                                 dissect_image_flags,
@@ -2141,14 +2148,14 @@ int setup_namespace(
                 r = dissected_image_load_verity_sig_partition(
                                 dissected_image,
                                 loop_device->fd,
-                                &verity);
+                                verity);
                 if (r < 0)
                         return r;
 
                 r = dissected_image_decrypt(
                                 dissected_image,
                                 NULL,
-                                &verity,
+                                verity,
                                 dissect_image_flags);
                 if (r < 0)
                         return log_debug_errno(r, "Failed to decrypt dissected image: %m");
@@ -2166,15 +2173,17 @@ int setup_namespace(
                  * in the root. The temporary directory prevents any mounts from being potentially obscured
                  * by other mounts we already applied.  We use the same mount point for all images, which is
                  * safe, since they all live in their own namespaces after all, and hence won't see each
-                 * other. */
+                 * other. (Note: this directory is also created by PID 1 early on, we create it here for
+                 * similar reasons as /run/systemd/ first.) */
+                root = "/run/systemd/mount-rootfs";
+                (void) mkdir_label(root, 0555);
 
-                root = "/run/systemd/unit-root";
-                (void) mkdir_label(root, 0700);
                 require_prefix = true;
         }
 
         if (n_extension_images > 0 || !strv_isempty(extension_directories)) {
-                r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_HIERARCHIES");
+                /* Hierarchy population needs to be done for sysext and confext extension images */
+                r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES");
                 if (r < 0)
                         return r;
         }
@@ -2197,7 +2206,8 @@ int setup_namespace(
                         creds_path,
                         log_namespace,
                         setup_propagate,
-                        notify_socket);
+                        notify_socket,
+                        host_os_release_stage);
 
         if (n_mounts > 0) {
                 m = mounts = new0(MountEntry, n_mounts);
@@ -2432,6 +2442,15 @@ int setup_namespace(
                                 .read_only = true,
                         };
 
+                if (host_os_release_stage)
+                        *(m++) = (MountEntry) {
+                                .path_const = "/run/host/.os-release-stage/",
+                                .source_const = host_os_release_stage,
+                                .mode = BIND_MOUNT,
+                                .read_only = true,
+                                .ignore = true, /* Live copy, don't hard-fail if it goes missing */
+                        };
+
                 assert(mounts + n_mounts == m);
 
                 /* Prepend the root directory where that's necessary */
@@ -2521,7 +2540,14 @@ int setup_namespace(
                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
 
         /* Now make the magic happen */
-        r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, exec_dir_symlinks, error_path);
+        r = apply_mounts(root,
+                         mount_image_policy,
+                         extension_image_policy,
+                         ns_info,
+                         mounts, &n_mounts,
+                         scope,
+                         symlinks,
+                         error_path);
         if (r < 0)
                 goto finish;