]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: introduce PinnedResource
authorLennart Poettering <lennart@amutable.com>
Thu, 27 Nov 2025 07:07:31 +0000 (08:07 +0100)
committerLennart Poettering <lennart@amutable.com>
Thu, 19 Feb 2026 14:08:20 +0000 (15:08 +0100)
This introduces PinnedResource as a structure combining pinned
references to a root directory, root image, or root mstack. This is not
only easier to work with, but essential to make certain unpriv things
work, as we need some mechanism to pin resources before we drop into a
userns which might possibly not provide access anymore to those
resources.

Hence this does two things: it introduces the new structure, and it
immediately hooks it up so that we pin things properly before dropping
into the userns, then make use of the pinned resources the right way
after dropping, which enables unpriv userns operation.

The concept is generic enough to eventually implement extension images +
mount images with the same structure, but in order to keep the changes
manageable this is left for another time.

(This also makes one further clean-up: client-side verity-reuse checks
are moved server side if we are unpriv. Previously we'd do them client
side, but they were doomed to fail because of lack of privs. Hence let's
drop the client side if we are unpriv and purely do them server-side in
that case.)

src/core/exec-invoke.c
src/core/namespace.c
src/core/namespace.h
src/core/service.c
src/shared/shared-forward.h
src/test/test-namespace.c
src/test/test-ns.c

index 888a2555fbbea28f86cb417985b78ffaa0251da6..315ef410315b48305bbd2b39db790128dd01003d 100644 (file)
@@ -56,6 +56,7 @@
 #include "mkdir-label.h"
 #include "mount-util.h"
 #include "mountpoint-util.h"
+#include "mstack.h"
 #include "namespace-util.h"
 #include "nsflags.h"
 #include "nsresource.h"
@@ -3537,8 +3538,7 @@ static int compile_symlinks(
 
 static bool insist_on_sandboxing(
                 const ExecContext *context,
-                const char *root_dir,
-                const char *root_image,
+                const PinnedResource *rootfs,
                 const BindMount *bind_mounts,
                 size_t n_bind_mounts) {
 
@@ -3552,7 +3552,7 @@ static bool insist_on_sandboxing(
         if (context->n_temporary_filesystems > 0)
                 return true;
 
-        if (root_dir || root_image || context->root_directory_as_fd)
+        if (pinned_resource_is_set(rootfs))
                 return true;
 
         if (context->n_mount_images > 0)
@@ -3579,8 +3579,7 @@ static bool insist_on_sandboxing(
 static int setup_ephemeral(
                 const ExecContext *context,
                 ExecRuntime *runtime,
-                char **root_image,            /* both input and output! modified if ephemeral logic enabled */
-                char **root_directory,        /* ditto */
+                PinnedResource *rootfs,  /* both input and output! modified if ephemeral logic enabled */
                 char **reterr_path) {
 
         _cleanup_close_ int fd = -EBADF;
@@ -3588,12 +3587,10 @@ static int setup_ephemeral(
         int r;
 
         assert(context);
-        assert(!context->root_directory_as_fd);
         assert(runtime);
-        assert(root_image);
-        assert(root_directory);
+        assert(rootfs);
 
-        if (!*root_image && !*root_directory)
+        if (!rootfs->image && !rootfs->directory)
                 return 0;
 
         if (!runtime->ephemeral_copy)
@@ -3619,32 +3616,32 @@ static int setup_ephemeral(
         if (fd != -EAGAIN)
                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
 
-        if (*root_image) {
-                log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
+        if (rootfs->image) {
+                log_debug("Making ephemeral copy of %s to %s", rootfs->image, new_root);
 
-                fd = copy_file(*root_image, new_root, O_EXCL, 0600,
+                fd = copy_file(rootfs->image, new_root, O_EXCL, 0600,
                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME|COPY_NOCOW_AFTER);
                 if (fd < 0) {
-                        *reterr_path = strdup(*root_image);
+                        *reterr_path = strdup(rootfs->image);
                         return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
-                                               *root_image, new_root);
+                                               rootfs->image, new_root);
                 }
         } else {
-                assert(*root_directory);
+                assert(rootfs->directory);
 
-                log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
+                log_debug("Making ephemeral snapshot of %s to %s", rootfs->directory, new_root);
 
                 fd = btrfs_subvol_snapshot_at(
-                                AT_FDCWD, *root_directory,
+                                AT_FDCWD, rootfs->directory,
                                 AT_FDCWD, new_root,
                                 BTRFS_SNAPSHOT_FALLBACK_COPY |
                                 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
                                 BTRFS_SNAPSHOT_RECURSIVE |
                                 BTRFS_SNAPSHOT_LOCK_BSD);
                 if (fd < 0) {
-                        *reterr_path = strdup(*root_directory);
+                        *reterr_path = strdup(rootfs->directory);
                         return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
-                                               *root_directory, new_root);
+                                               rootfs->directory, new_root);
                 }
         }
 
@@ -3652,11 +3649,14 @@ static int setup_ephemeral(
         if (r < 0)
                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
 
-        if (*root_image)
-                free_and_replace(*root_image, new_root);
-        else {
-                assert(*root_directory);
-                free_and_replace(*root_directory, new_root);
+        if (rootfs->image) {
+                free_and_replace(rootfs->image, new_root);
+                close_and_replace(rootfs->image_fd, fd);
+        } else {
+                assert(rootfs->directory);
+
+                free_and_replace(rootfs->directory, new_root);
+                close_and_replace(rootfs->directory_fd, fd);
         }
 
         return 1;
@@ -3710,20 +3710,35 @@ static int verity_settings_prepare(
         return 0;
 }
 
-static int pick_versions(
+static int pin_rootfs(
                 const ExecContext *context,
                 const ExecParameters *params,
-                char **ret_root_image,
-                char **ret_root_directory,
+                PinnedResource *ret,
                 char **reterr_path) {
 
         int r;
 
         assert(context);
-        assert(!context->root_directory_as_fd);
         assert(params);
-        assert(ret_root_image);
-        assert(ret_root_directory);
+        assert(ret);
+
+        if (!FLAGS_SET(params->flags, EXEC_APPLY_CHROOT)) {
+                *ret = PINNED_RESOURCE_NULL;
+                return 0;
+        }
+
+        if (context->root_directory_as_fd) {
+                _cleanup_close_ int fd = fcntl(params->root_directory_fd, F_DUPFD_CLOEXEC, 3);
+                if (fd < 0)
+                        return log_debug_errno(errno, "Failed to duplicate root directory fd: %m");
+
+                *ret = (PinnedResource) {
+                        .directory_fd = TAKE_FD(fd),
+                        .image_fd = -EBADF,
+                };
+
+                return 1;
+        }
 
         if (context->root_image) {
                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
@@ -3745,8 +3760,24 @@ static int pick_versions(
                         return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
                 }
 
-                *ret_root_image = TAKE_PTR(result.path);
-                *ret_root_directory = NULL;
+                /* path_pick() returns us an O_PATH fd, let's turn this into a fully opened file, because
+                 * mountfsd will want this later, and it wants a fully opened fd, so that security checks
+                 * have been passed */
+                _cleanup_close_ int reopened_fd = -EBADF;
+                reopened_fd = fd_reopen(result.fd, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDWR);
+                if (ERRNO_IS_NEG_FS_WRITE_REFUSED(reopened_fd))
+                        reopened_fd = fd_reopen(result.fd, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY);
+                if (reopened_fd < 0) {
+                        *reterr_path = strdup(context->root_image);
+                        return log_debug_errno(reopened_fd, "Failed to open image '%s': %m", context->root_image);
+                }
+
+                *ret = (PinnedResource) {
+                        .image = TAKE_PTR(result.path),
+                        .image_fd = TAKE_FD(reopened_fd),
+                        .directory_fd = -EBADF,
+                };
+
                 return r;
         }
 
@@ -3770,12 +3801,53 @@ static int pick_versions(
                         return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
                 }
 
-                *ret_root_image = NULL;
-                *ret_root_directory = TAKE_PTR(result.path);
+                *ret = (PinnedResource) {
+                        .directory = TAKE_PTR(result.path),
+                        .directory_fd = TAKE_FD(result.fd),
+                        .image_fd = -EBADF,
+                };
+
                 return r;
         }
 
-        *ret_root_image = *ret_root_directory = NULL;
+        if (context->root_mstack) {
+                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+
+                r = path_pick(/* toplevel_path= */ NULL,
+                              /* toplevel_fd= */ AT_FDCWD,
+                              context->root_mstack,
+                              pick_filter_image_mstack,
+                              /* n_filters= */ 1,
+                              PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
+                              &result);
+                if (r < 0) {
+                        *reterr_path = strdup(context->root_mstack);
+                        return r;
+                }
+
+                if (!result.path) {
+                        *reterr_path = strdup(context->root_mstack);
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_mstack);
+                }
+
+                _cleanup_(mstack_freep) MStack *mstack = NULL;
+                r = mstack_load(result.path, result.fd, &mstack);
+                if (r < 0) {
+                        *reterr_path = TAKE_PTR(result.path);
+                        return r;
+                }
+
+                *ret = (PinnedResource) {
+                        .mstack = TAKE_PTR(result.path),
+                        .mstack_loaded = TAKE_PTR(mstack),
+                        .image_fd = -EBADF,
+                        .directory_fd = -EBADF,
+                };
+
+                return r;
+        }
+
+        *ret = PINNED_RESOURCE_NULL;
         return 0;
 }
 
@@ -3783,7 +3855,8 @@ static int apply_mount_namespace(
                 ExecCommandFlags command_flags,
                 const ExecContext *context,
                 const ExecParameters *params,
-                ExecRuntime *runtime,
+                const ExecRuntime *runtime,
+                const PinnedResource *rootfs,
                 const char *memory_pressure_path,
                 bool needs_sandboxing,
                 uid_t exec_directory_uid,
@@ -3798,7 +3871,7 @@ static int apply_mount_namespace(
         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
                         **read_write_paths_cleanup = NULL;
         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
-                *private_namespace_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
+                *private_namespace_dir = NULL, *host_os_release_stage = NULL;
         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
         char **read_write_paths;
         bool setup_os_release_symlink;
@@ -3812,26 +3885,6 @@ static int apply_mount_namespace(
 
         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
 
-        if (params->flags & EXEC_APPLY_CHROOT && !context->root_directory_as_fd) {
-                r = pick_versions(
-                                context,
-                                params,
-                                &root_image,
-                                &root_dir,
-                                reterr_path);
-                if (r < 0)
-                        return r;
-
-                r = setup_ephemeral(
-                                context,
-                                runtime,
-                                &root_image,
-                                &root_dir,
-                                reterr_path);
-                if (r < 0)
-                        return r;
-        }
-
         r = compile_bind_mounts(context, params, exec_directory_uid, exec_directory_gid, &bind_mounts, &n_bind_mounts, &empty_directories);
         if (r < 0)
                 return r;
@@ -3870,7 +3923,7 @@ static int apply_mount_namespace(
         }
 
         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
-        setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
+        setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && pinned_resource_is_set(rootfs);
         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
         if (r < 0)
                 return r;
@@ -3917,10 +3970,10 @@ static int apply_mount_namespace(
                         return -ENOMEM;
         }
 
-        if (root_image) {
+        if (rootfs->image) {
                 r = verity_settings_prepare(
                         &verity,
-                        root_image,
+                        rootfs->image,
                         &context->root_hash, context->root_hash_path,
                         &context->root_hash_sig, context->root_hash_sig_path,
                         context->root_verity);
@@ -3931,9 +3984,7 @@ static int apply_mount_namespace(
         NamespaceParameters parameters = {
                 .runtime_scope = params->runtime_scope,
 
-                .root_directory = root_dir,
-                .root_image = root_image,
-                .root_directory_fd = params->flags & EXEC_APPLY_CHROOT ? params->root_directory_fd : -EBADF,
+                .rootfs = rootfs,
                 .root_image_options = context->root_image_options,
                 .root_image_policy = context->root_image_policy ?: &image_policy_service,
 
@@ -3981,7 +4032,7 @@ static int apply_mount_namespace(
                 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
                  * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
                  * sandbox inside the mount namespace. */
-                .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
+                .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && pinned_resource_is_set(rootfs),
 
                 .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
@@ -4007,6 +4058,7 @@ static int apply_mount_namespace(
                 .protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
                 .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
                 .private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO,
+                .private_users = needs_sandboxing ? context->private_users : PRIVATE_USERS_NO,
 
                 .bpffs_pidref = bpffs_pidref,
                 .bpffs_socket_fd = bpffs_socket_fd,
@@ -4023,17 +4075,18 @@ static int apply_mount_namespace(
         if (r == -ENOANO) {
                 if (insist_on_sandboxing(
                                     context,
-                                    root_dir, root_image,
+                                    rootfs,
                                     bind_mounts,
                                     n_bind_mounts))
                         return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                                                "Failed to set up namespace, and refusing to continue since "
                                                "the selected namespacing options alter mount environment non-trivially.\n"
-                                               "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
+                                               "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, root mstack: %s, dynamic user: %s",
                                                n_bind_mounts,
                                                context->n_temporary_filesystems,
-                                               yes_no(root_dir),
-                                               yes_no(root_image),
+                                               yes_no(rootfs->directory_fd >= 0),
+                                               yes_no(rootfs->image_fd >= 0),
+                                               yes_no(!!rootfs->mstack_loaded),
                                                yes_no(context->dynamic_user));
 
                 log_debug("Failed to set up namespace, assuming containerized execution and ignoring.");
@@ -4656,7 +4709,8 @@ static bool exec_namespace_is_delegated(
 static int setup_delegated_namespaces(
                 const ExecContext *context,
                 ExecParameters *params,
-                ExecRuntime *runtime,
+                const ExecRuntime *runtime,
+                const PinnedResource *rootfs,
                 bool delegate,
                 const char *memory_pressure_path,
                 uid_t uid,
@@ -4682,6 +4736,7 @@ static int setup_delegated_namespaces(
         assert(context);
         assert(params);
         assert(runtime);
+        assert(rootfs);
         assert(reterr_exit_status);
 
         if (exec_needs_network_namespace(context) &&
@@ -4787,7 +4842,8 @@ static int setup_delegated_namespaces(
                                 context,
                                 params,
                                 runtime,
-                                          memory_pressure_path,
+                                rootfs,
+                                memory_pressure_path,
                                 needs_sandboxing,
                                 uid,
                                 gid,
@@ -5943,6 +5999,20 @@ int exec_invoke(
                 }
         }
 
+        _cleanup_(pinned_resource_done) PinnedResource rootfs = PINNED_RESOURCE_NULL;
+        _cleanup_free_ char *error_path = NULL;
+        r = pin_rootfs(context, params, &rootfs, &error_path);
+        if (r < 0) {
+                *exit_status = EXIT_NAMESPACE;
+                return log_error_errno(r, "Failed to open service's root fs%s%s: %m", error_path ? ": " : "", strempty(error_path));
+        }
+
+        r = setup_ephemeral(context, runtime, &rootfs, &error_path);
+        if (r < 0) {
+                *exit_status = EXIT_NAMESPACE;
+                return log_error_errno(r, "Failed to make ephemeral copy of service's root fs%s%s: %m", error_path ? ": " : "", strempty(error_path));
+        }
+
         /* Load a bunch of libraries we'll possibly need later, before we turn off dlopen() */
         (void) dlopen_bpf();
         (void) dlopen_cryptsetup();
@@ -5979,13 +6049,14 @@ int exec_invoke(
                                 /* allow_setgroups= */ false);
                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
                  * the actual requested operations fail (or silently continue). */
-                if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
-                        *exit_status = EXIT_USER;
-                        return log_error_errno(r, "Failed to set up user namespacing for unprivileged user: %m");
-                }
-                if (r < 0)
-                        log_info_errno(r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
-                else {
+                if (r < 0) {
+                        if (context->private_users != PRIVATE_USERS_NO) {
+                                *exit_status = EXIT_USER;
+                                return log_error_errno(r, "Failed to set up user namespacing for unprivileged user: %m");
+                        }
+
+                        log_notice_errno(r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
+                } else {
                         assert(r > 0);
                         userns_set_up = true;
                         log_debug("Set up unprivileged user namespace");
@@ -5997,6 +6068,7 @@ int exec_invoke(
                         context,
                         params,
                         runtime,
+                        &rootfs,
                         /* delegate= */ false,
                         memory_pressure_path,
                         uid,
@@ -6093,6 +6165,7 @@ int exec_invoke(
                         context,
                         params,
                         runtime,
+                        &rootfs,
                         /* delegate= */ true,
                         memory_pressure_path,
                         uid,
@@ -6112,6 +6185,10 @@ int exec_invoke(
         nsresource_link = sd_varlink_unref(nsresource_link);
         mountfsd_link = sd_varlink_unref(mountfsd_link);
 
+        /* We don't need the pinned rootfs anymore at this point. Close the fds now, so that they are
+         * definitely gone before we do our fd rearrangements below. */
+        pinned_resource_done(&rootfs);
+
         /* Kill unnecessary process, for the case that e.g. when the bpffs mount point is hidden. */
         pidref_done_sigkill_wait(&bpffs_pidref);
 
@@ -6545,7 +6622,6 @@ int exec_invoke(
                         }
                 }
 #endif
-
         }
 
         if (!strv_isempty(context->unset_environment)) {
index 504f80cb6358c7aa6225f1d10f72e320169a2fba..d363ac0973aac1ff9e38a9b8f721919847647746 100644 (file)
@@ -1297,13 +1297,6 @@ static int create_temporary_mount_point(RuntimeScope scope, char **ret) {
         return 0;
 }
 
-static bool namespace_with_rootfs(const NamespaceParameters *p) {
-        /* Returns true, if we have a root dir, root image or too mstack, and hence the root mount is
-         * changed */
-
-        return p->root_image || p->root_directory || p->root_directory_fd >= 0 || p->root_mstack;
-}
-
 static int mount_private_dev(const MountEntry *m, const NamespaceParameters *p) {
         static const char devnodes[] =
                 "/dev/null\0"
@@ -1368,7 +1361,7 @@ static int mount_private_dev(const MountEntry *m, const NamespaceParameters *p)
 
         /* We assume /run/systemd/journal/ is available if not changing root, which isn't entirely accurate
          * but shouldn't matter, as either way the user would get ENOENT when accessing /dev/log */
-        if (!namespace_with_rootfs(p) || p->bind_log_sockets) {
+        if (!pinned_resource_is_set(p->rootfs) || p->bind_log_sockets) {
                 const char *devlog = strjoina(temporary_mount, "/dev/log");
                 if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
                         log_debug_errno(errno,
@@ -2562,7 +2555,6 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
 
         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
-        _cleanup_(mstack_freep) MStack *mstack = NULL;
         _cleanup_strv_free_ char **hierarchies = NULL;
         _cleanup_(mount_list_done) MountList ml = {};
         _cleanup_close_ int userns_fd = -EBADF;
@@ -2592,83 +2584,136 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
         bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir);
         unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED;
 
-        if (p->root_image) {
-                /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
-                if (namespace_read_only(p))
-                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
-
-                SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);
-
-                /* First check if we have a verity device already open and with a fstype pinned by policy. If it
-                * cannot be found, then fallback to the slow path (full dissect). */
-                r = dissected_image_new_from_existing_verity(
-                                p->root_image,
-                                p->verity,
-                                p->root_image_options,
-                                p->root_image_policy,
-                                /* image_filter= */ NULL,
-                                p->runtime_scope,
-                                dissect_image_flags,
-                                &dissected_image);
-                if (r < 0 && !ERRNO_IS_NEG_DEVICE_ABSENT(r) && r != -ENOPKG)
-                        return r;
-                if (r >= 0)
-                        log_debug("Reusing pre-existing verity-protected root image %s", p->root_image);
-                else {
-                        if (p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
-                                /* In system mode we mount directly */
+        /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
+        bool ro = namespace_read_only(p);
+        if (ro) {
+                dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
+                mstack_flags |= MSTACK_RDONLY;
+        }
 
-                                r = loop_device_make_by_path(
-                                                p->root_image,
-                                                FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
-                                                /* sector_size= */ UINT32_MAX,
-                                                FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
-                                                LOCK_SH,
-                                                &loop_device);
+        _cleanup_close_ int _root_mount_fd = -EBADF;
+        int root_mount_fd = -EBADF;
+        if (pinned_resource_is_set(p->rootfs)) {
+                if (p->rootfs->directory_fd >= 0) {
+
+                        /* In "managed" mode we need to map from foreign UID/GID space, hence go via mountfsd */
+                        if (p->private_users == PRIVATE_USERS_MANAGED) {
+                                userns_fd = namespace_open_by_type(NAMESPACE_USER);
+                                if (userns_fd < 0)
+                                        return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
+
+                                r = mountfsd_mount_directory_fd(
+                                                p->mountfsd_link,
+                                                p->rootfs->directory_fd,
+                                                userns_fd,
+                                                dissect_image_flags,
+                                                &_root_mount_fd);
                                 if (r < 0)
-                                        return log_debug_errno(r, "Failed to create loop device for root image: %m");
+                                        return r;
 
-                                r = dissect_loop_device(
-                                                loop_device,
+                                root_mount_fd = _root_mount_fd;
+                        }
+
+                        /* Try to to clone the directory mount if we have privs to, so that we can apply the
+                         * MS_SLAVE propagation settings right-away. */
+                        if (root_mount_fd < 0) {
+                                _root_mount_fd = open_tree_attr_with_fallback(
+                                                p->rootfs->directory_fd,
+                                                "",
+                                                OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH|AT_RECURSIVE,
+                                                &(struct mount_attr) {
+                                                        /* We just remounted / as slave, but that didn't affect the detached
+                                                         * mount that we just mounted, so remount that one as slave recursive
+                                                         * as well now. */
+                                                        .propagation = MS_SLAVE,
+                                                });
+                                if (_root_mount_fd < 0 && !ERRNO_IS_NEG_PRIVILEGE(_root_mount_fd) && _root_mount_fd != -EINVAL)
+                                        return log_debug_errno(_root_mount_fd, "Failed to clone specified directory: %m");
+
+                                root_mount_fd = _root_mount_fd;
+                        }
+                        /* If we have only a root fd (and we couldn't make it ours), and we have no path,
+                         * then try to go on with the literal fd */
+                        if (root_mount_fd < 0 && !p->rootfs->directory)
+                                root_mount_fd = p->rootfs->directory_fd;
+                }
+
+                if (p->rootfs->image_fd >= 0) {
+                        SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);
+
+                        if (p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                                /* In system mode we mount directly */
+
+                                /* First check if we have a verity device already open and with a fstype pinned by policy. If it
+                                 * cannot be found, then fallback to the slow path (full dissect). */
+                                r = dissected_image_new_from_existing_verity(
+                                                p->rootfs->image,
                                                 p->verity,
                                                 p->root_image_options,
                                                 p->root_image_policy,
                                                 /* image_filter= */ NULL,
+                                                p->runtime_scope,
                                                 dissect_image_flags,
                                                 &dissected_image);
-                                if (r < 0)
-                                        return log_debug_errno(r, "Failed to dissect image: %m");
-
-                                r = dissected_image_load_verity_sig_partition(
-                                                dissected_image,
-                                                loop_device->fd,
-                                                p->verity);
-                                if (r < 0)
-                                        return r;
-
-                                r = dissected_image_guess_verity_roothash(
-                                                dissected_image,
-                                                p->verity);
-                                if (r < 0)
+                                if (r < 0 && !ERRNO_IS_NEG_DEVICE_ABSENT(r) && r != -ENOPKG)
                                         return r;
-
-                                r = dissected_image_decrypt(
-                                                dissected_image,
-                                                /* root= */ NULL,
-                                                /* passphrase= */ NULL,
-                                                p->verity,
-                                                p->root_image_policy,
-                                                dissect_image_flags);
-                                if (r < 0)
-                                        return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+                                if (r >= 0)
+                                        log_debug("Reusing pre-existing verity-protected root image %s", p->rootfs->image);
+                                else {
+                                        r = loop_device_make(
+                                                        p->rootfs->image_fd,
+                                                        FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means take access mode from fd */,
+                                                        /* offset= */ 0,
+                                                        /* size= */ UINT64_MAX,
+                                                        /* sector_size= */ UINT32_MAX,
+                                                        FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+                                                        LOCK_SH,
+                                                        &loop_device);
+                                        if (r < 0)
+                                                return log_debug_errno(r, "Failed to create loop device for root image: %m");
+
+                                        r = dissect_loop_device(
+                                                        loop_device,
+                                                        p->verity,
+                                                        p->root_image_options,
+                                                        p->root_image_policy,
+                                                        /* image_filter= */ NULL,
+                                                        dissect_image_flags,
+                                                        &dissected_image);
+                                        if (r < 0)
+                                                return log_debug_errno(r, "Failed to dissect image: %m");
+
+                                        r = dissected_image_load_verity_sig_partition(
+                                                        dissected_image,
+                                                        loop_device->fd,
+                                                        p->verity);
+                                        if (r < 0)
+                                                return r;
+
+                                        r = dissected_image_guess_verity_roothash(
+                                                        dissected_image,
+                                                        p->verity);
+                                        if (r < 0)
+                                                return r;
+
+                                        r = dissected_image_decrypt(
+                                                        dissected_image,
+                                                        /* root= */ NULL,
+                                                        /* passphrase= */ NULL,
+                                                        p->verity,
+                                                        p->root_image_policy,
+                                                        dissect_image_flags);
+                                        if (r < 0)
+                                                return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+                                }
                         } else {
                                 userns_fd = namespace_open_by_type(NAMESPACE_USER);
                                 if (userns_fd < 0)
                                         return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
 
-                                r = mountfsd_mount_image(
+                                r = mountfsd_mount_image_fd(
                                                 p->mountfsd_link,
-                                                p->root_image,
+                                                p->rootfs->image_fd,
                                                 userns_fd,
                                                 p->root_image_options,
                                                 p->root_image_policy,
@@ -2679,33 +2724,28 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
                                         return r;
                         }
                 }
-        } else if (p->root_mstack) {
-                if (namespace_read_only(p))
-                        mstack_flags |= MSTACK_RDONLY;
 
-                r = mstack_load(p->root_mstack, /* dir_fd= */ -EBADF, &mstack);
-                if (r < 0)
-                        return r;
+                if (p->rootfs->mstack_loaded) {
+                        if (p->runtime_scope != RUNTIME_SCOPE_SYSTEM) {
+                                userns_fd = namespace_open_by_type(NAMESPACE_USER);
+                                if (userns_fd < 0)
+                                        return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
+                        }
 
-                if (p->runtime_scope != RUNTIME_SCOPE_SYSTEM) {
-                        userns_fd = namespace_open_by_type(NAMESPACE_USER);
-                        if (userns_fd < 0)
-                                return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
+                        r = mstack_open_images(
+                                        p->rootfs->mstack_loaded,
+                                        p->mountfsd_link,
+                                        userns_fd,
+                                        p->root_image_policy,
+                                        /* image_filter= */ NULL,
+                                        mstack_flags);
+                        if (r < 0)
+                                return r;
                 }
-
-                r = mstack_open_images(
-                                mstack,
-                                p->mountfsd_link,
-                                userns_fd,
-                                p->root_image_policy,
-                                /* image_filter= */ NULL,
-                                mstack_flags);
-                if (r < 0)
-                        return r;
         }
 
-        if (p->root_directory)
-                root = p->root_directory;
+        if (p->rootfs && p->rootfs->directory)
+                root = p->rootfs->directory;
         else {
                 /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
                  * when running tests (test-execute), it might not have been created yet so let's make sure
@@ -3046,21 +3086,36 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
 
         /* Remount / as SLAVE so that nothing now mounted in the namespace
          * shows up in the parent */
-        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
-                return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
+        r = mount_nofollow_verbose(LOG_DEBUG, /* what= */ NULL, "/", /* fstype= */ NULL, MS_SLAVE|MS_REC, /* options= */ NULL);
+        if (r < 0)
+                return r;
 
-        if (p->root_directory_fd >= 0) {
+        if (root_mount_fd >= 0) {
+                /* If we have root_mount_fd we have a ready-to-use detached mount. Attach it. */
 
-                if (move_mount(p->root_directory_fd, "", AT_FDCWD, root, MOVE_MOUNT_F_EMPTY_PATH) < 0)
+                if (move_mount(root_mount_fd, "", AT_FDCWD, root, MOVE_MOUNT_F_EMPTY_PATH) < 0)
                         return log_debug_errno(errno, "Failed to move detached mount to '%s': %m", root);
 
-                /* We just remounted / as slave, but that didn't affect the detached mount that we just
-                 * mounted, so remount that one as slave recursive as well now. */
+                r = mount_nofollow_verbose(LOG_DEBUG, /* what= */ NULL, root, /* fstype= */ NULL, MS_SLAVE|MS_REC, /* options= */ NULL);
+                if (r < 0)
+                        return r;
+
+        } else if (p->rootfs && p->rootfs->directory) {
 
-                if (mount(NULL, root, NULL, MS_SLAVE|MS_REC, NULL) < 0)
-                        return log_debug_errno(errno, "Failed to remount '%s' as SLAVE: %m", root);
+                /* If we do not have root_mount_fd, but a directory was specified, then we can use it directly. */
+
+                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
+                r = path_is_mount_point_full(root, /* root = */ NULL, AT_SYMLINK_FOLLOW);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
+                if (r == 0) {
+                        r = mount_nofollow_verbose(LOG_DEBUG, root, root, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL);
+                        if (r < 0)
+                                return r;
+                }
+
+        } else if (dissected_image) {
 
-        } else if (p->root_image) {
                 /* A root image is specified, mount it to the right place */
                 r = dissected_image_mount(
                                 dissected_image,
@@ -3084,24 +3139,13 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
                 if (r < 0)
                         return log_debug_errno(r, "Failed to relinquish dissected image: %m");
 
-        } else if (p->root_directory) {
-
-                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
-                r = path_is_mount_point_full(root, /* root= */ NULL, AT_SYMLINK_FOLLOW);
-                if (r < 0)
-                        return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
-                if (r == 0) {
-                        r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
-                        if (r < 0)
-                                return r;
-                }
+        } else if (p->rootfs && p->rootfs->mstack_loaded) {
 
-        } else if (p->root_mstack) {
-                r = mstack_make_mounts(mstack, root, mstack_flags);
+                r = mstack_make_mounts(p->rootfs->mstack_loaded, root, mstack_flags);
                 if (r < 0)
                         return r;
 
-                r = mstack_bind_mounts(mstack, root, /* where_fd= */ -EBADF, mstack_flags, /* ret_root_fd= */ NULL);
+                r = mstack_bind_mounts(p->rootfs->mstack_loaded, root, /* where_fd= */ -EBADF, mstack_flags, /* ret_root_fd= */ NULL);
                 if (r < 0)
                         return r;
 
@@ -3113,7 +3157,7 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
         }
 
         /* Try to set up the new root directory before mounting anything else there. */
-        if (namespace_with_rootfs(p))
+        if (pinned_resource_is_set(p->rootfs))
                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
 
         /* Now make the magic happen */
@@ -3122,8 +3166,8 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
                 return r;
 
         /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
-        r = mount_switch_root(root, /* mount_propagation_flag= */ 0);
-        if (r == -EINVAL && p->root_directory) {
+        r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
+        if (r == -EINVAL && p->rootfs && p->rootfs->directory) {
                 /* If we are using root_directory and we don't have privileges (ie: user manager in a user
                  * namespace) and the root_directory is already a mount point in the parent namespace,
                  * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
@@ -3132,6 +3176,7 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
                 r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
                 if (r < 0)
                         return r;
+
                 r = mount_switch_root(root, /* mount_propagation_flag= */ 0);
         }
         if (r < 0)
@@ -4196,3 +4241,26 @@ static const char* const private_pids_table[_PRIVATE_PIDS_MAX] = {
 };
 
 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES);
+
+void pinned_resource_done(PinnedResource *p) {
+        assert(p);
+        /* Drop every pin; each helper's return value is stored back so no stale fd/pointer is left behind. */
+        p->directory_fd = safe_close(p->directory_fd);
+        p->image_fd = safe_close(p->image_fd);
+        p->mstack_loaded = mstack_free(p->mstack_loaded);
+        p->directory = mfree(p->directory);
+        p->image = mfree(p->image);
+        p->mstack = mfree(p->mstack);
+}
+
+bool pinned_resource_is_set(const PinnedResource *p) {
+        /* True if at least one fd, path or loaded mstack is present; a NULL pointer counts as unset. */
+        if (!p)
+                return false;
+        if (p->directory_fd >= 0 || p->image_fd >= 0)
+                return true;
+        if (p->directory || p->image || p->mstack)
+                return true;
+
+        return !!p->mstack_loaded;
+}
index 26b0bf8ff2dcef6467a89a15868349598492cb46..318836651cd8a4ce207472e49972e0b0b60fc523 100644 (file)
@@ -91,6 +91,24 @@ typedef enum PrivatePIDs {
         _PRIVATE_PIDS_INVALID = -EINVAL,
 } PrivatePIDs;
 
+typedef struct PinnedResource {
+        /* Pins a disk image, directory or mstack by file descriptors. The paths are stored too, but they are
+         * intended to be decoration only, to enhance log messages, and should not be load-bearing
+         * otherwise. All members are owned by this structure and released by pinned_resource_done(). */
+        int directory_fd;       /* negative (-EBADF in PINNED_RESOURCE_NULL) when no directory is pinned */
+        char *directory;        /* path belonging to directory_fd; NULL if unset */
+        int image_fd;           /* negative (-EBADF in PINNED_RESOURCE_NULL) when no image is pinned */
+        char *image;            /* path belonging to image_fd; NULL if unset */
+        MStack *mstack_loaded;  /* loaded mstack object, released via mstack_free(); NULL if unset */
+        char *mstack;           /* path belonging to mstack_loaded; NULL if unset */
+} PinnedResource;
+
+#define PINNED_RESOURCE_NULL /* unset state: both fds invalid, all pointer members NULL */ \
+        (PinnedResource) {                      \
+                .directory_fd = -EBADF,         \
+                .image_fd = -EBADF,             \
+        }
+
 typedef struct BindMount {
         char *source;
         char *destination;
@@ -128,10 +146,7 @@ typedef struct MountImage {
 typedef struct NamespaceParameters {
         RuntimeScope runtime_scope;
 
-        int root_directory_fd;
-        const char *root_directory;
-        const char *root_image;
-        const char *root_mstack;
+        const PinnedResource *rootfs;
         const MountOptions *root_image_options;
         const ImagePolicy *root_image_policy;
 
@@ -201,6 +216,7 @@ typedef struct NamespaceParameters {
         PrivateTmp private_tmp;
         PrivateTmp private_var_tmp;
         PrivatePIDs private_pids;
+        PrivateUsers private_users;
 
         PidRef *bpffs_pidref;
         int bpffs_socket_fd;
@@ -304,3 +320,6 @@ int refresh_extensions_in_namespace(
                 const PidRef *target,
                 const char *hierarchy_env,
                 const NamespaceParameters *p);
+
+void pinned_resource_done(PinnedResource *p);
+bool pinned_resource_is_set(const PinnedResource *p);
index aa3690e92a0597c6b0ee274a495f4464b0bdc925..b9efd9bdb9cd749ff35ecd814dce76153dd7a011 100644 (file)
@@ -3019,7 +3019,6 @@ static void service_enter_refresh_extensions(Service *s) {
                         .n_extension_images = s->exec_context.n_extension_images,
                         .extension_directories = s->exec_context.extension_directories,
                         .extension_image_policy = s->exec_context.extension_image_policy,
-                        .root_directory_fd = -EBADF,
                 };
 
                 /* Only reload confext, and not sysext as they also typically contain the executable(s) used
index 82bdf86330faa876e5d307ed1c8a1ff7c675eff4..09ce806ad2b7fad673199848004721561b96cb5c 100644 (file)
@@ -74,6 +74,7 @@ typedef struct LoopDevice LoopDevice;
 typedef struct MachineBindUserContext MachineBindUserContext;
 typedef struct MachineCredentialContext MachineCredentialContext;
 typedef struct MountOptions MountOptions;
+typedef struct MStack MStack;
 typedef struct OpenFile OpenFile;
 typedef struct Pkcs11EncryptedKey Pkcs11EncryptedKey;
 typedef struct Table Table;
index c487fc54ec5bb39311cd2376accd9c8e00206d0a..5b67cd9f96e641e76b289874924b5a894e013266 100644 (file)
@@ -198,7 +198,6 @@ TEST(protect_kernel_logs) {
         static const NamespaceParameters p = {
                 .runtime_scope = RUNTIME_SCOPE_SYSTEM,
                 .protect_kernel_logs = true,
-                .root_directory_fd = -EBADF,
         };
         int r;
 
index c6d6f2e4232baefc9d2265bf63bad790f623ab90..9a17ed4f360f24373ed8c6c3f4e685d7c03c196c 100644 (file)
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
+#include <fcntl.h>
 #include <stdlib.h>
 #include <unistd.h>
 
@@ -77,11 +78,20 @@ int main(int argc, char *argv[]) {
         else
                 log_info("Not chrooted");
 
+        _cleanup_(pinned_resource_done) PinnedResource pr = PINNED_RESOURCE_NULL;
+
+        if (root_directory) {
+                pr.directory_fd = open(root_directory, O_PATH|O_CLOEXEC|O_DIRECTORY);
+                assert_se(pr.directory_fd >= 0);
+
+                pr.directory = strdup(root_directory);
+                assert_se(pr.directory);
+        }
+
         NamespaceParameters p = {
                 .runtime_scope = RUNTIME_SCOPE_SYSTEM,
 
-                .root_directory = root_directory,
-                .root_directory_fd = -EBADF,
+                .rootfs = &pr,
 
                 .read_write_paths = (char**) writable,
                 .read_only_paths = (char**) readonly,