#include "mkdir-label.h"
#include "mount-util.h"
#include "mountpoint-util.h"
+#include "mstack.h"
#include "namespace-util.h"
#include "nsflags.h"
#include "nsresource.h"
static bool insist_on_sandboxing(
const ExecContext *context,
- const char *root_dir,
- const char *root_image,
+ const PinnedResource *rootfs,
const BindMount *bind_mounts,
size_t n_bind_mounts) {
if (context->n_temporary_filesystems > 0)
return true;
- if (root_dir || root_image || context->root_directory_as_fd)
+ if (pinned_resource_is_set(rootfs))
return true;
if (context->n_mount_images > 0)
static int setup_ephemeral(
const ExecContext *context,
ExecRuntime *runtime,
- char **root_image, /* both input and output! modified if ephemeral logic enabled */
- char **root_directory, /* ditto */
+ PinnedResource *rootfs, /* both input and output! modified if ephemeral logic enabled */
char **reterr_path) {
_cleanup_close_ int fd = -EBADF;
int r;
assert(context);
- assert(!context->root_directory_as_fd);
assert(runtime);
- assert(root_image);
- assert(root_directory);
+ assert(rootfs);
- if (!*root_image && !*root_directory)
+ if (!rootfs->image && !rootfs->directory)
return 0;
if (!runtime->ephemeral_copy)
if (fd != -EAGAIN)
return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
- if (*root_image) {
- log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
+ if (rootfs->image) {
+ log_debug("Making ephemeral copy of %s to %s", rootfs->image, new_root);
- fd = copy_file(*root_image, new_root, O_EXCL, 0600,
+ fd = copy_file(rootfs->image, new_root, O_EXCL, 0600,
COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME|COPY_NOCOW_AFTER);
if (fd < 0) {
- *reterr_path = strdup(*root_image);
+ *reterr_path = strdup(rootfs->image);
return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
- *root_image, new_root);
+ rootfs->image, new_root);
}
} else {
- assert(*root_directory);
+ assert(rootfs->directory);
- log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
+ log_debug("Making ephemeral snapshot of %s to %s", rootfs->directory, new_root);
fd = btrfs_subvol_snapshot_at(
- AT_FDCWD, *root_directory,
+ AT_FDCWD, rootfs->directory,
AT_FDCWD, new_root,
BTRFS_SNAPSHOT_FALLBACK_COPY |
BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
BTRFS_SNAPSHOT_RECURSIVE |
BTRFS_SNAPSHOT_LOCK_BSD);
if (fd < 0) {
- *reterr_path = strdup(*root_directory);
+ *reterr_path = strdup(rootfs->directory);
return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
- *root_directory, new_root);
+ rootfs->directory, new_root);
}
}
if (r < 0)
return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
- if (*root_image)
- free_and_replace(*root_image, new_root);
- else {
- assert(*root_directory);
- free_and_replace(*root_directory, new_root);
+ if (rootfs->image) {
+ free_and_replace(rootfs->image, new_root);
+ close_and_replace(rootfs->image_fd, fd);
+ } else {
+ assert(rootfs->directory);
+
+ free_and_replace(rootfs->directory, new_root);
+ close_and_replace(rootfs->directory_fd, fd);
}
return 1;
return 0;
}
-static int pick_versions(
+static int pin_rootfs(
const ExecContext *context,
const ExecParameters *params,
- char **ret_root_image,
- char **ret_root_directory,
+ PinnedResource *ret,
char **reterr_path) {
int r;
assert(context);
- assert(!context->root_directory_as_fd);
assert(params);
- assert(ret_root_image);
- assert(ret_root_directory);
+ assert(ret);
+
+ if (!FLAGS_SET(params->flags, EXEC_APPLY_CHROOT)) {
+ *ret = PINNED_RESOURCE_NULL;
+ return 0;
+ }
+
+ if (context->root_directory_as_fd) {
+ _cleanup_close_ int fd = fcntl(params->root_directory_fd, F_DUPFD_CLOEXEC, 3);
+ if (fd < 0)
+ return log_debug_errno(errno, "Failed to duplicate root directory fd: %m");
+
+ *ret = (PinnedResource) {
+ .directory_fd = TAKE_FD(fd),
+ .image_fd = -EBADF,
+ };
+
+ return 1;
+ }
if (context->root_image) {
_cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
}
- *ret_root_image = TAKE_PTR(result.path);
- *ret_root_directory = NULL;
+ /* path_pick() returns us an O_PATH fd, let's turn this into a fully opened file, because
+ * mountfsd will want this later, and it wants a fully opened fd, so that security checks
+ * have been passed */
+ _cleanup_close_ int reopened_fd = -EBADF;
+ reopened_fd = fd_reopen(result.fd, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDWR);
+ if (ERRNO_IS_NEG_FS_WRITE_REFUSED(reopened_fd))
+ reopened_fd = fd_reopen(result.fd, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY);
+ if (reopened_fd < 0) {
+ *reterr_path = strdup(context->root_image);
+ return log_debug_errno(reopened_fd, "Failed to open image '%s': %m", context->root_image);
+ }
+
+ *ret = (PinnedResource) {
+ .image = TAKE_PTR(result.path),
+ .image_fd = TAKE_FD(reopened_fd),
+ .directory_fd = -EBADF,
+ };
+
return r;
}
return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
}
- *ret_root_image = NULL;
- *ret_root_directory = TAKE_PTR(result.path);
+ *ret = (PinnedResource) {
+ .directory = TAKE_PTR(result.path),
+ .directory_fd = TAKE_FD(result.fd),
+ .image_fd = -EBADF,
+ };
+
return r;
}
- *ret_root_image = *ret_root_directory = NULL;
+ if (context->root_mstack) {
+ _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+
+ r = path_pick(/* toplevel_path= */ NULL,
+ /* toplevel_fd= */ AT_FDCWD,
+ context->root_mstack,
+ pick_filter_image_mstack,
+ /* n_filters= */ 1,
+ PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
+ &result);
+ if (r < 0) {
+ *reterr_path = strdup(context->root_mstack);
+ return r;
+ }
+
+ if (!result.path) {
+ *reterr_path = strdup(context->root_mstack);
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_mstack);
+ }
+
+ _cleanup_(mstack_freep) MStack *mstack = NULL;
+ r = mstack_load(result.path, result.fd, &mstack);
+ if (r < 0) {
+ *reterr_path = TAKE_PTR(result.path);
+ return r;
+ }
+
+ *ret = (PinnedResource) {
+ .mstack = TAKE_PTR(result.path),
+ .mstack_loaded = TAKE_PTR(mstack),
+ .image_fd = -EBADF,
+ .directory_fd = -EBADF,
+ };
+
+ return r;
+ }
+
+ *ret = PINNED_RESOURCE_NULL;
return 0;
}
ExecCommandFlags command_flags,
const ExecContext *context,
const ExecParameters *params,
- ExecRuntime *runtime,
+ const ExecRuntime *runtime,
+ const PinnedResource *rootfs,
const char *memory_pressure_path,
bool needs_sandboxing,
uid_t exec_directory_uid,
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
**read_write_paths_cleanup = NULL;
_cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
- *private_namespace_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
+ *private_namespace_dir = NULL, *host_os_release_stage = NULL;
const char *tmp_dir = NULL, *var_tmp_dir = NULL;
char **read_write_paths;
bool setup_os_release_symlink;
CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
- if (params->flags & EXEC_APPLY_CHROOT && !context->root_directory_as_fd) {
- r = pick_versions(
- context,
- params,
- &root_image,
- &root_dir,
- reterr_path);
- if (r < 0)
- return r;
-
- r = setup_ephemeral(
- context,
- runtime,
- &root_image,
- &root_dir,
- reterr_path);
- if (r < 0)
- return r;
- }
-
r = compile_bind_mounts(context, params, exec_directory_uid, exec_directory_gid, &bind_mounts, &n_bind_mounts, &empty_directories);
if (r < 0)
return r;
}
/* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
- setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
+ setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && pinned_resource_is_set(rootfs);
r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
if (r < 0)
return r;
return -ENOMEM;
}
- if (root_image) {
+ if (rootfs->image) {
r = verity_settings_prepare(
&verity,
- root_image,
+ rootfs->image,
&context->root_hash, context->root_hash_path,
&context->root_hash_sig, context->root_hash_sig_path,
context->root_verity);
NamespaceParameters parameters = {
.runtime_scope = params->runtime_scope,
- .root_directory = root_dir,
- .root_image = root_image,
- .root_directory_fd = params->flags & EXEC_APPLY_CHROOT ? params->root_directory_fd : -EBADF,
+ .rootfs = rootfs,
.root_image_options = context->root_image_options,
.root_image_policy = context->root_image_policy ?: &image_policy_service,
/* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
* otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
* sandbox inside the mount namespace. */
- .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
+ .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && pinned_resource_is_set(rootfs),
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
.protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
.proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
.private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO,
+ .private_users = needs_sandboxing ? context->private_users : PRIVATE_USERS_NO,
.bpffs_pidref = bpffs_pidref,
.bpffs_socket_fd = bpffs_socket_fd,
if (r == -ENOANO) {
if (insist_on_sandboxing(
context,
- root_dir, root_image,
+ rootfs,
bind_mounts,
n_bind_mounts))
return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
"Failed to set up namespace, and refusing to continue since "
"the selected namespacing options alter mount environment non-trivially.\n"
- "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
+ "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, root mstack: %s, dynamic user: %s",
n_bind_mounts,
context->n_temporary_filesystems,
- yes_no(root_dir),
- yes_no(root_image),
+ yes_no(rootfs->directory_fd >= 0),
+ yes_no(rootfs->image_fd >= 0),
+ yes_no(!!rootfs->mstack_loaded),
yes_no(context->dynamic_user));
log_debug("Failed to set up namespace, assuming containerized execution and ignoring.");
static int setup_delegated_namespaces(
const ExecContext *context,
ExecParameters *params,
- ExecRuntime *runtime,
+ const ExecRuntime *runtime,
+ const PinnedResource *rootfs,
bool delegate,
const char *memory_pressure_path,
uid_t uid,
assert(context);
assert(params);
assert(runtime);
+ assert(rootfs);
assert(reterr_exit_status);
if (exec_needs_network_namespace(context) &&
context,
params,
runtime,
- memory_pressure_path,
+ rootfs,
+ memory_pressure_path,
needs_sandboxing,
uid,
gid,
}
}
+ _cleanup_(pinned_resource_done) PinnedResource rootfs = PINNED_RESOURCE_NULL;
+ _cleanup_free_ char *error_path = NULL;
+ r = pin_rootfs(context, params, &rootfs, &error_path);
+ if (r < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_error_errno(r, "Failed to open service's root fs%s%s: %m", error_path ? ": " : "", strempty(error_path));
+ }
+
+ r = setup_ephemeral(context, runtime, &rootfs, &error_path);
+ if (r < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_error_errno(r, "Failed to make ephemeral copy of service's root fs%s%s: %m", error_path ? ": " : "", strempty(error_path));
+ }
+
/* Load a bunch of libraries we'll possibly need later, before we turn off dlopen() */
(void) dlopen_bpf();
(void) dlopen_cryptsetup();
/* allow_setgroups= */ false);
/* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
* the actual requested operations fail (or silently continue). */
- if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
- *exit_status = EXIT_USER;
- return log_error_errno(r, "Failed to set up user namespacing for unprivileged user: %m");
- }
- if (r < 0)
- log_info_errno(r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
- else {
+ if (r < 0) {
+ if (context->private_users != PRIVATE_USERS_NO) {
+ *exit_status = EXIT_USER;
+ return log_error_errno(r, "Failed to set up user namespacing for unprivileged user: %m");
+ }
+
+ log_notice_errno(r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
+ } else {
assert(r > 0);
userns_set_up = true;
log_debug("Set up unprivileged user namespace");
context,
params,
runtime,
+ &rootfs,
/* delegate= */ false,
memory_pressure_path,
uid,
context,
params,
runtime,
+ &rootfs,
/* delegate= */ true,
memory_pressure_path,
uid,
nsresource_link = sd_varlink_unref(nsresource_link);
mountfsd_link = sd_varlink_unref(mountfsd_link);
+ /* We don't need the pinned rootfs anymore at this point. Close the fds now, so that they are
+ * definitely gone before we do our fd rearrangements below. */
+ pinned_resource_done(&rootfs);
+
/* Kill unnecessary process, for the case that e.g. when the bpffs mount point is hidden. */
pidref_done_sigkill_wait(&bpffs_pidref);
}
}
#endif
-
}
if (!strv_isempty(context->unset_environment)) {
return 0;
}
-static bool namespace_with_rootfs(const NamespaceParameters *p) {
- /* Returns true, if we have a root dir, root image or too mstack, and hence the root mount is
- * changed */
-
- return p->root_image || p->root_directory || p->root_directory_fd >= 0 || p->root_mstack;
-}
-
static int mount_private_dev(const MountEntry *m, const NamespaceParameters *p) {
static const char devnodes[] =
"/dev/null\0"
/* We assume /run/systemd/journal/ is available if not changing root, which isn't entirely accurate
* but shouldn't matter, as either way the user would get ENOENT when accessing /dev/log */
- if (!namespace_with_rootfs(p) || p->bind_log_sockets) {
+ if (!pinned_resource_is_set(p->rootfs) || p->bind_log_sockets) {
const char *devlog = strjoina(temporary_mount, "/dev/log");
if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
log_debug_errno(errno,
_cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
- _cleanup_(mstack_freep) MStack *mstack = NULL;
_cleanup_strv_free_ char **hierarchies = NULL;
_cleanup_(mount_list_done) MountList ml = {};
_cleanup_close_ int userns_fd = -EBADF;
bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir);
unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED;
- if (p->root_image) {
- /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
- if (namespace_read_only(p))
- dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
-
- SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);
-
- /* First check if we have a verity device already open and with a fstype pinned by policy. If it
- * cannot be found, then fallback to the slow path (full dissect). */
- r = dissected_image_new_from_existing_verity(
- p->root_image,
- p->verity,
- p->root_image_options,
- p->root_image_policy,
- /* image_filter= */ NULL,
- p->runtime_scope,
- dissect_image_flags,
- &dissected_image);
- if (r < 0 && !ERRNO_IS_NEG_DEVICE_ABSENT(r) && r != -ENOPKG)
- return r;
- if (r >= 0)
- log_debug("Reusing pre-existing verity-protected root image %s", p->root_image);
- else {
- if (p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
- /* In system mode we mount directly */
+ /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
+ bool ro = namespace_read_only(p);
+ if (ro) {
+ dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
+ mstack_flags |= MSTACK_RDONLY;
+ }
- r = loop_device_make_by_path(
- p->root_image,
- FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
- /* sector_size= */ UINT32_MAX,
- FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
- LOCK_SH,
- &loop_device);
+ _cleanup_close_ int _root_mount_fd = -EBADF;
+ int root_mount_fd = -EBADF;
+ if (pinned_resource_is_set(p->rootfs)) {
+ if (p->rootfs->directory_fd >= 0) {
+
+ /* In "managed" mode we need to map from foreign UID/GID space, hence go via mountfsd */
+ if (p->private_users == PRIVATE_USERS_MANAGED) {
+ userns_fd = namespace_open_by_type(NAMESPACE_USER);
+ if (userns_fd < 0)
+ return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
+
+ r = mountfsd_mount_directory_fd(
+ p->mountfsd_link,
+ p->rootfs->directory_fd,
+ userns_fd,
+ dissect_image_flags,
+ &_root_mount_fd);
if (r < 0)
- return log_debug_errno(r, "Failed to create loop device for root image: %m");
+ return r;
- r = dissect_loop_device(
- loop_device,
+ root_mount_fd = _root_mount_fd;
+ }
+
+ /* Try to clone the directory mount if we have privs to, so that we can apply the
+ * MS_SLAVE propagation settings right-away. */
+ if (root_mount_fd < 0) {
+ _root_mount_fd = open_tree_attr_with_fallback(
+ p->rootfs->directory_fd,
+ "",
+ OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH|AT_RECURSIVE,
+ &(struct mount_attr) {
+ /* We just remounted / as slave, but that didn't affect the detached
+ * mount that we just mounted, so remount that one as slave recursive
+ * as well now. */
+ .propagation = MS_SLAVE,
+ });
+ if (_root_mount_fd < 0 && !ERRNO_IS_NEG_PRIVILEGE(_root_mount_fd) && _root_mount_fd != -EINVAL)
+ return log_debug_errno(_root_mount_fd, "Failed to clone specified directory: %m");
+
+ root_mount_fd = _root_mount_fd;
+ }
+ /* If we have only a root fd (and we couldn't make it ours), and we have no path,
+ * then try to go on with the literal fd */
+ if (root_mount_fd < 0 && !p->rootfs->directory)
+ root_mount_fd = p->rootfs->directory_fd;
+ }
+
+ if (p->rootfs->image_fd >= 0) {
+ SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);
+
+ if (p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+ /* In system mode we mount directly */
+
+ /* First check if we have a verity device already open and with a fstype pinned by policy. If it
+ * cannot be found, then fallback to the slow path (full dissect). */
+ r = dissected_image_new_from_existing_verity(
+ p->rootfs->image,
p->verity,
p->root_image_options,
p->root_image_policy,
/* image_filter= */ NULL,
+ p->runtime_scope,
dissect_image_flags,
&dissected_image);
- if (r < 0)
- return log_debug_errno(r, "Failed to dissect image: %m");
-
- r = dissected_image_load_verity_sig_partition(
- dissected_image,
- loop_device->fd,
- p->verity);
- if (r < 0)
- return r;
-
- r = dissected_image_guess_verity_roothash(
- dissected_image,
- p->verity);
- if (r < 0)
+ if (r < 0 && !ERRNO_IS_NEG_DEVICE_ABSENT(r) && r != -ENOPKG)
return r;
-
- r = dissected_image_decrypt(
- dissected_image,
- /* root= */ NULL,
- /* passphrase= */ NULL,
- p->verity,
- p->root_image_policy,
- dissect_image_flags);
- if (r < 0)
- return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+ if (r >= 0)
+ log_debug("Reusing pre-existing verity-protected root image %s", p->rootfs->image);
+ else {
+ r = loop_device_make(
+ p->rootfs->image_fd,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means take access mode from fd */,
+ /* offset= */ 0,
+ /* size= */ UINT64_MAX,
+ /* sector_size= */ UINT32_MAX,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+ LOCK_SH,
+ &loop_device);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create loop device for root image: %m");
+
+ r = dissect_loop_device(
+ loop_device,
+ p->verity,
+ p->root_image_options,
+ p->root_image_policy,
+ /* image_filter= */ NULL,
+ dissect_image_flags,
+ &dissected_image);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to dissect image: %m");
+
+ r = dissected_image_load_verity_sig_partition(
+ dissected_image,
+ loop_device->fd,
+ p->verity);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_guess_verity_roothash(
+ dissected_image,
+ p->verity);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_decrypt(
+ dissected_image,
+ /* root= */ NULL,
+ /* passphrase= */ NULL,
+ p->verity,
+ p->root_image_policy,
+ dissect_image_flags);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+ }
} else {
userns_fd = namespace_open_by_type(NAMESPACE_USER);
if (userns_fd < 0)
return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
- r = mountfsd_mount_image(
+ r = mountfsd_mount_image_fd(
p->mountfsd_link,
- p->root_image,
+ p->rootfs->image_fd,
userns_fd,
p->root_image_options,
p->root_image_policy,
return r;
}
}
- } else if (p->root_mstack) {
- if (namespace_read_only(p))
- mstack_flags |= MSTACK_RDONLY;
- r = mstack_load(p->root_mstack, /* dir_fd= */ -EBADF, &mstack);
- if (r < 0)
- return r;
+ if (p->rootfs->mstack_loaded) {
+ if (p->runtime_scope != RUNTIME_SCOPE_SYSTEM) {
+ userns_fd = namespace_open_by_type(NAMESPACE_USER);
+ if (userns_fd < 0)
+ return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
+ }
- if (p->runtime_scope != RUNTIME_SCOPE_SYSTEM) {
- userns_fd = namespace_open_by_type(NAMESPACE_USER);
- if (userns_fd < 0)
- return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
+ r = mstack_open_images(
+ p->rootfs->mstack_loaded,
+ p->mountfsd_link,
+ userns_fd,
+ p->root_image_policy,
+ /* image_filter= */ NULL,
+ mstack_flags);
+ if (r < 0)
+ return r;
}
-
- r = mstack_open_images(
- mstack,
- p->mountfsd_link,
- userns_fd,
- p->root_image_policy,
- /* image_filter= */ NULL,
- mstack_flags);
- if (r < 0)
- return r;
}
- if (p->root_directory)
- root = p->root_directory;
+ if (p->rootfs && p->rootfs->directory)
+ root = p->rootfs->directory;
else {
/* /run/systemd should have been created by PID 1 early on already, but in some cases, like
* when running tests (test-execute), it might not have been created yet so let's make sure
/* Remount / as SLAVE so that nothing now mounted in the namespace
* shows up in the parent */
- if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
- return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
+ r = mount_nofollow_verbose(LOG_DEBUG, /* what= */ NULL, "/", /* fstype= */ NULL, MS_SLAVE|MS_REC, /* options= */ NULL);
+ if (r < 0)
+ return r;
- if (p->root_directory_fd >= 0) {
+ if (root_mount_fd >= 0) {
+ /* If we have root_mount_fd we have a ready-to-use detached mount. Attach it. */
- if (move_mount(p->root_directory_fd, "", AT_FDCWD, root, MOVE_MOUNT_F_EMPTY_PATH) < 0)
+ if (move_mount(root_mount_fd, "", AT_FDCWD, root, MOVE_MOUNT_F_EMPTY_PATH) < 0)
return log_debug_errno(errno, "Failed to move detached mount to '%s': %m", root);
- /* We just remounted / as slave, but that didn't affect the detached mount that we just
- * mounted, so remount that one as slave recursive as well now. */
+ r = mount_nofollow_verbose(LOG_DEBUG, /* what= */ NULL, root, /* fstype= */ NULL, MS_SLAVE|MS_REC, /* options= */ NULL);
+ if (r < 0)
+ return r;
+
+ } else if (p->rootfs && p->rootfs->directory) {
- if (mount(NULL, root, NULL, MS_SLAVE|MS_REC, NULL) < 0)
- return log_debug_errno(errno, "Failed to remount '%s' as SLAVE: %m", root);
+ /* If we do not have root_mount_fd, but a directory was specified, then we can use it directly. */
+
+ /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
+ r = path_is_mount_point_full(root, /* root = */ NULL, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
+ if (r == 0) {
+ r = mount_nofollow_verbose(LOG_DEBUG, root, root, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL);
+ if (r < 0)
+ return r;
+ }
+
+ } else if (dissected_image) {
- } else if (p->root_image) {
/* A root image is specified, mount it to the right place */
r = dissected_image_mount(
dissected_image,
if (r < 0)
return log_debug_errno(r, "Failed to relinquish dissected image: %m");
- } else if (p->root_directory) {
-
- /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
- r = path_is_mount_point_full(root, /* root= */ NULL, AT_SYMLINK_FOLLOW);
- if (r < 0)
- return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
- if (r == 0) {
- r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
- return r;
- }
+ } else if (p->rootfs && p->rootfs->mstack_loaded) {
- } else if (p->root_mstack) {
- r = mstack_make_mounts(mstack, root, mstack_flags);
+ r = mstack_make_mounts(p->rootfs->mstack_loaded, root, mstack_flags);
if (r < 0)
return r;
- r = mstack_bind_mounts(mstack, root, /* where_fd= */ -EBADF, mstack_flags, /* ret_root_fd= */ NULL);
+ r = mstack_bind_mounts(p->rootfs->mstack_loaded, root, /* where_fd= */ -EBADF, mstack_flags, /* ret_root_fd= */ NULL);
if (r < 0)
return r;
}
/* Try to set up the new root directory before mounting anything else there. */
- if (namespace_with_rootfs(p))
+ if (pinned_resource_is_set(p->rootfs))
(void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
/* Now make the magic happen */
return r;
/* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
- r = mount_switch_root(root, /* mount_propagation_flag= */ 0);
- if (r == -EINVAL && p->root_directory) {
+ r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
+ if (r == -EINVAL && p->rootfs && p->rootfs->directory) {
/* If we are using root_directory and we don't have privileges (ie: user manager in a user
* namespace) and the root_directory is already a mount point in the parent namespace,
* MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
if (r < 0)
return r;
+
r = mount_switch_root(root, /* mount_propagation_flag= */ 0);
}
if (r < 0)
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES);
+
+/* Drops everything a PinnedResource holds: both file descriptors are closed, the loaded mstack is
+ * released and the three path strings are freed. Each field is overwritten with the value handed back
+ * by its release helper, so no stale fd or pointer is left behind in the structure. */
+void pinned_resource_done(PinnedResource *p) {
+        assert(p);
+
+        /* File descriptors first ... */
+        p->image_fd = safe_close(p->image_fd);
+        p->directory_fd = safe_close(p->directory_fd);
+
+        /* ... then the loaded mstack object ... */
+        p->mstack_loaded = mstack_free(p->mstack_loaded);
+
+        /* ... and finally the path strings. */
+        p->image = mfree(p->image);
+        p->directory = mfree(p->directory);
+        p->mstack = mfree(p->mstack);
+}
+
+/* Reports whether the given PinnedResource references anything at all. A NULL pointer counts as
+ * "not set"; otherwise any fd that is >= 0 or any non-NULL path/mstack pointer makes it "set". */
+bool pinned_resource_is_set(const PinnedResource *p) {
+        if (!p)
+                return false;
+
+        /* Directory, by fd or by path */
+        if (p->directory_fd >= 0 || p->directory)
+                return true;
+
+        /* Disk image, by fd or by path */
+        if (p->image_fd >= 0 || p->image)
+                return true;
+
+        /* mstack, either loaded or by path */
+        return p->mstack_loaded || p->mstack;
+}
_PRIVATE_PIDS_INVALID = -EINVAL,
} PrivatePIDs;
+typedef struct PinnedResource {
+ /* Pins a disk image, directory or mstack by file descriptors. The paths are stored too, but they are
+ * intended to be decoration only, to enhance log messages and should not be load-bearing
+ * otherwise.
+ *
+ * Initialize with PINNED_RESOURCE_NULL (defined right after this structure), which marks both fds
+ * as invalid (-EBADF) and, since no other member is named in the initializer, leaves every pointer
+ * NULL. Release with pinned_resource_done(); test for emptiness with pinned_resource_is_set(). */
+ int directory_fd; /* fd of the pinned directory; negative when no directory is pinned */
+ char *directory; /* path of that directory, if known; may be NULL even when directory_fd is valid */
+ int image_fd; /* fd of the pinned disk image; negative when no image is pinned */
+ char *image; /* path of that disk image, if known */
+ MStack *mstack_loaded; /* loaded mstack object; released via mstack_free() in pinned_resource_done() */
+ char *mstack; /* path of the mstack, if known */
+} PinnedResource;
+
+#define PINNED_RESOURCE_NULL \
+ (PinnedResource) { \
+ .directory_fd = -EBADF, \
+ .image_fd = -EBADF, \
+ }
+
typedef struct BindMount {
char *source;
char *destination;
typedef struct NamespaceParameters {
RuntimeScope runtime_scope;
- int root_directory_fd;
- const char *root_directory;
- const char *root_image;
- const char *root_mstack;
+ const PinnedResource *rootfs;
const MountOptions *root_image_options;
const ImagePolicy *root_image_policy;
PrivateTmp private_tmp;
PrivateTmp private_var_tmp;
PrivatePIDs private_pids;
+ PrivateUsers private_users;
PidRef *bpffs_pidref;
int bpffs_socket_fd;
const PidRef *target,
const char *hierarchy_env,
const NamespaceParameters *p);
+
+void pinned_resource_done(PinnedResource *p);
+bool pinned_resource_is_set(const PinnedResource *p);
.n_extension_images = s->exec_context.n_extension_images,
.extension_directories = s->exec_context.extension_directories,
.extension_image_policy = s->exec_context.extension_image_policy,
- .root_directory_fd = -EBADF,
};
/* Only reload confext, and not sysext as they also typically contain the executable(s) used
typedef struct MachineBindUserContext MachineBindUserContext;
typedef struct MachineCredentialContext MachineCredentialContext;
typedef struct MountOptions MountOptions;
+typedef struct MStack MStack;
typedef struct OpenFile OpenFile;
typedef struct Pkcs11EncryptedKey Pkcs11EncryptedKey;
typedef struct Table Table;
static const NamespaceParameters p = {
.runtime_scope = RUNTIME_SCOPE_SYSTEM,
.protect_kernel_logs = true,
- .root_directory_fd = -EBADF,
};
int r;
/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
else
log_info("Not chrooted");
+ _cleanup_(pinned_resource_done) PinnedResource pr = PINNED_RESOURCE_NULL;
+
+ if (root_directory) {
+ pr.directory_fd = open(root_directory, O_PATH|O_CLOEXEC|O_DIRECTORY);
+ assert_se(pr.directory_fd >= 0);
+
+ pr.directory = strdup(root_directory);
+ assert_se(pr.directory);
+ }
+
NamespaceParameters p = {
.runtime_scope = RUNTIME_SCOPE_SYSTEM,
- .root_directory = root_directory,
- .root_directory_fd = -EBADF,
+ .rootfs = &pr,
.read_write_paths = (char**) writable,
.read_only_paths = (char**) readonly,