/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include <sys/mount.h>
+#if WANT_LINUX_FS_H
+#include <linux/fs.h>
+#endif
+
#include "sd-daemon.h"
#include "sd-varlink.h"
#include "json-util.h"
#include "main-func.h"
#include "missing_loop.h"
+#include "missing_mount.h"
+#include "missing_syscall.h"
#include "namespace-util.h"
#include "nsresource.h"
#include "nulstr-util.h"
#include "os-util.h"
#include "process-util.h"
#include "stat-util.h"
+#include "string-table.h"
+#include "uid-classification.h"
+#include "uid-range.h"
#include "user-util.h"
#include "varlink-io.systemd.MountFileSystem.h"
#include "varlink-util.h"
SD_JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(di->image_uuid), "imageUuid", SD_JSON_BUILD_UUID(di->image_uuid)));
}
+typedef enum MountMapMode {
+ MOUNT_MAP_AUTO = 0, /* determine automatically from image and caller */
+ MOUNT_MAP_ROOT, /* map caller's UID to root in namespace (map 1 UID only) */
+ MOUNT_MAP_FOREIGN, /* map foreign UID range to base in namespace (map 64K) */
+ MOUNT_MAP_IDENTITY, /* apply identity mapping (map 64K) */
+ _MOUNT_MAP_MODE_MAX,
+ _MOUNT_MAP_MODE_INVALID = -EINVAL,
+} MountMapMode;
+
+static const char *const mount_map_mode_table[_MOUNT_MAP_MODE_MAX] = {
+ [MOUNT_MAP_AUTO] = "auto",
+ [MOUNT_MAP_ROOT] = "root",
+ [MOUNT_MAP_FOREIGN] = "foreign",
+ [MOUNT_MAP_IDENTITY] = "identity",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(mount_map_mode, MountMapMode);
+
+typedef struct MountDirectoryParameters {
+ MountMapMode mode;
+ unsigned directory_fd_idx;
+ unsigned userns_fd_idx;
+ int read_only;
+} MountDirectoryParameters;
+
+typedef enum DirectoryOwnership {
+ DIRECTORY_IS_ROOT_PEER_OWNED, /* This is returned if the directory is owned by the root user and the peer is root */
+ DIRECTORY_IS_ROOT_OWNED, /* This is returned if the directory is owned by the root user (and the peer user is not root) */
+ DIRECTORY_IS_PEER_OWNED, /* This is returned if the directory is owned by the peer user (who is not root) */
+ DIRECTORY_IS_FOREIGN_OWNED, /* This is returned if the directory is owned by the foreign UID range */
+ DIRECTORY_IS_OTHERWISE_OWNED, /* This is returned if the directory is owned by something else */
+ _DIRECTORY_OWNERSHIP_MAX,
+ _DIRECTORY_OWNERSHIP_ERRNO_MAX = -ERRNO_MAX, /* Guarantee the whole negative errno range fits */
+} DirectoryOwnership;
+
+static MountMapMode default_mount_map_mode(DirectoryOwnership ownership) {
+ /* Derives a suitable mapping mode from the ownership of the base tree */
+
+ switch (ownership) {
+ case DIRECTORY_IS_PEER_OWNED:
+ return MOUNT_MAP_ROOT; /* Map the peer's UID to root in the container */
+
+ case DIRECTORY_IS_FOREIGN_OWNED:
+ return MOUNT_MAP_FOREIGN; /* Map the foreign UID range to the container's UID range */
+
+ case DIRECTORY_IS_ROOT_PEER_OWNED:
+ case DIRECTORY_IS_ROOT_OWNED:
+ case DIRECTORY_IS_OTHERWISE_OWNED:
+ return MOUNT_MAP_IDENTITY; /* Don't map */
+
+ default:
+ return _MOUNT_MAP_MODE_INVALID;
+ }
+}
+
+static JSON_DISPATCH_ENUM_DEFINE(dispatch_mount_directory_mode, MountMapMode, mount_map_mode_from_string);
+
+static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
+ int r, fl;
+
+ assert(fd >= 0);
+
+ /* Checks if the specified directory fd looks sane. Returns a DirectoryOwnership that categorizes the
+ * ownership situation in comparison to the peer's UID.
+ *
+ * Note one key difference to image validation (as implemented above): for regular files if the
+ * client provided us with an open fd it implies the client has access, as well as what kind of
+ * access (i.e. ro or rw). But for directories this doesn't work the same way, as directories are
+ * always opened read-only only. Hence we use a different mechanism to validate access to them: we
+ * check if the directory is owned by the peer UID or by the foreign UID range (in the latter case
+ * one of the parent directories must be owned by the peer though). */
+
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return log_debug_errno(errno, "Failed to stat() directory fd: %m");
+
+ r = stat_verify_directory(&st);
+ if (r < 0)
+ return r;
+
+ fl = fd_verify_safe_flags_full(fd, O_DIRECTORY);
+ if (fl < 0)
+ return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m");
+
+ if (st.st_uid == 0) {
+ if (peer_uid == 0) {
+ log_debug("Directory file descriptor points to root owned directory, who is also the peer.");
+ return DIRECTORY_IS_ROOT_PEER_OWNED;
+ }
+ log_debug("Directory file descriptor points to root owned directory.");
+ return DIRECTORY_IS_ROOT_OWNED;
+ }
+ if (st.st_uid == peer_uid) {
+ log_debug("Directory file descriptor points to peer owned directory.");
+ return DIRECTORY_IS_PEER_OWNED;
+ }
+
+ /* For bind mounted directories we check if they are either owned by the client's UID, or by the
+ * foreign UID set, but in that case the parent directory must be owned by the client's UID, or some
+ * directory iteratively up the chain */
+
+ _cleanup_close_ int parent_fd = -EBADF;
+ unsigned n_level;
+ for (n_level = 0; n_level < 16; n_level++) {
+ /* Stop iteration if we find a directory up the tree that is neither owned by the user, nor is from the foreign UID range */
+ if (!uid_is_foreign(st.st_uid) || !gid_is_foreign(st.st_gid)) {
+ log_debug("Directory file descriptor points to directory which itself or its parents is neither owned by foreign UID range nor by the user.");
+ return DIRECTORY_IS_OTHERWISE_OWNED;
+ }
+
+ /* If the peer is root, then it doesn't matter if we find a parent owned by root, let's shortcut things. */
+ if (peer_uid == 0) {
+ log_debug("Directory file descriptor is owned by foreign UID range, and peer is root.");
+ return DIRECTORY_IS_FOREIGN_OWNED;
+ }
+
+ /* Go one level up */
+ _cleanup_close_ int new_parent_fd = openat(fd, "..", O_DIRECTORY|O_PATH|O_CLOEXEC);
+ if (new_parent_fd < 0)
+ return log_debug_errno(errno, "Failed to open parent directory of directory file descriptor: %m");
+
+ struct stat new_st;
+ if (fstat(new_parent_fd, &new_st) < 0)
+ return log_debug_errno(errno, "Failed to stat parent directory of directory file descriptor: %m");
+
+ /* Safety check to see if we hit the root dir */
+ if (stat_inode_same(&st, &new_st)) {
+ log_debug("Directory file descriptor is owned by foreign UID range, but didn't find parent directory that is owned by peer among ancestors.");
+ return DIRECTORY_IS_OTHERWISE_OWNED;
+ }
+
+ if (new_st.st_uid == peer_uid) { /* Parent inode is owned by the peer. That's good! Everything's fine. */
+ log_debug("Directory file descriptor is owned by foreign UID range, and ancestor is owned by peer.");
+ return DIRECTORY_IS_FOREIGN_OWNED;
+ }
+
+ close_and_replace(parent_fd, new_parent_fd);
+ st = new_st;
+ }
+
+ log_debug("Failed to find peer owned parent directory after %u levels, refusing.", n_level);
+ return DIRECTORY_IS_OTHERWISE_OWNED;
+}
+
+static int vl_method_mount_directory(
+ sd_varlink *link,
+ sd_json_variant *parameters,
+ sd_varlink_method_flags_t flags,
+ void *userdata) {
+
+ static const sd_json_dispatch_field dispatch_table[] = {
+ { "mode", SD_JSON_VARIANT_STRING, dispatch_mount_directory_mode, offsetof(MountDirectoryParameters, mode), 0 },
+ { "directoryFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint, offsetof(MountDirectoryParameters, directory_fd_idx), SD_JSON_MANDATORY },
+ { "userNamespaceFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint, offsetof(MountDirectoryParameters, userns_fd_idx), 0 },
+ { "readOnly", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_tristate, offsetof(MountDirectoryParameters, read_only), 0 },
+ VARLINK_DISPATCH_POLKIT_FIELD,
+ {}
+ };
+
+ MountDirectoryParameters p = {
+ .mode = MOUNT_MAP_AUTO,
+ .directory_fd_idx = UINT_MAX,
+ .userns_fd_idx = UINT_MAX,
+ .read_only = -1,
+ };
+ _cleanup_close_ int directory_fd = -EBADF, userns_fd = -EBADF;
+ Hashmap **polkit_registry = ASSERT_PTR(userdata);
+ int r;
+
+ r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
+ if (r != 0)
+ return r;
+
+ if (p.directory_fd_idx == UINT_MAX)
+ return sd_varlink_error_invalid_parameter_name(link, "directoryFileDescriptor");
+
+ directory_fd = sd_varlink_peek_dup_fd(link, p.directory_fd_idx);
+ if (directory_fd < 0)
+ return log_debug_errno(directory_fd, "Failed to peek directory fd from client: %m");
+
+ if (p.userns_fd_idx != UINT_MAX) {
+ userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
+ if (userns_fd < 0)
+ return log_debug_errno(userns_fd, "Failed to peek user namespace fd from client: %m");
+ }
+
+ uid_t peer_uid;
+ r = sd_varlink_get_peer_uid(link, &peer_uid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get client UID: %m");
+
+ DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid);
+ if (owned_by < 0)
+ return owned_by;
+
+ r = validate_userns(link, &userns_fd);
+ if (r != 0)
+ return r;
+
+ /* If no mode is specified, pick sensible default */
+ if (p.mode <= 0) {
+ p.mode = default_mount_map_mode(owned_by);
+ assert(p.mode > 0);
+ }
+
+ _cleanup_free_ char *directory_path = NULL;
+ (void) fd_get_path(directory_fd, &directory_path);
+
+ log_debug("Mounting '%s' with mapping mode: %s", strna(directory_path), mount_map_mode_to_string(p.mode));
+
+ const char *polkit_details[] = {
+ "read_only", one_zero(p.read_only > 0),
+ "directory", strna(directory_path),
+ NULL,
+ };
+
+ const char *polkit_action, *polkit_untrusted_action;
+ PolkitFlags polkit_flags;
+ if (userns_fd < 0) {
+ /* Mount into the host user namespace */
+ polkit_action = "io.systemd.mount-file-system.mount-directory";
+ polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-directory";
+ polkit_flags = 0;
+ } else {
+ /* Mount into a private user namespace */
+ polkit_action = "io.systemd.mount-file-system.mount-directory-privately";
+ polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-directory-privately";
+
+ /* If polkit is not around, let's allow mounting authenticated images by default */
+ polkit_flags = POLKIT_DEFAULT_ALLOW;
+ }
+
+ /* We consider a directory "trusted" if it is owned by the peer or the foreign UID range */
+ bool trusted_directory = IN_SET(owned_by, DIRECTORY_IS_ROOT_PEER_OWNED, DIRECTORY_IS_PEER_OWNED, DIRECTORY_IS_FOREIGN_OWNED);
+
+ /* Let's definitely acquire the regular action privilege, for mounting properly signed images */
+ r = varlink_verify_polkit_async_full(
+ link,
+ /* bus= */ NULL,
+ trusted_directory ? polkit_action : polkit_untrusted_action,
+ polkit_details,
+ /* good_user= */ UID_INVALID,
+ trusted_directory ? polkit_flags : 0,
+ polkit_registry);
+ if (r <= 0)
+ return r;
+
+ /* Generate the common dissection directory here. We are not going to use it, but the clients might,
+ * and they likely are unprivileged, hence cannot create it themselves. Hence let's just create it
+ * here, if it is missing. */
+ r = get_common_dissect_directory(NULL);
+ if (r < 0)
+ return r;
+
+ _cleanup_close_ int mount_fd = open_tree(directory_fd, "", OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
+ if (mount_fd < 0)
+ return log_debug_errno(errno, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path));
+
+ if (p.read_only > 0 && mount_setattr(
+ mount_fd, "", AT_EMPTY_PATH,
+ &(struct mount_attr) {
+ .attr_set = MOUNT_ATTR_RDONLY,
+ }, MOUNT_ATTR_SIZE_VER0) < 0)
+ return log_debug_errno(errno, "Failed to enable read-only mode: %m");
+
+ if (p.mode != MOUNT_MAP_IDENTITY) {
+ uid_t start;
+
+ if (userns_fd >= 0) {
+ _cleanup_(uid_range_freep) UIDRange *uid_range_outside = NULL, *uid_range_inside = NULL, *gid_range_outside = NULL, *gid_range_inside = NULL;
+ r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &uid_range_outside);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load outside UID range of provided userns: %m");
+ r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_INSIDE, &uid_range_inside);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load inside UID range of provided userns: %m");
+ r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &gid_range_outside);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load outside GID range of provided userns: %m");
+ r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_INSIDE, &gid_range_inside);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load inside GID range of provided userns: %m");
+
+ /* Be very strict for now */
+ if (!uid_range_equal(uid_range_outside, gid_range_outside) ||
+ !uid_range_equal(uid_range_inside, gid_range_inside) ||
+ uid_range_outside->n_entries != 1 ||
+ uid_range_outside->entries[0].nr != 0x10000 ||
+ uid_range_inside->n_entries != 1 ||
+ uid_range_inside->entries[0].start != 0 ||
+ uid_range_inside->entries[0].nr != 0x10000)
+ return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+ start = uid_range_outside->entries[0].start;
+ } else
+ start = 0;
+
+ _cleanup_free_ char *new_uid_map = NULL;
+ switch (p.mode) {
+ case MOUNT_MAP_ROOT:
+ r = strextendf(&new_uid_map, UID_FMT " " UID_FMT " " UID_FMT,
+ peer_uid, start, (uid_t) 1);
+ break;
+ case MOUNT_MAP_FOREIGN:
+ r = strextendf(&new_uid_map, UID_FMT " " UID_FMT " " UID_FMT,
+ (uid_t) FOREIGN_UID_MIN, start, (uid_t) 0x10000);
+ break;
+ default:
+ assert_not_reached();
+ }
+ if (r < 0)
+ return r;
+
+ _cleanup_close_ int idmap_userns_fd = userns_acquire(new_uid_map, new_uid_map);
+ if (idmap_userns_fd < 0)
+ return log_debug_errno(idmap_userns_fd, "Failed to acquire user namespace for id mapping: %m");
+
+ if (mount_setattr(mount_fd, "", AT_EMPTY_PATH,
+ &(struct mount_attr) {
+ .attr_set = MOUNT_ATTR_IDMAP,
+ .userns_fd = idmap_userns_fd,
+ .propagation = MS_PRIVATE,
+ }, MOUNT_ATTR_SIZE_VER0) < 0)
+ return log_debug_errno(errno, "Failed to enable id mapping: %m");
+ }
+
+ if (userns_fd >= 0) {
+ r = nsresource_add_mount(userns_fd, mount_fd);
+ if (r < 0)
+ return r;
+ }
+
+ int fd_idx = sd_varlink_push_fd(link, mount_fd);
+ if (fd_idx < 0)
+ return fd_idx;
+
+ TAKE_FD(mount_fd);
+
+ return sd_varlink_replybo(
+ link,
+ SD_JSON_BUILD_PAIR("mountFileDescriptor", SD_JSON_BUILD_INTEGER(fd_idx)));
+}
+
static int process_connection(sd_varlink_server *server, int _fd) {
_cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */
_cleanup_(sd_varlink_close_unrefp) sd_varlink *vl = NULL;
r = sd_varlink_server_bind_method_many(
server,
- "io.systemd.MountFileSystem.MountImage", vl_method_mount_image);
+ "io.systemd.MountFileSystem.MountImage", vl_method_mount_image,
+ "io.systemd.MountFileSystem.MountDirectory", vl_method_mount_directory);
if (r < 0)
return log_error_errno(r, "Failed to bind methods: %m");
SD_VARLINK_DEFINE_OUTPUT(imageName, SD_VARLINK_STRING, SD_VARLINK_NULLABLE),
SD_VARLINK_DEFINE_OUTPUT(imageUuid, SD_VARLINK_STRING, SD_VARLINK_NULLABLE));
+static SD_VARLINK_DEFINE_ENUM_TYPE(
+ MountMapMode,
+ SD_VARLINK_FIELD_COMMENT("Map the caller's UID to root in the user namespace, do not map anything else."),
+ SD_VARLINK_DEFINE_ENUM_VALUE(root),
+ SD_VARLINK_FIELD_COMMENT("Map the foreign UID range to the base UID range in the user namespace (i.e. UID zero and above), covering 64K users."),
+ SD_VARLINK_DEFINE_ENUM_VALUE(foreign),
+ SD_VARLINK_FIELD_COMMENT("Apply an identity (1:1) mapping, but limit it to 64K users."),
+ SD_VARLINK_DEFINE_ENUM_VALUE(identity),
+ SD_VARLINK_FIELD_COMMENT("Determine automatically based on provided directory and caller."),
+ SD_VARLINK_DEFINE_ENUM_VALUE(auto));
+
+static SD_VARLINK_DEFINE_METHOD(
+ MountDirectory,
+ SD_VARLINK_FIELD_COMMENT("Directory file descriptor of the directory to assign to the user namespace. Must be a regular, i.e. non-O_PATH file descriptor."),
+ SD_VARLINK_DEFINE_INPUT(directoryFileDescriptor, SD_VARLINK_INT, 0),
+ SD_VARLINK_FIELD_COMMENT("File descriptor to the user namespace to assign this directory to. If not specified uses the host user namespace."),
+ SD_VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("Whether to mark the resulting mount file descriptor as read-only. If not specified defaults to false."),
+ SD_VARLINK_DEFINE_INPUT(readOnly, SD_VARLINK_BOOL, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("Which kinda of UID/GID mapping to apply to the resulting mount file descriptor."),
+ SD_VARLINK_DEFINE_INPUT_BY_TYPE(mode, MountMapMode, SD_VARLINK_NULLABLE),
+ VARLINK_DEFINE_POLKIT_INPUT,
+ SD_VARLINK_FIELD_COMMENT("The freshly allocated mount file descriptor for the mount."),
+ SD_VARLINK_DEFINE_OUTPUT(mountFileDescriptor, SD_VARLINK_INT, 0));
+
static SD_VARLINK_DEFINE_ERROR(IncompatibleImage);
static SD_VARLINK_DEFINE_ERROR(MultipleRootPartitionsFound);
static SD_VARLINK_DEFINE_ERROR(RootPartitionNotFound);
SD_VARLINK_DEFINE_INTERFACE(
io_systemd_MountFileSystem,
"io.systemd.MountFileSystem",
+ SD_VARLINK_INTERFACE_COMMENT("APIs for unpriviliged mounting."),
+ SD_VARLINK_SYMBOL_COMMENT("Encodes the designated purpose of a partition."),
&vl_type_PartitionDesignator,
+ SD_VARLINK_SYMBOL_COMMENT("Information about a specific partition."),
&vl_type_PartitionInfo,
+ SD_VARLINK_SYMBOL_COMMENT("Selects the type of UID/GID mapping to apply."),
+ &vl_type_MountMapMode,
+ SD_VARLINK_SYMBOL_COMMENT("Takes a disk image file descriptor as input, returns a set of mount file descriptors for it."),
&vl_method_MountImage,
+ SD_VARLINK_SYMBOL_COMMENT("Takes a directory file descriptor as input, returns a mount file descriptor."),
+ &vl_method_MountDirectory,
&vl_error_IncompatibleImage,
&vl_error_MultipleRootPartitionsFound,
&vl_error_RootPartitionNotFound,