]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
mountfsd: make MountDirectory() work with systemd-homed
authorLennart Poettering <lennart@poettering.net>
Sat, 23 Aug 2025 06:15:01 +0000 (08:15 +0200)
committerDaan De Meyer <daan.j.demeyer@gmail.com>
Wed, 15 Oct 2025 08:21:59 +0000 (10:21 +0200)
systemd-homed already applies an idmap to its mounts, hence we need to
undo it before we can create our own.

src/mountfsd/mountwork.c
src/nspawn/nspawn-mount.c
src/shared/mount-util.c
src/shared/mount-util.h
units/systemd-mountfsd.service.in

index 74b97d7371ad8b207281a5376ec686d011d042d1..401d8c6dfc710efea845383a31ae79ffbd8a1eef 100644 (file)
@@ -27,6 +27,7 @@
 #include "loop-util.h"
 #include "main-func.h"
 #include "memory-util.h"
+#include "mount-util.h"
 #include "namespace-util.h"
 #include "nsresource.h"
 #include "nulstr-util.h"
@@ -41,6 +42,7 @@
 #include "time-util.h"
 #include "uid-classification.h"
 #include "uid-range.h"
+#include "user-util.h"
 #include "varlink-io.systemd.MountFileSystem.h"
 #include "varlink-util.h"
 
@@ -637,10 +639,16 @@ static MountMapMode default_mount_map_mode(DirectoryOwnership ownership) {
 
 static JSON_DISPATCH_ENUM_DEFINE(dispatch_mount_directory_mode, MountMapMode, mount_map_mode_from_string);
 
-static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
+static DirectoryOwnership validate_directory_fd(
+                int fd,
+                uid_t peer_uid,
+                uid_t *ret_current_owner_uid) {
+
         int r, fl;
 
         assert(fd >= 0);
+        assert(uid_is_valid(peer_uid));
+        assert(ret_current_owner_uid);
 
         /* Checks if the specified directory fd looks sane. Returns a DirectoryOwnership that categorizes the
          * ownership situation in comparison to the peer's UID.
@@ -665,6 +673,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
                 return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m");
 
         if (st.st_uid == 0) {
+                *ret_current_owner_uid = st.st_uid;
                 if (peer_uid == 0) {
                         log_debug("Directory file descriptor points to root owned directory, who is also the peer.");
                         return DIRECTORY_IS_ROOT_PEER_OWNED;
@@ -674,6 +683,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
         }
         if (st.st_uid == peer_uid) {
                 log_debug("Directory file descriptor points to peer owned directory.");
+                *ret_current_owner_uid = st.st_uid;
                 return DIRECTORY_IS_PEER_OWNED;
         }
 
@@ -687,12 +697,14 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
                 /* Stop iteration if we find a directory up the tree that is neither owned by the user, nor is from the foreign UID range */
                 if (!uid_is_foreign(st.st_uid) || !gid_is_foreign(st.st_gid)) {
                         log_debug("Directory file descriptor points to directory which itself or its parents is neither owned by foreign UID range nor by the user.");
+                        *ret_current_owner_uid = st.st_uid;
                         return DIRECTORY_IS_OTHERWISE_OWNED;
                 }
 
                 /* If the peer is root, then it doesn't matter if we find a parent owned by root, let's shortcut things. */
                 if (peer_uid == 0) {
                         log_debug("Directory file descriptor is owned by foreign UID range, and peer is root.");
+                        *ret_current_owner_uid = st.st_uid;
                         return DIRECTORY_IS_FOREIGN_OWNED;
                 }
 
@@ -708,11 +720,13 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
                 /* Safety check to see if we hit the root dir */
                 if (stat_inode_same(&st, &new_st)) {
                         log_debug("Directory file descriptor is owned by foreign UID range, but didn't find parent directory that is owned by peer among ancestors.");
+                        *ret_current_owner_uid = st.st_uid;
                         return DIRECTORY_IS_OTHERWISE_OWNED;
                 }
 
                 if (new_st.st_uid == peer_uid) { /* Parent inode is owned by the peer. That's good! Everything's fine. */
                         log_debug("Directory file descriptor is owned by foreign UID range, and ancestor is owned by peer.");
+                        *ret_current_owner_uid = st.st_uid;
                         return DIRECTORY_IS_FOREIGN_OWNED;
                 }
 
@@ -721,6 +735,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
         }
 
         log_debug("Failed to find peer owned parent directory after %u levels, refusing.", n_level);
+        *ret_current_owner_uid = st.st_uid;
         return DIRECTORY_IS_OTHERWISE_OWNED;
 }
 
@@ -771,7 +786,8 @@ static int vl_method_mount_directory(
         if (r < 0)
                 return log_debug_errno(r, "Failed to get client UID: %m");
 
-        DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid);
+        uid_t current_owner_uid;
+        DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid, &current_owner_uid);
         if (owned_by == -EREMOTEIO)
                 return sd_varlink_errorbo(link, "io.systemd.MountFileSystem.BadFileDescriptorFlags", SD_JSON_BUILD_PAIR_STRING("parameter", "directoryFileDescriptor"));
         if (owned_by < 0)
@@ -836,10 +852,28 @@ static int vl_method_mount_directory(
         if (r < 0)
                 return r;
 
-        _cleanup_close_ int mount_fd = open_tree(directory_fd, "", OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
+        _cleanup_close_ int mount_fd = open_tree_try_drop_idmap(
+                        directory_fd,
+                        "",
+                        OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
         if (mount_fd < 0)
                 return log_debug_errno(errno, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path));
 
+        /* MOUNT_ATTR_IDMAP has possibly been cleared. Let's verify that the underlying data matches our expectations. */
+        struct stat unmapped_st;
+        if (fstat(mount_fd, &unmapped_st) < 0)
+                return log_debug_errno(errno, "Failed to stat unmapped inode: %m");
+
+        r = stat_verify_directory(&unmapped_st);
+        if (r < 0)
+                return r;
+
+        /* For now, let's simply refuse things if dropping the idmapping changed anything. For now that
+         * should be good enough, because the primary usecase for this (homed) will mount the foreign UID
+         * range 1:1. */
+        if (unmapped_st.st_uid != current_owner_uid)
+                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Owner UID of mount after clearing ID mapping not the same anymore, refusing.");
+
         if (p.read_only > 0 && mount_setattr(
                             mount_fd, "", AT_EMPTY_PATH,
                             &(struct mount_attr) {
index 56870a341f735808d36a628a05c36030d54c5181..f33e8bc0086de8ffcbfb051ffdc7a13a9fe65199 100644 (file)
@@ -826,17 +826,16 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
          * caller's userns *without* any mount idmapping in place. To get that uid, we clone the
          * mount source tree and clear any existing idmapping and temporarily mount that tree over
          * the mount source before we stat the mount source to figure out the source uid. */
-        _cleanup_close_ int fd_clone = open_tree_attr_with_fallback(
+        _cleanup_close_ int fd_clone =
+                idmapping == REMOUNT_IDMAPPING_NONE ?
+                RET_NERRNO(open_tree(
                         AT_FDCWD,
                         m->source,
-                        open_tree_flags,
-                        &(struct mount_attr) {
-                                .attr_clr = idmapping != REMOUNT_IDMAPPING_NONE ? MOUNT_ATTR_IDMAP : 0,
-                        });
-        if (ERRNO_IS_NEG_NOT_SUPPORTED(fd_clone))
-                /* We can only clear idmapped mounts with open_tree_attr(), but there might not be one in
-                 * the first place, so we keep going if we get a not supported error. */
-                fd_clone = open_tree(AT_FDCWD, m->source, open_tree_flags);
+                        open_tree_flags)) :
+                open_tree_try_drop_idmap(
+                        AT_FDCWD,
+                        m->source,
+                        open_tree_flags);
         if (fd_clone < 0)
                 return log_error_errno(errno, "Failed to clone %s: %m", m->source);
 
index 52f45b74e4dcc0df74ef836dbd7409c1ab930c5e..d4b86e3fe94108f47537011c8b0afe5d06c5075f 100644 (file)
@@ -1477,7 +1477,7 @@ int make_userns(uid_t uid_shift,
         return TAKE_FD(userns_fd);
 }
 
-int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flags, struct mount_attr *attr) {
+int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned flags, struct mount_attr *attr) {
         _cleanup_close_ int fd = -EBADF;
 
         assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
@@ -1507,6 +1507,42 @@ int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flag
         return TAKE_FD(fd);
 }
 
+int open_tree_try_drop_idmap(int dir_fd, const char *path, unsigned flags) {
+        /* Tries to drop MOUNT_ATTR_IDMAP while calling open_tree_attr(), but if that doesn't work just uses
+         * a regular open_tree() */
+
+        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+
+        if (isempty(path)) {
+                path = "";
+                flags |= AT_EMPTY_PATH;
+        }
+
+        _cleanup_close_ int fd = open_tree_attr_with_fallback(
+                        dir_fd,
+                        path,
+                        flags,
+                        &(struct mount_attr) {
+                                .attr_clr = MOUNT_ATTR_IDMAP,
+                        });
+        if (fd < 0) {
+                if (!ERRNO_IS_NEG_NOT_SUPPORTED(fd))
+                        return log_debug_errno(fd, "Failed to clear idmap of directory with open_tree_attr(): %m");
+
+                log_debug_errno(fd, "Failed to clear idmap with open_tree_attr(), retrying open_tree() without clearing idmap: %m");
+
+                fd = RET_NERRNO(open_tree(dir_fd, path, flags));
+                if (fd < 0)
+                        return log_debug_errno(fd, "Both open_tree() and open_tree_attr() failed, giving up: %m");
+
+                log_debug("open_tree() without clearing idmap worked.");
+                return TAKE_FD(fd);
+        }
+
+        log_debug("Successfully acquired mount fd with cleared idmap.");
+        return TAKE_FD(fd);
+}
+
 int remount_idmap_fd(
                 char **paths,
                 int userns_fd,
index e45139dcf387472322617ac16ec059b61c309ab4..902092be68b9470f44b61b202084ab4f3be76b38 100644 (file)
@@ -145,7 +145,8 @@ typedef enum RemountIdmapping {
         _REMOUNT_IDMAPPING_INVALID = -EINVAL,
 } RemountIdmapping;
 
-int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flags, struct mount_attr *attr);
+int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned flags, struct mount_attr *attr);
+int open_tree_try_drop_idmap(int dir_fd, const char *path, unsigned flags);
 
 int make_userns(uid_t uid_shift, uid_t uid_range, uid_t host_owner, uid_t dest_owner, RemountIdmapping idmapping);
 int remount_idmap_fd(char **p, int userns_fd, uint64_t extra_mount_attr_set);
index 6fd80359e32dc6f2d58d8c0081c5a03f1b019b17..73105007f925fbee17b1b46e00c9dc92743680f3 100644 (file)
@@ -29,9 +29,16 @@ ProtectHostname=yes
 RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6
 RestrictRealtime=yes
 RestrictSUIDSGID=yes
-SystemCallArchitectures=native
-SystemCallErrorNumber=EPERM
-SystemCallFilter=@system-service @mount
+
+# FIXME: libseccomp (as of 2.6.0) doesn't know the open_tree_attr() system call
+# yet which we need. Thus, applying any system call filter would block this
+# system call unconditionally, which is not desirable. This should be added
+# back once libseccomp is updated. See
+# https://github.com/seccomp/libseccomp/issues/465
+
+#SystemCallArchitectures=native
+#SystemCallErrorNumber=EPERM
+#SystemCallFilter=@system-service @mount
 Type=notify
 NotifyAccess=all
 FileDescriptorStoreMax=4096