From: Lennart Poettering Date: Sat, 23 Aug 2025 06:15:01 +0000 (+0200) Subject: mountfsd: make MountDirectory() work with systemd-homed X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=9f69ff69f7d141ec93eba5f6f73df59df4df73fa;p=thirdparty%2Fsystemd.git mountfsd: make MountDirectory() work with systemd-homed systemd-homed already applies an idmap to its mounts, hence we need to undo it before we can create our own. --- diff --git a/src/mountfsd/mountwork.c b/src/mountfsd/mountwork.c index 74b97d7371a..401d8c6dfc7 100644 --- a/src/mountfsd/mountwork.c +++ b/src/mountfsd/mountwork.c @@ -27,6 +27,7 @@ #include "loop-util.h" #include "main-func.h" #include "memory-util.h" +#include "mount-util.h" #include "namespace-util.h" #include "nsresource.h" #include "nulstr-util.h" @@ -41,6 +42,7 @@ #include "time-util.h" #include "uid-classification.h" #include "uid-range.h" +#include "user-util.h" #include "varlink-io.systemd.MountFileSystem.h" #include "varlink-util.h" @@ -637,10 +639,16 @@ static MountMapMode default_mount_map_mode(DirectoryOwnership ownership) { static JSON_DISPATCH_ENUM_DEFINE(dispatch_mount_directory_mode, MountMapMode, mount_map_mode_from_string); -static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) { +static DirectoryOwnership validate_directory_fd( + int fd, + uid_t peer_uid, + uid_t *ret_current_owner_uid) { + int r, fl; assert(fd >= 0); + assert(uid_is_valid(peer_uid)); + assert(ret_current_owner_uid); /* Checks if the specified directory fd looks sane. Returns a DirectoryOwnership that categorizes the * ownership situation in comparison to the peer's UID. @@ -665,6 +673,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) { return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m"); if (st.st_uid == 0) { + *ret_current_owner_uid = st.st_uid; if (peer_uid == 0) { log_debug("Directory file descriptor points to root owned directory, who is also the peer."); return DIRECTORY_IS_ROOT_PEER_OWNED; @@ -674,6 +683,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) { } if (st.st_uid == peer_uid) { log_debug("Directory file descriptor points to peer owned directory."); + *ret_current_owner_uid = st.st_uid; return DIRECTORY_IS_PEER_OWNED; } @@ -687,12 +697,14 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) { /* Stop iteration if we find a directory up the tree that is neither owned by the user, nor is from the foreign UID range */ if (!uid_is_foreign(st.st_uid) || !gid_is_foreign(st.st_gid)) { log_debug("Directory file descriptor points to directory which itself or its parents is neither owned by foreign UID range nor by the user."); + *ret_current_owner_uid = st.st_uid; return DIRECTORY_IS_OTHERWISE_OWNED; } /* If the peer is root, then it doesn't matter if we find a parent owned by root, let's shortcut things. */ if (peer_uid == 0) { log_debug("Directory file descriptor is owned by foreign UID range, and peer is root."); + *ret_current_owner_uid = st.st_uid; return DIRECTORY_IS_FOREIGN_OWNED; } @@ -708,11 +720,13 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) { /* Safety check to see if we hit the root dir */ if (stat_inode_same(&st, &new_st)) { log_debug("Directory file descriptor is owned by foreign UID range, but didn't find parent directory that is owned by peer among ancestors."); + *ret_current_owner_uid = st.st_uid; return DIRECTORY_IS_OTHERWISE_OWNED; } if (new_st.st_uid == peer_uid) { /* Parent inode is owned by the peer. That's good! Everything's fine. */ log_debug("Directory file descriptor is owned by foreign UID range, and ancestor is owned by peer."); + *ret_current_owner_uid = st.st_uid; return DIRECTORY_IS_FOREIGN_OWNED; } @@ -721,6 +735,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) { } log_debug("Failed to find peer owned parent directory after %u levels, refusing.", n_level); + *ret_current_owner_uid = st.st_uid; return DIRECTORY_IS_OTHERWISE_OWNED; } @@ -771,7 +786,8 @@ static int vl_method_mount_directory( if (r < 0) return log_debug_errno(r, "Failed to get client UID: %m"); - DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid); + uid_t current_owner_uid; + DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid, ¤t_owner_uid); if (owned_by == -EREMOTEIO) return sd_varlink_errorbo(link, "io.systemd.MountFileSystem.BadFileDescriptorFlags", SD_JSON_BUILD_PAIR_STRING("parameter", "directoryFileDescriptor")); if (owned_by < 0) @@ -836,10 +852,28 @@ static int vl_method_mount_directory( if (r < 0) return r; - _cleanup_close_ int mount_fd = open_tree(directory_fd, "", OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH); + _cleanup_close_ int mount_fd = open_tree_try_drop_idmap( + directory_fd, + "", + OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH); if (mount_fd < 0) return log_debug_errno(errno, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path)); + /* MOUNT_ATTR_IDMAP has possibly been cleared. Let's verify that the underlying data matches our expectations. */ + struct stat unmapped_st; + if (fstat(mount_fd, &unmapped_st) < 0) + return log_debug_errno(errno, "Failed to stat unmapped inode: %m"); + + r = stat_verify_directory(&unmapped_st); + if (r < 0) + return r; + + /* For now, let's simply refuse things if dropping the idmapping changed anything. For now that + * should be good enough, because the primary usecase for this (homed) will mount the foreign UID + * range 1:1. */ + if (unmapped_st.st_uid != current_owner_uid) + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Owner UID of mount after clearing ID mapping not the same anymore, refusing."); + if (p.read_only > 0 && mount_setattr( mount_fd, "", AT_EMPTY_PATH, &(struct mount_attr) { diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 56870a341f7..f33e8bc0086 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -826,17 +826,16 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u * caller's userns *without* any mount idmapping in place. To get that uid, we clone the * mount source tree and clear any existing idmapping and temporarily mount that tree over * the mount source before we stat the mount source to figure out the source uid. */ - _cleanup_close_ int fd_clone = open_tree_attr_with_fallback( + _cleanup_close_ int fd_clone = + idmapping == REMOUNT_IDMAPPING_NONE ? + RET_NERRNO(open_tree( AT_FDCWD, m->source, - open_tree_flags, - &(struct mount_attr) { - .attr_clr = idmapping != REMOUNT_IDMAPPING_NONE ? MOUNT_ATTR_IDMAP : 0, - }); - if (ERRNO_IS_NEG_NOT_SUPPORTED(fd_clone)) - /* We can only clear idmapped mounts with open_tree_attr(), but there might not be one in - * the first place, so we keep going if we get a not supported error. */ - fd_clone = open_tree(AT_FDCWD, m->source, open_tree_flags); + open_tree_flags)) : + open_tree_try_drop_idmap( + AT_FDCWD, + m->source, + open_tree_flags); if (fd_clone < 0) return log_error_errno(errno, "Failed to clone %s: %m", m->source); diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index 52f45b74e4d..d4b86e3fe94 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -1477,7 +1477,7 @@ int make_userns(uid_t uid_shift, return TAKE_FD(userns_fd); } -int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flags, struct mount_attr *attr) { +int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned flags, struct mount_attr *attr) { _cleanup_close_ int fd = -EBADF; assert(dir_fd >= 0 || dir_fd == AT_FDCWD); @@ -1507,6 +1507,42 @@ int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flag return TAKE_FD(fd); } +int open_tree_try_drop_idmap(int dir_fd, const char *path, unsigned flags) { + /* Tries to drop MOUNT_ATTR_IDMAP while calling open_tree_attr(), but if that doesn't work just uses + * a regular open_tree() */ + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + if (isempty(path)) { + path = ""; + flags |= AT_EMPTY_PATH; + } + + _cleanup_close_ int fd = open_tree_attr_with_fallback( + dir_fd, + path, + flags, + &(struct mount_attr) { + .attr_clr = MOUNT_ATTR_IDMAP, + }); + if (fd < 0) { + if (!ERRNO_IS_NEG_NOT_SUPPORTED(fd)) + return log_debug_errno(fd, "Failed to clear idmap of directory with open_tree_attr(): %m"); + + log_debug_errno(fd, "Failed to clear idmap with open_tree_attr(), retrying open_tree() without clearing idmap: %m"); + + fd = RET_NERRNO(open_tree(dir_fd, path, flags)); + if (fd < 0) + return log_debug_errno(fd, "Both open_tree() and open_tree_attr() failed, giving up: %m"); + + log_debug("open_tree() without clearing idmap worked."); + return TAKE_FD(fd); + } + + log_debug("Successfully acquired mount fd with cleared idmap."); + return TAKE_FD(fd); +} + int remount_idmap_fd( char **paths, int userns_fd, diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h index e45139dcf38..902092be68b 100644 --- a/src/shared/mount-util.h +++ b/src/shared/mount-util.h @@ -145,7 +145,8 @@ typedef enum RemountIdmapping { _REMOUNT_IDMAPPING_INVALID = -EINVAL, } RemountIdmapping; -int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flags, struct mount_attr *attr); +int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned flags, struct mount_attr *attr); +int open_tree_try_drop_idmap(int dir_fd, const char *path, unsigned flags); int make_userns(uid_t uid_shift, uid_t uid_range, uid_t host_owner, uid_t dest_owner, RemountIdmapping idmapping); int remount_idmap_fd(char **p, int userns_fd, uint64_t extra_mount_attr_set); diff --git a/units/systemd-mountfsd.service.in b/units/systemd-mountfsd.service.in index 6fd80359e32..73105007f92 100644 --- a/units/systemd-mountfsd.service.in +++ b/units/systemd-mountfsd.service.in @@ -29,9 +29,16 @@ ProtectHostname=yes RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 RestrictRealtime=yes RestrictSUIDSGID=yes -SystemCallArchitectures=native -SystemCallErrorNumber=EPERM -SystemCallFilter=@system-service @mount + +# FIXME: libseccomp (as of 2.6.0) doesn't know the open_tree_attr() system call +# yet which we need. Thus, applying any system call filter would block this +# system call unconditionally, which is not desirable. This should be added +# back once libseccomp is updated. See +# https://github.com/seccomp/libseccomp/issues/465 + +#SystemCallArchitectures=native +#SystemCallErrorNumber=EPERM +#SystemCallFilter=@system-service @mount Type=notify NotifyAccess=all FileDescriptorStoreMax=4096