From: Daan De Meyer Date: Fri, 28 Nov 2025 18:06:37 +0000 (+0100) Subject: mount-util: Add mount_fd_clone() helper X-Git-Tag: v259~30^2~2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=273c6bc045fdfa50c693d0b88ec2a4cbc9258329;p=thirdparty%2Fsystemd.git mount-util: Add mount_fd_clone() helper The kernel prevents you from open_tree()'ing an open_tree() fd unless it was created from the caller's mount namespace. For various use cases, we want to be able to open_tree() arbitrary mount file descriptors. Turns out there's a way go get around it by mounting the mount file descriptor in a throw-away mount namespace and then open_tree()'ing the mount file descriptor. Let's implement this as a new helper mount_fd_clone() and add a test for it. Because move_mount()'ing the original fd makes it pretty useless as it can't be move_mount()'ed again, we optionally make a second clone which can replace the original fd so it can be cloned again later. --- diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index 0cfcfd9cc56..ec538ec8d80 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -1,7 +1,9 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include #include #include +#include #include #include @@ -30,6 +32,7 @@ #include "process-util.h" #include "runtime-scope.h" #include "set.h" +#include "socket-util.h" #include "sort-util.h" #include "stat-util.h" #include "string-util.h" @@ -1421,6 +1424,103 @@ int fd_make_mount_point(int fd) { return 1; } +int mount_fd_clone(int mount_fd, bool recursive, int *replacement_fd) { + const int flags = OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_EMPTY_PATH|(recursive ? AT_RECURSIVE : 0); + int r; + + assert(mount_fd >= 0); + + /* If the input mount fd is supposed to remain clonable after calling this function, call it as + * follows: mount_fd_clone(mount_fd, recursive, &mount_fd). Returns the cloned mount fd on success and a negative errno-style value on failure; replacement_fd may be NULL. */ + + /* Clone a detached mount (that may be owned by a foreign mountns, e.g. mountfsd's). 
For this to + * work on older kernels, we have to jump through some hoops, because the kernel currently doesn't + * allow us to just call open_tree(OPEN_TREE_CLONE) directly to get a clone of a mount that is + * detached and owned by another mountns. Hence here's what we do: we fork a short-lived child in a + * new mount namespace owned by our userns. There, we attach the mount (invisible to anyone else). + * This is sufficient to pass the kernel check, so next we use open_tree(OPEN_TREE_CLONE) to get our + * own detached mount. This we send back to the parent, which then can use it. */ + + r = RET_NERRNO(open_tree(mount_fd, "", flags)); + if (r != -EINVAL) + /* The straightforward path just works (or failed with something other than EINVAL)? Then don't bother with the complex logic below. No + * need to put a replacement fd in replacement_fd as the original fd is still usable. */ + return r; + + _cleanup_close_pair_ int transfer_fds[2] = EBADF_PAIR; + r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, transfer_fds); + if (r < 0) + return log_debug_errno(errno, "Failed to create socket pair: %m"); + + _cleanup_close_pair_ int errno_pipe_fds[2] = EBADF_PAIR; + if (pipe2(errno_pipe_fds, O_CLOEXEC|O_NONBLOCK) < 0) + return log_debug_errno(errno, "Failed to open pipe: %m"); + + /* Fork a child. Note that we set FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE here, i.e. 
the child gets its own mount namespace. */ + r = safe_fork_full( + "(sd-clonemnt)", + /* stdio_fds= */ NULL, + (int[]) { mount_fd, transfer_fds[1], errno_pipe_fds[1] }, 3, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_REOPEN_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, + /* ret_pid= */ NULL); + if (r < 0) { /* Prefer an error the child reported through the errno pipe; otherwise fall back to the error from safe_fork_full() itself. */ + errno_pipe_fds[1] = safe_close(errno_pipe_fds[1]); + + int q = read_errno(errno_pipe_fds[0]); + if (q < 0 && q != -EIO) + return q; + + return r; + } + if (r == 0) { /* Child */ + + /* Attach the mount at "/" of the child's mount namespace. */ + if (move_mount(mount_fd, "", -EBADF, "/", MOVE_MOUNT_F_EMPTY_PATH) < 0) { + log_debug_errno(errno, "Failed to move mount file descriptor to '/': %m"); + report_errno_and_exit(errno_pipe_fds[1], -errno); + } + + /* If requested by the caller, we clone the fd twice. Why? After move_mount(), the input file + * descriptor can't be move_mount()'ed again, which means we can't clone it again if it comes + * from a different mount namespace. To ensure they can clone the same fd multiple times, + * callers can pass a pointer to the input fd which will be replaced with a second clone, + * which can be move_mount()'ed and thus can be cloned again. */ + + for (int i = 0; i < 1 + !!replacement_fd; i++) { + /* And now clone the attached mount that is now ours. */ + _cleanup_close_ int cloned_fd = open_tree(mount_fd, "", flags); + if (cloned_fd < 0) { + log_debug_errno(errno, "Failed to clone mount file descriptor: %m"); + report_errno_and_exit(errno_pipe_fds[1], -errno); + } + + /* And send it to the parent. 
*/ + r = send_one_fd(transfer_fds[1], cloned_fd, /* flags= */ 0); + if (r < 0) + report_errno_and_exit(errno_pipe_fds[1], r); + } + + _exit(EXIT_SUCCESS); + } + + transfer_fds[1] = safe_close(transfer_fds[1]); + + /* Accept the new cloned mount. This first clone is the fd we hand back to the caller. */ + _cleanup_close_ int fd1 = receive_one_fd(transfer_fds[0], 0); + if (fd1 < 0) + return fd1; + + if (replacement_fd) { /* The second clone takes the place of the fd the caller passed in via *replacement_fd. */ + int fd2 = receive_one_fd(transfer_fds[0], 0); + if (fd2 < 0) + return fd2; + + close_and_replace(*replacement_fd, fd2); + } + + return TAKE_FD(fd1); +} + int make_userns(uid_t uid_shift, uid_t uid_range, uid_t source_owner, diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h index 0fedbe44f4c..8d907988ebc 100644 --- a/src/shared/mount-util.h +++ b/src/shared/mount-util.h @@ -121,6 +121,8 @@ int mount_image_in_namespace( int make_mount_point(const char *path); int fd_make_mount_point(int fd); +int mount_fd_clone(int mount_fd, bool recursive, int *replacement_fd); + typedef enum RemountIdmapping { REMOUNT_IDMAPPING_NONE, /* Include a mapping from UID_MAPPED_ROOT (i.e. UID 2^31-2) on the backing fs to UID 0 on the diff --git a/src/test/test-mount-util.c b/src/test/test-mount-util.c index 7e40ca79ea3..137b58b4d4c 100644 --- a/src/test/test-mount-util.c +++ b/src/test/test-mount-util.c @@ -19,6 +19,7 @@ #include "process-util.h" #include "random-util.h" #include "rm-rf.h" +#include "socket-util.h" #include "string-util.h" #include "strv.h" #include "tests.h" @@ -541,4 +542,66 @@ TEST(umountat) { ASSERT_ERROR(umountat_detach_verbose(LOG_ERR, dfd, "foo"), EINVAL); } +TEST(mount_fd_clone) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_pair_ int fds[2] = EBADF_PAIR; + int r; + + CHECK_PRIV; + + ASSERT_OK(mkdtemp_malloc(NULL, &t)); + + /* Set up a socket pair to transfer the mount fd from the child (in a different mountns) to us. 
*/ + ASSERT_OK_ERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, fds)); + + r = ASSERT_OK(safe_fork_full( + "(mount-fd-clone-setup)", + /* stdio_fds= */ NULL, + &fds[1], 1, + FORK_COMMON_FLAGS, + NULL)); + if (r == 0) { + /* Create a tmpfs mount in this child's mountns. */ + ASSERT_OK(mount_nofollow_verbose(LOG_ERR, "tmpfs", t, "tmpfs", 0, NULL)); + + /* Create a file in it to verify the mount later. */ + _cleanup_free_ char *marker = ASSERT_NOT_NULL(path_join(t, "marker")); + ASSERT_OK(touch(marker)); + + /* Clone the mount as a detached mount fd. */ + _cleanup_close_ int mount_fd = ASSERT_OK_ERRNO(open_tree(AT_FDCWD, t, OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC)); + + /* Send the mount fd to the parent. */ + ASSERT_OK(send_one_fd(fds[1], mount_fd, 0)); + + _exit(EXIT_SUCCESS); + } + + fds[1] = safe_close(fds[1]); + + /* Parent: Receive the mount fd, clone it with mount_fd_clone(), and verify we can attach it. The first call passes &foreign_mount_fd as replacement_fd so that the fd can be cloned a second time below. */ + _cleanup_close_ int foreign_mount_fd = ASSERT_OK(receive_one_fd(fds[0], 0)); + _cleanup_close_ int first_clone = ASSERT_OK( + mount_fd_clone(foreign_mount_fd, /* recursive= */ true, &foreign_mount_fd)); + _cleanup_close_ _unused_ int second_clone = ASSERT_OK( + mount_fd_clone(foreign_mount_fd, /* recursive= */ true, /* replacement_fd= */ NULL)); + _cleanup_free_ char *target = ASSERT_NOT_NULL(path_join(t, "target")); + ASSERT_OK_ERRNO(mkdir(target, 0755)); + + r = ASSERT_OK(safe_fork_full( + "(mount-fd-clone-verify)", + /* stdio_fds= */ NULL, + &first_clone, 1, + FORK_COMMON_FLAGS, + NULL)); + if (r == 0) { + ASSERT_OK_ERRNO(move_mount(first_clone, "", AT_FDCWD, target, MOVE_MOUNT_F_EMPTY_PATH)); + + _cleanup_free_ char *marker = ASSERT_NOT_NULL(path_join(target, "marker")); + ASSERT_OK_ERRNO(access(marker, F_OK)); + + _exit(EXIT_SUCCESS); + } +} + DEFINE_TEST_MAIN(LOG_DEBUG);