/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include <sched.h>
#include <stdlib.h>
#include <sys/mount.h>
+#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>
#include "process-util.h"
#include "runtime-scope.h"
#include "set.h"
+#include "socket-util.h"
#include "sort-util.h"
#include "stat-util.h"
#include "string-util.h"
return 1;
}
+int mount_fd_clone(int mount_fd, bool recursive, int *replacement_fd) {
+ const int flags = OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_EMPTY_PATH|(recursive ? AT_RECURSIVE : 0);
+ int r;
+
+ assert(mount_fd >= 0);
+
+ /* Clones the mount referenced by mount_fd via open_tree(OPEN_TREE_CLONE) and returns a new file
+ * descriptor for the clone, or a negative errno-style value on failure. If 'recursive' is true,
+ * AT_RECURSIVE is added to the open_tree() flags. 'replacement_fd' is optional (may be NULL) and is
+ * only touched when the fallback path further down is taken: in that case the fd it points to is
+ * replaced (via close_and_replace()) by a second clone.
+ *
+ * If the input mount fd is supposed to remain clonable after calling this function, call it as
+ * follows: mount_fd_clone(mount_fd, recursive, &mount_fd). */
+
+ /* Clone a detached mount (that may be owned by a foreign mountns, e.g. mountfsd's). For this to
+ * work on older kernels, we have to jump through some hoops, because the kernel currently doesn't
+ * allow us to just call open_tree(OPEN_TREE_CLONE) directly to get a clone of a mount that is
+ * detached and owned by another mountns. Hence here's what we do: we fork a short-lived child in a
+ * new mount namespace owned by our userns. There, we attach the mount (invisible to anyone else).
+ * This is sufficient to pass the kernel check, so next we use open_tree(OPEN_TREE_CLONE) to get our
+ * own detached mount. This we send back to the parent, which then can use it. */
+
+ r = RET_NERRNO(open_tree(mount_fd, "", flags));
+ if (r != -EINVAL)
+ /* The straightforward path just works? Yay! Don't bother with the complex logic below. No
+ * need to put a replacement fd in replacement_fd as the original fd is still usable. Note
+ * that any failure other than -EINVAL is also returned to the caller right here; only
+ * -EINVAL triggers the fallback. */
+ return r;
+
+ _cleanup_close_pair_ int transfer_fds[2] = EBADF_PAIR; /* [1]: child sends cloned fds, [0]: parent receives them */
+ r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, transfer_fds);
+ if (r < 0)
+ return log_debug_errno(errno, "Failed to create socket pair: %m");
+
+ _cleanup_close_pair_ int errno_pipe_fds[2] = EBADF_PAIR; /* [1]: child reports an error code, [0]: parent reads it */
+ if (pipe2(errno_pipe_fds, O_CLOEXEC|O_NONBLOCK) < 0)
+ return log_debug_errno(errno, "Failed to open pipe: %m");
+
+ /* Fork a child. Note that we set FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE here, i.e. get a new mount
+ * namespace. Only mount_fd and the write ends of the socket pair and the errno pipe are listed as
+ * fds to keep open in the child. FORK_WAIT is set as well.
+ * NOTE(review): presumably FORK_WAIT means safe_fork_full() only returns in the parent once the
+ * child has exited, and reports an unsuccessful child exit as a negative return value - confirm
+ * against safe_fork_full(). */
+ r = safe_fork_full(
+ "(sd-clonemnt)",
+ /* stdio_fds= */ NULL,
+ (int[]) { mount_fd, transfer_fds[1], errno_pipe_fds[1] }, 3,
+ FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_REOPEN_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE,
+ /* ret_pid= */ NULL);
+ if (r < 0) { /* Failure: prefer an error code the child reported via the pipe over the generic one in r */
+ errno_pipe_fds[1] = safe_close(errno_pipe_fds[1]); /* Drop our copy of the write end before reading */
+
+ int q = read_errno(errno_pipe_fds[0]);
+ if (q < 0 && q != -EIO) /* -EIO is not propagated; in that case fall back to r below */
+ return q;
+
+ return r;
+ }
+ if (r == 0) { /* Child */
+
+ /* Attach mount */
+ if (move_mount(mount_fd, "", -EBADF, "/", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
+ log_debug_errno(errno, "Failed to move mount file descriptor to '/': %m");
+ report_errno_and_exit(errno_pipe_fds[1], -errno);
+ }
+
+ /* If requested by the caller, we clone the fd twice. Why? After move_mount(), the input file
+ * descriptor can't be move_mount()'ed again, which means we can't clone it again if it comes
+ * from a different mount namespace. To ensure they can clone the same fd multiple times,
+ * callers can pass a pointer to the input fd which will be replaced with a second clone,
+ * which can be move_mount()'ed and thus can be cloned again. */
+
+ for (int i = 0; i < 1 + !!replacement_fd; i++) { /* One iteration, or two if replacement_fd is non-NULL */
+ /* And now clone the attached mount that is now ours. */
+ _cleanup_close_ int cloned_fd = open_tree(mount_fd, "", flags);
+ if (cloned_fd < 0) {
+ log_debug_errno(errno, "Failed to clone mount file descriptor: %m");
+ report_errno_and_exit(errno_pipe_fds[1], -errno);
+ }
+
+ /* And send it to the parent. */
+ r = send_one_fd(transfer_fds[1], cloned_fd, /* flags= */ 0);
+ if (r < 0)
+ report_errno_and_exit(errno_pipe_fds[1], r);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ transfer_fds[1] = safe_close(transfer_fds[1]); /* Parent: the sending side is only needed by the child */
+
+ /* Accept the new cloned mount. The first fd received becomes our return value. */
+ _cleanup_close_ int fd1 = receive_one_fd(transfer_fds[0], 0);
+ if (fd1 < 0)
+ return fd1;
+
+ if (replacement_fd) {
+ int fd2 = receive_one_fd(transfer_fds[0], 0); /* Second clone, sent by the child's second loop iteration */
+ if (fd2 < 0)
+ return fd2;
+
+ close_and_replace(*replacement_fd, fd2); /* Hands fd2 over to the caller in place of the old fd */
+ }
+
+ return TAKE_FD(fd1);
+}
+
int make_userns(uid_t uid_shift,
uid_t uid_range,
uid_t source_owner,
#include "process-util.h"
#include "random-util.h"
#include "rm-rf.h"
+#include "socket-util.h"
#include "string-util.h"
#include "strv.h"
#include "tests.h"
ASSERT_ERROR(umountat_detach_verbose(LOG_ERR, dfd, "foo"), EINVAL);
}
+TEST(mount_fd_clone) {
+ _cleanup_(rm_rf_physical_and_freep) char *t = NULL;
+ _cleanup_close_pair_ int fds[2] = EBADF_PAIR;
+ int r;
+ /* Exercises mount_fd_clone() with a mount fd created by another process: a first child mounts a
+ * tmpfs, clones it as a detached mount fd and passes that fd to us; we then clone it twice with
+ * mount_fd_clone(), and a second child attaches the first clone and checks its contents.
+ * NOTE(review): FORK_COMMON_FLAGS is defined outside this block; presumably it includes a new
+ * mount namespace and waiting for the child, so that a failing child fails the ASSERT_OK() around
+ * safe_fork_full() - confirm. */
+ CHECK_PRIV;
+
+ ASSERT_OK(mkdtemp_malloc(NULL, &t));
+
+ /* Set up a socket pair to transfer the mount fd from the child (in a different mountns) to us. */
+ ASSERT_OK_ERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, fds));
+
+ r = ASSERT_OK(safe_fork_full(
+ "(mount-fd-clone-setup)",
+ /* stdio_fds= */ NULL,
+ &fds[1], 1,
+ FORK_COMMON_FLAGS,
+ NULL));
+ if (r == 0) {
+ /* Create a tmpfs mount in this child's mountns. */
+ ASSERT_OK(mount_nofollow_verbose(LOG_ERR, "tmpfs", t, "tmpfs", 0, NULL));
+
+ /* Create a file in it to verify the mount later. */
+ _cleanup_free_ char *marker = ASSERT_NOT_NULL(path_join(t, "marker"));
+ ASSERT_OK(touch(marker));
+
+ /* Clone the mount as a detached mount fd. */
+ _cleanup_close_ int mount_fd = ASSERT_OK_ERRNO(open_tree(AT_FDCWD, t, OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC));
+
+ /* Send the mount fd to the parent. */
+ ASSERT_OK(send_one_fd(fds[1], mount_fd, 0));
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ fds[1] = safe_close(fds[1]); /* The sending side was only needed by the child */
+
+ /* Parent: Receive the mount fd, clone it with mount_fd_clone(), and verify we can attach it. The
+ * first call passes &foreign_mount_fd as replacement_fd so that the fd can be cloned again; the
+ * second call then clones it once more, this time without requesting a replacement. Only the
+ * success of the second call is checked, its result is otherwise unused. */
+ _cleanup_close_ int foreign_mount_fd = ASSERT_OK(receive_one_fd(fds[0], 0));
+ _cleanup_close_ int first_clone = ASSERT_OK(
+ mount_fd_clone(foreign_mount_fd, /* recursive= */ true, &foreign_mount_fd));
+ _cleanup_close_ _unused_ int second_clone = ASSERT_OK(
+ mount_fd_clone(foreign_mount_fd, /* recursive= */ true, /* replacement_fd= */ NULL));
+ _cleanup_free_ char *target = ASSERT_NOT_NULL(path_join(t, "target")); /* Attach point for the clone, created below the temporary directory */
+ ASSERT_OK_ERRNO(mkdir(target, 0755));
+
+ r = ASSERT_OK(safe_fork_full(
+ "(mount-fd-clone-verify)",
+ /* stdio_fds= */ NULL,
+ &first_clone, 1,
+ FORK_COMMON_FLAGS,
+ NULL));
+ if (r == 0) {
+ /* Child: attach the first clone at the target directory. */
+ ASSERT_OK_ERRNO(move_mount(first_clone, "", AT_FDCWD, target, MOVE_MOUNT_F_EMPTY_PATH));
+
+ /* The marker file created in the original tmpfs must be visible through the attached clone. */
+ _cleanup_free_ char *marker = ASSERT_NOT_NULL(path_join(target, "marker"));
+ ASSERT_OK_ERRNO(access(marker, F_OK));
+
+ _exit(EXIT_SUCCESS);
+ }
+}
+
DEFINE_TEST_MAIN(LOG_DEBUG); /* NOTE(review): presumably expands to the program's main(), running the TEST() cases above at debug log level - confirm in tests.h */