From 26f80a9d337bac087889bc9244ea4fad8ffb294f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 28 Nov 2025 12:21:41 +0100 Subject: [PATCH] namespace: port mount_private_apivfs() to fsopen() and friends This is not just refactoring, but has the big benefit that it makes us indepdendent from a temporary directory we might not have enough access to create. (This matters with the new PrivateUsers=managed). --- src/core/namespace.c | 60 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index 56fc268fef2..680ee262852 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1462,16 +1462,45 @@ static int mount_private_apivfs( assert(entry_path); assert(bind_source); - (void) mkdir_p_label(entry_path, 0755); + bool noprivs = false; + + /* First, check if we have enough privileges to mount a new instance. */ + _cleanup_close_ int mount_fd = make_fsmount( + LOG_DEBUG, + /* what= */ fstype, + fstype, + MS_NOSUID|MS_NOEXEC|MS_NODEV, + opts, + /* userns_fd= */ -EBADF); + if (ERRNO_IS_NEG_PRIVILEGE(mount_fd)) + noprivs = true; + else if (ERRNO_IS_NEG_NOT_SUPPORTED(mount_fd)) { + /* Fallback for kernels lacking mount_setattr() */ + + // FIXME: This compatibility code path shall be removed once kernel 5.12 + // becomes the new minimal baseline + + r = create_temporary_mount_point(scope, &temporary_mount); + if (r < 0) + return r; - /* First, check if we have enough privileges to mount a new instance. Note, a new sysfs instance - * cannot be mounted on an already existing mount. Let's use a temporary place. */ - r = create_temporary_mount_point(scope, &temporary_mount); - if (r < 0) - return r; + r = mount_nofollow_verbose( + LOG_DEBUG, + fstype, + temporary_mount, + fstype, + MS_NOSUID|MS_NOEXEC|MS_NODEV, + opts); + if (ERRNO_IS_NEG_PRIVILEGE(r)) + noprivs = true; + else if (r < 0) + return r; + } else if (mount_fd < 0) + return log_debug_errno(mount_fd, "Failed to make file system mount: %m"); + + (void) mkdir_p_label(entry_path, 0755); - r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); - if (ERRNO_IS_NEG_PRIVILEGE(r)) { + if (noprivs) { /* When we do not have enough privileges to mount a new instance, fall back to use an * existing mount. */ @@ -1490,8 +1519,6 @@ static int mount_private_apivfs( return 1; } - if (r < 0) - return r; /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */ r = umount_recursive(entry_path, /* flags= */ 0); @@ -1499,9 +1526,16 @@ static int mount_private_apivfs( log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", entry_path); /* Then, move the new mount instance. */ - r = mount_nofollow_verbose(LOG_DEBUG, temporary_mount, entry_path, /* fstype= */ NULL, MS_MOVE, /* options= */ NULL); - if (r < 0) - return r; + if (mount_fd >= 0) { + r = RET_NERRNO(move_mount(mount_fd, "", -EBADF, entry_path, MOVE_MOUNT_F_EMPTY_PATH)); + if (r < 0) + return log_debug_errno(r, "Failed to attach '%s' to '%s': %m", fstype, entry_path); + } else if (temporary_mount) { + r = mount_nofollow_verbose(LOG_DEBUG, temporary_mount, entry_path, /* fstype= */ NULL, MS_MOVE, /* options= */ NULL); + if (r < 0) + return r; + } else + assert_not_reached(); /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn * where a bunch of files are overmounted, in particular the boot id. */ -- 2.47.3