From: Mike Yuan <me@yhndnzj.com>
Date: Fri, 7 Nov 2025 20:53:02 +0000 (+0100)
Subject: core/exec-credential: port to new mount API, ensure atomicity for creds installation
X-Git-Tag: v259-rc1~92^2~4
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=d796c6b7c64bd47b192c5351c955e9b8f4298bd4;p=thirdparty%2Fsystemd.git

core/exec-credential: port to new mount API, ensure atomicity for creds installation

This allows us to kill a great deal of complexity imposed
by the mountns and workspace reuse.
---

diff --git a/src/core/exec-credential.c b/src/core/exec-credential.c
index 7c1c98349a8..632365fbac7 100644
--- a/src/core/exec-credential.c
+++ b/src/core/exec-credential.c
@@ -796,27 +796,18 @@ static int acquire_credentials(
                 const CGroupContext *cgroup_context,
                 const ExecParameters *params,
                 const char *unit,
-                const char *p,
+                int dfd,
                 uid_t uid,
                 gid_t gid,
                 bool ownership_ok) {
 
-        _cleanup_close_ int dfd = -EBADF;
         int r;
 
         assert(context);
         assert(cgroup_context);
         assert(params);
         assert(unit);
-        assert(p);
-
-        dfd = open(p, O_DIRECTORY|O_CLOEXEC);
-        if (dfd < 0)
-                return -errno;
-
-        r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
-        if (r < 0)
-                return r;
+        assert(dfd >= 0);
 
         struct load_cred_args args = {
                 .context = context,
@@ -923,7 +914,15 @@ static int acquire_credentials(
                         return r;
         }
 
-        r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
+        return 0;
+}
+
+static int credentials_dir_finalize_permissions(int dfd, uid_t uid, gid_t gid, bool ownership_ok) {
+        int r;
+
+        assert(dfd >= 0);
+
+        r = fd_acl_make_read_only(dfd); /* Take away the "w" bit */
         if (r < 0)
                 return r;
 
@@ -947,157 +946,154 @@ static int acquire_credentials(
         return 0;
 }
 
-static int setup_credentials_internal(
+static int setup_credentials_plain_dir(
                 const ExecContext *context,
                 const CGroupContext *cgroup_context,
                 const ExecParameters *params,
                 const char *unit,
-                const char *final,        /* This is where the credential store shall eventually end up at */
-                const char *workspace,    /* This is where we can prepare it before moving it to the final place */
-                bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
-                bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
+                const char *cred_dir,
                 uid_t uid,
                 gid_t gid) {
 
-        bool final_mounted;
-        int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
-                                   * if we mounted something; false if we definitely can't mount anything */
+        _cleanup_free_ char *t = NULL, *workspace = NULL;
+        _cleanup_(rm_rf_safep) const char *workspace_rm = NULL;
+        _cleanup_close_ int dfd = -EBADF;
+        int r;
 
         assert(context);
         assert(params);
         assert(unit);
-        assert(final);
-        assert(workspace);
+        assert(cred_dir);
 
-        r = path_is_mount_point(final);
-        if (r < 0)
-                return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", final);
-        final_mounted = r > 0;
+        /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
+         * it into place, so that users can't access half-initialized credential stores. */
+        t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
+        if (!t)
+                return -ENOMEM;
 
-        if (final_mounted) {
-                if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
-                        r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
-                        if (r < 0)
-                                return r;
+        r = mkdir_label(t, 0700);
+        if (r < 0 && r != -EEXIST)
+                return r;
 
-                        final_mounted = false;
-                } else {
-                        /* We can reuse the previous credential dir */
-                        r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false);
-                        if (r < 0)
-                                return r;
-                        if (r == 0) {
-                                log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
-                                return 0;
-                        }
-                }
+        workspace = path_join(t, unit);
+        if (!workspace)
+                return -ENOMEM;
+
+        dfd = open_mkdir(workspace, O_CLOEXEC|O_EXCL, 0700);
+        if (dfd < 0)
+                return log_debug_errno(dfd, "Failed to create workspace for credentials: %m");
+        workspace_rm = workspace;
+
+        (void) label_fix_full(dfd, /* inode_path = */ NULL, cred_dir, /* flags = */ 0);
+
+        r = acquire_credentials(context, cgroup_context, params, unit, dfd, uid, gid, /* ownership_ok = */ false);
+        if (r < 0)
+                return r;
+
+        r = RET_NERRNO(rename(workspace, cred_dir));
+        if (r >= 0)
+                workspace_rm = NULL;
+        if (r == -EEXIST) {
+                log_debug_errno(r, "Credential dir '%s' already populated, exchanging with workspace.", cred_dir);
+                r = RET_NERRNO(renameat2(AT_FDCWD, workspace, AT_FDCWD, cred_dir, RENAME_EXCHANGE));
         }
+        if (r < 0)
+                return log_debug_errno(r, "Failed to move credentials workspace into place: %m");
+
+        /* rename() requires both the source and target to be writable, hence lock down write permission
+         * as last step. */
+        r = credentials_dir_finalize_permissions(dfd, uid, gid, /* ownership_ok = */ false);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to adjust ACLs of credentials dir: %m");
+
+        return 0;
+}
 
-        if (reuse_workspace) {
-                r = path_is_mount_point(workspace);
+static int setup_credentials_internal(
+                const ExecContext *context,
+                const CGroupContext *cgroup_context,
+                const ExecParameters *params,
+                const char *unit,
+                const char *cred_dir,
+                uid_t uid,
+                gid_t gid) {
+
+        _cleanup_close_ int fs_fd = -EBADF, mfd = -EBADF, dfd = -EBADF;
+        bool dir_mounted;
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(unit);
+        assert(cred_dir);
+
+        if (!FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
+                /* We may reuse the previous credential dir */
+                r = dir_is_empty(cred_dir, /* ignore_hidden_or_backup = */ false);
                 if (r < 0)
                         return r;
-                if (r > 0)
-                        workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse
-                                                   * it, let's keep this in mind */
-                else
-                        workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
-        } else
-                workspace_mounted = -1; /* ditto */
-
-        /* If both the final place and the workspace are mounted, we have no mounts to set up, based on
-         * the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different).
-         * If the workspace is not mounted, we just bind the final place over and make it writable. */
-        must_mount = must_mount || final_mounted;
-
-        if (workspace_mounted < 0) {
-                if (!final_mounted)
-                        /* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if
-                         * not using the final place. */
-                        r = mount_credentials_fs(workspace);
-                if (final_mounted || r < 0) {
-                        /* If using final place or failed to mount new tmpfs, make a bind mount from
-                         * the final to the workspace, so that we can make it writable there. */
-                        r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
-                        if (r < 0) {
-                                if (!ERRNO_IS_PRIVILEGE(r))
-                                        /* Propagate anything that isn't a permission problem. */
-                                        return r;
-
-                                if (must_mount)
-                                        /* If it's not OK to use the plain directory fallback, propagate all
-                                         * errors too. */
-                                        return r;
-
-                                /* If we lack privileges to bind mount stuff, then let's gracefully proceed
-                                 * for compat with container envs, and just use the final dir as is.
-                                 * Final place must not be mounted in this case (refused by must_mount
-                                 * above) */
-
-                                workspace_mounted = false;
-                        } else {
-                                /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
-                                r = mount_nofollow_verbose(LOG_DEBUG,
-                                                           NULL,
-                                                           workspace,
-                                                           NULL,
-                                                           MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false),
-                                                           NULL);
-                                if (r < 0)
-                                        return r;
-
-                                workspace_mounted = true;
-                        }
-                } else
-                        workspace_mounted = true;
+                if (r == 0) {
+                        log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
+                        return 0;
+                }
         }
 
-        assert(workspace_mounted >= 0);
-        assert(!must_mount || workspace_mounted);
+        r = path_is_mount_point(cred_dir);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", cred_dir);
+        dir_mounted = r > 0;
 
-        const char *where = workspace_mounted ? workspace : final;
+        mfd = fsmount_credentials_fs(&fs_fd);
+        if (ERRNO_IS_NEG_PRIVILEGE(mfd) && !dir_mounted) {
+                log_debug_errno(mfd, "Lacking privilege to mount credentials fs, falling back to plain directory.");
+                return setup_credentials_plain_dir(context, cgroup_context, params, unit, cred_dir, uid, gid);
+        }
+        if (mfd < 0)
+                return log_debug_errno(mfd, "Failed to mount credentials fs: %m");
 
-        (void) label_fix_full(AT_FDCWD, where, final, 0);
+        dfd = fd_reopen(mfd, O_DIRECTORY|O_CLOEXEC);
+        if (dfd < 0)
+                return dfd;
 
-        r = acquire_credentials(context, cgroup_context, params, unit, where, uid, gid, workspace_mounted);
-        if (r < 0) {
-                /* If we're using final place as workspace, and failed to acquire credentials, we might
-                 * have left half-written creds there. Let's get rid of the whole mount, so future
-                 * calls won't reuse it. */
-                if (final_mounted)
-                        (void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
+        (void) label_fix_full(dfd, /* inode_path = */ NULL, cred_dir, /* flags = */ 0);
 
+        r = acquire_credentials(context, cgroup_context, params, unit, dfd, uid, gid, /* ownership_ok = */ true);
+        if (r < 0)
                 return r;
-        }
 
-        if (workspace_mounted) {
-                if (!final_mounted) {
-                        /* Make workspace read-only now, so that any bind mount we make from it defaults to
-                         * read-only too */
-                        r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
-                        if (r < 0)
-                                return r;
+        r = credentials_dir_finalize_permissions(dfd, uid, gid, /* ownership_ok = */ true);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to adjust ACLs of credentials dir: %m");
 
-                        /* And mount it to the final place, read-only */
-                        r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
-                } else
-                        /* Otherwise we just get rid of the bind mount of final place */
-                        r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
-                if (r < 0)
-                        return r;
-        } else {
-                _cleanup_free_ char *parent = NULL;
+        if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "ro", NULL, 0) < 0)
+                return -errno;
+
+        if (fsconfig(fs_fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0) < 0)
+                return -errno;
+
+        log_debug("Successfully reconfigured credentials fs to be read only.");
 
-                /* If we do not have our own mount put used the plain directory fallback, then we need to
-                 * open access to the top-level credential directory and the per-service directory now */
+        if (dir_mounted) {
+                /* Firstly, try to move beneath the existing mount, which guarantees strictly atomic replacement
+                 * (needs kernel >= 6.5) */
+                r = move_mount(mfd, "", AT_FDCWD, cred_dir, MOVE_MOUNT_F_EMPTY_PATH|MOVE_MOUNT_BENEATH);
+                if (r >= 0)
+                        return umount_verbose(LOG_DEBUG, cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW);
+                if (errno != EINVAL)
+                        return log_debug_errno(errno, "Failed to move credentials fs into place: %m");
 
-                r = path_extract_directory(final, &parent);
+                log_debug_errno(errno, "Unable to move credentials fs beneath existing mount '%s', unmounting instead: %m",
+                                cred_dir);
+
+                r = umount_verbose(LOG_DEBUG, cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW);
                 if (r < 0)
                         return r;
-                if (chmod(parent, 0755) < 0)
-                        return -errno;
         }
 
+        r = move_mount(mfd, "", AT_FDCWD, cred_dir, MOVE_MOUNT_F_EMPTY_PATH);
+        if (r < 0)
+                return log_debug_errno(errno, "Failed to move credentials fs into place: %m");
+
         return 0;
 }
 
@@ -1140,96 +1136,12 @@ int exec_setup_credentials(
         if (r < 0 && r != -EEXIST)
                 return r;
 
-        r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
-        if (r < 0) {
-                _cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */
-                _cleanup_free_ char *t = NULL;
-
-                /* If this is not a privilege or support issue then propagate the error */
-                if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
-                        return r;
-
-                /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
-                 * it into place, so that users can't access half-initialized credential stores. */
-                t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
-                if (!t)
-                        return -ENOMEM;
-
-                /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
-                 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
-                 * after it is fully set up */
-                u = path_join(t, unit);
-                if (!u)
-                        return -ENOMEM;
-
-                FOREACH_STRING(i, t, u) {
-                        r = mkdir_label(i, 0700);
-                        if (r < 0 && r != -EEXIST)
-                                return log_debug_errno(r, "Failed to make directory '%s': %m", i);
-                }
-
-                r = setup_credentials_internal(
-                                context,
-                                cgroup_context,
-                                params,
-                                unit,
-                                p,       /* final mount point */
-                                u,       /* temporary workspace to overmount */
-                                true,    /* reuse the workspace if it is already a mount */
-                                false,   /* it's OK to fall back to a plain directory if we can't mount anything */
-                                uid,
-                                gid);
-                if (r < 0)
-                        return r;
-
-        } else if (r == 0) {
-
-                /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
-                 * we can use the same directory for all cases, after turning off propagation. Question
-                 * though is: where do we turn off propagation exactly, and where do we place the workspace
-                 * directory? We need some place that is guaranteed to be a mount point in the host, and
-                 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
-                 * since we ultimately want to move the resulting file system there, i.e. we need propagation
-                 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
-                 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
-                 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
-                 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
-                 * propagation on the former, and then overmount the latter.
-                 *
-                 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
-                 * for this purpose, but there are few other candidates that work equally well for us, and
-                 * given that we do this in a privately namespaced short-lived single-threaded process that
-                 * no one else sees this should be OK to do. */
-
-                /* Turn off propagation from our namespace to host */
-                r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
-                if (r < 0)
-                        goto child_fail;
-
-                r = setup_credentials_internal(
-                                context,
-                                cgroup_context,
-                                params,
-                                unit,
-                                p,           /* final mount point */
-                                "/dev/shm",  /* temporary workspace to overmount */
-                                false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
-                                true,        /* insist that something is mounted, do not allow fallback to plain directory */
-                                uid,
-                                gid);
-                if (r < 0)
-                        goto child_fail;
-
-                _exit(EXIT_SUCCESS);
-
-        child_fail:
-                _exit(EXIT_FAILURE);
-        }
+        r = setup_credentials_internal(context, cgroup_context, params, unit, p, uid, gid);
 
         /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
          * try to remove it. This matters in particular if we created the dir as mount point but then didn't
          * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
          * seen by users when trying access this inode. */
         (void) rmdir(p);
-        return 0;
+        return r;
 }