git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: Prepare --bind-user= logic for reuse in systemd-vmspawn
author Daan De Meyer <daan.j.demeyer@gmail.com>
Fri, 4 Jul 2025 08:26:34 +0000 (10:26 +0200)
committer Daan De Meyer <daan.j.demeyer@gmail.com>
Mon, 14 Jul 2025 14:25:22 +0000 (16:25 +0200)
Aside from the usual boilerplate of moving the shared logic to shared/,
we also rework the implementation of --bind-user= to be similar to what
we'll do in systemd-vmspawn. Instead of messing with the nspawn container
user namespace, we use idmapped mounts to map the user's home directory on
the host to the mapped uid in the container.

Ideally we'd also use the "userdb.transient" credentials to provision the
user records, but this would only work for booted containers, whereas the
current logic works for non-booted containers as well.

Aside from being similar to how we'll implement --bind-user= in vmspawn,
using idmapped mounts also allows supporting --bind-user= without having to
use --private-users=.

man/systemd-nspawn.xml
src/basic/forward.h
src/nspawn/nspawn-bind-user.c
src/nspawn/nspawn-bind-user.h
src/nspawn/nspawn-mount.c
src/nspawn/nspawn-mount.h
src/nspawn/nspawn.c
src/shared/machine-bind-user.c [new file with mode: 0644]
src/shared/machine-bind-user.h [new file with mode: 0644]
src/shared/meson.build

index d7d7d17f663146aeca4d3f2a5cd174ab9786bce9..583306935af961750b4e480f933b212bdc4b0fe9 100644 (file)
@@ -1605,10 +1605,8 @@ After=sys-subsystem-net-devices-ens1.device</programlisting>
 
         <orderedlist>
           <listitem><para>The user's home directory is bind mounted from the host into
-          <filename>/run/host/home/</filename>.</para></listitem>
-
-          <listitem><para>An additional UID/GID mapping is added that maps the host user's UID/GID to a
-          container UID/GID, allocated from the 60514…60577 range.</para></listitem>
+          <filename>/run/host/home/</filename>, using an idmapped mount to map the host user's UID/GID to its
+          assigned UID/GID in the container.</para></listitem>
 
           <listitem><para>A JSON user and group record is generated in <filename>/run/userdb/</filename> that
           describes the mapped user. It contains a minimized representation of the host's user record,
@@ -1644,9 +1642,6 @@ After=sys-subsystem-net-devices-ens1.device</programlisting>
         the container's <filename>/etc/passwd</filename> and <filename>/etc/group</filename>, and thus might
         not detect existing accounts in other databases.</para>
 
-        <para>This operation is only supported in combination with
-        <option>--private-users=</option>/<option>-U</option>.</para>
-
         <xi:include href="version-info.xml" xpointer="v249"/></listitem>
       </varlistentry>
 
index 7175120e5b4979c856530e1734cf42342a5a088d..53b217b07b5b550c226383037dae13ee37bfd965 100644 (file)
@@ -291,6 +291,8 @@ typedef struct ImagePolicy ImagePolicy;
 typedef struct InstallInfo InstallInfo;
 typedef struct LookupPaths LookupPaths;
 typedef struct LoopDevice LoopDevice;
+typedef struct MachineBindUserContext MachineBindUserContext;
+typedef struct MachineCredentialContext MachineCredentialContext;
 typedef struct MountOptions MountOptions;
 typedef struct OpenFile OpenFile;
 typedef struct Pkcs11EncryptedKey Pkcs11EncryptedKey;
index a17365d497f97eef498c2c838f73b877e42433b2..d9a06e2c03753dfa9973d070d92010cfc8ffe189 100644 (file)
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
-#include <grp.h>
-#include <pwd.h>
 #include <unistd.h>
 
+#include "sd-json.h"
+
 #include "alloc-util.h"
-#include "chase.h"
-#include "fd-util.h"
 #include "fileio.h"
 #include "format-util.h"
-#include "json-util.h"
 #include "log.h"
-#include "nspawn-mount.h"
 #include "nspawn.h"
+#include "machine-bind-user.h"
 #include "nspawn-bind-user.h"
+#include "user-record.h"
+#include "group-record.h"
 #include "path-util.h"
 #include "string-util.h"
-#include "strv.h"
 #include "user-util.h"
-#include "userdb.h"
-
-static int check_etc_passwd_collisions(
-                const char *directory,
-                const char *name,
-                uid_t uid) {
-
-        _cleanup_fclose_ FILE *f = NULL;
-        int r;
-
-        assert(directory);
-        assert(name || uid_is_valid(uid));
-
-        r = chase_and_fopen_unlocked("/etc/passwd", directory, CHASE_PREFIX_ROOT, "re", NULL, &f);
-        if (r == -ENOENT)
-                return 0; /* no user database? then no user, hence no collision */
-        if (r < 0)
-                return log_error_errno(r, "Failed to open /etc/passwd of container: %m");
-
-        for (;;) {
-                struct passwd *pw;
-
-                r = fgetpwent_sane(f, &pw);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to iterate through /etc/passwd of container: %m");
-                if (r == 0) /* EOF */
-                        return 0; /* no collision */
-
-                if (name && streq_ptr(pw->pw_name, name))
-                        return 1; /* name collision */
-                if (uid_is_valid(uid) && pw->pw_uid == uid)
-                        return 1; /* UID collision */
-        }
-}
-
-static int check_etc_group_collisions(
-                const char *directory,
-                const char *name,
-                gid_t gid) {
-
-        _cleanup_fclose_ FILE *f = NULL;
-        int r;
-
-        assert(directory);
-        assert(name || gid_is_valid(gid));
-
-        r = chase_and_fopen_unlocked("/etc/group", directory, CHASE_PREFIX_ROOT, "re", NULL, &f);
-        if (r == -ENOENT)
-                return 0; /* no group database? then no group, hence no collision */
-        if (r < 0)
-                return log_error_errno(r, "Failed to open /etc/group of container: %m");
-
-        for (;;) {
-                struct group *gr;
-
-                r = fgetgrent_sane(f, &gr);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to iterate through /etc/group of container: %m");
-                if (r == 0)
-                        return 0; /* no collision */
-
-                if (name && streq_ptr(gr->gr_name, name))
-                        return 1; /* name collision */
-                if (gid_is_valid(gid) && gr->gr_gid == gid)
-                        return 1; /* gid collision */
-        }
-}
-
-static int convert_user(
-                const char *directory,
-                UserRecord *u,
-                GroupRecord *g,
-                uid_t allocate_uid,
-                const char *shell,
-                bool shell_copy,
-                UserRecord **ret_converted_user,
-                GroupRecord **ret_converted_group) {
-
-        _cleanup_(group_record_unrefp) GroupRecord *converted_group = NULL;
-        _cleanup_(user_record_unrefp) UserRecord *converted_user = NULL;
-        _cleanup_free_ char *h = NULL;
-        sd_json_variant *p, *hp = NULL, *ssh = NULL;
-        int r;
-
-        assert(u);
-        assert(g);
-        assert(user_record_gid(u) == g->gid);
-
-        if (shell_copy)
-                shell = u->shell;
-
-        r = check_etc_passwd_collisions(directory, u->user_name, UID_INVALID);
-        if (r < 0)
-                return r;
-        if (r > 0)
-                return log_error_errno(SYNTHETIC_ERRNO(EBUSY),
-                                       "Sorry, the user '%s' already exists in the container.", u->user_name);
-
-        r = check_etc_group_collisions(directory, g->group_name, GID_INVALID);
-        if (r < 0)
-                return r;
-        if (r > 0)
-                return log_error_errno(SYNTHETIC_ERRNO(EBUSY),
-                                       "Sorry, the group '%s' already exists in the container.", g->group_name);
-
-        h = path_join("/run/host/home/", u->user_name);
-        if (!h)
-                return log_oom();
-
-        /* Acquire the source hashed password array as-is, so that it retains the JSON_VARIANT_SENSITIVE flag */
-        p = sd_json_variant_by_key(u->json, "privileged");
-        if (p) {
-                hp = sd_json_variant_by_key(p, "hashedPassword");
-                ssh = sd_json_variant_by_key(p, "sshAuthorizedKeys");
-        }
-
-        r = user_record_build(
-                        &converted_user,
-                        SD_JSON_BUILD_OBJECT(
-                                        SD_JSON_BUILD_PAIR("userName", SD_JSON_BUILD_STRING(u->user_name)),
-                                        SD_JSON_BUILD_PAIR("uid", SD_JSON_BUILD_UNSIGNED(allocate_uid)),
-                                        SD_JSON_BUILD_PAIR("gid", SD_JSON_BUILD_UNSIGNED(allocate_uid)),
-                                        SD_JSON_BUILD_PAIR_CONDITION(u->disposition >= 0, "disposition", SD_JSON_BUILD_STRING(user_disposition_to_string(u->disposition))),
-                                        SD_JSON_BUILD_PAIR("homeDirectory", SD_JSON_BUILD_STRING(h)),
-                                        SD_JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NSpawn")),
-                                        JSON_BUILD_PAIR_STRING_NON_EMPTY("shell", shell),
-                                        SD_JSON_BUILD_PAIR("privileged", SD_JSON_BUILD_OBJECT(
-                                                                           SD_JSON_BUILD_PAIR_CONDITION(!strv_isempty(u->hashed_password), "hashedPassword", SD_JSON_BUILD_VARIANT(hp)),
-                                                                           SD_JSON_BUILD_PAIR_CONDITION(!!ssh, "sshAuthorizedKeys", SD_JSON_BUILD_VARIANT(ssh))))));
-        if (r < 0)
-                return log_error_errno(r, "Failed to build container user record: %m");
-
-        r = group_record_build(
-                        &converted_group,
-                        SD_JSON_BUILD_OBJECT(
-                                        SD_JSON_BUILD_PAIR("groupName", SD_JSON_BUILD_STRING(g->group_name)),
-                                        SD_JSON_BUILD_PAIR("gid", SD_JSON_BUILD_UNSIGNED(allocate_uid)),
-                                        SD_JSON_BUILD_PAIR_CONDITION(g->disposition >= 0, "disposition", SD_JSON_BUILD_STRING(user_disposition_to_string(g->disposition))),
-                                        SD_JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NSpawn"))));
-        if (r < 0)
-                return log_error_errno(r, "Failed to build container group record: %m");
-
-        *ret_converted_user = TAKE_PTR(converted_user);
-        *ret_converted_group = TAKE_PTR(converted_group);
-
-        return 0;
-}
-
-static int find_free_uid(const char *directory, uid_t max_uid, uid_t *current_uid) {
-        int r;
-
-        assert(directory);
-        assert(current_uid);
-
-        for (;; (*current_uid)++) {
-                if (*current_uid > MAP_UID_MAX || *current_uid > max_uid)
-                        return log_error_errno(
-                                        SYNTHETIC_ERRNO(EBUSY),
-                                        "No suitable available UID in range " UID_FMT "…" UID_FMT " in container detected, can't map user.",
-                                        MAP_UID_MIN, MAP_UID_MAX);
-
-                r = check_etc_passwd_collisions(directory, NULL, *current_uid);
-                if (r < 0)
-                        return r;
-                if (r > 0) /* already used */
-                        continue;
-
-                /* We want to use the UID also as GID, hence check for it in /etc/group too */
-                r = check_etc_group_collisions(directory, NULL, (gid_t) *current_uid);
-                if (r <= 0)
-                        return r;
-        }
-}
-
-BindUserContext* bind_user_context_free(BindUserContext *c) {
-        if (!c)
-                return NULL;
-
-        FOREACH_ARRAY(d, c->data, c->n_data) {
-                user_record_unref(d->host_user);
-                group_record_unref(d->host_group);
-                user_record_unref(d->payload_user);
-                group_record_unref(d->payload_group);
-        }
-
-        return mfree(c);
-}
-
-int bind_user_prepare(
-                const char *directory,
-                char **bind_user,
-                const char *bind_user_shell,
-                bool bind_user_shell_copy,
-                uid_t uid_shift,
-                uid_t uid_range,
-                CustomMount **custom_mounts,
-                size_t *n_custom_mounts,
-                BindUserContext **ret) {
-
-        _cleanup_(bind_user_context_freep) BindUserContext *c = NULL;
-        uid_t current_uid = MAP_UID_MIN;
-        int r;
-
-        assert(custom_mounts);
-        assert(n_custom_mounts);
-        assert(ret);
-
-        /* This resolves the users specified in 'bind_user', generates a minimalized JSON user + group record
-         * for it to stick in the container, allocates a UID/GID for it, and updates the custom mount table,
-         * to include an appropriate bind mount mapping.
-         *
-         * This extends the passed custom_mounts/n_custom_mounts with the home directories, and allocates a
-         * new BindUserContext for the user records */
-
-        if (strv_isempty(bind_user)) {
-                *ret = NULL;
-                return 0;
-        }
-
-        c = new0(BindUserContext, 1);
-        if (!c)
-                return log_oom();
-
-        STRV_FOREACH(n, bind_user) {
-                _cleanup_(user_record_unrefp) UserRecord *u = NULL, *cu = NULL;
-                _cleanup_(group_record_unrefp) GroupRecord *g = NULL, *cg = NULL;
-                _cleanup_free_ char *sm = NULL, *sd = NULL;
-
-                r = userdb_by_name(*n, /* match= */ NULL, USERDB_DONT_SYNTHESIZE_INTRINSIC|USERDB_DONT_SYNTHESIZE_FOREIGN, &u);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to resolve user '%s': %m", *n);
-
-                /* For now, let's refuse mapping the root/nobody users explicitly. The records we generate
-                 * are strictly additive, nss-systemd is typically placed last in /etc/nsswitch.conf. Thus
-                 * even if we wanted, we couldn't override the root or nobody user records. Note we also
-                 * check for name conflicts in /etc/passwd + /etc/group later on, which would usually filter
-                 * out root/nobody too, hence these checks might appear redundant — but they actually are
-                 * not, as we want to support environments where /etc/passwd and /etc/group are non-existent,
-                 * and the user/group databases fully synthesized at runtime. Moreover, the name of the
-                 * user/group name of the "nobody" account differs between distros, hence a check by numeric
-                 * UID is safer. */
-                if (user_record_is_root(u))
-                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'root' user not supported, sorry.");
-
-                if (user_record_is_nobody(u))
-                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'nobody' user not supported, sorry.");
-
-                if (!uid_is_valid(u->uid))
-                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot bind user with no UID, refusing.");
-
-                if (u->uid >= uid_shift && u->uid < uid_shift + uid_range)
-                        return log_error_errno(
-                                        SYNTHETIC_ERRNO(EINVAL),
-                                        "UID "UID_FMT" of user '%s' to map is already in container UID range ("UID_FMT" - "UID_FMT"), refusing.",
-                                        u->uid, u->user_name, uid_shift, uid_shift + uid_range);
-
-                r = groupdb_by_gid(user_record_gid(u), /* match= */ NULL, USERDB_DONT_SYNTHESIZE_INTRINSIC|USERDB_DONT_SYNTHESIZE_FOREIGN, &g);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to resolve group of user '%s': %m", u->user_name);
-
-                if (g->gid >= uid_shift && g->gid < uid_shift + uid_range)
-                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "GID of group '%s' to map is already in container GID range, refusing.", g->group_name);
-
-                /* We want to synthesize exactly one user + group from the host into the container. This only
-                 * makes sense if the user on the host has its own private group. We can't reasonably check
-                 * this, so we just check of the name of user and group match.
-                 *
-                 * One of these days we might want to support users in a shared/common group too, but it's
-                 * not clear to me how this would have to be mapped, precisely given that the common group
-                 * probably already exists in the container. */
-                if (!streq(u->user_name, g->group_name))
-                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
-                                               "Sorry, mapping users without private groups is currently not supported.");
-
-                r = find_free_uid(directory, uid_range, &current_uid);
-                if (r < 0)
-                        return r;
-
-                r = convert_user(directory, u, g, current_uid, bind_user_shell, bind_user_shell_copy, &cu, &cg);
-                if (r < 0)
-                        return r;
-
-                if (!GREEDY_REALLOC(c->data, c->n_data + 1))
-                        return log_oom();
-
-                sm = strdup(user_record_home_directory(u));
-                if (!sm)
-                        return log_oom();
-
-                sd = strdup(user_record_home_directory(cu));
-                if (!sd)
-                        return log_oom();
-
-                if (!GREEDY_REALLOC(*custom_mounts, *n_custom_mounts + 1))
-                        return log_oom();
-
-                (*custom_mounts)[(*n_custom_mounts)++] = (CustomMount) {
-                        .type = CUSTOM_MOUNT_BIND,
-                        .source = TAKE_PTR(sm),
-                        .destination = TAKE_PTR(sd),
-                };
-
-                c->data[c->n_data++] = (BindUserData) {
-                        .host_user = TAKE_PTR(u),
-                        .host_group = TAKE_PTR(g),
-                        .payload_user = TAKE_PTR(cu),
-                        .payload_group = TAKE_PTR(cg),
-                };
-
-                current_uid++;
-        }
-
-        *ret = TAKE_PTR(c);
-        return 1;
-}
 
 static int write_and_symlink(
                 const char *root,
@@ -384,10 +67,7 @@ static int write_and_symlink(
         return 0;
 }
 
-int bind_user_setup(
-                const BindUserContext *c,
-                const char *root) {
-
+int bind_user_setup(const MachineBindUserContext *c, const char *root) {
         static const UserRecordLoadFlags strip_flags = /* Removes privileged info */
                 USER_RECORD_LOAD_MASK_PRIVILEGED|
                 USER_RECORD_PERMISSIVE;
index cb4d246bece3f0908b610f70df8f060ede6f521b..d4154218c0f3ceb52eb3fba56aaf5fd2965dbecf 100644 (file)
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
-#pragma once
 
 #include "forward.h"
 
-typedef struct CustomMount CustomMount;
-
-typedef struct BindUserData {
-        /* The host's user/group records */
-        UserRecord *host_user;
-        GroupRecord *host_group;
-
-        /* The mapped records to place into the container */
-        UserRecord *payload_user;
-        GroupRecord *payload_group;
-} BindUserData;
-
-typedef struct BindUserContext {
-        BindUserData *data;
-        size_t n_data;
-} BindUserContext;
-
-BindUserContext* bind_user_context_free(BindUserContext *c);
-
-DEFINE_TRIVIAL_CLEANUP_FUNC(BindUserContext*, bind_user_context_free);
-
-int bind_user_prepare(const char *directory, char **bind_user, const char *bind_user_shell, bool bind_user_shell_copy, uid_t uid_shift, uid_t uid_range, CustomMount **custom_mounts, size_t *n_custom_mounts, BindUserContext **ret);
-
-int bind_user_setup(const BindUserContext *c, const char *root);
+int bind_user_setup(const MachineBindUserContext *c, const char *root);
index 4cc638877d8edb195ffc17a2e892d74acdd543f6..05cad27f91d74a5fa9a6891287d27a2af55c341b 100644 (file)
@@ -25,6 +25,7 @@
 #include "string-util.h"
 #include "strv.h"
 #include "tmpfile-util.h"
+#include "user-util.h"
 
 CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
         CustomMount *ret;
@@ -41,7 +42,8 @@ CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
         (*n)++;
 
         *ret = (CustomMount) {
-                .type = t
+                .type = t,
+                .destination_uid = UID_INVALID,
         };
 
         return ret;
@@ -849,7 +851,7 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
                 if (stat(where, &dest_st) < 0)
                         return log_error_errno(errno, "Failed to stat %s: %m", where);
 
-                dest_uid = dest_st.st_uid;
+                dest_uid = uid_is_valid(m->destination_uid) ? uid_shift + m->destination_uid : dest_st.st_uid;
 
                 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode))
                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
@@ -880,7 +882,7 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
                 if (chown(where, uid_shift, uid_shift) < 0)
                         return log_error_errno(errno, "Failed to chown %s: %m", where);
 
-                dest_uid = uid_shift;
+                dest_uid = uid_shift + (uid_is_valid(m->destination_uid) ? m->destination_uid : 0);
         }
 
         if (move_mount(fd_clone, "", AT_FDCWD, where, MOVE_MOUNT_F_EMPTY_PATH) < 0)
index 26b2380dcb9d6dba7e5f8c565a74a0d95e020cc3..f049cf4aee5b9dac6811fedf8c26118cc4059073 100644 (file)
@@ -38,6 +38,7 @@ typedef struct CustomMount {
         bool read_only;
         char *source; /* for overlayfs this is the upper directory */
         char *destination;
+        uid_t destination_uid;
         char *options;
         char *work_dir;
         char **lower;
index c021fd675e2fc2e5d28117b3326776a233eab84e..a35fb2c5ad37fd1cbd66309ca93186a37b869317 100644 (file)
@@ -34,7 +34,6 @@
 #include "capability-list.h"
 #include "capability-util.h"
 #include "cgroup-setup.h"
-#include "cgroup-util.h"
 #include "chase.h"
 #include "common-signal.h"
 #include "constants.h"
@@ -55,7 +54,6 @@
 #include "format-util.h"
 #include "fs-util.h"
 #include "gpt.h"
-#include "group-record.h"
 #include "hexdecoct.h"
 #include "hostname-setup.h"
 #include "hostname-util.h"
@@ -66,6 +64,7 @@
 #include "log.h"
 #include "loop-util.h"
 #include "loopback-setup.h"
+#include "machine-bind-user.h"
 #include "machine-credential.h"
 #include "main-func.h"
 #include "mkdir.h"
@@ -1731,9 +1730,6 @@ static int verify_arguments(void) {
                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
         }
 
-        if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
-
         /* Drop duplicate --bind-user= entries */
         strv_uniq(arg_bind_user);
 
@@ -3878,7 +3874,6 @@ static int outer_child(
                 int netns_fd,
                 const char *unix_export_path) {
 
-        _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
         _cleanup_strv_free_ char **os_release_pairs = NULL;
         bool idmap = false;
         ssize_t l;
@@ -4043,38 +4038,41 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = bind_user_prepare(
+        _cleanup_(machine_bind_user_context_freep) MachineBindUserContext *bind_user_context = NULL;
+        r = machine_bind_user_prepare(
                         directory,
                         arg_bind_user,
                         arg_bind_user_shell,
                         arg_bind_user_shell_copy,
-                        chown_uid,
-                        chown_range,
-                        &arg_custom_mounts, &arg_n_custom_mounts,
                         &bind_user_context);
         if (r < 0)
                 return r;
 
-        if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
-                /* Send the user maps we determined to the parent, so that it installs it in our user
-                 * namespace UID map table */
+        if (bind_user_context)
+                FOREACH_ARRAY(bind_user, bind_user_context->data, bind_user_context->n_data) {
+                        _cleanup_free_ char *sm = strdup(user_record_home_directory(bind_user->host_user));
+                        if (!sm)
+                                return log_oom();
 
-                FOREACH_ARRAY(d, bind_user_context->data, bind_user_context->n_data) {
-                        uid_t map[] = {
-                                d->payload_user->uid,
-                                d->host_user->uid,
-                                (uid_t) d->payload_group->gid,
-                                (uid_t) d->host_group->gid,
-                        };
+                        _cleanup_free_ char *sd = strdup(user_record_home_directory(bind_user->payload_user));
+                        if (!sd)
+                                return log_oom();
 
-                        l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
-                        if (l < 0)
-                                return log_error_errno(errno, "Failed to send user UID map: %m");
-                        if (l != sizeof(map))
-                                return log_error_errno(SYNTHETIC_ERRNO(EIO),
-                                                       "Short write while sending user UID map.");
+                        if (!GREEDY_REALLOC(arg_custom_mounts, arg_n_custom_mounts + 1))
+                                return log_oom();
+
+                        char *options = strdup("owneridmap");
+                        if (!options)
+                                return log_oom();
+
+                        arg_custom_mounts[arg_n_custom_mounts++] = (CustomMount) {
+                                .type = CUSTOM_MOUNT_BIND,
+                                .source = TAKE_PTR(sm),
+                                .destination = TAKE_PTR(sd),
+                                .options = TAKE_PTR(options),
+                                .destination_uid = bind_user->payload_user->uid,
+                        };
                 }
-        }
 
         r = mount_custom(
                         directory,
@@ -4492,69 +4490,6 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
         }
 }
 
-static int add_one_uid_map(
-                char **p,
-                uid_t container_uid,
-                uid_t host_uid,
-                uid_t range) {
-
-        return strextendf(p,
-                       UID_FMT " " UID_FMT " " UID_FMT "\n",
-                       container_uid, host_uid, range);
-}
-
-static int make_uid_map_string(
-                const uid_t bind_user_uid[],
-                size_t n_bind_user_uid,
-                size_t offset,
-                char **ret) {
-
-        _cleanup_free_ char *s = NULL;
-        uid_t previous_uid = 0;
-        int r;
-
-        assert(n_bind_user_uid == 0 || bind_user_uid);
-        assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
-        assert(ret);
-
-        /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
-         * quadruplet, consisting of host and container UID + GID. */
-
-        for (size_t i = 0; i < n_bind_user_uid; i++) {
-                uid_t payload_uid = bind_user_uid[i*4+offset],
-                        host_uid = bind_user_uid[i*4+offset+1];
-
-                assert(previous_uid <= payload_uid);
-                assert(payload_uid < arg_uid_range);
-
-                /* Add a range to close the gap to previous entry */
-                if (payload_uid > previous_uid) {
-                        r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
-                        if (r < 0)
-                                return r;
-                }
-
-                /* Map this specific user */
-                r = add_one_uid_map(&s, payload_uid, host_uid, 1);
-                if (r < 0)
-                        return r;
-
-                previous_uid = payload_uid + 1;
-        }
-
-        /* And add a range to close the gap to finish the range */
-        if (arg_uid_range > previous_uid) {
-                r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
-                if (r < 0)
-                        return r;
-        }
-
-        assert(s);
-
-        *ret = TAKE_PTR(s);
-        return 0;
-}
-
 static int setup_uid_map(
                 const PidRef *pid,
                 const uid_t bind_user_uid[],
@@ -4567,8 +4502,7 @@ static int setup_uid_map(
         assert(pidref_is_set(pid));
         assert(pid->pid > 1);
 
-        /* Build the UID map string */
-        if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
+        if (asprintf(&s, "0 " UID_FMT " " UID_FMT "\n", arg_uid_shift, arg_uid_range) < 0)
                 return log_oom();
 
         xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid->pid);
@@ -4576,11 +4510,6 @@ static int setup_uid_map(
         if (r < 0)
                 return log_error_errno(r, "Failed to write UID map: %m");
 
-        /* And now build the GID map string */
-        s = mfree(s);
-        if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
-                return log_oom();
-
         xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid->pid);
         r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
         if (r < 0)
@@ -5314,26 +5243,6 @@ static int run_container(
                         if (l != sizeof arg_uid_shift)
                                 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
                 }
-
-                n_bind_user_uid = strv_length(arg_bind_user);
-                if (n_bind_user_uid > 0) {
-                        /* Right after the UID shift, we'll receive the list of UID mappings for the
-                         * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
-
-                        bind_user_uid = new(uid_t, n_bind_user_uid*4);
-                        if (!bind_user_uid)
-                                return log_oom();
-
-                        for (size_t i = 0; i < n_bind_user_uid; i++) {
-                                l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
-                                if (l < 0)
-                                        return log_error_errno(errno, "Failed to read user UID map pair: %m");
-                                if (l != sizeof(uid_t)*4)
-                                        return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
-                                                              SYNTHETIC_ERRNO(EIO),
-                                                              "Short read while reading bind user UID pairs.");
-                        }
-                }
         }
 
         /* Wait for the outer child. */
diff --git a/src/shared/machine-bind-user.c b/src/shared/machine-bind-user.c
new file mode 100644 (file)
index 0000000..e4566bb
--- /dev/null
@@ -0,0 +1,302 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <grp.h>
+#include <pwd.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "chase.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "json-util.h"
+#include "log.h"
+#include "machine-bind-user.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+#include "userdb.h"
+
+/* Scans /etc/passwd below 'directory' for an entry clashing with the given user name and/or UID. Pass
+ * name=NULL or an invalid UID to skip the respective comparison (at least one of the two must be set).
+ *
+ * Returns > 0 on collision, 0 if there is none (also if no directory was specified or the file does
+ * not exist), and a negative errno-style value on failure. */
+static int check_etc_passwd_collisions(
+                const char *directory,
+                const char *name,
+                uid_t uid) {
+
+        _cleanup_fclose_ FILE *passwd = NULL;
+        bool match_uid;
+        int r;
+
+        assert(name || uid_is_valid(uid));
+
+        if (!directory)
+                return 0;
+
+        r = chase_and_fopen_unlocked("/etc/passwd", directory, CHASE_PREFIX_ROOT, "re", NULL, &passwd);
+        if (r < 0) {
+                if (r == -ENOENT)
+                        return 0; /* no user database? then no user, hence no collision */
+
+                return log_error_errno(r, "Failed to open /etc/passwd of container: %m");
+        }
+
+        /* Whether the UID takes part in the comparison does not change while we iterate */
+        match_uid = uid_is_valid(uid);
+
+        for (;;) {
+                struct passwd *entry;
+                bool same_name, same_uid;
+
+                r = fgetpwent_sane(passwd, &entry);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to iterate through /etc/passwd of container: %m");
+                if (r == 0) /* EOF */
+                        break;
+
+                same_name = name && streq_ptr(entry->pw_name, name);
+                same_uid = match_uid && entry->pw_uid == uid;
+
+                if (same_name || same_uid)
+                        return 1; /* name or UID collision */
+        }
+
+        return 0; /* no collision */
+}
+
+/* Scans /etc/group below 'directory' for an entry clashing with the given group name and/or GID. Pass
+ * name=NULL or an invalid GID to skip the respective comparison (at least one of the two must be set).
+ *
+ * Returns > 0 on collision, 0 if there is none (also if no directory was specified or the file does
+ * not exist), and a negative errno-style value on failure. */
+static int check_etc_group_collisions(
+                const char *directory,
+                const char *name,
+                gid_t gid) {
+
+        _cleanup_fclose_ FILE *group = NULL;
+        bool match_gid;
+        int r;
+
+        assert(name || gid_is_valid(gid));
+
+        if (!directory)
+                return 0;
+
+        r = chase_and_fopen_unlocked("/etc/group", directory, CHASE_PREFIX_ROOT, "re", NULL, &group);
+        if (r < 0) {
+                if (r == -ENOENT)
+                        return 0; /* no group database? then no group, hence no collision */
+
+                return log_error_errno(r, "Failed to open /etc/group of container: %m");
+        }
+
+        /* Whether the GID takes part in the comparison does not change while we iterate */
+        match_gid = gid_is_valid(gid);
+
+        for (;;) {
+                struct group *entry;
+                bool same_name, same_gid;
+
+                r = fgetgrent_sane(group, &entry);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to iterate through /etc/group of container: %m");
+                if (r == 0) /* EOF */
+                        break;
+
+                same_name = name && streq_ptr(entry->gr_name, name);
+                same_gid = match_gid && entry->gr_gid == gid;
+
+                if (same_name || same_gid)
+                        return 1; /* name or GID collision */
+        }
+
+        return 0; /* no collision */
+}
+
+/* Builds the user and group records that shall represent the host user 'u' (and its group 'g') inside
+ * the container.
+ *
+ * directory:       root directory whose /etc/passwd and /etc/group are checked for name collisions
+ * u, g:            the host's user record and the matching group record (the GIDs must agree)
+ * allocate_uid:    numeric ID to use as UID of the new user record and as GID of both new records
+ * shell:           shell to put in the new user record; ignored if shell_copy is true
+ * shell_copy:      if true, the shell is taken from the host user record instead
+ * ret_converted_*: where to store the newly built records on success
+ *
+ * Returns 0 on success and a negative errno-style value on failure; -EBUSY if the user or group name
+ * is already listed in the container's /etc/passwd or /etc/group. */
+static int convert_user(
+                const char *directory,
+                UserRecord *u,
+                GroupRecord *g,
+                uid_t allocate_uid,
+                const char *shell,
+                bool shell_copy,
+                UserRecord **ret_converted_user,
+                GroupRecord **ret_converted_group) {
+
+        _cleanup_(group_record_unrefp) GroupRecord *converted_group = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *converted_user = NULL;
+        _cleanup_free_ char *h = NULL;
+        sd_json_variant *p, *hp = NULL, *ssh = NULL;
+        int r;
+
+        assert(u);
+        assert(g);
+        assert(user_record_gid(u) == g->gid);
+
+        /* When asked to copy the shell, the host user's shell overrides whatever was passed in */
+        if (shell_copy)
+                shell = u->shell;
+
+        /* Only the names are checked here (the numeric IDs are passed as invalid) */
+        r = check_etc_passwd_collisions(directory, u->user_name, UID_INVALID);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EBUSY),
+                                       "Sorry, the user '%s' already exists in the container.", u->user_name);
+
+        r = check_etc_group_collisions(directory, g->group_name, GID_INVALID);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EBUSY),
+                                       "Sorry, the group '%s' already exists in the container.", g->group_name);
+
+        /* The home directory recorded in the new user record lives below /run/host/home/ */
+        h = path_join("/run/host/home/", u->user_name);
+        if (!h)
+                return log_oom();
+
+        /* Acquire the source hashed password array as-is, so that it retains the JSON_VARIANT_SENSITIVE flag */
+        p = sd_json_variant_by_key(u->json, "privileged");
+        if (p) {
+                hp = sd_json_variant_by_key(p, "hashedPassword");
+                ssh = sd_json_variant_by_key(p, "sshAuthorizedKeys");
+        }
+
+        /* Note that allocate_uid is used for both the "uid" and the "gid" field */
+        r = user_record_build(
+                        &converted_user,
+                        SD_JSON_BUILD_OBJECT(
+                                        SD_JSON_BUILD_PAIR("userName", SD_JSON_BUILD_STRING(u->user_name)),
+                                        SD_JSON_BUILD_PAIR("uid", SD_JSON_BUILD_UNSIGNED(allocate_uid)),
+                                        SD_JSON_BUILD_PAIR("gid", SD_JSON_BUILD_UNSIGNED(allocate_uid)),
+                                        SD_JSON_BUILD_PAIR_CONDITION(u->disposition >= 0, "disposition", SD_JSON_BUILD_STRING(user_disposition_to_string(u->disposition))),
+                                        SD_JSON_BUILD_PAIR("homeDirectory", SD_JSON_BUILD_STRING(h)),
+                                        SD_JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NSpawn")),
+                                        JSON_BUILD_PAIR_STRING_NON_EMPTY("shell", shell),
+                                        SD_JSON_BUILD_PAIR("privileged", SD_JSON_BUILD_OBJECT(
+                                                                           SD_JSON_BUILD_PAIR_CONDITION(!strv_isempty(u->hashed_password), "hashedPassword", SD_JSON_BUILD_VARIANT(hp)),
+                                                                           SD_JSON_BUILD_PAIR_CONDITION(!!ssh, "sshAuthorizedKeys", SD_JSON_BUILD_VARIANT(ssh))))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to build container user record: %m");
+
+        /* The group record keeps the host group's name, but gets the same numeric ID as the user */
+        r = group_record_build(
+                        &converted_group,
+                        SD_JSON_BUILD_OBJECT(
+                                        SD_JSON_BUILD_PAIR("groupName", SD_JSON_BUILD_STRING(g->group_name)),
+                                        SD_JSON_BUILD_PAIR("gid", SD_JSON_BUILD_UNSIGNED(allocate_uid)),
+                                        SD_JSON_BUILD_PAIR_CONDITION(g->disposition >= 0, "disposition", SD_JSON_BUILD_STRING(user_disposition_to_string(g->disposition))),
+                                        SD_JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NSpawn"))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to build container group record: %m");
+
+        /* Hand ownership of both records to the caller only once everything succeeded */
+        *ret_converted_user = TAKE_PTR(converted_user);
+        *ret_converted_group = TAKE_PTR(converted_group);
+
+        return 0;
+}
+
+/* Advances *current_uid until it refers to a numeric ID that is listed neither as UID in the container's
+ * /etc/passwd nor as GID in its /etc/group, never going beyond MAP_UID_MAX.
+ *
+ * Returns 0 with *current_uid set to the free ID on success, -EBUSY if the range is exhausted, or
+ * another negative errno-style value if the databases cannot be read. */
+static int find_free_uid(const char *directory, uid_t *current_uid) {
+        int r;
+
+        assert(current_uid);
+
+        while (*current_uid <= MAP_UID_MAX) {
+                r = check_etc_passwd_collisions(directory, NULL, *current_uid);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        /* We want to use the UID also as GID, hence check for it in /etc/group too */
+                        r = check_etc_group_collisions(directory, NULL, (gid_t) *current_uid);
+                        if (r <= 0)
+                                return r;
+                }
+
+                /* Already used as UID or GID, try the next candidate */
+                (*current_uid)++;
+        }
+
+        return log_error_errno(
+                        SYNTHETIC_ERRNO(EBUSY),
+                        "No suitable available UID in range " UID_FMT "…" UID_FMT " in container detected, can't map user.",
+                        MAP_UID_MIN, MAP_UID_MAX);
+}
+
+/* Destroys a MachineBindUserContext: drops the references on all host and payload records it holds,
+ * then releases the entry array and the context object itself. Passing NULL is a no-op.
+ *
+ * Returns the result of mfree(), so that callers can reset their pointer in the same expression. */
+MachineBindUserContext* machine_bind_user_context_free(MachineBindUserContext *c) {
+        if (!c)
+                return NULL;
+
+        FOREACH_ARRAY(d, c->data, c->n_data) {
+                user_record_unref(d->host_user);
+                group_record_unref(d->host_group);
+                user_record_unref(d->payload_user);
+                group_record_unref(d->payload_group);
+        }
+
+        /* The array is a separate heap allocation (grown with GREEDY_REALLOC() in
+         * machine_bind_user_prepare()), hence it must be released explicitly or it leaks. */
+        free(c->data);
+
+        return mfree(c);
+}
+
+/* Resolves the host users listed in 'bind_user' and prepares the records needed to make them available
+ * inside a container or machine.
+ *
+ * directory:            root directory whose /etc/passwd and /etc/group are consulted for collisions
+ * bind_user:            list of host user names to map; may be NULL or empty
+ * bind_user_shell:      shell to set in the generated user records
+ * bind_user_shell_copy: if true, each host user's own shell is used instead of bind_user_shell
+ * ret:                  receives a newly allocated MachineBindUserContext, or NULL if there is nothing
+ *                       to map
+ *
+ * Returns 0 if 'bind_user' is empty, 1 if a context was allocated, and a negative errno-style value on
+ * failure. */
+int machine_bind_user_prepare(
+                const char *directory,
+                char **bind_user,
+                const char *bind_user_shell,
+                bool bind_user_shell_copy,
+                MachineBindUserContext **ret) {
+
+        _cleanup_(machine_bind_user_context_freep) MachineBindUserContext *c = NULL;
+        uid_t current_uid = MAP_UID_MIN;
+        int r;
+
+        assert(ret);
+
+        /* This resolves the users specified in 'bind_user', generates a minimalized JSON user + group record
+         * for each of them to stick in the container, and allocates a UID/GID for it from the
+         * MAP_UID_MIN…MAP_UID_MAX range.
+         *
+         * The host and payload records are collected in a newly allocated MachineBindUserContext. Nothing
+         * is mounted here; that is left to the caller. */
+
+        if (strv_isempty(bind_user)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        c = new0(MachineBindUserContext, 1);
+        if (!c)
+                return log_oom();
+
+        STRV_FOREACH(n, bind_user) {
+                _cleanup_(user_record_unrefp) UserRecord *u = NULL, *cu = NULL;
+                _cleanup_(group_record_unrefp) GroupRecord *g = NULL, *cg = NULL;
+
+                r = userdb_by_name(*n, /* match= */ NULL, USERDB_DONT_SYNTHESIZE_INTRINSIC|USERDB_DONT_SYNTHESIZE_FOREIGN, &u);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to resolve user '%s': %m", *n);
+
+                /* For now, let's refuse mapping the root/nobody users explicitly. The records we generate
+                 * are strictly additive, nss-systemd is typically placed last in /etc/nsswitch.conf. Thus
+                 * even if we wanted, we couldn't override the root or nobody user records. Note we also
+                 * check for name conflicts in /etc/passwd + /etc/group later on, which would usually filter
+                 * out root/nobody too, hence these checks might appear redundant — but they actually are
+                 * not, as we want to support environments where /etc/passwd and /etc/group are non-existent,
+                 * and the user/group databases fully synthesized at runtime. Moreover, the name of the
+                 * user/group name of the "nobody" account differs between distros, hence a check by numeric
+                 * UID is safer. */
+                if (user_record_is_root(u))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'root' user not supported, sorry.");
+
+                if (user_record_is_nobody(u))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'nobody' user not supported, sorry.");
+
+                if (!uid_is_valid(u->uid))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot bind user with no UID, refusing.");
+
+                r = groupdb_by_gid(user_record_gid(u), /* match= */ NULL, USERDB_DONT_SYNTHESIZE_INTRINSIC|USERDB_DONT_SYNTHESIZE_FOREIGN, &g);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to resolve group of user '%s': %m", u->user_name);
+
+                /* We want to synthesize exactly one user + group from the host into the container. This only
+                 * makes sense if the user on the host has its own private group. We can't reasonably check
+                 * this, so we just check if the name of user and group match.
+                 *
+                 * One of these days we might want to support users in a shared/common group too, but it's
+                 * not clear to me how this would have to be mapped, precisely given that the common group
+                 * probably already exists in the container. */
+                if (!streq(u->user_name, g->group_name))
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Sorry, mapping users without private groups is currently not supported.");
+
+                /* Pick the next ID that is unused both as UID and as GID in the container */
+                r = find_free_uid(directory, &current_uid);
+                if (r < 0)
+                        return r;
+
+                r = convert_user(directory, u, g, current_uid, bind_user_shell, bind_user_shell_copy, &cu, &cg);
+                if (r < 0)
+                        return r;
+
+                if (!GREEDY_REALLOC(c->data, c->n_data + 1))
+                        return log_oom();
+
+                /* The context takes over the references on all four records */
+                c->data[c->n_data++] = (MachineBindUserData) {
+                        .host_user = TAKE_PTR(u),
+                        .host_group = TAKE_PTR(g),
+                        .payload_user = TAKE_PTR(cu),
+                        .payload_group = TAKE_PTR(cg),
+                };
+
+                /* The ID just handed out is taken now, continue the search after it for the next user */
+                current_uid++;
+        }
+
+        *ret = TAKE_PTR(c);
+        return 1;
+}
diff --git a/src/shared/machine-bind-user.h b/src/shared/machine-bind-user.h
new file mode 100644 (file)
index 0000000..c0a74a7
--- /dev/null
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "forward.h"
+
+/* One bound user: the user/group records as resolved on the host, paired with the records generated for
+ * use inside the container. */
+typedef struct MachineBindUserData {
+        /* The host's user/group records */
+        UserRecord *host_user;
+        GroupRecord *host_group;
+
+        /* The mapped records to place into the container */
+        UserRecord *payload_user;
+        GroupRecord *payload_group;
+} MachineBindUserData;
+
+/* Collection of all bound users, as produced by machine_bind_user_prepare() */
+typedef struct MachineBindUserContext {
+        MachineBindUserData *data; /* array of n_data entries, one per bound user */
+        size_t n_data;             /* number of valid entries in 'data' */
+} MachineBindUserContext;
+
+/* Drops the references on all records stored in the context and frees the context. Accepts NULL. */
+MachineBindUserContext* machine_bind_user_context_free(MachineBindUserContext *c);
+
+/* Provides machine_bind_user_context_freep() for use with _cleanup_() */
+DEFINE_TRIVIAL_CLEANUP_FUNC(MachineBindUserContext*, machine_bind_user_context_free);
+
+/* Resolves the host users listed in 'bind_user' and generates matching user/group records for use in the
+ * container rooted at 'directory'. Returns 0 and sets *ret to NULL if 'bind_user' is empty, returns 1 and
+ * sets *ret to a newly allocated context on success, and returns a negative errno-style value on
+ * failure. */
+int machine_bind_user_prepare(
+                const char *directory,
+                char **bind_user,
+                const char *bind_user_shell,
+                bool bind_user_shell_copy,
+                MachineBindUserContext **ret);
index 2a49a5e9b88cc72c81690a405803f7883177a16d..c3eca33dd7196518e1114c9bae66869868025530 100644 (file)
@@ -119,6 +119,7 @@ shared_sources = files(
         'loop-util.c',
         'loopback-setup.c',
         'lsm-util.c',
+        'machine-bind-user.c',
         'machine-credential.c',
         'machine-id-setup.c',
         'machine-pool.c',