]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: add support for 'managed' userns mode even when we run privileged
authorLennart Poettering <lennart@poettering.net>
Mon, 25 Nov 2024 11:20:13 +0000 (12:20 +0100)
committerLennart Poettering <lennart@poettering.net>
Thu, 23 Jan 2025 20:48:02 +0000 (21:48 +0100)
So far, we supported two modes:

1. when running unpriv we'd get the mounts from mountfsd, and the userns
   from nsresourced
2. when running priv we'd do the mounts/userns ourselves

This untangles this a bit, so that we can also use mountfsd/nsresourced
when running privileged.

I think this is generally a bit nicer, and probably something we should
switch to entirely one day, as it reduces the variety of codepaths.

With this patch the default behaviour remains unchanged, but by
selecting the new "managed" option for --private-users= the codepaths
via mountfsd/nsresourced can be explicitly requested even when running
with privs.

This mostly just reworks a number of codepaths so that they check for
arg_userns_mode != USER_NAMESPACE_MANAGED rather than arg_privileged,
but it requires more fixes, too. The devil is in the details.

shell-completion/bash/systemd-nspawn
src/nspawn/nspawn-cgroup.c
src/nspawn/nspawn-cgroup.h
src/nspawn/nspawn-mount.c
src/nspawn/nspawn-mount.h
src/nspawn/nspawn-settings.h
src/nspawn/nspawn.c

index 0a1761d110e2ba1f1d676bc8613c5cd992391087..e1829287f456169c51fef828adb04d05c27d3754 100644 (file)
@@ -157,7 +157,7 @@ _systemd_nspawn() {
                 comps='yes no'
                 ;;
             --private-users)
-                comps='yes no pick'
+                comps='yes no pick identity managed'
                 ;;
             --network-namespace-path)
                 comps=$( compgen -A file -- "$cur" )
index 6d6a8a814b185a96fa48c1d698c931418071e77b..4ee21c07790854ffc2256845056c833eb0f832d5 100644 (file)
@@ -119,7 +119,7 @@ int create_subcgroup(
                 CGroupUnified unified_requested,
                 uid_t uid_shift,
                 int userns_fd,
-                bool privileged) {
+                UserNamespaceMode userns_mode) {
 
         _cleanup_free_ char *cgroup = NULL, *payload = NULL;
         CGroupMask supported;
@@ -163,14 +163,14 @@ int create_subcgroup(
         if (!payload)
                 return log_oom();
 
-        if (privileged)
+        if (userns_mode != USER_NAMESPACE_MANAGED)
                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
         else
                 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, payload);
         if (r < 0)
                 return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
 
-        if (privileged) {
+        if (userns_mode != USER_NAMESPACE_MANAGED) {
                 _cleanup_free_ char *fs = NULL;
                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, payload, NULL, &fs);
                 if (r < 0)
index 7e2cd53ddccc56b0143be896bb352cdfd6ae2204..8f039ffb283f675c1138506f9779c08a2efe323f 100644 (file)
@@ -5,9 +5,10 @@
 #include <sys/types.h>
 
 #include "cgroup-util.h"
+#include "nspawn-settings.h"
 
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
-int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested, uid_t uid_shift, int userns_fd, bool privileged);
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested, uid_t uid_shift, int userns_fd, UserNamespaceMode userns_mode);
 
 int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
 int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
index 0f1fff375ce00fbc68e185edd7a739af35c78041..552d629a188efe4e1b92ab389f34c0688467dd93 100644 (file)
@@ -590,11 +590,11 @@ int mount_all(const char *dest,
                 { "tmpfs",                  "/tmp",                         "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
                   MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR|MOUNT_USRQUOTA_GRACEFUL },
                 { "tmpfs",                  "/sys",                         "tmpfs", "mode=0555" TMPFS_LIMITS_SYS,     MS_NOSUID|MS_NOEXEC|MS_NODEV,
-                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_PRIVILEGED },
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_UNMANAGED },
                 { "sysfs",                  "/sys",                         "sysfs", NULL,                             SYS_DEFAULT_MOUNT_FLAGS,
-                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_PRIVILEGED },    /* skipped if above was mounted */
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_UNMANAGED },    /* skipped if above was mounted */
                 { "sysfs",                  "/sys",                         "sysfs", NULL,                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
-                  MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PRIVILEGED },                          /* skipped if above was mounted */
+                  MOUNT_FATAL|MOUNT_MKDIR|MOUNT_UNMANAGED },                          /* skipped if above was mounted */
                 { "tmpfs",                  "/dev",                         "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME,
                   MOUNT_FATAL|MOUNT_MKDIR },
                 { "tmpfs",                  "/dev/shm",                     "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
@@ -617,9 +617,9 @@ int mount_all(const char *dest,
                 { "/sys/fs/selinux",        "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND,
                   MOUNT_MKDIR|MOUNT_PRIVILEGED },  /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
                 { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
-                  MOUNT_PRIVILEGED },              /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
+                  MOUNT_UNMANAGED|MOUNT_PRIVILEGED },  /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
                 { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_PRIVATE,
-                  MOUNT_PRIVILEGED },              /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
+                  MOUNT_UNMANAGED|MOUNT_PRIVILEGED },  /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
 #endif
         };
 
@@ -628,6 +628,7 @@ int mount_all(const char *dest,
         bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO);
         bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS);
         bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP);
+        bool unmanaged = FLAGS_SET(mount_settings, MOUNT_UNMANAGED);
         bool privileged = FLAGS_SET(mount_settings, MOUNT_PRIVILEGED);
         int r;
 
@@ -636,8 +637,9 @@ int mount_all(const char *dest,
                 bool fatal = FLAGS_SET(m->mount_settings, MOUNT_FATAL);
                 const char *o;
 
-                /* If we are not privileged but the entry is marked as privileged and to be mounted outside the user namespace, then skip it */
-                if (!privileged && FLAGS_SET(m->mount_settings, MOUNT_PRIVILEGED) && !FLAGS_SET(m->mount_settings, MOUNT_IN_USERNS))
+                /* If we are in managed user namespace mode but the entry is marked for mount outside of
+                 * managed user namespace mode, and to be mounted outside the user namespace, then skip it */
+                if (!unmanaged && FLAGS_SET(m->mount_settings, MOUNT_UNMANAGED) && !FLAGS_SET(m->mount_settings, MOUNT_IN_USERNS))
                         continue;
 
                 if (in_userns != FLAGS_SET(m->mount_settings, MOUNT_IN_USERNS))
@@ -652,6 +654,9 @@ int mount_all(const char *dest,
                 if (!tmpfs_tmp && FLAGS_SET(m->mount_settings, MOUNT_APPLY_TMPFS_TMP))
                         continue;
 
+                if (!privileged && FLAGS_SET(m->mount_settings, MOUNT_PRIVILEGED))
+                        continue;
+
                 r = chase(m->where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to resolve %s%s: %m", strempty(dest), m->where);
index 1f54b5162521e5aa3b161fba48aafba62b01f6e0..87b3b91c432a69e27aa7c0b835cb2ba11a6d5812 100644 (file)
@@ -20,8 +20,9 @@ typedef enum MountSettingsMask {
         MOUNT_TOUCH              = 1 << 9, /* if set, touch file to mount over first */
         MOUNT_PREFIX_ROOT        = 1 << 10,/* if set, prefix the source path with the container's root directory */
         MOUNT_FOLLOW_SYMLINKS    = 1 << 11,/* if set, we'll follow symlinks for the mount target */
-        MOUNT_PRIVILEGED         = 1 << 12,/* if set, we'll only mount this in the outer child if we are running in privileged mode */
-        MOUNT_USRQUOTA_GRACEFUL  = 1 << 13,/* if set, append "usrquota" to mount options if kernel tmpfs supports that */
+        MOUNT_UNMANAGED          = 1 << 12,/* if set, we'll only mount this in the outer child if we are not running in managed user namespace mode */
+        MOUNT_PRIVILEGED         = 1 << 13,/* if set, we'll only mount this if we have full privileges */
+        MOUNT_USRQUOTA_GRACEFUL  = 1 << 14,/* if set, append "usrquota" to mount options if kernel tmpfs supports that */
 } MountSettingsMask;
 
 typedef enum CustomMountType {
index 767057eeb40aa93b0f5394125848d70c85f8aa35..0b3050639160f7ae70cc8bb2a9c50e59a08a6d23 100644 (file)
@@ -29,6 +29,7 @@ typedef enum UserNamespaceMode {
         USER_NAMESPACE_NO,
         USER_NAMESPACE_FIXED,
         USER_NAMESPACE_PICK,
+        USER_NAMESPACE_MANAGED,
         _USER_NAMESPACE_MODE_MAX,
         _USER_NAMESPACE_MODE_INVALID = -EINVAL,
 } UserNamespaceMode;
index 929342c22852af0f60b6079eb9917194d3a58dda..4c054b2dbb85056193f052f4c723a2505f59333e 100644 (file)
@@ -140,7 +140,7 @@ static char *arg_hostname = NULL;    /* The name the payload sees by default */
 static const char *arg_selinux_context = NULL;
 static const char *arg_selinux_apifs_context = NULL;
 static char *arg_slice = NULL;
-static bool arg_private_network = false;
+static bool arg_private_network; /* initialized depending on arg_privileged in run() */
 static bool arg_read_only = false;
 static StartMode arg_start_mode = START_PID1;
 static bool arg_ephemeral = false;
@@ -198,7 +198,7 @@ static VolatileMode arg_volatile_mode = VOLATILE_NO;
 static ExposePort *arg_expose_ports = NULL;
 static char **arg_property = NULL;
 static sd_bus_message *arg_property_message = NULL;
-static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
+static UserNamespaceMode arg_userns_mode; /* initialized depending on arg_privileged in run() */
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
 static int arg_kill_signal = 0;
@@ -370,7 +370,7 @@ static int help(void) {
                "                            the service unit nspawn is running in\n"
                "\n%3$sUser Namespacing:%4$s\n"
                "     --private-users=no     Run without user namespacing\n"
-               "     --private-users=yes|pick|identity\n"
+               "     --private-users=yes|pick|identity|managed\n"
                "                            Run within user namespace, autoselect UID/GID range\n"
                "     --private-users=UIDBASE[:NUIDS]\n"
                "                            Similar, but with user configured UID/GID range\n"
@@ -519,7 +519,7 @@ static int detect_unified_cgroup_hierarchy_from_environment(void) {
 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
         int r;
 
-        if (!arg_privileged) {
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
                 /* We only support the unified mode when running unprivileged */
                 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
                 return 0;
@@ -1258,6 +1258,11 @@ static int parse_argv(int argc, char *argv[]) {
                                 arg_userns_mode = USER_NAMESPACE_FIXED;
                                 arg_uid_shift = 0;
                                 arg_uid_range = UINT32_C(0x10000);
+                        } else if (streq(optarg, "managed")) {
+                                /* managed: User namespace on, and acquire it from systemd-nsresourced */
+                                arg_userns_mode = USER_NAMESPACE_MANAGED;
+                                arg_uid_shift = UID_INVALID;
+                                arg_uid_range = UINT32_C(0x10000);
                         } else {
                                 /* anything else: User namespacing on, UID range is explicitly configured */
                                 r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range);
@@ -1272,9 +1277,8 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case 'U':
                         if (userns_supported()) {
-                                arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
-                                                                        * implied by USER_NAMESPACE_PICK
-                                                                        * further down. */
+                                /* Note that arg_userns_ownership is implied by USER_NAMESPACE_PICK further down. */
+                                arg_userns_mode = arg_privileged ? USER_NAMESPACE_PICK : USER_NAMESPACE_MANAGED;
                                 arg_uid_shift = UID_INVALID;
                                 arg_uid_range = UINT32_C(0x10000);
 
@@ -1657,14 +1661,23 @@ static int parse_argv(int argc, char *argv[]) {
 static int verify_arguments(void) {
         int r;
 
-        SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged);
+        SET_FLAG(arg_mount_settings, MOUNT_UNMANAGED, arg_userns_mode != USER_NAMESPACE_MANAGED);
 
-        if (!arg_privileged) {
-                if (!arg_private_network) {
-                        log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing.");
-                        arg_private_network = true;
-                }
-        }
+        /* We can mount selinuxfs only if we are privileged and can do so before userns. In managed mode we
+         * have to enter the userns earlier, hence cannot do that. */
+        /* SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged); */
+        SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_userns_mode != USER_NAMESPACE_MANAGED);
+
+        SET_FLAG(arg_mount_settings, MOUNT_USE_USERNS, arg_userns_mode != USER_NAMESPACE_NO);
+
+        if (arg_private_network)
+                SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, arg_private_network);
+
+        if (!arg_privileged && arg_userns_mode != USER_NAMESPACE_MANAGED)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unprivileged operation requires managed user namespaces, as otherwise no UID range can be acquired.");
+
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED && !arg_private_network)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Managed user namespace operation requires private networking, as otherwise /sys/ may not be mounted.");
 
         if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
                 /* If we are running the stub init in the container, we don't need to look at what the init
@@ -1685,12 +1698,6 @@ static int verify_arguments(void) {
                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
         }
 
-        if (arg_userns_mode != USER_NAMESPACE_NO)
-                arg_mount_settings |= MOUNT_USE_USERNS;
-
-        if (arg_private_network)
-                arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
-
         if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
             !(arg_clone_ns_flags & CLONE_NEWUTS)) {
                 arg_register = false;
@@ -1700,8 +1707,7 @@ static int verify_arguments(void) {
 
         if (arg_userns_ownership < 0)
                 arg_userns_ownership =
-                        arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
-                                                                 USER_NAMESPACE_OWNERSHIP_OFF;
+                        IN_SET(arg_userns_mode, USER_NAMESPACE_PICK, USER_NAMESPACE_MANAGED) ? USER_NAMESPACE_OWNERSHIP_AUTO : USER_NAMESPACE_OWNERSHIP_OFF;
 
         if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
                 arg_kill_signal = SIGRTMIN+3;
@@ -1810,10 +1816,18 @@ static int verify_network_interfaces_initialized(void) {
         return 0;
 }
 
+static int in_child_chown(void) {
+        /* Returns true when chown()ing inodes we create inside the outer child is required. Basically, we
+         * need the chowning when we implement userns ourselves (i.e. in the PICK and FIXED modes). If
+         * userns is off we don't need to chown(), obviously. And if we are in managed mode we already
+         * entered the userns, and hence don't need to manually chown either. */
+        return IN_SET(arg_userns_mode, USER_NAMESPACE_PICK, USER_NAMESPACE_FIXED);
+}
+
 static int userns_chown_at(int fd, const char *fname, uid_t uid, gid_t gid, int flags) {
         assert(fd >= 0 || fd == AT_FDCWD);
 
-        if (arg_userns_mode == USER_NAMESPACE_NO)
+        if (!in_child_chown())
                 return 0;
 
         if (uid == UID_INVALID && gid == GID_INVALID)
@@ -2296,18 +2310,24 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno
         if (r < 0)
                 return log_error_errno(r, "Failed to create directory %s: %m", parent);
 
-        if (mknod(to, st.st_mode, st.st_rdev) < 0) {
-                r = -errno; /* Save the original error code. */
+        r = RET_NERRNO(mknod(to, st.st_mode, st.st_rdev));
+        if (r < 0) {
                 /* Explicitly warn the user when /dev/ is already populated. */
                 if (r == -EEXIST)
                         log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);
+
                 /* If arg_uid_shift != 0, then we cannot fall back to use bind mount. */
-                if (arg_uid_shift != 0) {
+                if (!(arg_userns_mode == USER_NAMESPACE_NO ||
+                      (arg_userns_mode == USER_NAMESPACE_FIXED && arg_uid_shift == 0))) {
                         if (ignore_mknod_failure) {
                                 log_debug_errno(r, "Failed to mknod(%s), ignoring: %m", to);
                                 return 0;
                         }
-                        return log_error_errno(r, "Failed to mknod(%s): %m", to);
+
+                        if (arg_userns_mode != USER_NAMESPACE_MANAGED || !ERRNO_IS_NEG_PRIVILEGE(r))
+                                return log_error_errno(r, "Failed to mknod(%s): %m", to);
+
+                        log_debug_errno(r, "Failed to create device node '%s' and running in managed mode, resorting to bind mount: %m", to);
                 }
 
                 /* Some systems abusively restrict mknod but allow bind mounts. */
@@ -2403,7 +2423,7 @@ static int make_extra_nodes(const char *dest) {
         return 0;
 }
 
-static int setup_pts(const char *dest) {
+static int setup_pts(const char *dest, uid_t chown_uid) {
         _cleanup_free_ char *options = NULL;
         const char *p;
         int r;
@@ -2412,13 +2432,13 @@ static int setup_pts(const char *dest) {
         if (arg_selinux_apifs_context)
                 (void) asprintf(&options,
                                 "newinstance,ptmxmode=0666,mode=" STRINGIFY(TTY_MODE) ",gid=" GID_FMT ",context=\"%s\"",
-                                arg_uid_shift + TTY_GID,
+                                chown_uid + TTY_GID,
                                 arg_selinux_apifs_context);
         else
 #endif
                 (void) asprintf(&options,
                                 "newinstance,ptmxmode=0666,mode=" STRINGIFY(TTY_MODE) ",gid=" GID_FMT,
-                                arg_uid_shift + TTY_GID);
+                                chown_uid + TTY_GID);
 
         if (!options)
                 return log_oom();
@@ -2855,7 +2875,9 @@ static int reset_audit_loginuid(void) {
         if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
                 return 0;
 
-        if (!arg_privileged)
+        /* if we are in managed userns mode, then we are already in our userns, hence we cannot reset the
+         * loginuid anyway, hence don't bother */
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED)
                 return 0;
 
         r = read_virtual_file("/proc/self/loginuid", SIZE_MAX, &p, /* ret_size= */ NULL);
@@ -2887,8 +2909,8 @@ static int mount_tunnel_dig(const char *root) {
         const char *p, *q;
         int r;
 
-        if (!arg_privileged) {
-                log_debug("Not digging mount tunnel, because running unprivileged.");
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                log_debug("Not digging mount tunnel, because running in managed user namespace mode.");
                 return 0;
         }
 
@@ -2920,8 +2942,8 @@ static int mount_tunnel_dig(const char *root) {
 static int mount_tunnel_open(void) {
         int r;
 
-        if (!arg_privileged) {
-                log_debug("Not opening up mount tunnel, because running unprivileged.");
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                log_debug("Not opening up mount tunnel, because running in managed user namespace mode.");
                 return 0;
         }
 
@@ -3268,6 +3290,12 @@ static int chase_and_update(char **p, unsigned flags) {
 static int determine_uid_shift(const char *directory) {
         assert(directory);
 
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                /* In managed mode we should already know the UID shift */
+                assert(uid_is_valid(arg_uid_shift));
+                return 0;
+        }
+
         if (arg_userns_mode == USER_NAMESPACE_NO) {
                 arg_uid_shift = 0;
                 return 0;
@@ -3448,7 +3476,7 @@ static int inner_child(
         if (!arg_network_namespace_path && arg_private_network) {
                 _cleanup_close_ int netns_fd = -EBADF;
 
-                if (arg_privileged)
+                if (arg_userns_mode != USER_NAMESPACE_MANAGED)
                         if (unshare(CLONE_NEWNET) < 0)
                                 return log_error_errno(errno, "Failed to unshare network namespace: %m");
 
@@ -3464,8 +3492,8 @@ static int inner_child(
                 (void) barrier_place(barrier); /* #3 */
         }
 
-        if (arg_privileged) {
-                r = mount_sysfs(NULL, arg_mount_settings);
+        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
+                r = mount_sysfs(NULL, arg_mount_settings | MOUNT_IN_USERNS);
                 if (r < 0)
                         return r;
         }
@@ -3818,8 +3846,8 @@ static int setup_unix_export_dir_outside(char **ret) {
 
         assert(ret);
 
-        if (!arg_privileged) {
-                log_debug("Not digging socket tunnel, because running unprivileged.");
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                log_debug("Not digging socket tunnel, because running in managed user namespace mode.");
                 return 0;
         }
 
@@ -3875,7 +3903,7 @@ static int setup_unix_export_host_inside(const char *directory, const char *unix
 
         assert(directory);
 
-        if (!arg_privileged)
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED)
                 return 0;
 
         assert(unix_export_path);
@@ -3929,7 +3957,9 @@ static DissectImageFlags determine_dissect_image_flags(void) {
                 DISSECT_IMAGE_PIN_PARTITION_DEVICES |
                 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) |
                 DISSECT_IMAGE_ALLOW_USERSPACE_VERITY |
-                (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0);
+                (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0) |
+                ((arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_FOREIGN) ? DISSECT_IMAGE_FOREIGN_UID :
+                 (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO) ? DISSECT_IMAGE_IDENTITY_UID : 0);
 }
 
 static int outer_child(
@@ -3954,9 +3984,9 @@ static int outer_child(
         /* This is the "outer" child process, i.e the one forked off by the container manager itself.  Its
          * namespace situation is:
          *
-         *  - CLONE_NEWNS   : already has its own (created by clone() if arg_privileged, or unshare() if !arg_unprivileged)
-         *  - CLONE_NEWUSER : if  arg_privileged: still in the host's
-         *                    if !arg_privileged: already has its own (created by nsresource_allocate_userns()->setns(userns_fd))
+         *  - CLONE_NEWUSER : if not in USER_NAMESPACE_MANAGED mode: still in the host's
+         *                    if USER_NAMESPACE_MANAGED mode: already has its own (created by nsresource_allocate_userns()->setns(userns_fd))
+         *  - CLONE_NEWNS   : already has its own (created by clone() if not USER_NAMESPACE_MANAGED, or unshare() otherwise)
          *  - CLONE_NEWPID  : still in the host's
          *  - CLONE_NEWUTS  : still in the host's
          *  - CLONE_NEWIPC  : still in the host's
@@ -4035,6 +4065,18 @@ static int outer_child(
         if (r < 0)
                 return r;
 
+        /* If we do userns on our own, we need to chown() all files ourselves before. Otherwise, if userns is
+         * off or we are in managed mode we already have the userns applied, hence don't need to chown
+         * anything */
+        uid_t chown_uid, chown_range;
+        if (in_child_chown()) {
+                chown_uid = arg_uid_shift;
+                chown_range = arg_uid_range;
+        } else {
+                chown_uid = 0;
+                chown_range = UINT32_C(0x10000);
+        }
+
         if (arg_userns_mode != USER_NAMESPACE_NO) {
                 _cleanup_close_ int mntns_fd = -EBADF;
 
@@ -4066,6 +4108,9 @@ static int outer_child(
                         if (l != sizeof(arg_uid_shift))
                                 return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                                        "Short read while receiving UID shift.");
+
+                        if (in_child_chown())
+                                chown_uid = arg_uid_shift;
                 }
 
                 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
@@ -4090,7 +4135,7 @@ static int outer_child(
         r = setup_volatile_mode(
                         directory,
                         arg_volatile_mode,
-                        arg_uid_shift,
+                        chown_uid,
                         arg_selinux_apifs_context);
         if (r < 0)
                 return r;
@@ -4098,8 +4143,8 @@ static int outer_child(
         r = bind_user_prepare(
                         directory,
                         arg_bind_user,
-                        arg_uid_shift,
-                        arg_uid_range,
+                        chown_uid,
+                        chown_range,
                         &arg_custom_mounts, &arg_n_custom_mounts,
                         &bind_user_context);
         if (r < 0)
@@ -4130,16 +4175,16 @@ static int outer_child(
                         directory,
                         arg_custom_mounts,
                         arg_n_custom_mounts,
-                        arg_uid_shift,
-                        arg_uid_range,
+                        chown_uid,
+                        chown_range,
                         arg_selinux_apifs_context,
                         MOUNT_ROOT_ONLY);
         if (r < 0)
                 return r;
 
-        if (arg_userns_mode != USER_NAMESPACE_NO &&
+        if (!IN_SET(arg_userns_mode, USER_NAMESPACE_NO, USER_NAMESPACE_MANAGED) &&
             IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_FOREIGN, USER_NAMESPACE_OWNERSHIP_AUTO) &&
-            arg_uid_shift != 0) {
+            chown_uid != 0) {
                 _cleanup_strv_free_ char **dirs = NULL;
                 RemountIdmapping mapping;
 
@@ -4191,8 +4236,8 @@ static int outer_child(
 
                 r = remount_idmap(
                                 dirs,
-                                arg_uid_shift,
-                                arg_uid_range,
+                                chown_uid,
+                                chown_range,
                                 /* host_owner= */ UID_INVALID,
                                 /* dest_owner= */ UID_INVALID,
                                 mapping);
@@ -4217,7 +4262,7 @@ static int outer_child(
         r = setup_volatile_mode_after_remount_idmap(
                         directory,
                         arg_volatile_mode,
-                        arg_uid_shift,
+                        chown_uid,
                         arg_selinux_apifs_context);
         if (r < 0)
                 return r;
@@ -4227,8 +4272,8 @@ static int outer_child(
                 r = dissected_image_mount_and_warn(
                                 dissected_image,
                                 directory,
-                                arg_uid_shift,
-                                arg_uid_range,
+                                chown_uid,
+                                chown_range,
                                 /* userns_fd= */ -EBADF,
                                 determine_dissect_image_flags()|
                                 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
@@ -4252,11 +4297,11 @@ static int outer_child(
                                                "Short write while sending cgroup mode.");
         }
 
-        r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
+        r = recursive_chown(directory, chown_uid, chown_range);
         if (r < 0)
                 return r;
 
-        r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
+        r = base_filesystem_create(directory, chown_uid, (gid_t) chown_uid);
         if (r < 0)
                 return r;
 
@@ -4269,7 +4314,7 @@ static int outer_child(
 
         r = mount_all(directory,
                       arg_mount_settings,
-                      arg_uid_shift,
+                      chown_uid,
                       arg_selinux_apifs_context);
         if (r < 0)
                 return r;
@@ -4287,16 +4332,16 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
+        (void) dev_setup(directory, chown_uid, chown_uid);
 
         p = prefix_roota(directory, "/run/host");
-        (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
+        (void) make_inaccessible_nodes(p, chown_uid, chown_uid);
 
         r = setup_unix_export_host_inside(directory, unix_export_path);
         if (r < 0)
                 return r;
 
-        r = setup_pts(directory);
+        r = setup_pts(directory, chown_uid);
         if (r < 0)
                 return r;
 
@@ -4320,8 +4365,8 @@ static int outer_child(
                         directory,
                         arg_custom_mounts,
                         arg_n_custom_mounts,
-                        arg_uid_shift,
-                        arg_uid_range,
+                        chown_uid,
+                        chown_range,
                         arg_selinux_apifs_context,
                         MOUNT_NON_ROOT_ONLY);
         if (r < 0)
@@ -4356,8 +4401,8 @@ static int outer_child(
                                 directory,
                                 arg_unified_cgroup_hierarchy,
                                 arg_userns_mode != USER_NAMESPACE_NO,
-                                arg_uid_shift,
-                                arg_uid_range,
+                                chown_uid,
+                                chown_range,
                                 arg_selinux_apifs_context,
                                 false);
                 if (r < 0)
@@ -4373,7 +4418,7 @@ static int outer_child(
          * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */
 
         _cleanup_close_ int notify_fd = -EBADF;
-        if (arg_privileged) {
+        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 /* Mark everything as shared so our mounts get propagated down. This is required to make new
                  * bind mounts available in systemd services inside the container that create a new mount
                  * namespace.  See https://github.com/systemd/systemd/issues/3860 Further submounts (such as
@@ -4416,8 +4461,8 @@ static int outer_child(
 
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                         arg_clone_ns_flags |
-                        (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) |
-                        ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0));
+                        (IN_SET(arg_userns_mode, USER_NAMESPACE_FIXED, USER_NAMESPACE_PICK) ? CLONE_NEWUSER : 0) |
+                        ((arg_private_network && arg_userns_mode == USER_NAMESPACE_MANAGED) ? CLONE_NEWNET : 0));
         if (pid < 0)
                 return log_error_errno(errno, "Failed to fork inner child: %m");
         if (pid == 0) {
@@ -4436,9 +4481,10 @@ static int outer_child(
                                 return log_error_errno(r, "Failed to join network namespace: %m");
                 }
 
-                if (!arg_privileged) {
-                        /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them
-                         * inside the inner namespaces, but before we switch root. Hence do so here. */
+                if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                        /* In managed usernamespace operation, sysfs + procfs are special, we'll have to
+                         * mount them inside the inner namespaces, but before we switch root. Hence do so
+                         * here. */
                         _cleanup_free_ char *j = path_join(directory, "/proc");
                         if (!j)
                                 return log_oom();
@@ -5290,9 +5336,8 @@ static int run_container(
                                                "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
         }
 
-        if (arg_privileged) {
+        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 assert(userns_fd < 0);
-
                 /* If we have no user namespace then we'll clone and create a new mount namespace right-away. */
 
                 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
@@ -5302,7 +5347,6 @@ static int run_container(
                                                ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
         } else {
                 assert(userns_fd >= 0);
-
                 /* If we have a user namespace then we'll clone() first, and then join the user namespace,
                  * and then open the mount namespace, so that it is owned by the user namespace */
 
@@ -5459,9 +5503,11 @@ static int run_container(
                 if (!barrier_place_and_sync(&barrier)) /* #1 */
                         return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
-                r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
-                if (r < 0)
-                        return r;
+                if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
+                        r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
+                        if (r < 0)
+                                return r;
+                }
 
                 (void) barrier_place(&barrier); /* #2 */
         }
@@ -5485,7 +5531,7 @@ static int run_container(
                         return r;
 
                 if (arg_network_veth) {
-                        if (arg_privileged) {
+                        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                                 r = setup_veth(arg_machine, *pid, veth_name,
                                                arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
                                 if (r < 0)
@@ -5623,7 +5669,7 @@ static int run_container(
                         arg_unified_cgroup_hierarchy,
                         arg_uid_shift,
                         userns_fd,
-                        arg_privileged);
+                        arg_userns_mode);
         if (r < 0)
                 return r;
 
@@ -5665,7 +5711,7 @@ static int run_container(
         if (!barrier_sync(&barrier)) /* #5.1 */
                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
-        if (arg_userns_mode != USER_NAMESPACE_NO) {
+        if (!IN_SET(arg_userns_mode, USER_NAMESPACE_NO, USER_NAMESPACE_MANAGED)) {
                 r = wipe_fully_visible_api_fs(mntns_fd);
                 if (r < 0)
                         return r;
@@ -5792,7 +5838,7 @@ static int run_container(
 
         fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
 
-        if (arg_private_network && arg_privileged) {
+        if (arg_private_network && arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces);
                 if (r < 0)
                         return r;
@@ -5957,6 +6003,16 @@ static int cant_be_in_netns(void) {
         return 0;
 }
 
+static void initialize_defaults(void) {
+        arg_privileged = getuid() == 0; /* "privileged" here means: our real UID is 0 */
+
+        /* Default user namespace mode, set before parse_argv() runs: none if privileged, otherwise managed (i.e. systemd-nsresourced operation) */
+        arg_userns_mode = arg_privileged ? USER_NAMESPACE_NO : USER_NAMESPACE_MANAGED;
+
+        /* Default to private networking for unprivileged operation, since the kernel otherwise refuses mounting sysfs */
+        arg_private_network = !arg_privileged;
+}
+
 static int run(int argc, char *argv[]) {
         bool remove_directory = false, remove_image = false, veth_created = false;
         _cleanup_close_ int master = -EBADF, userns_fd = -EBADF, mount_fd = -EBADF;
@@ -5973,7 +6029,7 @@ static int run(int argc, char *argv[]) {
 
         log_setup();
 
-        arg_privileged = getuid() == 0;
+        initialize_defaults();
 
         r = parse_argv(argc, argv);
         if (r <= 0)
@@ -6030,14 +6086,14 @@ static int run(int argc, char *argv[]) {
         /* Reapply environment settings. */
         (void) detect_unified_cgroup_hierarchy_from_environment();
 
-        if (!arg_privileged) {
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
                 r = cg_all_unified();
                 if (r < 0) {
                         log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m");
                         goto finish;
                 }
                 if (r == 0)
-                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode.");
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Managed user namespace operation only supported in unified cgroupv2 mode.");
         }
 
         /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
@@ -6066,8 +6122,8 @@ static int run(int argc, char *argv[]) {
         if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
                 arg_quiet = true;
 
-        if (!arg_privileged) {
-                /* if we are unprivileged, let's allocate a 64K userns first */
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                /* Let's allocate a 64K userns first, if managed mode is chosen */
 
                 _cleanup_free_ char *userns_name = NULL;
                 if (asprintf(&userns_name, "nspawn-" PID_FMT "-%s", getpid_cached(), arg_machine) < 0) {
@@ -6080,6 +6136,14 @@ static int run(int argc, char *argv[]) {
                         r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
                         goto finish;
                 }
+
+                r = userns_get_base_uid(userns_fd, &arg_uid_shift, /* ret_gid= */ NULL);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to determine UID shift from userns: %m");
+                        goto finish;
+                }
+
+                arg_uid_range = UINT32_C(0x10000);
         }
 
         if (arg_directory) {
@@ -6254,7 +6318,7 @@ static int run(int argc, char *argv[]) {
                         }
                 }
 
-                if (!arg_privileged) {
+                if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
                         r = mountfsd_mount_directory(
                                         arg_directory,
                                         userns_fd,
@@ -6337,7 +6401,7 @@ static int run(int argc, char *argv[]) {
                                 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
                 }
 
-                if (arg_privileged) {
+                if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                         r = loop_device_make_by_path(
                                         arg_image,
                                         arg_read_only ? O_RDONLY : O_RDWR,
@@ -6493,7 +6557,7 @@ finish:
                         log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
         }
 
-        if (arg_machine && arg_privileged) {
+        if (arg_machine && arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 const char *p;
 
                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
@@ -6507,7 +6571,7 @@ finish:
         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET,  &expose_args.address4);
         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
 
-        if (arg_privileged) {
+        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 if (veth_created)
                         (void) remove_veth_links(veth_name, arg_network_veth_extra);
                 (void) remove_bridge(arg_network_zone);