]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: Add support for delegated UID ranges 40415/head
authorDaan De Meyer <daan@amutable.com>
Fri, 20 Feb 2026 13:16:48 +0000 (14:16 +0100)
committerDaan De Meyer <daan@amutable.com>
Wed, 25 Feb 2026 10:31:06 +0000 (11:31 +0100)
We expose this via --private-users-delegate= which takes the number of
ranges to delegate. On top of delegating the ranges, we also mount in
the nsresourced socket and the mountfsd socket so that nested containers
can use nsresourced to allocate from the delegated ranges and mountfsd to
mount images.

Finally, we also create /run/systemd/dissect-root with systemd-tmpfiles to
make sure it is always available, as unprivileged users won't be able to
create it themselves.

man/systemd-nspawn.xml
man/systemd.nspawn.xml
src/nspawn/nspawn-gperf.gperf
src/nspawn/nspawn-settings.h
src/nspawn/nspawn.c
src/shared/nsresource.c
src/shared/nsresource.h
tmpfiles.d/systemd-nspawn.conf
tmpfiles.d/systemd.conf.in
units/systemd-mountfsd.socket
units/systemd-nsresourced.socket

index da598d090c6a3d2ae2eaff6a8c17173c1ee0f705..99e6147b2b1421a1dc27bee8aabf06c74e3900da 100644 (file)
         <xi:include href="version-info.xml" xpointer="v230"/></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><option>--private-users-delegate=</option></term>
+
+        <listitem><para>Takes a non-negative integer. Requests that the specified number of additional 64K
+        UID/GID ranges are delegated into the container's user namespace. These delegated ranges are mapped
+        1:1 (i.e. the same UID/GID values are used inside and outside the user namespace) and can be used by
+        nested containers to allocate their own transient UID/GID ranges via
+        <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>.</para>
+
+        <para>This option requires <option>--private-users=managed</option>, as the delegation is performed
+        by
+        <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+        as part of the user namespace allocation. The maximum number of delegated ranges is 16. Defaults to
+        0, i.e. no delegation.</para>
+
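+        <para>When this option is used with a non-zero value, the Varlink sockets of
+        <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+        and
+        <citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+        are automatically bind-mounted into the container as
+        <filename>/run/host/io.systemd.NamespaceResource</filename> and
+        <filename>/run/host/io.systemd.MountFileSystem</filename>, from where
+        <command>systemd-tmpfiles</command> inside the container symlinks them to their canonical locations
+        below <filename>/run/systemd/</filename>, along with the necessary discovery symlinks in
+        <filename>/run/systemd/userdb/</filename> and <filename>/run/varlink/registry/</filename>. This
+        allows processes inside the container to contact
+        <command>systemd-nsresourced</command> on the host in order to allocate nested user namespaces from
+        the delegated ranges, and <command>systemd-mountfsd</command> in order to mount disk images.</para>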
+
+        <xi:include href="version-info.xml" xpointer="v260"/></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><option>-U</option></term>
 
index 6492a8911aa0111788f254be37773ede7d19ac0f..bf9526df8069f71805ed582d33eb5ae7b2d0f51c 100644 (file)
         <xi:include href="version-info.xml" xpointer="v230"/></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>PrivateUsersDelegate=</varname></term>
+
+        <listitem><para>Takes a non-negative integer. Configures delegation of additional 64K UID/GID ranges
+        into the container's user namespace for use by nested containers. When set to a value greater than
+        zero, the
+        <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+        Varlink socket will be bind-mounted into the container so that processes inside the container can
+        allocate further user namespaces from the delegated ranges. This is equivalent to the
+        <option>--private-users-delegate=</option> command line switch. Requires
+        <varname>PrivateUsers=managed</varname>. Defaults to 0. See
+        <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
+        for details.</para>
+
+        <xi:include href="version-info.xml" xpointer="v260"/></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>NotifyReady=</varname></term>
 
index a44a13ef29e4e4700394fe31ce3d40ea6ff58918..cdad70706e6056ff456ceabe5a684773671cf89c 100644 (file)
@@ -34,6 +34,7 @@ Exec.MachineID,               config_parse_id128,              0,
 Exec.WorkingDirectory,        config_parse_path,               0,                        offsetof(Settings, working_directory)
 Exec.PivotRoot,               config_parse_pivot_root,         0,                        0
 Exec.PrivateUsers,            config_parse_private_users,      0,                        0
+Exec.PrivateUsersDelegate,    config_parse_unsigned,           0,                        offsetof(Settings, delegate_container_ranges)
 Exec.NotifyReady,             config_parse_tristate,           0,                        offsetof(Settings, notify_ready)
 Exec.SystemCallFilter,        config_parse_syscall_filter,     0,                        0
 Exec.LimitCPU,                config_parse_rlimit,             RLIMIT_CPU,               offsetof(Settings, rlimit)
index 197c6e2f79a89d12675dbfa9f843af765ef6d86b..84c342b83c1ebf261bbcc7b26775ab89df3d5bdf 100644 (file)
@@ -175,6 +175,7 @@ typedef struct Settings {
         char *pivot_root_old;
         UserNamespaceMode userns_mode;
         uid_t uid_shift, uid_range;
+        unsigned delegate_container_ranges;
         int notify_ready;
         char **syscall_allow_list;
         char **syscall_deny_list;
index 08afa171ae886fb9a26c3cdb13a8b939dc080836..722be8bbf7cb69b15ba3beb8103dbfc0b3046d64 100644 (file)
@@ -214,6 +214,7 @@ static char **arg_property = NULL;
 static sd_bus_message *arg_property_message = NULL;
 static UserNamespaceMode arg_userns_mode; /* initialized depending on arg_privileged in run() */
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
+static unsigned arg_delegate_container_ranges = 0;
 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
 static int arg_kill_signal = 0;
 static SettingsMask arg_settings_mask = 0;
@@ -430,6 +431,10 @@ static int help(void) {
                "     --private-users-ownership=MODE\n"
                "                            Adjust ('chown') or map ('map') OS tree ownership\n"
                "                            to private UID/GID range\n"
+               "     --private-users-delegate=N\n"
+               "                            Delegate N additional 64K UID/GID ranges for use\n"
+               "                            by nested containers (requires managed user\n"
+               "                            namespaces)\n"
                "  -U                        Equivalent to --private-users=pick and\n"
                "                            --private-users-ownership=auto\n"
                "\n%3$sNetworking:%4$s\n"
@@ -710,6 +715,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_TEMPLATE,
                 ARG_PROPERTY,
                 ARG_PRIVATE_USERS,
+                ARG_PRIVATE_USERS_DELEGATE,
                 ARG_KILL_SIGNAL,
                 ARG_SETTINGS,
                 ARG_CHDIR,
@@ -794,6 +800,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "private-users",          optional_argument, NULL, ARG_PRIVATE_USERS          },
                 { "private-users-chown",    optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN    }, /* obsolete */
                 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
+                { "private-users-delegate", required_argument, NULL, ARG_PRIVATE_USERS_DELEGATE },
                 { "kill-signal",            required_argument, NULL, ARG_KILL_SIGNAL            },
                 { "settings",               required_argument, NULL, ARG_SETTINGS               },
                 { "chdir",                  required_argument, NULL, ARG_CHDIR                  },
@@ -1249,6 +1256,14 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_settings_mask |= SETTING_USERNS;
                         break;
 
+                case ARG_PRIVATE_USERS_DELEGATE:
+                        r = safe_atou(optarg, &arg_delegate_container_ranges);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --private-users-delegate= parameter: %s", optarg);
+
+                        arg_settings_mask |= SETTING_USERNS;
+                        break;
+
                 case ARG_KILL_SIGNAL:
                         if (streq(optarg, "help"))
                                 return DUMP_STRING_TABLE(signal, int, _NSIG);
@@ -1648,6 +1663,9 @@ static int verify_arguments(void) {
         if (arg_userns_mode == USER_NAMESPACE_MANAGED && !arg_private_network)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Managed user namespace operation requires private networking, as otherwise /sys/ may not be mounted.");
 
+        if (arg_delegate_container_ranges > 0 && arg_userns_mode != USER_NAMESPACE_MANAGED)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--private-users-delegate= requires --private-users=managed.");
+
         if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
             !(arg_clone_ns_flags & CLONE_NEWUTS)) {
                 arg_register = false;
@@ -2881,6 +2899,43 @@ static int setup_machine_id(const char *directory) {
         return 0;
 }
 
+/* Exposes one of the host's Varlink sockets to the container payload by bind mounting
+ * /run/systemd/<name> from the host onto /run/host/<name> below 'directory'.
+ *
+ * Does nothing and returns 0 unless delegation of UID/GID ranges was requested, i.e. unless
+ * arg_delegate_container_ranges is non-zero. Otherwise returns the result of the bind mount, or a
+ * negative errno-style value if one of the preparation steps fails. */
+static int setup_varlink_socket(const char *directory, const char *name) {
+        int r;
+
+        assert(directory);
+        assert(name); /* Joined into both paths below; insist on it just like on 'directory'. */
+
+        if (arg_delegate_container_ranges == 0)
+                return 0;
+
+        r = make_run_host(directory);
+        if (r < 0)
+                return r;
+
+        _cleanup_free_ char *src = path_join("/run/systemd", name);
+        if (!src)
+                return log_oom();
+
+        _cleanup_free_ char *dest = path_join(directory, "/run/host", name);
+        if (!dest)
+                return log_oom();
+
+        /* Create the inode that serves as the bind mount target. */
+        r = touch(dest);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s: %m", dest);
+
+        /* Presumably hands the mount point to the container's root user (0:0 as seen from inside the user
+         * namespace) — confirm against userns_lchown(). */
+        r = userns_lchown(dest, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown %s: %m", dest);
+
+        /* NOTE(review): per mount(2), flags other than MS_REC are ignored on the initial MS_BIND call, so
+         * MS_RDONLY presumably only takes effect with a follow-up MS_BIND|MS_REMOUNT — confirm this is
+         * what is intended here. */
+        return mount_nofollow_verbose(
+                        LOG_ERR,
+                        src,
+                        dest,
+                        /* fstype= */ NULL,
+                        MS_BIND|MS_RDONLY,
+                        /* options= */ NULL);
+}
+
 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
         int r;
 
@@ -4371,6 +4426,14 @@ static int outer_child(
         if (r < 0)
                 return r;
 
+        r = setup_varlink_socket(directory, "io.systemd.NamespaceResource");
+        if (r < 0)
+                return r;
+
+        r = setup_varlink_socket(directory, "io.systemd.MountFileSystem");
+        if (r < 0)
+                return r;
+
         /* The same stuff as the $container env var, but nicely readable for the entire payload */
         free(p);
         p = path_join(directory, "/run/host/container-manager");
@@ -4885,6 +4948,7 @@ static int merge_settings(Settings *settings, const char *path) {
                         arg_uid_shift = settings->uid_shift;
                         arg_uid_range = settings->uid_range;
                         arg_userns_ownership = settings->userns_ownership;
+                        arg_delegate_container_ranges = settings->delegate_container_ranges;
                 }
         }
 
@@ -6158,10 +6222,11 @@ static int run(int argc, char *argv[]) {
                         goto finish;
                 }
 
-                userns_fd = nsresource_allocate_userns(
+                userns_fd = nsresource_allocate_userns_full(
                                 nsresource_link,
                                 userns_name,
-                                NSRESOURCE_UIDS_64K); /* allocate 64K UIDs */
+                                NSRESOURCE_UIDS_64K,
+                                arg_delegate_container_ranges);
                 if (userns_fd < 0) {
                         r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
                         goto finish;
index 615f99eff10879ca7e1b84691a18fd0c4298ec1f..6f70bcaf1f3cbddfa9e8e276624245c9ba1f910a 100644 (file)
@@ -75,7 +75,7 @@ int nsresource_connect(sd_varlink **ret) {
         return 0;
 }
 
-int nsresource_allocate_userns(sd_varlink *vl, const char *name, uint64_t size) {
+int nsresource_allocate_userns_full(sd_varlink *vl, const char *name, uint64_t size, uint64_t delegate_container_ranges) {
         _cleanup_close_ int userns_fd = -EBADF;
         _cleanup_free_ char *_name = NULL;
         const char *error_id;
@@ -120,7 +120,8 @@ int nsresource_allocate_userns(sd_varlink *vl, const char *name, uint64_t size)
                         SD_JSON_BUILD_PAIR_STRING("name", name),
                         SD_JSON_BUILD_PAIR_BOOLEAN("mangleName", true),
                         SD_JSON_BUILD_PAIR_UNSIGNED("size", size),
-                        SD_JSON_BUILD_PAIR_UNSIGNED("userNamespaceFileDescriptor", userns_fd_idx));
+                        SD_JSON_BUILD_PAIR_UNSIGNED("userNamespaceFileDescriptor", userns_fd_idx),
+                        JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("delegateContainerRanges", delegate_container_ranges));
         if (r < 0)
                 return log_debug_errno(r, "Failed to call AllocateUserRange() varlink call: %m");
         if (streq_ptr(error_id, "io.systemd.NamespaceResource.UserNamespaceInterfaceNotSupported"))
index 93957a10c8237fc45c4a3474771df59e17cf170f..5633fd9bf35bc3494b1de02b822e047e0d709193 100644 (file)
@@ -17,7 +17,10 @@ int nsresource_connect(sd_varlink **ret);
  * operations under the original identity, until the connection is closed. The 'link' parameter may be passed
  * as NULL in which case a short-lived connection is created, just to execute the requested operation. */
 
-int nsresource_allocate_userns(sd_varlink *vl, const char *name, uint64_t size);
+int nsresource_allocate_userns_full(sd_varlink *vl, const char *name, uint64_t size, uint64_t delegate_container_ranges);
+static inline int nsresource_allocate_userns(sd_varlink *vl, const char *name, uint64_t size) { /* Same as the _full() variant, without delegated ranges */
+        return nsresource_allocate_userns_full(vl, name, size, /* delegate_container_ranges= */ 0);
+}
 int nsresource_register_userns(sd_varlink *vl, const char *name, int userns_fd);
 int nsresource_add_mount(sd_varlink *vl, int userns_fd, int mount_fd);
 int nsresource_add_cgroup(sd_varlink *vl, int userns_fd, int cgroup_fd);
index 40e6787233efc0beae65d6d2cb958f7e1bec7e10..8254650e9308b4e55f98932bf427b43be979cf85 100644 (file)
@@ -21,3 +21,8 @@ Q /var/lib/machines 0700 - - -
 
 R! /var/lib/machines/.#*
 R! /.#machine.*
+
+# If the nsresourced/mountfsd sockets are mounted into /run/host, symlink them to their canonical
+# location in /run/systemd.
+L? /run/systemd/io.systemd.NamespaceResource - - - - /run/host/io.systemd.NamespaceResource
+L? /run/systemd/io.systemd.MountFileSystem   - - - - /run/host/io.systemd.MountFileSystem
index 6436400cde63f19738d6c6191cfa62dcfe3605da..f601cb87f9d69146141c9da2c13b3b0c96a5e07a 100644 (file)
@@ -18,6 +18,7 @@ d$ /run/systemd/sessions 0755 root root -
 d$ /run/systemd/users 0755 root root -
 d /run/systemd/machines 0755 root root -
 d$ /run/systemd/shutdown 0755 root root -
+d /run/systemd/dissect-root 0000 root root -
 
 d /run/log 0755 root root -
 
index 431369a1a181e785300e7076025d6f9bc14a869f..a3e19cc418cb5e9e6ea53adae39e93d18bd0e8a6 100644 (file)
@@ -13,6 +13,7 @@ Documentation=man:systemd-mountfsd.service(8)
 DefaultDependencies=no
 Conflicts=shutdown.target
 Before=sockets.target shutdown.target
+ConditionPathExists=!/run/host/io.systemd.MountFileSystem
 
 [Socket]
 ListenStream=/run/systemd/io.systemd.MountFileSystem
index c159a5676ae2fc82002e4f0a561f1edfb6166fb2..6b4a883df302114c0a1209da569d4af292837256 100644 (file)
@@ -13,6 +13,7 @@ Documentation=man:systemd-nsresourced.service(8)
 DefaultDependencies=no
 Conflicts=shutdown.target
 Before=sockets.target shutdown.target
+ConditionPathExists=!/run/host/io.systemd.NamespaceResource
 
 [Socket]
 ListenStream=/run/systemd/io.systemd.NamespaceResource