<xi:include href="version-info.xml" xpointer="v230"/></listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--private-users-delegate=</option></term>
+
+ <listitem><para>Takes a non-negative integer. Requests that the specified number of additional 64K
+ UID/GID ranges are delegated into the container's user namespace. These delegated ranges are mapped
+ 1:1 (i.e. the same UID/GID values are used inside and outside the user namespace) and can be used by
+ nested containers to allocate their own transient UID/GID ranges via
+ <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>.</para>
+
+ <para>This option requires <option>--private-users=managed</option>, as the delegation is performed
+ by
+ <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+ as part of the user namespace allocation. The maximum number of delegated ranges is 16. Defaults to
+ 0, i.e. no delegation.</para>
+
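+ <para>When this option is used with a non-zero value, the
+ <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+ Varlink socket (<filename>/run/systemd/io.systemd.NamespaceResource</filename>) is automatically
+ bind-mounted into the container at <filename>/run/host/io.systemd.NamespaceResource</filename>, along
+ with the necessary discovery symlinks in <filename>/run/systemd/userdb/</filename> and
+ <filename>/run/varlink/registry/</filename>. This allows processes inside the container to contact
+ <command>systemd-nsresourced</command> on the host in order to allocate nested user namespaces from
+ the delegated ranges. The
+ <citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+ Varlink socket (<filename>/run/systemd/io.systemd.MountFileSystem</filename>) is bind-mounted to
+ <filename>/run/host/io.systemd.MountFileSystem</filename> in the same way.</para>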
+
+ <xi:include href="version-info.xml" xpointer="v260"/></listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>-U</option></term>
<xi:include href="version-info.xml" xpointer="v230"/></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>PrivateUsersDelegate=</varname></term>
+
+ <listitem><para>Takes a non-negative integer. Configures delegation of additional 64K UID/GID ranges
+ into the container's user namespace for use by nested containers. When set to a value greater than
+ zero, the
+ <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+ Varlink socket will be bind-mounted into the container so that processes inside the container can
+ allocate further user namespaces from the delegated ranges. This is equivalent to the
+ <option>--private-users-delegate=</option> command line switch. Requires
+ <varname>PrivateUsers=managed</varname>. The maximum number of delegated ranges is 16. Defaults to
+ 0, i.e. no delegation. See
+ <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
+ for details.</para>
+
+ <xi:include href="version-info.xml" xpointer="v260"/></listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>NotifyReady=</varname></term>
Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory)
Exec.PivotRoot, config_parse_pivot_root, 0, 0
Exec.PrivateUsers, config_parse_private_users, 0, 0
+Exec.PrivateUsersDelegate, config_parse_unsigned, 0, offsetof(Settings, delegate_container_ranges)
Exec.NotifyReady, config_parse_tristate, 0, offsetof(Settings, notify_ready)
Exec.SystemCallFilter, config_parse_syscall_filter, 0, 0
Exec.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof(Settings, rlimit)
char *pivot_root_old;
UserNamespaceMode userns_mode;
uid_t uid_shift, uid_range;
+ unsigned delegate_container_ranges;
int notify_ready;
char **syscall_allow_list;
char **syscall_deny_list;
static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode; /* initialized depending on arg_privileged in run() */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
+static unsigned arg_delegate_container_ranges = 0;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
static SettingsMask arg_settings_mask = 0;
" --private-users-ownership=MODE\n"
" Adjust ('chown') or map ('map') OS tree ownership\n"
" to private UID/GID range\n"
+ " --private-users-delegate=N\n"
+ " Delegate N additional 64K UID/GID ranges for use\n"
+ " by nested containers (requires managed user\n"
+ " namespaces)\n"
" -U Equivalent to --private-users=pick and\n"
" --private-users-ownership=auto\n"
"\n%3$sNetworking:%4$s\n"
ARG_TEMPLATE,
ARG_PROPERTY,
ARG_PRIVATE_USERS,
+ ARG_PRIVATE_USERS_DELEGATE,
ARG_KILL_SIGNAL,
ARG_SETTINGS,
ARG_CHDIR,
{ "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
{ "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
{ "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
+ { "private-users-delegate", required_argument, NULL, ARG_PRIVATE_USERS_DELEGATE },
{ "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
{ "settings", required_argument, NULL, ARG_SETTINGS },
{ "chdir", required_argument, NULL, ARG_CHDIR },
arg_settings_mask |= SETTING_USERNS;
break;
+ case ARG_PRIVATE_USERS_DELEGATE:
+ r = safe_atou(optarg, &arg_delegate_container_ranges);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse --private-users-delegate= parameter: %s", optarg);
+
+ arg_settings_mask |= SETTING_USERNS;
+ break;
+
case ARG_KILL_SIGNAL:
if (streq(optarg, "help"))
return DUMP_STRING_TABLE(signal, int, _NSIG);
if (arg_userns_mode == USER_NAMESPACE_MANAGED && !arg_private_network)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Managed user namespace operation requires private networking, as otherwise /sys/ may not be mounted.");
+ if (arg_delegate_container_ranges > 0 && arg_userns_mode != USER_NAMESPACE_MANAGED)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--private-users-delegate= requires --private-users=managed.");
+
if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
!(arg_clone_ns_flags & CLONE_NEWUTS)) {
arg_register = false;
return 0;
}
+/* Makes the Varlink socket /run/systemd/<name> available at /run/host/<name> below the container root
+ * 'directory' by bind mounting it there. Does nothing unless UID/GID range delegation was requested
+ * (i.e. arg_delegate_container_ranges is non-zero).
+ *
+ * Returns 0 if there was nothing to do, a negative error if a preparation step fails, and otherwise the
+ * result of mount_nofollow_verbose(). */
+static int setup_varlink_socket(const char *directory, const char *name) {
+        int r;
+
+        assert(directory);
+        assert(name); /* Both the mount source and the mount target are derived from the name */
+
+        /* The socket is only needed when ranges are delegated into the container */
+        if (arg_delegate_container_ranges == 0)
+                return 0;
+
+        r = make_run_host(directory);
+        if (r < 0)
+                return r;
+
+        _cleanup_free_ char *src = path_join("/run/systemd", name);
+        if (!src)
+                return log_oom();
+
+        _cleanup_free_ char *dest = path_join(directory, "/run/host", name);
+        if (!dest)
+                return log_oom();
+
+        /* A bind mount needs an existing inode to be mounted on, hence create the target first */
+        r = touch(dest);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s: %m", dest);
+
+        /* Hand the mount point to UID/GID 0 (presumably as seen from inside the container's user
+         * namespace — verify against userns_lchown()) */
+        r = userns_lchown(dest, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown %s: %m", dest);
+
+        /* NOTE(review): according to mount(2) all flags other than MS_REC are ignored when a bind mount
+         * is created, so MS_RDONLY most likely only takes effect with a separate
+         * MS_BIND|MS_REMOUNT|MS_RDONLY call. Confirm whether a read-only mount is actually intended
+         * here; the flags are left untouched. */
+        return mount_nofollow_verbose(
+                        LOG_ERR,
+                        src,
+                        dest,
+                        /* fstype= */ NULL,
+                        MS_BIND|MS_RDONLY,
+                        /* options= */ NULL);
+}
+
static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
int r;
if (r < 0)
return r;
+ r = setup_varlink_socket(directory, "io.systemd.NamespaceResource");
+ if (r < 0)
+ return r;
+
+ r = setup_varlink_socket(directory, "io.systemd.MountFileSystem");
+ if (r < 0)
+ return r;
+
/* The same stuff as the $container env var, but nicely readable for the entire payload */
free(p);
p = path_join(directory, "/run/host/container-manager");
arg_uid_shift = settings->uid_shift;
arg_uid_range = settings->uid_range;
arg_userns_ownership = settings->userns_ownership;
+ arg_delegate_container_ranges = settings->delegate_container_ranges;
}
}
goto finish;
}
- userns_fd = nsresource_allocate_userns(
+ userns_fd = nsresource_allocate_userns_full(
nsresource_link,
userns_name,
- NSRESOURCE_UIDS_64K); /* allocate 64K UIDs */
+ NSRESOURCE_UIDS_64K,
+ arg_delegate_container_ranges);
if (userns_fd < 0) {
r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
goto finish;
return 0;
}
-int nsresource_allocate_userns(sd_varlink *vl, const char *name, uint64_t size) {
+int nsresource_allocate_userns_full(sd_varlink *vl, const char *name, uint64_t size, uint64_t delegate_container_ranges) {
_cleanup_close_ int userns_fd = -EBADF;
_cleanup_free_ char *_name = NULL;
const char *error_id;
SD_JSON_BUILD_PAIR_STRING("name", name),
SD_JSON_BUILD_PAIR_BOOLEAN("mangleName", true),
SD_JSON_BUILD_PAIR_UNSIGNED("size", size),
- SD_JSON_BUILD_PAIR_UNSIGNED("userNamespaceFileDescriptor", userns_fd_idx));
+ SD_JSON_BUILD_PAIR_UNSIGNED("userNamespaceFileDescriptor", userns_fd_idx),
+ JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("delegateContainerRanges", delegate_container_ranges));
if (r < 0)
return log_debug_errno(r, "Failed to call AllocateUserRange() varlink call: %m");
if (streq_ptr(error_id, "io.systemd.NamespaceResource.UserNamespaceInterfaceNotSupported"))
* operations under the original identity, until the connection is closed. The 'link' parameter may be passed
* as NULL in which case a short-lived connection is created, just to execute the requested operation. */
-int nsresource_allocate_userns(sd_varlink *vl, const char *name, uint64_t size);
+int nsresource_allocate_userns_full(sd_varlink *vl, const char *name, uint64_t size, uint64_t delegate_container_ranges);
+/* Shorthand for nsresource_allocate_userns_full() that requests no delegated container ranges. */
+static inline int nsresource_allocate_userns(sd_varlink *vl, const char *name, uint64_t size) {
+        const uint64_t no_delegated_ranges = 0;
+
+        return nsresource_allocate_userns_full(vl, name, size, no_delegated_ranges);
+}
int nsresource_register_userns(sd_varlink *vl, const char *name, int userns_fd);
int nsresource_add_mount(sd_varlink *vl, int userns_fd, int mount_fd);
int nsresource_add_cgroup(sd_varlink *vl, int userns_fd, int cgroup_fd);
R! /var/lib/machines/.#*
R! /.#machine.*
+
+# If the nsresourced/mountfsd sockets are mounted into /run/host, symlink them to their canonical
+# location in /run/systemd.
+L? /run/systemd/io.systemd.NamespaceResource - - - - /run/host/io.systemd.NamespaceResource
+L? /run/systemd/io.systemd.MountFileSystem - - - - /run/host/io.systemd.MountFileSystem
d$ /run/systemd/users 0755 root root -
d /run/systemd/machines 0755 root root -
d$ /run/systemd/shutdown 0755 root root -
+d /run/systemd/dissect-root 0000 root root -
d /run/log 0755 root root -
DefaultDependencies=no
Conflicts=shutdown.target
Before=sockets.target shutdown.target
+ConditionPathExists=!/run/host/io.systemd.MountFileSystem
[Socket]
ListenStream=/run/systemd/io.systemd.MountFileSystem
DefaultDependencies=no
Conflicts=shutdown.target
Before=sockets.target shutdown.target
+ConditionPathExists=!/run/host/io.systemd.NamespaceResource
[Socket]
ListenStream=/run/systemd/io.systemd.NamespaceResource