@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly t DelegateNamespaces = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
<!--property RestrictNamespaces is not documented!-->
+ <!--property DelegateNamespaces is not documented!-->
+
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateNamespaces"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly t DelegateNamespaces = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
<!--property RestrictNamespaces is not documented!-->
+ <!--property DelegateNamespaces is not documented!-->
+
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateNamespaces"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly t DelegateNamespaces = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
<!--property RestrictNamespaces is not documented!-->
+ <!--property DelegateNamespaces is not documented!-->
+
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateNamespaces"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly t DelegateNamespaces = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
<!--property RestrictNamespaces is not documented!-->
+ <!--property DelegateNamespaces is not documented!-->
+
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateNamespaces"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
<varname>ProtectControlGroupsEx</varname>,
<varname>PrivateUsersEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
- <para><varname>ProtectHostnameEx</varname> and <function>RemoveSubGroup()</function> were added in version 258.</para>
+ <para><varname>ProtectHostnameEx</varname>,
+ <varname>DelegateNamespaces</varname>, and
+ <function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
- <para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
+ <para><varname>ProtectHostnameEx</varname>,
+ <varname>DelegateNamespaces</varname>, and
+ <function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Mount Unit Objects</title>
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>,
+ <varname>DelegateNamespaces</varname>,
<function>RemoveSubgroup()</function>,
<varname>ReloadResult</varname>, and
<varname>CleanResult</varname> were added in version 258.</para>
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
- <para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
+ <para><varname>ProtectHostnameEx</varname>,
+ <varname>DelegateNamespaces</varname>, and
+ <function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Slice Unit Objects</title>
<xi:include href="version-info.xml" xpointer="v233"/></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>DelegateNamespaces=</varname></term>
+
+ <listitem><para>Delegates ownership of the given namespace types to the user namespace of the
+ processes of this unit. For details about Linux namespaces, see <citerefentry
+ project='man-pages'><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>.
+ Either takes a boolean argument, or a space-separated list of namespace type identifiers. If false
+ (the default), the unit's processes' user namespace will not have ownership over any namespaces
+ created during setup of the unit's sandboxed environment. If true, ownership of all namespace types
+ (except for user namespaces, where the concept doesn't apply) created during setup of the unit's
+ sandboxed environment is delegated to the unit's processes' user namespace. Otherwise, a
+ space-separated list of namespace type identifiers must be specified, consisting of any combination
+ of: <constant>cgroup</constant>, <constant>ipc</constant>, <constant>net</constant>,
+ <constant>mnt</constant>, <constant>pid</constant>, and <constant>uts</constant>. All namespaces of
+ the listed types will be owned by the unit's processes' user namespace if they are created during
+ setup of the unit's sandboxed environment (allow-listing). By prepending the list with a single tilde
+ character (<literal>~</literal>) the effect may be inverted: all namespaces of types not listed and
+ created during setup of the unit's sandboxed environment will be owned by the unit's processes' user
+ namespace (deny-listing). If the empty string is assigned, the default namespace ownership is
+ applied, which is equivalent to false. This option may appear more than once, in which case the
+ namespace types are merged by <constant>OR</constant>, or by <constant>AND</constant> if the lines
+ are prefixed with <literal>~</literal> (see examples below). Internally, this setting controls the
+ order in which namespaces are unshared by systemd: namespace types that should be owned by the unit's
+ processes' user namespace (the delegated namespaces) are unshared after the user namespace is
+ unshared, while all other namespaces are unshared before the user namespace is unshared.</para>
+
+ <para>Delegating any namespace with <varname>DelegateNamespaces=</varname> implies
+ <varname>PrivateUsers=self</varname> unless <varname>PrivateUsers=</varname> is explicitly enabled
+ already by the unit. Delegating a namespace does not imply that the namespace is unshared; that is
+ done with the namespace-specific unit settings such as <varname>PrivateNetwork=</varname> or
+ <varname>PrivateMounts=</varname>.</para>
+
+ <xi:include href="version-info.xml" xpointer="v258"/></listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>LockPersonality=</varname></term>
SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DelegateNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, delegate_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictFileSystems", "(bas)", property_get_restrict_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
if (streq(name, "RestrictNamespaces"))
return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error);
+ if (streq(name, "DelegateNamespaces"))
+ return bus_set_transient_namespace_flag(u, name, &c->delegate_namespaces, message, flags, error);
+
if (streq(name, "RestrictFileSystems")) {
int allow_list;
_cleanup_strv_free_ char **l = NULL;
!strv_isempty(context->read_only_paths) ||
!strv_isempty(context->inaccessible_paths) ||
!strv_isempty(context->exec_paths) ||
- !strv_isempty(context->no_exec_paths);
+ !strv_isempty(context->no_exec_paths) ||
+ context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL;
+}
+
+static PrivateUsers exec_context_get_effective_private_users(
+                const ExecContext *context,
+                const ExecParameters *params) {
+
+        /* Returns the PrivateUsers= mode that shall actually be applied for this unit: the explicitly
+         * configured mode if there is one, PRIVATE_USERS_SELF if a user namespace is needed implicitly,
+         * and PRIVATE_USERS_NO otherwise. */
+
+        assert(context);
+        assert(params);
+
+        /* An explicitly configured mode always takes precedence. */
+        if (context->private_users != PRIVATE_USERS_NO)
+                return context->private_users;
+
+        /* A user namespace is implied either because unprivileged private users are needed, or because
+         * at least one namespace is delegated with DelegateNamespaces=. */
+        if (exec_context_need_unprivileged_private_users(context, params) ||
+            context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL)
+                return PRIVATE_USERS_SELF;
+
+        return PRIVATE_USERS_NO;
+}
+
+static bool exec_namespace_is_delegated(
+                const ExecContext *context,
+                const ExecParameters *params,
+                unsigned long namespace) {
+
+        /* Tells whether the namespace type given as CLONE_NEWxyz flag shall be unshared in the second
+         * (delegating) call to setup_delegated_namespaces() rather than in the first one. Must not be
+         * called for CLONE_NEWUSER. */
+
+        assert(context);
+        assert(params);
+        assert(namespace != CLONE_NEWUSER);
+
+        /* When unprivileged private users are needed, the user namespace has already been unshared by
+         * the time setup_delegated_namespaces() is called for the first time. Report "not delegated"
+         * for everything in that case, so that all other namespaces are unshared in that first call. */
+        if (exec_context_need_unprivileged_private_users(context, params))
+                return false;
+
+        /* Nothing is delegated while the setting still has its initial value; otherwise the namespace
+         * type is delegated iff its flag is set. */
+        return context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL &&
+               FLAGS_SET(context->delegate_namespaces, namespace);
}
static int setup_delegated_namespaces(
const ExecContext *context,
ExecParameters *params,
ExecRuntime *runtime,
+ bool delegate,
const char *memory_pressure_path,
uid_t uid,
uid_t gid,
int r;
+ /* This function is called twice, once before unsharing the user namespace, and once after unsharing
+ * the user namespace. When called before unsharing the user namespace, "delegate" is set to "false".
+ * When called after unsharing the user namespace, "delegate" is set to "true". The net effect is
+ * that all namespaces that should not be delegated are unshared when this function is called the
+ * first time and all namespaces that should be delegated are unshared when this function is called
+ * the second time. */
+
assert(context);
assert(params);
assert(reterr_exit_status);
if (exec_needs_network_namespace(context) &&
+ exec_namespace_is_delegated(context, params, CLONE_NEWNET) == delegate &&
runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
/* Try to enable network namespacing if network namespacing is available and we have
- * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
- * new network namespace. And if we don't have that, then we could only create a network
+ * CAP_NET_ADMIN in the current user namespace (either the system manager one or the unit's
+ * own user namespace). We need CAP_NET_ADMIN to be able to configure the loopback device in
+ * the new network namespace. And if we don't have that, then we could only create a network
* namespace without the ability to set up "lo". Hence gracefully skip things then. */
if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
else if (r < 0) {
*reterr_exit_status = EXIT_NETWORK;
return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
- }
+ } else
+ log_exec_debug(context, params, "Set up %snetwork namespace", delegate ? "delegated " : "");
} else if (context->network_namespace_path) {
*reterr_exit_status = EXIT_NETWORK;
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
}
- if (exec_needs_ipc_namespace(context) && runtime &&
- runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
+ if (exec_needs_ipc_namespace(context) &&
+ exec_namespace_is_delegated(context, params, CLONE_NEWIPC) == delegate &&
+ runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
if (ns_type_supported(NAMESPACE_IPC)) {
r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
else if (r < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
- }
+ } else
+ log_exec_debug(context, params, "Set up %sIPC namespace", delegate ? "delegated " : "");
} else if (context->ipc_namespace_path) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
}
- if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+ if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
+ exec_namespace_is_delegated(context, params, CLONE_NEWCGROUP) == delegate) {
if (unshare(CLONE_NEWCGROUP) < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m");
}
+
+ log_exec_debug(context, params, "Set up %scgroup namespace", delegate ? "delegated " : "");
}
/* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
* Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
- if (needs_sandboxing && exec_needs_pid_namespace(context)) {
+ if (needs_sandboxing && exec_needs_pid_namespace(context) &&
+ exec_namespace_is_delegated(context, params, CLONE_NEWPID) == delegate) {
if (params->pidref_transport_fd < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, r, "Failed to set up pid namespace: %m");
}
+
+ log_exec_debug(context, params, "Set up %spid namespace", delegate ? "delegated " : "");
}
/* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */
- if (exec_needs_mount_namespace(context, params, runtime)) {
+ if (exec_needs_mount_namespace(context, params, runtime) &&
+ exec_namespace_is_delegated(context, params, CLONE_NEWNS) == delegate) {
_cleanup_free_ char *error_path = NULL;
r = apply_mount_namespace(command->flags,
return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
error_path ? ": " : "", strempty(error_path));
}
+
+ log_exec_debug(context, params, "Set up %smount namespace", delegate ? "delegated " : "");
}
- if (needs_sandboxing) {
+ if (needs_sandboxing && exec_namespace_is_delegated(context, params, CLONE_NEWUTS) == delegate) {
r = apply_protect_hostname(context, params, reterr_exit_status);
if (r < 0)
return r;
+
+ log_exec_debug(context, params, "Set up %sUTS namespace", delegate ? "delegated " : "");
}
return 0;
char **final_argv = NULL;
dev_t journal_stream_dev = 0;
ino_t journal_stream_ino = 0;
- bool userns_set_up = false;
bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
needs_mount_namespace; /* Do we need to set up a mount namespace for this kernel? */
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
- PrivateUsers pu = context->private_users;
- if (pu == PRIVATE_USERS_NO)
- pu = PRIVATE_USERS_SELF;
+ PrivateUsers pu = exec_context_get_effective_private_users(context, params);
/* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
* unprivileged user namespaces. */
log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
else {
assert(r > 0);
- userns_set_up = true;
+ log_debug("Set up unprivileged user namespace");
}
}
+ /* Call setup_delegated_namespaces() the first time to unshare all non-delegated namespaces. */
r = setup_delegated_namespaces(
context,
params,
runtime,
+ /* delegate= */ false,
memory_pressure_path,
uid,
gid,
* case of mount namespaces being less privileged when the mount point list is copied from a
* different user namespace). */
- if (needs_sandboxing && !userns_set_up) {
- r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid,
- /* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL);
+ if (needs_sandboxing && !exec_context_need_unprivileged_private_users(context, params)) {
+ PrivateUsers pu = exec_context_get_effective_private_users(context, params);
+
+ r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
+ /* allow_setgroups= */ pu == PRIVATE_USERS_FULL);
if (r < 0) {
*exit_status = EXIT_USER;
return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
}
+
+ log_debug("Set up privileged user namespace");
}
+ /* Call setup_delegated_namespaces() the second time to unshare all delegated namespaces. */
+ r = setup_delegated_namespaces(
+ context,
+ params,
+ runtime,
+ /* delegate= */ true,
+ memory_pressure_path,
+ uid,
+ gid,
+ command,
+ needs_sandboxing,
+ has_cap_sys_admin,
+ exit_status);
+ if (r < 0)
+ return r;
+
/* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
* shall execute. */
return r;
}
+ if (c->delegate_namespaces != NAMESPACE_FLAGS_INITIAL) {
+ r = serialize_item_format(f, "exec-context-delegate-namespaces", "%lu", c->delegate_namespaces);
+ if (r < 0)
+ return r;
+ }
+
#if HAVE_LIBBPF
if (exec_context_restrict_filesystems_set(c)) {
char *fs;
r = safe_atolu(val, &c->restrict_namespaces);
if (r < 0)
return r;
+ } else if ((val = startswith(l, "exec-context-delegate-namespaces="))) {
+ r = safe_atolu(val, &c->delegate_namespaces);
+ if (r < 0)
+ return r;
} else if ((val = startswith(l, "exec-context-restrict-filesystems="))) {
r = set_ensure_allocated(&c->restrict_filesystems, &string_hash_ops);
if (r < 0)
.timeout_clean_usec = USEC_INFINITY,
.capability_bounding_set = CAP_MASK_UNSET,
.restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
+ .delegate_namespaces = NAMESPACE_FLAGS_INITIAL,
.log_level_max = -1,
#if HAVE_SECCOMP
.syscall_errno = SECCOMP_ERROR_NUMBER_KILL,
unsigned long personality;
unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */
+ unsigned long delegate_namespaces; /* The CLONE_NEWxyz flags delegated to the unit's processes */
Set *restrict_filesystems;
bool restrict_filesystems_allow_list:1;
{{type}}.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof({{type}}, exec_context)
{{type}}.SystemCallLog, config_parse_syscall_log, 0, offsetof({{type}}, exec_context)
{{type}}.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof({{type}}, exec_context.memory_deny_write_execute)
-{{type}}.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof({{type}}, exec_context)
+{{type}}.RestrictNamespaces, config_parse_namespace_flags, 0, offsetof({{type}}, exec_context.restrict_namespaces)
+{{type}}.DelegateNamespaces, config_parse_namespace_flags, 0, offsetof({{type}}, exec_context.delegate_namespaces)
{{type}}.RestrictRealtime, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_realtime)
{{type}}.RestrictSUIDSGID, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_suid_sgid)
{{type}}.RestrictAddressFamilies, config_parse_address_families, 0, offsetof({{type}}, exec_context)
}
}
-int config_parse_restrict_namespaces(
+int config_parse_namespace_flags(
const char *unit,
const char *filename,
unsigned line,
void *data,
void *userdata) {
- ExecContext *c = data;
- unsigned long flags;
+ unsigned long *flags = data;
+ unsigned long all = UPDATE_FLAG(NAMESPACE_FLAGS_ALL, CLONE_NEWUSER, !streq(lvalue, "DelegateNamespaces"));
+ unsigned long f;
bool invert = false;
int r;
if (isempty(rvalue)) {
/* Reset to the default. */
- c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
+ *flags = NAMESPACE_FLAGS_INITIAL;
return 0;
}
/* Boolean parameter ignores the previous settings */
r = parse_boolean(rvalue);
if (r > 0) {
- c->restrict_namespaces = 0;
+ /* RestrictNamespaces= is stored with reversed semantics (the field holds the namespace types
+ * that remain permitted), hence "true" means no flags there. DelegateNamespaces= stores the
+ * delegated namespace types directly, hence "true" means all delegatable types ("all" already
+ * excludes CLONE_NEWUSER for DelegateNamespaces=). */
+ *flags = streq(lvalue, "RestrictNamespaces") ? 0 : all;
return 0;
} else if (r == 0) {
- c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
+ /* "false": permit everything for RestrictNamespaces=, delegate nothing for DelegateNamespaces=. */
+ *flags = streq(lvalue, "RestrictNamespaces") ? all : 0;
return 0;
}
}
/* Not a boolean argument, in this case it's a list of namespace types. */
- r = namespace_flags_from_string(rvalue, &flags);
+ r = namespace_flags_from_string(rvalue, &f);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue);
return 0;
}
- if (c->restrict_namespaces == NAMESPACE_FLAGS_INITIAL)
+ if (*flags == NAMESPACE_FLAGS_INITIAL)
/* Initial assignment. Just set the value. */
- c->restrict_namespaces = invert ? (~flags) & NAMESPACE_FLAGS_ALL : flags;
+ f = invert ? (~f) & all : f;
else
/* Merge the value with the previous one. */
- SET_FLAG(c->restrict_namespaces, flags, !invert);
+ f = UPDATE_FLAG(*flags, f, !invert);
+
+ if (FLAGS_SET(f, CLONE_NEWUSER) && streq(lvalue, "DelegateNamespaces")) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "The user namespace cannot be delegated with DelegateNamespaces=, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ *flags = f;
return 0;
}
{ config_parse_syscall_errno, "ERRNO" },
{ config_parse_syscall_log, "SYSCALLS" },
{ config_parse_address_families, "FAMILIES" },
- { config_parse_restrict_namespaces, "NAMESPACES" },
+ { config_parse_namespace_flags, "NAMESPACES" },
#endif
{ config_parse_restrict_filesystems, "FILESYSTEMS" },
{ config_parse_cpu_shares, "SHARES" },
CONFIG_PARSER_PROTOTYPE(config_parse_fdname);
CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat);
CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
-CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
+CONFIG_PARSER_PROTOTYPE(config_parse_namespace_flags);
CONFIG_PARSER_PROTOTYPE(config_parse_restrict_filesystems);
CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
return 1;
}
- if (streq(field, "RestrictNamespaces")) {
+ if (STR_IN_SET(field, "RestrictNamespaces",
+ "DelegateNamespaces")) {
bool invert = false;
unsigned long flags;
--- /dev/null
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2016
+set -eux
+set -o pipefail
+
+# shellcheck source=test/units/test-control.sh
+. "$(dirname "$0")"/test-control.sh
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
+testcase_mount() {
+ (! systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home)
+ systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
+}
+
+testcase_network() {
+    # Without delegation, creating a veth pair inside the private network namespace must fail.
+    (! systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes --wait --pipe -- ip link add veth1 type veth peer name veth2)
+    # With the network namespace delegated, the same operation must succeed. This has to use
+    # PrivateNetwork=yes together with DelegateNamespaces=net, otherwise no delegated network namespace
+    # is exercised at all.
+    systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
+}
+
+testcase_cgroup() {
+ (! systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure')
+ systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private -p DelegateNamespaces=cgroup --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure'
+}
+
+testcase_pid() {
+ (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
+ systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
+}
+
+testcase_uts() {
+ (! systemd-run -p PrivateUsersEx=self -p ProtectHostnameEx=private --wait --pipe -- hostname abc)
+ systemd-run -p PrivateUsersEx=self -p ProtectHostnameEx=private -p DelegateNamespaces=uts --wait --pipe -- hostname abc
+}
+
+testcase_implied_private_users_self() {
+ # If PrivateUsers= is not set explicitly, DelegateNamespaces= implies PrivateUsers=self.
+ systemd-run -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
+ # If PrivateUsers= is set explicitly, it is not overridden: the bind mount still works, and
+ # /proc/self/uid_map must match the content expected for the requested mode.
+ systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
+ systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"'
+}
+
+testcase_multiple_features() {
+ unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-delegate-namespaces-root /usr/share/minimal_0.raw
+
+ systemd-run \
+ -p PrivatePIDs=yes \
+ -p RootDirectory=/tmp/TEST-07-PID1-delegate-namespaces-root \
+ -p ProcSubset=pid \
+ -p BindReadOnlyPaths=/usr/share \
+ -p NoNewPrivileges=yes \
+ -p ProtectSystem=strict \
+ -p User=testuser\
+ -p Group=testuser \
+ -p RuntimeDirectory=abc \
+ -p StateDirectory=qed \
+ -p InaccessiblePaths=/usr/include \
+ -p TemporaryFileSystem=/home \
+ -p PrivateTmp=yes \
+ -p PrivateDevices=yes \
+ -p PrivateNetwork=yes \
+ -p PrivateUsersEx=self \
+ -p PrivateIPC=yes \
+ -p ProtectHostname=yes \
+ -p ProtectClock=yes \
+ -p ProtectKernelTunables=yes \
+ -p ProtectKernelModules=yes \
+ -p ProtectKernelLogs=yes \
+ -p ProtectControlGroupsEx=private \
+ -p LockPersonality=yes \
+ -p Environment=ABC=QED \
+ -p DelegateNamespaces=yes \
+ --wait \
+ --pipe \
+ grep MARKER=1 /etc/os-release
+
+ rm -rf /tmp/TEST-07-PID1-delegate-namespaces-root
+}