From: Daan De Meyer Date: Tue, 4 Feb 2025 14:48:36 +0000 (+0100) Subject: core: Add DelegateNamespaces= X-Git-Tag: v258-rc1~1212^2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F36532%2Fhead;p=thirdparty%2Fsystemd.git core: Add DelegateNamespaces= This delegates one or more namespaces to the service. Concretely, this setting influences in which order we unshare namespaces. Delegated namespaces are unshared *after* the user namespace is unshared. Other namespaces are unshared *before* the user namespace is unshared. Fixes #35369 --- diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 5e1c3e2c08e..b9965543885 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -3358,6 +3358,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t RestrictNamespaces = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly t DelegateNamespaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly (bas) RestrictFileSystems = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(ssbt) BindPaths = [...]; @@ -3963,6 +3965,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4685,6 +4689,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -5559,6 +5565,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t RestrictNamespaces = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly t DelegateNamespaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly (bas) RestrictFileSystems = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(ssbt) BindPaths = [...]; @@ -6176,6 +6184,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ 
-6870,6 +6880,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -7576,6 +7588,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t RestrictNamespaces = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly t DelegateNamespaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly (bas) RestrictFileSystems = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(ssbt) BindPaths = [...]; @@ -8123,6 +8137,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8733,6 +8749,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -9566,6 +9584,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t RestrictNamespaces = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly t DelegateNamespaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly (bas) RestrictFileSystems = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(ssbt) BindPaths = [...]; @@ -10095,6 +10115,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -10687,6 +10709,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -12385,7 +12409,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ProtectControlGroupsEx, PrivateUsersEx, and PrivatePIDs were added in version 257. - ProtectHostnameEx and RemoveSubGroup() were added in version 258. + ProtectHostnameEx, + DelegateNamespaces, and + RemoveSubGroup() were added in version 258. Socket Unit Objects @@ -12429,7 +12455,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMMemoryPressureDurationUSec, ProtectControlGroupsEx, and PrivatePIDs were added in version 257. - ProtectHostnameEx and RemoveSubgroup() were added in version 258. 
+ ProtectHostnameEx, + DelegateNamespaces, and + RemoveSubgroup() were added in version 258. Mount Unit Objects @@ -12471,6 +12499,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ProtectControlGroupsEx, and PrivatePIDs were added in version 257. ProtectHostnameEx, + DelegateNamespaces, RemoveSubgroup(), ReloadResult, and CleanResult were added in version 258. @@ -12514,7 +12543,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMMemoryPressureDurationUSec, ProtectControlGroupsEx, and PrivatePIDs were added in version 257. - ProtectHostnameEx and RemoveSubgroup() were added in version 258. + ProtectHostnameEx, + DelegateNamespaces, and + RemoveSubgroup() were added in version 258. Slice Unit Objects diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 29445946409..fbd25a1a180 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2375,6 +2375,43 @@ RestrictNamespaces=~cgroup net + + DelegateNamespaces= + + Delegates ownership of the given namespace types to the user namespace of the + processes of this unit. For details about Linux namespaces, see namespaces7. + Either takes a boolean argument, or a space-separated list of namespace type identifiers. If false + (the default), the unit's processes' user namespace will not have ownership over any namespaces + created during setup of the unit's sandboxed environment. If true, ownership of all namespace types + (except for user namespaces, where the concept doesn't apply) created during setup of the unit's + sandboxed environment is delegated to the unit's processes' user namespace. Otherwise, a + space-separated list of namespace type identifiers must be specified, consisting of any combination + of: cgroup, ipc, net, + mnt, pid, and uts. All namespaces of + the listed types will be owned by the unit's processes' user namespace if they are created during + setup of the unit's sandboxed environment (allow-listing). 
By prepending the list with a single tilde + character (~) the effect may be inverted: all namespaces of types not listed and + created during setup of the unit's sandboxed environment will be owned by the unit's processes' user + namespace (deny-listing). If the empty string is assigned, the default namespace ownership is + applied, which is equivalent to false. This option may appear more than once, in which case the + namespace types are merged by OR, or by AND if the lines + are prefixed with ~ (see examples below). Internally, this setting controls the + order in which namespaces are unshared by systemd. Delegated namespaces, i.e. those that should be owned by the unit's + processes' user namespace, will be unshared after the user namespace is unshared. Other namespaces + will be unshared before the user namespace is unshared. + + Delegating any namespace with DelegateNamespaces= implies + PrivateUsers=self unless PrivateUsers= is explicitly enabled + already by the unit. Delegating a namespace does not imply that the namespace is unshared; that is + done with the namespace-specific unit settings such as PrivateNetwork= or + PrivateMounts=.
+ + + + LockPersonality= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index ecdb6e89c38..8039565f263 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -1263,6 +1263,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DelegateNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, delegate_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RestrictFileSystems", "(bas)", property_get_restrict_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), @@ -2194,6 +2195,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "RestrictNamespaces")) return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error); + if (streq(name, "DelegateNamespaces")) + return bus_set_transient_namespace_flag(u, name, &c->delegate_namespaces, message, flags, error); + if (streq(name, "RestrictFileSystems")) { int allow_list; _cleanup_strv_free_ char **l = NULL; diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index df910117e35..8a3cac6a064 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -4209,13 +4209,56 @@ static bool exec_context_need_unprivileged_private_users( !strv_isempty(context->read_only_paths) || !strv_isempty(context->inaccessible_paths) || !strv_isempty(context->exec_paths) || - !strv_isempty(context->no_exec_paths); + 
!strv_isempty(context->no_exec_paths) || + context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL; +} + +static PrivateUsers exec_context_get_effective_private_users( + const ExecContext *context, + const ExecParameters *params) { + + assert(context); + assert(params); + + if (context->private_users != PRIVATE_USERS_NO) + return context->private_users; + + if (exec_context_need_unprivileged_private_users(context, params)) + return PRIVATE_USERS_SELF; + + /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */ + if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL) + return PRIVATE_USERS_SELF; + + return PRIVATE_USERS_NO; +} + +static bool exec_namespace_is_delegated( + const ExecContext *context, + const ExecParameters *params, + unsigned long namespace) { + + assert(context); + assert(params); + assert(namespace != CLONE_NEWUSER); + + /* If we need unprivileged private users, we've already unshared a user namespace by the time we call + * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace + * unsharing in the first call to setup_delegated_namespaces() by returning false here. */ + if (exec_context_need_unprivileged_private_users(context, params)) + return false; + + if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL) + return false; + + return FLAGS_SET(context->delegate_namespaces, namespace); } static int setup_delegated_namespaces( const ExecContext *context, ExecParameters *params, ExecRuntime *runtime, + bool delegate, const char *memory_pressure_path, uid_t uid, uid_t gid, @@ -4226,16 +4269,25 @@ static int setup_delegated_namespaces( int r; + /* This function is called twice, once before unsharing the user namespace, and once after unsharing + * the user namespace. When called before unsharing the user namespace, "delegate" is set to "false". + * When called after unsharing the user namespace, "delegate" is set to "true". 
The net effect is + * that all namespaces that should not be delegated are unshared when this function is called the + * first time and all namespaces that should be delegated are unshared when this function is called + * the second time. */ + assert(context); assert(params); assert(reterr_exit_status); if (exec_needs_network_namespace(context) && + exec_namespace_is_delegated(context, params, CLONE_NEWNET) == delegate && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) { /* Try to enable network namespacing if network namespacing is available and we have - * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the - * new network namespace. And if we don't have that, then we could only create a network + * CAP_NET_ADMIN in the current user namespace (either the system manager one or the unit's + * own user namespace). We need CAP_NET_ADMIN to be able to configure the loopback device in + * the new network namespace. And if we don't have that, then we could only create a network * namespace without the ability to set up "lo". Hence gracefully skip things then. */ if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) { r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET); @@ -4245,7 +4297,8 @@ static int setup_delegated_namespaces( else if (r < 0) { *reterr_exit_status = EXIT_NETWORK; return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m"); - } + } else + log_exec_debug(context, params, "Set up %snetwork namespace", delegate ? 
"delegated " : ""); } else if (context->network_namespace_path) { *reterr_exit_status = EXIT_NETWORK; return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP), @@ -4254,8 +4307,9 @@ static int setup_delegated_namespaces( log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without."); } - if (exec_needs_ipc_namespace(context) && runtime && - runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) { + if (exec_needs_ipc_namespace(context) && + exec_namespace_is_delegated(context, params, CLONE_NEWIPC) == delegate && + runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) { if (ns_type_supported(NAMESPACE_IPC)) { r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC); @@ -4265,7 +4319,8 @@ static int setup_delegated_namespaces( else if (r < 0) { *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m"); - } + } else + log_exec_debug(context, params, "Set up %sIPC namespace", delegate ? "delegated " : ""); } else if (context->ipc_namespace_path) { *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP), @@ -4274,16 +4329,20 @@ static int setup_delegated_namespaces( log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring."); } - if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) { + if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && + exec_namespace_is_delegated(context, params, CLONE_NEWCGROUP) == delegate) { if (unshare(CLONE_NEWCGROUP) < 0) { *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m"); } + + log_exec_debug(context, params, "Set up %scgroup namespace", delegate ? 
"delegated " : ""); } /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible. * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */ - if (needs_sandboxing && exec_needs_pid_namespace(context)) { + if (needs_sandboxing && exec_needs_pid_namespace(context) && + exec_namespace_is_delegated(context, params, CLONE_NEWPID) == delegate) { if (params->pidref_transport_fd < 0) { *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m"); @@ -4313,11 +4372,14 @@ static int setup_delegated_namespaces( *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, r, "Failed to set up pid namespace: %m"); } + + log_exec_debug(context, params, "Set up %spid namespace", delegate ? "delegated " : ""); } /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */ - if (exec_needs_mount_namespace(context, params, runtime)) { + if (exec_needs_mount_namespace(context, params, runtime) && + exec_namespace_is_delegated(context, params, CLONE_NEWNS) == delegate) { _cleanup_free_ char *error_path = NULL; r = apply_mount_namespace(command->flags, @@ -4334,12 +4396,16 @@ static int setup_delegated_namespaces( return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m", error_path ? ": " : "", strempty(error_path)); } + + log_exec_debug(context, params, "Set up %smount namespace", delegate ? "delegated " : ""); } - if (needs_sandboxing) { + if (needs_sandboxing && exec_namespace_is_delegated(context, params, CLONE_NEWUTS) == delegate) { r = apply_protect_hostname(context, params, reterr_exit_status); if (r < 0) return r; + + log_exec_debug(context, params, "Set up %sUTS namespace", delegate ? 
"delegated " : ""); } return 0; @@ -4531,7 +4597,6 @@ int exec_invoke( char **final_argv = NULL; dev_t journal_stream_dev = 0; ino_t journal_stream_ino = 0; - bool userns_set_up = false; bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */ needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */ needs_mount_namespace; /* Do we need to set up a mount namespace for this kernel? */ @@ -5264,9 +5329,7 @@ int exec_invoke( /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */ - PrivateUsers pu = context->private_users; - if (pu == PRIVATE_USERS_NO) - pu = PRIVATE_USERS_SELF; + PrivateUsers pu = exec_context_get_effective_private_users(context, params); /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in * unprivileged user namespaces. */ @@ -5281,14 +5344,16 @@ int exec_invoke( log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m"); else { assert(r > 0); - userns_set_up = true; + log_debug("Set up unprivileged user namespace"); } } + /* Call setup_delegated_namespaces() the first time to unshare all non-delegated namespaces. */ r = setup_delegated_namespaces( context, params, runtime, + /* delegate= */ false, memory_pressure_path, uid, gid, @@ -5331,15 +5396,35 @@ int exec_invoke( * case of mount namespaces being less privileged when the mount point list is copied from a * different user namespace). 
*/ - if (needs_sandboxing && !userns_set_up) { - r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid, - /* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL); + if (needs_sandboxing && !exec_context_need_unprivileged_private_users(context, params)) { + PrivateUsers pu = exec_context_get_effective_private_users(context, params); + + r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, + /* allow_setgroups= */ pu == PRIVATE_USERS_FULL); if (r < 0) { *exit_status = EXIT_USER; return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m"); } + + log_debug("Set up privileged user namespace"); } + /* Call setup_delegated_namespaces() the second time to unshare all delegated namespaces. */ + r = setup_delegated_namespaces( + context, + params, + runtime, + /* delegate= */ true, + memory_pressure_path, + uid, + gid, + command, + needs_sandboxing, + has_cap_sys_admin, + exit_status); + if (r < 0) + return r; + /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we * shall execute. 
*/ diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index f05c69bf2c5..0eec9cdf8fd 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -2474,6 +2474,12 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { return r; } + if (c->delegate_namespaces != NAMESPACE_FLAGS_INITIAL) { + r = serialize_item_format(f, "exec-context-delegate-namespaces", "%lu", c->delegate_namespaces); + if (r < 0) + return r; + } + #if HAVE_LIBBPF if (exec_context_restrict_filesystems_set(c)) { char *fs; @@ -3536,6 +3542,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { r = safe_atolu(val, &c->restrict_namespaces); if (r < 0) return r; + } else if ((val = startswith(l, "exec-context-delegate-namespaces="))) { + r = safe_atolu(val, &c->delegate_namespaces); + if (r < 0) + return r; } else if ((val = startswith(l, "exec-context-restrict-filesystems="))) { r = set_ensure_allocated(&c->restrict_filesystems, &string_hash_ops); if (r < 0) diff --git a/src/core/execute.c b/src/core/execute.c index 6811bf301ca..77399eb73a6 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -611,6 +611,7 @@ void exec_context_init(ExecContext *c) { .timeout_clean_usec = USEC_INFINITY, .capability_bounding_set = CAP_MASK_UNSET, .restrict_namespaces = NAMESPACE_FLAGS_INITIAL, + .delegate_namespaces = NAMESPACE_FLAGS_INITIAL, .log_level_max = -1, #if HAVE_SECCOMP .syscall_errno = SECCOMP_ERROR_NUMBER_KILL, diff --git a/src/core/execute.h b/src/core/execute.h index 6421f19cc44..559766b3684 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -350,6 +350,7 @@ struct ExecContext { unsigned long personality; unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */ + unsigned long delegate_namespaces; /* The CLONE_NEWxyz flags delegated to the unit's processes */ Set *restrict_filesystems; bool restrict_filesystems_allow_list:1; diff --git a/src/core/load-fragment-gperf.gperf.in 
b/src/core/load-fragment-gperf.gperf.in index 5104c107198..a0907e8a4b1 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -73,7 +73,8 @@ {{type}}.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof({{type}}, exec_context) {{type}}.SystemCallLog, config_parse_syscall_log, 0, offsetof({{type}}, exec_context) {{type}}.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof({{type}}, exec_context.memory_deny_write_execute) -{{type}}.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof({{type}}, exec_context) +{{type}}.RestrictNamespaces, config_parse_namespace_flags, 0, offsetof({{type}}, exec_context.restrict_namespaces) +{{type}}.DelegateNamespaces, config_parse_namespace_flags, 0, offsetof({{type}}, exec_context.delegate_namespaces) {{type}}.RestrictRealtime, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_realtime) {{type}}.RestrictSUIDSGID, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_suid_sgid) {{type}}.RestrictAddressFamilies, config_parse_address_families, 0, offsetof({{type}}, exec_context) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 8460de263e2..4bd65c1aaea 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -3566,7 +3566,7 @@ int config_parse_address_families( } } -int config_parse_restrict_namespaces( +int config_parse_namespace_flags( const char *unit, const char *filename, unsigned line, @@ -3578,24 +3578,25 @@ int config_parse_restrict_namespaces( void *data, void *userdata) { - ExecContext *c = data; - unsigned long flags; + unsigned long *flags = data; + unsigned long all = UPDATE_FLAG(NAMESPACE_FLAGS_ALL, CLONE_NEWUSER, !streq(lvalue, "DelegateNamespaces")); + unsigned long f; bool invert = false; int r; if (isempty(rvalue)) { /* Reset to the default. 
*/ - c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL; + *flags = NAMESPACE_FLAGS_INITIAL; return 0; } /* Boolean parameter ignores the previous settings */ r = parse_boolean(rvalue); if (r > 0) { - c->restrict_namespaces = 0; + *flags = streq(lvalue, "DelegateNamespaces") ? all : 0; /* "yes": RestrictNamespaces= stores the permitted namespaces (so none), DelegateNamespaces= stores the delegated namespaces (so all) */ return 0; } else if (r == 0) { - c->restrict_namespaces = NAMESPACE_FLAGS_ALL; + *flags = streq(lvalue, "DelegateNamespaces") ? 0 : all; /* "no": permit every namespace, or delegate none */ return 0; } @@ -3605,18 +3606,25 @@ int config_parse_restrict_namespaces( } /* Not a boolean argument, in this case it's a list of namespace types. */ - r = namespace_flags_from_string(rvalue, &flags); + r = namespace_flags_from_string(rvalue, &f); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue); return 0; } - if (c->restrict_namespaces == NAMESPACE_FLAGS_INITIAL) + if (*flags == NAMESPACE_FLAGS_INITIAL) /* Initial assignment. Just set the value. */ - c->restrict_namespaces = invert ? (~flags) & NAMESPACE_FLAGS_ALL : flags; + f = invert ? (~f) & all : f; else /* Merge the value with the previous one.
*/ - SET_FLAG(c->restrict_namespaces, flags, !invert); + f = UPDATE_FLAG(*flags, f, !invert); + + if (FLAGS_SET(f, CLONE_NEWUSER) && streq(lvalue, "DelegateNamespaces")) { + log_syntax(unit, LOG_WARNING, filename, line, r, "The user namespace cannot be delegated with DelegateNamespaces=, ignoring: %s", rvalue); + return 0; + } + + *flags = f; return 0; } @@ -6359,7 +6367,7 @@ void unit_dump_config_items(FILE *f) { { config_parse_syscall_errno, "ERRNO" }, { config_parse_syscall_log, "SYSCALLS" }, { config_parse_address_families, "FAMILIES" }, - { config_parse_restrict_namespaces, "NAMESPACES" }, + { config_parse_namespace_flags, "NAMESPACES" }, #endif { config_parse_restrict_filesystems, "FILESYSTEMS" }, { config_parse_cpu_shares, "SHARES" }, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 881ce152d55..7b758df2e68 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -127,7 +127,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_working_directory); CONFIG_PARSER_PROTOTYPE(config_parse_fdname); CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat); CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat); -CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces); +CONFIG_PARSER_PROTOTYPE(config_parse_namespace_flags); CONFIG_PARSER_PROTOTYPE(config_parse_restrict_filesystems); CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths); CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 3a246e82bc6..f484a6fd1b2 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -1667,7 +1667,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con return 1; } - if (streq(field, "RestrictNamespaces")) { + if (STR_IN_SET(field, "RestrictNamespaces", + "DelegateNamespaces")) { bool invert = false; unsigned long flags; diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh 
new file mode 100755
index 00000000000..fe0defaeb65
--- /dev/null
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2016
+#
+# Tests for DelegateNamespaces=. Most testcases run the same command twice in a unit with
+# PrivateUsersEx=self: first without delegating the relevant namespace, where the command is expected
+# to fail, and then with that namespace delegated, where the command is expected to succeed.
+set -eux
+set -o pipefail
+
+# shellcheck source=test/units/test-control.sh
+. "$(dirname "$0")"/test-control.sh
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
+# Mount namespace: a bind mount is expected to work only when "mnt" is delegated.
+testcase_mount() {
+    (! systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home)
+    systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
+}
+
+# Network namespace: creating a veth pair is expected to work only when "net" is delegated.
+testcase_network() {
+    (! systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes --wait --pipe -- ip link add veth1 type veth peer name veth2)
+    systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
+}
+
+# Cgroup namespace: writing cgroup.pressure is expected to work only when "cgroup" is delegated.
+testcase_cgroup() {
+    (! systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure')
+    systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private -p DelegateNamespaces=cgroup --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure'
+}
+
+# PID namespace: writing ns_last_pid is expected to work only when "pid" is delegated.
+testcase_pid() {
+    (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
+    systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
+}
+
+# UTS namespace: changing the hostname is expected to work only when "uts" is delegated.
+testcase_uts() {
+    (! systemd-run -p PrivateUsersEx=self -p ProtectHostnameEx=private --wait --pipe -- hostname abc)
+    systemd-run -p PrivateUsersEx=self -p ProtectHostnameEx=private -p DelegateNamespaces=uts --wait --pipe -- hostname abc
+}
+
+# DelegateNamespaces= implies PrivateUsers=self, but must not override an explicit PrivateUsers= setting.
+testcase_implied_private_users_self() {
+    # If not explicitly set, PrivateUsers=self is implied.
+    systemd-run -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
+    # If explicitly set, PrivateUsers= is not overridden.
+    systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
+    # The kernel pads the uid_map columns with spaces, so squeeze runs of spaces before comparing.
+    systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait bash -c 'test "$(tr -s " " </proc/self/uid_map)" == " 0 0 65536"'
+}
+
+# DelegateNamespaces=yes combined with many other sandboxing settings, run inside the unpacked minimal_0 image.
+testcase_multiple_features() {
+    unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-delegate-namespaces-root /usr/share/minimal_0.raw
+
+    systemd-run \
+        -p PrivatePIDs=yes \
+        -p RootDirectory=/tmp/TEST-07-PID1-delegate-namespaces-root \
+        -p ProcSubset=pid \
+        -p BindReadOnlyPaths=/usr/share \
+        -p NoNewPrivileges=yes \
+        -p ProtectSystem=strict \
+        -p User=testuser \
+        -p Group=testuser \
+        -p RuntimeDirectory=abc \
+        -p StateDirectory=qed \
+        -p InaccessiblePaths=/usr/include \
+        -p TemporaryFileSystem=/home \
+        -p PrivateTmp=yes \
+        -p PrivateDevices=yes \
+        -p PrivateNetwork=yes \
+        -p PrivateUsersEx=self \
+        -p PrivateIPC=yes \
+        -p ProtectHostname=yes \
+        -p ProtectClock=yes \
+        -p ProtectKernelTunables=yes \
+        -p ProtectKernelModules=yes \
+        -p ProtectKernelLogs=yes \
+        -p ProtectControlGroupsEx=private \
+        -p LockPersonality=yes \
+        -p Environment=ABC=QED \
+        -p DelegateNamespaces=yes \
+        --wait \
+        --pipe \
+        grep MARKER=1 /etc/os-release
+
+    rm -rf /tmp/TEST-07-PID1-delegate-namespaces-root
+}
+
+# Without this call the testcase_* functions above are only defined, never executed (run_testcases comes from test-control.sh).
+run_testcases