From: Lennart Poettering Date: Fri, 21 Apr 2023 16:22:35 +0000 (+0200) Subject: core: add DelegateSubgroup= setting X-Git-Tag: v254-rc1~610^2~6 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a8b993dc11319292c54b301f3faffc4a05ab5ec1;p=thirdparty%2Fsystemd.git core: add DelegateSubgroup= setting This implements a minimal subset of #24961, but in a lot more restrictive way: we only allow one level of subcgroup (as that's enough to address the no-processes in inner cgroups rule), and does not change anything about threaded cgroup logic or similar, or make any of this new behaviour mandatory. All this does is this: all non-control processes we invoke for a unit we'll invoke in a subgroup by the specified name. We'll later port all our current services that use cgroup delegation over to this, i.e. user@.service, systemd-nspawn@.service and systemd-udevd.service. --- diff --git a/docs/CGROUP_DELEGATION.md b/docs/CGROUP_DELEGATION.md index f5509fb8331..4210a757676 100644 --- a/docs/CGROUP_DELEGATION.md +++ b/docs/CGROUP_DELEGATION.md @@ -271,7 +271,9 @@ your service has any of these four settings set, you must be prepared that a means that your service code should have moved itself further down the cgroup tree by the time it notifies the service manager about start-up readiness, so that the service's main cgroup is definitely an inner node by the time the -service manager might start `ExecStartPost=`.) +service manager might start `ExecStartPost=`. Starting with systemd 254 you may +also use `DelegateSubgroup=` to let the service manager put your initial +service process into a subgroup right away.) (Also note, if you intend to use "threaded" cgroups — as added in Linux 4.14 —, then you should do that *two* levels down from the main service cgroup your diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index e462c606362..f2e892671a4 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2713,6 +2713,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -3942,6 +3944,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4544,6 +4548,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { memory controller is reached. It will take into consideration limits on all parent slices, other than the limits set on the unit itself. + DelegateSubgroup contains the cgroup subgroup to place invoked unit processes + in. As configured by the option of the same name in unit files. This is set to the empty string when it + does not apply or no subgroup has been configured. + RuntimeDirectorySymlink, StateDirectorySymlink, CacheDirectorySymlink and LogsDirectorySymlink respectively implement the destination parameter of the unit files settings RuntimeDirectory, @@ -4715,6 +4723,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -5936,6 +5946,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6588,6 +6600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -7655,6 +7669,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8434,6 +8450,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -9473,6 +9491,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -10111,6 +10131,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -10456,6 +10478,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -10656,6 +10680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -11051,6 +11077,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index f4e4a492a00..610c11feb32 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1148,10 +1148,11 @@ DeviceAllow=/dev/loop-control Delegate= - Turns on delegation of further resource control partitioning to processes of the unit. Units where this - is enabled may create and manage their own private subhierarchy of control groups below the control group of - the unit itself. For unprivileged services (i.e. those using the User= setting) the unit's - control group will be made accessible to the relevant user. + Turns on delegation of further resource control partitioning to processes of the unit. Units + where this is enabled may create and manage their own private subhierarchy of control groups below + the control group of the unit itself. For unprivileged services (i.e. those using the + User= setting) the unit's control group will be made accessible to the relevant + user. When enabled the service manager will refrain from manipulating control groups or moving processes below the unit's control group, so that a clear concept of ownership is established: the @@ -1188,6 +1189,29 @@ DeviceAllow=/dev/loop-control + + DelegateSubgroup= + + + Place unit processes in the specified subgroup of the unit's control group. Takes a valid + control group name (not a path!) as parameter, or an empty string to turn this feature + off. Defaults to off. The control group name must be usable as filename and avoid conflicts with + the kernel's control group attribute files (i.e. cgroup.procs is not an + acceptable name, since the kernel exposes a native control group attribute file by that name). This + option has no effect unless control group delegation is turned on via Delegate=, + see above. Note that this setting only applies to "main" processes of a unit, i.e. for services to + ExecStart=, but not for ExecReload= and similar. If + delegation is enabled, the latter are always placed inside a subgroup named + .control. The specified subgroup is automatically created (and potentially + ownership is passed to the unit's configured user/group) when a process is started in it. + + This option is useful to avoid manually moving the invoked process into a subgroup after it + has been started. Since no processes should live in inner nodes of the control group tree it's + almost always necessary to run the main ("supervising") process of a unit that has delegation + turned on in a subgroup. + + + DisableControllers= diff --git a/src/core/cgroup.c b/src/core/cgroup.c index cd48183f7a7..4ec5dfa587f 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -293,6 +293,8 @@ void cgroup_context_done(CGroupContext *c) { cpu_set_reset(&c->startup_cpuset_cpus); cpu_set_reset(&c->cpuset_mems); cpu_set_reset(&c->startup_cpuset_mems); + + c->delegate_subgroup = mfree(c->delegate_subgroup); } static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) { @@ -570,6 +572,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, managed_oom_preference_to_string(c->moom_preference), prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch)); + if (c->delegate_subgroup) + fprintf(f, "%sDelegateSubgroup: %s\n", + prefix, c->delegate_subgroup); + if (c->memory_pressure_threshold_usec != USEC_INFINITY) fprintf(f, "%sMemoryPressureThresholdSec: %s\n", prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1)); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index d445ea1e8d5..bbbf9408cc2 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -133,6 +133,7 @@ struct CGroupContext { bool delegate; CGroupMask delegate_controllers; CGroupMask disable_controllers; + char *delegate_subgroup; /* For unified hierarchy */ uint64_t cpu_weight; diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 3a02fcbdb10..682ad5edd52 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -435,6 +435,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0), SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0), + SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0), SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0), SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0), SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0), @@ -536,6 +537,33 @@ static int bus_cgroup_set_transient_property( return 1; + } else if (streq(name, "DelegateSubgroup")) { + const char *s; + + if (!UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s) && cg_needs_escape(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (isempty(s)) + c->delegate_subgroup = mfree(c->delegate_subgroup); + else { + r = free_and_strdup_warn(&c->delegate_subgroup, s); + if (r < 0) + return r; + } + + unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s); + } + + return 1; + } else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) { CGroupMask mask = 0; diff --git a/src/core/execute.c b/src/core/execute.c index 60f7a6439c1..da7a37c187e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -4178,8 +4178,12 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p return 0; } -static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) { - bool using_subcgroup; +static int exec_parameters_get_cgroup_path( + const ExecParameters *params, + const CGroupContext *c, + char **ret) { + + const char *subgroup = NULL; char *p; assert(params); @@ -4197,16 +4201,22 @@ static int exec_parameters_get_cgroup_path(const ExecParameters *params, char ** * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP * flag, which is only passed for the former statements, not for the latter. */ - using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL); - if (using_subcgroup) - p = path_join(params->cgroup_path, ".control"); + if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) { + if (FLAGS_SET(params->flags, EXEC_IS_CONTROL)) + subgroup = ".control"; + else + subgroup = c->delegate_subgroup; + } + + if (subgroup) + p = path_join(params->cgroup_path, subgroup); else p = strdup(params->cgroup_path); if (!p) return -ENOMEM; *ret = p; - return using_subcgroup; + return !!subgroup; } static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) { @@ -4705,7 +4715,7 @@ static int exec_child( if (params->cgroup_path) { _cleanup_free_ char *p = NULL; - r = exec_parameters_get_cgroup_path(params, &p); + r = exec_parameters_get_cgroup_path(params, cgroup_context, &p); if (r < 0) { *exit_status = EXIT_CGROUP; return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m"); @@ -4880,11 +4890,26 @@ static int exec_child( * touch a single hierarchy too. */ if (params->flags & EXEC_CGROUP_DELEGATE) { + _cleanup_free_ char *p = NULL; + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid); if (r < 0) { *exit_status = EXIT_CGROUP; return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m"); } + + r = exec_parameters_get_cgroup_path(params, cgroup_context, &p); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m"); + } + if (r > 0) { + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m"); + } + } } if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) { @@ -5635,13 +5660,13 @@ int exec_spawn(Unit *unit, log_command_line(unit, "About to execute", command->path, command->argv); if (params->cgroup_path) { - r = exec_parameters_get_cgroup_path(params, &subcgroup_path); + r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path); if (r < 0) return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m"); if (r > 0) { /* We are using a child cgroup */ r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path); if (r < 0) - return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path); + return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path); /* Normally we would not propagate the xattrs to children but since we created this * sub-cgroup internally we should do it. */ diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 8a2823b075a..110bccb7ad4 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -237,6 +237,7 @@ {{type}}.TasksAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.tasks_accounting) {{type}}.TasksMax, config_parse_tasks_max, 0, offsetof({{type}}, cgroup_context.tasks_max) {{type}}.Delegate, config_parse_delegate, 0, offsetof({{type}}, cgroup_context) +{{type}}.DelegateSubgroup, config_parse_delegate_subgroup , 0, offsetof({{type}}, cgroup_context) {{type}}.DisableControllers, config_parse_disable_controllers, 0, offsetof({{type}}, cgroup_context) {{type}}.IPAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.ip_accounting) {{type}}.IPAddressAllow, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_allow) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 581a051d460..ebfe98d7cc9 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -4029,6 +4029,42 @@ int config_parse_delegate( return 0; } +int config_parse_delegate_subgroup( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = ASSERT_PTR(data); + UnitType t; + + t = unit_name_to_type(unit); + assert(t >= 0); + + if (!unit_vtable[t]->can_delegate) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring."); + return 0; + } + + if (isempty(rvalue)) { + c->delegate_subgroup = mfree(c->delegate_subgroup); + return 0; + } + + if (cg_needs_escape(rvalue)) { /* Insist that specified names don't need escaping */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue); + return 0; + } + + return free_and_strdup_warn(&c->delegate_subgroup, rvalue); +} + int config_parse_managed_oom_mode( const char *unit, const char *filename, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index a38d697338a..59f02a32072 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -83,6 +83,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares); CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit); CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max); CONFIG_PARSER_PROTOTYPE(config_parse_delegate); +CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index ebbd1f7f28e..a3211796097 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -461,7 +461,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons "ManagedOOMSwap", "ManagedOOMMemoryPressure", "ManagedOOMPreference", - "MemoryPressureWatch")) + "MemoryPressureWatch", + "DelegateSubgroup")) return bus_append_string(m, field, eq); if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {