]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: add DelegateSubgroup= setting
authorLennart Poettering <lennart@poettering.net>
Fri, 21 Apr 2023 16:22:35 +0000 (18:22 +0200)
committerLennart Poettering <lennart@poettering.net>
Thu, 27 Apr 2023 10:18:32 +0000 (12:18 +0200)
This implements a minimal subset of #24961, but in a lot more
restrictive way: we only allow one level of subcgroup (as that's enough
to address the no-processes in inner cgroups rule), and does not change
anything about threaded cgroup logic or similar, or make any of this new
behaviour mandatory.

All this does is this: all non-control processes we invoke for a unit
we'll invoke in a subgroup by the specified name.

We'll later port all our current services that use cgroup delegation
over to this, i.e. user@.service, systemd-nspawn@.service and
systemd-udevd.service.

docs/CGROUP_DELEGATION.md
man/org.freedesktop.systemd1.xml
man/systemd.resource-control.xml
src/core/cgroup.c
src/core/cgroup.h
src/core/dbus-cgroup.c
src/core/execute.c
src/core/load-fragment-gperf.gperf.in
src/core/load-fragment.c
src/core/load-fragment.h
src/shared/bus-unit-util.c

index f5509fb833160e66595ae2951daa0c68806cec44..4210a75767643f3c1ff965ccadc58734c1f79fcd 100644 (file)
@@ -271,7 +271,9 @@ your service has any of these four settings set, you must be prepared that a
 means that your service code should have moved itself further down the cgroup
 tree by the time it notifies the service manager about start-up readiness, so
 that the service's main cgroup is definitely an inner node by the time the
-service manager might start `ExecStartPost=`.)
+service manager might start `ExecStartPost=`. Starting with systemd 254 you may
+also use `DelegateSubgroup=` to let the service manager put your initial
+service process into a subgroup right away.)
 
 (Also note, if you intend to use "threaded" cgroups — as added in Linux 4.14 —,
 then you should do that *two* levels down from the main service cgroup your
index e462c606362c6255817387baf2d47da16e58f9a6..f2e892671a42a04b3a6871ca4e15f5d0dc71cc01 100644 (file)
@@ -2713,6 +2713,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly as DelegateControllers = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s DelegateSubgroup = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CPUAccounting = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUWeight = ...;
@@ -3942,6 +3944,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -4544,6 +4548,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       memory controller is reached. It will take into consideration limits on all parent slices, other than the
       limits set on the unit itself.</para>
 
+      <para><varname>DelegateSubgroup</varname> contains the cgroup subgroup to place invoked unit processes
+      in. As configured by the option of the same name in unit files. This is set to the empty string when it
+      does not apply or no subgroup has been configured.</para>
+
       <para><varname>RuntimeDirectorySymlink</varname>, <varname>StateDirectorySymlink</varname>,
       <varname>CacheDirectorySymlink</varname> and  <varname>LogsDirectorySymlink</varname> respectively
       implement the destination parameter of the unit files settings <varname>RuntimeDirectory</varname>,
@@ -4715,6 +4723,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly as DelegateControllers = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s DelegateSubgroup = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CPUAccounting = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUWeight = ...;
@@ -5936,6 +5946,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -6588,6 +6600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly as DelegateControllers = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s DelegateSubgroup = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CPUAccounting = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUWeight = ...;
@@ -7655,6 +7669,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -8434,6 +8450,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly as DelegateControllers = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s DelegateSubgroup = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CPUAccounting = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUWeight = ...;
@@ -9473,6 +9491,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -10111,6 +10131,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly as DelegateControllers = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s DelegateSubgroup = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CPUAccounting = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUWeight = ...;
@@ -10456,6 +10478,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -10656,6 +10680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly as DelegateControllers = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s DelegateSubgroup = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CPUAccounting = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUWeight = ...;
@@ -11051,6 +11077,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
index f4e4a492a000e86a0136e234e9862b47295d5a17..610c11feb32a7e363938f4c99dc5cb706460d153 100644 (file)
@@ -1148,10 +1148,11 @@ DeviceAllow=/dev/loop-control
         <term><varname>Delegate=</varname></term>
 
         <listitem>
-          <para>Turns on delegation of further resource control partitioning to processes of the unit. Units where this
-          is enabled may create and manage their own private subhierarchy of control groups below the control group of
-          the unit itself. For unprivileged services (i.e. those using the <varname>User=</varname> setting) the unit's
-          control group will be made accessible to the relevant user.</para>
+          <para>Turns on delegation of further resource control partitioning to processes of the unit. Units
+          where this is enabled may create and manage their own private subhierarchy of control groups below
+          the control group of the unit itself. For unprivileged services (i.e. those using the
+          <varname>User=</varname> setting) the unit's control group will be made accessible to the relevant
+          user.</para>
 
           <para>When enabled the service manager will refrain from manipulating control groups or moving
           processes below the unit's control group, so that a clear concept of ownership is established: the
@@ -1188,6 +1189,29 @@ DeviceAllow=/dev/loop-control
         </listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>DelegateSubgroup=</varname></term>
+
+        <listitem>
+          <para>Place unit processes in the specified subgroup of the unit's control group. Takes a valid
+          control group name (not a path!) as parameter, or an empty string to turn this feature
+          off. Defaults to off. The control group name must be usable as filename and avoid conflicts with
+          the kernel's control group attribute files (i.e. <filename>cgroup.procs</filename> is not an
+          acceptable name, since the kernel exposes a native control group attribute file by that name). This
+          option has no effect unless control group delegation is turned on via <varname>Delegate=</varname>,
+          see above. Note that this setting only applies to "main" processes of a unit, i.e. for services to
+          <varname>ExecStart=</varname>, but not for <varname>ExecReload=</varname> and similar. If
+          delegation is enabled, the latter are always placed inside a subgroup named
+          <filename>.control</filename>. The specified subgroup is automatically created (and potentially
+          ownership is passed to the unit's configured user/group) when a process is started in it.</para>
+
+          <para>This option is useful to avoid manually moving the invoked process into a subgroup after it
+          has been started. Since no processes should live in inner nodes of the control group tree it's
+          almost always necessary to run the main ("supervising") process of a unit that has delegation
+          turned on in a subgroup.</para>
+        </listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>DisableControllers=</varname></term>
 
index cd48183f7a7acae7e2cc9e2a47caf2ea4b44b0d1..4ec5dfa587f33f77d8cbb8dbd443e78cd77b5af9 100644 (file)
@@ -293,6 +293,8 @@ void cgroup_context_done(CGroupContext *c) {
         cpu_set_reset(&c->startup_cpuset_cpus);
         cpu_set_reset(&c->cpuset_mems);
         cpu_set_reset(&c->startup_cpuset_mems);
+
+        c->delegate_subgroup = mfree(c->delegate_subgroup);
 }
 
 static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
@@ -570,6 +572,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
                 prefix, managed_oom_preference_to_string(c->moom_preference),
                 prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch));
 
+        if (c->delegate_subgroup)
+                fprintf(f, "%sDelegateSubgroup: %s\n",
+                        prefix, c->delegate_subgroup);
+
         if (c->memory_pressure_threshold_usec != USEC_INFINITY)
                 fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
                         prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
index d445ea1e8d5f7cb524cc9998274083356ad845b3..bbbf9408cc24a830f8d28a645c4bf3a07bb43420 100644 (file)
@@ -133,6 +133,7 @@ struct CGroupContext {
         bool delegate;
         CGroupMask delegate_controllers;
         CGroupMask disable_controllers;
+        char *delegate_subgroup;
 
         /* For unified hierarchy */
         uint64_t cpu_weight;
index 3a02fcbdb104a9ae5c241d7f0c58409fd5097d95..682ad5edd5203c81858fcad275575484a5da2d7b 100644 (file)
@@ -435,6 +435,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_VTABLE_START(0),
         SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0),
         SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0),
+        SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0),
         SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0),
         SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0),
         SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0),
@@ -536,6 +537,33 @@ static int bus_cgroup_set_transient_property(
 
                 return 1;
 
+        } else if (streq(name, "DelegateSubgroup")) {
+                const char *s;
+
+                if (!UNIT_VTABLE(u)->can_delegate)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                if (!isempty(s) && cg_needs_escape(s))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (isempty(s))
+                                c->delegate_subgroup = mfree(c->delegate_subgroup);
+                        else {
+                                r = free_and_strdup_warn(&c->delegate_subgroup, s);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s);
+                }
+
+                return 1;
+
         } else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) {
                 CGroupMask mask = 0;
 
index 60f7a6439c1e47ff304871e2bef121d18d63fae7..da7a37c187ead7285a921e0b6b02fd2c34c64e0a 100644 (file)
@@ -4178,8 +4178,12 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p
         return 0;
 }
 
-static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
-        bool using_subcgroup;
+static int exec_parameters_get_cgroup_path(
+                const ExecParameters *params,
+                const CGroupContext *c,
+                char **ret) {
+
+        const char *subgroup = NULL;
         char *p;
 
         assert(params);
@@ -4197,16 +4201,22 @@ static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **
          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
          * flag, which is only passed for the former statements, not for the latter. */
 
-        using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
-        if (using_subcgroup)
-                p = path_join(params->cgroup_path, ".control");
+        if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
+                if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
+                        subgroup = ".control";
+                else
+                        subgroup = c->delegate_subgroup;
+        }
+
+        if (subgroup)
+                p = path_join(params->cgroup_path, subgroup);
         else
                 p = strdup(params->cgroup_path);
         if (!p)
                 return -ENOMEM;
 
         *ret = p;
-        return using_subcgroup;
+        return !!subgroup;
 }
 
 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
@@ -4705,7 +4715,7 @@ static int exec_child(
         if (params->cgroup_path) {
                 _cleanup_free_ char *p = NULL;
 
-                r = exec_parameters_get_cgroup_path(params, &p);
+                r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
                 if (r < 0) {
                         *exit_status = EXIT_CGROUP;
                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
@@ -4880,11 +4890,26 @@ static int exec_child(
                  * touch a single hierarchy too. */
 
                 if (params->flags & EXEC_CGROUP_DELEGATE) {
+                        _cleanup_free_ char *p = NULL;
+
                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
                         if (r < 0) {
                                 *exit_status = EXIT_CGROUP;
                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
                         }
+
+                        r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
+                        if (r < 0) {
+                                *exit_status = EXIT_CGROUP;
+                                return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
+                        }
+                        if (r > 0) {
+                                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
+                                if (r < 0) {
+                                        *exit_status = EXIT_CGROUP;
+                                        return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
+                                }
+                        }
                 }
 
                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
@@ -5635,13 +5660,13 @@ int exec_spawn(Unit *unit,
         log_command_line(unit, "About to execute", command->path, command->argv);
 
         if (params->cgroup_path) {
-                r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
+                r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
                 if (r < 0)
                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
                 if (r > 0) { /* We are using a child cgroup */
                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
                         if (r < 0)
-                                return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
+                                return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
 
                         /* Normally we would not propagate the xattrs to children but since we created this
                          * sub-cgroup internally we should do it. */
index 8a2823b075a0ebbc6e68b29c7ea9af61219ec175..110bccb7ad444bc1f0fc62be973c8280b9d5f72c 100644 (file)
 {{type}}.TasksAccounting,                  config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.tasks_accounting)
 {{type}}.TasksMax,                         config_parse_tasks_max,                      0,                                  offsetof({{type}}, cgroup_context.tasks_max)
 {{type}}.Delegate,                         config_parse_delegate,                       0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.DelegateSubgroup,                 config_parse_delegate_subgroup ,             0,                                  offsetof({{type}}, cgroup_context)
 {{type}}.DisableControllers,               config_parse_disable_controllers,            0,                                  offsetof({{type}}, cgroup_context)
 {{type}}.IPAccounting,                     config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.ip_accounting)
 {{type}}.IPAddressAllow,                   config_parse_in_addr_prefixes,               AF_UNSPEC,                          offsetof({{type}}, cgroup_context.ip_address_allow)
index 581a051d460dd08857a748233872da7584028349..ebfe98d7cc9ab86dc718d77c39d56e21a9d4e4d0 100644 (file)
@@ -4029,6 +4029,42 @@ int config_parse_delegate(
         return 0;
 }
 
+int config_parse_delegate_subgroup(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        CGroupContext *c = ASSERT_PTR(data);
+        UnitType t;
+
+        t = unit_name_to_type(unit);
+        assert(t >= 0);
+
+        if (!unit_vtable[t]->can_delegate) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring.");
+                return 0;
+        }
+
+        if (isempty(rvalue)) {
+                c->delegate_subgroup = mfree(c->delegate_subgroup);
+                return 0;
+        }
+
+        if (cg_needs_escape(rvalue)) { /* Insist that specified names don't need escaping */
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        return free_and_strdup_warn(&c->delegate_subgroup, rvalue);
+}
+
 int config_parse_managed_oom_mode(
                 const char *unit,
                 const char *filename,
index a38d697338adfc48f5a43f357a3f1807ff1ef0d7..59f02a32072f24a2adb10d20e5c7e987196aed2d 100644 (file)
@@ -83,6 +83,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares);
 CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit);
 CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
 CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
+CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup);
 CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
 CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
 CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
index ebbd1f7f28e502b8b848ac3f7577182a4ee6a14c..a321179609743acdfe41a52dc3d16035ac683237 100644 (file)
@@ -461,7 +461,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
                               "ManagedOOMSwap",
                               "ManagedOOMMemoryPressure",
                               "ManagedOOMPreference",
-                              "MemoryPressureWatch"))
+                              "MemoryPressureWatch",
+                              "DelegateSubgroup"))
                 return bus_append_string(m, field, eq);
 
         if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {