]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
cgroup: Add DisableControllers= directive to disable controller in subtree 10567/head
authorChris Down <chris@chrisdown.name>
Mon, 3 Dec 2018 14:38:06 +0000 (14:38 +0000)
committerChris Down <chris@chrisdown.name>
Mon, 3 Dec 2018 15:40:31 +0000 (15:40 +0000)
Some controllers (like the CPU controller) have a performance cost that
is non-trivial on certain workloads. While this can be mitigated and
improved to an extent, there will for some controllers always be some
overheads associated with the benefits gained from the controller.
Inside Facebook, the fix applied has been to disable the CPU controller
forcibly with `cgroup_disable=cpu` on the kernel command line.

This presents a problem: to disable or reenable the controller, a reboot
is required, but this is quite cumbersome and slow to do for many
thousands of machines, especially machines where disabling/enabling a
stateful service on a machine is a matter of several minutes.

Currently systemd provides some configuration knobs for these in the
form of `[Default]CPUAccounting`, `[Default]MemoryAccounting`, and the
like. The limitation of these is that Default*Accounting is overrideable
by individual services, of which any one could decide to reenable a
controller within the hierarchy at any point just by using a controller
feature implicitly (eg. `CPUWeight`), even if the use of that CPU
feature could just be opportunistic. Since many services are provided by
the distribution, or by upstream teams at a particular organisation,
it's not a sustainable solution to simply try to find and remove
offending directives from these units.

This commit presents a more direct solution -- a DisableControllers=
directive that forcibly disallows a controller from being enabled within
a subtree.

man/systemd.resource-control.xml
src/core/cgroup.c
src/core/cgroup.h
src/core/dbus-cgroup.c
src/core/load-fragment-gperf.gperf.m4
src/core/load-fragment.c
src/core/load-fragment.h
src/test/test-cgroup-mask.c
test/meson.build
test/nomem.slice [new file with mode: 0644]
test/nomemleaf.service [new file with mode: 0644]

index 4e282dad3d39e4b734ed7a6ca83743ca952325a9..aa7d9bcd59bff25e5d8d1c7e33b00383ddd12bfa 100644 (file)
         </listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>DisableControllers=</varname></term>
+
+        <listitem>
+          <para>Disables controllers from being enabled for a unit's children. If a controller listed is already in use
+          in its subtree, the controller will be removed from the subtree. This can be used to avoid child units being
+          able to implicitly or explicitly enable a controller. Defaults to not disabling any controllers.</para>
+
+          <para>It may not be possible to successfully disable a controller if the unit or any child of the unit in
+          question delegates controllers to its children, as any delegated subtree of the cgroup hierarchy is unmanaged
+          by systemd.</para>
+
+          <para>Multiple controllers may be specified, separated by spaces. You may also pass
+          <varname>DisableControllers=</varname> multiple times, in which case each new instance adds another controller
+          to disable. Passing <varname>DisableControllers=</varname> by itself with no controller name present resets
+          the disabled controller list.</para>
+
+          <para>Valid controllers are <option>cpu</option>, <option>cpuacct</option>, <option>io</option>,
+          <option>blkio</option>, <option>memory</option>, <option>devices</option>, and <option>pids</option>.</para>
+        </listitem>
+      </varlistentry>
     </variablelist>
   </refsect1>
 
index 13c595bbcdb58be86bb4a07c35cdf6e1e9951fd5..b585e4bd2bb51b97b685f52e79d5fee4eba3b1cf 100644 (file)
@@ -1346,7 +1346,7 @@ CGroupMask unit_get_own_mask(Unit *u) {
         if (!c)
                 return 0;
 
-        return cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
+        return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
 }
 
 CGroupMask unit_get_delegate_mask(Unit *u) {
@@ -1461,6 +1461,7 @@ CGroupMask unit_get_target_mask(Unit *u) {
 
         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
         mask &= u->manager->cgroup_supported;
+        mask &= ~unit_get_ancestor_disable_mask(u);
 
         return mask;
 }
@@ -1475,6 +1476,7 @@ CGroupMask unit_get_enable_mask(Unit *u) {
 
         mask = unit_get_members_mask(u);
         mask &= u->manager->cgroup_supported;
+        mask &= ~unit_get_ancestor_disable_mask(u);
 
         return mask;
 }
@@ -1914,8 +1916,8 @@ static bool unit_has_mask_enables_realized(
          * we want to add is already added. */
 
         return u->cgroup_realized &&
-                ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == u->cgroup_realized_mask &&
-                ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == u->cgroup_enabled_mask;
+                ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
+                ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
 }
 
 void unit_add_to_cgroup_realize_queue(Unit *u) {
@@ -1965,11 +1967,7 @@ static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
         new_target_mask = u->cgroup_realized_mask | target_mask;
         new_enable_mask = u->cgroup_enabled_mask | enable_mask;
 
-        r = unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
-        if (r < 0)
-                return r;
-
-        return 0;
+        return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
 }
 
 /* Controllers can only be disabled depth-first, from the leaves of the
@@ -2043,7 +2041,7 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
  *     h i   j k  l m   n o
  *
  * 1. We want to realise cgroup "d" now.
- * 2. cgroup "a" has DisableController=cpu in the associated unit.
+ * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
  * 3. cgroup "k" just started requesting the memory controller.
  *
  * To make this work we must do the following in order:
index 828b6f07951ab77411f6a634275aab4f3dee4d62..6d094e9ecda9ef90ca859b0ea454c77ee086519f 100644 (file)
@@ -118,6 +118,8 @@ struct CGroupContext {
 
         bool delegate;
         CGroupMask delegate_controllers;
+
+        CGroupMask disable_controllers;
 };
 
 /* Used when querying IP accounting data */
@@ -151,6 +153,9 @@ CGroupMask unit_get_delegate_mask(Unit *u);
 CGroupMask unit_get_members_mask(Unit *u);
 CGroupMask unit_get_siblings_mask(Unit *u);
 CGroupMask unit_get_subtree_mask(Unit *u);
+CGroupMask unit_get_disable_mask(Unit *u);
+CGroupMask unit_get_ancestor_disable_mask(Unit *u);
+
 CGroupMask unit_get_target_mask(Unit *u);
 CGroupMask unit_get_enable_mask(Unit *u);
 
index a2f532076dcdf0798c24b10f98d154c3ff471e76..53890bcafbfe49317c1cd0191b1ba7c078d9342f 100644 (file)
@@ -17,7 +17,7 @@
 
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
 
-static int property_get_delegate_controllers(
+static int property_get_cgroup_mask(
                 sd_bus *bus,
                 const char *path,
                 const char *interface,
@@ -26,26 +26,22 @@ static int property_get_delegate_controllers(
                 void *userdata,
                 sd_bus_error *error) {
 
-        CGroupContext *c = userdata;
-        CGroupController cc;
+        CGroupMask *mask = userdata;
+        CGroupController ctrl;
         int r;
 
         assert(bus);
         assert(reply);
-        assert(c);
-
-        if (!c->delegate)
-                return sd_bus_message_append(reply, "as", 0);
 
         r = sd_bus_message_open_container(reply, 'a', "s");
         if (r < 0)
                 return r;
 
-        for (cc = 0; cc < _CGROUP_CONTROLLER_MAX; cc++) {
-                if ((c->delegate_controllers & CGROUP_CONTROLLER_TO_MASK(cc)) == 0)
+        for (ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) {
+                if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0)
                         continue;
 
-                r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(cc));
+                r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl));
                 if (r < 0)
                         return r;
         }
@@ -53,6 +49,27 @@ static int property_get_delegate_controllers(
         return sd_bus_message_close_container(reply);
 }
 
+static int property_get_delegate_controllers(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        CGroupContext *c = userdata;
+
+        assert(bus);
+        assert(reply);
+        assert(c);
+
+        if (!c->delegate)
+                return sd_bus_message_append(reply, "as", 0);
+
+        return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error);
+}
+
 static int property_get_io_device_weight(
                 sd_bus *bus,
                 const char *path,
@@ -342,6 +359,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0),
         SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0),
         SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0),
+        SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0),
         SD_BUS_VTABLE_END
 };
 
index 97a707c1447e3e486f0e853212d1f13343335fc3..cdbc67f885d2f4c689d290d3bcfbb6b309de618a 100644 (file)
@@ -192,6 +192,7 @@ $1.BlockIOWriteBandwidth,        config_parse_blockio_bandwidth,     0,
 $1.TasksAccounting,              config_parse_bool,                  0,                             offsetof($1, cgroup_context.tasks_accounting)
 $1.TasksMax,                     config_parse_tasks_max,             0,                             offsetof($1, cgroup_context.tasks_max)
 $1.Delegate,                     config_parse_delegate,              0,                             offsetof($1, cgroup_context)
+$1.DisableControllers,           config_parse_disable_controllers,   0,                             offsetof($1, cgroup_context)
 $1.IPAccounting,                 config_parse_bool,                  0,                             offsetof($1, cgroup_context.ip_accounting)
 $1.IPAddressAllow,               config_parse_ip_address_access,     0,                             offsetof($1, cgroup_context.ip_address_allow)
 $1.IPAddressDeny,                config_parse_ip_address_access,     0,                             offsetof($1, cgroup_context.ip_address_deny)
index d1988190c2ad56db75fea6294e8cdd64785efcbb..041b62231485bad287b48d75a47b31e21a8a0762 100644 (file)
@@ -4323,6 +4323,41 @@ int config_parse_exit_status(
         return 0;
 }
 
+int config_parse_disable_controllers(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int r;
+        CGroupContext *c = data;
+        CGroupMask disabled_mask;
+
+        /* 1. If empty, make all controllers eligible for use again.
+         * 2. If non-empty, merge all listed controllers, space separated. */
+
+        if (isempty(rvalue)) {
+                c->disable_controllers = 0;
+                return 0;
+        }
+
+        r = cg_mask_from_string(rvalue, &disabled_mask);
+        if (r < 0 || disabled_mask <= 0) {
+                log_syntax(unit, LOG_ERR, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue);
+                return 0;
+        }
+
+        c->disable_controllers |= disabled_mask;
+
+        return 0;
+}
+
 #define FOLLOW_MAX 8
 
 static int open_follow(char **filename, FILE **_f, Set *names, char **_final) {
index 71d94a77629becc2d25373989d303cc1978cbc99..e0d3b4ec3bddcc39ac5ca1051ef3311e7aa164fb 100644 (file)
@@ -105,6 +105,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
 CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
 CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
 CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
 
 /* gperf prototypes */
 const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
index 143e7aa42477edb1f7c72f6a70111369d3511754..7f6c0c27722195cdead667ee13005fd13bc764b2 100644 (file)
@@ -30,7 +30,7 @@ static void log_cgroup_mask(CGroupMask got, CGroupMask expected) {
 static int test_cgroup_mask(void) {
         _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
         _cleanup_(manager_freep) Manager *m = NULL;
-        Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep;
+        Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep, *nomem_parent, *nomem_leaf;
         int r;
         CGroupMask cpu_accounting_mask = get_cpu_accounting_mask();
 
@@ -68,11 +68,15 @@ static int test_cgroup_mask(void) {
         assert_se(manager_load_startable_unit_or_warn(m, "daughter.service", NULL, &daughter) >= 0);
         assert_se(manager_load_startable_unit_or_warn(m, "grandchild.service", NULL, &grandchild) >= 0);
         assert_se(manager_load_startable_unit_or_warn(m, "parent-deep.slice", NULL, &parent_deep) >= 0);
+        assert_se(manager_load_startable_unit_or_warn(m, "nomem.slice", NULL, &nomem_parent) >= 0);
+        assert_se(manager_load_startable_unit_or_warn(m, "nomemleaf.service", NULL, &nomem_leaf) >= 0);
         assert_se(UNIT_DEREF(son->slice) == parent);
         assert_se(UNIT_DEREF(daughter->slice) == parent);
         assert_se(UNIT_DEREF(parent_deep->slice) == parent);
         assert_se(UNIT_DEREF(grandchild->slice) == parent_deep);
+        assert_se(UNIT_DEREF(nomem_leaf->slice) == nomem_parent);
         root = UNIT_DEREF(parent->slice);
+        assert_se(UNIT_DEREF(nomem_parent->slice) == root);
 
         /* Verify per-unit cgroups settings. */
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(son), CGROUP_MASK_CPU);
@@ -80,6 +84,8 @@ static int test_cgroup_mask(void) {
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(grandchild), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent_deep), CGROUP_MASK_MEMORY);
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_parent), 0);
+        ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(root), 0);
 
         /* Verify aggregation of member masks */
@@ -88,6 +94,8 @@ static int test_cgroup_mask(void) {
         ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(grandchild), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent_deep), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_leaf), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
 
         /* Verify aggregation of sibling masks. */
@@ -96,6 +104,8 @@ static int test_cgroup_mask(void) {
         ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(grandchild), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent_deep), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY));
         ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_parent), (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
         ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
 
         /* Verify aggregation of target masks. */
@@ -104,6 +114,8 @@ static int test_cgroup_mask(void) {
         ASSERT_CGROUP_MASK(unit_get_target_mask(grandchild), 0);
         ASSERT_CGROUP_MASK(unit_get_target_mask(parent_deep), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY) & m->cgroup_supported));
         ASSERT_CGROUP_MASK(unit_get_target_mask(parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
+        ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported));
+        ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_leaf), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported));
         ASSERT_CGROUP_MASK(unit_get_target_mask(root), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
 
         return 0;
index bf02e39f4320446060410d9cb31dd497f591c69e..d98bfb80d2d973827c9ae76bbbefb2b58978cca3 100644 (file)
@@ -18,6 +18,8 @@ test_data_files = '''
         hwdb/10-bad.hwdb
         journal-data/journal-1.txt
         journal-data/journal-2.txt
+        nomem.slice
+        nomemleaf.service
         parent-deep.slice
         parent.slice
         sched_idle_bad.service
diff --git a/test/nomem.slice b/test/nomem.slice
new file mode 100644 (file)
index 0000000..9c5d208
--- /dev/null
@@ -0,0 +1,5 @@
+[Unit]
+Description=Nomem Parent Slice
+
+[Slice]
+DisableControllers=memory
diff --git a/test/nomemleaf.service b/test/nomemleaf.service
new file mode 100644 (file)
index 0000000..3cbaccb
--- /dev/null
@@ -0,0 +1,9 @@
+[Unit]
+Description=Nomem Leaf Service
+
+[Service]
+Slice=nomem.slice
+Type=oneshot
+ExecStart=/bin/true
+IOWeight=200
+MemoryAccounting=true