Merge pull request #11009 from poettering/root-cgroup-again

author Lennart Poettering <lennart@poettering.net>

Tue, 4 Dec 2018 11:33:03 +0000 (12:33 +0100)

committer GitHub <noreply@github.com>

Tue, 4 Dec 2018 11:33:03 +0000 (12:33 +0100)
author Lennart Poettering <lennart@poettering.net>
Tue, 4 Dec 2018 11:33:03 +0000 (12:33 +0100)
committer GitHub <noreply@github.com>
Tue, 4 Dec 2018 11:33:03 +0000 (12:33 +0100)
diff --combined src/core/cgroup.c

index b585e4bd2bb51b97b685f52e79d5fee4eba3b1cf,cc66f11e9ce17d5be368abd4116939e98863493d..2dd53191e0744b5e0b3e4b0bf50820b2a3973ebe
--- 1/src/core/cgroup.c
--- 2/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@@ -375,26 -375,6 +375,26 @@@ int cgroup_add_device_allow(CGroupConte
           return 0;
   }
   
+ +static void cgroup_xattr_apply(Unit *u) { /* Stores u's invocation ID as an xattr on u's control group. */
+ +        char ids[SD_ID128_STRING_MAX]; /* buffer for the string form of the invocation ID */
+ +        int r;
+ +
+ +        assert(u);
+ +
+ +        if (!MANAGER_IS_SYSTEM(u->manager)) /* nothing is written unless this is the system manager */
+ +                return;
+ +
+ +        if (sd_id128_is_null(u->invocation_id)) /* no invocation ID set, so nothing to store */
+ +                return;
+ +
+ +        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
+ +                         "trusted.invocation_id",
+ +                         sd_id128_to_string(u->invocation_id, ids), 32, /* NOTE(review): 32 presumably = ID length without NUL */
+ +                         0);
+ +        if (r < 0) /* best effort: failure is only logged at debug level, the caller is not told */
+ +                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
+ +}
+ +
   static int lookup_block_device(const char *p, dev_t *ret) {
           struct stat st = {};
           int r;
@@@ -875,68 -855,53 +875,53 @@@ static void cgroup_context_apply
           if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
                   path = "/";
   
-         /* We generally ignore errors caused by read-only mounted
-          * cgroup trees (assuming we are running in a container then),
-          * and missing cgroups, i.e. EROFS and ENOENT. */
+         /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
+          * then), and missing cgroups, i.e. EROFS and ENOENT. */
   
-         if (apply_mask & CGROUP_MASK_CPU) {
-                 bool has_weight, has_shares;
- 
-                 has_weight = cgroup_context_has_cpu_weight(c);
-                 has_shares = cgroup_context_has_cpu_shares(c);
+         /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
+          * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
+          * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
+          * containers we want to leave control of these to the container manager (and if cgroupsv2 delegation is used
+          * we couldn't even write to them if we wanted to). */
+         if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
   
                   if (cg_all_unified() > 0) {
+                         uint64_t weight;
   
-                         /* In fully unified mode these attributes don't exist on the host cgroup root, and inside of
-                          * containers we want to leave control of these to the container manager (and if delegation is
-                          * used we couldn't even write to them if we wanted to). */
-                         if (!is_local_root) {
-                                 uint64_t weight;
- 
-                                 if (has_weight)
-                                         weight = cgroup_context_cpu_weight(c, state);
-                                 else if (has_shares) {
-                                         uint64_t shares;
+                         if (cgroup_context_has_cpu_weight(c))
+                                 weight = cgroup_context_cpu_weight(c, state);
+                         else if (cgroup_context_has_cpu_shares(c)) {
+                                 uint64_t shares;
   
-                                         shares = cgroup_context_cpu_shares(c, state);
-                                         weight = cgroup_cpu_shares_to_weight(shares);
+                                 shares = cgroup_context_cpu_shares(c, state);
+                                 weight = cgroup_cpu_shares_to_weight(shares);
   
-                                         log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s",
-                                                           shares, weight, path);
-                                 } else
-                                         weight = CGROUP_WEIGHT_DEFAULT;
+                                 log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
+                                                   shares, weight, path);
+                         } else
+                                 weight = CGROUP_WEIGHT_DEFAULT;
   
-                                 cgroup_apply_unified_cpu_weight(u, weight);
-                                 cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec);
-                         }
+                         cgroup_apply_unified_cpu_weight(u, weight);
+                         cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec);
   
                   } else {
-                         /* Setting the weight makes very little sense on the host root cgroup, as there are no other
-                          * cgroups at this level. And for containers we want to leave management of this to the
-                          * container manager */
-                         if (!is_local_root) {
-                                 uint64_t shares;
- 
-                                 if (has_weight) {
-                                         uint64_t weight;
+                         uint64_t shares;
   
-                                         weight = cgroup_context_cpu_weight(c, state);
-                                         shares = cgroup_cpu_weight_to_shares(weight);
+                         if (cgroup_context_has_cpu_weight(c)) {
+                                 uint64_t weight;
   
-                                         log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s",
-                                                           weight, shares, path);
-                                 } else if (has_shares)
-                                         shares = cgroup_context_cpu_shares(c, state);
-                                 else
-                                         shares = CGROUP_CPU_SHARES_DEFAULT;
+                                 weight = cgroup_context_cpu_weight(c, state);
+                                 shares = cgroup_cpu_weight_to_shares(weight);
   
-                                 cgroup_apply_legacy_cpu_shares(u, shares);
-                         }
+                                 log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
+                                                   weight, shares, path);
+                         } else if (cgroup_context_has_cpu_shares(c))
+                                 shares = cgroup_context_cpu_shares(c, state);
+                         else
+                                 shares = CGROUP_CPU_SHARES_DEFAULT;
   
-                         /* The "cpu" quota attribute is available on the host root, hence manage it there. But in
-                          * containers let's leave this to the container manager. */
-                         if (is_host_root || !is_local_root)
-                                 cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec);
+                         cgroup_apply_legacy_cpu_shares(u, shares);
+                         cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec);
                   }
           }
   
@@@ -959,7 -924,7 +944,7 @@@
                           blkio_weight = cgroup_context_blkio_weight(c, state);
                           weight = cgroup_weight_blkio_to_io(blkio_weight);
   
-                         log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
+                         log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
                                             blkio_weight, weight);
                   } else
                           weight = CGROUP_WEIGHT_DEFAULT;
@@@ -988,7 -953,7 +973,7 @@@
                           LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                   weight = cgroup_weight_blkio_to_io(w->weight);
   
-                                 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
+                                 log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
                                                     w->weight, weight, w->path);
   
                                   cgroup_apply_io_device_weight(u, w->path, weight);
@@@ -1004,7 -969,7 +989,7 @@@
                                   limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                   limits[CGROUP_IO_WBPS_MAX] = b->wbps;
   
-                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
+                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
                                                     b->rbps, b->wbps, b->path);
   
                                   cgroup_apply_io_device_limit(u, b->path, limits);
@@@ -1030,7 -995,7 +1015,7 @@@
                                   io_weight = cgroup_context_io_weight(c, state);
                                   weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
   
-                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
+                                 log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
                                                     io_weight, weight);
                           } else if (has_blockio)
                                   weight = cgroup_context_blkio_weight(c, state);
@@@ -1046,7 -1011,7 +1031,7 @@@
                                   LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                           weight = cgroup_weight_io_to_blkio(w->weight);
   
-                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
+                                         log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
                                                             w->weight, weight, w->path);
   
                                           cgroup_apply_blkio_device_weight(u, w->path, weight);
@@@ -1066,7 -1031,7 +1051,7 @@@
                                   CGroupIODeviceLimit *l;
   
                                   LIST_FOREACH(device_limits, l, c->io_device_limits) {
-                                         log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
+                                         log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
                                                             l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
   
                                           cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
@@@ -1080,56 -1045,51 +1065,51 @@@
                   }
           }
   
-         if (apply_mask & CGROUP_MASK_MEMORY) {
+         /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
+          * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
+          * want to leave control to the container manager (and if proper cgroupsv2 delegation is used we couldn't even
+          * write to this if we wanted to.) */
+         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
   
                   if (cg_all_unified() > 0) {
-                         /* In unified mode 'memory' attributes do not exist on the root cgroup. And if we run in a
-                          * container we want to leave control to the container manager (and if proper delegation is
-                          * used we couldn't even write to this if we wanted to. */
-                         if (!is_local_root) {
-                                 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
- 
-                                 if (cgroup_context_has_unified_memory_config(c)) {
-                                         max = c->memory_max;
-                                         swap_max = c->memory_swap_max;
-                                 } else {
-                                         max = c->memory_limit;
- 
-                                         if (max != CGROUP_LIMIT_MAX)
-                                                 log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
-                                 }
+                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
+ 
+                         if (cgroup_context_has_unified_memory_config(c)) {
+                                 max = c->memory_max;
+                                 swap_max = c->memory_swap_max;
+                         } else {
+                                 max = c->memory_limit;
   
-                                 cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min);
-                                 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
-                                 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
-                                 cgroup_apply_unified_memory_limit(u, "memory.max", max);
-                                 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
+                                 if (max != CGROUP_LIMIT_MAX)
+                                         log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
                           }
-                 } else {
   
-                         /* In legacy mode 'memory' exists on the host root, but in container mode we want to leave it
-                          * to the container manager around us */
-                         if (is_host_root || !is_local_root) {
-                                 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
-                                 uint64_t val;
+                         cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min);
+                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
+                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
+                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
+                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
   
-                                 if (cgroup_context_has_unified_memory_config(c)) {
-                                         val = c->memory_max;
-                                         log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val);
-                                 } else
-                                         val = c->memory_limit;
+                 } else {
+                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+                         uint64_t val;
   
-                                 if (val == CGROUP_LIMIT_MAX)
-                                         strncpy(buf, "-1\n", sizeof(buf));
-                                 else
-                                         xsprintf(buf, "%" PRIu64 "\n", val);
+                         if (cgroup_context_has_unified_memory_config(c)) {
+                                 val = c->memory_max;
+                                 log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val);
+                         } else
+                                 val = c->memory_limit;
   
-                                 (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
-                         }
+                         if (val == CGROUP_LIMIT_MAX)
+                                 strncpy(buf, "-1\n", sizeof(buf));
+                         else
+                                 xsprintf(buf, "%" PRIu64 "\n", val);
+ 
+                         (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
                   }
           }
   
-         /* On cgroupsv2 we can apply BPF everywhre. On cgroupsv1 we apply it everywhere except for the root of
+         /* On cgroupsv2 we can apply BPF everywhere. On cgroupsv1 we apply it everywhere except for the root of
            * containers, where we leave this to the manager */
           if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
               (is_host_root || cg_all_unified() > 0 || !is_local_root)) {
@@@ -1238,7 -1198,6 +1218,6 @@@
                                   r = procfs_tasks_set_limit(TASKS_MAX);
                           else
                                   r = 0;
- 
                           if (r < 0)
                                   log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r,
                                                 "Failed to write to tasks limit sysctls: %m");
@@@ -1346,7 -1305,7 +1325,7 @@@ CGroupMask unit_get_own_mask(Unit *u) 
           if (!c)
                   return 0;
   
- -        return cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
+ +        return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
   }
   
   CGroupMask unit_get_delegate_mask(Unit *u) {
@@@ -1416,31 -1375,6 +1395,31 @@@ CGroupMask unit_get_siblings_mask(Unit 
           return unit_get_subtree_mask(u); /* we are the top-level slice */
   }
   
+ +CGroupMask unit_get_disable_mask(Unit *u) { /* Returns the disable_controllers mask of u's own cgroup context. */
+ +        CGroupContext *c;
+ +
+ +        c = unit_get_cgroup_context(u);
+ +        if (!c) /* u has no cgroup context, so it disables nothing itself */
+ +                return 0;
+ +
+ +        return c->disable_controllers;
+ +}
+ +
+ +CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
+ +        CGroupMask mask;
+ +
+ +        assert(u);
+ +        mask = unit_get_disable_mask(u); /* start with what the unit itself disables */
+ +
+ +        /* Returns the mask of controllers which are marked as forcibly disabled in any ancestor unit or the unit
+ +         * in question: the unit's own disable mask OR-ed with that of every slice reachable through u->slice. */
+ +
+ +        if (UNIT_ISSET(u->slice)) /* the recursion ends at the unit that has no slice set */
+ +                mask |= unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice));
+ +
+ +        return mask;
+ +}
+ +
   CGroupMask unit_get_subtree_mask(Unit *u) {
   
           /* Returns the mask of this subtree, meaning of the group
@@@ -1461,7 -1395,6 +1440,7 @@@ CGroupMask unit_get_target_mask(Unit *u
   
           mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
           mask &= u->manager->cgroup_supported;
+ +        mask &= ~unit_get_ancestor_disable_mask(u);
   
           return mask;
   }
@@@ -1476,7 -1409,6 +1455,7 @@@ CGroupMask unit_get_enable_mask(Unit *u
   
           mask = unit_get_members_mask(u);
           mask &= u->manager->cgroup_supported;
+ +        mask &= ~unit_get_ancestor_disable_mask(u);
   
           return mask;
   }
@@@ -1644,8 -1576,7 +1623,8 @@@ int unit_pick_cgroup_path(Unit *u) 
   static int unit_create_cgroup(
                   Unit *u,
                   CGroupMask target_mask,
- -                CGroupMask enable_mask) {
+ +                CGroupMask enable_mask,
+ +                ManagerState state) {
   
           bool created;
           int r;
@@@ -1713,10 -1644,6 +1692,10 @@@
                           log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
           }
   
+ +        /* Set attributes */
+ +        cgroup_context_apply(u, target_mask, state);
+ +        cgroup_xattr_apply(u);
+ +
           return 0;
   }
   
@@@ -1858,6 -1785,26 +1837,6 @@@ int unit_attach_pids_to_cgroup(Unit *u
           return r;
   }
   
- -static void cgroup_xattr_apply(Unit *u) {
- -        char ids[SD_ID128_STRING_MAX];
- -        int r;
- -
- -        assert(u);
- -
- -        if (!MANAGER_IS_SYSTEM(u->manager))
- -                return;
- -
- -        if (sd_id128_is_null(u->invocation_id))
- -                return;
- -
- -        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
- -                         "trusted.invocation_id",
- -                         sd_id128_to_string(u->invocation_id, ids), 32,
- -                         0);
- -        if (r < 0)
- -                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
- -}
- -
   static bool unit_has_mask_realized(
                   Unit *u,
                   CGroupMask target_mask,
@@@ -1886,40 -1833,6 +1865,40 @@@
                   u->cgroup_invalidated_mask == 0;
   }
   
+ +static bool unit_has_mask_disables_realized(
+ +                Unit *u,
+ +                CGroupMask target_mask,
+ +                CGroupMask enable_mask) {
+ +
+ +        assert(u);
+ +
+ +        /* Returns true if all controllers which should be disabled are indeed disabled. Unlike
+ +         * unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is already
+ +         * removed. NOTE(review): if FLAGS_SET(v, f) means "every bit of f is set in v", the two tests below are the
+ +         * same containment that unit_has_mask_enables_realized checks - confirm this is the intended direction. */
+ +
+ +        return !u->cgroup_realized || /* a cgroup that was never realized trivially qualifies */
+ +                (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) && /* CGROUP_MASK_V1 bits vs. realized mask */
+ +                 FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2)); /* CGROUP_MASK_V2 bits vs. enabled mask */
+ +}
+ +
+ +static bool unit_has_mask_enables_realized(
+ +                Unit *u,
+ +                CGroupMask target_mask,
+ +                CGroupMask enable_mask) {
+ +
+ +        assert(u);
+ +
+ +        /* Returns true if all controllers which should be enabled are indeed enabled, i.e. the cgroup is realized
+ +         * and OR-ing the wanted masks into the current ones would not add any CGROUP_MASK_V1/CGROUP_MASK_V2 bit.
+ +         * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that
+ +         * anything we want to add is already added. */
+ +
+ +        return u->cgroup_realized && /* an unrealized cgroup never qualifies */
+ +                ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) && /* target_mask adds no V1 bit */
+ +                ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2); /* enable_mask adds no V2 bit */
+ +}
+ +
   void unit_add_to_cgroup_realize_queue(Unit *u) {
           assert(u);
   
@@@ -1940,127 -1853,10 +1919,127 @@@ static void unit_remove_from_cgroup_rea
           u->in_cgroup_realize_queue = false;
   }
   
+ +/* Controllers can only be enabled breadth-first, from the root of the hierarchy downwards to the unit in question.
+ + * Handles u->slice recursively before u; returns a negative error from an ancestor, 0 if nothing is missing on u,
+ +static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) { /* or else unit_create_cgroup()'s result. */
+ +        CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+ +        int r;
+ +
+ +        assert(u);
+ +
+ +        /* First go deal with this unit's parent, or we won't be able to enable
+ +         * any new controllers at this layer. */
+ +        if (UNIT_ISSET(u->slice)) {
+ +                r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
+ +                if (r < 0) /* an ancestor failed: give up without touching u */
+ +                        return r;
+ +        }
+ +
+ +        target_mask = unit_get_target_mask(u);
+ +        enable_mask = unit_get_enable_mask(u);
+ +
+ +        /* We can only enable in this direction, don't try to disable anything. If everything
+ +         * we want is already enabled there is nothing to do for this unit. */
+ +        if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
+ +                return 0;
+ +
+ +        new_target_mask = u->cgroup_realized_mask | target_mask; /* union: keep what is realized, add what is wanted */
+ +        new_enable_mask = u->cgroup_enabled_mask | enable_mask; /* likewise for the enabled mask */
+ +
+ +        return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
+ +}
+ +
+ +/* Controllers can only be disabled depth-first, from the leaves of the hierarchy upwards to the unit in question.
+ + * Shrinks the masks of the members of slice u (sub-slices first); u itself is never passed to unit_create_cgroup(). */
+ +static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
+ +        Iterator i;
+ +        Unit *m;
+ +        void *v;
+ +
+ +        assert(u);
+ +
+ +        if (u->type != UNIT_SLICE) /* for anything but a slice there is nothing to do */
+ +                return 0;
+ +
+ +        HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
+ +                CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+ +                int r;
+ +
+ +                if (UNIT_DEREF(m->slice) != u) /* only units whose slice is u are handled */
+ +                        continue;
+ +
+ +                /* The cgroup for this unit might not actually be fully
+ +                 * realised yet, in which case it isn't holding any controllers
+ +                 * open anyway. */
+ +                if (!m->cgroup_path)
+ +                        continue;
+ +
+ +                /* We must disable those below us first in order to release the
+ +                 * controller. */
+ +                if (m->type == UNIT_SLICE)
+ +                        (void) unit_realize_cgroup_now_disable(m, state); /* result of the recursion is deliberately discarded */
+ +
+ +                target_mask = unit_get_target_mask(m);
+ +                enable_mask = unit_get_enable_mask(m);
+ +
+ +                /* We can only disable in this direction, don't try to enable
+ +                 * anything. */
+ +                if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
+ +                        continue;
+ +
+ +                new_target_mask = m->cgroup_realized_mask & target_mask; /* intersection: only drops bits, never adds */
+ +                new_enable_mask = m->cgroup_enabled_mask & enable_mask; /* likewise for the enabled mask */
+ +
+ +                r = unit_create_cgroup(m, new_target_mask, new_enable_mask, state);
+ +                if (r < 0) /* a failure on a direct member stops the loop and is returned */
+ +                        return r;
+ +        }
+ +
+ +        return 0;
+ +}
+ +
   /* Check if necessary controllers and attributes for a unit are in place.
    *
- - * If so, do nothing.
- - * If not, create paths, move processes over, and set attributes.
+ + * - If so, do nothing.
+ + * - If not, create paths, move processes over, and set attributes.
+ + *
+ + * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
+ + * a depth-first way. As such the process looks like this:
+ + *
+ + * Suppose we have a cgroup hierarchy which looks like this:
+ + *
+ + *             root
+ + *            /    \
+ + *           /      \
+ + *          /        \
+ + *         a          b
+ + *        / \        / \
+ + *       /   \      /   \
+ + *      c     d    e     f
+ + *     / \   / \  / \   / \
+ + *     h i   j k  l m   n o
+ + *
+ + * 1. We want to realise cgroup "d" now.
+ + * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
+ + * 3. cgroup "k" just started requesting the memory controller.
+ + *
+ + * To make this work we must do the following in order:
+ + *
+ + * 1. Disable CPU controller in k, j
+ + * 2. Disable CPU controller in d
+ + * 3. Enable memory controller in root
+ + * 4. Enable memory controller in a
+ + * 5. Enable memory controller in d
+ + * 6. Enable memory controller in k
+ + *
+ + * Notice that we need to touch j in one direction, but not the other. We also
+ + * don't go beyond d when disabling -- it's up to "a" to get realized if it
+ + * wants to disable further. The basic rules are therefore:
+ + *
+ + * - If you're disabling something, you need to realise all of the cgroups from
+ + *   your recursive descendants to the root. This starts from the leaves.
+ + * - If you're enabling something, you need to realise from the root cgroup
+ + *   downwards, but you don't need to iterate your recursive descendants.
    *
    * Returns 0 on success and < 0 on failure. */
   static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
@@@ -2077,23 -1873,22 +2056,23 @@@
           if (unit_has_mask_realized(u, target_mask, enable_mask))
                   return 0;
   
- -        /* First, realize parents */
+ +        /* Disable controllers below us, if there are any */
+ +        r = unit_realize_cgroup_now_disable(u, state);
+ +        if (r < 0)
+ +                return r;
+ +
+ +        /* Enable controllers above us, if there are any */
           if (UNIT_ISSET(u->slice)) {
- -                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
+ +                r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
                   if (r < 0)
                           return r;
           }
   
- -        /* And then do the real work */
- -        r = unit_create_cgroup(u, target_mask, enable_mask);
+ +        /* Now actually deal with the cgroup we were trying to realise and set attributes */
+ +        r = unit_create_cgroup(u, target_mask, enable_mask, state);
           if (r < 0)
                   return r;
   
- -        /* Finally, apply the necessary attributes. */
- -        cgroup_context_apply(u, target_mask, state);
- -        cgroup_xattr_apply(u);
- -
           /* Now, reset the invalidation mask */
           u->cgroup_invalidated_mask = 0;
           return 0;
author	Lennart Poettering <lennart@poettering.net>
	Tue, 4 Dec 2018 11:33:03 +0000 (12:33 +0100)
committer	GitHub <noreply@github.com>
	Tue, 4 Dec 2018 11:33:03 +0000 (12:33 +0100)