#include "alloc-util.h"
#include "blockdev-util.h"
+#include "bpf-devices.h"
#include "bpf-firewall.h"
#include "btrfs-util.h"
-#include "bpf-devices.h"
#include "bus-error.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
+#include "nulstr-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "string-util.h"
#include "virt.h"
-#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
+#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
* problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
.cpu_weight = CGROUP_WEIGHT_INVALID,
.startup_cpu_weight = CGROUP_WEIGHT_INVALID,
.cpu_quota_per_sec_usec = USEC_INFINITY,
+ .cpu_quota_period_usec = USEC_INFINITY,
.cpu_shares = CGROUP_CPU_SHARES_INVALID,
.startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
CGroupDeviceAllow *a;
IPAddressAccessItem *iaai;
char u[FORMAT_TIMESPAN_MAX];
+ char v[FORMAT_TIMESPAN_MAX];
assert(c);
assert(f);
"%sCPUShares=%" PRIu64 "\n"
"%sStartupCPUShares=%" PRIu64 "\n"
"%sCPUQuotaPerSecSec=%s\n"
+ "%sCPUQuotaPeriodSec=%s\n"
"%sIOWeight=%" PRIu64 "\n"
"%sStartupIOWeight=%" PRIu64 "\n"
"%sBlockIOWeight=%" PRIu64 "\n"
prefix, c->cpu_shares,
prefix, c->startup_cpu_shares,
prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
+ prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
prefix, c->io_weight,
prefix, c->startup_io_weight,
prefix, c->blockio_weight,
return 0;
}
+static void cgroup_xattr_apply(Unit *u) {
+ char ids[SD_ID128_STRING_MAX];
+ int r;
+
+ assert(u);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return;
+
+ if (sd_id128_is_null(u->invocation_id))
+ return;
+
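+ /* The invocation ID is 128 bits rendered as 32 hex characters, hence the fixed length of 32 below. */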
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
+ "trusted.invocation_id",
+ sd_id128_to_string(u->invocation_id, ids), 32,
+ 0);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
+}
+
static int lookup_block_device(const char *p, dev_t *ret) {
- struct stat st = {};
+ dev_t rdev, dev = 0;
+ mode_t mode;
int r;
assert(p);
assert(ret);
- r = device_path_parse_major_minor(p, &st.st_mode, &st.st_rdev);
+ r = device_path_parse_major_minor(p, &mode, &rdev);
if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
+ struct stat st;
if (stat(p, &st) < 0)
return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
+ rdev = (dev_t) st.st_rdev;
+ dev = (dev_t) st.st_dev;
+ mode = st.st_mode;
} else if (r < 0)
return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
- if (S_ISCHR(st.st_mode)) {
+ if (S_ISCHR(mode)) {
log_warning("Device node '%s' is a character device, but block device needed.", p);
return -ENOTBLK;
- } else if (S_ISBLK(st.st_mode))
- *ret = st.st_rdev;
- else if (major(st.st_dev) != 0)
- *ret = st.st_dev; /* If this is not a device node then use the block device this file is stored on */
+ } else if (S_ISBLK(mode))
+ *ret = rdev;
+ else if (major(dev) != 0)
+ *ret = dev; /* If this is not a device node then use the block device this file is stored on */
else {
/* If this is btrfs, getting the backing block device is a bit harder */
r = btrfs_get_block_device(p, ret);
}
static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) {
- struct stat st = {};
+ dev_t rdev;
+ mode_t mode;
int r;
assert(path);
/* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and
* /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This
* means clients can use these paths without the device node actually being around */
- r = device_path_parse_major_minor(node, &st.st_mode, &st.st_rdev);
+ r = device_path_parse_major_minor(node, &mode, &rdev);
if (r < 0) {
if (r != -ENODEV)
return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node);
+ struct stat st;
if (stat(node, &st) < 0)
return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
log_warning("%s is not a device.", node);
return -ENODEV;
}
+ rdev = (dev_t) st.st_rdev;
+ mode = st.st_mode;
}
if (cg_all_unified() > 0) {
if (!prog)
return 0;
- return cgroup_bpf_whitelist_device(prog, S_ISCHR(st.st_mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
- major(st.st_rdev), minor(st.st_rdev), acc);
+ return cgroup_bpf_whitelist_device(prog, S_ISCHR(mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
+ major(rdev), minor(rdev), acc);
} else {
char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
sprintf(buf,
"%c %u:%u %s",
- S_ISCHR(st.st_mode) ? 'c' : 'b',
- major(st.st_rdev), minor(st.st_rdev),
+ S_ISCHR(mode) ? 'c' : 'b',
+ major(rdev), minor(rdev),
acc);
/* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL here. */
return CGROUP_CPU_SHARES_DEFAULT;
}
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
+ /* The kernel uses a minimum resolution of 1ms, so both period and (quota * period)
+ * need to be at least that boundary. quota is specified in microseconds of CPU time
+ * per second. Additionally, period must be at most max_period. */
+ assert(quota > 0);
+
+ return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
+}
+
+static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
+ usec_t new_period;
+
+ if (quota == USEC_INFINITY)
+ /* Always use default period for infinity quota. */
+ return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+ if (period == USEC_INFINITY)
+ /* Default period was requested. */
+ period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+ /* Clamp to interval [1ms, 1s] */
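+ /* E.g. (a sketch of the math): a quota of 50ms of CPU time per second (CPUQuota=5%)
+ * with a requested 10ms period is clamped to a 20ms period, the smallest at which
+ * quota * period / USEC_PER_SEC still reaches the 1ms floor. */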
+ new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
+
+ if (new_period != period) {
+ char v[FORMAT_TIMESPAN_MAX];
+ log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, 0,
+ "Clamping CPU interval for cpu.max: period is now %s",
+ format_timespan(v, sizeof(v), new_period, 1));
+ u->warned_clamping_cpu_quota_period = true;
+ }
+
+ return new_period;
+}
+
static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
char buf[DECIMAL_STR_MAX(uint64_t) + 2];
(void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
}
-static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota) {
+static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
+ period = cgroup_cpu_adjust_period_and_log(u, period, quota);
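+ /* The cgroup v2 "cpu.max" attribute takes "$QUOTA $PERIOD" (or "max $PERIOD" when unlimited), both in microseconds. */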
if (quota != USEC_INFINITY)
xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
- quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
+ MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
else
- xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
+ xsprintf(buf, "max " USEC_FMT "\n", period);
(void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
}
(void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
}
-static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota) {
+static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
char buf[DECIMAL_STR_MAX(usec_t) + 2];
- xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
+ period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+
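+ /* On cgroup v1 the period ("cpu.cfs_period_us") and the quota ("cpu.cfs_quota_us") are separate
+ * attributes, both in microseconds; a quota of -1 means no limit. */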
+ xsprintf(buf, USEC_FMT "\n", period);
(void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
if (quota != USEC_INFINITY) {
- xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
+ xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
(void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
} else
(void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
/* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
* setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
* level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
- * containers we want to leave control of these to the container manager (and if cgroupsv2 delegation is used
+ * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
* we couldn't even write to them if we wanted to). */
if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
weight = CGROUP_WEIGHT_DEFAULT;
cgroup_apply_unified_cpu_weight(u, weight);
- cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec);
+ cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
} else {
uint64_t shares;
shares = CGROUP_CPU_SHARES_DEFAULT;
cgroup_apply_legacy_cpu_shares(u, shares);
- cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec);
+ cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
}
}
- /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroupsv2
+ /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
* controller), and in case of containers we want to leave control of these attributes to the container manager
* (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
blkio_weight = cgroup_context_blkio_weight(c, state);
weight = cgroup_weight_blkio_to_io(blkio_weight);
- log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
+ log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
blkio_weight, weight);
} else
weight = CGROUP_WEIGHT_DEFAULT;
LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
weight = cgroup_weight_blkio_to_io(w->weight);
- log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
+ log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
w->weight, weight, w->path);
cgroup_apply_io_device_weight(u, w->path, weight);
limits[CGROUP_IO_RBPS_MAX] = b->rbps;
limits[CGROUP_IO_WBPS_MAX] = b->wbps;
- log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
+ log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
b->rbps, b->wbps, b->path);
cgroup_apply_io_device_limit(u, b->path, limits);
io_weight = cgroup_context_io_weight(c, state);
weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
- log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
+ log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
io_weight, weight);
} else if (has_blockio)
weight = cgroup_context_blkio_weight(c, state);
LIST_FOREACH(device_weights, w, c->io_device_weights) {
weight = cgroup_weight_io_to_blkio(w->weight);
- log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
+ log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
w->weight, weight, w->path);
cgroup_apply_blkio_device_weight(u, w->path, weight);
CGroupIODeviceLimit *l;
LIST_FOREACH(device_limits, l, c->io_device_limits) {
- log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
+ log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
/* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
* exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
- * want to leave control to the container manager (and if proper cgroupsv2 delegation is used we couldn't even
+ * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
* write to this if we wanted to.) */
if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
}
}
- /* On cgroupsv2 we can apply BPF everywhere. On cgroupsv1 we apply it everywhere except for the root of
+ /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
* containers, where we leave this to the manager */
if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
(is_host_root || cg_all_unified() > 0 || !is_local_root)) {
if (!c)
return 0;
- return cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
+ return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
}
CGroupMask unit_get_delegate_mask(Unit *u) {
Iterator i;
HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
-
- if (member == u)
- continue;
-
- if (UNIT_DEREF(member->slice) != u)
- continue;
-
- u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
+ if (UNIT_DEREF(member->slice) == u)
+ u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
}
}
return unit_get_subtree_mask(u); /* we are the top-level slice */
}
+CGroupMask unit_get_disable_mask(Unit *u) {
+ CGroupContext *c;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ return c->disable_controllers;
+}
+
+CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
+ CGroupMask mask;
+
+ assert(u);
+
+ /* Returns the mask of controllers which are marked as forcibly
+ * disabled in any ancestor unit or the unit in question. */
+
+ mask = unit_get_disable_mask(u);
+
+ if (UNIT_ISSET(u->slice))
+ mask |= unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice));
+
+ return mask;
+}
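+/* A sketch of the effect, using hypothetical unit names: if a.slice sets
+ * DisableControllers=cpu, then for b.service with Slice=a.slice the mask
+ * returned here contains CGROUP_MASK_CPU, which the mask getters in this
+ * file then strip with &~, keeping the cpu controller off below a.slice. */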
+
CGroupMask unit_get_subtree_mask(Unit *u) {
/* Returns the mask of this subtree, meaning of the group
mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
mask &= u->manager->cgroup_supported;
+ mask &= ~unit_get_ancestor_disable_mask(u);
return mask;
}
mask = unit_get_members_mask(u);
mask &= u->manager->cgroup_supported;
+ mask &= ~unit_get_ancestor_disable_mask(u);
return mask;
}
return unit_get_realized_cgroup_path(userdata, mask);
}
-char *unit_default_cgroup_path(Unit *u) {
+char *unit_default_cgroup_path(const Unit *u) {
_cleanup_free_ char *escaped = NULL, *slice = NULL;
int r;
static int unit_create_cgroup(
Unit *u,
CGroupMask target_mask,
- CGroupMask enable_mask) {
+ CGroupMask enable_mask,
+ ManagerState state) {
bool created;
int r;
log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
}
+ /* Set attributes */
+ cgroup_context_apply(u, target_mask, state);
+ cgroup_xattr_apply(u);
+
return 0;
}
return r;
}
-static void cgroup_xattr_apply(Unit *u) {
- char ids[SD_ID128_STRING_MAX];
- int r;
-
- assert(u);
-
- if (!MANAGER_IS_SYSTEM(u->manager))
- return;
-
- if (sd_id128_is_null(u->invocation_id))
- return;
-
- r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
- "trusted.invocation_id",
- sd_id128_to_string(u->invocation_id, ids), 32,
- 0);
- if (r < 0)
- log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
-}
-
static bool unit_has_mask_realized(
Unit *u,
CGroupMask target_mask,
/* Returns true if this unit is fully realized. We check four things:
*
* 1. Whether the cgroup was created at all
- * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroupsv1)
- * 3. Whether the cgroup has all the right controllers enabled (in case of cgroupsv2)
+ * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
+ * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
* 4. Whether the invalidation mask is currently zero
*
* If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
- * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroupv1 controllers), CGROUP_MASK_V2 (for
- * real cgroupv2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
- * is only matters for cgroupsv1 controllers, and cgroup_enabled_mask only used for cgroupsv2, and if they
+ * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
+ * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
+ * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they
* differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks which controllers are
* enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
* simply don't matter.) */
u->cgroup_invalidated_mask == 0;
}
+static bool unit_has_mask_disables_realized(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask) {
+
+ assert(u);
+
+ /* Returns true if all controllers which should be disabled are indeed disabled.
+ *
+ * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
+ * already removed. */
+
+ return !u->cgroup_realized ||
+ (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
+ FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
+}
+
+static bool unit_has_mask_enables_realized(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask) {
+
+ assert(u);
+
+ /* Returns true if all controllers which should be enabled are indeed enabled.
+ *
+ * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
+ * we want to add is already added. */
+
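+ /* The tests below are subset checks spelled out bitwise: (a | b) == a holds exactly when b adds no bits beyond a. */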
+ return u->cgroup_realized &&
+ ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
+ ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
+}
+
void unit_add_to_cgroup_realize_queue(Unit *u) {
assert(u);
u->in_cgroup_realize_queue = false;
}
+/* Controllers can only be enabled breadth-first, from the root of the
+ * hierarchy downwards to the unit in question. */
+static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
+ CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+ int r;
+
+ assert(u);
+
+ /* First go deal with this unit's parent, or we won't be able to enable
+ * any new controllers at this layer. */
+ if (UNIT_ISSET(u->slice)) {
+ r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
+ if (r < 0)
+ return r;
+ }
+
+ target_mask = unit_get_target_mask(u);
+ enable_mask = unit_get_enable_mask(u);
+
+ /* We can only enable in this direction, don't try to disable anything. */
+ if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
+ return 0;
+
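+ /* OR in the new bits, so this step can only add controllers, never drop any. */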
+ new_target_mask = u->cgroup_realized_mask | target_mask;
+ new_enable_mask = u->cgroup_enabled_mask | enable_mask;
+
+ return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
+}
+
+/* Controllers can only be disabled depth-first, from the leaves of the
+ * hierarchy upwards to the unit in question. */
+static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
+ Iterator i;
+ Unit *m;
+ void *v;
+
+ assert(u);
+
+ if (u->type != UNIT_SLICE)
+ return 0;
+
+ HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
+ CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+ int r;
+
+ if (UNIT_DEREF(m->slice) != u)
+ continue;
+
+ /* The cgroup for this unit might not actually be fully
+ * realised yet, in which case it isn't holding any controllers
+ * open anyway. */
+ if (!m->cgroup_path)
+ continue;
+
+ /* We must disable those below us first in order to release the
+ * controller. */
+ if (m->type == UNIT_SLICE)
+ (void) unit_realize_cgroup_now_disable(m, state);
+
+ target_mask = unit_get_target_mask(m);
+ enable_mask = unit_get_enable_mask(m);
+
+ /* We can only disable in this direction, don't try to enable
+ * anything. */
+ if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
+ continue;
+
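+ /* AND with the currently realized/enabled bits, so this step can only drop controllers, never add any. */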
+ new_target_mask = m->cgroup_realized_mask & target_mask;
+ new_enable_mask = m->cgroup_enabled_mask & enable_mask;
+
+ r = unit_create_cgroup(m, new_target_mask, new_enable_mask, state);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
/* Check if necessary controllers and attributes for a unit are in place.
*
- * If so, do nothing.
- * If not, create paths, move processes over, and set attributes.
+ * - If so, do nothing.
+ * - If not, create paths, move processes over, and set attributes.
+ *
+ * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
+ * a depth-first way. As such the process looks like this:
+ *
+ * Suppose we have a cgroup hierarchy which looks like this:
+ *
+ * root
+ * / \
+ * / \
+ * / \
+ * a b
+ * / \ / \
+ * / \ / \
+ * c d e f
+ * / \ / \ / \ / \
+ * h i j k l m n o
+ *
+ * 1. We want to realise cgroup "d" now.
+ * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
+ * 3. cgroup "k" just started requesting the memory controller.
+ *
+ * To make this work we must do the following in order:
+ *
+ * 1. Disable CPU controller in k, j
+ * 2. Disable CPU controller in d
+ * 3. Enable memory controller in root
+ * 4. Enable memory controller in a
+ * 5. Enable memory controller in d
+ * 6. Enable memory controller in k
+ *
+ * Notice that we need to touch j in one direction, but not the other. We also
+ * don't go beyond d when disabling -- it's up to "a" to get realized if it
+ * wants to disable further. The basic rules are therefore:
+ *
+ * - If you're disabling something, you need to realise all of the cgroups from
+ * your recursive descendants to the root. This starts from the leaves.
+ * - If you're enabling something, you need to realise from the root cgroup
+ * downwards, but you don't need to iterate your recursive descendants.
*
* Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
if (unit_has_mask_realized(u, target_mask, enable_mask))
return 0;
- /* First, realize parents */
+ /* Disable controllers below us, if there are any */
+ r = unit_realize_cgroup_now_disable(u, state);
+ if (r < 0)
+ return r;
+
+ /* Enable controllers above us, if there are any */
if (UNIT_ISSET(u->slice)) {
- r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
+ r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
if (r < 0)
return r;
}
- /* And then do the real work */
- r = unit_create_cgroup(u, target_mask, enable_mask);
+ /* Now actually deal with the cgroup we were trying to realise and set attributes */
+ r = unit_create_cgroup(u, target_mask, enable_mask, state);
if (r < 0)
return r;
- /* Finally, apply the necessary attributes. */
- cgroup_context_apply(u, target_mask, state);
- cgroup_xattr_apply(u);
-
/* Now, reset the invalidation mask */
u->cgroup_invalidated_mask = 0;
return 0;
void *v;
HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
- if (m == u)
- continue;
-
/* Skip units that have a dependency on the slice
* but aren't actually in it. */
if (UNIT_DEREF(m->slice) != slice)
int unit_search_main_pid(Unit *u, pid_t *ret) {
_cleanup_fclose_ FILE *f = NULL;
- pid_t pid = 0, npid, mypid;
+ pid_t pid = 0, npid;
int r;
assert(u);
if (r < 0)
return r;
- mypid = getpid_cached();
while (cg_read_pid(f, &npid) > 0) {
- pid_t ppid;
if (npid == pid)
continue;
- /* Ignore processes that aren't our kids */
- if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
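+ /* Ignore processes that aren't our kids */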
+ if (pid_is_my_child(npid) == 0)
continue;
if (pid != 0)
pid_t pid;
while ((r = cg_read_pid(f, &pid)) > 0) {
- r = unit_watch_pid(u, pid);
+ r = unit_watch_pid(u, pid, false);
if (r < 0 && ret >= 0)
ret = r;
}
/* The root cgroup doesn't expose this information, let's get it from /proc instead */
if (unit_has_host_root_cgroup(u))
- return procfs_memory_get_current(ret);
+ return procfs_memory_get_used(ret);
if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
return -ENODATA;
if (unit_has_host_root_cgroup(u))
return procfs_cpu_get_usage(ret);
- r = cg_all_unified();
- if (r < 0)
- return r;
-
/* Requisite controllers for CPU accounting are not enabled */
if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
return -ENODATA;
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
if (r > 0) {
_cleanup_free_ char *val = NULL;
uint64_t us;
r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
- if (r < 0)
- return r;
if (IN_SET(r, -ENOENT, -ENXIO))
return -ENODATA;
+ if (r < 0)
+ return r;
r = safe_atou64(val, &us);
if (r < 0)
void *v;
HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
- if (member == u)
- continue;
-
- if (UNIT_DEREF(member->slice) != u)
- continue;
-
- unit_invalidate_cgroup_bpf(member);
+ if (UNIT_DEREF(member->slice) == u)
+ unit_invalidate_cgroup_bpf(member);
}
}
}