return 0;
}
+static void cgroup_xattr_apply(Unit *u) {
+ char ids[SD_ID128_STRING_MAX];
+ int r;
+
+ assert(u);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return;
+
+ if (sd_id128_is_null(u->invocation_id))
+ return;
+
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
+ "trusted.invocation_id",
+ sd_id128_to_string(u->invocation_id, ids), 32,
+ 0);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
+}
+
static int lookup_block_device(const char *p, dev_t *ret) {
struct stat st = {};
int r;
if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
path = "/";
- /* We generally ignore errors caused by read-only mounted
- * cgroup trees (assuming we are running in a container then),
- * and missing cgroups, i.e. EROFS and ENOENT. */
+ /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
+ * then), and missing cgroups, i.e. EROFS and ENOENT. */
- if (apply_mask & CGROUP_MASK_CPU) {
- bool has_weight, has_shares;
-
- has_weight = cgroup_context_has_cpu_weight(c);
- has_shares = cgroup_context_has_cpu_shares(c);
+ /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
+ * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
+ * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
+ * containers we want to leave control of these to the container manager (and if cgroupsv2 delegation is used
+ * we couldn't even write to them if we wanted to). */
+ if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
if (cg_all_unified() > 0) {
+ uint64_t weight;
- /* In fully unified mode these attributes don't exist on the host cgroup root, and inside of
- * containers we want to leave control of these to the container manager (and if delegation is
- * used we couldn't even write to them if we wanted to). */
- if (!is_local_root) {
- uint64_t weight;
-
- if (has_weight)
- weight = cgroup_context_cpu_weight(c, state);
- else if (has_shares) {
- uint64_t shares;
+ if (cgroup_context_has_cpu_weight(c))
+ weight = cgroup_context_cpu_weight(c, state);
+ else if (cgroup_context_has_cpu_shares(c)) {
+ uint64_t shares;
- shares = cgroup_context_cpu_shares(c, state);
- weight = cgroup_cpu_shares_to_weight(shares);
+ shares = cgroup_context_cpu_shares(c, state);
+ weight = cgroup_cpu_shares_to_weight(shares);
- log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s",
- shares, weight, path);
- } else
- weight = CGROUP_WEIGHT_DEFAULT;
+ log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
+ shares, weight, path);
+ } else
+ weight = CGROUP_WEIGHT_DEFAULT;
- cgroup_apply_unified_cpu_weight(u, weight);
- cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec);
- }
+ cgroup_apply_unified_cpu_weight(u, weight);
+ cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec);
} else {
- /* Setting the weight makes very little sense on the host root cgroup, as there are no other
- * cgroups at this level. And for containers we want to leave management of this to the
- * container manager */
- if (!is_local_root) {
- uint64_t shares;
-
- if (has_weight) {
- uint64_t weight;
+ uint64_t shares;
- weight = cgroup_context_cpu_weight(c, state);
- shares = cgroup_cpu_weight_to_shares(weight);
+ if (cgroup_context_has_cpu_weight(c)) {
+ uint64_t weight;
- log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s",
- weight, shares, path);
- } else if (has_shares)
- shares = cgroup_context_cpu_shares(c, state);
- else
- shares = CGROUP_CPU_SHARES_DEFAULT;
+ weight = cgroup_context_cpu_weight(c, state);
+ shares = cgroup_cpu_weight_to_shares(weight);
- cgroup_apply_legacy_cpu_shares(u, shares);
- }
+ log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
+ weight, shares, path);
+ } else if (cgroup_context_has_cpu_shares(c))
+ shares = cgroup_context_cpu_shares(c, state);
+ else
+ shares = CGROUP_CPU_SHARES_DEFAULT;
- /* The "cpu" quota attribute is available on the host root, hence manage it there. But in
- * containers let's leave this to the container manager. */
- if (is_host_root || !is_local_root)
- cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec);
+ cgroup_apply_legacy_cpu_shares(u, shares);
+ cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec);
}
}
blkio_weight = cgroup_context_blkio_weight(c, state);
weight = cgroup_weight_blkio_to_io(blkio_weight);
- log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
+ log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
blkio_weight, weight);
} else
weight = CGROUP_WEIGHT_DEFAULT;
LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
weight = cgroup_weight_blkio_to_io(w->weight);
- log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
+ log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
w->weight, weight, w->path);
cgroup_apply_io_device_weight(u, w->path, weight);
limits[CGROUP_IO_RBPS_MAX] = b->rbps;
limits[CGROUP_IO_WBPS_MAX] = b->wbps;
- log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
+ log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
b->rbps, b->wbps, b->path);
cgroup_apply_io_device_limit(u, b->path, limits);
io_weight = cgroup_context_io_weight(c, state);
weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
- log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
+ log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
io_weight, weight);
} else if (has_blockio)
weight = cgroup_context_blkio_weight(c, state);
LIST_FOREACH(device_weights, w, c->io_device_weights) {
weight = cgroup_weight_io_to_blkio(w->weight);
- log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
+ log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
w->weight, weight, w->path);
cgroup_apply_blkio_device_weight(u, w->path, weight);
CGroupIODeviceLimit *l;
LIST_FOREACH(device_limits, l, c->io_device_limits) {
- log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
+ log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
}
}
- if (apply_mask & CGROUP_MASK_MEMORY) {
+ /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
+ * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
+ * want to leave control to the container manager (and if proper cgroupsv2 delegation is used we couldn't even
+ * write to this if we wanted to.) */
+ if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
if (cg_all_unified() > 0) {
- /* In unified mode 'memory' attributes do not exist on the root cgroup. And if we run in a
- * container we want to leave control to the container manager (and if proper delegation is
- * used we couldn't even write to this if we wanted to. */
- if (!is_local_root) {
- uint64_t max, swap_max = CGROUP_LIMIT_MAX;
-
- if (cgroup_context_has_unified_memory_config(c)) {
- max = c->memory_max;
- swap_max = c->memory_swap_max;
- } else {
- max = c->memory_limit;
-
- if (max != CGROUP_LIMIT_MAX)
- log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
- }
+ uint64_t max, swap_max = CGROUP_LIMIT_MAX;
+
+ if (cgroup_context_has_unified_memory_config(c)) {
+ max = c->memory_max;
+ swap_max = c->memory_swap_max;
+ } else {
+ max = c->memory_limit;
- cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min);
- cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
- cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
- cgroup_apply_unified_memory_limit(u, "memory.max", max);
- cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
+ if (max != CGROUP_LIMIT_MAX)
+ log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
}
- } else {
- /* In legacy mode 'memory' exists on the host root, but in container mode we want to leave it
- * to the container manager around us */
- if (is_host_root || !is_local_root) {
- char buf[DECIMAL_STR_MAX(uint64_t) + 1];
- uint64_t val;
+ cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min);
+ cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
+ cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
+ cgroup_apply_unified_memory_limit(u, "memory.max", max);
+ cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
- if (cgroup_context_has_unified_memory_config(c)) {
- val = c->memory_max;
- log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val);
- } else
- val = c->memory_limit;
+ } else {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+ uint64_t val;
- if (val == CGROUP_LIMIT_MAX)
- strncpy(buf, "-1\n", sizeof(buf));
- else
- xsprintf(buf, "%" PRIu64 "\n", val);
+ if (cgroup_context_has_unified_memory_config(c)) {
+ val = c->memory_max;
+ log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val);
+ } else
+ val = c->memory_limit;
- (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
- }
+ if (val == CGROUP_LIMIT_MAX)
+ strncpy(buf, "-1\n", sizeof(buf));
+ else
+ xsprintf(buf, "%" PRIu64 "\n", val);
+
+ (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
}
}
- /* On cgroupsv2 we can apply BPF everywhre. On cgroupsv1 we apply it everywhere except for the root of
+ /* On cgroupsv2 we can apply BPF everywhere. On cgroupsv1 we apply it everywhere except for the root of
* containers, where we leave this to the manager */
if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
(is_host_root || cg_all_unified() > 0 || !is_local_root)) {
r = procfs_tasks_set_limit(TASKS_MAX);
else
r = 0;
-
if (r < 0)
log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r,
"Failed to write to tasks limit sysctls: %m");
if (!c)
return 0;
- return cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
+ return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
}
CGroupMask unit_get_delegate_mask(Unit *u) {
return unit_get_subtree_mask(u); /* we are the top-level slice */
}
+CGroupMask unit_get_disable_mask(Unit *u) {
+ CGroupContext *c;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ return c->disable_controllers;
+}
+
+CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
+ CGroupMask mask;
+
+ assert(u);
+ mask = unit_get_disable_mask(u);
+
+ /* Returns the mask of controllers which are marked as forcibly
+ * disabled in any ancestor unit or the unit in question. */
+
+ if (UNIT_ISSET(u->slice))
+ mask |= unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice));
+
+ return mask;
+}
+
CGroupMask unit_get_subtree_mask(Unit *u) {
/* Returns the mask of this subtree, meaning of the group
mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
mask &= u->manager->cgroup_supported;
+ mask &= ~unit_get_ancestor_disable_mask(u);
return mask;
}
mask = unit_get_members_mask(u);
mask &= u->manager->cgroup_supported;
+ mask &= ~unit_get_ancestor_disable_mask(u);
return mask;
}
static int unit_create_cgroup(
Unit *u,
CGroupMask target_mask,
- CGroupMask enable_mask) {
+ CGroupMask enable_mask,
+ ManagerState state) {
bool created;
int r;
log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
}
+ /* Set attributes */
+ cgroup_context_apply(u, target_mask, state);
+ cgroup_xattr_apply(u);
+
return 0;
}
return r;
}
-static void cgroup_xattr_apply(Unit *u) {
- char ids[SD_ID128_STRING_MAX];
- int r;
-
- assert(u);
-
- if (!MANAGER_IS_SYSTEM(u->manager))
- return;
-
- if (sd_id128_is_null(u->invocation_id))
- return;
-
- r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
- "trusted.invocation_id",
- sd_id128_to_string(u->invocation_id, ids), 32,
- 0);
- if (r < 0)
- log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
-}
-
static bool unit_has_mask_realized(
Unit *u,
CGroupMask target_mask,
u->cgroup_invalidated_mask == 0;
}
+static bool unit_has_mask_disables_realized(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask) {
+
+ assert(u);
+
+ /* Returns true if all controllers which should be disabled are indeed disabled.
+ *
+ * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
+ * already removed. */
+
+ return !u->cgroup_realized ||
+ (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
+ FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
+}
+
+static bool unit_has_mask_enables_realized(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask) {
+
+ assert(u);
+
+ /* Returns true if all controllers which should be enabled are indeed enabled.
+ *
+ * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
+ * we want to add is already added. */
+
+ return u->cgroup_realized &&
+ ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
+ ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
+}
+
void unit_add_to_cgroup_realize_queue(Unit *u) {
assert(u);
u->in_cgroup_realize_queue = false;
}
+/* Controllers can only be enabled breadth-first, from the root of the
+ * hierarchy downwards to the unit in question. */
+static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
+ CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+ int r;
+
+ assert(u);
+
+ /* First go deal with this unit's parent, or we won't be able to enable
+ * any new controllers at this layer. */
+ if (UNIT_ISSET(u->slice)) {
+ r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
+ if (r < 0)
+ return r;
+ }
+
+ target_mask = unit_get_target_mask(u);
+ enable_mask = unit_get_enable_mask(u);
+
+ /* We can only enable in this direction, don't try to disable anything.
+ */
+ if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
+ return 0;
+
+ new_target_mask = u->cgroup_realized_mask | target_mask;
+ new_enable_mask = u->cgroup_enabled_mask | enable_mask;
+
+ return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
+}
+
+/* Controllers can only be disabled depth-first, from the leaves of the
+ * hierarchy upwards to the unit in question. */
+static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
+ Iterator i;
+ Unit *m;
+ void *v;
+
+ assert(u);
+
+ if (u->type != UNIT_SLICE)
+ return 0;
+
+ HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
+ CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+ int r;
+
+ if (UNIT_DEREF(m->slice) != u)
+ continue;
+
+ /* The cgroup for this unit might not actually be fully
+ * realised yet, in which case it isn't holding any controllers
+ * open anyway. */
+ if (!m->cgroup_path)
+ continue;
+
+ /* We must disable those below us first in order to release the
+ * controller. */
+ if (m->type == UNIT_SLICE)
+ (void) unit_realize_cgroup_now_disable(m, state);
+
+ target_mask = unit_get_target_mask(m);
+ enable_mask = unit_get_enable_mask(m);
+
+ /* We can only disable in this direction, don't try to enable
+ * anything. */
+ if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
+ continue;
+
+ new_target_mask = m->cgroup_realized_mask & target_mask;
+ new_enable_mask = m->cgroup_enabled_mask & enable_mask;
+
+ r = unit_create_cgroup(m, new_target_mask, new_enable_mask, state);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
/* Check if necessary controllers and attributes for a unit are in place.
*
- * If so, do nothing.
- * If not, create paths, move processes over, and set attributes.
+ * - If so, do nothing.
+ * - If not, create paths, move processes over, and set attributes.
+ *
+ * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
+ * a depth-first way. As such the process looks like this:
+ *
+ * Suppose we have a cgroup hierarchy which looks like this:
+ *
+ * root
+ * / \
+ * / \
+ * / \
+ * a b
+ * / \ / \
+ * / \ / \
+ * c d e f
+ * / \ / \ / \ / \
+ * h i j k l m n o
+ *
+ * 1. We want to realise cgroup "d" now.
+ * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
+ * 3. cgroup "k" just started requesting the memory controller.
+ *
+ * To make this work we must do the following in order:
+ *
+ * 1. Disable CPU controller in k, j
+ * 2. Disable CPU controller in d
+ * 3. Enable memory controller in root
+ * 4. Enable memory controller in a
+ * 5. Enable memory controller in d
+ * 6. Enable memory controller in k
+ *
+ * Notice that we need to touch j in one direction, but not the other. We also
+ * don't go beyond d when disabling -- it's up to "a" to get realized if it
+ * wants to disable further. The basic rules are therefore:
+ *
+ * - If you're disabling something, you need to realise all of the cgroups from
+ * your recursive descendants to the root. This starts from the leaves.
+ * - If you're enabling something, you need to realise from the root cgroup
+ * downwards, but you don't need to iterate your recursive descendants.
*
* Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
if (unit_has_mask_realized(u, target_mask, enable_mask))
return 0;
- /* First, realize parents */
+ /* Disable controllers below us, if there are any */
+ r = unit_realize_cgroup_now_disable(u, state);
+ if (r < 0)
+ return r;
+
+ /* Enable controllers above us, if there are any */
if (UNIT_ISSET(u->slice)) {
- r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
+ r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
if (r < 0)
return r;
}
- /* And then do the real work */
- r = unit_create_cgroup(u, target_mask, enable_mask);
+ /* Now actually deal with the cgroup we were trying to realise and set attributes */
+ r = unit_create_cgroup(u, target_mask, enable_mask, state);
if (r < 0)
return r;
- /* Finally, apply the necessary attributes. */
- cgroup_context_apply(u, target_mask, state);
- cgroup_xattr_apply(u);
-
/* Now, reset the invalidation mask */
u->cgroup_invalidated_mask = 0;
return 0;