};
}
-int cgroup_context_add_io_device_weight_dup(CGroupContext *c, CGroupIODeviceWeight *w) {
+int cgroup_context_add_io_device_weight_dup(CGroupContext *c, const CGroupIODeviceWeight *w) {
_cleanup_free_ CGroupIODeviceWeight *n = NULL;
assert(c);
assert(w);
- n = new0(CGroupIODeviceWeight, 1);
+ n = new(CGroupIODeviceWeight, 1);
if (!n)
return -ENOMEM;
- n->path = strdup(w->path);
+ *n = (CGroupIODeviceWeight) {
+ .path = strdup(w->path),
+ .weight = w->weight,
+ };
if (!n->path)
return -ENOMEM;
- n->weight = w->weight;
LIST_PREPEND(device_weights, c->io_device_weights, TAKE_PTR(n));
return 0;
}
-int cgroup_context_add_io_device_limit_dup(CGroupContext *c, CGroupIODeviceLimit *l) {
+int cgroup_context_add_io_device_limit_dup(CGroupContext *c, const CGroupIODeviceLimit *l) {
_cleanup_free_ CGroupIODeviceLimit *n = NULL;
assert(c);
assert(l);
n = new0(CGroupIODeviceLimit, 1);
- if (!l)
+ if (!n)
return -ENOMEM;
n->path = strdup(l->path);
return 0;
}
-int cgroup_context_add_io_device_latency_dup(CGroupContext *c, CGroupIODeviceLatency *l) {
+int cgroup_context_add_io_device_latency_dup(CGroupContext *c, const CGroupIODeviceLatency *l) {
_cleanup_free_ CGroupIODeviceLatency *n = NULL;
assert(c);
assert(l);
- n = new0(CGroupIODeviceLatency, 1);
+ n = new(CGroupIODeviceLatency, 1);
if (!n)
return -ENOMEM;
- n->path = strdup(l->path);
+ *n = (CGroupIODeviceLatency) {
+ .path = strdup(l->path),
+ .target_usec = l->target_usec,
+ };
if (!n->path)
return -ENOMEM;
- n->target_usec = l->target_usec;
-
LIST_PREPEND(device_latencies, c->io_device_latencies, TAKE_PTR(n));
return 0;
}
-int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
+int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, const CGroupBlockIODeviceWeight *w) {
_cleanup_free_ CGroupBlockIODeviceWeight *n = NULL;
assert(c);
assert(w);
- n = new0(CGroupBlockIODeviceWeight, 1);
+ n = new(CGroupBlockIODeviceWeight, 1);
if (!n)
return -ENOMEM;
- n->path = strdup(w->path);
+ *n = (CGroupBlockIODeviceWeight) {
+ .path = strdup(w->path),
+ .weight = w->weight,
+ };
if (!n->path)
return -ENOMEM;
- n->weight = w->weight;
-
LIST_PREPEND(device_weights, c->blockio_device_weights, TAKE_PTR(n));
return 0;
}
-int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
+int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, const CGroupBlockIODeviceBandwidth *b) {
_cleanup_free_ CGroupBlockIODeviceBandwidth *n = NULL;
assert(c);
assert(b);
- n = new0(CGroupBlockIODeviceBandwidth, 1);
+ n = new(CGroupBlockIODeviceBandwidth, 1);
if (!n)
return -ENOMEM;
return 0;
}
-int cgroup_context_add_device_allow_dup(CGroupContext *c, CGroupDeviceAllow *a) {
+int cgroup_context_add_device_allow_dup(CGroupContext *c, const CGroupDeviceAllow *a) {
_cleanup_free_ CGroupDeviceAllow *n = NULL;
assert(c);
assert(a);
- n = new0(CGroupDeviceAllow, 1);
+ n = new(CGroupDeviceAllow, 1);
if (!n)
return -ENOMEM;
- n->path = strdup(a->path);
+ *n = (CGroupDeviceAllow) {
+ .path = strdup(a->path),
+ .permissions = a->permissions,
+ };
if (!n->path)
return -ENOMEM;
- n->permissions = a->permissions;
-
LIST_PREPEND(device_allow, c->device_allow, TAKE_PTR(n));
return 0;
}
-static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, CGroupSocketBindItem *i, CGroupSocketBindItem *h) {
+static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, const CGroupSocketBindItem *i, CGroupSocketBindItem *h) {
_cleanup_free_ CGroupSocketBindItem *n = NULL;
assert(c);
assert(i);
- n = new0(CGroupSocketBindItem, 1);
+ n = new(CGroupSocketBindItem, 1);
if (!n)
return -ENOMEM;
return 0;
}
-int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, CGroupSocketBindItem *i) {
+int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, const CGroupSocketBindItem *i) {
return cgroup_context_add_socket_bind_item_dup(c, i, c->socket_bind_allow);
}
-int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, CGroupSocketBindItem *i) {
+int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, const CGroupSocketBindItem *i) {
return cgroup_context_add_socket_bind_item_dup(c, i, c->socket_bind_deny);
}
dst->tasks_accounting = src->tasks_accounting;
dst->ip_accounting = src->ip_accounting;
- dst->memory_oom_group = dst->memory_oom_group;
+ dst->memory_oom_group = src->memory_oom_group;
dst->cpu_weight = src->cpu_weight;
dst->startup_cpu_weight = src->startup_cpu_weight;
return;
}
- r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
+ r = sd_event_source_set_priority(s, EVENT_PRIORITY_CGROUP_OOM);
if (r < 0) {
log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
return;
unit_add_to_cgroup_empty_queue(u);
}
- /* Disregard freezer state changes due to operations not initiated by us */
- if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
+ /* Disregard freezer state changes due to operations not initiated by us.
+ * See: https://github.com/systemd/systemd/pull/13512/files#r416469963 and
+ * https://github.com/systemd/systemd/pull/13512#issuecomment-573007207 */
+ if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING)) {
if (streq(values[1], "0"))
unit_thawed(u);
else
/* Schedule cgroup empty checks early, but after having processed service notification messages or
* SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
* notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
- r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+ r = sd_event_source_set_priority(m->cgroup_empty_event_source, EVENT_PRIORITY_CGROUP_EMPTY);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
/* Process cgroup empty notifications early. Note that when this event is dispatched it'll
* just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
* handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
- r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+ r = sd_event_source_set_priority(m->cgroup_inotify_event_source, EVENT_PRIORITY_CGROUP_INOTIFY);
if (r < 0)
return log_error_errno(r, "Failed to set priority of inotify event source: %m");
return r;
}
+/* Compute this unit's own contribution to the effective limit of the given
+ * type, ignoring ancestor slices (those are folded in by the caller,
+ * unit_get_effective_limit()). */
+static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) {
+ CGroupContext *cc;
+
+ assert(u);
+ assert(UNIT_HAS_CGROUP_CONTEXT(u));
+
+ /* The root slice carries no explicit configuration of its own, hence
+ * report the system-wide maxima for it instead. */
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ switch (type) {
+ case CGROUP_LIMIT_MEMORY_MAX:
+ case CGROUP_LIMIT_MEMORY_HIGH:
+ return physical_memory();
+ case CGROUP_LIMIT_TASKS_MAX:
+ return system_tasks_max();
+ default:
+ assert_not_reached();
+ }
+
+ cc = ASSERT_PTR(unit_get_cgroup_context(u));
+ switch (type) {
+ /* Note: on legacy/hybrid hierarchies memory_max stays CGROUP_LIMIT_MAX unless configured
+ * explicitly. Effective value of MemoryLimit= (cgroup v1) is not implemented. */
+ case CGROUP_LIMIT_MEMORY_MAX:
+ return cc->memory_max;
+ case CGROUP_LIMIT_MEMORY_HIGH:
+ return cc->memory_high;
+ case CGROUP_LIMIT_TASKS_MAX:
+ /* tasks_max may be expressed as a scale of the system maximum; resolve to an absolute count */
+ return cgroup_tasks_max_resolve(&cc->tasks_max);
+ default:
+ assert_not_reached();
+ }
+}
+
+/* Determine the effective limit of the given type for a unit: the infimum
+ * (minimum) of the unit's own setting and the settings of every ancestor
+ * slice, since in the cgroup hierarchy the strictest limit on the path to
+ * the root wins. On success stores the result in *ret and returns 0;
+ * returns -EINVAL if the unit type carries no cgroup context. */
+int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) {
+ uint64_t infimum;
+
+ assert(u);
+ assert(ret);
+ assert(type >= 0);
+ assert(type < _CGROUP_LIMIT_TYPE_MAX);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EINVAL;
+
+ infimum = unit_get_effective_limit_one(u, type);
+ for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice))
+ infimum = MIN(infimum, unit_get_effective_limit_one(slice, type));
+
+ *ret = infimum;
+ return 0;
+}
+
static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
[CGROUP_IO_READ_BYTES] = "rbytes=",
unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
}
-int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
- _cleanup_free_ char *path = NULL;
- FreezerState target, kernel = _FREEZER_STATE_INVALID;
- int r, ret;
+static int unit_cgroup_freezer_kernel_state(Unit *u, FreezerState *ret) {
+ _cleanup_free_ char *val = NULL;
+ FreezerState s;
+ int r;
assert(u);
- assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+ assert(ret);
- if (!cg_freezer_supported())
- return 0;
+ r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
+ STRV_MAKE("frozen"), &val);
+ if (IN_SET(r, -ENOENT, -ENXIO))
+ return -ENODATA;
+ if (r < 0)
+ return r;
- /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */
- if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE))
- return action == FREEZER_FREEZE ? -EPERM : 0;
+ if (streq(val, "0"))
+ s = FREEZER_RUNNING;
+ else if (streq(val, "1"))
+ s = FREEZER_FROZEN;
+ else {
+ log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "Unexpected cgroup frozen state: %s", val);
+ s = _FREEZER_STATE_INVALID;
+ }
- if (!u->cgroup_realized)
- return -EBUSY;
+ *ret = s;
+ return 0;
+}
- if (action == FREEZER_THAW) {
- Unit *slice = UNIT_GET_SLICE(u);
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
+ _cleanup_free_ char *path = NULL;
+ FreezerState target, current, next;
+ int r;
- if (slice) {
- r = unit_cgroup_freezer_action(slice, FREEZER_THAW);
- if (r < 0)
- return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id);
- }
- }
+ assert(u);
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE,
+ FREEZER_THAW, FREEZER_PARENT_THAW));
- target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;
+ if (!cg_freezer_supported() || !u->cgroup_realized)
+ return 0;
+
+ unit_next_freezer_state(u, action, &next, &target);
- r = unit_freezer_state_kernel(u, &kernel);
+ r = unit_cgroup_freezer_kernel_state(u, &current);
if (r < 0)
- log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");
+ return r;
- if (target == kernel) {
- u->freezer_state = target;
- if (action == FREEZER_FREEZE)
- return 0;
- ret = 0;
- } else
- ret = 1;
+ if (current == target)
+ next = freezer_state_finish(next);
+ else if (IN_SET(next, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT, FREEZER_RUNNING)) {
+ /* We're transitioning into a finished state, which implies that the cgroup's
+ * current state already matches the target and thus we'd return 0. But, reality
+ * shows otherwise. This indicates that our freezer_state tracking has diverged
+ * from the real state of the cgroup, which can happen if someone meddles with the
+ * cgroup from underneath us. This really shouldn't happen during normal operation,
+ * though. So, let's warn about it and fix up the state to be valid */
+
+ log_unit_warning(u, "Unit wants to transition to %s freezer state but cgroup is unexpectedly %s, fixing up.",
+ freezer_state_to_string(next), freezer_state_to_string(current) ?: "(invalid)");
+
+ if (next == FREEZER_FROZEN)
+ next = FREEZER_FREEZING;
+ else if (next == FREEZER_FROZEN_BY_PARENT)
+ next = FREEZER_FREEZING_BY_PARENT;
+ else if (next == FREEZER_RUNNING)
+ next = FREEZER_THAWING;
+ }
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
if (r < 0)
return r;
- log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");
-
- if (target != kernel) {
- if (action == FREEZER_FREEZE)
- u->freezer_state = FREEZER_FREEZING;
- else
- u->freezer_state = FREEZER_THAWING;
- }
+ log_unit_debug(u, "Unit freezer state was %s, now %s.",
+ freezer_state_to_string(u->freezer_state),
+ freezer_state_to_string(next));
- r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER);
+ r = write_string_file(path, one_zero(target == FREEZER_FROZEN), WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return r;
- return ret;
+ u->freezer_state = next;
+ return target != current;
}
int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
-static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
- [FREEZER_FREEZE] = "freeze",
- [FREEZER_THAW] = "thaw",
-};
-
-DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
-
static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
- [CGROUP_PRESSURE_WATCH_OFF] = "off",
+ [CGROUP_PRESSURE_WATCH_OFF] = "off",
[CGROUP_PRESSURE_WATCH_AUTO] = "auto",
- [CGROUP_PRESSURE_WATCH_ON] = "on",
+ [CGROUP_PRESSURE_WATCH_ON] = "on",
[CGROUP_PRESSURE_WATCH_SKIP] = "skip",
};
};
DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);
+
+/* Maps CGroupLimitType values to their property-name strings (used e.g. as
+ * D-Bus property identifiers); lookup helpers are generated below. */
+static const char *const cgroup_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = {
+ [CGROUP_LIMIT_MEMORY_MAX] = "EffectiveMemoryMax",
+ [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh",
+ [CGROUP_LIMIT_TASKS_MAX] = "EffectiveTasksMax",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_limit_type, CGroupLimitType);