core: Rework recursive freeze/thaw
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 311a4197aa1434067f9e20980105c0df57ebb560..5520118026d39cff924bd8397252b691c5a1de66 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -190,33 +190,35 @@ void cgroup_context_init(CGroupContext *c) {
         };
 }
 
-int cgroup_context_add_io_device_weight_dup(CGroupContext *c, CGroupIODeviceWeight *w) {
+int cgroup_context_add_io_device_weight_dup(CGroupContext *c, const CGroupIODeviceWeight *w) {
         _cleanup_free_ CGroupIODeviceWeight *n = NULL;
 
         assert(c);
         assert(w);
 
-        n = new0(CGroupIODeviceWeight, 1);
+        n = new(CGroupIODeviceWeight, 1);
         if (!n)
                 return -ENOMEM;
 
-        n->path = strdup(w->path);
+        *n = (CGroupIODeviceWeight) {
+                .path = strdup(w->path),
+                .weight = w->weight,
+        };
         if (!n->path)
                 return -ENOMEM;
-        n->weight = w->weight;
 
         LIST_PREPEND(device_weights, c->io_device_weights, TAKE_PTR(n));
         return 0;
 }
 
-int cgroup_context_add_io_device_limit_dup(CGroupContext *c, CGroupIODeviceLimit *l) {
+int cgroup_context_add_io_device_limit_dup(CGroupContext *c, const CGroupIODeviceLimit *l) {
         _cleanup_free_ CGroupIODeviceLimit *n = NULL;
 
         assert(c);
         assert(l);
 
         n = new0(CGroupIODeviceLimit, 1);
-        if (!l)
+        if (!n)
                 return -ENOMEM;
 
         n->path = strdup(l->path);
@@ -230,53 +232,55 @@ int cgroup_context_add_io_device_limit_dup(CGroupContext *c, CGroupIODeviceLimit
         return 0;
 }
 
-int cgroup_context_add_io_device_latency_dup(CGroupContext *c, CGroupIODeviceLatency *l) {
+int cgroup_context_add_io_device_latency_dup(CGroupContext *c, const CGroupIODeviceLatency *l) {
         _cleanup_free_ CGroupIODeviceLatency *n = NULL;
 
         assert(c);
         assert(l);
 
-        n = new0(CGroupIODeviceLatency, 1);
+        n = new(CGroupIODeviceLatency, 1);
         if (!n)
                 return -ENOMEM;
 
-        n->path = strdup(l->path);
+        *n = (CGroupIODeviceLatency) {
+                .path = strdup(l->path),
+                .target_usec = l->target_usec,
+        };
         if (!n->path)
                 return -ENOMEM;
 
-        n->target_usec = l->target_usec;
-
         LIST_PREPEND(device_latencies, c->io_device_latencies, TAKE_PTR(n));
         return 0;
 }
 
-int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
+int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, const CGroupBlockIODeviceWeight *w) {
         _cleanup_free_ CGroupBlockIODeviceWeight *n = NULL;
 
         assert(c);
         assert(w);
 
-        n = new0(CGroupBlockIODeviceWeight, 1);
+        n = new(CGroupBlockIODeviceWeight, 1);
         if (!n)
                 return -ENOMEM;
 
-        n->path = strdup(w->path);
+        *n = (CGroupBlockIODeviceWeight) {
+                .path = strdup(w->path),
+                .weight = w->weight,
+        };
         if (!n->path)
                 return -ENOMEM;
 
-        n->weight = w->weight;
-
         LIST_PREPEND(device_weights, c->blockio_device_weights, TAKE_PTR(n));
         return 0;
 }
 
-int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
+int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, const CGroupBlockIODeviceBandwidth *b) {
         _cleanup_free_ CGroupBlockIODeviceBandwidth *n = NULL;
 
         assert(c);
         assert(b);
 
-        n = new0(CGroupBlockIODeviceBandwidth, 1);
+        n = new(CGroupBlockIODeviceBandwidth, 1);
         if (!n)
                 return -ENOMEM;
 
@@ -289,33 +293,34 @@ int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, CGroupBlo
         return 0;
 }
 
-int cgroup_context_add_device_allow_dup(CGroupContext *c, CGroupDeviceAllow *a) {
+int cgroup_context_add_device_allow_dup(CGroupContext *c, const CGroupDeviceAllow *a) {
         _cleanup_free_ CGroupDeviceAllow *n = NULL;
 
         assert(c);
         assert(a);
 
-        n = new0(CGroupDeviceAllow, 1);
+        n = new(CGroupDeviceAllow, 1);
         if (!n)
                 return -ENOMEM;
 
-        n->path = strdup(a->path);
+        *n = (CGroupDeviceAllow) {
+                .path = strdup(a->path),
+                .permissions = a->permissions,
+        };
         if (!n->path)
                 return -ENOMEM;
 
-        n->permissions = a->permissions;
-
         LIST_PREPEND(device_allow, c->device_allow, TAKE_PTR(n));
         return 0;
 }
 
-static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, CGroupSocketBindItem *i, CGroupSocketBindItem *h) {
+static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, const CGroupSocketBindItem *i, CGroupSocketBindItem *h) {
         _cleanup_free_ CGroupSocketBindItem *n = NULL;
 
         assert(c);
         assert(i);
 
-        n = new0(CGroupSocketBindItem, 1);
+        n = new(CGroupSocketBindItem, 1);
         if (!n)
                 return -ENOMEM;
 
@@ -330,11 +335,11 @@ static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, CGroupSocke
         return 0;
 }
 
-int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, CGroupSocketBindItem *i) {
+int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, const CGroupSocketBindItem *i) {
         return cgroup_context_add_socket_bind_item_dup(c, i, c->socket_bind_allow);
 }
 
-int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, CGroupSocketBindItem *i) {
+int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, const CGroupSocketBindItem *i) {
         return cgroup_context_add_socket_bind_item_dup(c, i, c->socket_bind_deny);
 }
 
@@ -353,7 +358,7 @@ int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src) {
         dst->tasks_accounting = src->tasks_accounting;
         dst->ip_accounting = src->ip_accounting;
 
-        dst->memory_oom_group = dst->memory_oom_group;
+        dst->memory_oom_group = src->memory_oom_group;
 
         dst->cpu_weight = src->cpu_weight;
         dst->startup_cpu_weight = src->startup_cpu_weight;
@@ -3853,7 +3858,7 @@ static void unit_add_to_cgroup_oom_queue(Unit *u) {
                         return;
                 }
 
-                r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
+                r = sd_event_source_set_priority(s, EVENT_PRIORITY_CGROUP_OOM);
                 if (r < 0) {
                         log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
                         return;
@@ -3892,8 +3897,10 @@ static int unit_check_cgroup_events(Unit *u) {
                         unit_add_to_cgroup_empty_queue(u);
         }
 
-        /* Disregard freezer state changes due to operations not initiated by us */
-        if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
+        /* Disregard freezer state changes due to operations not initiated by us.
+         * See: https://github.com/systemd/systemd/pull/13512/files#r416469963 and
+         *      https://github.com/systemd/systemd/pull/13512#issuecomment-573007207 */
+        if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING)) {
                 if (streq(values[1], "0"))
                         unit_thawed(u);
                 else
@@ -4059,7 +4066,7 @@ int manager_setup_cgroup(Manager *m) {
         /* Schedule cgroup empty checks early, but after having processed service notification messages or
          * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
          * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
-        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+        r = sd_event_source_set_priority(m->cgroup_empty_event_source, EVENT_PRIORITY_CGROUP_EMPTY);
         if (r < 0)
                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
 
@@ -4088,7 +4095,7 @@ int manager_setup_cgroup(Manager *m) {
                 /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
                  * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
                  * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
-                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, EVENT_PRIORITY_CGROUP_INOTIFY);
                 if (r < 0)
                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
 
@@ -4555,6 +4562,57 @@ int unit_get_ip_accounting(
         return r;
 }
 
+static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) {
+        CGroupContext *cc;
+
+        assert(u);
+        assert(UNIT_HAS_CGROUP_CONTEXT(u));
+
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                switch (type) {
+                        case CGROUP_LIMIT_MEMORY_MAX:
+                        case CGROUP_LIMIT_MEMORY_HIGH:
+                                return physical_memory();
+                        case CGROUP_LIMIT_TASKS_MAX:
+                                return system_tasks_max();
+                        default:
+                                assert_not_reached();
+                }
+
+        cc = ASSERT_PTR(unit_get_cgroup_context(u));
+        switch (type) {
+                /* Note: on legacy/hybrid hierarchies memory_max stays CGROUP_LIMIT_MAX unless configured
+                 * explicitly. Effective value of MemoryLimit= (cgroup v1) is not implemented. */
+                case CGROUP_LIMIT_MEMORY_MAX:
+                        return cc->memory_max;
+                case CGROUP_LIMIT_MEMORY_HIGH:
+                        return cc->memory_high;
+                case CGROUP_LIMIT_TASKS_MAX:
+                        return cgroup_tasks_max_resolve(&cc->tasks_max);
+                default:
+                        assert_not_reached();
+        }
+}
+
+int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) {
+        uint64_t infimum;
+
+        assert(u);
+        assert(ret);
+        assert(type >= 0);
+        assert(type < _CGROUP_LIMIT_TYPE_MAX);
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return -EINVAL;
+
+        infimum = unit_get_effective_limit_one(u, type);
+        for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice))
+                infimum = MIN(infimum, unit_get_effective_limit_one(slice, type));
+
+        *ret = infimum;
+        return 0;
+}
+
 static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
         static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
                 [CGROUP_IO_READ_BYTES]       = "rbytes=",
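
Review note, not part of the diff: the new unit_get_effective_limit() returns, via *ret, the infimum of the configured limit over the unit and all of its parent slices, with the root slice contributing physical_memory() or system_tasks_max(). Below is a hedged usage sketch; the caller function and log messages are hypothetical, only unit_get_effective_limit(), CGROUP_LIMIT_TASKS_MAX and CGROUP_LIMIT_MAX are taken from the code above.

/* Hypothetical caller, for illustration only: log the effective tasks limit
 * of a unit, i.e. the smallest TasksMax= along its slice chain (the root
 * slice counts as system_tasks_max()). */
static void log_effective_tasks_max_example(Unit *u) {
        uint64_t limit;
        int r;

        r = unit_get_effective_limit(u, CGROUP_LIMIT_TASKS_MAX, &limit);
        if (r < 0) {
                log_unit_warning_errno(u, r, "Failed to determine effective tasks limit: %m");
                return;
        }

        if (limit == CGROUP_LIMIT_MAX)
                log_unit_info(u, "No effective tasks limit set.");
        else
                log_unit_info(u, "Effective tasks limit: %" PRIu64 " tasks.", limit);
}
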
@@ -4835,66 +4893,87 @@ void manager_invalidate_startup_units(Manager *m) {
                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
 }
 
-int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
-        _cleanup_free_ char *path = NULL;
-        FreezerState target, kernel = _FREEZER_STATE_INVALID;
-        int r, ret;
+static int unit_cgroup_freezer_kernel_state(Unit *u, FreezerState *ret) {
+        _cleanup_free_ char *val = NULL;
+        FreezerState s;
+        int r;
 
         assert(u);
-        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+        assert(ret);
 
-        if (!cg_freezer_supported())
-                return 0;
+        r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
+                                   STRV_MAKE("frozen"), &val);
+        if (IN_SET(r, -ENOENT, -ENXIO))
+                return -ENODATA;
+        if (r < 0)
+                return r;
 
-        /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */
-        if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE))
-                return action == FREEZER_FREEZE ? -EPERM : 0;
+        if (streq(val, "0"))
+                s = FREEZER_RUNNING;
+        else if (streq(val, "1"))
+                s = FREEZER_FROZEN;
+        else {
+                log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "Unexpected cgroup frozen state: %s", val);
+                s = _FREEZER_STATE_INVALID;
+        }
 
-        if (!u->cgroup_realized)
-                return -EBUSY;
+        *ret = s;
+        return 0;
+}
 
-        if (action == FREEZER_THAW) {
-                Unit *slice = UNIT_GET_SLICE(u);
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
+        _cleanup_free_ char *path = NULL;
+        FreezerState target, current, next;
+        int r;
 
-                if (slice) {
-                        r = unit_cgroup_freezer_action(slice, FREEZER_THAW);
-                        if (r < 0)
-                                return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id);
-                }
-        }
+        assert(u);
+        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE,
+                              FREEZER_THAW, FREEZER_PARENT_THAW));
 
-        target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;
+        if (!cg_freezer_supported() || !u->cgroup_realized)
+                return 0;
+
+        unit_next_freezer_state(u, action, &next, &target);
 
-        r = unit_freezer_state_kernel(u, &kernel);
+        r = unit_cgroup_freezer_kernel_state(u, &current);
         if (r < 0)
-                log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");
+                return r;
 
-        if (target == kernel) {
-                u->freezer_state = target;
-                if (action == FREEZER_FREEZE)
-                        return 0;
-                ret = 0;
-        } else
-                ret = 1;
+        if (current == target)
+                next = freezer_state_finish(next);
+        else if (IN_SET(next, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT, FREEZER_RUNNING)) {
+                /* We're transitioning into a finished state, which implies that the cgroup's
+                 * current state already matches the target and thus we'd return 0. But, reality
+                 * shows otherwise. This indicates that our freezer_state tracking has diverged
+                 * from the real state of the cgroup, which can happen if someone meddles with the
+                 * cgroup from underneath us. This really shouldn't happen during normal operation,
+                 * though. So, let's warn about it and fix up the state to be valid */
+
+                log_unit_warning(u, "Unit wants to transition to %s freezer state but cgroup is unexpectedly %s, fixing up.",
+                                 freezer_state_to_string(next), freezer_state_to_string(current) ?: "(invalid)");
+
+                if (next == FREEZER_FROZEN)
+                        next = FREEZER_FREEZING;
+                else if (next == FREEZER_FROZEN_BY_PARENT)
+                        next = FREEZER_FREEZING_BY_PARENT;
+                else if (next == FREEZER_RUNNING)
+                        next = FREEZER_THAWING;
+        }
 
         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
         if (r < 0)
                 return r;
 
-        log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");
-
-        if (target != kernel) {
-                if (action == FREEZER_FREEZE)
-                        u->freezer_state = FREEZER_FREEZING;
-                else
-                        u->freezer_state = FREEZER_THAWING;
-        }
+        log_unit_debug(u, "Unit freezer state was %s, now %s.",
+                       freezer_state_to_string(u->freezer_state),
+                       freezer_state_to_string(next));
 
-        r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER);
+        r = write_string_file(path, one_zero(target == FREEZER_FROZEN), WRITE_STRING_FILE_DISABLE_BUFFER);
         if (r < 0)
                 return r;
 
-        return ret;
+        u->freezer_state = next;
+        return target != current;
 }
 
 int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
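
Review note, not part of the diff: with this rework, unit_cgroup_freezer_action() returns 0 when the kernel freezer state already matches the target, and a positive value when cgroup.freeze was written and a transition is pending (finished later when unit_check_cgroup_events() observes the change and calls unit_frozen()/unit_thawed()). A hedged caller sketch under those assumptions follows; the wrapper function itself is hypothetical.

/* Hypothetical caller, for illustration only: freeze a unit and report
 * whether completion has to wait for the kernel notification. Returns 0 if
 * the cgroup was already frozen, 1 if a transition was started, negative
 * errno on failure. */
static int freeze_unit_example(Unit *u) {
        int r;

        r = unit_cgroup_freezer_action(u, FREEZER_FREEZE);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to freeze unit: %m");
        if (r == 0)
                return 0; /* kernel state already matched the target */

        /* r > 0: cgroup.freeze was written, the unit is now in a transitional
         * FREEZER_FREEZING* state until the cgroup event arrives. */
        return 1;
}
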
@@ -4933,17 +5012,10 @@ static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] =
 
 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
 
-static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
-        [FREEZER_FREEZE] = "freeze",
-        [FREEZER_THAW] = "thaw",
-};
-
-DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
-
 static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
-        [CGROUP_PRESSURE_WATCH_OFF] = "off",
+        [CGROUP_PRESSURE_WATCH_OFF]  = "off",
         [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
-        [CGROUP_PRESSURE_WATCH_ON] = "on",
+        [CGROUP_PRESSURE_WATCH_ON]   = "on",
         [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
 };
 
@@ -4975,3 +5047,11 @@ static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_AC
 };
 
 DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);
+
+static const char *const cgroup_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = {
+        [CGROUP_LIMIT_MEMORY_MAX]  = "EffectiveMemoryMax",
+        [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh",
+        [CGROUP_LIMIT_TASKS_MAX]   = "EffectiveTasksMax",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_limit_type, CGroupLimitType);