]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: rework how we track cgroup realized state
authorMike Yuan <me@yhndnzj.com>
Wed, 4 Jun 2025 15:34:11 +0000 (17:34 +0200)
committerMike Yuan <me@yhndnzj.com>
Wed, 4 Jun 2025 20:03:47 +0000 (22:03 +0200)
Prompted by https://github.com/systemd/systemd/pull/37646#discussion_r2126882561

Follow-up for 879952a85311ca375c673480d9718a9e0cd93c1d

Currently, almost all cgroup attr getters check cgroup_path for whether
cgroup is around. This is actually great, because we never want to expose
a non-existent cgroup path via IPC and such. However, it is spuriously
initialized at places where it shouldn't be, e.g. in unit_warn_leftover_processes().
This matters especially to units that *may* carry processes to run, but
not *always*, notably socket units. unit_warn_leftover_processes() is supposed
to be informative only and not try to set cgroup tracking to realized in
a half-assed way.

Hence, let's kill cgroup_realized field, and make sure cgroup_path is set
only if cgroup has been created. Be extra careful with deserialization
though, since the previous versions don't follow this rule and we need
to patch cgroup_path manually based on cgroup_realized we got from deserialization.

Calls to unit_watch_cgroup*() are dropped in cgroup_runtime_deserialize_one(),
because unit_deserialize_state() will invalidate cgroup realized state and
reapply later.

src/core/bpf-firewall.c
src/core/cgroup.c
src/core/cgroup.h
src/core/unit-serialize.c
src/core/unit.c

index 01f7639469dc4ca162fd834977a407eee7a7662a..e0cbe16463c8836462bb88a312fde54c9cdf6fec 100644 (file)
@@ -687,13 +687,10 @@ int bpf_firewall_install(Unit *u) {
         cc = unit_get_cgroup_context(u);
         if (!cc)
                 return -EINVAL;
+
         crt = unit_get_cgroup_runtime(u);
-        if (!crt)
-                return -EINVAL;
-        if (!crt->cgroup_path)
-                return -EINVAL;
-        if (!crt->cgroup_realized)
-                return -EINVAL;
+        if (!crt || !crt->cgroup_path)
+                return -EOWNERDEAD;
 
         if (bpf_program_supported() <= 0)
                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
index 2b1d2435a88de5410b07f01c8a0c552d783c2243..5752f7f0155684160213e13f54e4883f3bc54b6e 100644 (file)
@@ -1950,7 +1950,6 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
         assert(u);
 
         crt = unit_get_cgroup_runtime(u);
-
         if (crt && streq_ptr(crt->cgroup_path, path))
                 return 0;
 
@@ -1976,6 +1975,17 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
         return 1;
 }
 
+int unit_get_cgroup_path_with_fallback(const Unit *u, char **ret) {
+        assert(u);
+        assert(ret);
+
+        const CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+        if (!crt || !crt->cgroup_path)
+                return unit_default_cgroup_path(u, ret);
+
+        return strdup_to_full(ret, crt->cgroup_path); /* returns 1 -> cgroup_path is alive */
+}
+
 int unit_watch_cgroup(Unit *u) {
         _cleanup_free_ char *events = NULL;
         int r;
@@ -2077,42 +2087,14 @@ int unit_watch_cgroup_memory(Unit *u) {
         return 0;
 }
 
-int unit_pick_cgroup_path(Unit *u) {
-        _cleanup_free_ char *path = NULL;
-        int r;
-
-        assert(u);
-
-        if (!UNIT_HAS_CGROUP_CONTEXT(u))
-                return -EINVAL;
-
-        CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
-        if (!crt)
-                return -ENOMEM;
-        if (crt->cgroup_path)
-                return 0;
-
-        r = unit_default_cgroup_path(u, &path);
-        if (r < 0)
-                return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m");
-
-        r = unit_set_cgroup_path(u, path);
-        if (r == -EEXIST)
-                return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path));
-        if (r < 0)
-                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path));
-
-        return 0;
-}
-
 static int unit_update_cgroup(
                 Unit *u,
                 CGroupMask target_mask,
                 CGroupMask enable_mask,
                 ManagerState state) {
 
-        bool created;
-        _cleanup_free_ char *cgroup_full_path = NULL;
+        _cleanup_free_ char *cgroup = NULL, *cgroup_full_path = NULL;
+        bool set_path, created;
         int r;
 
         assert(u);
@@ -2123,19 +2105,28 @@ static int unit_update_cgroup(
         if (u->freezer_state != FREEZER_RUNNING)
                 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), "Cannot realize cgroup for frozen unit.");
 
-        /* Figure out our cgroup path */
-        r = unit_pick_cgroup_path(u);
+        r = unit_get_cgroup_path_with_fallback(u, &cgroup);
         if (r < 0)
-                return r;
-
-        CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
+                return log_unit_error_errno(u, r, "Failed to get cgroup path: %m");
+        set_path = r == 0;
 
         /* First, create our own group */
-        r = cg_create(crt->cgroup_path);
+        r = cg_create(cgroup);
         if (r < 0)
-                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(crt->cgroup_path));
+                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(cgroup));
         created = r;
 
+        if (set_path) {
+                r = unit_set_cgroup_path(u, cgroup);
+                if (r == -EEXIST)
+                        return log_unit_error_errno(u, r, "Picked control group '%s' as default, but it's in use already.", empty_to_root(cgroup));
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to '%s': %m", empty_to_root(cgroup));
+                assert(r > 0);
+        }
+
+        CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
+
         uint64_t cgroup_id = 0;
         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_full_path);
         if (r == 0) {
@@ -2153,7 +2144,7 @@ static int unit_update_cgroup(
         (void) unit_watch_cgroup_memory(u);
 
         /* For v2 we preserve enabled controllers in delegated units, adjust others, */
-        if (created || !crt->cgroup_realized || !unit_cgroup_delegate(u)) {
+        if (created || !unit_cgroup_delegate(u)) {
                 CGroupMask result_mask = 0;
 
                 /* Enable all controllers we need */
@@ -2166,7 +2157,6 @@ static int unit_update_cgroup(
         }
 
         /* Keep track that this is now realized */
-        crt->cgroup_realized = true;
         crt->cgroup_realized_mask = target_mask;
 
         /* Set attributes */
@@ -2317,10 +2307,6 @@ int unit_remove_subcgroup(Unit *u, const char *suffix_path) {
         if (!unit_cgroup_delegate(u))
                 return -ENOMEDIUM;
 
-        r = unit_pick_cgroup_path(u);
-        if (r < 0)
-                return r;
-
         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
         if (!crt || !crt->cgroup_path)
                 return -EOWNERDEAD;
@@ -2375,7 +2361,7 @@ static bool unit_has_mask_realized(
          * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
          * simply don't matter. */
 
-        return crt->cgroup_realized &&
+        return crt->cgroup_path &&
                 ((crt->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
                 ((crt->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
                 crt->cgroup_invalidated_mask == 0;
@@ -2397,7 +2383,7 @@ static bool unit_has_mask_disables_realized(
          * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
          * already removed. */
 
-        return !crt->cgroup_realized ||
+        return !crt->cgroup_path ||
                 (FLAGS_SET(crt->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
                  FLAGS_SET(crt->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
 }
@@ -2418,7 +2404,7 @@ static bool unit_has_mask_enables_realized(
          * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
          * we want to add is already added. */
 
-        return crt->cgroup_realized &&
+        return crt->cgroup_path &&
                 ((crt->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (crt->cgroup_realized_mask & CGROUP_MASK_V1) &&
                 ((crt->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (crt->cgroup_enabled_mask & CGROUP_MASK_V2);
 }
@@ -2497,7 +2483,7 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
 
                 /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
                  * holding any controllers open anyway. */
-                if (!rt->cgroup_realized)
+                if (!rt->cgroup_path)
                         continue;
 
                 /* We must disable those below us first in order to release the controller. */
@@ -2669,7 +2655,7 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u) {
                         /* We only enqueue siblings if they were realized once at least, in the main
                          * hierarchy. */
                         crt = unit_get_cgroup_runtime(m);
-                        if (!crt || !crt->cgroup_realized)
+                        if (!crt || !crt->cgroup_path)
                                 continue;
 
                         /* If the unit doesn't need any new controllers and has current ones
@@ -2885,7 +2871,6 @@ void unit_prune_cgroup(Unit *u) {
         assert(crt == unit_get_cgroup_runtime(u));
         assert(!crt->cgroup_path);
 
-        crt->cgroup_realized = false;
         crt->cgroup_realized_mask = 0;
         crt->cgroup_enabled_mask = 0;
 
@@ -4199,6 +4184,8 @@ CGroupRuntime* cgroup_runtime_new(void) {
                 .ipv6_deny_map_fd = -EBADF,
 
                 .cgroup_invalidated_mask = _CGROUP_MASK_ALL,
+
+                .deserialized_cgroup_realized = -1,
         };
 
         unit_reset_cpu_accounting(/* unit = */ NULL, crt);
@@ -4335,7 +4322,6 @@ int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) {
         if (crt->cgroup_id != 0)
                 (void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id);
 
-        (void) serialize_bool(f, "cgroup-realized", crt->cgroup_realized);
         (void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask);
         (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask);
         (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask);
@@ -4435,15 +4421,13 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value,
                 if (r < 0)
                         log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value);
 
-                (void) unit_watch_cgroup(u);
-                (void) unit_watch_cgroup_memory(u);
                 return 1;
         }
 
         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id))
                 return 1;
 
-        if (MATCH_DESERIALIZE(u, "cgroup-realized", key, value, parse_boolean, cgroup_realized))
+        if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized", key, value, parse_tristate, deserialized_cgroup_realized))
                 return 1;
 
         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask))
index f8504499eef2804638284ee6480723dc8928aec4..07bcbd340e9c69010ac950706eaa8641cf08bc7a 100644 (file)
@@ -323,7 +323,6 @@ typedef struct CGroupRuntime {
         struct bpf_link *restrict_ifaces_egress_bpf_link;
 #endif
 
-        bool cgroup_realized:1;
         bool cgroup_members_mask_valid:1;
 
         /* Reset cgroup accounting next time we fork something off */
@@ -331,6 +330,8 @@ typedef struct CGroupRuntime {
 
         /* Whether we warned about clamping the CPU quota period */
         bool warned_clamping_cpu_quota_period:1;
+
+        int deserialized_cgroup_realized; /* tristate, for backwards compat */
 } CGroupRuntime;
 
 uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
@@ -379,7 +380,7 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u);
 
 int unit_default_cgroup_path(const Unit *u, char **ret);
 int unit_set_cgroup_path(Unit *u, const char *path);
-int unit_pick_cgroup_path(Unit *u);
+int unit_get_cgroup_path_with_fallback(const Unit *u, char **ret);
 
 int unit_realize_cgroup(Unit *u);
 void unit_prune_cgroup(Unit *u);
index c2e8496b716ef53b081b1eb3eab1a370e647778c..a35d57986bd7b2935d4da168ca5301bc2c2e02e1 100644 (file)
@@ -367,11 +367,17 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
         /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings
          * applied after we are done. For that we invalidate anything already realized, so that we can
          * realize it again. */
-        CGroupRuntime *crt;
-        crt = unit_get_cgroup_runtime(u);
-        if (crt && crt->cgroup_realized) {
-                unit_invalidate_cgroup(u, _CGROUP_MASK_ALL);
-                unit_invalidate_cgroup_bpf(u);
+        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+        if (crt && crt->cgroup_path) {
+                /* Since v258, CGroupRuntime.cgroup_path is coupled with cgroup realized state, which however
+                 * wasn't the case in prior versions with the realized state tracked in a discrete field.
+                 * Patch cgroup_realized == 0 back to no cgroup_path here hence. */
+                if (crt->deserialized_cgroup_realized == 0)
+                        unit_release_cgroup(u, /* drop_cgroup_runtime = */ false);
+                else {
+                        unit_invalidate_cgroup(u, _CGROUP_MASK_ALL);
+                        unit_invalidate_cgroup_bpf(u);
+                }
         }
 
         return 0;
@@ -504,11 +510,9 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
 
                 fprintf(f,
                         "%s\tSlice: %s\n"
-                        "%s\tCGroup: %s\n"
-                        "%s\tCGroup realized: %s\n",
+                        "%s\tCGroup: %s\n",
                         prefix, strna(unit_slice_name(u)),
-                        prefix, strna(crt ? crt->cgroup_path : NULL),
-                        prefix, yes_no(crt ? crt->cgroup_realized : false));
+                        prefix, strna(crt ? crt->cgroup_path : NULL));
 
                 if (crt && crt->cgroup_realized_mask != 0) {
                         _cleanup_free_ char *s = NULL;
index d617624dede4bfc5444c7be2e04ebd9e0493c48d..60e91c55fce21951b85b6f8422a858f35ba0227a 100644 (file)
@@ -3395,7 +3395,7 @@ int unit_set_slice(Unit *u, Unit *slice) {
         /* Disallow slice changes if @u is already bound to cgroups */
         if (UNIT_GET_SLICE(u)) {
                 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
-                if (crt && crt->cgroup_realized)
+                if (crt && crt->cgroup_path)
                         return -EBUSY;
         }
 
@@ -6006,16 +6006,17 @@ static int unit_log_leftover_process_stop(const PidRef *pid, int sig, void *user
 }
 
 int unit_warn_leftover_processes(Unit *u, bool start) {
-        assert(u);
+        _cleanup_free_ char *cgroup = NULL;
+        int r;
 
-        (void) unit_pick_cgroup_path(u);
+        assert(u);
 
-        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
-        if (!crt || !crt->cgroup_path)
-                return 0;
+        r = unit_get_cgroup_path_with_fallback(u, &cgroup);
+        if (r < 0)
+                return r;
 
         return cg_kill_recursive(
-                        crt->cgroup_path,
+                        cgroup,
                         /* sig= */ 0,
                         /* flags= */ 0,
                         /* killed_pids= */ NULL,