From 23ac08115af83e3a0a937fa207fc52511aba2ffa Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Wed, 4 Jun 2025 17:34:11 +0200 Subject: [PATCH] core: rework how we track cgroup realized state Prompted by https://github.com/systemd/systemd/pull/37646#discussion_r2126882561 Follow-up for 879952a85311ca375c673480d9718a9e0cd93c1d Currently, almost all cgroup attr getters check cgroup_path for whether cgroup is around. This is actually great, because we never want to expose a non-existent cgroup path via IPC and such. However, it is spuriously initialized at places where it shouldn't be, e.g. in unit_warn_leftover_processes(). This matters especially to units that *may* carry processes to run, but not *always*, notably socket units. unit_warn_leftover_processes() is supposed to be informative only and not try to set cgroup tracking to realized in a half-assed way. Hence, let's kill cgroup_realized field, and make sure cgroup_path is set only if cgroup has been created. Be extra careful with deserialization though, since the previous versions don't follow this rule and we need to patch cgroup_path manually based on cgroup_realized we got from deserialization. Calls to unit_watch_cgroup*() are dropped in cgroup_runtime_deserialize_one(), because unit_deserialize_state() will invalidate cgroup realized state and reapply later. --- src/core/bpf-firewall.c | 9 ++-- src/core/cgroup.c | 92 ++++++++++++++++----------------------- src/core/cgroup.h | 5 ++- src/core/unit-serialize.c | 22 ++++++---- src/core/unit.c | 15 ++++--- 5 files changed, 65 insertions(+), 78 deletions(-) diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c index 01f7639469d..e0cbe16463c 100644 --- a/src/core/bpf-firewall.c +++ b/src/core/bpf-firewall.c @@ -687,13 +687,10 @@ int bpf_firewall_install(Unit *u) { cc = unit_get_cgroup_context(u); if (!cc) return -EINVAL; + crt = unit_get_cgroup_runtime(u); - if (!crt) - return -EINVAL; - if (!crt->cgroup_path) - return -EINVAL; - if (!crt->cgroup_realized) - return -EINVAL; + if (!crt || !crt->cgroup_path) + return -EOWNERDEAD; if (bpf_program_supported() <= 0) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 2b1d2435a88..5752f7f0155 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1950,7 +1950,6 @@ int unit_set_cgroup_path(Unit *u, const char *path) { assert(u); crt = unit_get_cgroup_runtime(u); - if (crt && streq_ptr(crt->cgroup_path, path)) return 0; @@ -1976,6 +1975,17 @@ int unit_set_cgroup_path(Unit *u, const char *path) { return 1; } +int unit_get_cgroup_path_with_fallback(const Unit *u, char **ret) { + assert(u); + assert(ret); + + const CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return unit_default_cgroup_path(u, ret); + + return strdup_to_full(ret, crt->cgroup_path); /* returns 1 -> cgroup_path is alive */ +} + int unit_watch_cgroup(Unit *u) { _cleanup_free_ char *events = NULL; int r; @@ -2077,42 +2087,14 @@ int unit_watch_cgroup_memory(Unit *u) { return 0; } -int unit_pick_cgroup_path(Unit *u) { - _cleanup_free_ char *path = NULL; - int r; - - assert(u); - - if (!UNIT_HAS_CGROUP_CONTEXT(u)) - return -EINVAL; - - CGroupRuntime *crt = unit_setup_cgroup_runtime(u); - if (!crt) - return -ENOMEM; - if (crt->cgroup_path) - return 0; - - r = unit_default_cgroup_path(u, &path); - if (r < 0) - return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m"); - - r = unit_set_cgroup_path(u, path); - if (r == -EEXIST) - return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path)); - if (r < 0) - return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path)); - - return 0; -} - static int unit_update_cgroup( Unit *u, CGroupMask target_mask, CGroupMask enable_mask, ManagerState state) { - bool created; - _cleanup_free_ char *cgroup_full_path = NULL; + _cleanup_free_ char *cgroup = NULL, *cgroup_full_path = NULL; + bool set_path, created; int r; assert(u); @@ -2123,19 +2105,28 @@ static int unit_update_cgroup( if (u->freezer_state != FREEZER_RUNNING) return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), "Cannot realize cgroup for frozen unit."); - /* Figure out our cgroup path */ - r = unit_pick_cgroup_path(u); + r = unit_get_cgroup_path_with_fallback(u, &cgroup); if (r < 0) - return r; - - CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u)); + return log_unit_error_errno(u, r, "Failed to get cgroup path: %m"); + set_path = r == 0; /* First, create our own group */ - r = cg_create(crt->cgroup_path); + r = cg_create(cgroup); if (r < 0) - return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(crt->cgroup_path)); + return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(cgroup)); created = r; + if (set_path) { + r = unit_set_cgroup_path(u, cgroup); + if (r == -EEXIST) + return log_unit_error_errno(u, r, "Picked control group '%s' as default, but it's in use already.", empty_to_root(cgroup)); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set unit's control group path to '%s': %m", empty_to_root(cgroup)); + assert(r > 0); + } + + CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u)); + uint64_t cgroup_id = 0; r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_full_path); if (r == 0) { @@ -2153,7 +2144,7 @@ static int unit_update_cgroup( (void) unit_watch_cgroup_memory(u); /* For v2 we preserve enabled controllers in delegated units, adjust others, */ - if (created || !crt->cgroup_realized || !unit_cgroup_delegate(u)) { + if (created || !unit_cgroup_delegate(u)) { CGroupMask result_mask = 0; /* Enable all controllers we need */ @@ -2166,7 +2157,6 @@ static int unit_update_cgroup( } /* Keep track that this is now realized */ - crt->cgroup_realized = true; crt->cgroup_realized_mask = target_mask; /* Set attributes */ @@ -2317,10 +2307,6 @@ int unit_remove_subcgroup(Unit *u, const char *suffix_path) { if (!unit_cgroup_delegate(u)) return -ENOMEDIUM; - r = unit_pick_cgroup_path(u); - if (r < 0) - return r; - CGroupRuntime *crt = unit_get_cgroup_runtime(u); if (!crt || !crt->cgroup_path) return -EOWNERDEAD; @@ -2375,7 +2361,7 @@ static bool unit_has_mask_realized( * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they * simply don't matter. */ - return crt->cgroup_realized && + return crt->cgroup_path && ((crt->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 && ((crt->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 && crt->cgroup_invalidated_mask == 0; @@ -2397,7 +2383,7 @@ static bool unit_has_mask_disables_realized( * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is * already removed. */ - return !crt->cgroup_realized || + return !crt->cgroup_path || (FLAGS_SET(crt->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) && FLAGS_SET(crt->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2)); } @@ -2418,7 +2404,7 @@ static bool unit_has_mask_enables_realized( * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything * we want to add is already added. */ - return crt->cgroup_realized && + return crt->cgroup_path && ((crt->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (crt->cgroup_realized_mask & CGROUP_MASK_V1) && ((crt->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (crt->cgroup_enabled_mask & CGROUP_MASK_V2); } @@ -2497,7 +2483,7 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) { /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't * holding any controllers open anyway. */ - if (!rt->cgroup_realized) + if (!rt->cgroup_path) continue; /* We must disable those below us first in order to release the controller. */ @@ -2669,7 +2655,7 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u) { /* We only enqueue siblings if they were realized once at least, in the main * hierarchy. */ crt = unit_get_cgroup_runtime(m); - if (!crt || !crt->cgroup_realized) + if (!crt || !crt->cgroup_path) continue; /* If the unit doesn't need any new controllers and has current ones @@ -2885,7 +2871,6 @@ void unit_prune_cgroup(Unit *u) { assert(crt == unit_get_cgroup_runtime(u)); assert(!crt->cgroup_path); - crt->cgroup_realized = false; crt->cgroup_realized_mask = 0; crt->cgroup_enabled_mask = 0; @@ -4199,6 +4184,8 @@ CGroupRuntime* cgroup_runtime_new(void) { .ipv6_deny_map_fd = -EBADF, .cgroup_invalidated_mask = _CGROUP_MASK_ALL, + + .deserialized_cgroup_realized = -1, }; unit_reset_cpu_accounting(/* unit = */ NULL, crt); @@ -4335,7 +4322,6 @@ int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) { if (crt->cgroup_id != 0) (void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id); - (void) serialize_bool(f, "cgroup-realized", crt->cgroup_realized); (void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask); (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask); (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask); @@ -4435,15 +4421,13 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, if (r < 0) log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value); - (void) unit_watch_cgroup(u); - (void) unit_watch_cgroup_memory(u); return 1; } if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id)) return 1; - if (MATCH_DESERIALIZE(u, "cgroup-realized", key, value, parse_boolean, cgroup_realized)) + if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized", key, value, parse_tristate, deserialized_cgroup_realized)) return 1; if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask)) diff --git a/src/core/cgroup.h b/src/core/cgroup.h index f8504499eef..07bcbd340e9 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -323,7 +323,6 @@ typedef struct CGroupRuntime { struct bpf_link *restrict_ifaces_egress_bpf_link; #endif - bool cgroup_realized:1; bool cgroup_members_mask_valid:1; /* Reset cgroup accounting next time we fork something off */ @@ -331,6 +330,8 @@ typedef struct CGroupRuntime { /* Whether we warned about clamping the CPU quota period */ bool warned_clamping_cpu_quota_period:1; + + int deserialized_cgroup_realized; /* tristate, for backwards compat */ } CGroupRuntime; uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state); @@ -379,7 +380,7 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u); int unit_default_cgroup_path(const Unit *u, char **ret); int unit_set_cgroup_path(Unit *u, const char *path); -int unit_pick_cgroup_path(Unit *u); +int unit_get_cgroup_path_with_fallback(const Unit *u, char **ret); int unit_realize_cgroup(Unit *u); void unit_prune_cgroup(Unit *u); diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c index c2e8496b716..a35d57986bd 100644 --- a/src/core/unit-serialize.c +++ b/src/core/unit-serialize.c @@ -367,11 +367,17 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) { /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings * applied after we are done. For that we invalidate anything already realized, so that we can * realize it again. */ - CGroupRuntime *crt; - crt = unit_get_cgroup_runtime(u); - if (crt && crt->cgroup_realized) { - unit_invalidate_cgroup(u, _CGROUP_MASK_ALL); - unit_invalidate_cgroup_bpf(u); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt && crt->cgroup_path) { + /* Since v258, CGroupRuntime.cgroup_path is coupled with cgroup realized state, which however + * wasn't the case in prior versions with the realized state tracked in a discrete field. + * Patch cgroup_realized == 0 back to no cgroup_path here hence. */ + if (crt->deserialized_cgroup_realized == 0) + unit_release_cgroup(u, /* drop_cgroup_runtime = */ false); + else { + unit_invalidate_cgroup(u, _CGROUP_MASK_ALL); + unit_invalidate_cgroup_bpf(u); + } } return 0; @@ -504,11 +510,9 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { fprintf(f, "%s\tSlice: %s\n" - "%s\tCGroup: %s\n" - "%s\tCGroup realized: %s\n", + "%s\tCGroup: %s\n", prefix, strna(unit_slice_name(u)), - prefix, strna(crt ? crt->cgroup_path : NULL), - prefix, yes_no(crt ? crt->cgroup_realized : false)); + prefix, strna(crt ? crt->cgroup_path : NULL)); if (crt && crt->cgroup_realized_mask != 0) { _cleanup_free_ char *s = NULL; diff --git a/src/core/unit.c b/src/core/unit.c index d617624dede..60e91c55fce 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3395,7 +3395,7 @@ int unit_set_slice(Unit *u, Unit *slice) { /* Disallow slice changes if @u is already bound to cgroups */ if (UNIT_GET_SLICE(u)) { CGroupRuntime *crt = unit_get_cgroup_runtime(u); - if (crt && crt->cgroup_realized) + if (crt && crt->cgroup_path) return -EBUSY; } @@ -6006,16 +6006,17 @@ static int unit_log_leftover_process_stop(const PidRef *pid, int sig, void *user } int unit_warn_leftover_processes(Unit *u, bool start) { - assert(u); + _cleanup_free_ char *cgroup = NULL; + int r; - (void) unit_pick_cgroup_path(u); + assert(u); - CGroupRuntime *crt = unit_get_cgroup_runtime(u); - if (!crt || !crt->cgroup_path) - return 0; + r = unit_get_cgroup_path_with_fallback(u, &cgroup); + if (r < 0) + return r; return cg_kill_recursive( - crt->cgroup_path, + cgroup, /* sig= */ 0, /* flags= */ 0, /* killed_pids= */ NULL, -- 2.47.3