From 056bc106e1e344f98cdfa86fdf62e6fed72958c9 Mon Sep 17 00:00:00 2001 From: Nandakumar Raghavan Date: Tue, 24 Mar 2026 13:42:42 +0000 Subject: [PATCH] core: fix EBUSY on restart and clean of delegated services When a service is configured with Delegate=yes and DelegateSubgroup=sub, the delegated container may write domain controllers (e.g. "pids") into the service cgroup's cgroup.subtree_control via its cgroupns root. On container exit the stale controllers remain, and on service restart clone3() with CLONE_INTO_CGROUP fails with EBUSY because placing a process into a cgroup that has domain controllers in subtree_control violates the no-internal-processes rule. The same issue affects systemctl clean, where cg_attach() fails with EBUSY for the same reason. Add unit_cgroup_disable_all_controllers() helper in cgroup.c that clears stale controllers via cg_enable(mask=0) and updates cgroup_enabled_mask to keep internal tracking in sync. Call it from service_start() and service_clean() right before spawning, so that resource control is preserved for any lingering processes from the previous invocation as long as possible. --- src/core/cgroup.c | 23 +++++++++++++++++++++++ src/core/cgroup.h | 2 ++ src/core/service.c | 6 +++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index ddca4afbc32..ae5874cd99d 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -3988,6 +3988,29 @@ bool unit_cgroup_delegate(Unit *u) { return c->delegate; } +void unit_cgroup_disable_all_controllers(Unit *u) { + int r; + + assert(u); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return; + + if (!unit_cgroup_delegate(u)) + return; + + /* For delegated units, the previous payload may have enabled controllers (e.g. "pids") in + * cgroup.subtree_control. These persist after the service stops and turn the cgroup into an + * "internal node", causing clone3(CLONE_INTO_CGROUP) to fail with EBUSY. 
Clear them now, right + * before the new start, so that resource control is preserved for lingering processes as long as + * possible. Ignore errors — if sub-cgroups still have live processes the write will fail, but so + * will the upcoming spawn. */ + r = cg_enable(u->manager->cgroup_supported, /* mask= */ 0, crt->cgroup_path, &crt->cgroup_enabled_mask); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to disable controllers on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path)); +} + void manager_invalidate_startup_units(Manager *m) { Unit *u; diff --git a/src/core/cgroup.h b/src/core/cgroup.h index ce98f4ba7cd..d9a6ded1102 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -471,6 +471,8 @@ void unit_cgroup_catchup(Unit *u); bool unit_cgroup_delegate(Unit *u); +void unit_cgroup_disable_all_controllers(Unit *u); + int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name); int unit_cgroup_freezer_action(Unit *u, FreezerAction action); diff --git a/src/core/service.c b/src/core/service.c index 569a6871d60..63e65994218 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -13,6 +13,7 @@ #include "bus-common-errors.h" #include "bus-error.h" #include "bus-util.h" +#include "cgroup.h" #include "chase.h" #include "dbus-service.h" #include "dbus-unit.h" @@ -3174,8 +3175,10 @@ static int service_start(Unit *u) { exec_status_reset(&s->main_exec_status); CGroupRuntime *crt = unit_get_cgroup_runtime(u); - if (crt) + if (crt) { + unit_cgroup_disable_all_controllers(u); crt->reset_accounting = true; + } service_enter_condition(s); return 1; @@ -5640,6 +5643,7 @@ static int service_clean(Unit *u, ExecCleanMask mask) { goto fail; } + unit_cgroup_disable_all_controllers(u); r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid); if (r < 0) { log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m"); -- 2.47.3