From 4f7089f0c740b9b0b5e55913282b61d851991562 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 7 May 2026 20:02:57 +0100 Subject: [PATCH] core: do not leak resources when handling stale alias state on reload The fix for the corrupted state when units become aliased on reload leaks the now-aliased unit's resources, which become untracked and essentially lost. While fixing the state corruption is of course necessary, leaking processes/etc. is not ideal for a system and service manager, so instead attempt to keep track of them by creating stub units on-the-fly as replacements. This way resources are not leaked, there are clear indications of where they moved, and all state can be tracked as expected. Ignore timers/sockets/paths as those are internal resources/triggers. Follow-up for a77c7a8224447890a304bd857f412c8103f217f1 --- man/systemctl.xml | 11 - man/systemd.unit.xml | 10 + src/core/manager-serialize.c | 110 ++++++++- test/units/TEST-07-PID1.alias-corruption.sh | 236 +++++++++++++++++--- 4 files changed, 310 insertions(+), 57 deletions(-) diff --git a/man/systemctl.xml b/man/systemctl.xml index 8b8d1065556..b8ba0f945e9 100644 --- a/man/systemctl.xml +++ b/man/systemctl.xml @@ -1499,17 +1499,6 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err systemd listens on behalf of user configuration will stay accessible. - When unit aliasing is introduced during reload (e.g., converting - b.service to a symlink pointing to - a.service), the running state of the canonical - unit (a.service) is preserved. The old serialized - state of the now-aliased unit is discarded to prevent stale data from - corrupting the canonical unit's live state. Dependencies referencing - the alias name are automatically resolved to the canonical unit, and - the dependency graph is rebuilt from unit files, ensuring consistency. - If the now-aliased unit had running processes, they are abandoned and - will no longer be tracked by the service manager. - This command should not be confused with the reload command. diff --git a/man/systemd.unit.xml b/man/systemd.unit.xml index 47c30d869fc..2f1467b720a 100644 --- a/man/systemd.unit.xml +++ b/man/systemd.unit.xml @@ -281,6 +281,16 @@ generally ignored. This includes names that start with a . or end with a .ignore. + When unit aliasing is introduced during reload/reexec (e.g., converting + b.service to a symlink pointing to a.service), the running + state of the canonical unit (a.service) is preserved. The old serialized state + of the now-aliased unit is migrated to a new stub orphaned unit to prevent stale data from + corrupting the canonical unit's live state. Dependencies referencing the alias name are automatically + resolved to the canonical unit, and the dependency graph is rebuilt from unit files, ensuring + consistency. If the now-aliased unit had resources such as running processes, they will now be tracked + under the new orphaned unit. Once all resources are gone (e.g. all processes have exited) the orphaned + unit will be garbage collected automatically. + The unit file format is covered by the Interface Portability and Stability Promise. diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c index c7948881648..7c557b2f3ff 100644 --- a/src/core/manager-serialize.c +++ b/src/core/manager-serialize.c @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include "sd-id128.h" + #include "alloc-util.h" #include "dbus.h" #include "dynamic-user.h" @@ -10,6 +12,7 @@ #include "glyph-util.h" #include "hashmap.h" #include "initrd-util.h" +#include "job.h" #include "manager.h" #include "manager-serialize.h" #include "parse-util.h" @@ -18,6 +21,8 @@ #include "string-util.h" #include "strv.h" #include "syslog-util.h" +#include "unit.h" +#include "unit-name.h" #include "unit-serialize.h" #include "user-util.h" #include "varlink.h" @@ -235,6 +240,87 @@ static int manager_collect_serialized_unit_names(FILE *f, Set **ret) { return 0; } +static int manager_synthesize_orphaned_unit( + Manager *m, + const char *original_name, + const char *canonical_name, + FILE *f, + FDSet *fds) { + + _cleanup_(unit_freep) Unit *orphan = NULL; + _cleanup_free_ char *orphaned_name = NULL; + sd_id128_t rnd; + UnitType t; + int r; + + assert(m); + assert(original_name); + assert(canonical_name); + assert(f); + assert(fds); + + t = unit_name_to_type(original_name); + if (t < 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot synthesize unit for '%s' (overridden by alias to '%s'): invalid unit type. Skipping stale state.", + original_name, canonical_name); + + /* Only transition units that track external resources, forget internal ones (eg: timers) */ + if (!IN_SET(t, UNIT_SERVICE, UNIT_SCOPE, UNIT_MOUNT, UNIT_AUTOMOUNT)) + return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Cannot synthesize unit for '%s' (overridden by alias to '%s'): unsupported unit type. Skipping stale state.", + original_name, canonical_name); + + /* Use a naming convention with an "orphaned-" prefix to make it clear at a glance that these units + * were synthesized to adopt resources from a now-aliased unit */ + r = sd_id128_randomize(&rnd); + if (r < 0) + return log_warning_errno(r, "Failed to generate random ID for orphan unit: %m"); + + if (asprintf(&orphaned_name, "orphaned-r" SD_ID128_FORMAT_STR ".%s", + SD_ID128_FORMAT_VAL(rnd), unit_type_to_string(t)) < 0) + return log_oom(); + + r = unit_new_for_name(m, unit_vtable[t]->object_size, orphaned_name, &orphan); + if (r < 0) + return log_warning_errno(r, "Failed to allocate orphan unit '%s': %m", orphaned_name); + + /* Force the state as we know the unit file is gone, so it will hang around as long as resources are + * active, and then it will be automatically collected */ + orphan->load_state = UNIT_NOT_FOUND; + /* Ensure the orphan also gets garbage-collected when it ends up in 'failed' state. */ + orphan->collect_mode = COLLECT_INACTIVE_OR_FAILED; + + _cleanup_free_ char *description = strjoin("Orphaned resources adopted from aliased unit ", original_name); + if (!description) + return log_oom(); + + r = unit_deserialize_state(orphan, f, fds); + if (r < 0) + return log_notice_errno(r, "Failed to deserialize state into orphan unit '%s': %m", orphaned_name); + + /* From this point on the serialized stream for this unit has been fully consumed, so avoid failing. */ + + log_warning("Unit file for '%s' was overridden by a symlink to '%s'. Synthesized orphan unit '%s' to retain tracking of the previous unit's processes.", + original_name, canonical_name, orphaned_name); + + /* Override Description= so it's clear what this is for and can be traced back to the original. */ + free_and_replace(orphan->description, description); + + /* Any jobs reinstalled from the deserialized state targeted the original unit. Most of them + * (start, restart, reload, ...) make no sense for the synthesized orphan, especially after a + * service-to-scope conversion, so cancel them. JOB_STOP is the exception: a stop request that was + * already pending on the original unit is exactly what we want to keep around, since it'll cause + * the synthesized unit to just go away immediately. */ + if (orphan->job && orphan->job->type != JOB_STOP) + job_finish_and_invalidate(orphan->job, JOB_CANCELED, /* recursive= */ false, /* already= */ false); + if (orphan->nop_job) + job_finish_and_invalidate(orphan->nop_job, JOB_CANCELED, /* recursive= */ false, /* already= */ false); + + TAKE_PTR(orphan); + return 0; +} + static int manager_deserialize_one_unit( Manager *m, const char *name, @@ -274,21 +360,23 @@ static int manager_deserialize_one_unit( * so that any further aliases resolving to the same unit are skipped. * * The serialized data represents the old, independent unit. Deserializing this stale state - * would corrupt the canonical unit's live state, so we must discard it. + * onto the canonical unit would corrupt its live state. Instead, we synthesize a new unloaded + * unit with a unique name and migrate the cgroup/PID/etc. tracking from the stale state + * into it, so that the resources from the previously independent unit remain tracked. * * Take as an example, a.service is running. Someone created symlink b.service -> a.service. - * On first reload, the state file still has b.service as an independent dead unit (from - * before the symlink existed), but b.service now resolves to a.service. We must discard - * b.service's stale dead state to preserve a.service's running state. + * On first reload, the state file still has b.service as an independent unit (from before + * the symlink existed), but b.service now resolves to a.service. We retain a.service's + * running state, and synthesize an unloaded unit to keep tracking the processes that + * b.service used to own. * - * Note: This log message is checked in TEST-07-PID1.alias-corruption.sh, so the test case - * may need adjustment if the message is changed. + * Note: Log messages from this code path are checked in TEST-07-PID1.alias-corruption.sh, + * so the test case may need adjustment if they are changed. */ - log_warning("Unit file for '%s' was overridden by a symlink to '%s', which also has serialized state. Skipping stale state of old unit. Any processes from the overridden unit are now abandoned!", - name, - u->id); - - return unit_deserialize_state_skip(f); + r = manager_synthesize_orphaned_unit(m, name, u->id, f, fds); + if (r < 0) /* If we fail to orphan for any reason, then discard the unit */ + r = unit_deserialize_state_skip(f); + return r; } r = unit_deserialize_state(u, f, fds); diff --git a/test/units/TEST-07-PID1.alias-corruption.sh b/test/units/TEST-07-PID1.alias-corruption.sh index 009ee484a02..dea08f04041 100755 --- a/test/units/TEST-07-PID1.alias-corruption.sh +++ b/test/units/TEST-07-PID1.alias-corruption.sh @@ -56,14 +56,37 @@ reap_abandoned_pids() { abandoned_pids=() } +# Wait until at least one job for a unit name matching the given prefix is +# queued. Used to make sure the serialized state we're about to dump contains +# embedded 'job\n...\n\n' subsections so the deserialization path is fully +# exercised. +wait_for_pending_job() { + local prefix="${1:?}" + local i unit + + for i in {1..100}; do + for unit in $(systemctl list-jobs --no-legend | awk '{print $2}'); do + if [[ "$unit" == "$prefix"* ]]; then + return 0 + fi + done + sleep 0.1 + done + + echo "ERROR: no $prefix* jobs are pending" + systemctl list-jobs || true + return 1 +} + run_test() { local reload_cmd="${1:?}" - # If "with_pending_jobs", also create many Type=oneshot units that hang in - # "activating" state with a pending job, to ensure that the serialized state - # contains embedded "job" subsections to fully exercise the deserialization + # If "with_pending_jobs", also queue hanging reload jobs on the running sus + # units, so that the serialized state contains embedded 'job' subsections + # to fully exercise the deserialization local pending_jobs="${2:-}" local n_sus=20 - local current_pid journal_warnings new_pid orig_pid pid reload_start unit warning_count + local current_pid journal_warnings new_pid orig_pid p pid reload_start orphan orphan_cgroup unit warning_count + local orphan_units if [[ "$pending_jobs" == "with_pending_jobs" ]]; then n_sus=100 @@ -84,8 +107,8 @@ EOF # collect them. If they are dead/stopped, systemd can remove them from # memory before serialization. # - # In with_pending_jobs mode they additionally get a pending restart job - # queued via 'systemctl --no-block restart' AFTER they are running, so the + # In with_pending_jobs mode they additionally get a pending reload job + # queued via 'systemctl --no-block reload' AFTER they are running, so the # serialized stream contains 'job\n...\n\n' subsections AND the units have # a real MainPID. The skip-desync regression in unit_deserialize_state_skip() # stops at the job subsection's empty line marker, leaving the rest of the @@ -124,15 +147,7 @@ EOF done # Make sure at least one reload job is actually queued, otherwise the # serialized stream might not contain any job subsections yet. - for i in {1..100}; do - [[ -n "$(systemctl list-jobs --no-legend | grep -E '^[[:space:]]*[0-9]+ sus-' || true)" ]] && break - if (( i == 100 )); then - echo "ERROR: no sus-*.service reload jobs are pending" - systemctl list-jobs || true - return 1 - fi - sleep 0.1 - done + wait_for_pending_job sus- fi echo "Setup complete: 1 running legit unit, $n_sus ${pending_jobs:+job-bearing }sus units" @@ -188,32 +203,76 @@ EOF echo "legit.service PID remains $new_pid. Attempt $attempt passed." - # Verify that all sus unit processes were abandoned (still running but no longer tracked) - echo "Verifying sus unit processes were abandoned..." + # Verify that all sus unit processes were tracked in a synthesized + # orphan unit, not abandoned. + echo "Verifying sus unit processes were migrated into synthesized orphan units..." for unit in "${!sus_pids[@]}"; do pid=${sus_pids[$unit]} # Process should still be running if ! kill -0 "$pid" 2>/dev/null; then - echo "ERROR: $unit process (PID $pid) was killed instead of abandoned!" + echo "ERROR: $unit process (PID $pid) was killed instead of being preserved!" return 1 fi - # But the alias should now either be inactive (MainPID=0) or resolve to legit's PID. + # The alias should now either be inactive (MainPID=0) or resolve to legit's PID. current_pid=$(systemctl show -P MainPID "${unit}.service") if ! (( current_pid == 0 || current_pid == new_pid )); then echo "ERROR: $unit unexpectedly reports MainPID=$current_pid after aliasing!" return 1 fi - echo "$unit process (PID $pid) was correctly abandoned (still running, no longer tracked)" + echo "$unit process (PID $pid) is still running and the alias correctly does not claim it" done - # Check consistency between journal warnings and abandoned processes - echo "Checking journal for stale state warnings..." - journal_warnings=$(journalctl --since "$reload_start" --no-pager | grep "Skipping stale state" || true) - warning_count=$(echo "$journal_warnings" | grep -c "Skipping stale state" || true) + # Verify that synthesized orphan units exist that cover all the + # previously-tracked PIDs. Each preserved process must belong to a + # orphaned-r* unit. Orphan units retain the original unit's type + # (.service here) and are marked load-state=not-found, so list-units + # with --all is required to see them. + echo "Verifying synthesized orphan units are present and own the PIDs..." + orphan_units=$(systemctl list-units --all --no-legend --plain 'orphaned-r*' | awk '{print $1}') + if [[ -z "$orphan_units" ]]; then + echo "ERROR: No orphaned-r* units were synthesized!" + systemctl list-units --all --no-legend --plain || true + return 1 + fi + echo "Found orphan units:" + echo "$orphan_units" + + # Build a set of all PIDs reported by any orphan unit (via Tasks/cgroup membership). + unset tracked_pids + declare -A tracked_pids + for orphan in $orphan_units; do + orphan_cgroup=$(systemctl show -P ControlGroup "$orphan") + if [[ -z "$orphan_cgroup" || ! -r "/sys/fs/cgroup${orphan_cgroup}/cgroup.procs" ]]; then + # Orphan unit may have been recorded with the original unit's cgroup path which still exists + echo "ERROR: Cannot read cgroup.procs for $orphan at expected path /sys/fs/cgroup${orphan_cgroup}/cgroup.procs" + return 1 + fi + while read -r p; do + [[ -n "$p" ]] && tracked_pids[$p]=1 + done < "/sys/fs/cgroup${orphan_cgroup}/cgroup.procs" + done - echo "Found $warning_count 'Skipping stale state' warnings" + # Cross-check: every original sus PID must appear under exactly one orphan unit cgroup. + for unit in "${!sus_pids[@]}"; do + pid=${sus_pids[$unit]} + if [[ -z "${tracked_pids[$pid]:-}" ]]; then + echo "ERROR: PID $pid (from $unit) is not tracked by any synthesized orphan unit!" + echo "Orphan unit contents:" + for orphan in $orphan_units; do + echo " $orphan -> $(systemctl show -P ControlGroup "$orphan")" + done + return 1 + fi + done + echo "All ${#sus_pids[@]} sus PIDs are tracked by synthesized orphan units." + + # Check consistency between journal warnings and synthesized orphan units. + echo "Checking journal for 'Synthesized orphan unit' warnings..." + journal_warnings=$(journalctl --since "$reload_start" --no-pager | grep "Synthesized orphan unit" || true) + warning_count=$(echo "$journal_warnings" | grep -c "Synthesized orphan unit" || true) + + echo "Found $warning_count 'Synthesized orphan unit' warnings" - # Extract unit names from warnings and verify they match our sus units if (( warning_count > 0 )); then echo "Verifying warning consistency..." for unit in "${!sus_pids[@]}"; do @@ -223,6 +282,12 @@ EOF done fi + # Stop synthesized orphan units (which terminates their tracked + # processes) so we get a clean slate for the next iteration. + for orphan in $orphan_units; do + systemctl stop "$orphan" 2>/dev/null || true + done + reap_abandoned_pids if (( attempt < 3 )); then @@ -252,15 +317,7 @@ EOF for i in $(seq -f "%03g" 1 "$n_sus"); do systemctl --no-block reload sus-"${i}".service done - for i in {1..100}; do - [[ -n "$(systemctl list-jobs --no-legend | grep -E '^[[:space:]]*[0-9]+ sus-' || true)" ]] && break - if (( i == 100 )); then - echo "ERROR: no sus-*.service reload jobs are pending after reset" - systemctl list-jobs || true - return 1 - fi - sleep 0.1 - done + wait_for_pending_job sus- fi echo "Reset complete." @@ -274,13 +331,118 @@ EOF cleanup_test_units() { reap_abandoned_pids || true + # Stop any leftover synthesized orphan units from previous iterations. + # Do this in two passes since a queued stop job from the deserialized + # state may take a moment to dispatch and tear the unit down. + for orphan in $(systemctl list-units --all --no-legend --plain 'orphaned-r*' 2>/dev/null | awk '{print $1}'); do + systemctl stop "$orphan" 2>/dev/null || true + done + for orphan in $(systemctl list-units --all --no-legend --plain 'orphaned-r*' 2>/dev/null | awk '{print $1}'); do + systemctl kill --signal=SIGKILL "$orphan" 2>/dev/null || true + systemctl reset-failed "$orphan" 2>/dev/null || true + done systemctl stop legit.service 2>/dev/null || true + systemctl stop hung-stop.service 2>/dev/null || true for i in $(seq -f "%03g" 1 100); do systemctl stop sus-"${i}".service 2>/dev/null || true rm -f /run/systemd/system/sus-"${i}".service done rm -f /run/systemd/system/legit.service + rm -f /run/systemd/system/hung-stop.service + systemctl daemon-reload +} + +# Verify that a JOB_STOP that was pending on the original unit at the moment of +# serialization is preserved on the synthesized orphan unit, so the stop is +# eventually carried out (the orphan is torn down) instead of sitting around +# forever. +test_stop_job_preserved() { + local reload_cmd="${1:?}" + local hung_pid orphan stop_job substate + + echo "" + echo "=========================================" + echo "Testing pending-stop preservation with: systemctl $reload_cmd" + echo "=========================================" + + # Service whose main process traps SIGTERM and never exits, so a "stop" + # request stays pending in the queue and remains serialized. + cat >/run/systemd/system/legit.service <<'EOF' +[Service] +Type=simple +ExecStart=/bin/sleep infinity +EOF + cat >/run/systemd/system/hung-stop.service <<'EOF' +[Service] +Type=notify +NotifyAccess=all +ExecStart=/bin/bash -c 'trap "" TERM; systemd-notify --ready; while :; do sleep infinity & wait $!; done' +TimeoutStopSec=infinity +SendSIGKILL=no +EOF + systemctl daemon-reload + systemctl start legit.service + systemctl start hung-stop.service + + hung_pid=$(systemctl show -P MainPID hung-stop.service) + if (( hung_pid == 0 )); then + echo "ERROR: hung-stop.service did not start" + return 1 + fi + + # Queue a stop that will hang because the process traps SIGTERM and + # SendSIGKILL=no prevents escalation, so the job stays in the queue and + # is therefore present in the serialized state at reload time. + systemctl --no-block stop hung-stop.service + wait_for_pending_job hung-stop.service + + # Convert hung-stop.service into a symlink to legit.service: on the next + # reload the original unit becomes an alias of legit.service, and its + # serialized state (including the pending stop job) is fed into the + # synthesized orphaned-r* orphan unit. + rm -f /run/systemd/system/hung-stop.service + ln -sf /run/systemd/system/legit.service /run/systemd/system/hung-stop.service + + systemctl "$reload_cmd" + + orphan=$(systemctl list-units --all --no-legend --plain 'orphaned-r*' | awk '{print $1}' | head -n1) + if [[ -z "$orphan" ]]; then + echo "ERROR: no synthesized orphan unit was created" + systemctl list-units --all --no-legend --plain || true + return 1 + fi + echo "Synthesized orphan unit: $orphan" + + # The pending stop job from the original unit must have been carried over + # to the synthesized orphan, otherwise the orphan (and its tracked + # process) is leaked across the reload. Check if: + # - the stop job is still queued, + # - the orphan is in a stop-* sub-state, + # - the orphan has already finished stopping (dead/failed), + # - the orphan was already garbage-collected (no SubState reported). + stop_job=$(systemctl show -P Job "$orphan" 2>/dev/null || true) + substate=$(systemctl show -P SubState "$orphan" 2>/dev/null || true) + if [[ -z "$stop_job" ]] && [[ -n "$substate" ]] && ! [[ "$substate" =~ ^(stop-|dead|failed) ]]; then + echo "ERROR: stop job for original hung-stop.service was not preserved on $orphan!" + echo "Current substate: $substate" + systemctl list-jobs || true + return 1 + fi + + echo "Stop job for original hung-stop.service was correctly preserved on synthesized orphan $orphan" + + # Tear the hung orphan down for the next iteration. + systemctl kill --signal=SIGKILL "$orphan" 2>/dev/null || true + systemctl reset-failed "$orphan" 2>/dev/null || true + if kill -0 "$hung_pid" 2>/dev/null; then + kill -KILL "$hung_pid" 2>/dev/null || true + fi + + rm -f /run/systemd/system/hung-stop.service /run/systemd/system/legit.service + systemctl daemon-reload + + echo "$reload_cmd stop-job preservation test passed" } trap cleanup_test_units EXIT @@ -292,3 +454,7 @@ cleanup_test_units run_test daemon-reload with_pending_jobs cleanup_test_units run_test daemon-reexec with_pending_jobs +cleanup_test_units +test_stop_job_preserved daemon-reload +cleanup_test_units +test_stop_job_preserved daemon-reexec -- 2.47.3