/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include "sd-id128.h"
+
#include "alloc-util.h"
#include "dbus.h"
#include "dynamic-user.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "initrd-util.h"
+#include "job.h"
#include "manager.h"
#include "manager-serialize.h"
#include "parse-util.h"
#include "string-util.h"
#include "strv.h"
#include "syslog-util.h"
+#include "unit.h"
+#include "unit-name.h"
#include "unit-serialize.h"
#include "user-util.h"
#include "varlink.h"
return 0;
}
+static int manager_synthesize_orphaned_unit(
+ Manager *m,
+ const char *original_name,
+ const char *canonical_name,
+ FILE *f,
+ FDSet *fds) {
+
+ _cleanup_(unit_freep) Unit *orphan = NULL;
+ _cleanup_free_ char *orphaned_name = NULL;
+ sd_id128_t rnd;
+ UnitType t;
+ int r;
+
+ assert(m);
+ assert(original_name);
+ assert(canonical_name);
+ assert(f);
+ assert(fds);
+
+ t = unit_name_to_type(original_name);
+ if (t < 0)
+ return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Cannot synthesize unit for '%s' (overridden by alias to '%s'): invalid unit type. Skipping stale state.",
+ original_name, canonical_name);
+
+ /* Only transition units that track external resources, forget internal ones (eg: timers) */
+ if (!IN_SET(t, UNIT_SERVICE, UNIT_SCOPE, UNIT_MOUNT, UNIT_AUTOMOUNT))
+ return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Cannot synthesize unit for '%s' (overridden by alias to '%s'): unsupported unit type. Skipping stale state.",
+ original_name, canonical_name);
+
+ /* Use a naming convention with an "orphaned-" prefix to make it clear at a glance that these units
+ * were synthesized to adopt resources from a now-aliased unit */
+ r = sd_id128_randomize(&rnd);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to generate random ID for orphan unit: %m");
+
+ if (asprintf(&orphaned_name, "orphaned-r" SD_ID128_FORMAT_STR ".%s",
+ SD_ID128_FORMAT_VAL(rnd), unit_type_to_string(t)) < 0)
+ return log_oom();
+
+ r = unit_new_for_name(m, unit_vtable[t]->object_size, orphaned_name, &orphan);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to allocate orphan unit '%s': %m", orphaned_name);
+
+ /* Force the state as we know the unit file is gone, so it will hang around as long as resources are
+ * active, and then it will be automatically collected */
+ orphan->load_state = UNIT_NOT_FOUND;
+ /* Ensure the orphan also gets garbage-collected when it ends up in 'failed' state. */
+ orphan->collect_mode = COLLECT_INACTIVE_OR_FAILED;
+
+ _cleanup_free_ char *description = strjoin("Orphaned resources adopted from aliased unit ", original_name);
+ if (!description)
+ return log_oom();
+
+ r = unit_deserialize_state(orphan, f, fds);
+ if (r < 0)
+ return log_notice_errno(r, "Failed to deserialize state into orphan unit '%s': %m", orphaned_name);
+
+ /* From this point on the serialized stream for this unit has been fully consumed, so avoid failing. */
+
+ log_warning("Unit file for '%s' was overridden by a symlink to '%s'. Synthesized orphan unit '%s' to retain tracking of the previous unit's processes.",
+ original_name, canonical_name, orphaned_name);
+
+ /* Override Description= so it's clear what this is for and can be traced back to the original. */
+ free_and_replace(orphan->description, description);
+
+ /* Any jobs reinstalled from the deserialized state targeted the original unit. Most of them
+ * (start, restart, reload, ...) make no sense for the synthesized orphan, especially after a
+ * service-to-scope conversion, so cancel them. JOB_STOP is the exception: a stop request that was
+ * already pending on the original unit is exactly what we want to keep around, since it'll cause
+ * the synthesized unit to just go away immediately. */
+ if (orphan->job && orphan->job->type != JOB_STOP)
+ job_finish_and_invalidate(orphan->job, JOB_CANCELED, /* recursive= */ false, /* already= */ false);
+ if (orphan->nop_job)
+ job_finish_and_invalidate(orphan->nop_job, JOB_CANCELED, /* recursive= */ false, /* already= */ false);
+
+ TAKE_PTR(orphan);
+ return 0;
+}
+
static int manager_deserialize_one_unit(
Manager *m,
const char *name,
* so that any further aliases resolving to the same unit are skipped.
*
* The serialized data represents the old, independent unit. Deserializing this stale state
- * would corrupt the canonical unit's live state, so we must discard it.
+ * onto the canonical unit would corrupt its live state. Instead, we synthesize a new unloaded
+ * unit with a unique name and migrate the cgroup/PID/etc. tracking from the stale state
+ * into it, so that the resources from the previously independent unit remain tracked.
*
* Take as an example, a.service is running. Someone created symlink b.service -> a.service.
- * On first reload, the state file still has b.service as an independent dead unit (from
- * before the symlink existed), but b.service now resolves to a.service. We must discard
- * b.service's stale dead state to preserve a.service's running state.
+ * On first reload, the state file still has b.service as an independent unit (from before
+ * the symlink existed), but b.service now resolves to a.service. We retain a.service's
+ * running state, and synthesize an unloaded unit to keep tracking the processes that
+ * b.service used to own.
*
- * Note: This log message is checked in TEST-07-PID1.alias-corruption.sh, so the test case
- * may need adjustment if the message is changed.
+ * Note: Log messages from this code path are checked in TEST-07-PID1.alias-corruption.sh,
+ * so the test case may need adjustment if they are changed.
*/
- log_warning("Unit file for '%s' was overridden by a symlink to '%s', which also has serialized state. Skipping stale state of old unit. Any processes from the overridden unit are now abandoned!",
- name,
- u->id);
-
- return unit_deserialize_state_skip(f);
+ r = manager_synthesize_orphaned_unit(m, name, u->id, f, fds);
+ if (r < 0) /* If we fail to orphan for any reason, then discard the unit */
+ r = unit_deserialize_state_skip(f);
+ return r;
}
r = unit_deserialize_state(u, f, fds);
abandoned_pids=()
}
+# Wait until at least one job for a unit name matching the given prefix is
+# queued. Used to make sure the serialized state we're about to dump contains
+# embedded 'job\n...\n\n' subsections so the deserialization path is fully
+# exercised.
+wait_for_pending_job() {
+ local prefix="${1:?}"
+ local i unit
+
+ for i in {1..100}; do
+ for unit in $(systemctl list-jobs --no-legend | awk '{print $2}'); do
+ if [[ "$unit" == "$prefix"* ]]; then
+ return 0
+ fi
+ done
+ sleep 0.1
+ done
+
+ echo "ERROR: no $prefix* jobs are pending"
+ systemctl list-jobs || true
+ return 1
+}
+
run_test() {
local reload_cmd="${1:?}"
- # If "with_pending_jobs", also create many Type=oneshot units that hang in
- # "activating" state with a pending job, to ensure that the serialized state
- # contains embedded "job" subsections to fully exercise the deserialization
+ # If "with_pending_jobs", also queue hanging reload jobs on the running sus
+ # units, so that the serialized state contains embedded 'job' subsections
+ # to fully exercise the deserialization
local pending_jobs="${2:-}"
local n_sus=20
- local current_pid journal_warnings new_pid orig_pid pid reload_start unit warning_count
+ local current_pid journal_warnings new_pid orig_pid p pid reload_start orphan orphan_cgroup unit warning_count
+ local orphan_units
if [[ "$pending_jobs" == "with_pending_jobs" ]]; then
n_sus=100
# collect them. If they are dead/stopped, systemd can remove them from
# memory before serialization.
#
- # In with_pending_jobs mode they additionally get a pending restart job
- # queued via 'systemctl --no-block restart' AFTER they are running, so the
+ # In with_pending_jobs mode they additionally get a pending reload job
+ # queued via 'systemctl --no-block reload' AFTER they are running, so the
# serialized stream contains 'job\n...\n\n' subsections AND the units have
# a real MainPID. The skip-desync regression in unit_deserialize_state_skip()
# stops at the job subsection's empty line marker, leaving the rest of the
done
# Make sure at least one reload job is actually queued, otherwise the
# serialized stream might not contain any job subsections yet.
- for i in {1..100}; do
- [[ -n "$(systemctl list-jobs --no-legend | grep -E '^[[:space:]]*[0-9]+ sus-' || true)" ]] && break
- if (( i == 100 )); then
- echo "ERROR: no sus-*.service reload jobs are pending"
- systemctl list-jobs || true
- return 1
- fi
- sleep 0.1
- done
+ wait_for_pending_job sus-
fi
echo "Setup complete: 1 running legit unit, $n_sus ${pending_jobs:+job-bearing }sus units"
echo "legit.service PID remains $new_pid. Attempt $attempt passed."
- # Verify that all sus unit processes were abandoned (still running but no longer tracked)
- echo "Verifying sus unit processes were abandoned..."
+ # Verify that all sus unit processes were tracked in a synthesized
+ # orphan unit, not abandoned.
+ echo "Verifying sus unit processes were migrated into synthesized orphan units..."
for unit in "${!sus_pids[@]}"; do
pid=${sus_pids[$unit]}
# Process should still be running
if ! kill -0 "$pid" 2>/dev/null; then
- echo "ERROR: $unit process (PID $pid) was killed instead of abandoned!"
+ echo "ERROR: $unit process (PID $pid) was killed instead of being preserved!"
return 1
fi
- # But the alias should now either be inactive (MainPID=0) or resolve to legit's PID.
+ # The alias should now either be inactive (MainPID=0) or resolve to legit's PID.
current_pid=$(systemctl show -P MainPID "${unit}.service")
if ! (( current_pid == 0 || current_pid == new_pid )); then
echo "ERROR: $unit unexpectedly reports MainPID=$current_pid after aliasing!"
return 1
fi
- echo "$unit process (PID $pid) was correctly abandoned (still running, no longer tracked)"
+ echo "$unit process (PID $pid) is still running and the alias correctly does not claim it"
done
- # Check consistency between journal warnings and abandoned processes
- echo "Checking journal for stale state warnings..."
- journal_warnings=$(journalctl --since "$reload_start" --no-pager | grep "Skipping stale state" || true)
- warning_count=$(echo "$journal_warnings" | grep -c "Skipping stale state" || true)
+ # Verify that synthesized orphan units exist that cover all the
+ # previously-tracked PIDs. Each preserved process must belong to a
+ # orphaned-r* unit. Orphan units retain the original unit's type
+ # (.service here) and are marked load-state=not-found, so list-units
+ # with --all is required to see them.
+ echo "Verifying synthesized orphan units are present and own the PIDs..."
+ orphan_units=$(systemctl list-units --all --no-legend --plain 'orphaned-r*' | awk '{print $1}')
+ if [[ -z "$orphan_units" ]]; then
+ echo "ERROR: No orphaned-r* units were synthesized!"
+ systemctl list-units --all --no-legend --plain || true
+ return 1
+ fi
+ echo "Found orphan units:"
+ echo "$orphan_units"
+
+ # Build a set of all PIDs reported by any orphan unit (via Tasks/cgroup membership).
+ unset tracked_pids
+ declare -A tracked_pids
+ for orphan in $orphan_units; do
+ orphan_cgroup=$(systemctl show -P ControlGroup "$orphan")
+ if [[ -z "$orphan_cgroup" || ! -r "/sys/fs/cgroup${orphan_cgroup}/cgroup.procs" ]]; then
+ # Orphan unit may have been recorded with the original unit's cgroup path which still exists
+ echo "ERROR: Cannot read cgroup.procs for $orphan at expected path /sys/fs/cgroup${orphan_cgroup}/cgroup.procs"
+ return 1
+ fi
+ while read -r p; do
+ [[ -n "$p" ]] && tracked_pids[$p]=1
+ done < "/sys/fs/cgroup${orphan_cgroup}/cgroup.procs"
+ done
- echo "Found $warning_count 'Skipping stale state' warnings"
+ # Cross-check: every original sus PID must appear under exactly one orphan unit cgroup.
+ for unit in "${!sus_pids[@]}"; do
+ pid=${sus_pids[$unit]}
+ if [[ -z "${tracked_pids[$pid]:-}" ]]; then
+ echo "ERROR: PID $pid (from $unit) is not tracked by any synthesized orphan unit!"
+ echo "Orphan unit contents:"
+ for orphan in $orphan_units; do
+ echo " $orphan -> $(systemctl show -P ControlGroup "$orphan")"
+ done
+ return 1
+ fi
+ done
+ echo "All ${#sus_pids[@]} sus PIDs are tracked by synthesized orphan units."
+
+ # Check consistency between journal warnings and synthesized orphan units.
+ echo "Checking journal for 'Synthesized orphan unit' warnings..."
+ journal_warnings=$(journalctl --since "$reload_start" --no-pager | grep "Synthesized orphan unit" || true)
+ warning_count=$(echo "$journal_warnings" | grep -c "Synthesized orphan unit" || true)
+
+ echo "Found $warning_count 'Synthesized orphan unit' warnings"
- # Extract unit names from warnings and verify they match our sus units
if (( warning_count > 0 )); then
echo "Verifying warning consistency..."
for unit in "${!sus_pids[@]}"; do
done
fi
+ # Stop synthesized orphan units (which terminates their tracked
+ # processes) so we get a clean slate for the next iteration.
+ for orphan in $orphan_units; do
+ systemctl stop "$orphan" 2>/dev/null || true
+ done
+
reap_abandoned_pids
if (( attempt < 3 )); then
for i in $(seq -f "%03g" 1 "$n_sus"); do
systemctl --no-block reload sus-"${i}".service
done
- for i in {1..100}; do
- [[ -n "$(systemctl list-jobs --no-legend | grep -E '^[[:space:]]*[0-9]+ sus-' || true)" ]] && break
- if (( i == 100 )); then
- echo "ERROR: no sus-*.service reload jobs are pending after reset"
- systemctl list-jobs || true
- return 1
- fi
- sleep 0.1
- done
+ wait_for_pending_job sus-
fi
echo "Reset complete."
cleanup_test_units() {
reap_abandoned_pids || true
+ # Stop any leftover synthesized orphan units from previous iterations.
+ # Do this in two passes since a queued stop job from the deserialized
+ # state may take a moment to dispatch and tear the unit down.
+ for orphan in $(systemctl list-units --all --no-legend --plain 'orphaned-r*' 2>/dev/null | awk '{print $1}'); do
+ systemctl stop "$orphan" 2>/dev/null || true
+ done
+ for orphan in $(systemctl list-units --all --no-legend --plain 'orphaned-r*' 2>/dev/null | awk '{print $1}'); do
+ systemctl kill --signal=SIGKILL "$orphan" 2>/dev/null || true
+ systemctl reset-failed "$orphan" 2>/dev/null || true
+ done
systemctl stop legit.service 2>/dev/null || true
+ systemctl stop hung-stop.service 2>/dev/null || true
for i in $(seq -f "%03g" 1 100); do
systemctl stop sus-"${i}".service 2>/dev/null || true
rm -f /run/systemd/system/sus-"${i}".service
done
rm -f /run/systemd/system/legit.service
+ rm -f /run/systemd/system/hung-stop.service
+ systemctl daemon-reload
+}
+
+# Verify that a JOB_STOP that was pending on the original unit at the moment of
+# serialization is preserved on the synthesized orphan unit, so the stop is
+# eventually carried out (the orphan is torn down) instead of sitting around
+# forever.
+test_stop_job_preserved() {
+ local reload_cmd="${1:?}"
+ local hung_pid orphan stop_job substate
+
+ echo ""
+ echo "========================================="
+ echo "Testing pending-stop preservation with: systemctl $reload_cmd"
+ echo "========================================="
+
+ # Service whose main process traps SIGTERM and never exits, so a "stop"
+ # request stays pending in the queue and remains serialized.
+ cat >/run/systemd/system/legit.service <<'EOF'
+[Service]
+Type=simple
+ExecStart=/bin/sleep infinity
+EOF
+ cat >/run/systemd/system/hung-stop.service <<'EOF'
+[Service]
+Type=notify
+NotifyAccess=all
+ExecStart=/bin/bash -c 'trap "" TERM; systemd-notify --ready; while :; do sleep infinity & wait $!; done'
+TimeoutStopSec=infinity
+SendSIGKILL=no
+EOF
+
systemctl daemon-reload
+ systemctl start legit.service
+ systemctl start hung-stop.service
+
+ hung_pid=$(systemctl show -P MainPID hung-stop.service)
+ if (( hung_pid == 0 )); then
+ echo "ERROR: hung-stop.service did not start"
+ return 1
+ fi
+
+ # Queue a stop that will hang because the process traps SIGTERM and
+ # SendSIGKILL=no prevents escalation, so the job stays in the queue and
+ # is therefore present in the serialized state at reload time.
+ systemctl --no-block stop hung-stop.service
+ wait_for_pending_job hung-stop.service
+
+ # Convert hung-stop.service into a symlink to legit.service: on the next
+ # reload the original unit becomes an alias of legit.service, and its
+ # serialized state (including the pending stop job) is fed into the
+ # synthesized orphaned-r* orphan unit.
+ rm -f /run/systemd/system/hung-stop.service
+ ln -sf /run/systemd/system/legit.service /run/systemd/system/hung-stop.service
+
+ systemctl "$reload_cmd"
+
+ orphan=$(systemctl list-units --all --no-legend --plain 'orphaned-r*' | awk '{print $1}' | head -n1)
+ if [[ -z "$orphan" ]]; then
+ echo "ERROR: no synthesized orphan unit was created"
+ systemctl list-units --all --no-legend --plain || true
+ return 1
+ fi
+ echo "Synthesized orphan unit: $orphan"
+
+ # The pending stop job from the original unit must have been carried over
+ # to the synthesized orphan, otherwise the orphan (and its tracked
+ # process) is leaked across the reload. Check if:
+ # - the stop job is still queued,
+ # - the orphan is in a stop-* sub-state,
+ # - the orphan has already finished stopping (dead/failed),
+ # - the orphan was already garbage-collected (no SubState reported).
+ stop_job=$(systemctl show -P Job "$orphan" 2>/dev/null || true)
+ substate=$(systemctl show -P SubState "$orphan" 2>/dev/null || true)
+ if [[ -z "$stop_job" ]] && [[ -n "$substate" ]] && ! [[ "$substate" =~ ^(stop-|dead|failed) ]]; then
+ echo "ERROR: stop job for original hung-stop.service was not preserved on $orphan!"
+ echo "Current substate: $substate"
+ systemctl list-jobs || true
+ return 1
+ fi
+
+ echo "Stop job for original hung-stop.service was correctly preserved on synthesized orphan $orphan"
+
+ # Tear the hung orphan down for the next iteration.
+ systemctl kill --signal=SIGKILL "$orphan" 2>/dev/null || true
+ systemctl reset-failed "$orphan" 2>/dev/null || true
+ if kill -0 "$hung_pid" 2>/dev/null; then
+ kill -KILL "$hung_pid" 2>/dev/null || true
+ fi
+
+ rm -f /run/systemd/system/hung-stop.service /run/systemd/system/legit.service
+ systemctl daemon-reload
+
+ echo "$reload_cmd stop-job preservation test passed"
}
trap cleanup_test_units EXIT
run_test daemon-reload with_pending_jobs
cleanup_test_units
run_test daemon-reexec with_pending_jobs
+cleanup_test_units
+test_stop_job_preserved daemon-reload
+cleanup_test_units
+test_stop_job_preserved daemon-reexec