systemd listens on behalf of user configuration will stay
accessible.</para>
+ <para>When unit aliasing is introduced during a reload (e.g., by converting
+ <filename>b.service</filename> into a symlink pointing to
+ <filename>a.service</filename>), the running state of the canonical
+ unit (<filename>a.service</filename>) is preserved. The previously serialized
+ state of the now-aliased unit is discarded so that stale data cannot
+ corrupt the canonical unit's live state. Dependencies referencing
+ the alias name are automatically resolved to the canonical unit, and
+ the dependency graph is rebuilt from the unit files, ensuring consistency.
+ If the now-aliased unit had running processes, they are abandoned: they
+ keep running, but are no longer tracked by the service manager.</para>
+
<para>This command should not be confused with the
<command>reload</command> command.</para>
</listitem>
#include "manager-serialize.h"
#include "parse-util.h"
#include "serialize.h"
+#include "set.h"
#include "string-util.h"
#include "strv.h"
#include "syslog-util.h"
return 0;
}
-static int manager_deserialize_one_unit(Manager *m, const char *name, FILE *f, FDSet *fds) {
+/* Pre-scans the unit section of the serialization stream and gathers the name of every unit that has a
+ * serialized state entry. The stream position is restored afterwards, so that the caller can parse the
+ * very same data again. On success 0 is returned and the set of names is handed over in *ret (NULL if
+ * no entry was found); on failure a negative error value is returned and *ret is left untouched. */
+static int manager_collect_serialized_unit_names(FILE *f, Set **ret) {
+        _cleanup_set_free_ Set *names = NULL;
+        off_t start;
+        int r;
+
+        assert(f);
+        assert(ret);
+
+        /* Remember where the scan begins, so that we can rewind once we are done. */
+        start = ftello(f);
+        if (start < 0)
+                return log_error_errno(errno, "Failed to determine serialization offset: %m");
+
+        for (;;) {
+                _cleanup_free_ char *unit_name = NULL;
+
+                /* Every entry is introduced by a line carrying the unit name. */
+                r = read_stripped_line(f, LONG_LINE_MAX, &unit_name);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read serialization line: %m");
+                if (r == 0)
+                        break;
+
+                /* Ownership of the string is passed to the set. */
+                r = set_ensure_consume(&names, &string_hash_ops_free, TAKE_PTR(unit_name));
+                if (r < 0)
+                        return log_oom();
+
+                /* Only the name matters for this pass, hence jump over the unit's payload. */
+                r = unit_deserialize_state_skip(f);
+                if (r < 0)
+                        return r;
+        }
+
+        if (fseeko(f, start, SEEK_SET) < 0)
+                return log_error_errno(errno, "Failed to reset serialization offset: %m");
+
+        *ret = TAKE_PTR(names);
+        return 0;
+}
+
+/* Processes the serialized state entry that was stored under "name" in stream f.
+ *
+ * serialized_units holds the names of the units that have a state entry. If "name" now resolves to a
+ * unit with a different ID that is itself listed in that set, the entry is considered stale: a warning
+ * is logged and the entry is skipped (the result of unit_deserialize_state_skip() is returned).
+ * Otherwise the entry is deserialized into the resolved unit, and, if the ID differs from "name", the
+ * ID is added to the set.
+ *
+ * Returns 0 on success, the result of log_oom() on memory allocation failure, and the result of
+ * log_notice_errno() if loading the unit or deserializing its state fails. */
+static int manager_deserialize_one_unit(
+ Manager *m,
+ const char *name,
+ FILE *f,
+ FDSet *fds,
+ Set *serialized_units) {
+
Unit *u;
int r;
if (r < 0)
return log_notice_errno(r, "Failed to load unit \"%s\", skipping deserialization: %m", name);
+ if (!streq(u->id, name) &&
+ set_contains(serialized_units, u->id)) {
+ /*
+ * The unit from the state file (name) resolved to a different canonical unit (u->id), and
+ * the canonical unit also has its own state entry.
+ *
+ * This means the state entry for the unit name is stale: when the state was serialized, the
+ * name referred to an independent unit, but it now resolves as an alias to the canonical
+ * unit. The serialized data hence represents the old, independent unit. Deserializing it
+ * would overwrite the canonical unit's own serialized state, and thus corrupt its live
+ * runtime state, so we must discard it.
+ *
+ * It is very important to note that this only affects units that were independent when the
+ * state file was written, but are now aliases (either because a reload created the symlink,
+ * or the symlink existed but this is the first reload). Normal aliases that were already
+ * aliases during the most recent serialization are filtered out in manager_serialize(), so
+ * they never appear in the state file.
+ *
+ * If the canonical unit does not have its own state entry, then this is instead a rename or
+ * canonical ID change, and this state entry is the only state we have for the unit. In that
+ * case we must preserve it. After doing so, we insert the canonical unit's ID into the set
+ * so that any further aliases resolving to the same unit are skipped.
+ *
+ * As an example, assume a.service is running and someone creates the symlink
+ * b.service -> a.service. On the first reload, the state file still has b.service as an
+ * independent dead unit (from before the symlink existed), but b.service now resolves to
+ * a.service. We must discard b.service's stale dead state to preserve a.service's running
+ * state.
+ *
+ * Note: This log message is checked in TEST-07-PID1.alias-corruption.sh, so the test case
+ * may need adjustment if the message is changed.
+ */
+ log_warning("Unit file for '%s' was overridden by a symlink to '%s', which also has serialized state. Skipping stale state of old unit. Any processes from the overridden unit are now abandoned!",
+ name,
+ u->id);
+
+ return unit_deserialize_state_skip(f);
+ }
+
r = unit_deserialize_state(u, f, fds);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
return log_notice_errno(r, "Failed to deserialize unit \"%s\", skipping: %m", name);
+ /* If this unit was deserialized under an alias name (that is, it is a rename), record the canonical
+ * ID so that any further aliases pointing to the same unit are correctly skipped.
+ *
+ * NOTE(review): serialized_units is a by-value copy of the caller's pointer. If it were NULL
+ * here, a set newly allocated by set_put_strdup() would presumably neither be visible to the
+ * caller nor be freed by it. This looks safe only as long as the caller always passes a non-NULL
+ * set that already contains at least "name" — TODO confirm. */
+ if (!streq(u->id, name)) {
+ r = set_put_strdup(&serialized_units, u->id);
+ if (r < 0)
+ return log_oom();
+ }
+
return 0;
}
-static int manager_deserialize_units(Manager *m, FILE *f, FDSet *fds) {
+static int manager_deserialize_units(
+ Manager *m,
+ FILE *f,
+ FDSet *fds) {
+
+ _cleanup_set_free_ Set *serialized_units = NULL;
int r;
+ r = manager_collect_serialized_unit_names(f, &serialized_units);
+ if (r < 0)
+ return r;
+
for (;;) {
_cleanup_free_ char *line = NULL;
if (r == 0)
break;
- r = manager_deserialize_one_unit(m, line, f, fds);
+ r = manager_deserialize_one_unit(m, line, f, fds, serialized_units);
if (r == -ENOMEM)
return r;
if (r < 0) {
--- /dev/null
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+set -eux
+set -o pipefail
+
+# Verify that stale alias state doesn't overwrite canonical unit state.
+# 1. Legit unit is running (PID A).
+# 2. Sus units are running (PID B, C, D...).
+# 3. We alias sus -> legit.
+# 4. If the bug triggers, legit unit's state is overwritten by a sus unit's state.
+# 5. Legit unit thinks it is now PID B (or C, or D...).
+# 6. We detect this PID change as proof of corruption.
+
+# PIDs of sus unit processes recorded by run_test(); reap_abandoned_pids() kills them and empties the
+# list again.
+declare -a abandoned_pids=()
+
+# Gets rid of every process listed in abandoned_pids: each one is first sent the default kill signal
+# and given up to 50 polls (100ms apart) to exit, then sent SIGKILL and given the same grace period
+# again. Returns 1 as soon as a process survives both rounds; empties the list on success.
+reap_abandoned_pids() {
+    local victim tries
+
+    (( ${#abandoned_pids[@]} > 0 )) || return 0
+
+    echo "Reaping ${#abandoned_pids[@]} abandoned processes..."
+
+    # Signal everything up front, before we start waiting for any individual process.
+    for victim in "${abandoned_pids[@]}"; do
+        kill "$victim" 2>/dev/null || :
+    done
+
+    for victim in "${abandoned_pids[@]}"; do
+        # First grace period: wait for the process to exit on its own.
+        for (( tries = 0; tries < 50; tries++ )); do
+            kill -0 "$victim" 2>/dev/null || break
+            sleep 0.1
+        done
+
+        # Still around? Then stop being polite.
+        if kill -0 "$victim" 2>/dev/null; then
+            kill -KILL "$victim" 2>/dev/null || :
+        fi
+
+        # Second grace period, this time for SIGKILL to take effect.
+        for (( tries = 0; tries < 50; tries++ )); do
+            kill -0 "$victim" 2>/dev/null || break
+            sleep 0.1
+        done
+
+        if kill -0 "$victim" 2>/dev/null; then
+            echo "ERROR: Failed to reap abandoned process PID $victim"
+            return 1
+        fi
+    done
+
+    abandoned_pids=()
+}
+
+# Runs the alias corruption scenario with the given systemctl reload verb ($1, e.g. daemon-reload or
+# daemon-reexec): starts legit.service plus 20 sus units, turns the sus units into symlinks to
+# legit.service, reloads, and verifies that legit.service kept its MainPID while the sus processes
+# are still alive. The cycle is repeated three times. Returns 1 on the first failed check.
+run_test() {
+    local reload_cmd="${1:?}"
+    # Loop variables are declared local too, so that nothing leaks into the global scope.
+    local attempt current_pid i journal_warnings new_pid orig_pid pid reload_start unit warning_count
+    local -A sus_pids=()
+
+    echo ""
+    echo "========================================="
+    echo "Testing with: systemctl $reload_cmd"
+    echo "========================================="
+
+    cat >/run/systemd/system/legit.service <<'EOF'
+[Service]
+Type=simple
+ExecStart=/bin/sleep infinity
+EOF
+
+    # Create 20 sus units. They must be Type=simple/running so systemd
+    # CANNOT garbage collect them. If they are dead/stopped, systemd can remove
+    # them from memory before serialization
+    echo "Creating 20 sus units..."
+    for i in $(seq -f "%02g" 1 20); do
+        cat >/run/systemd/system/sus-"${i}".service <<'EOF'
+[Service]
+Type=simple
+ExecStart=/bin/sleep infinity
+EOF
+    done
+
+    systemctl daemon-reload
+
+    echo "Starting legit unit..."
+    systemctl start legit.service
+
+    echo "Starting sus units..."
+    for i in $(seq -f "%02g" 1 20); do
+        systemctl start sus-"${i}".service
+    done
+
+    echo "Setup complete: 1 running legit unit, 20 running sus units"
+
+    orig_pid=$(systemctl show -P MainPID legit.service)
+    echo "Original legit PID: $orig_pid"
+
+    if (( orig_pid == 0 )); then
+        echo "Error: Legit PID is 0, setup failed."
+        return 1
+    fi
+
+    # Since ordering is not deterministic we should loop 3 times to reduce
+    # false negative rate (ordering luck). With this it's roughly 0.01% chance
+    # of falsely passing. Falsely failing does not happen, though.
+    for attempt in 1 2 3; do
+        echo ""
+        echo "--- Attempt $attempt/3 ---"
+
+        # Start every attempt with an empty unit -> PID map.
+        sus_pids=()
+        for i in $(seq -f "%02g" 1 20); do
+            pid=$(systemctl show -P MainPID sus-"${i}".service)
+            if (( pid != 0 )); then
+                sus_pids["sus-${i}"]=$pid
+                abandoned_pids+=("$pid")
+                echo "sus-${i}.service PID: $pid"
+            fi
+        done
+
+        echo "Converting sus units to symlinks -> legit.service..."
+        for i in $(seq -f "%02g" 1 20); do
+            rm -f /run/systemd/system/sus-"${i}".service
+            ln -sf /run/systemd/system/legit.service /run/systemd/system/sus-"${i}".service
+        done
+
+        reload_start=$(date '+%Y-%m-%d %H:%M:%S')
+
+        echo "Running $reload_cmd..."
+        systemctl "$reload_cmd"
+
+        # If the bug triggered, legit.service deserialized a sus unit's state
+        # and overwrote its own MainPID with the sus unit's PID.
+        new_pid=$(systemctl show -P MainPID legit.service)
+
+        if [[ "$new_pid" != "$orig_pid" ]]; then
+            echo "legit.service PID changed from $orig_pid to $new_pid!"
+            echo "The stale alias state corrupted the canonical unit."
+            return 1
+        fi
+
+        echo "legit.service PID remains $new_pid. Attempt $attempt passed."
+
+        # Verify that all sus unit processes were abandoned (still running but no longer tracked)
+        echo "Verifying sus unit processes were abandoned..."
+        for unit in "${!sus_pids[@]}"; do
+            pid=${sus_pids[$unit]}
+            # Process should still be running
+            if ! kill -0 "$pid" 2>/dev/null; then
+                echo "ERROR: $unit process (PID $pid) was killed instead of abandoned!"
+                return 1
+            fi
+            # But the alias should now either be inactive (MainPID=0) or resolve to legit's PID.
+            current_pid=$(systemctl show -P MainPID "${unit}.service")
+            if ! (( current_pid == 0 || current_pid == new_pid )); then
+                echo "ERROR: $unit unexpectedly reports MainPID=$current_pid after aliasing!"
+                return 1
+            fi
+            echo "$unit process (PID $pid) was correctly abandoned (still running, no longer tracked)"
+        done
+
+        # Check consistency between journal warnings and abandoned processes. Sync first, so that
+        # messages logged during the reload are guaranteed to be visible to the query below.
+        echo "Checking journal for stale state warnings..."
+        journalctl --sync
+        journal_warnings=$(journalctl --since "$reload_start" --no-pager | grep "Skipping stale state" || true)
+        warning_count=$(echo "$journal_warnings" | grep -c "Skipping stale state" || true)
+
+        echo "Found $warning_count 'Skipping stale state' warnings"
+
+        # Extract unit names from warnings and verify they match our sus units. This check is
+        # advisory only: a missing warning is reported but does not fail the test.
+        if (( warning_count > 0 )); then
+            echo "Verifying warning consistency..."
+            for unit in "${!sus_pids[@]}"; do
+                if [[ "$journal_warnings" != *"${unit}.service"* ]]; then
+                    echo "WARNING: Expected journal warning for ${unit}.service but didn't find it"
+                fi
+            done
+        fi
+
+        reap_abandoned_pids
+
+        if (( attempt < 3 )); then
+            echo "Resetting sus units..."
+
+            # We must fully reset to get independent running units again
+            for i in $(seq -f "%02g" 1 20); do
+                rm -f /run/systemd/system/sus-"${i}".service
+                cat >/run/systemd/system/sus-"${i}".service <<'EOF'
+[Service]
+Type=simple
+ExecStart=/bin/sleep infinity
+EOF
+            done
+
+            systemctl "$reload_cmd"
+
+            # Ensure they are running again (they might have been
+            # abandoned/killed during the transition)
+            for i in $(seq -f "%02g" 1 20); do
+                systemctl start sus-"${i}".service
+            done
+
+            echo "Reset complete."
+        fi
+    done
+
+    echo "legit.service did not become sus through all 3 $reload_cmd cycles"
+
+    echo "$reload_cmd test passed"
+}
+
+# Best-effort teardown: reaps leftover abandoned processes, stops legit.service and all 20 sus units,
+# removes their unit files and finally issues a daemon-reload. Failures to reap or stop are ignored.
+cleanup_test_units() {
+    reap_abandoned_pids || :
+    systemctl stop legit.service 2>/dev/null || :
+
+    # {01..20} expands to the same zero-padded sequence the units were created with.
+    for i in {01..20}; do
+        systemctl stop "sus-${i}.service" 2>/dev/null || :
+        rm -f "/run/systemd/system/sus-${i}.service"
+    done
+
+    rm -f /run/systemd/system/legit.service
+    systemctl daemon-reload
+}
+
+# Clean up whenever the script exits, including when a failed check aborts it under "set -e".
+trap cleanup_test_units EXIT
+
+# Run the scenario once per reload command, cleaning up in between.
+run_test daemon-reload
+cleanup_test_units
+run_test daemon-reexec
--- /dev/null
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+set -eux
+set -o pipefail
+
+# Checks that a running service stays tracked when its unit file is replaced by one with a new name
+# and the old name becomes a symlink to it. $1 is the systemctl verb used to reload the manager.
+# Relies on "set -e": any failing check aborts the script.
+run_test() {
+    local reload_cmd="${1:?}"
+    local unit_dir=/run/systemd/system
+    local old_name=rename.service
+    local new_name=the-unit-formerly-known-as-rename.service
+    local orig_pid unit
+
+    echo ""
+    echo "========================================="
+    echo "Testing rename preservation with: systemctl $reload_cmd"
+    echo "========================================="
+
+    cat >"$unit_dir/$old_name" <<'EOF'
+[Service]
+Type=simple
+ExecStart=/bin/sleep infinity
+EOF
+
+    systemctl daemon-reload
+    systemctl start "$old_name"
+
+    orig_pid=$(systemctl show -P MainPID "$old_name")
+    (( orig_pid != 0 ))
+
+    # Replace the unit file with an identical one under the new name, and leave the old name
+    # behind as a symlink to it.
+    rm -f "$unit_dir/$old_name"
+    cat >"$unit_dir/$new_name" <<'EOF'
+[Service]
+Type=simple
+ExecStart=/bin/sleep infinity
+EOF
+    ln -sf "$unit_dir/$new_name" "$unit_dir/$old_name"
+
+    systemctl "$reload_cmd"
+
+    # Under both names the unit must still report the original main process and be active.
+    for unit in "$new_name" "$old_name"; do
+        (( $(systemctl show -P MainPID "$unit") == orig_pid ))
+        [[ "$(systemctl show -P ActiveState "$unit")" == active ]]
+    done
+}
+
+# Best-effort teardown: stops the service under both of its names (ignoring failures), removes both
+# unit files and issues a daemon-reload.
+cleanup_test_units() {
+    local unit
+
+    for unit in the-unit-formerly-known-as-rename.service rename.service; do
+        systemctl stop "$unit" 2>/dev/null || :
+    done
+
+    rm -f /run/systemd/system/rename.service \
+          /run/systemd/system/the-unit-formerly-known-as-rename.service
+    systemctl daemon-reload
+}
+
+# Clean up whenever the script exits, including when a failed check aborts it under "set -e".
+trap cleanup_test_units EXIT
+
+# Run the rename scenario once per reload command, cleaning up in between.
+run_test daemon-reload
+cleanup_test_units
+run_test daemon-reexec