From: Luca Boccassi Date: Tue, 12 May 2026 11:40:54 +0000 (+0100) Subject: test: make TEST-75-RESOLVED robust against journald metadata race X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=9f08885b0e72e7756849cf1abab929a380166e34;p=thirdparty%2Fsystemd.git test: make TEST-75-RESOLVED robust against journald metadata race Even after switching the wait loop to a polling `journalctl --grep`, the test still fails intermittently because the very first messages emitted by the freshly-spawned systemd-networkd-wait-online process can carry stale journald metadata. journald associates `_SYSTEMD_UNIT=` (and friends) with each entry by reading `/proc/$pid/cgroup` of the originating PID; if those messages are produced before journald notices the cgroup migration into the new service, they get tagged with `_SYSTEMD_UNIT=init.scope`. The `-u $unit` filter then fails to match them. Capture a journal cursor before launching the unit, and grep using `--after-cursor=` plus `SYSLOG_IDENTIFIER=systemd-networkd-wait-online` instead of `-u $unit`. SYSLOG_IDENTIFIER is set by the program itself, so it's not subject to the cgroup-discovery race. The cursor bounds the search to entries produced by this invocation, so prior wait-online runs in earlier testcases don't interfere. Logs from the failing run showing the messages exist but are tagged with the wrong unit: [ 2570.948554] TEST-75-RESOLVED.sh[2178]: + unit=wait-online-dns-ede81407-b93b-459d-8e5d-69292b42d2ae.service [ 2571.023162] TEST-75-RESOLVED.sh[2178]: + systemd-run -u wait-online-dns-ede81407-b93b-459d-8e5d-69292b42d2ae.service ... [ 2571.049189] TEST-75-RESOLVED.sh[2178]: + timeout 30 bash -c 'until journalctl -b -u wait-online-dns-ede81407-b93b-459d-8e5d-69292b42d2ae.service --grep ...' [ 2571.964986] systemd-networkd-wait-online[2190]: dns0: No DNS server is accessible. [ 2601.051088] TEST-75-RESOLVED.sh[2178]: ++ cleanup And for that 2571.964986 entry: _SYSTEMD_CGROUP=/init.scope _SYSTEMD_UNIT=init.scope _EXE=/usr/lib/systemd/systemd-executor _CMDLINE=/usr/lib/systemd/systemd-executor --deserialize 68 ... SYSLOG_IDENTIFIER=systemd-networkd-wait-online MESSAGE=dns0: No DNS server is accessible. Follow-up for d4bc62713e09df09281f26f4bf385801a3ee2897 Co-developed-by: Claude Opus 4.7 --- diff --git a/test/units/TEST-75-RESOLVED.sh b/test/units/TEST-75-RESOLVED.sh index 7b5670a4bc7..03e50db7b32 100755 --- a/test/units/TEST-75-RESOLVED.sh +++ b/test/units/TEST-75-RESOLVED.sh @@ -1387,7 +1387,7 @@ testcase_15_wait_online_dns() { echo "===== journalctl -u $unit =====" journalctl -b --no-pager --no-hostname --full -u "$unit" echo "==========" - rm -f "$override" + rm -f "$override" "$cursor_file" restart_resolved resolvectl revert dns0 } @@ -1396,6 +1396,7 @@ testcase_15_wait_online_dns() { local unit local override + local cursor_file unit="wait-online-dns-$(systemd-id128 new -u).service" override="/run/systemd/resolved.conf.d/90-global-dns.conf" @@ -1416,12 +1417,26 @@ testcase_15_wait_online_dns() { systemctl stop systemd-resolved.service systemctl start systemd-resolved-monitor.socket systemd-resolved-varlink.socket + # Capture a journal cursor before starting the unit so we can match only on + # log messages emitted afterwards. We deliberately do not filter on + # _SYSTEMD_UNIT= because journald may attach stale cgroup metadata + # (e.g. _SYSTEMD_UNIT=init.scope) to the very first messages emitted by a + # freshly-spawned process, before its cgroup migration into the new service + # is observed. Filtering by SYSLOG_IDENTIFIER and a cursor is not affected + # by that race. + cursor_file=$(mktemp) + journalctl -n 0 --cursor-file="$cursor_file" + # Begin systemd-networkd-wait-online --dns systemd-run -u "$unit" -p "Environment=SYSTEMD_LOG_LEVEL=debug" -p "Environment=SYSTEMD_LOG_TARGET=journal" --service-type=exec \ /usr/lib/systemd/systemd-networkd-wait-online --timeout=0 --dns --interface=dns0 - # Wait until it blocks waiting for updated DNS config - timeout 30 bash -c "until journalctl -b -u $unit --grep 'dns0: No.*DNS server is accessible' >/dev/null 2>&1; do sleep 0.5; done" + # Wait until it blocks waiting for updated DNS config. + # Note: don't use 'journalctl -f | grep -m1 ...' here. Once grep exits on + # match, journalctl -f will only notice the closed pipe on its next write + # attempt, which may never come for an otherwise idle unit, causing the + # pipeline to hang. + timeout 30 bash -c "until journalctl --after-cursor=\"\$(cat \"$cursor_file\")\" SYSLOG_IDENTIFIER=systemd-networkd-wait-online --grep 'dns0: No.*DNS server is accessible' >/dev/null 2>&1; do sleep 0.5; done" # Update the global configuration. Restart rather than reload systemd-resolved so that # systemd-networkd-wait-online has to re-connect to the varlink service. @@ -1436,10 +1451,10 @@ testcase_15_wait_online_dns() { journalctl --sync # Check that a disconnect happened, and was handled. - journalctl -b -u "$unit" --grep="DNS configuration monitor disconnected, reconnecting..." >/dev/null + journalctl --after-cursor="$(cat "$cursor_file")" SYSLOG_IDENTIFIER=systemd-networkd-wait-online --grep="DNS configuration monitor disconnected, reconnecting..." >/dev/null # Check that dns0 was found to be online. - journalctl -b -u "$unit" --grep="dns0: link is configured by networkd and online." >/dev/null + journalctl --after-cursor="$(cat "$cursor_file")" SYSLOG_IDENTIFIER=systemd-networkd-wait-online --grep="dns0: link is configured by networkd and online." >/dev/null } testcase_delegate() {