From: Andrew Tridgell Date: Sat, 6 Jun 2026 05:43:02 +0000 (+1000) Subject: fleettest: add a per-target max_retry budget for flaky tests X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=bad790dd2ec0f1d913fd3df3e4c2a6920c421c30;p=thirdparty%2Frsync.git fleettest: add a per-target max_retry budget for flaky tests A slow or heavily-loaded fleet box can occasionally flake a concurrency- sensitive test (e.g. a daemon/lsh test under -j8 on a nested-VM Solaris box). Rather than dropping the whole target to a lower -j, add a per-target "max_retry" property: after a run, each failed test is re-run on its own up to max_retry more times, and any that then pass are dropped from the failure list. Recovered tests are listed in a new "RECOVERED" report section, so a flake is surfaced, never silently hidden. Applies to every pass for the target (pipe, tcp, protoNN, nonroot). Default 0 keeps the current no-retry behaviour. --- diff --git a/testsuite/fleettest.json.example b/testsuite/fleettest.json.example index dfbfbf26..821a6c6b 100644 --- a/testsuite/fleettest.json.example +++ b/testsuite/fleettest.json.example @@ -11,7 +11,8 @@ "this target mirrors), configure_flags. Optional (with defaults): make (\"make\"),", "python (\"python3\"), rsync_bin (\"rsync\"; \"rsync.exe\" on Cygwin), privilege", "(\"root\" | \"sudo\" | \"user\"), pipe_jobs/tcp_jobs (8), builddir (\"rsync-citest\",", - "relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols.", + "relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols,", + "max_retry.", "", "nonroot: true reruns -- as the non-root ssh user, after the sudo runs -- the", "tests that declare `fleet_nonroot = True` at module level (so the set is", @@ -19,8 +20,14 @@ "", "protocols: [30, 29] adds one extra stdio-pipe test pass per listed version,", "each run with runtests --protocol=N (the fleet analogue of a workflow's", - "check30/check29 steps) and shown as a protoNN column. Keys starting with", - "\"_\" are comments. See testsuite/README.md." + "check30/check29 steps) and shown as a protoNN column.", + "", + "max_retry: N (default 0) re-runs each failed test on its own up to N more", + "times and drops any that then pass (listed under RECOVERED, not hidden). Use", + "on a slow/loaded box where concurrency-sensitive tests occasionally flake,", + "instead of dropping the whole target to a lower pipe_jobs/tcp_jobs.", + "", + "Keys starting with \"_\" are comments. See testsuite/README.md." ], "targets": [ { @@ -40,12 +47,13 @@ "--disable-xxhash", "--disable-lz4"] }, { + "_comment": "Nested-VM OpenBSD occasionally flakes a daemon/tcp test under load; max_retry re-runs just the failed test rather than throttling the whole box (tcp_jobs/pipe_jobs are still available if you prefer that).", "name": "openbsd", "ssh_host": "root@openbsd", "workflow": "openbsd-build.yml", "make": "gmake", "configure_pre": "export AUTOCONF_VERSION=2.71 AUTOMAKE_VERSION=1.16;", - "tcp_jobs": 2, + "max_retry": 2, "configure_flags": ["--with-rrsync", "--disable-zstd", "--disable-md2man", "--disable-xxhash", "--disable-lz4"] }, diff --git a/testsuite/fleettest.py b/testsuite/fleettest.py index 4bcaef7a..a4f3f2ab 100755 --- a/testsuite/fleettest.py +++ b/testsuite/fleettest.py @@ -137,6 +137,12 @@ class Target: # stdio-pipe pass with runtests --protocol=N (the fleet analogue of a # workflow's check30/check29 steps). e.g. [30, 29]. Empty => proto pass off. protocols: list[int] = dataclasses.field(default_factory=list) + # Per-target retry budget for FLAKY tests: after a run, each failed test is + # re-run on its own up to max_retry more times, and any that then pass are + # dropped from the failure list (and reported as "recovered", never hidden). + # Use on a slow/loaded box where concurrency-sensitive tests occasionally + # flake, instead of dropping the whole target to a lower -j. 0 => no retry. + max_retry: int = 0 def load_fleet(path: Path) -> list[Target]: @@ -283,7 +289,7 @@ def build_script(t: Target) -> str: def test_script(t: Target, transport: str, skip_csv: str | None, jobs: int, - protocol: int | None = None) -> str: + protocol: int | None = None, only: list[str] | None = None) -> str: rb = f'--rsync-bin="$PWD/{t.rsync_bin}"' tcp = " --use-tcp" if transport == "tcp" else "" # protocol forces an older wire version (mirrors `make check30`/`check29`). @@ -291,9 +297,14 @@ def test_script(t: Target, transport: str, skip_csv: str | None, jobs: int, # PYTHONDONTWRITEBYTECODE: don't drop root-owned __pycache__/*.pyc into the # tree (a sudo run would, breaking the next non-root push --delete). env = "PYTHONDONTWRITEBYTECODE=1 " - if skip_csv: + # Named tests (a max_retry re-run) make runtests full_run False, so the + # expected-skip list does not apply -- only the named tests' pass/fail matter. + names = "" + if only: + names = " " + " ".join(only) + elif skip_csv: env += f"RSYNC_EXPECT_SKIPPED={skip_csv} " - runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}' + runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}{names}' # env_prefix (e.g. a brew PATH) must reach the test too: some tests build a # helper binary on the fly (a test may invoke `make`, which needs gawk etc.), # so the build tools must be on PATH at test time. @@ -349,6 +360,10 @@ class TransportResult: skip_expected: set[str] skip_got: set[str] raw: str + # Tests that failed the initial run but passed on a max_retry re-run, so they + # were dropped from `failed`. Surfaced in the report (a recovered flake is + # noted, never silently hidden). + recovered: list[str] = dataclasses.field(default_factory=list) @property def skip_mismatch(self) -> bool: @@ -376,6 +391,35 @@ def parse_transport(transport: str, r: CmdResult, skip_checked: bool) -> Transpo skip_checked, exp, got, r.out) +def retry_failed(t: Target, label: str, tr: TransportResult, rerun) -> None: + """Honour the target's max_retry budget: re-run each failed test on its own + (serially) up to max_retry more times; drop any that pass and record them in + tr.recovered. `rerun(names)` runs the given tests and returns a CmdResult. + A no-op when max_retry is 0 or there were no failures.""" + if not t.max_retry or not tr.failed: + return + remaining = list(tr.failed) + for attempt in range(1, t.max_retry + 1): + r = rerun(remaining) + still = [m.group(2) for m in RE_RESULT.finditer(r.out) + if m.group(1) in ("FAIL", "ERROR")] + recovered = [n for n in remaining if n not in still] + if recovered: + tr.recovered.extend(recovered) + log(f"[{t.name}] {label} retry {attempt}/{t.max_retry}: " + f"recovered {','.join(recovered)}" + + (f"; still failing {','.join(still)}" if still else "")) + remaining = [n for n in remaining if n in still] + if not remaining: + break + tr.failed = remaining + # The initial run's non-zero exit was the now-recovered failures; once they + # all pass on retry the cell is OK, so clear the stale exit code (only the + # failed tests can make runtests exit non-zero on a no-skip-list re-run). + if not remaining and tr.recovered and tr.exit_code != 0: + tr.exit_code = 0 + + @dataclasses.dataclass class TargetResult: target: str @@ -444,9 +488,12 @@ def run_target(t: Target, args, staging: str) -> TargetResult: t0 = time.monotonic() r = run_on(t, cmd, timeout=2400) res.timings[transport] = time.monotonic() - t0 - res.transports[transport] = parse_transport(transport, r, skip_csv is not None) + tr = parse_transport(transport, r, skip_csv is not None) + retry_failed(t, transport, tr, lambda names, tp=transport: run_on( + t, test_script(t, tp, None, 1, only=names), timeout=1200)) + res.transports[transport] = tr log(f"[{t.name}] {transport} done " - f"({'ok' if res.transports[transport].ok else 'ISSUE'})") + f"({'ok' if tr.ok else 'ISSUE'})") # Extra older-protocol passes (mirroring the workflow's check30/check29 # steps): same stdio-pipe transport and skip list as `make check`, but with @@ -461,9 +508,13 @@ def run_target(t: Target, args, staging: str) -> TargetResult: t0 = time.monotonic() r = run_on(t, cmd, timeout=2400) res.timings[label] = time.monotonic() - t0 - res.transports[label] = parse_transport(label, r, skip_csv is not None) + tr = parse_transport(label, r, skip_csv is not None) + retry_failed(t, label, tr, lambda names, pr=proto: run_on( + t, test_script(t, "pipe", None, 1, protocol=pr, only=names), + timeout=1200)) + res.transports[label] = tr log(f"[{t.name}] {label} done " - f"({'ok' if res.transports[label].ok else 'ISSUE'})") + f"({'ok' if tr.ok else 'ISSUE'})") # Extra non-root pass (after the sudo runs) for targets that opt in, running # the tests that declare `fleet_nonroot = True` (discovered in main()). @@ -471,9 +522,12 @@ def run_target(t: Target, args, staging: str) -> TargetResult: t0 = time.monotonic() r = run_on(t, nonroot_test_script(t, args.nonroot_tests), timeout=2400) res.timings["nonroot"] = time.monotonic() - t0 - res.transports["nonroot"] = parse_transport("nonroot", r, skip_checked=False) + tr = parse_transport("nonroot", r, skip_checked=False) + retry_failed(t, "nonroot", tr, lambda names: run_on( + t, nonroot_test_script(t, names), timeout=1200)) + res.transports["nonroot"] = tr log(f"[{t.name}] nonroot done " - f"({'ok' if res.transports['nonroot'].ok else 'ISSUE'})") + f"({'ok' if tr.ok else 'ISSUE'})") res.timings["total"] = time.monotonic() - started return res @@ -598,6 +652,17 @@ def print_report(results: list[TargetResult], args, fleet: list[Target]) -> bool for d in details: print(d) print("=" * 64) + # Recovered flakes: tests that failed but passed within the target's + # max_retry budget. The cell counts as OK, but list them so a flaky test is + # never silently swallowed. + recovered = [f"{res.target} / {transport}: {','.join(tr.recovered)}" + for res in results for transport in transports + if (tr := res.transports.get(transport)) and tr.recovered] + if recovered: + print("==== RECOVERED (flaky -- failed, then passed on retry) ====") + for r in recovered: + print(f" {r}") + print("=" * 64) print(f"{len(results)} targets x {len(transports)} transports = {cells} cells: " f"{ok_cells} OK, {cells - ok_cells} not OK") return all_ok