"this target mirrors), configure_flags. Optional (with defaults): make (\"make\"),",
"python (\"python3\"), rsync_bin (\"rsync\"; \"rsync.exe\" on Cygwin), privilege",
"(\"root\" | \"sudo\" | \"user\"), pipe_jobs/tcp_jobs (8), builddir (\"rsync-citest\",",
- "relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols.",
+ "relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols,",
+ "max_retry.",
"",
"nonroot: true reruns -- as the non-root ssh user, after the sudo runs -- the",
"tests that declare `fleet_nonroot = True` at module level (so the set is",
"",
"protocols: [30, 29] adds one extra stdio-pipe test pass per listed version,",
"each run with runtests --protocol=N (the fleet analogue of a workflow's",
- "check30/check29 steps) and shown as a protoNN column. Keys starting with",
- "\"_\" are comments. See testsuite/README.md."
+ "check30/check29 steps) and shown as a protoNN column.",
+ "",
+ "max_retry: N (default 0) re-runs each failed test on its own up to N more",
+ "times and drops any that then pass (listed under RECOVERED, not hidden). Use",
+ "on a slow/loaded box where concurrency-sensitive tests occasionally flake,",
+ "instead of dropping the whole target to a lower pipe_jobs/tcp_jobs.",
+ "",
+ "Keys starting with \"_\" are comments. See testsuite/README.md."
],
"targets": [
{
"--disable-xxhash", "--disable-lz4"]
},
{
+ "_comment": "Nested-VM OpenBSD occasionally flakes a daemon/tcp test under load; max_retry re-runs just the failed test rather than throttling the whole box (tcp_jobs/pipe_jobs are still available if you prefer that).",
"name": "openbsd",
"ssh_host": "root@openbsd",
"workflow": "openbsd-build.yml",
"make": "gmake",
"configure_pre": "export AUTOCONF_VERSION=2.71 AUTOMAKE_VERSION=1.16;",
- "tcp_jobs": 2,
+ "max_retry": 2,
"configure_flags": ["--with-rrsync", "--disable-zstd", "--disable-md2man",
"--disable-xxhash", "--disable-lz4"]
},
# stdio-pipe pass with runtests --protocol=N (the fleet analogue of a
# workflow's check30/check29 steps). e.g. [30, 29]. Empty => proto pass off.
protocols: list[int] = dataclasses.field(default_factory=list)
+ # Per-target retry budget for FLAKY tests: after a run, each failed test is
+ # re-run on its own up to max_retry more times, and any that then pass are
+ # dropped from the failure list (and reported as "recovered", never hidden).
+ # Use on a slow/loaded box where concurrency-sensitive tests occasionally
+ # flake, instead of dropping the whole target to a lower -j. 0 => no retry.
+ max_retry: int = 0
def load_fleet(path: Path) -> list[Target]:
def test_script(t: Target, transport: str, skip_csv: str | None, jobs: int,
- protocol: int | None = None) -> str:
+ protocol: int | None = None, only: list[str] | None = None) -> str:
rb = f'--rsync-bin="$PWD/{t.rsync_bin}"'
tcp = " --use-tcp" if transport == "tcp" else ""
# protocol forces an older wire version (mirrors `make check30`/`check29`).
# PYTHONDONTWRITEBYTECODE: don't drop root-owned __pycache__/*.pyc into the
# tree (a sudo run would, breaking the next non-root push --delete).
env = "PYTHONDONTWRITEBYTECODE=1 "
- if skip_csv:
+ # Named tests (a max_retry re-run) make runtests full_run False, so the
+ # expected-skip list does not apply -- only the named tests' pass/fail matter.
+ names = ""
+ if only:
+ names = " " + " ".join(only)
+ elif skip_csv:
env += f"RSYNC_EXPECT_SKIPPED={skip_csv} "
- runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}'
+ runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}{names}'
# env_prefix (e.g. a brew PATH) must reach the test too: some tests build a
# helper binary on the fly (a test may invoke `make`, which needs gawk etc.),
# so the build tools must be on PATH at test time.
skip_expected: set[str]
skip_got: set[str]
raw: str
+ # Tests that failed the initial run but passed on a max_retry re-run, so they
+ # were dropped from `failed`. Surfaced in the report (a recovered flake is
+ # noted, never silently hidden).
+ recovered: list[str] = dataclasses.field(default_factory=list)
@property
def skip_mismatch(self) -> bool:
skip_checked, exp, got, r.out)
+def retry_failed(t: Target, label: str, tr: TransportResult, rerun) -> None:
+ """Honour the target's max_retry budget: re-run each failed test on its own
+ (serially) up to max_retry more times; drop any that pass and record them in
+ tr.recovered. `rerun(names)` runs the given tests and returns a CmdResult.
+ A no-op when max_retry is 0 or there were no failures."""
+ if not t.max_retry or not tr.failed:
+ return
+ remaining = list(tr.failed)
+ for attempt in range(1, t.max_retry + 1):
+ r = rerun(remaining)
+ still = [m.group(2) for m in RE_RESULT.finditer(r.out)
+ if m.group(1) in ("FAIL", "ERROR")]
+ recovered = [n for n in remaining if n not in still]
+ if recovered:
+ tr.recovered.extend(recovered)
+ log(f"[{t.name}] {label} retry {attempt}/{t.max_retry}: "
+ f"recovered {','.join(recovered)}"
+ + (f"; still failing {','.join(still)}" if still else ""))
+ remaining = [n for n in remaining if n in still]
+ if not remaining:
+ break
+ tr.failed = remaining
+ # The initial run's non-zero exit was the now-recovered failures; once they
+ # all pass on retry the cell is OK, so clear the stale exit code (only the
+ # failed tests can make runtests exit non-zero on a no-skip-list re-run).
+ if not remaining and tr.recovered and tr.exit_code != 0:
+ tr.exit_code = 0
+
+
@dataclasses.dataclass
class TargetResult:
target: str
t0 = time.monotonic()
r = run_on(t, cmd, timeout=2400)
res.timings[transport] = time.monotonic() - t0
- res.transports[transport] = parse_transport(transport, r, skip_csv is not None)
+ tr = parse_transport(transport, r, skip_csv is not None)
+ retry_failed(t, transport, tr, lambda names, tp=transport: run_on(
+ t, test_script(t, tp, None, 1, only=names), timeout=1200))
+ res.transports[transport] = tr
log(f"[{t.name}] {transport} done "
- f"({'ok' if res.transports[transport].ok else 'ISSUE'})")
+ f"({'ok' if tr.ok else 'ISSUE'})")
# Extra older-protocol passes (mirroring the workflow's check30/check29
# steps): same stdio-pipe transport and skip list as `make check`, but with
t0 = time.monotonic()
r = run_on(t, cmd, timeout=2400)
res.timings[label] = time.monotonic() - t0
- res.transports[label] = parse_transport(label, r, skip_csv is not None)
+ tr = parse_transport(label, r, skip_csv is not None)
+ retry_failed(t, label, tr, lambda names, pr=proto: run_on(
+ t, test_script(t, "pipe", None, 1, protocol=pr, only=names),
+ timeout=1200))
+ res.transports[label] = tr
log(f"[{t.name}] {label} done "
- f"({'ok' if res.transports[label].ok else 'ISSUE'})")
+ f"({'ok' if tr.ok else 'ISSUE'})")
# Extra non-root pass (after the sudo runs) for targets that opt in, running
# the tests that declare `fleet_nonroot = True` (discovered in main()).
t0 = time.monotonic()
r = run_on(t, nonroot_test_script(t, args.nonroot_tests), timeout=2400)
res.timings["nonroot"] = time.monotonic() - t0
- res.transports["nonroot"] = parse_transport("nonroot", r, skip_checked=False)
+ tr = parse_transport("nonroot", r, skip_checked=False)
+ retry_failed(t, "nonroot", tr, lambda names: run_on(
+ t, nonroot_test_script(t, names), timeout=1200))
+ res.transports["nonroot"] = tr
log(f"[{t.name}] nonroot done "
- f"({'ok' if res.transports['nonroot'].ok else 'ISSUE'})")
+ f"({'ok' if tr.ok else 'ISSUE'})")
res.timings["total"] = time.monotonic() - started
return res
for d in details:
print(d)
print("=" * 64)
+ # Recovered flakes: tests that failed but passed within the target's
+ # max_retry budget. The cell counts as OK, but list them so a flaky test is
+ # never silently swallowed.
+ recovered = [f"{res.target} / {transport}: {','.join(tr.recovered)}"
+ for res in results for transport in transports
+ if (tr := res.transports.get(transport)) and tr.recovered]
+ if recovered:
+ print("==== RECOVERED (flaky -- failed, then passed on retry) ====")
+ for r in recovered:
+ print(f" {r}")
+ print("=" * 64)
print(f"{len(results)} targets x {len(transports)} transports = {cells} cells: "
f"{ok_cells} OK, {cells - ok_cells} not OK")
return all_ok