fleettest: add a per-target max_retry budget for flaky tests

author Andrew Tridgell <andrew@tridgell.net>

Sat, 6 Jun 2026 05:43:02 +0000 (15:43 +1000)

committer Andrew Tridgell <andrew@tridgell.net>

Sat, 6 Jun 2026 06:41:51 +0000 (16:41 +1000)
author Andrew Tridgell <andrew@tridgell.net>
Sat, 6 Jun 2026 05:43:02 +0000 (15:43 +1000)
committer Andrew Tridgell <andrew@tridgell.net>
Sat, 6 Jun 2026 06:41:51 +0000 (16:41 +1000)
diff --git a/testsuite/fleettest.json.example b/testsuite/fleettest.json.example

index dfbfbf260dc06dc8a44fde07e6b1e65e6224ede0..821a6c6b08d37287589de1c45882be8b92836c59 100644 (file)
--- a/testsuite/fleettest.json.example
+++ b/testsuite/fleettest.json.example
@@ -11,7 +11,8 @@
      "this target mirrors), configure_flags. Optional (with defaults): make (\"make\"),",
      "python (\"python3\"), rsync_bin (\"rsync\"; \"rsync.exe\" on Cygwin), privilege",
      "(\"root\" | \"sudo\" | \"user\"), pipe_jobs/tcp_jobs (8), builddir (\"rsync-citest\",",
-    "relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols.",
+    "relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols,",
+    "max_retry.",
      "",
      "nonroot: true reruns -- as the non-root ssh user, after the sudo runs -- the",
      "tests that declare `fleet_nonroot = True` at module level (so the set is",
@@ -19,8 +20,14 @@
      "",
      "protocols: [30, 29] adds one extra stdio-pipe test pass per listed version,",
      "each run with runtests --protocol=N (the fleet analogue of a workflow's",
-    "check30/check29 steps) and shown as a protoNN column. Keys starting with",
-    "\"_\" are comments. See testsuite/README.md."
+    "check30/check29 steps) and shown as a protoNN column.",
+    "",
+    "max_retry: N (default 0) re-runs each failed test on its own up to N more",
+    "times and drops any that then pass (listed under RECOVERED, not hidden). Use",
+    "on a slow/loaded box where concurrency-sensitive tests occasionally flake,",
+    "instead of dropping the whole target to a lower pipe_jobs/tcp_jobs.",
+    "",
+    "Keys starting with \"_\" are comments. See testsuite/README.md."
    ],
    "targets": [
      {
@@ -40,12 +47,13 @@
                            "--disable-xxhash", "--disable-lz4"]
      },
      {
+      "_comment": "Nested-VM OpenBSD occasionally flakes a daemon/tcp test under load; max_retry re-runs just the failed test rather than throttling the whole box (tcp_jobs/pipe_jobs are still available if you prefer that).",
        "name": "openbsd",
        "ssh_host": "root@openbsd",
        "workflow": "openbsd-build.yml",
        "make": "gmake",
        "configure_pre": "export AUTOCONF_VERSION=2.71 AUTOMAKE_VERSION=1.16;",
-      "tcp_jobs": 2,
+      "max_retry": 2,
        "configure_flags": ["--with-rrsync", "--disable-zstd", "--disable-md2man",
                            "--disable-xxhash", "--disable-lz4"]
      },
diff --git a/testsuite/fleettest.py b/testsuite/fleettest.py

index 4bcaef7aba2fc64e9b1dc5a709060a7582e7e45e..a4f3f2ab82dfdb6253dc574a576b9c36e8d7c012 100755 (executable)
--- a/testsuite/fleettest.py
+++ b/testsuite/fleettest.py
@@ -137,6 +137,12 @@ class Target:
      # stdio-pipe pass with runtests --protocol=N (the fleet analogue of a
      # workflow's check30/check29 steps). e.g. [30, 29]. Empty => proto pass off.
      protocols: list[int] = dataclasses.field(default_factory=list)
+    # Per-target retry budget for FLAKY tests: after a run, each failed test is
+    # re-run on its own up to max_retry more times, and any that then pass are
+    # dropped from the failure list (and reported as "recovered", never hidden).
+    # Use on a slow/loaded box where concurrency-sensitive tests occasionally
+    # flake, instead of dropping the whole target to a lower -j. 0 => no retry.
+    max_retry: int = 0
  
  
  def load_fleet(path: Path) -> list[Target]:
@@ -283,7 +289,7 @@ def build_script(t: Target) -> str:
  
  
  def test_script(t: Target, transport: str, skip_csv: str | None, jobs: int,
-                protocol: int | None = None) -> str:
+                protocol: int | None = None, only: list[str] | None = None) -> str:
      rb = f'--rsync-bin="$PWD/{t.rsync_bin}"'
      tcp = " --use-tcp" if transport == "tcp" else ""
      # protocol forces an older wire version (mirrors `make check30`/`check29`).
@@ -291,9 +297,14 @@ def test_script(t: Target, transport: str, skip_csv: str | None, jobs: int,
      # PYTHONDONTWRITEBYTECODE: don't drop root-owned __pycache__/*.pyc into the
      # tree (a sudo run would, breaking the next non-root push --delete).
      env = "PYTHONDONTWRITEBYTECODE=1 "
-    if skip_csv:
+    # Named tests (a max_retry re-run) make runtests full_run False, so the
+    # expected-skip list does not apply -- only the named tests' pass/fail matter.
+    names = ""
+    if only:
+        names = " " + " ".join(only)
+    elif skip_csv:
          env += f"RSYNC_EXPECT_SKIPPED={skip_csv} "
-    runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}'
+    runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}{names}'
      # env_prefix (e.g. a brew PATH) must reach the test too: some tests build a
      # helper binary on the fly (a test may invoke `make`, which needs gawk etc.),
      # so the build tools must be on PATH at test time.
@@ -349,6 +360,10 @@ class TransportResult:
      skip_expected: set[str]
      skip_got: set[str]
      raw: str
+    # Tests that failed the initial run but passed on a max_retry re-run, so they
+    # were dropped from `failed`.  Surfaced in the report (a recovered flake is
+    # noted, never silently hidden).
+    recovered: list[str] = dataclasses.field(default_factory=list)
  
      @property
      def skip_mismatch(self) -> bool:
@@ -376,6 +391,35 @@ def parse_transport(transport: str, r: CmdResult, skip_checked: bool) -> Transpo
                             skip_checked, exp, got, r.out)
  
  
+def retry_failed(t: Target, label: str, tr: TransportResult, rerun) -> None:
+    """Honour the target's max_retry budget: re-run each failed test on its own
+    (serially) up to max_retry more times; drop any that pass and record them in
+    tr.recovered.  `rerun(names)` runs the given tests and returns a CmdResult.
+    A no-op when max_retry is 0 or there were no failures."""
+    if not t.max_retry or not tr.failed:
+        return
+    remaining = list(tr.failed)
+    for attempt in range(1, t.max_retry + 1):
+        r = rerun(remaining)
+        still = [m.group(2) for m in RE_RESULT.finditer(r.out)
+                 if m.group(1) in ("FAIL", "ERROR")]
+        recovered = [n for n in remaining if n not in still]
+        if recovered:
+            tr.recovered.extend(recovered)
+            log(f"[{t.name}] {label} retry {attempt}/{t.max_retry}: "
+                f"recovered {','.join(recovered)}"
+                + (f"; still failing {','.join(still)}" if still else ""))
+        remaining = [n for n in remaining if n in still]
+        if not remaining:
+            break
+    tr.failed = remaining
+    # The initial run's non-zero exit was the now-recovered failures; once they
+    # all pass on retry the cell is OK, so clear the stale exit code (only the
+    # failed tests can make runtests exit non-zero on a no-skip-list re-run).
+    if not remaining and tr.recovered and tr.exit_code != 0:
+        tr.exit_code = 0
+
+
  @dataclasses.dataclass
  class TargetResult:
      target: str
@@ -444,9 +488,12 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
          t0 = time.monotonic()
          r = run_on(t, cmd, timeout=2400)
          res.timings[transport] = time.monotonic() - t0
-        res.transports[transport] = parse_transport(transport, r, skip_csv is not None)
+        tr = parse_transport(transport, r, skip_csv is not None)
+        retry_failed(t, transport, tr, lambda names, tp=transport: run_on(
+            t, test_script(t, tp, None, 1, only=names), timeout=1200))
+        res.transports[transport] = tr
          log(f"[{t.name}] {transport} done "
-            f"({'ok' if res.transports[transport].ok else 'ISSUE'})")
+            f"({'ok' if tr.ok else 'ISSUE'})")
  
      # Extra older-protocol passes (mirroring the workflow's check30/check29
      # steps): same stdio-pipe transport and skip list as `make check`, but with
@@ -461,9 +508,13 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
              t0 = time.monotonic()
              r = run_on(t, cmd, timeout=2400)
              res.timings[label] = time.monotonic() - t0
-            res.transports[label] = parse_transport(label, r, skip_csv is not None)
+            tr = parse_transport(label, r, skip_csv is not None)
+            retry_failed(t, label, tr, lambda names, pr=proto: run_on(
+                t, test_script(t, "pipe", None, 1, protocol=pr, only=names),
+                timeout=1200))
+            res.transports[label] = tr
              log(f"[{t.name}] {label} done "
-                f"({'ok' if res.transports[label].ok else 'ISSUE'})")
+                f"({'ok' if tr.ok else 'ISSUE'})")
  
      # Extra non-root pass (after the sudo runs) for targets that opt in, running
      # the tests that declare `fleet_nonroot = True` (discovered in main()).
@@ -471,9 +522,12 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
          t0 = time.monotonic()
          r = run_on(t, nonroot_test_script(t, args.nonroot_tests), timeout=2400)
          res.timings["nonroot"] = time.monotonic() - t0
-        res.transports["nonroot"] = parse_transport("nonroot", r, skip_checked=False)
+        tr = parse_transport("nonroot", r, skip_checked=False)
+        retry_failed(t, "nonroot", tr, lambda names: run_on(
+            t, nonroot_test_script(t, names), timeout=1200))
+        res.transports["nonroot"] = tr
          log(f"[{t.name}] nonroot done "
-            f"({'ok' if res.transports['nonroot'].ok else 'ISSUE'})")
+            f"({'ok' if tr.ok else 'ISSUE'})")
      res.timings["total"] = time.monotonic() - started
      return res
  
@@ -598,6 +652,17 @@ def print_report(results: list[TargetResult], args, fleet: list[Target]) -> bool
          for d in details:
              print(d)
          print("=" * 64)
+    # Recovered flakes: tests that failed but passed within the target's
+    # max_retry budget.  The cell counts as OK, but list them so a flaky test is
+    # never silently swallowed.
+    recovered = [f"{res.target} / {transport}: {','.join(tr.recovered)}"
+                 for res in results for transport in transports
+                 if (tr := res.transports.get(transport)) and tr.recovered]
+    if recovered:
+        print("==== RECOVERED (flaky -- failed, then passed on retry) ====")
+        for r in recovered:
+            print(f"    {r}")
+        print("=" * 64)
      print(f"{len(results)} targets x {len(transports)} transports = {cells} cells: "
            f"{ok_cells} OK, {cells - ok_cells} not OK")
      return all_ok
author	Andrew Tridgell <andrew@tridgell.net>
	Sat, 6 Jun 2026 05:43:02 +0000 (15:43 +1000)
committer	Andrew Tridgell <andrew@tridgell.net>
	Sat, 6 Jun 2026 06:41:51 +0000 (16:41 +1000)
testsuite/fleettest.json.example		patch \| blob \| blame \| history
testsuite/fleettest.py		patch \| blob \| blame \| history