Chore: add backoff to handle 429 Wikimedia requests in tests (#11364)
author    shamoon <4887959+shamoon@users.noreply.github.com>
Fri, 14 Nov 2025 23:58:29 +0000 (15:58 -0800)
committer GitHub <noreply@github.com>
Fri, 14 Nov 2025 23:58:29 +0000 (15:58 -0800)
src/paperless_mail/tests/test_parsers_live.py

index 2c3ef262aa953515da0ff9bc716245e37f3df366..456cb47d50945c11a65e99a21cee77f929257368 100644
@@ -2,6 +2,7 @@ import os
 import shutil
 import subprocess
 import tempfile
+import time
 from pathlib import Path
 
 import httpx
@@ -53,14 +54,33 @@ class TestUrlCanary:
     Verify certain URLs are still available so that testing remains valid
     """
 
-    # Wikimedia rejects requests without a browser-like User-Agent header and returns 403.
-    _WIKIMEDIA_HEADERS = {
-        "User-Agent": (
-            "Mozilla/5.0 (X11; Linux x86_64) "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/123.0.0.0 Safari/537.36"
-        ),
-    }
+    @classmethod
+    def _fetch_wikimedia(cls, url: str) -> httpx.Response:
+        """
+        Wikimedia occasionally throttles automated requests (HTTP 429). Retry a few
+        times with a short backoff so the tests stay stable, and skip if throttling
+        persists.
+        """
+        last_resp: httpx.Response | None = None
+        # Wikimedia rejects requests without a browser-like User-Agent header and returns 403.
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/123.0.0.0 Safari/537.36"
+            ),
+        }
+        for delay in (0, 1, 2):
+            resp = httpx.get(url, headers=headers, timeout=30.0)
+            if resp.status_code != httpx.codes.TOO_MANY_REQUESTS:
+                return resp
+            last_resp = resp
+            time.sleep(delay)
+
+        pytest.skip(
+            "Wikimedia throttled the canary request with HTTP 429; try rerunning later.",
+        )
+        return last_resp  # pragma: no cover
 
     def test_online_image_exception_on_not_available(self):
         """
@@ -76,11 +96,10 @@ class TestUrlCanary:
         whether this image stays online forever, so here we check whether we can detect that it
         is no longer available.
         """
+        resp = self._fetch_wikimedia(
+            "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
+        )
         with pytest.raises(httpx.HTTPStatusError) as exec_info:
-            resp = httpx.get(
-                "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
-                headers=self._WIKIMEDIA_HEADERS,
-            )
             resp.raise_for_status()
 
         assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
@@ -100,9 +119,8 @@ class TestUrlCanary:
         """
 
         # Now check the URL used in samples/sample.html
-        resp = httpx.get(
+        resp = self._fetch_wikimedia(
             "https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png",
-            headers=self._WIKIMEDIA_HEADERS,
         )
         resp.raise_for_status()
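
Since the retry loop added by this commit only triggers against live Wikimedia throttling, one way to sanity-check the pattern offline is httpx's MockTransport. The sketch below is not part of the commit; the handler factory, counter, and URL are hypothetical scaffolding that simulates two HTTP 429 responses followed by a 200, then replays the same loop shape as _fetch_wikimedia.

import httpx

def throttling_handler_factory(fail_times: int):
    # Hypothetical scaffolding: answer 429 `fail_times` times, then 200.
    state = {"calls": 0}

    def handler(request: httpx.Request) -> httpx.Response:
        state["calls"] += 1
        if state["calls"] <= fail_times:
            return httpx.Response(httpx.codes.TOO_MANY_REQUESTS)
        return httpx.Response(httpx.codes.OK, content=b"ok")

    return handler

transport = httpx.MockTransport(throttling_handler_factory(fail_times=2))
with httpx.Client(transport=transport) as client:
    resp = None
    # Same loop shape as _fetch_wikimedia: up to three attempts, returning
    # as soon as the response is anything other than 429.
    for delay in (0, 1, 2):
        resp = client.get("https://upload.wikimedia.org/example.png")
        if resp.status_code != httpx.codes.TOO_MANY_REQUESTS:
            break
        # time.sleep(delay) omitted here so the offline check runs instantly.
    assert resp is not None
    assert resp.status_code == httpx.codes.OK

The mocked client sees two throttled attempts and succeeds on the third, which is the behavior the commit relies on to keep the canary tests stable under transient Wikimedia rate limiting.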