Chore: add backoff to handle 429 Wikimedia requests in tests (#11364)
author    shamoon <4887959+shamoon@users.noreply.github.com>
Fri, 14 Nov 2025 23:58:29 +0000 (15:58 -0800)
committer GitHub <noreply@github.com>
Fri, 14 Nov 2025 23:58:29 +0000 (15:58 -0800)
src/paperless_mail/tests/test_parsers_live.py

index 2c3ef262aa953515da0ff9bc716245e37f3df366..456cb47d50945c11a65e99a21cee77f929257368 100644
@@ -2,6 +2,7 @@ import os
 import shutil
 import subprocess
 import tempfile
+import time
 from pathlib import Path
 
 import httpx
@@ -53,14 +54,33 @@ class TestUrlCanary:
     Verify certain URLs are still available so that testing remains valid
     """
 
-    # Wikimedia rejects requests without a browser-like User-Agent header and returns 403.
-    _WIKIMEDIA_HEADERS = {
-        "User-Agent": (
-            "Mozilla/5.0 (X11; Linux x86_64) "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/123.0.0.0 Safari/537.36"
-        ),
-    }
+    @classmethod
+    def _fetch_wikimedia(cls, url: str) -> httpx.Response:
+        """
+        Wikimedia occasionally throttles automated requests (HTTP 429). Retry a few
+        times with a short backoff so the tests stay stable, and skip if throttling
+        persists.
+        """
+        last_resp: httpx.Response | None = None
+        # Wikimedia rejects requests without a browser-like User-Agent header and returns 403.
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/123.0.0.0 Safari/537.36"
+            ),
+        }
+        for delay in (0, 1, 2):
+            resp = httpx.get(url, headers=headers, timeout=30.0)
+            if resp.status_code != httpx.codes.TOO_MANY_REQUESTS:
+                return resp
+            last_resp = resp
+            time.sleep(delay)
+
+        pytest.skip(
+            "Wikimedia throttled the canary request with HTTP 429; try rerunning later.",
+        )
+        return last_resp  # pragma: no cover
 
     def test_online_image_exception_on_not_available(self):
         """
@@ -76,11 +96,10 @@ class TestUrlCanary:
         whether this image stays online forever, so here we check whether we can detect that it
         is no longer available.
         """
+        resp = self._fetch_wikimedia(
+            "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
+        )
         with pytest.raises(httpx.HTTPStatusError) as exec_info:
-            resp = httpx.get(
-                "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
-                headers=self._WIKIMEDIA_HEADERS,
-            )
             resp.raise_for_status()
 
         assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
@@ -100,9 +119,8 @@ class TestUrlCanary:
         """
 
         # Now check the URL used in samples/sample.html
-        resp = httpx.get(
+        resp = self._fetch_wikimedia(
             "https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png",
-            headers=self._WIKIMEDIA_HEADERS,
         )
         resp.raise_for_status()
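
Since the retry loop added by this commit only triggers against live Wikimedia throttling, one way to sanity-check the pattern offline is httpx's MockTransport. The sketch below is not part of the commit; the handler factory, counter, and URL are hypothetical scaffolding that simulates two HTTP 429 responses followed by a 200, then replays the same loop shape as _fetch_wikimedia.

import httpx

def throttling_handler_factory(fail_times: int):
    # Hypothetical scaffolding: answer 429 `fail_times` times, then 200.
    state = {"calls": 0}

    def handler(request: httpx.Request) -> httpx.Response:
        state["calls"] += 1
        if state["calls"] <= fail_times:
            return httpx.Response(httpx.codes.TOO_MANY_REQUESTS)
        return httpx.Response(httpx.codes.OK, content=b"ok")

    return handler

transport = httpx.MockTransport(throttling_handler_factory(fail_times=2))
with httpx.Client(transport=transport) as client:
    resp = None
    # Same loop shape as _fetch_wikimedia: up to three attempts, returning
    # as soon as the response is anything other than 429.
    for delay in (0, 1, 2):
        resp = client.get("https://upload.wikimedia.org/example.png")
        if resp.status_code != httpx.codes.TOO_MANY_REQUESTS:
            break
        # time.sleep(delay) omitted here so the offline check runs instantly.
    assert resp is not None
    assert resp.status_code == httpx.codes.OK

The mocked client sees two throttled attempts and succeeds on the third, which is the behavior the commit relies on to keep the canary tests stable under transient Wikimedia rate limiting.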