git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Retry Tika parsing with PUT instead of form data in the event of a 500 error response...
author Trenton H <797416+stumpylog@users.noreply.github.com>
Sun, 8 Oct 2023 01:36:27 +0000 (18:36 -0700)
committer GitHub <noreply@github.com>
Sun, 8 Oct 2023 01:36:27 +0000 (18:36 -0700)
src/paperless_tika/parsers.py
src/paperless_tika/tests/samples/multi-part-broken.odt [new file with mode: 0644]
src/paperless_tika/tests/test_live_tika.py

diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py
index 402a37215c0b4cdbbe1eb3bb483ef33093fc15b7..c410594bb8260aff2126bde001eda0f50aa51a88 100644 (file)
@@ -52,7 +52,18 @@ class TikaDocumentParser(DocumentParser):
 
         try:
             with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
-                parsed = client.tika.as_text.from_file(document_path, mime_type)
+                try:
+                    parsed = client.tika.as_text.from_file(document_path, mime_type)
+                except httpx.HTTPStatusError as err:
+                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
+                    # Tika fails with some files as multi-part form data
+                    if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
+                        parsed = client.tika.as_text.from_buffer(
+                            document_path.read_bytes(),
+                            mime_type,
+                        )
+                    else:  # pragma: nocover
+                        raise
         except Exception as err:
             raise ParseError(
                 f"Could not parse {document_path} with tika server at "
diff --git a/src/paperless_tika/tests/samples/multi-part-broken.odt b/src/paperless_tika/tests/samples/multi-part-broken.odt
new file mode 100644 (file)
index 0000000..82f593f
Binary files /dev/null and b/src/paperless_tika/tests/samples/multi-part-broken.odt differ
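diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py
index f34278467b792608996e3d48746ba24fdb03db31..1c6225bdc69444651700831aaeba0f605ba3450d 100644 (file)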
@@ -111,3 +111,27 @@ class TestTikaParserAgainstServer(TestCase):
         self.assertIsNotNone(self.parser.archive_path)
         with open(self.parser.archive_path, "rb") as f:
             self.assertTrue(b"PDF-" in f.read()[:10])
+
+    def test_tika_fails_multi_part(self):
+        """
+        GIVEN:
+            - An input ODT format document
+            - The document is known to crash Tika when uploaded via multi-part form data
+        WHEN:
+            - The document is parsed
+        THEN:
+            - An archive file is produced
+            - The archive file starts with a PDF header
+        See also:
+            - https://issues.apache.org/jira/browse/TIKA-4110
+        """
+        test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
+
+        util_call_with_backoff(
+            self.parser.parse,
+            [test_file, "application/vnd.oasis.opendocument.text"],
+        )
+
+        self.assertIsNotNone(self.parser.archive_path)
+        with open(self.parser.archive_path, "rb") as f:
+            self.assertTrue(b"PDF-" in f.read()[:10])