try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
- parsed = client.tika.as_text.from_file(document_path, mime_type)
+ try:
+ parsed = client.tika.as_text.from_file(document_path, mime_type)
+ except httpx.HTTPStatusError as err:
+ # Workaround for https://issues.apache.org/jira/browse/TIKA-4110
+ # Tika fails on some files when they are uploaded as multi-part form
+ # data, so retry by sending the raw file bytes instead
+ if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
+ parsed = client.tika.as_text.from_buffer(
+ document_path.read_bytes(),
+ mime_type,
+ )
+ else: # pragma: nocover
+ raise
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10])
+
+    def test_tika_fails_multi_part(self):
+        """
+        GIVEN:
+            - An input ODT format document
+            - The document is known to crash Tika when uploaded via multi-part form data
+        WHEN:
+            - The document is parsed
+        THEN:
+            - Parsing completes and an archive file is produced
+            - The archive file starts with a PDF header
+        See also:
+            - https://issues.apache.org/jira/browse/TIKA-4110
+        """
+        test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
+
+        util_call_with_backoff(
+            self.parser.parse,
+            [test_file, "application/vnd.oasis.opendocument.text"],
+        )
+
+        self.assertIsNotNone(self.parser.archive_path)
+        with open(self.parser.archive_path, "rb") as f:
+            # Only the first bytes are needed to recognise the PDF magic
+            # marker, so avoid reading the whole archive into memory.
+            self.assertIn(b"PDF-", f.read(10))