Adds better error handling/checking around getting content of a document via Tika

author Trenton Holmes <797416+stumpylog@users.noreply.github.com>

Sun, 18 Jun 2023 14:04:53 +0000 (07:04 -0700)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Sun, 18 Jun 2023 15:39:17 +0000 (08:39 -0700)
author Trenton Holmes <797416+stumpylog@users.noreply.github.com>
Sun, 18 Jun 2023 14:04:53 +0000 (07:04 -0700)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Sun, 18 Jun 2023 15:39:17 +0000 (08:39 -0700)
diff --git a/Pipfile.lock b/Pipfile.lock

index d948729efeeba25542a9f5ab9a65ae4ea1ec0b0c..6bf949a7f3e6b96fabe01e69860dd6c02837a3ee 100644 (file)
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1746,11 +1746,11 @@
          },
          "tika-client": {
              "hashes": [
-                "sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156",
-                "sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610"
+                "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
+                "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
              ],
              "index": "pypi",
-            "version": "==0.0.3"
+            "version": "==0.1.1"
          },
          "tornado": {
              "hashes": [
diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py

index 3ec3e64a09e4dc0bbc5c178f0848a7998938edf4..f7daa758e4dd950b78727a9d307e142fe7a32385 100644 (file)
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -13,6 +13,7 @@ from humanfriendly import format_size
  from imap_tools import MailAttachment
  from imap_tools import MailMessage
  from tika_client import TikaClient
+from tika_client.data_models import TikaKey
  
  from documents.parsers import DocumentParser
  from documents.parsers import ParseError
@@ -172,8 +173,12 @@ class MailDocumentParser(DocumentParser):
              with TikaClient(tika_url=self.tika_server) as client:
                  parsed = client.tika.as_text.from_buffer(html, "text/html")
  
-                if "X-TIKA:content" in parsed.data:
-                    return parsed.data["X-TIKA:content"].strip()
+                if hasattr(parsed, "content") and parsed.content is not None:
+                    return parsed.content.strip()
+                elif TikaKey.Content in parsed.data:
+                    # May not be a completely handled type, but
+                    # the Tika response may still include content
+                    return parsed.data[TikaKey.Content].strip()
                  return ""
          except Exception as err:
              raise ParseError(
diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py

index 10447ff534bda8a15dc8b39dc12694e76b959533..8b476bfd81d386d22848b8bbbc0f6c675e6980fb 100644 (file)
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -4,6 +4,7 @@ from pathlib import Path
  import httpx
  from django.conf import settings
  from tika_client import TikaClient
+from tika_client.data_models import TikaKey
  
  from documents.parsers import DocumentParser
  from documents.parsers import ParseError
@@ -58,8 +59,15 @@ class TikaDocumentParser(DocumentParser):
                  f"{settings.TIKA_ENDPOINT}: {err}",
              ) from err
  
-        self.text = parsed.content.strip()
-        self.date = parsed.metadata.created
+        self.text = None
+        if hasattr(parsed, "content") and parsed.content is not None:
+            self.text = parsed.content.strip()
+        elif TikaKey.Content in parsed.data:
+            # May not be a completely handled type, but
+            # the Tika response may still include content
+            self.text = parsed.data[TikaKey.Content].strip()
+
+        self.date = parsed.created
          self.archive_path = self.convert_to_pdf(document_path, file_name)
  
      def convert_to_pdf(self, document_path, file_name):
diff --git a/src/paperless_tika/tests/samples/sample.doc b/src/paperless_tika/tests/samples/sample.doc

new file mode 100644 (file)

index 0000000..72178a7

Binary files /dev/null and b/src/paperless_tika/tests/samples/sample.doc differ
diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py

index 9a83614b1f040a691bea083e2db9ac4a5c63a6e9..f4c8e01349285ca9ae407574aa1b73ec1325f0ab 100644 (file)
--- a/src/paperless_tika/tests/test_live_tika.py
+++ b/src/paperless_tika/tests/test_live_tika.py
@@ -118,3 +118,28 @@ class TestTikaParserAgainstServer(TestCase):
              self.assertTrue(b"PDF-" in f.read()[:10])
  
          # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
+
+    def test_basic_parse_doc(self):
+        """
+        GIVEN:
+            - An input DOC format document
+        WHEN:
+            - The document is parsed
+        THEN:
+            - Document content is correct
+            - Document date is correct
+        """
+        test_file = self.SAMPLE_DIR / "sample.doc"
+
+        self.try_parse_with_wait(
+            test_file,
+            "application/msword",
+        )
+
+        self.assertIn(
+            "his is a test document, saved in the older .doc format",
+            self.parser.text,
+        )
+        self.assertIsNotNone(self.parser.archive_path)
+        with open(self.parser.archive_path, "rb") as f:
+            self.assertTrue(b"PDF-" in f.read()[:10])
author	Trenton Holmes <797416+stumpylog@users.noreply.github.com>
	Sun, 18 Jun 2023 14:04:53 +0000 (07:04 -0700)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Sun, 18 Jun 2023 15:39:17 +0000 (08:39 -0700)
Pipfile.lock		patch | blob | blame | history
src/paperless_mail/parsers.py		patch | blob | blame | history
src/paperless_tika/parsers.py		patch | blob | blame | history
src/paperless_tika/tests/samples/sample.doc	[new file with mode: 0644]	patch | blob
src/paperless_tika/tests/test_live_tika.py		patch | blob | blame | history