git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Updates the tika client library and handles the changes to it
author Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 26 Jun 2023 14:41:45 +0000 (07:41 -0700)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 26 Jun 2023 16:41:05 +0000 (10:41 -0600)
Pipfile
Pipfile.lock
src/paperless_mail/parsers.py
src/paperless_tika/parsers.py

diff --git a/Pipfile b/Pipfile
index edb0e46a95590539058a088ff8e1517f04ab5936..8a4ea03b23bb0c21e23e400cb4bfc12cff88f159 100644 (file)
--- a/Pipfile
+++ b/Pipfile
@@ -66,6 +66,7 @@ scipy = "==1.8.1"
 reportlab = "==3.6.12"
 # Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/)
 cryptography = "==40.0.1"
+httpx = "*"
 
 [dev-packages]
 # Linting
index 6bf949a7f3e6b96fabe01e69860dd6c02837a3ee..e9403c922e417f415d86b005b61f1940f22cb63e 100644 (file)
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3"
+            "sha256": "e63cdbb928210fc4dcf0554bde381abd0ff956923ae03ab9f6984025cd5a454d"
         },
         "pipfile-spec": 6,
         "requires": {},
                 "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
                 "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
             ],
-            "markers": "python_version >= '3.7'",
+            "index": "pypi",
             "version": "==0.24.1"
         },
         "humanfriendly": {
         },
         "tika-client": {
             "hashes": [
-                "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
-                "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
+                "sha256:6110bd73eaa133f9c8eb1ef2566e6c0c8123a0e4efbcfb85b86f8c1b26cb4de2",
+                "sha256:e8eaa52771c72426f5531c53dcc8dfc5e3bb6e1f91f89fc93674a81bfca59d6d"
             ],
             "index": "pypi",
-            "version": "==0.1.1"
+            "version": "==0.2.0"
         },
         "tornado": {
             "hashes": [
index 1fcc8918887e7fa397e1ee223b32a5c6739c9ccb..4365d21a4dd666d5a1cff4eca64f6550386da190 100644 (file)
@@ -13,7 +13,6 @@ from humanfriendly import format_size
 from imap_tools import MailAttachment
 from imap_tools import MailMessage
 from tika_client import TikaClient
-from tika_client.data_models import TikaKey
 
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
@@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser):
             with TikaClient(tika_url=self.tika_server) as client:
                 parsed = client.tika.as_text.from_buffer(html, "text/html")
 
-                if hasattr(parsed, "content") and parsed.content is not None:
+                if parsed.content is not None:
                     return parsed.content.strip()
-                elif TikaKey.Content in parsed.data:
-                    # May not be a completely handled type, but
-                    # the Tika response may still include content
-                    return parsed.data[TikaKey.Content].strip()
                 return ""
         except Exception as err:
             raise ParseError(
index 876696633cdbe4efec2c30fbe85b8ab830bea43e..0558727f5c0973a7bc71dbdc564fd297dd0080f5 100644 (file)
@@ -4,7 +4,6 @@ from pathlib import Path
 import httpx
 from django.conf import settings
 from tika_client import TikaClient
-from tika_client.data_models import TikaKey
 
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
@@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser):
                 f"{settings.TIKA_ENDPOINT}: {err}",
             ) from err
 
-        self.text = None
-        if hasattr(parsed, "content") and parsed.content is not None:
-            self.text = parsed.content.strip()
-        elif TikaKey.Content in parsed.data:
-            # May not be a completely handled type, but
-            # the Tika response may still include content
-            self.text = parsed.data[TikaKey.Content].strip()
+        self.text = parsed.content
+        if self.text is not None:
+            self.text = self.text.strip()
 
         self.date = parsed.created
         self.archive_path = self.convert_to_pdf(document_path, file_name)