git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Working around current TIKA Library Bugs
author Simon Siebert <simon@tuxedo.home.a17t.dev>
Thu, 6 Jul 2023 21:26:01 +0000 (23:26 +0200)
committer Simon Siebert <simon@tuxedo.home.a17t.dev>
Thu, 6 Jul 2023 21:26:01 +0000 (23:26 +0200)
src/paperless_tika/parsers.py

index 0558727f5c0973a7bc71dbdc564fd297dd0080f5..d69c5947a163102a56a20a0ed3d9f1fabf8a7922 100644 (file)
@@ -1,8 +1,13 @@
 import os
 from pathlib import Path
 
+import array
+
+import dateutil.parser
 import httpx
 from django.conf import settings
+from django.utils import timezone
+
 from tika_client import TikaClient
 
 from documents.parsers import DocumentParser
@@ -51,7 +56,9 @@ class TikaDocumentParser(DocumentParser):
 
         try:
             with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
-                parsed = client.tika.as_text.from_file(document_path, mime_type)
+                with open(document_path, 'rb') as f:
+                    content = f.read()
+                    parsed = client.tika.as_text.from_buffer(content, mime_type)
         except Exception as err:
             raise ParseError(
                 f"Could not parse {document_path} with tika server at "
@@ -62,7 +69,9 @@ class TikaDocumentParser(DocumentParser):
         if self.text is not None:
             self.text = self.text.strip()
 
-        self.date = parsed.created
+        tz = timezone.get_current_timezone()
+
+        self.date = timezone.make_aware(parsed.created,tz)
         self.archive_path = self.convert_to_pdf(document_path, file_name)
 
     def convert_to_pdf(self, document_path, file_name):