git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Sets the timezone of creation, if the date is known and naive
author Trenton Holmes <797416+stumpylog@users.noreply.github.com>
Thu, 3 Aug 2023 16:52:39 +0000 (09:52 -0700)
committer Trenton Holmes <797416+stumpylog@users.noreply.github.com>
Thu, 3 Aug 2023 16:57:52 +0000 (09:57 -0700)
src/paperless_tika/parsers.py
src/paperless_tika/tests/test_tika_parser.py

index 0ba59d3f6232cbd31fa6e1a7188c96e6cca028a8..b6a9dd621ab4420b2f15795f8e2e77d92795b00d 100644 (file)
@@ -4,7 +4,6 @@ from pathlib import Path
 import httpx
 from django.conf import settings
 from django.utils import timezone
-
 from tika_client import TikaClient
 
 from documents.parsers import DocumentParser
@@ -53,9 +52,7 @@ class TikaDocumentParser(DocumentParser):
 
         try:
             with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
-                with open(document_path, "rb") as f:
-                    content = f.read()
-                    parsed = client.tika.as_text.from_buffer(content, mime_type)
+                parsed = client.tika.as_text.from_file(document_path, mime_type)
         except Exception as err:
             raise ParseError(
                 f"Could not parse {document_path} with tika server at "
@@ -66,9 +63,10 @@ class TikaDocumentParser(DocumentParser):
         if self.text is not None:
             self.text = self.text.strip()
 
-        tz = timezone.get_current_timezone()
+        self.date = parsed.created
+        if self.date is not None and timezone.is_naive(self.date):
+            self.date = timezone.make_aware(self.date)
 
-        self.date = timezone.make_aware(parsed.created, tz)
         self.archive_path = self.convert_to_pdf(document_path, file_name)
 
     def convert_to_pdf(self, document_path, file_name):
index 8ba8e0e79c4a638f6dcfe9014611f4e06df54758..4f64afc04bb900a05255e893be4c9aa0471b33cf 100644 (file)
@@ -3,6 +3,11 @@ import os
 from pathlib import Path
 from unittest import mock
 
+try:
+    import zoneinfo
+except ImportError:
+    from backports import zoneinfo
+
 from django.test import TestCase
 from django.test import override_settings
 from httpx import Request
@@ -21,6 +26,7 @@ class TestTikaParser(HttpxMockMixin, TestCase):
     def tearDown(self) -> None:
         self.parser.cleanup()
 
+    @override_settings(TIME_ZONE="America/Chicago")
     def test_parse(self):
         # Pretend parse response
         self.httpx_mock.add_response(
@@ -44,7 +50,15 @@ class TestTikaParser(HttpxMockMixin, TestCase):
         with open(self.parser.archive_path, "rb") as f:
             self.assertEqual(f.read(), b"PDF document")
 
-        self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21))
+        self.assertEqual(
+            self.parser.date,
+            datetime.datetime(
+                2020,
+                11,
+                21,
+                tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
+            ),
+        )
 
     def test_metadata(self):
         self.httpx_mock.add_response(