import httpx
from django.conf import settings
from django.utils import timezone
-
from tika_client import TikaClient
from documents.parsers import DocumentParser
try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
- with open(document_path, "rb") as f:
- content = f.read()
- parsed = client.tika.as_text.from_buffer(content, mime_type)
+ parsed = client.tika.as_text.from_file(document_path, mime_type)
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
if self.text is not None:
self.text = self.text.strip()
- tz = timezone.get_current_timezone()
+ self.date = parsed.created
+ if self.date is not None and timezone.is_naive(self.date):
+ self.date = timezone.make_aware(self.date)
- self.date = timezone.make_aware(parsed.created, tz)
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):
from pathlib import Path
from unittest import mock
+try:
+ import zoneinfo
+except ImportError:
+ from backports import zoneinfo
+
from django.test import TestCase
from django.test import override_settings
from httpx import Request
def tearDown(self) -> None:
# Per-test teardown hook: invokes cleanup() on the parser instance held in self.parser.
self.parser.cleanup()
+ @override_settings(TIME_ZONE="America/Chicago")
def test_parse(self):
# Pretend parse response
self.httpx_mock.add_response(
with open(self.parser.archive_path, "rb") as f:
self.assertEqual(f.read(), b"PDF document")
- self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21))
+ self.assertEqual(
+ self.parser.date,
+ datetime.datetime(
+ 2020,
+ 11,
+ 21,
+ tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
+ ),
+ )
def test_metadata(self):
self.httpx_mock.add_response(