import os
from pathlib import Path
+import array
+
+import dateutil.parser
import httpx
from django.conf import settings
+from django.utils import timezone
+
from tika_client import TikaClient
from documents.parsers import DocumentParser
try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
- parsed = client.tika.as_text.from_file(document_path, mime_type)
+ with open(document_path, 'rb') as f:
+ content = f.read()
+ parsed = client.tika.as_text.from_buffer(content, mime_type)
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
if self.text is not None:
self.text = self.text.strip()
- self.date = parsed.created
+ tz = timezone.get_current_timezone()
+
+ self.date = timezone.make_aware(parsed.created,tz)
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):