},
"tika-client": {
"hashes": [
- "sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156",
- "sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610"
+ "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
+ "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
],
"index": "pypi",
- "version": "==0.0.3"
+ "version": "==0.1.1"
},
"tornado": {
"hashes": [
from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika_client import TikaClient
+from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
with TikaClient(tika_url=self.tika_server) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
- if "X-TIKA:content" in parsed.data:
- return parsed.data["X-TIKA:content"].strip()
+ if hasattr(parsed, "content") and parsed.content is not None:
+ return parsed.content.strip()
+ elif TikaKey.Content in parsed.data:
+ # May not be a completely handled type, but
+ # the Tika response may still include content
+ return parsed.data[TikaKey.Content].strip()
return ""
except Exception as err:
raise ParseError(
import httpx
from django.conf import settings
from tika_client import TikaClient
+from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
- self.text = parsed.content.strip()
- self.date = parsed.metadata.created
+ self.text = None
+ if hasattr(parsed, "content") and parsed.content is not None:
+ self.text = parsed.content.strip()
+ elif TikaKey.Content in parsed.data:
+ # May not be a completely handled type, but
+ # the Tika response may still include content
+ self.text = parsed.data[TikaKey.Content].strip()
+
+ self.date = parsed.created
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):
self.assertTrue(b"PDF-" in f.read()[:10])
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
+
+ def test_basic_parse_doc(self):
+ """
+ GIVEN:
+ - An input DOC format document
+ WHEN:
+ - The document is parsed
+ THEN:
+ - Document content is correct
+ - An archive file is produced and contains a PDF marker
+
+ NOTE(review): the parsed document date is not asserted by this test.
+ """
+ test_file = self.SAMPLE_DIR / "sample.doc"
+
+ # Parse the sample using the "application/msword" MIME type
+ self.try_parse_with_wait(
+ test_file,
+ "application/msword",
+ )
+
+ # The extracted text must contain the expected sentence fragment
+ self.assertIn(
+ "his is a test document, saved in the older .doc format",
+ self.parser.text,
+ )
+ # An archive path must be set, and "PDF-" must appear within the
+ # first 10 bytes of the file it points to
+ self.assertIsNotNone(self.parser.archive_path)
+ with open(self.parser.archive_path, "rb") as f:
+ self.assertTrue(b"PDF-" in f.read()[:10])