reportlab = "==3.6.12"
# Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/)
cryptography = "==40.0.1"
+httpx = "*"
[dev-packages]
# Linting
{
"_meta": {
"hash": {
- "sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3"
+ "sha256": "e63cdbb928210fc4dcf0554bde381abd0ff956923ae03ab9f6984025cd5a454d"
},
"pipfile-spec": 6,
"requires": {},
"sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
"sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
],
- "markers": "python_version >= '3.7'",
+ "index": "pypi",
"version": "==0.24.1"
},
"humanfriendly": {
},
"tika-client": {
"hashes": [
- "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
- "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
+ "sha256:6110bd73eaa133f9c8eb1ef2566e6c0c8123a0e4efbcfb85b86f8c1b26cb4de2",
+ "sha256:e8eaa52771c72426f5531c53dcc8dfc5e3bb6e1f91f89fc93674a81bfca59d6d"
],
"index": "pypi",
- "version": "==0.1.1"
+ "version": "==0.2.0"
},
"tornado": {
"hashes": [
from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika_client import TikaClient
-from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
with TikaClient(tika_url=self.tika_server) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
- if hasattr(parsed, "content") and parsed.content is not None:
+ if parsed.content is not None:
return parsed.content.strip()
- elif TikaKey.Content in parsed.data:
- # May not be a completely handled type, but
- # the Tika response may still include content
- return parsed.data[TikaKey.Content].strip()
return ""
except Exception as err:
raise ParseError(
import httpx
from django.conf import settings
from tika_client import TikaClient
-from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
- self.text = None
- if hasattr(parsed, "content") and parsed.content is not None:
- self.text = parsed.content.strip()
- elif TikaKey.Content in parsed.data:
- # May not be a completely handled type, but
- # the Tika response may still include content
- self.text = parsed.data[TikaKey.Content].strip()
+ self.text = parsed.content
+ if self.text is not None:
+ self.text = self.text.strip()
self.date = parsed.created
self.archive_path = self.convert_to_pdf(document_path, file_name)