-import subprocess
-import tempfile
from pathlib import Path
from django.conf import settings
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+ from azure.ai.documentintelligence.models import AnalyzeOutputOption
+ from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
poller = client.begin_analyze_document(
model_id="prebuilt-read",
body=analyze_request,
- output=["pdf"], # request searchable PDF output
+ output_content_format=DocumentContentFormat.TEXT,
+ output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json",
)
poller.wait()
result_id = poller.details["operation_id"]
+ result = poller.result()
# Download the PDF with embedded text
self.archive_path = Path(self.tempdir) / "archive.pdf"
):
f.write(chunk)
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
- subprocess.run(
- [
- "pdftotext",
- "-q",
- "-layout",
- str(self.archive_path),
- tmp.name,
- ],
- )
- with Path(tmp.name).open(encoding="utf-8") as t:
- return t.read()
+ return result.content
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():