git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Use output_content_format poller.result to get clean content
author: shamoon <4887959+shamoon@users.noreply.github.com>
Tue, 17 Jun 2025 19:52:48 +0000 (12:52 -0700)
committer: shamoon <4887959+shamoon@users.noreply.github.com>
Tue, 8 Jul 2025 21:19:44 +0000 (14:19 -0700)
src/paperless_remote/parsers.py

index 2027ea8e6dd5c04dc2963c3ab6554b3f91c989cc..1004ead3fe302588f69d94b7857082bde8612a8f 100644 (file)
@@ -1,5 +1,3 @@
-import subprocess
-import tempfile
 from pathlib import Path
 
 from django.conf import settings
@@ -65,6 +63,8 @@ class RemoteDocumentParser(RasterisedDocumentParser):
         """
         from azure.ai.documentintelligence import DocumentIntelligenceClient
         from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+        from azure.ai.documentintelligence.models import AnalyzeOutputOption
+        from azure.ai.documentintelligence.models import DocumentContentFormat
         from azure.core.credentials import AzureKeyCredential
 
         client = DocumentIntelligenceClient(
@@ -77,12 +77,14 @@ class RemoteDocumentParser(RasterisedDocumentParser):
             poller = client.begin_analyze_document(
                 model_id="prebuilt-read",
                 body=analyze_request,
-                output=["pdf"],  # request searchable PDF output
+                output_content_format=DocumentContentFormat.TEXT,
+                output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                 content_type="application/json",
             )
 
         poller.wait()
         result_id = poller.details["operation_id"]
+        result = poller.result()
 
         # Download the PDF with embedded text
         self.archive_path = Path(self.tempdir) / "archive.pdf"
@@ -93,18 +95,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
             ):
                 f.write(chunk)
 
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
-            subprocess.run(
-                [
-                    "pdftotext",
-                    "-q",
-                    "-layout",
-                    str(self.archive_path),
-                    tmp.name,
-                ],
-            )
-            with Path(tmp.name).open(encoding="utf-8") as t:
-                return t.read()
+        return result.content
 
     def parse(self, document_path: Path, mime_type, file_name=None):
         if not self.settings.engine_is_valid():