Use output_content_format and poller.result() to get clean content

author shamoon <4887959+shamoon@users.noreply.github.com>

Tue, 17 Jun 2025 19:52:48 +0000 (12:52 -0700)

committer shamoon <4887959+shamoon@users.noreply.github.com>

Tue, 8 Jul 2025 21:19:44 +0000 (14:19 -0700)
author shamoon <4887959+shamoon@users.noreply.github.com>
Tue, 17 Jun 2025 19:52:48 +0000 (12:52 -0700)
committer shamoon <4887959+shamoon@users.noreply.github.com>
Tue, 8 Jul 2025 21:19:44 +0000 (14:19 -0700)
diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py

index 2027ea8e6dd5c04dc2963c3ab6554b3f91c989cc..1004ead3fe302588f69d94b7857082bde8612a8f 100644 (file)
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@@ -1,5 +1,3 @@
-import subprocess
-import tempfile
  from pathlib import Path
  
  from django.conf import settings
@@ -65,6 +63,8 @@ class RemoteDocumentParser(RasterisedDocumentParser):
          """
          from azure.ai.documentintelligence import DocumentIntelligenceClient
          from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+        from azure.ai.documentintelligence.models import AnalyzeOutputOption
+        from azure.ai.documentintelligence.models import DocumentContentFormat
          from azure.core.credentials import AzureKeyCredential
  
          client = DocumentIntelligenceClient(
@@ -77,12 +77,14 @@ class RemoteDocumentParser(RasterisedDocumentParser):
              poller = client.begin_analyze_document(
                  model_id="prebuilt-read",
                  body=analyze_request,
-                output=["pdf"],  # request searchable PDF output
+                output_content_format=DocumentContentFormat.TEXT,
+                output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                  content_type="application/json",
              )
  
          poller.wait()
          result_id = poller.details["operation_id"]
+        result = poller.result()
  
          # Download the PDF with embedded text
          self.archive_path = Path(self.tempdir) / "archive.pdf"
@@ -93,18 +95,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
              ):
                  f.write(chunk)
  
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
-            subprocess.run(
-                [
-                    "pdftotext",
-                    "-q",
-                    "-layout",
-                    str(self.archive_path),
-                    tmp.name,
-                ],
-            )
-            with Path(tmp.name).open(encoding="utf-8") as t:
-                return t.read()
+        return result.content
  
      def parse(self, document_path: Path, mime_type, file_name=None):
          if not self.settings.engine_is_valid():
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Tue, 17 Jun 2025 19:52:48 +0000 (12:52 -0700)
committer	shamoon <4887959+shamoon@users.noreply.github.com>
	Tue, 8 Jul 2025 21:19:44 +0000 (14:19 -0700)