This actually works

author shamoon <4887959+shamoon@users.noreply.github.com>

Fri, 18 Apr 2025 20:03:51 +0000 (13:03 -0700)

committer shamoon <4887959+shamoon@users.noreply.github.com>

Tue, 8 Jul 2025 21:19:43 +0000 (14:19 -0700)
author shamoon <4887959+shamoon@users.noreply.github.com>
Fri, 18 Apr 2025 20:03:51 +0000 (13:03 -0700)
committer shamoon <4887959+shamoon@users.noreply.github.com>
Tue, 8 Jul 2025 21:19:43 +0000 (14:19 -0700)
diff --git a/src/paperless/settings.py b/src/paperless/settings.py

index 249423d54777f1062b017059b6a037b48094722e..08b781775eb16684821a801306a6e5b274338263 100644 (file)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -324,6 +324,7 @@ INSTALLED_APPS = [
      "paperless_tesseract.apps.PaperlessTesseractConfig",
      "paperless_text.apps.PaperlessTextConfig",
      "paperless_mail.apps.PaperlessMailConfig",
+    "paperless_remote.apps.PaperlessRemoteParserConfig",
      "django.contrib.admin",
      "rest_framework",
      "rest_framework.authtoken",
diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py

index 1a04f067a7f28616789dbe8697a168328263b379..2027ea8e6dd5c04dc2963c3ab6554b3f91c989cc 100644 (file)
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@@ -64,6 +64,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
          This method uses the Azure AI Vision API to parse documents
          """
          from azure.ai.documentintelligence import DocumentIntelligenceClient
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
          from azure.core.credentials import AzureKeyCredential
  
          client = DocumentIntelligenceClient(
@@ -72,19 +73,25 @@ class RemoteDocumentParser(RasterisedDocumentParser):
          )
  
          with file.open("rb") as f:
+            analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
              poller = client.begin_analyze_document(
                  model_id="prebuilt-read",
-                analyze_request=f,
-                content_type="application/octet-stream",
-                output_format="pdf",
+                body=analyze_request,
+                output=["pdf"],  # request searchable PDF output
+                content_type="application/json",
              )
  
-        result = poller.result()
+        poller.wait()
+        result_id = poller.details["operation_id"]
  
          # Download the PDF with embedded text
-        pdf_bytes = client.get_analyze_result_pdf(result.result_id)
          self.archive_path = Path(self.tempdir) / "archive.pdf"
-        self.archive_path.write_bytes(pdf_bytes)
+        with self.archive_path.open("wb") as f:
+            for chunk in client.get_analyze_result_pdf(
+                model_id="prebuilt-read",
+                result_id=result_id,
+            ):
+                f.write(chunk)
  
          with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
              subprocess.run(
@@ -96,7 +103,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
                      tmp.name,
                  ],
              )
-            with Path.open(tmp.name, encoding="utf-8") as t:
+            with Path(tmp.name).open(encoding="utf-8") as t:
                  return t.read()
  
      def parse(self, document_path: Path, mime_type, file_name=None):
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Fri, 18 Apr 2025 20:03:51 +0000 (13:03 -0700)
committer	shamoon <4887959+shamoon@users.noreply.github.com>
	Tue, 8 Jul 2025 21:19:43 +0000 (14:19 -0700)
src/paperless/settings.py		patch | blob | blame | history
src/paperless_remote/parsers.py		patch | blob | blame | history