Update parsers.py

author shamoon <4887959+shamoon@users.noreply.github.com>

Wed, 9 Jul 2025 18:02:57 +0000 (11:02 -0700)

committer shamoon <4887959+shamoon@users.noreply.github.com>

Wed, 9 Jul 2025 18:02:57 +0000 (11:02 -0700)
author shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 9 Jul 2025 18:02:57 +0000 (11:02 -0700)
committer shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 9 Jul 2025 18:02:57 +0000 (11:02 -0700)
diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py

index 1004ead3fe302588f69d94b7857082bde8612a8f..a3d460d137e5191deb16e1715e59beff596eed4c 100644 (file)
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@@ -25,14 +25,15 @@ class RemoteEngineConfig:
  
  class RemoteDocumentParser(RasterisedDocumentParser):
      """
-    This parser uses a remote ocr engine to parse documents
+    This parser uses a remote OCR engine to parse documents. Currently, only Azure AI Vision is
+    supported, as it is the only service that provides a remote OCR API with text-embedded PDF output.
      """
  
      logging_name = "paperless.parsing.remote"
  
      def get_settings(self) -> RemoteEngineConfig:
          """
-        This parser uses the OCR configuration settings to parse documents
+        Returns the configuration for the remote OCR engine, loaded from Django settings.
          """
          return RemoteEngineConfig(
              engine=settings.REMOTE_OCR_ENGINE,
@@ -59,7 +60,11 @@ class RemoteDocumentParser(RasterisedDocumentParser):
          file: Path,
      ) -> str | None:
          """
-        This method uses the Azure AI Vision API to parse documents
+        Uses Azure AI Vision to parse the document and extract its text content.
+        A searchable PDF output with embedded text is requested from the service.
+        The resulting PDF is saved to the path held in the archive_path attribute.
+        Returns the text content extracted from the document,
+        or None if the parsing fails.
          """
          from azure.ai.documentintelligence import DocumentIntelligenceClient
          from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Wed, 9 Jul 2025 18:02:57 +0000 (11:02 -0700)
committer	shamoon <4887959+shamoon@users.noreply.github.com>
	Wed, 9 Jul 2025 18:02:57 +0000 (11:02 -0700)