Ok, restart implementing this with just azure

author shamoon <4887959+shamoon@users.noreply.github.com>

Fri, 18 Apr 2025 18:38:36 +0000 (11:38 -0700)

committer shamoon <4887959+shamoon@users.noreply.github.com>

Tue, 8 Jul 2025 21:19:42 +0000 (14:19 -0700)
author shamoon <4887959+shamoon@users.noreply.github.com>
Fri, 18 Apr 2025 18:38:36 +0000 (11:38 -0700)
committer shamoon <4887959+shamoon@users.noreply.github.com>
Tue, 8 Jul 2025 21:19:42 +0000 (14:19 -0700)
diff --git a/src/paperless/settings.py b/src/paperless/settings.py

index ac5f675dd90bc385246faa551f93fe02d53449e5..249423d54777f1062b017059b6a037b48094722e 100644 (file)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1409,3 +1409,11 @@ OUTLOOK_OAUTH_ENABLED = bool(
      and OUTLOOK_OAUTH_CLIENT_ID
      and OUTLOOK_OAUTH_CLIENT_SECRET,
  )
+
+###############################################################################
+# Remote Parser                                                               #
+###############################################################################
+
+REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
+REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
+REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
diff --git a/src/paperless_remote/__init__.py b/src/paperless_remote/__init__.py

new file mode 100644 (file)

index 0000000..5380ea5
--- /dev/null
+++ b/src/paperless_remote/__init__.py
@@ -0,0 +1,4 @@
+# this is here so that django finds the checks.
+from paperless_remote.checks import check_remote_parser_configured
+
+__all__ = ["check_remote_parser_configured"]
diff --git a/src/paperless_remote/apps.py b/src/paperless_remote/apps.py

new file mode 100644 (file)

index 0000000..8cd3199
--- /dev/null
+++ b/src/paperless_remote/apps.py
@@ -0,0 +1,14 @@
+from django.apps import AppConfig
+
+from paperless_remote.signals import remote_consumer_declaration
+
+
+class PaperlessRemoteParserConfig(AppConfig):
+    name = "paperless_remote"
+
+    def ready(self):
+        from documents.signals import document_consumer_declaration
+
+        document_consumer_declaration.connect(remote_consumer_declaration)
+
+        AppConfig.ready(self)
diff --git a/src/paperless_remote/checks.py b/src/paperless_remote/checks.py

new file mode 100644 (file)

index 0000000..ce72ebc
--- /dev/null
+++ b/src/paperless_remote/checks.py
@@ -0,0 +1,15 @@
+from django.conf import settings
+from django.core.checks import Error
+from django.core.checks import register
+
+
+@register()
+def check_remote_parser_configured(app_configs, **kwargs):
+    if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT:
+        return [
+            Error(
+                "Azure AI remote parser requires endpoint to be configured.",
+            ),
+        ]
+
+    return []
diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py

new file mode 100644 (file)

index 0000000..03b5379
--- /dev/null
+++ b/src/paperless_remote/parsers.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+from django.conf import settings
+
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class RemoteEngineConfig:
+    def __init__(
+        self,
+        engine: str,
+        api_key: str | None = None,
+        endpoint: str | None = None,
+    ):
+        self.engine = engine
+        self.api_key = api_key
+        self.endpoint = endpoint
+
+    def engine_is_valid(self):
+        valid = self.engine in ["azureai"] and self.api_key is not None
+        if self.engine == "azureai":
+            valid = valid and self.endpoint is not None
+        return valid
+
+
+class RemoteDocumentParser(RasterisedDocumentParser):
+    """
+    This parser uses a remote ocr engine to parse documents
+    """
+
+    logging_name = "paperless.parsing.remote"
+
+    def get_settings(self) -> RemoteEngineConfig:
+        """
+        This parser uses the OCR configuration settings to parse documents
+        """
+        return RemoteEngineConfig(
+            engine=settings.REMOTE_OCR_ENGINE,
+            api_key=settings.REMOTE_OCR_API_KEY,
+            endpoint=settings.REMOTE_OCR_ENDPOINT,
+        )
+
+    def supported_mime_types(self):
+        if self.settings.engine_is_valid():
+            return [
+                "application/pdf",
+                "image/png",
+                "image/jpeg",
+                "image/tiff",
+                "image/bmp",
+                "image/gif",
+                "image/webp",
+            ]
+        else:
+            return []
+
+    def azure_ai_vision_parse(
+        self,
+        file: Path,
+    ) -> str | None:
+        """
+        This method uses the Azure AI Vision API to parse documents
+        """
+        # TODO: Implement the Azure AI Vision API parsing logic
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        if not self.settings.engine_is_valid():
+            self.log.warning(
+                "No valid remote parser engine is configured, content will be empty.",
+            )
+            self.text = ""
+            return
+        elif self.settings.engine == "azureai":
+            self.text = self.azure_ai_vision_parse(document_path)
diff --git a/src/paperless_remote/signals.py b/src/paperless_remote/signals.py

new file mode 100644 (file)

index 0000000..81955a4
--- /dev/null
+++ b/src/paperless_remote/signals.py
@@ -0,0 +1,18 @@
+def get_parser(*args, **kwargs):
+    from paperless_remote.parsers import RemoteDocumentParser
+
+    return RemoteDocumentParser(*args, **kwargs)
+
+
+def get_supported_mime_types():
+    from paperless_remote.parsers import RemoteDocumentParser
+
+    return RemoteDocumentParser(None).supported_mime_types()
+
+
+def remote_consumer_declaration(sender, **kwargs):
+    return {
+        "parser": get_parser,
+        "weight": 5,
+        "mime_types": get_supported_mime_types(),
+    }
diff --git a/src/paperless_remote/tests/__init__.py b/src/paperless_remote/tests/__init__.py

new file mode 100644 (file)

index 0000000..e69de29
diff --git a/src/paperless_remote/tests/samples/simple-digital.pdf b/src/paperless_remote/tests/samples/simple-digital.pdf

new file mode 100644 (file)

index 0000000..e450de4

Binary files /dev/null and b/src/paperless_remote/tests/samples/simple-digital.pdf differ
diff --git a/src/paperless_remote/tests/test_checks.py b/src/paperless_remote/tests/test_checks.py

new file mode 100644 (file)

index 0000000..b153df2
--- /dev/null
+++ b/src/paperless_remote/tests/test_checks.py
@@ -0,0 +1,29 @@
+from django.test import TestCase
+from django.test import override_settings
+
+from paperless_remote import check_remote_parser_configured
+
+
+class TestChecks(TestCase):
+    @override_settings(REMOTE_OCR_ENGINE=None)
+    def test_no_engine(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
+
+    @override_settings(REMOTE_OCR_ENGINE="azureai")
+    @override_settings(REMOTE_OCR_API_KEY="somekey")
+    @override_settings(REMOTE_OCR_ENDPOINT=None)
+    def test_azure_no_endpoint(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertTrue(
+            msgs[0].msg.startswith(
+                "Azure AI Vision remote parser requires endpoint to be configured.",
+            ),
+        )
+
+    @override_settings(REMOTE_OCR_ENGINE="something")
+    @override_settings(REMOTE_OCR_API_KEY="somekey")
+    def test_valid_configuration(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py

new file mode 100644 (file)

index 0000000..160796f
--- /dev/null
+++ b/src/paperless_remote/tests/test_parser.py
@@ -0,0 +1,91 @@
+import sys
+import uuid
+from pathlib import Path
+from unittest import mock
+
+import pytest
+from django.test import TestCase
+from django.test import override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
+from paperless_remote.parsers import RemoteDocumentParser
+
+
+class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
+
+    def assertContainsStrings(self, content, strings):
+        # Asserts that all strings appear in content, in the given order.
+        indices = []
+        for s in strings:
+            if s in content:
+                indices.append(content.index(s))
+            else:
+                self.fail(f"'{s}' is not in '{content}'")
+        self.assertListEqual(indices, sorted(indices))
+
+    @pytest.mark.skipif(
+        sys.version_info > (3, 10),
+        reason="Fails on 3.11 only on CI, for some reason",
+    )  # TODO: investigate
+    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
+    def test_get_text_with_azure(self, mock_azure_client):
+        result = mock.Mock()
+        result.content = "This is a test document."
+        result.pages = [
+            mock.Mock(
+                width=100,
+                height=100,
+                words=[
+                    mock.Mock(
+                        content="This",
+                        polygon=[
+                            mock.Mock(x=0, y=0),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="is",
+                        polygon=[
+                            mock.Mock(x=10, y=10),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="a",
+                        polygon=[
+                            mock.Mock(x=20, y=20),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="test",
+                        polygon=[
+                            mock.Mock(x=30, y=30),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="document.",
+                        polygon=[
+                            mock.Mock(x=40, y=40),
+                        ],
+                    ),
+                ],
+            ),
+        ]
+
+        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = result
+
+        with override_settings(
+            REMOTE_OCR_ENGINE="azureaivision",
+            REMOTE_OCR_API_KEY="somekey",
+            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
+        ):
+            parser = RemoteDocumentParser(uuid.uuid4())
+            parser.parse(
+                self.SAMPLE_FILES / "simple-digital.pdf",
+                "application/pdf",
+            )
+
+            self.assertContainsStrings(
+                parser.text.strip(),
+                ["This is a test document."],
+            )
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Fri, 18 Apr 2025 18:38:36 +0000 (11:38 -0700)
committer	shamoon <4887959+shamoon@users.noreply.github.com>
	Tue, 8 Jul 2025 21:19:42 +0000 (14:19 -0700)
src/paperless/settings.py		patch \| blob \| blame \| history
src/paperless_remote/__init__.py	[new file with mode: 0644]	patch \| blob
src/paperless_remote/apps.py	[new file with mode: 0644]	patch \| blob
src/paperless_remote/checks.py	[new file with mode: 0644]	patch \| blob
src/paperless_remote/parsers.py	[new file with mode: 0644]	patch \| blob
src/paperless_remote/signals.py	[new file with mode: 0644]	patch \| blob
src/paperless_remote/tests/__init__.py	[new file with mode: 0644]	patch \| blob
src/paperless_remote/tests/samples/simple-digital.pdf	[new file with mode: 0644]	patch \| blob
src/paperless_remote/tests/test_checks.py	[new file with mode: 0644]	patch \| blob
src/paperless_remote/tests/test_parser.py	[new file with mode: 0644]	patch \| blob