and OUTLOOK_OAUTH_CLIENT_ID
and OUTLOOK_OAUTH_CLIENT_SECRET,
)
+
+###############################################################################
+# Remote Parser #
+###############################################################################
+
+# Remote OCR engine selection. "azureai" is the only engine the remote parser
+# accepts; with no engine set the parser advertises no MIME types.
+REMOTE_OCR_ENGINE = os.environ.get("PAPERLESS_REMOTE_OCR_ENGINE")
+# API key and service endpoint for the selected engine (the Azure AI engine
+# needs both to be considered valid).
+REMOTE_OCR_API_KEY = os.environ.get("PAPERLESS_REMOTE_OCR_API_KEY")
+REMOTE_OCR_ENDPOINT = os.environ.get("PAPERLESS_REMOTE_OCR_ENDPOINT")
--- /dev/null
+# Imported here so that Django finds and registers the system check.
+from paperless_remote.checks import check_remote_parser_configured
+
+__all__ = ["check_remote_parser_configured"]
--- /dev/null
+from django.apps import AppConfig
+
+from paperless_remote.signals import remote_consumer_declaration
+
+
+class PaperlessRemoteParserConfig(AppConfig):
+    """App configuration that hooks the remote OCR parser into the consumer."""
+
+    name = "paperless_remote"
+
+    def ready(self):
+        """Connect the remote parser declaration to the consumer signal."""
+        # NOTE(review): imported inside ready() rather than at module level,
+        # presumably so the documents app is loaded first -- confirm.
+        from documents.signals import document_consumer_declaration
+
+        document_consumer_declaration.connect(remote_consumer_declaration)
+
+        # Cooperative call instead of naming the base class explicitly.
+        super().ready()
--- /dev/null
+from django.conf import settings
+from django.core.checks import Error
+from django.core.checks import register
+
+
+@register()
+def check_remote_parser_configured(app_configs, **kwargs):
+    """
+    System check for the remote parser configuration.
+
+    Returns a single Error when the "azureai" engine is selected but no
+    endpoint is set, and an empty list otherwise. ``app_configs`` and
+    ``kwargs`` are accepted but not used.
+    """
+    azure_selected = settings.REMOTE_OCR_ENGINE == "azureai"
+    # The endpoint is only looked at when the Azure engine is selected.
+    if not azure_selected or settings.REMOTE_OCR_ENDPOINT:
+        return []
+
+    missing_endpoint = Error(
+        "Azure AI remote parser requires endpoint to be configured.",
+    )
+    return [missing_endpoint]
--- /dev/null
+from pathlib import Path
+
+from django.conf import settings
+
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class RemoteEngineConfig:
+    """
+    Connection settings for a remote OCR engine.
+
+    Attributes are stored exactly as passed in:
+    engine   -- engine identifier; "azureai" is the only supported value
+    api_key  -- credential for the engine, or None
+    endpoint -- service URL for the engine, or None
+    """
+
+    def __init__(
+        self,
+        engine: str,
+        api_key: str | None = None,
+        endpoint: str | None = None,
+    ):
+        self.engine = engine
+        self.api_key = api_key
+        self.endpoint = endpoint
+
+    def engine_is_valid(self) -> bool:
+        """
+        Return True when the engine is supported and fully configured.
+
+        Empty strings count as "not configured", the same way the system
+        check treats an empty endpoint; an environment variable that is set
+        but blank must not enable the parser.
+        """
+        # Every supported engine needs an API key.
+        if self.engine not in ("azureai",) or not self.api_key:
+            return False
+        # Azure AI additionally needs the service endpoint.
+        if self.engine == "azureai":
+            return bool(self.endpoint)
+        return True
+
+
+class RemoteDocumentParser(RasterisedDocumentParser):
+    """
+    Document parser that hands OCR off to a remote engine.
+    """
+
+    logging_name = "paperless.parsing.remote"
+
+    def get_settings(self) -> RemoteEngineConfig:
+        """
+        Build the engine configuration from the REMOTE_OCR_* Django settings.
+        """
+        return RemoteEngineConfig(
+            engine=settings.REMOTE_OCR_ENGINE,
+            api_key=settings.REMOTE_OCR_API_KEY,
+            endpoint=settings.REMOTE_OCR_ENDPOINT,
+        )
+
+    def supported_mime_types(self):
+        """
+        Return the MIME types this parser handles, or an empty list when no
+        valid engine is configured.
+        """
+        # NOTE(review): self.settings is presumably populated by the base
+        # class from get_settings() -- confirm.
+        if not self.settings.engine_is_valid():
+            return []
+
+        return [
+            "application/pdf",
+            "image/png",
+            "image/jpeg",
+            "image/tiff",
+            "image/bmp",
+            "image/gif",
+            "image/webp",
+        ]
+
+    def azure_ai_vision_parse(self, file: Path) -> str | None:
+        """
+        Parse the given file with the Azure AI Vision API.
+
+        Not implemented yet: always returns None.
+        """
+        # TODO: Implement the Azure AI Vision API parsing logic
+        return None
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        """
+        Store the text extracted from ``document_path`` in ``self.text``.
+
+        Without a valid engine a warning is logged and the text is set to an
+        empty string. ``mime_type`` and ``file_name`` are not used here.
+        """
+        if not self.settings.engine_is_valid():
+            self.log.warning(
+                "No valid remote parser engine is configured, content will be empty.",
+            )
+            self.text = ""
+            return
+
+        if self.settings.engine == "azureai":
+            self.text = self.azure_ai_vision_parse(document_path)
--- /dev/null
+def get_parser(*args, **kwargs):
+    """Create a RemoteDocumentParser, forwarding all arguments to it."""
+    # Imported at call time rather than when this module is loaded.
+    from paperless_remote import parsers as remote_parsers
+
+    return remote_parsers.RemoteDocumentParser(*args, **kwargs)
+
+
+def get_supported_mime_types():
+    """Return the MIME types the remote parser reports as supported."""
+    # Imported at call time rather than when this module is loaded.
+    from paperless_remote import parsers as remote_parsers
+
+    # A throwaway parser, built with None as its only argument, is used just
+    # to query the list.
+    # NOTE(review): if the base parser constructor allocates resources (for
+    # example a scratch directory), nothing here releases them -- confirm.
+    probe = remote_parsers.RemoteDocumentParser(None)
+    return probe.supported_mime_types()
+
+
+def remote_consumer_declaration(sender, **kwargs):
+    """
+    Signal receiver that describes the remote parser to the consumer.
+
+    Returns a dict with the parser factory, its weight and the MIME types it
+    supports. ``sender`` and ``kwargs`` are not used.
+    """
+    declaration = {"parser": get_parser, "weight": 5}
+    # Computed on every call, not cached at import time.
+    declaration["mime_types"] = get_supported_mime_types()
+    return declaration
--- /dev/null
+from django.test import TestCase
+from django.test import override_settings
+
+from paperless_remote import check_remote_parser_configured
+
+
+class TestChecks(TestCase):
+    """Tests for the remote parser system check."""
+
+    @override_settings(REMOTE_OCR_ENGINE=None)
+    def test_no_engine(self):
+        """With no engine configured the check reports nothing."""
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
+
+    @override_settings(
+        REMOTE_OCR_ENGINE="azureai",
+        REMOTE_OCR_API_KEY="somekey",
+        REMOTE_OCR_ENDPOINT=None,
+    )
+    def test_azure_no_endpoint(self):
+        """The Azure engine without an endpoint yields exactly one error."""
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 1)
+        # Must match the message emitted by check_remote_parser_configured.
+        self.assertTrue(
+            msgs[0].msg.startswith(
+                "Azure AI remote parser requires endpoint to be configured.",
+            ),
+        )
+
+    @override_settings(
+        REMOTE_OCR_ENGINE="something",
+        REMOTE_OCR_API_KEY="somekey",
+    )
+    def test_valid_configuration(self):
+        """A non-Azure engine name passes the check without errors."""
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
--- /dev/null
+import sys
+import uuid
+from pathlib import Path
+from unittest import mock
+
+import pytest
+from django.test import TestCase
+from django.test import override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
+from paperless_remote.parsers import RemoteDocumentParser
+
+
+class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    """Tests for RemoteDocumentParser."""
+
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
+
+    def assertContainsStrings(self, content, strings):
+        """Assert that all strings appear in content, in the given order."""
+        indices = []
+        for s in strings:
+            if s in content:
+                indices.append(content.index(s))
+            else:
+                self.fail(f"'{s}' is not in '{content}'")
+        self.assertListEqual(indices, sorted(indices))
+
+    # ">= (3, 11)" rather than "> (3, 10)": version_info is a 5-tuple, so the
+    # latter is also true on every 3.10.x release and would skip there too.
+    @pytest.mark.skipif(
+        sys.version_info >= (3, 11),
+        reason="Fails on 3.11 only on CI, for some reason",
+    )  # TODO: investigate
+    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
+    def test_get_text_with_azure(self, mock_azure_client):
+        """Parsing a PDF with the Azure engine yields the mocked content."""
+        # NOTE(review): this can only pass once azure_ai_vision_parse returns
+        # the analysis result's content instead of being a stub.
+        result = mock.Mock()
+        result.content = "This is a test document."
+        # A single page whose words sit on a diagonal, 10 units apart.
+        result.pages = [
+            mock.Mock(
+                width=100,
+                height=100,
+                words=[
+                    mock.Mock(
+                        content=word,
+                        polygon=[mock.Mock(x=offset, y=offset)],
+                    )
+                    for offset, word in zip(
+                        range(0, 50, 10),
+                        ["This", "is", "a", "test", "document."],
+                    )
+                ],
+            ),
+        ]
+
+        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = result
+
+        # "azureai" is the engine name the parser configuration accepts.
+        with override_settings(
+            REMOTE_OCR_ENGINE="azureai",
+            REMOTE_OCR_API_KEY="somekey",
+            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
+        ):
+            parser = RemoteDocumentParser(uuid.uuid4())
+            parser.parse(
+                self.SAMPLE_FILES / "simple-digital.pdf",
+                "application/pdf",
+            )
+
+            self.assertContainsStrings(
+                parser.text.strip(),
+                ["This is a test document."],
+            )