From 811bd66088bb9936198e7b8b370fd5590bf4100d Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Fri, 18 Apr 2025 11:38:36 -0700
Subject: [PATCH] Ok, restart implementing this with just azure
[ci skip]
---
src/paperless/settings.py | 8 ++
src/paperless_remote/__init__.py | 4 +
src/paperless_remote/apps.py | 14 +++
src/paperless_remote/checks.py | 15 +++
src/paperless_remote/parsers.py | 74 ++++++++++++++
src/paperless_remote/signals.py | 18 ++++
src/paperless_remote/tests/__init__.py | 0
.../tests/samples/simple-digital.pdf | Bin 0 -> 22926 bytes
src/paperless_remote/tests/test_checks.py | 29 ++++++
src/paperless_remote/tests/test_parser.py | 91 ++++++++++++++++++
10 files changed, 253 insertions(+)
create mode 100644 src/paperless_remote/__init__.py
create mode 100644 src/paperless_remote/apps.py
create mode 100644 src/paperless_remote/checks.py
create mode 100644 src/paperless_remote/parsers.py
create mode 100644 src/paperless_remote/signals.py
create mode 100644 src/paperless_remote/tests/__init__.py
create mode 100644 src/paperless_remote/tests/samples/simple-digital.pdf
create mode 100644 src/paperless_remote/tests/test_checks.py
create mode 100644 src/paperless_remote/tests/test_parser.py
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index ac5f675dd9..249423d547 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1409,3 +1409,11 @@ OUTLOOK_OAUTH_ENABLED = bool(
and OUTLOOK_OAUTH_CLIENT_ID
and OUTLOOK_OAUTH_CLIENT_SECRET,
)
+
+###############################################################################
+# Remote Parser #
+###############################################################################
+
+REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")  # e.g. "azureai"
+REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")  # None if unset
+REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")  # None if unset
diff --git a/src/paperless_remote/__init__.py b/src/paperless_remote/__init__.py
new file mode 100644
index 0000000000..5380ea5ac8
--- /dev/null
+++ b/src/paperless_remote/__init__.py
@@ -0,0 +1,4 @@
+# This import is here so that Django finds and registers the checks.
+from paperless_remote.checks import check_remote_parser_configured
+
+__all__ = ["check_remote_parser_configured"]
diff --git a/src/paperless_remote/apps.py b/src/paperless_remote/apps.py
new file mode 100644
index 0000000000..8cd3199f98
--- /dev/null
+++ b/src/paperless_remote/apps.py
@@ -0,0 +1,14 @@
+from django.apps import AppConfig
+
+from paperless_remote.signals import remote_consumer_declaration
+
+
+class PaperlessRemoteParserConfig(AppConfig):  # remote OCR parser app
+ name = "paperless_remote"
+
+ def ready(self):  # hook the parser declaration into the consumer signal
+ from documents.signals import document_consumer_declaration
+
+ document_consumer_declaration.connect(remote_consumer_declaration)
+
+ AppConfig.ready(self)  # run the base-class hook as well
diff --git a/src/paperless_remote/checks.py b/src/paperless_remote/checks.py
new file mode 100644
index 0000000000..ce72ebcc82
--- /dev/null
+++ b/src/paperless_remote/checks.py
@@ -0,0 +1,15 @@
+from django.conf import settings
+from django.core.checks import Error
+from django.core.checks import register
+
+
+@register()
+def check_remote_parser_configured(app_configs, **kwargs):  # system check
+ if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT:
+ return [
+ Error(
+ "Azure AI remote parser requires endpoint to be configured.",
+ ),
+ ]
+
+ return []  # no problems found
diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py
new file mode 100644
index 0000000000..03b53793c1
--- /dev/null
+++ b/src/paperless_remote/parsers.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+from django.conf import settings
+
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class RemoteEngineConfig:  # engine name plus its credentials
+ def __init__(
+ self,
+ engine: str,  # e.g. "azureai"
+ api_key: str | None = None,
+ endpoint: str | None = None,
+ ):
+ self.engine = engine
+ self.api_key = api_key
+ self.endpoint = endpoint
+
+ def engine_is_valid(self):  # True only when fully configured
+ valid = self.engine in ["azureai"] and self.api_key is not None
+ if self.engine == "azureai":
+ valid = valid and self.endpoint is not None  # endpoint is mandatory
+ return valid
+
+
+class RemoteDocumentParser(RasterisedDocumentParser):
+ """
+ This parser uses a remote OCR engine to parse documents
+ """
+
+ logging_name = "paperless.parsing.remote"
+
+ def get_settings(self) -> RemoteEngineConfig:
+ """
+ Build the remote engine configuration from the Django settings
+ """
+ return RemoteEngineConfig(
+ engine=settings.REMOTE_OCR_ENGINE,
+ api_key=settings.REMOTE_OCR_API_KEY,
+ endpoint=settings.REMOTE_OCR_ENDPOINT,
+ )
+
+ def supported_mime_types(self):
+ if self.settings.engine_is_valid():
+ return [
+ "application/pdf",
+ "image/png",
+ "image/jpeg",
+ "image/tiff",
+ "image/bmp",
+ "image/gif",
+ "image/webp",
+ ]
+ else:
+ return []  # nothing is offered without a valid engine
+
+ def azure_ai_vision_parse(
+ self,
+ file: Path,
+ ) -> str | None:
+ """
+ Placeholder for Azure AI Vision parsing; currently returns None
+ """
+ # TODO: Implement the Azure AI Vision API parsing logic
+
+ def parse(self, document_path: Path, mime_type, file_name=None):
+ if not self.settings.engine_is_valid():
+ self.log.warning(
+ "No valid remote parser engine is configured, content will be empty.",
+ )
+ self.text = ""
+ return  # nothing to parse with
+ elif self.settings.engine == "azureai":
+ self.text = self.azure_ai_vision_parse(document_path)
diff --git a/src/paperless_remote/signals.py b/src/paperless_remote/signals.py
new file mode 100644
index 0000000000..81955a4794
--- /dev/null
+++ b/src/paperless_remote/signals.py
@@ -0,0 +1,18 @@
+def get_parser(*args, **kwargs):  # factory for the declared parser
+ from paperless_remote.parsers import RemoteDocumentParser
+
+ return RemoteDocumentParser(*args, **kwargs)
+
+
+def get_supported_mime_types():  # mime types of a throwaway parser instance
+ from paperless_remote.parsers import RemoteDocumentParser
+
+ return RemoteDocumentParser(None).supported_mime_types()
+
+
+def remote_consumer_declaration(sender, **kwargs):  # signal receiver
+ return {
+ "parser": get_parser,
+ "weight": 5,
+ "mime_types": get_supported_mime_types(),  # computed on each call
+ }
diff --git a/src/paperless_remote/tests/__init__.py b/src/paperless_remote/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/paperless_remote/tests/samples/simple-digital.pdf b/src/paperless_remote/tests/samples/simple-digital.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e450de48269ce43785b8344c63e233a1794abae6
GIT binary patch
literal 22926
zc-ri{byQr-(g#WiK?V!%Ft{@e?gaNhaED70El40h5`w!E+=3G{5G=Sq
zl5_68=icvrPuBbAt+&ozvuE$_uCDr3@7g^*)m8Lr(z5Je4n6>VXJKbk=cmpb02h!G
z=xAmG5D@_>!yGK(RzU8D8g-DYwH+Mh0+O|Zz+uubsG~VdObp-(cY#6d0iK!cv4emR
z&Zjcxm#_4QlpX8}kR{tL0tpS}9~&ic6i$!Fi~>*2$uup%5rW5vDJRf;j6qA%Z1nm{
zGZVxhT6U(y*Sm)5m38FCKiqTEc&n8RQy`QPoTRi3JBD}_Zl_`tv`uNp%CM$fa-r-a
z+DL76!`+g`An8{HDuwN6Eb0vs7K!qJ%d%Ix^YRh}p;BSq{9hdOuMdf)J*TSzPESzY
z)PU9iFbDHL0YBXR(h
_+JL#&vq~n%qhUd`Pbpt5cGnRZDcJw
z%GSUPKG9LtkkgMdb=2GGed&=J4QBm`O}!nv8%6c-GXbxMV>M6
zBtn@}naG6iX7TG6!=aduq+1%C?^VX0@yk#*BVT4=Vq>uzUMyEF!B0BU?