Adds new setting to control color conversions (#4709)

author Trenton H <797416+stumpylog@users.noreply.github.com>

Wed, 29 Nov 2023 20:18:44 +0000 (12:18 -0800)

committer GitHub <noreply@github.com>

Wed, 29 Nov 2023 20:18:44 +0000 (12:18 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Wed, 29 Nov 2023 20:18:44 +0000 (12:18 -0800)
committer GitHub <noreply@github.com>
Wed, 29 Nov 2023 20:18:44 +0000 (12:18 -0800)
diff --git a/docs/configuration.md b/docs/configuration.md

index e952ec41bba4b855bbe885a287d7c97902eac844..c0e8022ac3148b50047ecaee975ff853c11669ab 100644 (file)
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -704,6 +704,20 @@ but could result in missing text content.
          this value if you are certain your documents are not malicious and
          you need the text which was not OCRed
  
+#### [`PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY=<RGB>`](#PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY) {#PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY}
+
+: Controls the Ghostscript color conversion strategy when creating the archive file. This setting
+will only be utilized if the output is a version of PDF/A.
+
+    Valid options are `CMYK`, `Gray`, `LeaveColorUnchanged`, `RGB` or `UseDeviceIndependentColor`.
+
+    For details on each option, see [Color conversion and management](https://ghostscript.readthedocs.io/en/latest/VectorDevices.html#color-conversion-and-management) in the Ghostscript documentation.
+
+    !!! warning
+
+        Utilizing some of the options may result in errors when creating archive
+        files from PDFs.
+
  #### [`PAPERLESS_OCR_USER_ARGS=<json>`](#PAPERLESS_OCR_USER_ARGS) {#PAPERLESS_OCR_USER_ARGS}
  
  : OCRmyPDF offers many more options. Use this parameter to specify any
diff --git a/src/paperless/settings.py b/src/paperless/settings.py

index 86f1f569fb82c667cc0f9f0c9f38c8ce4e8e23ba..9daeb8a47a3ad613db5e6d687ffabfae4c08558d 100644 (file)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -864,6 +864,11 @@ OCR_MAX_IMAGE_PIXELS: Optional[int] = None
  if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
      OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
  
+OCR_COLOR_CONVERSION_STRATEGY = os.getenv(
+    "PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY",
+    "RGB",
+)
+
  OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
  
  # GNUPG needs a home directory for some reason
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index 3523da7bde4a482abad0f76aaa03f1d664e2ea63..babcf6bcfd360d15e86ac966e0d550ea22057602 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -186,6 +186,11 @@ class RasterisedDocumentParser(DocumentParser):
              "progress_bar": False,
          }
  
+        if "pdfa" in ocrmypdf_args["output_type"]:
+            ocrmypdf_args[
+                "color_conversion_strategy"
+            ] = settings.OCR_COLOR_CONVERSION_STRATEGY
+
          if settings.OCR_MODE == "force" or safe_fallback:
              ocrmypdf_args["force_ocr"] = True
          elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Wed, 29 Nov 2023 20:18:44 +0000 (12:18 -0800)
committer	GitHub <noreply@github.com>
	Wed, 29 Nov 2023 20:18:44 +0000 (12:18 -0800)
docs/configuration.md		patch \| blob \| blame \| history
src/paperless/settings.py		patch \| blob \| blame \| history
src/paperless_tesseract/parsers.py		patch \| blob \| blame \| history