Feature: Allow a user to disable the pixel limit for OCR entirely (#5996)

author Trenton H <797416+stumpylog@users.noreply.github.com>

Mon, 4 Mar 2024 22:37:36 +0000 (14:37 -0800)

committer GitHub <noreply@github.com>

Mon, 4 Mar 2024 22:37:36 +0000 (22:37 +0000)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 4 Mar 2024 22:37:36 +0000 (14:37 -0800)
committer GitHub <noreply@github.com>
Mon, 4 Mar 2024 22:37:36 +0000 (22:37 +0000)
diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md

index d4ff80f8783b47bb5c2162f08980b4f19a942dfb..863be639bbbc655d3120af00de8135869a3a9e9f 100644 (file)
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -437,7 +437,7 @@ with Prometheus, as it exports metrics. For details on its capabilities,
  refer to the [Flower](https://flower.readthedocs.io/en/latest/index.html)
  documentation.
  
-Flower can be enabled with the setting [PAPERLESS_ENABLE_FLOWER](configuration/#PAPERLESS_ENABLE_FLOWER).
+Flower can be enabled with the setting [PAPERLESS_ENABLE_FLOWER](configuration.md#PAPERLESS_ENABLE_FLOWER).
  To configure Flower further, create a `flowerconfig.py` and
  place it into the `src/paperless` directory. For a Docker
  installation, you can use volumes to accomplish this:
diff --git a/docs/configuration.md b/docs/configuration.md

index 5fd14caf19d7009336261f4749b34d28e97e3d4e..c7b710c66c2a7b101c7737e6161ce3959cadfa83 100644 (file)
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -766,6 +766,8 @@ but could result in missing text content.
      If unset, will default to the value determined by
      [Pillow](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS).
  
+    Setting this value to 0 will disable the limit entirely. See the warning below.
+
      !!! note
  
          Increasing this limit could cause Paperless to consume additional
@@ -775,7 +777,7 @@ but could result in missing text content.
      !!! warning
  
          The limit is intended to prevent malicious files from consuming
-        system resources and causing crashes and other errors. Only increase
+        system resources and causing crashes and other errors. Only change
          this value if you are certain your documents are not malicious and
          you need the text which was not OCRed.
  
diff --git a/src/paperless/migrations/0003_alter_applicationconfiguration_max_image_pixels.py b/src/paperless/migrations/0003_alter_applicationconfiguration_max_image_pixels.py

new file mode 100644 (file)

index 0000000..c27feef
--- /dev/null
+++ b/src/paperless/migrations/0003_alter_applicationconfiguration_max_image_pixels.py
@@ -0,0 +1,24 @@
+# Generated by Django 4.2.10 on 2024-03-04 17:30
+
+import django.core.validators
+from django.db import migrations
+from django.db import models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("paperless", "0002_applicationconfiguration_app_logo_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="applicationconfiguration",
+            name="max_image_pixels",
+            field=models.FloatField(
+                null=True,
+                validators=[django.core.validators.MinValueValidator(0.0)],
+                verbose_name="Sets the maximum image size for decompression",
+            ),
+        ),
+    ]
diff --git a/src/paperless/models.py b/src/paperless/models.py

index 72805dc563309f26ceb0bff86c06d90b4cce97cb..1f6cfbcedd7fdd238b25d74aff312d48de4cd87a 100644 (file)
--- a/src/paperless/models.py
+++ b/src/paperless/models.py
@@ -151,7 +151,7 @@ class ApplicationConfiguration(AbstractSingletonModel):
      max_image_pixels = models.FloatField(
          verbose_name=_("Sets the maximum image size for decompression"),
          null=True,
-        validators=[MinValueValidator(1_000_000.0)],
+        validators=[MinValueValidator(0.0)],
      )
  
      color_conversion_strategy = models.CharField(
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index 09086585eff9f94b4ba2d60952e49d0cf5578372..020922703e3aedf3fa29d7a63f24d9e2726d0eaa 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -293,20 +293,19 @@ class RasterisedDocumentParser(DocumentParser):
                      f"they will not be used. Error: {e}",
                  )
  
-        if self.settings.max_image_pixel is not None:
+        if (
+            self.settings.max_image_pixel is not None
+            and self.settings.max_image_pixel >= 0
+        ):
              # Convert pixels to mega-pixels and provide to ocrmypdf
              max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
-            if max_pixels_mpixels > 0:
-                self.log.debug(
-                    f"Calculated {max_pixels_mpixels} megapixels for OCR",
-                )
-
-                ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
-            else:
-                self.log.warning(
-                    "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
-                    "this value must be at least 1 megapixel if set",
-                )
+            msg = (
+                "OCR pixel limit is disabled!"
+                if max_pixels_mpixels == 0
+                else f"Calculated {max_pixels_mpixels} megapixels for OCR"
+            )
+            self.log.debug(msg)
+            ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
  
          return ocrmypdf_args
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Mon, 4 Mar 2024 22:37:36 +0000 (14:37 -0800)
committer	GitHub <noreply@github.com>
	Mon, 4 Mar 2024 22:37:36 +0000 (22:37 +0000)
docs/advanced_usage.md		patch \| blob \| blame \| history
docs/configuration.md		patch \| blob \| blame \| history
src/paperless/migrations/0003_alter_applicationconfiguration_max_image_pixels.py	[new file with mode: 0644]	patch \| blob
src/paperless/models.py		patch \| blob \| blame \| history
src/paperless_tesseract/parsers.py		patch \| blob \| blame \| history