implement PAPERLESS_OCR_MAX_IMAGE_PIXELS

author Henning Häcker <henning.haecker+github.com@gmail.com>

Sat, 19 Mar 2022 00:03:45 +0000 (01:03 +0100)

committer Johann Bauer <bauerj@bauerj.eu>

Wed, 30 Mar 2022 07:23:45 +0000 (09:23 +0200)
author Henning Häcker <henning.haecker+github.com@gmail.com>
Sat, 19 Mar 2022 00:03:45 +0000 (01:03 +0100)
committer Johann Bauer <bauerj@bauerj.eu>
Wed, 30 Mar 2022 07:23:45 +0000 (09:23 +0200)
diff --git a/docs/configuration.rst b/docs/configuration.rst

index 82a14ae52b79a33bb41c5168fde2547925db6999..ab35b49d452f3004d9625cf2d24a1ca56e5a27d5 100644 (file)
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -389,6 +389,15 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
      Default is none, which will automatically calculate image DPI so that
      the produced PDF documents are A4 sized.
  
+PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
+    Paperless will not OCR images that have more pixels than this limit.
+    This is intended to prevent decompression bombs from overloading paperless.
+    Increase this limit if you encounter a DecompressionBombError for a file
+    that is not malicious; this can be caused, for example, by incorrectly
+    detected image metadata.
+    If you have enough resources, or if you are certain that your uploaded files
+    are not malicious, you can raise this value as needed.
+    The default value is 256000000.
  
  PAPERLESS_OCR_USER_ARGS=<json>
      OCRmyPDF offers many more options. Use this parameter to specify any
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index ad167ecf06d3db67343bdd5cb249843a97fadbf9..4065890eccae1968d390ba6a5eee6c2fb9a26fb3 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -8,6 +8,7 @@ from documents.parsers import make_thumbnail_from_pdf
  from documents.parsers import ParseError
  from PIL import Image
  
+Image.MAX_IMAGE_PIXELS = os.environ.get('PAPERLESS_OCR_MAX_IMAGE_PIXELS', Image.MAX_IMAGE_PIXELS)
  
  class NoTextFoundException(Exception):
      pass
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py

index a0f19c020d03243956a0bec204eaeec3bef09dec..e41e25e762a765380b593a4f9ee9fe4cc3ff5c88 100644 (file)
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -6,6 +6,7 @@ from PIL import Image
  from PIL import ImageDraw
  from PIL import ImageFont
  
+Image.MAX_IMAGE_PIXELS = os.environ.get('PAPERLESS_OCR_MAX_IMAGE_PIXELS', Image.MAX_IMAGE_PIXELS)
  
  class TextDocumentParser(DocumentParser):
      """
author	Henning Häcker <henning.haecker+github.com@gmail.com>
	Sat, 19 Mar 2022 00:03:45 +0000 (01:03 +0100)
committer	Johann Bauer <bauerj@bauerj.eu>
	Wed, 30 Mar 2022 07:23:45 +0000 (09:23 +0200)
docs/configuration.rst		patch \| blob \| blame \| history
src/paperless_tesseract/parsers.py		patch \| blob \| blame \| history
src/paperless_text/parsers.py		patch \| blob \| blame \| history