git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Entirely removes the optipng, updates ghostscript fall back to also use WebP. Update...
author: Trenton Holmes <holmes.trenton@gmail.com>
Sat, 11 Jun 2022 15:38:49 +0000 (08:38 -0700)
committer: Trenton Holmes <holmes.trenton@gmail.com>
Sat, 11 Jun 2022 15:38:49 +0000 (08:38 -0700)
17 files changed:
.github/workflows/reusable-ci-backend.yml
Dockerfile
docs/configuration.rst
docs/setup.rst
paperless.conf.example
src/documents/consumer.py
src/documents/management/commands/convert_thumbnails.py
src/documents/management/commands/document_archiver.py
src/documents/management/commands/document_thumbnails.py
src/documents/models.py
src/documents/parsers.py
src/documents/tests/test_consumer.py
src/documents/tests/test_management_convert_thumbnail.py
src/documents/tests/test_parsers.py
src/paperless/checks.py
src/paperless/settings.py
src/paperless/tests/test_checks.py

index 977011b2c5de2c6403a9a6f0d5e82c126ba30ed6..e872e869619fefb1995d86795442248610ad2dd2 100644 (file)
@@ -74,7 +74,7 @@ jobs:
         name: Install system dependencies
         run: |
           sudo apt-get update -qq
-          sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils
+          sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript libzbar0 poppler-utils
       -
         name: Install Python dependencies
         run: |
index 5338d8aa442773ac39c5fdaf17575e58857bd886..fda47998cd287b8e3a88c68321105b7d0959415a 100644 (file)
@@ -77,7 +77,6 @@ ARG RUNTIME_PACKAGES="\
   libraqm0 \
   libgnutls30 \
   libjpeg62-turbo \
-  optipng \
   python3 \
   python3-pip \
   python3-setuptools \
index b7ab978f473b0c11a777ab5641e269bbc348884e..a5db55927d9e8673e7f265591776add72e207cf8 100644 (file)
@@ -712,13 +712,6 @@ PAPERLESS_CONVERT_TMPDIR=<path>
 
     Default is none, which disables the temporary directory.
 
-PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
-    Use optipng to optimize thumbnails. This usually reduces the size of
-    thumbnails by about 20%, but uses considerable compute time during
-    consumption.
-
-    Defaults to true.
-
 PAPERLESS_POST_CONSUME_SCRIPT=<filename>
     After a document is consumed, Paperless can trigger an arbitrary script if
     you like.  This script will be passed a number of arguments for you to work
@@ -789,9 +782,6 @@ PAPERLESS_CONVERT_BINARY=<path>
 PAPERLESS_GS_BINARY=<path>
     Defaults to "/usr/bin/gs".
 
-PAPERLESS_OPTIPNG_BINARY=<path>
-    Defaults to "/usr/bin/optipng".
-
 
 .. _configuration-docker:
 
index 90b952e4c8b77965b0440ede7dc0c55818278b7e..b8d3ab8a3a44bcba7d2a4eecf7578cbf78e04b82 100644 (file)
@@ -286,7 +286,6 @@ writing. Windows is not and will never be supported.
 
     *   ``fonts-liberation`` for generating thumbnails for plain text files
     *   ``imagemagick`` >= 6 for PDF conversion
-    *   ``optipng`` for optimizing thumbnails
     *   ``gnupg`` for handling encrypted documents
     *   ``libpq-dev`` for PostgreSQL
     *   ``libmagic-dev`` for mime type detection
@@ -298,7 +297,7 @@ writing. Windows is not and will never be supported.
 
     .. code::
 
-        python3 python3-pip python3-dev imagemagick fonts-liberation optipng gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils
+        python3 python3-pip python3-dev imagemagick fonts-liberation gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils
 
     These dependencies are required for OCRmyPDF, which is used for text recognition.
 
@@ -730,8 +729,6 @@ configuring some options in paperless can help improve performance immensely:
 *   If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``.
     This will speed up OCR times and use less memory at the expense of slightly worse
     OCR results.
-*   Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
-    times. Thumbnails will be about 20% larger.
 *   If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to
     1. This will save some memory.
 
index 97e907e1fb11b5bfa232b1967322860416401175..bb2449e058184e0f97d8abef4ccfd42b70cac132 100644 (file)
@@ -65,7 +65,6 @@
 #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
 #PAPERLESS_CONSUMER_ENABLE_BARCODES=false
 #PAPERLESS_CONSUMER_ENABLE_BARCODES=PATCHT
-#PAPERLESS_OPTIMIZE_THUMBNAILS=true
 #PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
 #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
 #PAPERLESS_FILENAME_DATE_ORDER=YMD
@@ -84,4 +83,3 @@
 
 #PAPERLESS_CONVERT_BINARY=/usr/bin/convert
 #PAPERLESS_GS_BINARY=/usr/bin/gs
-#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng
index 5e3d01fbcf4150e6ddabaf9d2025065701d858c4..e5794ce4fb4d4f23fdbdfe8545542207a4004773 100644 (file)
@@ -273,7 +273,7 @@ class Consumer(LoggingMixin):
 
             self.log("debug", f"Generating thumbnail for {self.filename}...")
             self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
-            thumbnail = document_parser.get_optimised_thumbnail(
+            thumbnail = document_parser.get_thumbnail(
                 self.path,
                 mime_type,
                 self.filename,
index 0be4cb70219fc34839ec80505b55bba4412e6f32..089c689c98e9ad6c2c0a7c2f30e483357e765739 100644 (file)
@@ -1,4 +1,5 @@
 import logging
+import multiprocessing.pool
 import shutil
 import tempfile
 import time
@@ -8,10 +9,44 @@ from django.core.management.base import BaseCommand
 from documents.models import Document
 from documents.parsers import run_convert
 
-
 logger = logging.getLogger("paperless.management.convert_thumbnails")
 
 
+def _do_convert(work_package):
+    _, existing_thumbnail, converted_thumbnail = work_package
+    try:
+
+        logger.info(f"Converting thumbnail: {existing_thumbnail}")
+
+        # Run actual conversion
+        run_convert(
+            density=300,
+            scale="500x5000>",
+            alpha="remove",
+            strip=True,
+            trim=False,
+            auto_orient=True,
+            input_file=f"{existing_thumbnail}[0]",
+            output_file=str(converted_thumbnail),
+        )
+
+        # Copy newly created thumbnail to thumbnail directory
+        shutil.copy(converted_thumbnail, existing_thumbnail.parent)
+
+        # Remove the PNG version
+        existing_thumbnail.unlink()
+
+        logger.info(
+            "Conversion to WebP completed, "
+            f"replaced {existing_thumbnail.name} with {converted_thumbnail.name}",
+        )
+
+    except Exception as e:
+        logger.error(
+            f"Error converting thumbnail" f" (existing file unchanged): {e}",
+        )
+
+
 class Command(BaseCommand):
 
     help = """
@@ -24,21 +59,19 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
 
-        self.stdout.write("Converting all PNG thumbnails to WebP")
-
+        logger.info("Converting all PNG thumbnails to WebP")
         start = time.time()
-
         documents = Document.objects.all()
 
         with tempfile.TemporaryDirectory() as tempdir:
 
+            work_packages = []
+
             for document in documents:
                 existing_thumbnail = Path(document.thumbnail_path).resolve()
 
                 if existing_thumbnail.suffix == ".png":
 
-                    self.stdout.write(f"Converting thumbnail: {existing_thumbnail}")
-
                     # Change the existing filename suffix from png to webp
                     converted_thumbnail_name = existing_thumbnail.with_suffix(
                         ".webp",
@@ -49,46 +82,16 @@ class Command(BaseCommand):
                         Path(tempdir) / Path(converted_thumbnail_name)
                     ).resolve()
 
-                    try:
-                        # Run actual conversion
-                        run_convert(
-                            density=300,
-                            scale="500x5000>",
-                            alpha="remove",
-                            strip=True,
-                            trim=False,
-                            auto_orient=True,
-                            input_file=f"{existing_thumbnail}[0]",
-                            output_file=str(converted_thumbnail),
-                        )
-
-                        if converted_thumbnail.exists():
-                            # Copy newly created thumbnail to thumbnail directory
-                            shutil.copy(converted_thumbnail, existing_thumbnail.parent)
-
-                            # Remove the PNG version
-                            existing_thumbnail.unlink()
-
-                            self.stdout.write(
-                                self.style.SUCCESS(
-                                    "Conversion to WebP completed",
-                                ),
-                            )
-                        else:
-                            # Highly unlike to reach here
-                            self.stderr.write(
-                                self.style.WARNING("Converted thumbnail doesn't exist"),
-                            )
-
-                    except Exception as e:
-                        self.stderr.write(
-                            self.style.ERROR(
-                                f"Error converting thumbnail"
-                                f" (existing file unchanged): {e}",
-                            ),
-                        )
+                    # Package up the necessary info
+                    work_packages.append(
+                        (document, existing_thumbnail, converted_thumbnail),
+                    )
+
+            if len(work_packages):
+                with multiprocessing.pool.Pool(processes=4, maxtasksperchild=4) as pool:
+                    pool.map(_do_convert, work_packages)
 
             end = time.time()
             duration = end - start
 
-        self.stdout.write(f"Conversion completed in {duration:.3f}s")
+        logger.info(f"Conversion completed in {duration:.3f}s")
index bf0f352b55b08eac2f390eadf7fa688769816221..c51f1baeb468cc770448161b4e15a35fc40e77bf 100644 (file)
@@ -41,7 +41,7 @@ def handle_document(document_id):
     try:
         parser.parse(document.source_path, mime_type, document.get_public_filename())
 
-        thumbnail = parser.get_optimised_thumbnail(
+        thumbnail = parser.get_thumbnail(
             document.source_path,
             mime_type,
             document.get_public_filename(),
index 595d8ba3b2c25fb6b8df3a17306a2c6309aa9da7..535a0f670f99e7225e106f9fcb82e9550b0bbf93 100644 (file)
@@ -29,7 +29,7 @@ def _process_document(doc_in):
         if existing_thumbnail.exists() and existing_thumbnail.suffix == ".png":
             existing_thumbnail.unlink()
 
-        thumb = parser.get_optimised_thumbnail(
+        thumb = parser.get_thumbnail(
             document.source_path,
             document.mime_type,
             document.get_public_filename(),
index 221086ca2b79823f95ced60c8cceea32da1e80f9..9fed321c344468af6ed4f48878d9d58d98003301 100644 (file)
@@ -308,17 +308,11 @@ class Document(models.Model):
         png_file_path = os.path.join(settings.THUMBNAIL_DIR, png_file_name)
 
         # 1. Assume the thumbnail is WebP
-        if not os.path.exists(webp_file_path):
-            # 2. If WebP doesn't exist, check PNG
-            if not os.path.exists(png_file_path):
-                # 3. If PNG doesn't exist, filename is being constructed, return WebP
-                thumb = webp_file_path
-            else:
-                # 2.1 - PNG file exists, return path to it
-                thumb = png_file_path
+        if os.path.exists(png_file_path):
+            thumb = png_file_path
         else:
-            # 1.1 - WebP file exists, return path to it
             thumb = webp_file_path
+
         return os.path.normpath(thumb)
 
     @property
index bc8af0ec8238dc7422b122d594608d4a96d8e91c..721346fb0667b7cf3375aaecbc9676c8800be692 100644 (file)
@@ -150,11 +150,14 @@ def run_convert(
 
 
 def get_default_thumbnail() -> str:
+    """
+    Returns the path to a generic thumbnail
+    """
     return os.path.join(os.path.dirname(__file__), "resources", "document.png")
 
 
 def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
-    out_path = os.path.join(temp_dir, "convert_gs.png")
+    out_path = os.path.join(temp_dir, "convert_gs.webp")
 
     # if convert fails, fall back to extracting
     # the first PDF page as a PNG using Ghostscript
@@ -319,29 +322,6 @@ class DocumentParser(LoggingMixin):
         """
         raise NotImplementedError()
 
-    def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
-        thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
-        if settings.OPTIMIZE_THUMBNAILS and os.path.splitext(thumbnail)[1] == ".png":
-            out_path = os.path.join(self.tempdir, "thumb_optipng.png")
-
-            args = (
-                settings.OPTIPNG_BINARY,
-                "-silent",
-                "-o5",
-                thumbnail,
-                "-out",
-                out_path,
-            )
-
-            self.log("debug", f"Execute: {' '.join(args)}")
-
-            if not subprocess.Popen(args).wait() == 0:
-                raise ParseError(f"Optipng failed at {args}")
-
-            return out_path
-        else:
-            return thumbnail
-
     def get_text(self):
         return self.text
 
index 637c0d95e27036f2678953eb44023390ae9042fd..a770d3ff694eb6cfd530ad1a588c4cfd10555ef9 100644 (file)
@@ -183,7 +183,7 @@ class DummyParser(DocumentParser):
         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
         self.archive_path = archive_path
 
-    def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
         return self.fake_thumb
 
     def parse(self, document_path, mime_type, file_name=None):
@@ -194,7 +194,7 @@ class CopyParser(DocumentParser):
     def get_thumbnail(self, document_path, mime_type, file_name=None):
         return self.fake_thumb
 
-    def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
         return self.fake_thumb
 
     def __init__(self, logging_group, progress_callback=None):
@@ -216,7 +216,7 @@ class FaultyParser(DocumentParser):
         super().__init__(logging_group)
         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
 
-    def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
         return self.fake_thumb
 
     def parse(self, document_path, mime_type, file_name=None):
index 162f05cfe02fa4a2f91d2e6751afeaefe4be71bb..8413cec3a05ce2385c1e5e0daf547e0722fe9568 100644 (file)
@@ -137,32 +137,3 @@ class TestConvertThumbnails(TestCase):
                 run_convert_mock.assert_called_once()
                 self.assertIn("Error converting thumbnail", stderr)
                 self.assertTrue(thumb_file.exists())
-
-    @mock.patch("documents.management.commands.convert_thumbnails.run_convert")
-    def test_convert_single_thumbnail_no_output(self, run_convert_mock):
-        """
-        GIVEN:
-            - Document exists with PNG thumbnail
-        WHEN:
-            - Thumbnail conversion is attempted, but there is no output WebP
-        THEN:
-            - Single thumbnail is converted
-        """
-
-        with tempfile.TemporaryDirectory() as thumbnail_dir:
-
-            with override_settings(
-                THUMBNAIL_DIR=thumbnail_dir,
-            ):
-
-                thumb_file = self.create_png_thumbnail_file(thumbnail_dir)
-
-                stdout, stderr = self.call_command()
-
-                run_convert_mock.assert_called_once()
-                self.assertIn(f"{thumb_file}", stdout)
-                self.assertNotIn("Conversion to WebP completed", stdout)
-                self.assertIn("Converted thumbnail doesn't exist", stderr)
-
-                self.assertTrue(thumb_file.exists())
-                self.assertFalse(thumb_file.with_suffix(".webp").exists())
index 34711bca8a43e6e5d90014b34bbc08e33cd44032..1942fe0dde931128e7b6c413a82570d9b95c3112 100644 (file)
@@ -87,31 +87,6 @@ def fake_get_thumbnail(self, path, mimetype, file_name):
     return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
 
 
-class TestBaseParser(TestCase):
-    def setUp(self) -> None:
-
-        self.scratch = tempfile.mkdtemp()
-        override_settings(SCRATCH_DIR=self.scratch).enable()
-
-    def tearDown(self) -> None:
-        shutil.rmtree(self.scratch)
-
-    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
-    @override_settings(OPTIMIZE_THUMBNAILS=True)
-    def test_get_optimised_thumbnail(self):
-        parser = DocumentParser(None)
-
-        parser.get_optimised_thumbnail("any", "not important", "document.pdf")
-
-    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
-    @override_settings(OPTIMIZE_THUMBNAILS=False)
-    def test_get_optimised_thumb_disabled(self):
-        parser = DocumentParser(None)
-
-        path = parser.get_optimised_thumbnail("any", "not important", "document.pdf")
-        self.assertEqual(path, fake_get_thumbnail(None, None, None, None))
-
-
 class TestParserAvailability(TestCase):
     def test_file_extensions(self):
 
index ee9b95e09b7d047b863fe7aa1198fe396dc4a184..26d18b69222a8fff390ff35d1d3f531dd36c2a85 100644 (file)
@@ -72,7 +72,7 @@ def binaries_check(app_configs, **kwargs):
     error = "Paperless can't find {}. Without it, consumption is impossible."
     hint = "Either it's not in your ${PATH} or it's not installed."
 
-    binaries = (settings.CONVERT_BINARY, settings.OPTIPNG_BINARY, "tesseract")
+    binaries = (settings.CONVERT_BINARY, "tesseract")
 
     check_messages = []
     for binary in binaries:
index 9a5d9453d771366834e5e2dcb4423e81bc44e316..8c8aa8482c301891bc9f2ec6bec5bab066815073 100644 (file)
@@ -526,8 +526,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
 
 CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
 
-OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
-
 OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
 
 # The default language that tesseract will attempt to use when parsing
@@ -570,8 +568,6 @@ CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
 
 GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
 
-OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
-
 
 # Pre-2.x versions of Paperless stored your documents locally with GPG
 # encryption, but that is no longer the default.  This behaviour is still
index df0cb0afda30008ee9ab8bbc1d681d1bb646275d..ba45ebf7981a9c345fd8af6e7d8eef22004a2835 100644 (file)
@@ -13,9 +13,9 @@ class TestChecks(DirectoriesMixin, TestCase):
     def test_binaries(self):
         self.assertEqual(binaries_check(None), [])
 
-    @override_settings(CONVERT_BINARY="uuuhh", OPTIPNG_BINARY="forgot")
+    @override_settings(CONVERT_BINARY="uuuhh")
     def test_binaries_fail(self):
-        self.assertEqual(len(binaries_check(None)), 2)
+        self.assertEqual(len(binaries_check(None)), 1)
 
     def test_paths_check(self):
         self.assertEqual(paths_check(None), [])