name: Install system dependencies
run: |
sudo apt-get update -qq
- sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils
+ sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript libzbar0 poppler-utils
-
name: Install Python dependencies
run: |
libraqm0 \
libgnutls30 \
libjpeg62-turbo \
- optipng \
python3 \
python3-pip \
python3-setuptools \
Default is none, which disables the temporary directory.
-PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
- Use optipng to optimize thumbnails. This usually reduces the size of
- thumbnails by about 20%, but uses considerable compute time during
- consumption.
-
- Defaults to true.
-
PAPERLESS_POST_CONSUME_SCRIPT=<filename>
After a document is consumed, Paperless can trigger an arbitrary script if
you like. This script will be passed a number of arguments for you to work
PAPERLESS_GS_BINARY=<path>
Defaults to "/usr/bin/gs".
-PAPERLESS_OPTIPNG_BINARY=<path>
- Defaults to "/usr/bin/optipng".
-
.. _configuration-docker:
* ``fonts-liberation`` for generating thumbnails for plain text files
* ``imagemagick`` >= 6 for PDF conversion
- * ``optipng`` for optimizing thumbnails
* ``gnupg`` for handling encrypted documents
* ``libpq-dev`` for PostgreSQL
* ``libmagic-dev`` for mime type detection
.. code::
- python3 python3-pip python3-dev imagemagick fonts-liberation optipng gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils
+ python3 python3-pip python3-dev imagemagick fonts-liberation gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils
These dependencies are required for OCRmyPDF, which is used for text recognition.
* If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``.
This will speed up OCR times and use less memory at the expense of slightly worse
OCR results.
-* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
- times. Thumbnails will be about 20% larger.
* If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to
1. This will save some memory.
#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
#PAPERLESS_CONSUMER_ENABLE_BARCODES=false
#PAPERLESS_CONSUMER_ENABLE_BARCODES=PATCHT
-#PAPERLESS_OPTIMIZE_THUMBNAILS=true
#PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_FILENAME_DATE_ORDER=YMD
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
#PAPERLESS_GS_BINARY=/usr/bin/gs
-#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng
self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
- thumbnail = document_parser.get_optimised_thumbnail(
+ thumbnail = document_parser.get_thumbnail(
self.path,
mime_type,
self.filename,
import logging
+import multiprocessing.pool
import shutil
import tempfile
import time
from documents.models import Document
from documents.parsers import run_convert
-
logger = logging.getLogger("paperless.management.convert_thumbnails")
+def _do_convert(work_package):
+    """
+    Convert one PNG thumbnail to WebP and swap it in for the original.
+
+    This is the per-item worker passed to ``multiprocessing.pool.Pool.map``
+    by the management command.
+
+    :param work_package: 3-tuple of ``(document, existing_thumbnail,
+        converted_thumbnail)``. The first element is ignored here.
+        ``existing_thumbnail`` is the path of the PNG currently in the
+        thumbnail directory and ``converted_thumbnail`` is the path the
+        WebP output is written to; both are used as ``pathlib`` paths.
+    :return: None. Any exception is logged and swallowed so that a single
+        bad thumbnail does not abort the rest of the batch.
+    """
+    _, existing_thumbnail, converted_thumbnail = work_package
+    try:
+        logger.info(f"Converting thumbnail: {existing_thumbnail}")
+
+        # Run actual conversion
+        run_convert(
+            density=300,
+            scale="500x5000>",
+            alpha="remove",
+            strip=True,
+            trim=False,
+            auto_orient=True,
+            input_file=f"{existing_thumbnail}[0]",
+            output_file=str(converted_thumbnail),
+        )
+
+        # Copy newly created thumbnail to thumbnail directory. If the
+        # conversion produced no output this raises and is handled below,
+        # before the PNG is touched.
+        shutil.copy(converted_thumbnail, existing_thumbnail.parent)
+
+        # Remove the PNG version only after the WebP copy succeeded
+        existing_thumbnail.unlink()
+
+        logger.info(
+            "Conversion to WebP completed, "
+            f"replaced {existing_thumbnail.name} with {converted_thumbnail.name}",
+        )
+
+    except Exception as e:
+        # Name the file: workers run concurrently, so log lines interleave
+        # and a bare message could not be attributed to a thumbnail.
+        logger.error(
+            f"Error converting thumbnail {existing_thumbnail}"
+            f" (existing file unchanged): {e}",
+        )
+
+
class Command(BaseCommand):
help = """
def handle(self, *args, **options):
- self.stdout.write("Converting all PNG thumbnails to WebP")
-
+ logger.info("Converting all PNG thumbnails to WebP")
start = time.time()
-
documents = Document.objects.all()
with tempfile.TemporaryDirectory() as tempdir:
+ work_packages = []
+
for document in documents:
existing_thumbnail = Path(document.thumbnail_path).resolve()
if existing_thumbnail.suffix == ".png":
- self.stdout.write(f"Converting thumbnail: {existing_thumbnail}")
-
# Change the existing filename suffix from png to webp
converted_thumbnail_name = existing_thumbnail.with_suffix(
".webp",
Path(tempdir) / Path(converted_thumbnail_name)
).resolve()
- try:
- # Run actual conversion
- run_convert(
- density=300,
- scale="500x5000>",
- alpha="remove",
- strip=True,
- trim=False,
- auto_orient=True,
- input_file=f"{existing_thumbnail}[0]",
- output_file=str(converted_thumbnail),
- )
-
- if converted_thumbnail.exists():
- # Copy newly created thumbnail to thumbnail directory
- shutil.copy(converted_thumbnail, existing_thumbnail.parent)
-
- # Remove the PNG version
- existing_thumbnail.unlink()
-
- self.stdout.write(
- self.style.SUCCESS(
- "Conversion to WebP completed",
- ),
- )
- else:
- # Highly unlike to reach here
- self.stderr.write(
- self.style.WARNING("Converted thumbnail doesn't exist"),
- )
-
- except Exception as e:
- self.stderr.write(
- self.style.ERROR(
- f"Error converting thumbnail"
- f" (existing file unchanged): {e}",
- ),
- )
+ # Package up the necessary info
+ work_packages.append(
+ (document, existing_thumbnail, converted_thumbnail),
+ )
+
+ if len(work_packages):
+ with multiprocessing.pool.Pool(processes=4, maxtasksperchild=4) as pool:
+ pool.map(_do_convert, work_packages)
end = time.time()
duration = end - start
- self.stdout.write(f"Conversion completed in {duration:.3f}s")
+ logger.info(f"Conversion completed in {duration:.3f}s")
try:
parser.parse(document.source_path, mime_type, document.get_public_filename())
- thumbnail = parser.get_optimised_thumbnail(
+ thumbnail = parser.get_thumbnail(
document.source_path,
mime_type,
document.get_public_filename(),
if existing_thumbnail.exists() and existing_thumbnail.suffix == ".png":
existing_thumbnail.unlink()
- thumb = parser.get_optimised_thumbnail(
+ thumb = parser.get_thumbnail(
document.source_path,
document.mime_type,
document.get_public_filename(),
png_file_path = os.path.join(settings.THUMBNAIL_DIR, png_file_name)
- # 1. Assume the thumbnail is WebP
+ # Prefer a legacy PNG thumbnail if one still exists, otherwise use the WebP path
- if not os.path.exists(webp_file_path):
- # 2. If WebP doesn't exist, check PNG
- if not os.path.exists(png_file_path):
- # 3. If PNG doesn't exist, filename is being constructed, return WebP
- thumb = webp_file_path
- else:
- # 2.1 - PNG file exists, return path to it
- thumb = png_file_path
+ if os.path.exists(png_file_path):
+ thumb = png_file_path
else:
- # 1.1 - WebP file exists, return path to it
thumb = webp_file_path
+
return os.path.normpath(thumb)
@property
def get_default_thumbnail() -> str:
+ """
+ Returns the path to a generic thumbnail
+ """
return os.path.join(os.path.dirname(__file__), "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
- out_path = os.path.join(temp_dir, "convert_gs.png")
+ out_path = os.path.join(temp_dir, "convert_gs.webp")
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
"""
raise NotImplementedError()
- def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
- thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
- if settings.OPTIMIZE_THUMBNAILS and os.path.splitext(thumbnail)[1] == ".png":
- out_path = os.path.join(self.tempdir, "thumb_optipng.png")
-
- args = (
- settings.OPTIPNG_BINARY,
- "-silent",
- "-o5",
- thumbnail,
- "-out",
- out_path,
- )
-
- self.log("debug", f"Execute: {' '.join(args)}")
-
- if not subprocess.Popen(args).wait() == 0:
- raise ParseError(f"Optipng failed at {args}")
-
- return out_path
- else:
- return thumbnail
-
def get_text(self):
return self.text
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
self.archive_path = archive_path
- def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
+ def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None):
def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
- def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
+ def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def __init__(self, logging_group, progress_callback=None):
super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
- def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
+ def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None):
run_convert_mock.assert_called_once()
self.assertIn("Error converting thumbnail", stderr)
self.assertTrue(thumb_file.exists())
-
- @mock.patch("documents.management.commands.convert_thumbnails.run_convert")
- def test_convert_single_thumbnail_no_output(self, run_convert_mock):
- """
- GIVEN:
- - Document exists with PNG thumbnail
- WHEN:
- - Thumbnail conversion is attempted, but there is no output WebP
- THEN:
- - Single thumbnail is converted
- """
-
- with tempfile.TemporaryDirectory() as thumbnail_dir:
-
- with override_settings(
- THUMBNAIL_DIR=thumbnail_dir,
- ):
-
- thumb_file = self.create_png_thumbnail_file(thumbnail_dir)
-
- stdout, stderr = self.call_command()
-
- run_convert_mock.assert_called_once()
- self.assertIn(f"{thumb_file}", stdout)
- self.assertNotIn("Conversion to WebP completed", stdout)
- self.assertIn("Converted thumbnail doesn't exist", stderr)
-
- self.assertTrue(thumb_file.exists())
- self.assertFalse(thumb_file.with_suffix(".webp").exists())
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
-class TestBaseParser(TestCase):
- def setUp(self) -> None:
-
- self.scratch = tempfile.mkdtemp()
- override_settings(SCRATCH_DIR=self.scratch).enable()
-
- def tearDown(self) -> None:
- shutil.rmtree(self.scratch)
-
- @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
- @override_settings(OPTIMIZE_THUMBNAILS=True)
- def test_get_optimised_thumbnail(self):
- parser = DocumentParser(None)
-
- parser.get_optimised_thumbnail("any", "not important", "document.pdf")
-
- @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
- @override_settings(OPTIMIZE_THUMBNAILS=False)
- def test_get_optimised_thumb_disabled(self):
- parser = DocumentParser(None)
-
- path = parser.get_optimised_thumbnail("any", "not important", "document.pdf")
- self.assertEqual(path, fake_get_thumbnail(None, None, None, None))
-
-
class TestParserAvailability(TestCase):
def test_file_extensions(self):
error = "Paperless can't find {}. Without it, consumption is impossible."
hint = "Either it's not in your ${PATH} or it's not installed."
- binaries = (settings.CONVERT_BINARY, settings.OPTIPNG_BINARY, "tesseract")
+ binaries = (settings.CONVERT_BINARY, "tesseract")
check_messages = []
for binary in binaries:
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
-OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
-
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
# The default language that tesseract will attempt to use when parsing
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
-OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
-
# Pre-2.x versions of Paperless stored your documents locally with GPG
# encryption, but that is no longer the default. This behaviour is still
def test_binaries(self):
self.assertEqual(binaries_check(None), [])
- @override_settings(CONVERT_BINARY="uuuhh", OPTIPNG_BINARY="forgot")
+ @override_settings(CONVERT_BINARY="uuuhh")
def test_binaries_fail(self):
- self.assertEqual(len(binaries_check(None)), 2)
+ self.assertEqual(len(binaries_check(None)), 1)
def test_paths_check(self):
self.assertEqual(paths_check(None), [])