From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Fri, 1 Aug 2025 17:26:35 +0000 (-0400) Subject: Fixhancement: improve text thumbnail generation for large files (#10483) X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;ds=inline;p=thirdparty%2Fpaperless-ngx.git Fixhancement: improve text thumbnail generation for large files (#10483) --- diff --git a/pyproject.toml b/pyproject.toml index 45edf60620..1a98266b7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -239,6 +239,7 @@ testpaths = [ "src/paperless_mail/tests/", "src/paperless_tesseract/tests/", "src/paperless_tika/tests", + "src/paperless_text/tests/", ] addopts = [ "--pythonwarnings=all", diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 58df11d7aa..4e37ccd822 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -16,7 +16,15 @@ class TextDocumentParser(DocumentParser): logging_name = "paperless.parsing.text" def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path: - text = self.read_file_handle_unicode_errors(document_path) + # Avoid reading entire file into memory + max_chars = 100_000 + file_size_limit = 50 * 1024 * 1024 + + if document_path.stat().st_size > file_size_limit: + text = "[File too large to preview]" + else: + with Path(document_path).open("r", encoding="utf-8", errors="replace") as f: + text = f.read(max_chars) img = Image.new("RGB", (500, 700), color="white") draw = ImageDraw.Draw(img) @@ -25,7 +33,7 @@ class TextDocumentParser(DocumentParser): size=20, layout_engine=ImageFont.Layout.BASIC, ) - draw.text((5, 5), text, font=font, fill="black") + draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4) out_path = self.tempdir / "thumb.webp" img.save(out_path, format="WEBP") diff --git a/src/paperless_text/tests/test_parser.py b/src/paperless_text/tests/test_parser.py index 0f8cc19bac..5fea367444 100644 --- a/src/paperless_text/tests/test_parser.py +++ b/src/paperless_text/tests/test_parser.py @@ -1,3 +1,4 @@ +import tempfile from pathlib import Path from paperless_text.parsers import TextDocumentParser @@ -35,3 +36,26 @@ class TestTextParser: assert text_parser.get_text() == "Pantothens�ure\n" assert text_parser.get_archive_path() is None + + def test_thumbnail_large_file(self, text_parser: TextDocumentParser): + """ + GIVEN: + - A very large text file (>50MB) + WHEN: + - A thumbnail is requested + THEN: + - A thumbnail is created without reading the entire file into memory + """ + with tempfile.NamedTemporaryFile( + delete=False, + mode="w", + encoding="utf-8", + suffix=".txt", + ) as tmp: + tmp.write("A" * (51 * 1024 * 1024)) # 51 MB of 'A' + large_file = Path(tmp.name) + + thumb = text_parser.get_thumbnail(large_file, "text/plain") + assert thumb.exists() + assert thumb.is_file() + large_file.unlink()