Fixhancement: improve text thumbnail generation for large files (#10483)

author shamoon <4887959+shamoon@users.noreply.github.com>

Fri, 1 Aug 2025 17:26:35 +0000 (13:26 -0400)

committer GitHub <noreply@github.com>

Fri, 1 Aug 2025 17:26:35 +0000 (10:26 -0700)
author shamoon <4887959+shamoon@users.noreply.github.com>
Fri, 1 Aug 2025 17:26:35 +0000 (13:26 -0400)
committer GitHub <noreply@github.com>
Fri, 1 Aug 2025 17:26:35 +0000 (10:26 -0700)
diff --git a/pyproject.toml b/pyproject.toml

index 45edf60620b8ceb73eee90fa96aeb7b2342b8a52..1a98266b7f1d617e93d06272be40ba5b297318ae 100644 (file)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -239,6 +239,7 @@ testpaths = [
    "src/paperless_mail/tests/",
    "src/paperless_tesseract/tests/",
    "src/paperless_tika/tests",
+  "src/paperless_text/tests/",
  ]
  addopts = [
    "--pythonwarnings=all",
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py

index 58df11d7aa32203852331cbfa2802448f13f5c2b..4e37ccd8224aefe8bcde6cd9b49ab925c3633cc1 100644 (file)
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -16,7 +16,15 @@ class TextDocumentParser(DocumentParser):
      logging_name = "paperless.parsing.text"
  
      def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
-        text = self.read_file_handle_unicode_errors(document_path)
+        # Avoid reading entire file into memory
+        max_chars = 100_000
+        file_size_limit = 50 * 1024 * 1024
+
+        if document_path.stat().st_size > file_size_limit:
+            text = "[File too large to preview]"
+        else:
+            with Path(document_path).open("r", encoding="utf-8", errors="replace") as f:
+                text = f.read(max_chars)
  
          img = Image.new("RGB", (500, 700), color="white")
          draw = ImageDraw.Draw(img)
@@ -25,7 +33,7 @@ class TextDocumentParser(DocumentParser):
              size=20,
              layout_engine=ImageFont.Layout.BASIC,
          )
-        draw.text((5, 5), text, font=font, fill="black")
+        draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)
  
          out_path = self.tempdir / "thumb.webp"
          img.save(out_path, format="WEBP")
diff --git a/src/paperless_text/tests/test_parser.py b/src/paperless_text/tests/test_parser.py

index 0f8cc19bace1d58c43150c746856ef7e12c01375..5fea36744402358619d340efc219d08491b3087b 100644 (file)
--- a/src/paperless_text/tests/test_parser.py
+++ b/src/paperless_text/tests/test_parser.py
@@ -1,3 +1,4 @@
+import tempfile
  from pathlib import Path
  
  from paperless_text.parsers import TextDocumentParser
@@ -35,3 +36,26 @@ class TestTextParser:
  
          assert text_parser.get_text() == "Pantothens�ure\n"
          assert text_parser.get_archive_path() is None
+
+    def test_thumbnail_large_file(self, text_parser: TextDocumentParser):
+        """
+        GIVEN:
+            - A very large text file (>50MB)
+        WHEN:
+            - A thumbnail is requested
+        THEN:
+            - A thumbnail is created without reading the entire file into memory
+        """
+        with tempfile.NamedTemporaryFile(
+            delete=False,
+            mode="w",
+            encoding="utf-8",
+            suffix=".txt",
+        ) as tmp:
+            tmp.write("A" * (51 * 1024 * 1024))  # 51 MB of 'A'
+            large_file = Path(tmp.name)
+
+            thumb = text_parser.get_thumbnail(large_file, "text/plain")
+            assert thumb.exists()
+            assert thumb.is_file()
+            large_file.unlink()
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Fri, 1 Aug 2025 17:26:35 +0000 (13:26 -0400)
committer	GitHub <noreply@github.com>
	Fri, 1 Aug 2025 17:26:35 +0000 (10:26 -0700)
pyproject.toml		patch | blob | blame | history
src/paperless_text/parsers.py		patch | blob | blame | history
src/paperless_text/tests/test_parser.py		patch | blob | blame | history