"src/paperless_mail/tests/",
"src/paperless_tesseract/tests/",
"src/paperless_tika/tests",
+ "src/paperless_text/tests/",
]
addopts = [
"--pythonwarnings=all",
logging_name = "paperless.parsing.text"
def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
- text = self.read_file_handle_unicode_errors(document_path)
+ # Never load the whole file: files above file_size_limit are not read at all, smaller ones are read up to max_chars characters
+ max_chars = 100_000
+ file_size_limit = 50 * 1024 * 1024
+
+ if document_path.stat().st_size > file_size_limit:
+ text = "[File too large to preview]"
+ else:
+ with Path(document_path).open("r", encoding="utf-8", errors="replace") as f:
+ text = f.read(max_chars)
img = Image.new("RGB", (500, 700), color="white")
draw = ImageDraw.Draw(img)
size=20,
layout_engine=ImageFont.Layout.BASIC,
)
- draw.text((5, 5), text, font=font, fill="black")
+ draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)
out_path = self.tempdir / "thumb.webp"
img.save(out_path, format="WEBP")
+import tempfile
from pathlib import Path
from paperless_text.parsers import TextDocumentParser
assert text_parser.get_text() == "Pantothens�ure\n"
assert text_parser.get_archive_path() is None
+
+ def test_thumbnail_large_file(self, text_parser: TextDocumentParser):
+ """
+ GIVEN:
+ - A very large text file (>50MB)
+ WHEN:
+ - A thumbnail is requested
+ THEN:
+ - A thumbnail is created without reading the entire file into memory
+ """
+ with tempfile.NamedTemporaryFile(
+ delete=False,
+ mode="w",
+ encoding="utf-8",
+ suffix=".txt",
+ ) as tmp:
+ tmp.write("A" * (51 * 1024 * 1024)) # 51 MB of 'A'
+ large_file = Path(tmp.name)
+
+ thumb = text_parser.get_thumbnail(large_file, "text/plain")
+ assert thumb.exists()
+ assert thumb.is_file()
+ large_file.unlink()