From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Sun, 10 Nov 2024 11:33:47 +0000 (-0800) Subject: Fix: handle page count exception for pw-protected files (#8240) X-Git-Tag: v2.13.5~1^2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a6f4c75a72fdef3ad05452013d208ee7fed3edc7;p=thirdparty%2Fpaperless-ngx.git Fix: handle page count exception for pw-protected files (#8240) --- diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 6b9ec3d93a..95c1dbfcc6 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -43,10 +43,15 @@ class RasterisedDocumentParser(DocumentParser): def get_page_count(self, document_path, mime_type): page_count = None if mime_type == "application/pdf": - import pikepdf + try: + import pikepdf - with pikepdf.Pdf.open(document_path) as pdf: - page_count = len(pdf.pages) + with pikepdf.Pdf.open(document_path) as pdf: + page_count = len(pdf.pages) + except Exception as e: + self.log.warning( + f"Unable to determine PDF page count {document_path}: {e}", + ) return page_count def extract_metadata(self, document_path, mime_type): diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 45a5939ab7..f7490fbbf7 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -81,6 +81,24 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ) self.assertEqual(page_count, 6) + def test_get_page_count_password_protected(self): + """ + GIVEN: + - Password protected PDF file + WHEN: + - The number of pages is requested + THEN: + - The method returns None + """ + parser = RasterisedDocumentParser(uuid.uuid4()) + with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm: + page_count = parser.get_page_count( + os.path.join(self.SAMPLE_FILES, "password-protected.pdf"), + "application/pdf", + ) + self.assertEqual(page_count, None) + self.assertIn("Unable to determine PDF page count", cm.output[0]) + def test_thumbnail(self): parser = RasterisedDocumentParser(uuid.uuid4()) thumb = parser.get_thumbnail(