git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Fix: handle page count exception for pw-protected files (#8240)
author: shamoon <4887959+shamoon@users.noreply.github.com>
Sun, 10 Nov 2024 11:33:47 +0000 (03:33 -0800)
committer: GitHub <noreply@github.com>
Sun, 10 Nov 2024 11:33:47 +0000 (03:33 -0800)
src/paperless_tesseract/parsers.py
src/paperless_tesseract/tests/test_parser.py

index 6b9ec3d93af93d451da77af4ef798624d47c4083..95c1dbfcc6b75805fc9811dbbd4d6b42b9d00e32 100644 (file)
@@ -43,10 +43,15 @@ class RasterisedDocumentParser(DocumentParser):
     def get_page_count(self, document_path, mime_type):
         page_count = None
         if mime_type == "application/pdf":
-            import pikepdf
+            try:
+                import pikepdf
 
-            with pikepdf.Pdf.open(document_path) as pdf:
-                page_count = len(pdf.pages)
+                with pikepdf.Pdf.open(document_path) as pdf:
+                    page_count = len(pdf.pages)
+            except Exception as e:
+                self.log.warning(
+                    f"Unable to determine PDF page count {document_path}: {e}",
+                )
         return page_count
 
     def extract_metadata(self, document_path, mime_type):
index 45a5939ab7f6c421f1e8f6507145bf41adda1a0a..f7490fbbf7f67700c19b767c188363f67d9648af 100644 (file)
@@ -81,6 +81,24 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         )
         self.assertEqual(page_count, 6)
 
+    def test_get_page_count_password_protected(self):
+        """
+        GIVEN:
+            - Password protected PDF file
+        WHEN:
+            - The number of pages is requested
+        THEN:
+            - The method returns None
+        """
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
+            page_count = parser.get_page_count(
+                os.path.join(self.SAMPLE_FILES, "password-protected.pdf"),
+                "application/pdf",
+            )
+            self.assertEqual(page_count, None)
+            self.assertIn("Unable to determine PDF page count", cm.output[0])
+
     def test_thumbnail(self):
         parser = RasterisedDocumentParser(uuid.uuid4())
         thumb = parser.get_thumbnail(