def get_page_count(self, document_path, mime_type):
    """
    Return the number of pages in a PDF document, if it can be determined.

    Args:
        document_path: Path of the document to inspect.
        mime_type: MIME type of the document; only "application/pdf"
            is inspected.

    Returns:
        The page count for a readable PDF, or None when the document is
        not a PDF or the count could not be determined (for example
        because the file could not be opened).
    """
    page_count = None
    if mime_type == "application/pdf":
        # Best effort: a document whose pages cannot be counted must not
        # abort the caller, so every failure is logged and reported as None.
        try:
            # Imported lazily; because the import sits inside the try, a
            # failing import is also reported as a warning instead of raising.
            import pikepdf

            with pikepdf.Pdf.open(document_path) as pdf:
                page_count = len(pdf.pages)
        except Exception as e:
            self.log.warning(
                f"Unable to determine PDF page count {document_path}: {e}",
            )
    return page_count
def extract_metadata(self, document_path, mime_type):
)
self.assertEqual(page_count, 6)
def test_get_page_count_password_protected(self):
    """
    GIVEN:
        - Password protected PDF file
    WHEN:
        - The number of pages is requested
    THEN:
        - The method returns None
        - A warning about the undetermined page count is logged
    """
    parser = RasterisedDocumentParser(uuid.uuid4())
    # assertLogs fails the test by itself if no WARNING (or higher) record
    # is emitted on this logger while the block runs.
    with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
        page_count = parser.get_page_count(
            os.path.join(self.SAMPLE_FILES, "password-protected.pdf"),
            "application/pdf",
        )
        self.assertIsNone(page_count)
        self.assertIn("Unable to determine PDF page count", cm.output[0])
+
def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(