More smoothly handle the case of a password protected PDF for barcodes

author Trenton H <holmes.trenton@gmail.com>

Mon, 24 Oct 2022 20:16:14 +0000 (13:16 -0700)

committer Trenton H <holmes.trenton@gmail.com>

Mon, 24 Oct 2022 20:16:14 +0000 (13:16 -0700)
author Trenton H <holmes.trenton@gmail.com>
Mon, 24 Oct 2022 20:16:14 +0000 (13:16 -0700)
committer Trenton H <holmes.trenton@gmail.com>
Mon, 24 Oct 2022 20:16:14 +0000 (13:16 -0700)
diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py

index 13e78e1813b366bbabf84dd9f3f129859a01aaf7..1f5e33d376cc2d2322119ab3caff66c275890055 100644 (file)
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -10,9 +10,12 @@ from typing import Tuple
  import magic
  from django.conf import settings
  from pdf2image import convert_from_path
+from pdf2image.exceptions import PDFPageCountError
  from pikepdf import Page
+from pikepdf import PasswordError
  from pikepdf import Pdf
  from pikepdf import PdfImage
+from pikepdf.models.image import HifiPrintImageNotTranscodableError
  from PIL import Image
  from PIL import ImageSequence
  from pyzbar import pyzbar
@@ -120,7 +123,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                      pdfimage = PdfImage(page.images[image_key])
  
                      if "/CCITTFaxDecode" in pdfimage.filters:
-                        raise BarcodeImageFormatError()
+                        raise BarcodeImageFormatError(
+                            "Unable to decode CCITTFaxDecode images",
+                        )
  
                      # Not all images can be transcoded to a PIL image, which
                      # is what pyzbar expects to receive
@@ -132,7 +137,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                          separator_page_numbers.append(page_num)
  
      def _pdf2image_barcode_scan(pdf_filepath: str):
-        # use a temporary directory in case the file os too big to handle in memory
+        # use a temporary directory in case the file is too big to handle in memory
          with tempfile.TemporaryDirectory() as path:
              pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
              for current_page_number, page in enumerate(pages_from_path):
@@ -150,20 +155,42 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
          if mime_type == "image/tiff":
              pdf_filepath = convert_from_tiff_to_pdf(filepath)
  
+        # Chose the scanner
          if settings.CONSUMER_USE_LEGACY_DETECTION:
-            _pdf2image_barcode_scan(pdf_filepath)
+            logger.debug("Using pdf2image for barcodes")
+            scanner_function = _pdf2image_barcode_scan
          else:
-            try:
-                _pikepdf_barcode_scan(pdf_filepath)
-            except Exception as e:
+            logger.debug("Using pikepdf for barcodes")
+            scanner_function = _pikepdf_barcode_scan
  
-                logger.warning(
-                    f"Exception using pikepdf for barcodes,"
-                    f" falling back to pdf2image: {e}",
-                )
-                # Reset this incase pikepdf got part way through
+        # Run the scanner
+        try:
+            scanner_function(pdf_filepath)
+        # Neither method can handle password protected PDFs without it being
+        # provided.  Log it and continue
+        except (PasswordError, PDFPageCountError) as e:
+            logger.warning(
+                f"File is likely password protected, not splitting: {e}",
+            )
+        # Handle pikepdf related image decoding issues with a fallback
+        except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
+            logger.warning(
+                f"Falling back to pdf2image because: {e}",
+            )
+            try:
                  separator_page_numbers = []
                  _pdf2image_barcode_scan(pdf_filepath)
+            # This file is really borked, allow the consumption to continue
+            # but it may fail further on
+            except Exception as e:  # pragma: no cover
+                logger.warning(
+                    f"Exception during barcode scanning: {e}",
+                )
+        # We're not sure what happened, but allow the consumption to continue
+        except Exception as e:  # pragma: no cover
+            logger.warning(
+                f"Exception during barcode scanning: {e}",
+            )
  
      else:
          logger.warning(
diff --git a/src/documents/tests/samples/password-is-test.pdf b/src/documents/tests/samples/password-is-test.pdf

new file mode 100755 (executable)

index 0000000..b16b023

Binary files /dev/null and b/src/documents/tests/samples/password-is-test.pdf differ
diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py

index 1c4ab7cc34fd03e07a4b68fb0db9495d7401683c..f4e5fce14f051c1ebc2c902d29e7572928f6145c 100644 (file)
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -174,7 +174,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
          self.assertEqual(pdf_file, test_file)
          self.assertListEqual(separator_page_numbers, [0])
  
-    def test_scan_file_for_separating_barcodes2(self):
+    def test_scan_file_for_separating_barcodes_none_present(self):
          test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
          pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
              test_file,
@@ -585,3 +585,40 @@ class TestBarcode(DirectoriesMixin, TestCase):
  
          with mock.patch("documents.tasks.async_to_sync"):
              self.assertEqual(tasks.consume_file(dst), "File successfully split")
+
+    def test_scan_file_for_separating_barcodes_password_pikepdf(self):
+        """
+        GIVEN:
+            - Password protected PDF
+            - pikepdf based scanning
+        WHEN:
+            - File is scanned for barcode
+        THEN:
+            - Scanning handle the exception without exception
+        """
+        test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
+        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
+            test_file,
+        )
+
+        self.assertEqual(pdf_file, test_file)
+        self.assertListEqual(separator_page_numbers, [])
+
+    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
+    def test_scan_file_for_separating_barcodes_password_pdf2image(self):
+        """
+        GIVEN:
+            - Password protected PDF
+            - pdf2image based scanning
+        WHEN:
+            - File is scanned for barcode
+        THEN:
+            - Scanning handle the exception without exception
+        """
+        test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
+        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
+            test_file,
+        )
+
+        self.assertEqual(pdf_file, test_file)
+        self.assertListEqual(separator_page_numbers, [])
author	Trenton H <holmes.trenton@gmail.com>
	Mon, 24 Oct 2022 20:16:14 +0000 (13:16 -0700)
committer	Trenton H <holmes.trenton@gmail.com>
	Mon, 24 Oct 2022 20:16:14 +0000 (13:16 -0700)
src/documents/barcodes.py		patch \| blob \| blame \| history
src/documents/tests/samples/password-is-test.pdf	[new file with mode: 0755]	patch \| blob
src/documents/tests/test_barcodes.py		patch \| blob \| blame \| history