git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Adds specific handling for CCITT Group 4, which pikepdf decodes, but not correctly (ref: 1745/head)
author: Trenton Holmes <holmes.trenton@gmail.com>
Thu, 6 Oct 2022 02:58:40 +0000 (19:58 -0700)
committer: Trenton H <holmes.trenton@gmail.com>
Tue, 11 Oct 2022 20:51:14 +0000 (13:51 -0700)
src/documents/barcodes.py
src/documents/tests/samples/barcodes/barcode-fax-image.pdf [new file with mode: 0644]
src/documents/tests/test_barcodes.py

index 54db83c1906865e5a445c569b971952c87e97110..a4be126a5501265a02e6f22fa6ef50126dba9306 100644 (file)
@@ -20,6 +20,10 @@ from pyzbar import pyzbar
 logger = logging.getLogger("paperless.barcodes")
 
 
+class BarcodeImageFormatError(Exception):
+    pass
+
+
 @lru_cache(maxsize=8)
 def supported_file_type(mime_type) -> bool:
     """
@@ -115,6 +119,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                 for image_key in page.images:
                     pdfimage = PdfImage(page.images[image_key])
 
+                    if "/CCITTFaxDecode" in pdfimage.filters:
+                        raise BarcodeImageFormatError()
+
                     # Not all images can be transcoded to a PIL image, which
                     # is what pyzbar expects to receive
                     pillow_img = pdfimage.as_pil_image()
diff --git a/src/documents/tests/samples/barcodes/barcode-fax-image.pdf b/src/documents/tests/samples/barcodes/barcode-fax-image.pdf
new file mode 100644 (file)
index 0000000..2e248c8
Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-fax-image.pdf differ
index 0f16845d201aff6f927492f177d58c0ef5fd5d54..ee8df9f349eaee7c2ad715087f981a23657a0b3a 100644 (file)
@@ -226,7 +226,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         WHEN:
             - The image tries to be transcoded to a PIL image, but fails
         THEN:
-            - The barcode reader is still called, as
+            - The barcode reader is still called
         """
 
         def _build_device_n_pdf(self, save_path: str):
@@ -279,6 +279,26 @@ class TestBarcode(DirectoriesMixin, TestCase):
 
                 reader.assert_called()
 
+    def test_scan_file_for_separating_barcodes_fax_decode(self):
+        """
+        GIVEN:
+            - A PDF containing an image encoded as CCITT Group 4 encoding
+        WHEN:
+            - Barcode processing happens with the file
+        THEN:
+            - The barcode is still detected
+        """
+        test_file = os.path.join(
+            self.BARCODE_SAMPLE_DIR,
+            "barcode-fax-image.pdf",
+        )
+        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
+            test_file,
+        )
+
+        self.assertEqual(pdf_file, test_file)
+        self.assertListEqual(separator_page_numbers, [1])
+
     def test_scan_file_for_separating_qr_barcodes(self):
         test_file = os.path.join(
             self.BARCODE_SAMPLE_DIR,