Always use pikepdf, then pdf2image if needed to check for barcodes instead of requiri...

author Trenton H <797416+stumpylog@users.noreply.github.com>

Wed, 9 Nov 2022 16:50:34 +0000 (08:50 -0800)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Wed, 9 Nov 2022 21:01:39 +0000 (13:01 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Wed, 9 Nov 2022 16:50:34 +0000 (08:50 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Wed, 9 Nov 2022 21:01:39 +0000 (13:01 -0800)
diff --git a/docs/configuration.rst b/docs/configuration.rst

index 8a3a252529b3259016b1ea24cdcd2eaecd46ad88..f9c4f89cc9885c2db9055e0e814ae38c1f84e123 100644 (file)
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
  
      Defaults to false.
  
-PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
-    Enables the legacy method of detecting barcodes.  By default, images are
-    extracted directly from the PDF structure for barcode detection.  If this
-    configuration value is set, images of the whole PDF page will be used instead.
-
-    This is a slower and more memory intensive process, but may be required for
-    certain files, depending on how it is produced and how images are encoded.
-
-    Defaults to false.
-
  
  PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
      Whether TIFF image files should be scanned for barcodes.
diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py

index 1f5e33d376cc2d2322119ab3caff66c275890055..9533742a882f4fe6f1633da5af7f9c6334147ec8 100644 (file)
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -10,12 +10,10 @@ from typing import Tuple
  import magic
  from django.conf import settings
  from pdf2image import convert_from_path
-from pdf2image.exceptions import PDFPageCountError
  from pikepdf import Page
  from pikepdf import PasswordError
  from pikepdf import Pdf
  from pikepdf import PdfImage
-from pikepdf.models.image import HifiPrintImageNotTranscodableError
  from PIL import Image
  from PIL import ImageSequence
  from pyzbar import pyzbar
@@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
                  images[0].save(newpath)
              else:
                  images[0].save(newpath, save_all=True, append_images=images[1:])
-        except OSError as e:
+        except OSError as e:  # pragma: no cover
              logger.warning(
                  f"Could not save the file as pdf. Error: {str(e)}",
              )
@@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                  for image_key in page.images:
                      pdfimage = PdfImage(page.images[image_key])
  
+                    # This type is known to have issues:
+                    # https://github.com/pikepdf/pikepdf/issues/401
                      if "/CCITTFaxDecode" in pdfimage.filters:
                          raise BarcodeImageFormatError(
                              "Unable to decode CCITTFaxDecode images",
                          )
  
                      # Not all images can be transcoded to a PIL image, which
-                    # is what pyzbar expects to receive
+                    # is what pyzbar expects to receive, so this may
+                    # raise an exception, triggering fallback
                      pillow_img = pdfimage.as_pil_image()
  
                      detected_barcodes = barcode_reader(pillow_img)
@@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
          if mime_type == "image/tiff":
              pdf_filepath = convert_from_tiff_to_pdf(filepath)
  
-        # Chose the scanner
-        if settings.CONSUMER_USE_LEGACY_DETECTION:
-            logger.debug("Using pdf2image for barcodes")
-            scanner_function = _pdf2image_barcode_scan
-        else:
-            logger.debug("Using pikepdf for barcodes")
-            scanner_function = _pikepdf_barcode_scan
-
-        # Run the scanner
+        # Always try pikepdf first, it's usually fine, faster and
+        # uses less memory
          try:
-            scanner_function(pdf_filepath)
-        # Neither method can handle password protected PDFs without it being
-        # provided.  Log it and continue
-        except (PasswordError, PDFPageCountError) as e:
+            _pikepdf_barcode_scan(pdf_filepath)
+        # Password protected files can't be checked
+        except PasswordError as e:
              logger.warning(
-                f"File is likely password protected, not splitting: {e}",
+                f"File is likely password protected, not checking for barcodes: {e}",
              )
-        # Handle pikepdf related image decoding issues with a fallback
-        except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
+        # Handle pikepdf related image decoding issues with a fallback to page
+        # by page conversion to images in a temporary directory
+        except Exception as e:
              logger.warning(
                  f"Falling back to pdf2image because: {e}",
              )
              try:
+                # Clear the list in case some processing worked
                  separator_page_numbers = []
                  _pdf2image_barcode_scan(pdf_filepath)
              # This file is really borked, allow the consumption to continue
@@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                  logger.warning(
                      f"Exception during barcode scanning: {e}",
                  )
-        # We're not sure what happened, but allow the consumption to continue
-        except Exception as e:  # pragma: no cover
-            logger.warning(
-                f"Exception during barcode scanning: {e}",
-            )
  
      else:
          logger.warning(
diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py

index f4e5fce14f051c1ebc2c902d29e7572928f6145c..ef086626d67c5732f85c3cbd8eb79966e2ddcc83 100644 (file)
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase):
          self.assertTrue(os.path.isfile(target_file1))
          self.assertTrue(os.path.isfile(target_file2))
  
-    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
-    def test_barcode_splitter_legacy_fallback(self):
-        """
-        GIVEN:
-            - File containing barcode
-            - Legacy method of detection is enabled
-        WHEN:
-            - File is scanned for barcodes
-        THEN:
-            - Barcodes are properly detected
-        """
-        test_file = os.path.join(
-            self.BARCODE_SAMPLE_DIR,
-            "patch-code-t-middle.pdf",
-        )
-        tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
-
-        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
-            test_file,
-        )
-
-        self.assertEqual(test_file, pdf_file)
-        self.assertTrue(len(separator_page_numbers) > 0)
-
-        document_list = barcodes.separate_pages(test_file, separator_page_numbers)
-        self.assertTrue(document_list)
-        for document in document_list:
-            barcodes.save_to_dir(document, target_dir=tempdir)
-
-        target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
-        target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
-
-        self.assertTrue(os.path.isfile(target_file1))
-        self.assertTrue(os.path.isfile(target_file2))
-
      @override_settings(CONSUMER_ENABLE_BARCODES=True)
      def test_consume_barcode_file(self):
          test_file = os.path.join(
@@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
          with mock.patch("documents.tasks.async_to_sync"):
              self.assertEqual(tasks.consume_file(dst), "File successfully split")
  
-    def test_scan_file_for_separating_barcodes_password_pikepdf(self):
+    def test_scan_file_for_separating_barcodes_password(self):
          """
          GIVEN:
              - Password protected PDF
@@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase):
  
          self.assertEqual(pdf_file, test_file)
          self.assertListEqual(separator_page_numbers, [])
-
-    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
-    def test_scan_file_for_separating_barcodes_password_pdf2image(self):
-        """
-        GIVEN:
-            - Password protected PDF
-            - pdf2image based scanning
-        WHEN:
-            - File is scanned for barcode
-        THEN:
-            - Scanning handle the exception without exception
-        """
-        test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
-        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
-            test_file,
-        )
-
-        self.assertEqual(pdf_file, test_file)
-        self.assertListEqual(separator_page_numbers, [])
diff --git a/src/paperless/settings.py b/src/paperless/settings.py

index 5547d6c335b1cb14b406be82316c123e019a819f..456e15745562eb9563fcb3831c301373143331fb 100644 (file)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
      "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
  )
  
-CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
-    "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
-    "NO",
-)
-
  CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
      "PAPERLESS_CONSUMER_BARCODE_STRING",
      "PATCHT",
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Wed, 9 Nov 2022 16:50:34 +0000 (08:50 -0800)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Wed, 9 Nov 2022 21:01:39 +0000 (13:01 -0800)
docs/configuration.rst		patch \| blob \| blame \| history
src/documents/barcodes.py		patch \| blob \| blame \| history
src/documents/tests/test_barcodes.py		patch \| blob \| blame \| history
src/paperless/settings.py		patch \| blob \| blame \| history