git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Rescales images from PDFs so zbar can better find them
author Trenton H <797416+stumpylog@users.noreply.github.com>
Wed, 18 Jan 2023 14:56:51 +0000 (06:56 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 24 Jan 2023 18:30:53 +0000 (10:30 -0800)
src/documents/barcodes.py
src/documents/tests/samples/barcodes/many-qr-codes.pdf [new file with mode: 0644]
src/documents/tests/test_barcodes.py

diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py
index 46a96061b44d0a61334a25d15f132b3b73da9eb2..597f228f36a55d69a50c7bf78a83c8d4d3aa816a 100644 (file)
@@ -4,6 +4,7 @@ import shutil
 import tempfile
 from dataclasses import dataclass
 from functools import lru_cache
+from math import ceil
 from pathlib import Path
 from typing import List
 from typing import Optional
@@ -172,6 +173,24 @@ def scan_file_for_barcodes(
                     # raise an exception, triggering fallback
                     pillow_img = pdfimage.as_pil_image()
 
+                    # Scale the image down
+                    # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
+                    # TLDR: zbar has issues with larger images
+                    width, height = pillow_img.size
+                    if width > 512:
+                        scaler = ceil(width / 512)
+                        new_width = int(width / scaler)
+                        new_height = int(height / scaler)
+                        pillow_img = pillow_img.resize((new_width, new_height))
+
+                    width, height = pillow_img.size
+
+                    if height > 1024:
+                        scaler = ceil(height / 1024)
+                        new_width = int(width / scaler)
+                        new_height = int(height / scaler)
+                        pillow_img = pillow_img.resize((new_width, new_height))
+
                     for barcode_value in barcode_reader(pillow_img):
                         detected_barcodes.append(Barcode(page_num, barcode_value))
 
@@ -234,12 +253,12 @@ def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
     """
     Search the parsed barcodes for separators
     and returns a list of page numbers, which
-    separate the file into new files
+    separate the file into new files.
     """
     # filter all barcodes for the separator string
     # get the page numbers of the separating barcodes
 
-    return [bc.page for bc in barcodes if bc.is_separator]
+    return list({bc.page for bc in barcodes if bc.is_separator})
 
 
 def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
@@ -266,7 +285,7 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
         try:
             asn = int(asn_text)
         except ValueError as e:
-            logger.warn(f"Failed to parse ASN number because: {e}")
+            logger.warning(f"Failed to parse ASN number because: {e}")
 
     return asn
 
diff --git a/src/documents/tests/samples/barcodes/many-qr-codes.pdf b/src/documents/tests/samples/barcodes/many-qr-codes.pdf
new file mode 100644 (file)
index 0000000..f5d3f4a
Binary files /dev/null and b/src/documents/tests/samples/barcodes/many-qr-codes.pdf differ
diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py
index b2d0824edad1ced74656e3805617a56dfcca0307..1dc2a88bc9a0cbf693723eb74fac8e2cc063ad47 100644 (file)
@@ -447,6 +447,31 @@ class TestBarcode(DirectoriesMixin, TestCase):
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [])
 
+    @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
+    def test_scan_file_for_separating_qr_barcodes(self):
+        """
+        GIVEN:
+            - Input PDF with certain QR codes that aren't detected at current size
+        WHEN:
+            - The input file is scanned for barcodes
+        THEN:
+            - QR codes are detected
+        """
+        test_file = os.path.join(
+            self.BARCODE_SAMPLE_DIR,
+            "many-qr-codes.pdf",
+        )
+
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
+            test_file,
+        )
+        separator_page_numbers = barcodes.get_separating_barcodes(
+            doc_barcode_info.barcodes,
+        )
+
+        self.assertGreater(len(doc_barcode_info.barcodes), 0)
+        self.assertListEqual(separator_page_numbers, [1])
+
     def test_separate_pages(self):
         test_file = os.path.join(
             self.BARCODE_SAMPLE_DIR,