git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Use dataclasses to group data about barcodes in documents
author Trenton H <797416+stumpylog@users.noreply.github.com>
Wed, 18 Jan 2023 17:11:55 +0000 (09:11 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 24 Jan 2023 17:43:52 +0000 (09:43 -0800)
src/documents/barcodes.py
src/documents/tasks.py
src/documents/tests/test_barcodes.py

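diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py
index e0664f90652417c9f698afbeaa35631280c59181..46a96061b44d0a61334a25d15f132b3b73da9eb2 100644
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py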
@@ -2,10 +2,11 @@ import logging
 import os
 import shutil
 import tempfile
+from dataclasses import dataclass
 from functools import lru_cache
+from pathlib import Path
 from typing import List
 from typing import Optional
-from typing import Tuple
 
 import magic
 from django.conf import settings
@@ -25,6 +26,42 @@ class BarcodeImageFormatError(Exception):
     pass
 
 
+@dataclass(frozen=True)
+class Barcode:
+    """
+    Holds the information about a single barcode and its location
+    """
+
+    page: int
+    value: str
+
+    @property
+    def is_separator(self) -> bool:
+        """
+        Returns True if the barcode value equals the configured separation value,
+        False otherwise
+        """
+        return self.value == settings.CONSUMER_BARCODE_STRING
+
+    @property
+    def is_asn(self) -> bool:
+        """
+        Returns True if the barcode value matches the configured ASN prefix,
+        False otherwise
+        """
+        return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)
+
+
+@dataclass
+class DocumentBarcodeInfo:
+    """
+    Describes a single document's barcode status
+    """
+
+    pdf_path: Path
+    barcodes: List[Barcode]
+
+
 @lru_cache(maxsize=8)
 def supported_file_type(mime_type) -> bool:
     """
@@ -109,14 +146,14 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
 
 def scan_file_for_barcodes(
     filepath: str,
-) -> Tuple[Optional[str], List[Tuple[int, str]]]:
+) -> DocumentBarcodeInfo:
     """
     Scan the provided pdf file for any barcodes
     Returns a PDF filepath and a list of
     (page_number, barcode_text) tuples
     """
 
-    def _pikepdf_barcode_scan(pdf_filepath: str):
+    def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
         detected_barcodes = []
         with Pdf.open(pdf_filepath) as pdf:
             for page_num, page in enumerate(pdf.pages):
@@ -135,22 +172,21 @@ def scan_file_for_barcodes(
                     # raise an exception, triggering fallback
                     pillow_img = pdfimage.as_pil_image()
 
-                    barcodes_on_page = barcode_reader(pillow_img)
-                    detected_barcodes.extend(
-                        [(page_num, text) for text in barcodes_on_page],
-                    )
+                    for barcode_value in barcode_reader(pillow_img):
+                        detected_barcodes.append(Barcode(page_num, barcode_value))
+
         return detected_barcodes
 
-    def _pdf2image_barcode_scan(pdf_filepath: str):
+    def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
         detected_barcodes = []
         # use a temporary directory in case the file is too big to handle in memory
         with tempfile.TemporaryDirectory() as path:
             pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
             for current_page_number, page in enumerate(pages_from_path):
-                barcodes_on_page = barcode_reader(page)
-                detected_barcodes.extend(
-                    [(current_page_number, text) for text in barcodes_on_page],
-                )
+                for barcode_value in barcode_reader(page):
+                    detected_barcodes.append(
+                        Barcode(current_page_number, barcode_value),
+                    )
         return detected_barcodes
 
     pdf_filepath = None
@@ -191,26 +227,22 @@ def scan_file_for_barcodes(
             f"Unsupported file format for barcode reader: {str(mime_type)}",
         )
 
-    return pdf_filepath, barcodes
+    return DocumentBarcodeInfo(pdf_filepath, barcodes)
 
 
-def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]:
+def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
     """
     Search the parsed barcodes for separators
-    and returns a list of pagenumbers, which
+    and returns a list of page numbers, which
     separate the file into new files
     """
     # filter all barcodes for the separator string
-    separator_barcodes = list(
-        filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes),
-    )
     # get the page numbers of the separating barcodes
-    separator_page_numbers = [page for page, _ in separator_barcodes]
 
-    return separator_page_numbers
+    return [bc.page for bc in barcodes if bc.is_separator]
 
 
-def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]:
+def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
     """
     Search the parsed barcodes for any ASNs.
     The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
@@ -219,11 +251,9 @@ def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]:
     """
     asn = None
 
-    # only the barcode text is important here -> discard the page number
-    barcodes = [text for _, text in barcodes]
     # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
     asn_text = next(
-        (x for x in barcodes if x.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)),
+        (x.value for x in barcodes if x.is_asn),
         None,
     )
 
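diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 7f4c8e125dd9fa20456df6fdb3f0916330f2a8ff..77b48e33df92afea33ec0411794082674ea47534 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py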
@@ -112,17 +112,20 @@ def consume_file(
 
     # read all barcodes in the current document
     if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
-        pdf_filepath, parsed_barcodes = barcodes.scan_file_for_barcodes(path)
+        doc_barcode_info = barcodes.scan_file_for_barcodes(path)
 
         # split document by separator pages, if enabled
         if settings.CONSUMER_ENABLE_BARCODES:
-            separators = barcodes.get_separating_barcodes(parsed_barcodes)
+            separators = barcodes.get_separating_barcodes(doc_barcode_info.barcodes)
 
             if len(separators) > 0:
                 logger.debug(
                     f"Pages with separators found in: {str(path)}",
                 )
-                document_list = barcodes.separate_pages(pdf_filepath, separators)
+                document_list = barcodes.separate_pages(
+                    doc_barcode_info.pdf_path,
+                    separators,
+                )
 
                 if document_list:
                     for n, document in enumerate(document_list):
@@ -151,10 +154,10 @@ def consume_file(
                         )
 
                     # Delete the PDF file which was split
-                    os.remove(pdf_filepath)
+                    os.remove(doc_barcode_info.pdf_path)
 
                     # If the original was a TIFF, remove the original file as well
-                    if str(pdf_filepath) != str(path):
+                    if str(doc_barcode_info.pdf_path) != str(path):
                         logger.debug(f"Deleting file {path}")
                         os.unlink(path)
 
@@ -181,7 +184,7 @@ def consume_file(
 
         # try reading the ASN from barcode
         if settings.CONSUMER_ENABLE_ASN_BARCODE:
-            asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+            asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
             if asn:
                 logger.info(f"Found ASN in barcode: {asn}")
 
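diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py
index 15e7efd94fff14f8cd370e3075538f9d95726f1c..dba4afc99a7d985c73a28519fc79839d84a08baa 100644
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py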
@@ -191,26 +191,26 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "patch-code-t.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [0])
 
     def test_scan_file_for_separating_barcodes_none_present(self):
         test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [])
 
     def test_scan_file_for_separating_barcodes3(self):
@@ -218,14 +218,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "patch-code-t-middle.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [1])
 
     def test_scan_file_for_separating_barcodes4(self):
@@ -233,14 +233,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "several-patcht-codes.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [2, 5])
 
     def test_scan_file_for_separating_barcodes_upsidedown(self):
@@ -248,14 +248,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "patch-code-t-middle_reverse.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [1])
 
     def test_scan_file_for_barcodes_pillow_transcode_error(self):
@@ -312,7 +312,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
             with mock.patch("documents.barcodes.barcode_reader") as reader:
                 reader.return_value = list()
 
-                _, _ = barcodes.scan_file_for_barcodes(
+                _ = barcodes.scan_file_for_barcodes(
                     str(device_n_pdf.name),
                 )
 
@@ -331,14 +331,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "barcode-fax-image.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [1])
 
     def test_scan_file_for_separating_qr_barcodes(self):
@@ -346,14 +346,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "patch-code-t-qr.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [0])
 
     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
@@ -362,14 +362,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "barcode-39-custom.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [0])
 
     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
@@ -378,14 +378,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "barcode-qr-custom.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [0])
 
     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
@@ -394,14 +394,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "barcode-128-custom.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [0])
 
     def test_scan_file_for_separating_wrong_qr_barcodes(self):
@@ -409,14 +409,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "barcode-39-custom.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [])
 
     def test_separate_pages(self):
@@ -507,14 +507,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
         tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
 
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(test_file, pdf_file)
+        self.assertEqual(test_file, doc_barcode_info.pdf_path)
         self.assertTrue(len(separator_page_numbers) > 0)
 
         document_list = barcodes.separate_pages(test_file, separator_page_numbers)
@@ -622,14 +622,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
             - Scanning handle the exception without exception
         """
         test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
         separator_page_numbers = barcodes.get_separating_barcodes(
-            parsed_barcodes,
+            doc_barcode_info.barcodes,
         )
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [])
 
     def test_scan_file_for_asn_barcode(self):
@@ -637,12 +637,12 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "barcode-39-asn-123.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
-        asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+        asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertEqual(asn, 123)
 
     def test_scan_file_for_asn_not_existing(self):
@@ -650,12 +650,12 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "patch-code-t.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
-        asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+        asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertEqual(asn, None)
 
     def test_scan_file_for_asn_barcode_invalid(self):
@@ -663,13 +663,13 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "barcode-39-asn-invalid.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
 
-        asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+        asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertEqual(asn, None)
 
     @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
@@ -678,10 +678,10 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "barcode-39-asn-custom-prefix.pdf",
         )
-        pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
             test_file,
         )
-        asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+        asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
 
-        self.assertEqual(pdf_file, test_file)
+        self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertEqual(asn, 123)