git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Feature: split documents on ASN barcode (#2554)
author Fabian Ohler <muued@users.noreply.github.com>
Wed, 1 Feb 2023 09:13:30 +0000 (10:13 +0100)
committer GitHub <noreply@github.com>
Wed, 1 Feb 2023 09:13:30 +0000 (01:13 -0800)
* also split documents when an ASN barcode is found

* linter

* fix test case parameters

* avoid pre-python-3.9 features

* simplify dict-creation in tests

* simplify dict-creation in tests for empty dicts

* Add test cases for the splitting by ASN barcode feature

* deleted supporting files for test case construction

src/documents/barcodes.py
src/documents/tests/samples/barcodes/split-by-asn-1.pdf [new file with mode: 0644]
src/documents/tests/samples/barcodes/split-by-asn-2.pdf [new file with mode: 0644]
src/documents/tests/test_barcodes.py

index 6e3ecfe058fa95941556e16788069ea091da5558..9adb8aeeaf4c4e78d922852040cf22d7af471367 100644 (file)
@@ -5,6 +5,7 @@ import tempfile
 from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
+from typing import Dict
 from typing import List
 from typing import Optional
 
@@ -201,16 +202,25 @@ def scan_file_for_barcodes(
     return DocumentBarcodeInfo(pdf_filepath, barcodes)
 
 
-def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
+def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]:
     """
     Search the parsed barcodes for separators
-    and returns a list of page numbers, which
-    separate the file into new files.
+    and returns a dict of page numbers, which
+    separate the file into new files, together
+    with the information whether to keep the page.
     """
     # filter all barcodes for the separator string
     # get the page numbers of the separating barcodes
+    separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
+    if not settings.CONSUMER_ENABLE_ASN_BARCODE:
+        return separator_pages
 
-    return list({bc.page for bc in barcodes if bc.is_separator})
+    # add the page numbers of the ASN barcodes
+    # (except for first page, that might lead to infinite loops).
+    return {
+        **separator_pages,
+        **{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0},
+    }
 
 
 def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
@@ -242,10 +252,11 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
     return asn
 
 
-def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
+def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]:
     """
     Separate the provided pdf file on the pages_to_split_on.
-    The pages which are defined by page_numbers will be removed.
+    The pages which are defined by the keys in page_numbers
+    will be removed if the corresponding value is false.
     Returns a list of (temporary) filepaths to consume.
     These will need to be deleted later.
     """
@@ -261,26 +272,28 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
     fname = os.path.splitext(os.path.basename(filepath))[0]
     pdf = Pdf.open(filepath)
 
+    # Start with an empty document
+    current_document: List[Page] = []
     # A list of documents, ie a list of lists of pages
-    documents: List[List[Page]] = []
-    # A single document, ie a list of pages
-    document: List[Page] = []
+    documents: List[List[Page]] = [current_document]
 
     for idx, page in enumerate(pdf.pages):
         # Keep building the new PDF as long as it is not a
         # separator index
         if idx not in pages_to_split_on:
-            document.append(page)
-            # Make sure to append the very last document to the documents
-            if idx == (len(pdf.pages) - 1):
-                documents.append(document)
-                document = []
-        else:
-            # This is a split index, save the current PDF pages, and restart
-            # a new destination page listing
-            logger.debug(f"Starting new document at idx {idx}")
-            documents.append(document)
-            document = []
+            current_document.append(page)
+            continue
+
+        # This is a split index
+        # Start a new destination page listing
+        logger.debug(f"Starting new document at idx {idx}")
+        current_document = []
+        documents.append(current_document)
+        keep_page = pages_to_split_on[idx]
+        if keep_page:
+            # Keep the page
+            # (new document is started by asn barcode)
+            current_document.append(page)
 
     documents = [x for x in documents if len(x)]
 
diff --git a/src/documents/tests/samples/barcodes/split-by-asn-1.pdf b/src/documents/tests/samples/barcodes/split-by-asn-1.pdf
new file mode 100644 (file)
index 0000000..82374b9
Binary files /dev/null and b/src/documents/tests/samples/barcodes/split-by-asn-1.pdf differ
diff --git a/src/documents/tests/samples/barcodes/split-by-asn-2.pdf b/src/documents/tests/samples/barcodes/split-by-asn-2.pdf
new file mode 100644 (file)
index 0000000..05cc16a
Binary files /dev/null and b/src/documents/tests/samples/barcodes/split-by-asn-2.pdf differ
index 4f7f1278ac5031b26e58a63224912e0eb52d8ff3..86c53755bc08b79a6e016b270f1d64703c170392 100644 (file)
@@ -294,7 +294,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [0])
+        self.assertDictEqual(separator_page_numbers, {0: False})
 
     def test_scan_file_for_separating_barcodes_none_present(self):
         """
@@ -314,7 +314,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [])
+        self.assertDictEqual(separator_page_numbers, {})
 
     def test_scan_file_for_separating_barcodes_middle_page(self):
         """
@@ -337,7 +337,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [1])
+        self.assertDictEqual(separator_page_numbers, {1: False})
 
     def test_scan_file_for_separating_barcodes_multiple_pages(self):
         """
@@ -360,7 +360,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [2, 5])
+        self.assertDictEqual(separator_page_numbers, {2: False, 5: False})
 
     def test_scan_file_for_separating_barcodes_upside_down(self):
         """
@@ -384,7 +384,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [1])
+        self.assertDictEqual(separator_page_numbers, {1: False})
 
     def test_scan_file_for_separating_barcodes_fax_decode(self):
         """
@@ -407,7 +407,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [1])
+        self.assertDictEqual(separator_page_numbers, {1: False})
 
     def test_scan_file_for_separating_qr_barcodes(self):
         """
@@ -431,7 +431,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [0])
+        self.assertDictEqual(separator_page_numbers, {0: False})
 
     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
     def test_scan_file_for_separating_custom_barcodes(self):
@@ -456,7 +456,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [0])
+        self.assertDictEqual(separator_page_numbers, {0: False})
 
     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
     def test_scan_file_for_separating_custom_qr_barcodes(self):
@@ -482,7 +482,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [0])
+        self.assertDictEqual(separator_page_numbers, {0: False})
 
     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
     def test_scan_file_for_separating_custom_128_barcodes(self):
@@ -508,7 +508,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [0])
+        self.assertDictEqual(separator_page_numbers, {0: False})
 
     def test_scan_file_for_separating_wrong_qr_barcodes(self):
         """
@@ -533,7 +533,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [])
+        self.assertDictEqual(separator_page_numbers, {})
 
     @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
     def test_scan_file_for_separating_qr_barcodes(self):
@@ -558,7 +558,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertGreater(len(doc_barcode_info.barcodes), 0)
-        self.assertListEqual(separator_page_numbers, [1])
+        self.assertDictEqual(separator_page_numbers, {1: False})
 
     def test_separate_pages(self):
         """
@@ -573,7 +573,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "patch-code-t-middle.pdf",
         )
-        documents = barcodes.separate_pages(test_file, [1])
+        documents = barcodes.separate_pages(test_file, {1: False})
 
         self.assertEqual(len(documents), 2)
 
@@ -591,7 +591,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
             self.BARCODE_SAMPLE_DIR,
             "patch-code-t-double.pdf",
         )
-        pages = barcodes.separate_pages(test_file, [1, 2])
+        pages = barcodes.separate_pages(test_file, {1: False, 2: False})
 
         self.assertEqual(len(pages), 2)
 
@@ -610,7 +610,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
             "patch-code-t-middle.pdf",
         )
         with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
-            pages = barcodes.separate_pages(test_file, [])
+            pages = barcodes.separate_pages(test_file, {})
             self.assertEqual(pages, [])
             self.assertEqual(
                 cm.output,
@@ -858,7 +858,88 @@ class TestBarcode(DirectoriesMixin, TestCase):
         )
 
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
-        self.assertListEqual(separator_page_numbers, [])
+        self.assertDictEqual(separator_page_numbers, {})
+
+    @override_settings(
+        CONSUMER_ENABLE_BARCODES=True,
+        CONSUMER_ENABLE_ASN_BARCODE=True,
+    )
+    def test_separate_pages_by_asn_barcodes_and_patcht(self):
+        """
+        GIVEN:
+            - Input PDF with a patch code on page 3 and ASN barcodes on pages 1,5,6,9,11
+        WHEN:
+            - Input file is split on barcodes
+        THEN:
+            - Correct number of files produced, split correctly by correct pages
+        """
+        test_file = os.path.join(
+            os.path.dirname(__file__),
+            self.BARCODE_SAMPLE_DIR,
+            "split-by-asn-2.pdf",
+        )
+
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
+            test_file,
+        )
+        separator_page_numbers = barcodes.get_separating_barcodes(
+            doc_barcode_info.barcodes,
+        )
+
+        self.assertEqual(test_file, doc_barcode_info.pdf_path)
+        self.assertDictEqual(
+            separator_page_numbers,
+            {
+                2: False,
+                4: True,
+                5: True,
+                8: True,
+                10: True,
+            },
+        )
+
+        document_list = barcodes.separate_pages(test_file, separator_page_numbers)
+        self.assertEqual(len(document_list), 6)
+
+    @override_settings(
+        CONSUMER_ENABLE_BARCODES=True,
+        CONSUMER_ENABLE_ASN_BARCODE=True,
+    )
+    def test_separate_pages_by_asn_barcodes(self):
+        """
+        GIVEN:
+            - Input PDF with ASN barcodes on pages 1,3,4,7,9
+        WHEN:
+            - Input file is split on barcodes
+        THEN:
+            - Correct number of files produced, split correctly by correct pages
+        """
+        test_file = os.path.join(
+            os.path.dirname(__file__),
+            self.BARCODE_SAMPLE_DIR,
+            "split-by-asn-1.pdf",
+        )
+
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
+            test_file,
+        )
+        separator_page_numbers = barcodes.get_separating_barcodes(
+            doc_barcode_info.barcodes,
+        )
+
+        self.assertEqual(test_file, doc_barcode_info.pdf_path)
+        self.assertDictEqual(
+            separator_page_numbers,
+            {
+                2: True,
+                3: True,
+                6: True,
+                8: True,
+            },
+        )
+
+        document_list = barcodes.separate_pages(test_file, separator_page_numbers)
+        self.assertEqual(len(document_list), 5)
 
 
 class TestAsnBarcodes(DirectoriesMixin, TestCase):