from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
+from typing import Dict
from typing import List
from typing import Optional
return DocumentBarcodeInfo(pdf_filepath, barcodes)
-def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
+def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]:
"""
Search the parsed barcodes for separators
- and returns a list of page numbers, which
- separate the file into new files.
+ and return a dict mapping each page number that
+ separates the file into new files to a flag telling
+ whether that page is kept (True) or dropped (False).
+
+ Pages carrying a separator barcode map to False. When
+ settings.CONSUMER_ENABLE_ASN_BARCODE is set, pages carrying
+ an ASN barcode (other than page 0) are added with True; a page
+ carrying both kinds of barcode therefore ends up as True.
"""
# filter all barcodes for the separator string
# get the page numbers of the separating barcodes
+ separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
+ if not settings.CONSUMER_ENABLE_ASN_BARCODE:
+ return separator_pages
- return list({bc.page for bc in barcodes if bc.is_separator})
+ # add the page numbers of the ASN barcodes; these pages are kept
+ # (True) since the ASN page is part of the document it starts.
+ # The first page (page 0) is never a split point, as splitting
+ # there might lead to infinite loops.
+ # The ASN entries are unpacked last, so they win on a key clash.
+ return {
+ **separator_pages,
+ **{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0},
+ }
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
return asn
-def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
+def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]:
"""
Separate the provided pdf file on the pages_to_split_on.
- The pages which are defined by page_numbers will be removed.
+ The pages which are defined by the keys in pages_to_split_on
+ will be removed if the corresponding value is false.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
fname = os.path.splitext(os.path.basename(filepath))[0]
pdf = Pdf.open(filepath)
+ # Start with an empty document
+ current_document: List[Page] = []
# A list of documents, ie a list of lists of pages
- documents: List[List[Page]] = []
- # A single document, ie a list of pages
- document: List[Page] = []
+ documents: List[List[Page]] = [current_document]
for idx, page in enumerate(pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
- document.append(page)
- # Make sure to append the very last document to the documents
- if idx == (len(pdf.pages) - 1):
- documents.append(document)
- document = []
- else:
- # This is a split index, save the current PDF pages, and restart
- # a new destination page listing
- logger.debug(f"Starting new document at idx {idx}")
- documents.append(document)
- document = []
+ current_document.append(page)
+ continue
+
+ # This is a split index
+ # Start a new destination page listing
+ logger.debug(f"Starting new document at idx {idx}")
+ current_document = []
+ documents.append(current_document)
+ keep_page = pages_to_split_on[idx]
+ if keep_page:
+ # Keep the page
+ # (new document is started by asn barcode)
+ current_document.append(page)
documents = [x for x in documents if len(x)]
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
+ self.assertDictEqual(separator_page_numbers, {0: False})
def test_scan_file_for_separating_barcodes_none_present(self):
"""
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [])
+ self.assertDictEqual(separator_page_numbers, {})
def test_scan_file_for_separating_barcodes_middle_page(self):
"""
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [1])
+ self.assertDictEqual(separator_page_numbers, {1: False})
def test_scan_file_for_separating_barcodes_multiple_pages(self):
"""
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [2, 5])
+ self.assertDictEqual(separator_page_numbers, {2: False, 5: False})
def test_scan_file_for_separating_barcodes_upside_down(self):
"""
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [1])
+ self.assertDictEqual(separator_page_numbers, {1: False})
def test_scan_file_for_separating_barcodes_fax_decode(self):
"""
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [1])
+ self.assertDictEqual(separator_page_numbers, {1: False})
def test_scan_file_for_separating_qr_barcodes(self):
"""
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
+ self.assertDictEqual(separator_page_numbers, {0: False})
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_barcodes(self):
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
+ self.assertDictEqual(separator_page_numbers, {0: False})
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_qr_barcodes(self):
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
+ self.assertDictEqual(separator_page_numbers, {0: False})
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_128_barcodes(self):
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
+ self.assertDictEqual(separator_page_numbers, {0: False})
def test_scan_file_for_separating_wrong_qr_barcodes(self):
"""
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [])
+ self.assertDictEqual(separator_page_numbers, {})
@override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
def test_scan_file_for_separating_qr_barcodes(self):
)
self.assertGreater(len(doc_barcode_info.barcodes), 0)
- self.assertListEqual(separator_page_numbers, [1])
+ self.assertDictEqual(separator_page_numbers, {1: False})
def test_separate_pages(self):
"""
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
- documents = barcodes.separate_pages(test_file, [1])
+ documents = barcodes.separate_pages(test_file, {1: False})
self.assertEqual(len(documents), 2)
self.BARCODE_SAMPLE_DIR,
"patch-code-t-double.pdf",
)
- pages = barcodes.separate_pages(test_file, [1, 2])
+ pages = barcodes.separate_pages(test_file, {1: False, 2: False})
self.assertEqual(len(pages), 2)
"patch-code-t-middle.pdf",
)
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
- pages = barcodes.separate_pages(test_file, [])
+ pages = barcodes.separate_pages(test_file, {})
self.assertEqual(pages, [])
self.assertEqual(
cm.output,
)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [])
+ self.assertDictEqual(separator_page_numbers, {})
+
+ @override_settings(
+ CONSUMER_ENABLE_BARCODES=True,
+ CONSUMER_ENABLE_ASN_BARCODE=True,
+ )
+ def test_separate_pages_by_asn_barcodes_and_patcht(self):
+ """
+ GIVEN:
+ - Input PDF with a patch code on page 3 and ASN barcodes on pages 1,5,6,9,11
+ WHEN:
+ - Input file is split on barcodes
+ THEN:
+ - Correct number of files produced, split correctly by correct pages
+ """
+ test_file = os.path.join(
+ os.path.dirname(__file__),
+ self.BARCODE_SAMPLE_DIR,
+ "split-by-asn-2.pdf",
+ )
+
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(test_file, doc_barcode_info.pdf_path)
+ # Keys are 0-based page indices, so they are one less than the
+ # page numbers in the docstring. The patch code page (index 2) is
+ # dropped (False), the ASN pages are kept (True). The ASN barcode
+ # on the first page (index 0) is not reported as a split point.
+ self.assertDictEqual(
+ separator_page_numbers,
+ {
+ 2: False,
+ 4: True,
+ 5: True,
+ 8: True,
+ 10: True,
+ },
+ )
+
+ # Expect the pages before the first split point plus one document
+ # for each of the 5 split points.
+ document_list = barcodes.separate_pages(test_file, separator_page_numbers)
+ self.assertEqual(len(document_list), 6)
+
+ @override_settings(
+ CONSUMER_ENABLE_BARCODES=True,
+ CONSUMER_ENABLE_ASN_BARCODE=True,
+ )
+ def test_separate_pages_by_asn_barcodes(self):
+ """
+ GIVEN:
+ - Input PDF with ASN barcodes on pages 1,3,4,7,9
+ WHEN:
+ - Input file is split on barcodes
+ THEN:
+ - Correct number of files produced, split correctly by correct pages
+ """
+ test_file = os.path.join(
+ os.path.dirname(__file__),
+ self.BARCODE_SAMPLE_DIR,
+ "split-by-asn-1.pdf",
+ )
+
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(test_file, doc_barcode_info.pdf_path)
+ # Keys are 0-based page indices, so they are one less than the
+ # page numbers in the docstring. All ASN pages are kept (True).
+ # The ASN barcode on the first page (index 0) is not reported
+ # as a split point.
+ self.assertDictEqual(
+ separator_page_numbers,
+ {
+ 2: True,
+ 3: True,
+ 6: True,
+ 8: True,
+ },
+ )
+
+ # Expect the pages before the first split point plus one document
+ # for each of the 4 split points.
+ document_list = barcodes.separate_pages(test_file, separator_page_numbers)
+ self.assertEqual(len(document_list), 5)
class TestAsnBarcodes(DirectoriesMixin, TestCase):