import os
import shutil
import tempfile
+from dataclasses import dataclass
from functools import lru_cache
+from pathlib import Path
from typing import List
from typing import Optional
-from typing import Tuple
import magic
from django.conf import settings
pass
+@dataclass(frozen=True)
+class Barcode:
+ """
+ Holds the information about a single barcode and its location
+ """
+
+ page: int
+ value: str
+
+ @property
+ def is_separator(self) -> bool:
+ """
+ Returns True if the barcode value equals the configured separation value,
+ False otherwise
+ """
+ return self.value == settings.CONSUMER_BARCODE_STRING
+
+ @property
+ def is_asn(self) -> bool:
+ """
+ Returns True if the barcode value matches the configured ASN prefix,
+ False otherwise
+ """
+ return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)
+
+
+@dataclass
+class DocumentBarcodeInfo:
+ """
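+ Holds the path of the PDF that was scanned together with the list of
+ barcodes detected in it
+
+ NOTE(review): pdf_path is annotated as Path, but scan_file_for_barcodes
+ appears to pass a str (or None) here - confirm and align the annotation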
+ """
+
+ pdf_path: Path
+ barcodes: List[Barcode]
+
+
@lru_cache(maxsize=8)
def supported_file_type(mime_type) -> bool:
"""
def scan_file_for_barcodes(
filepath: str,
-) -> Tuple[Optional[str], List[Tuple[int, str]]]:
+) -> DocumentBarcodeInfo:
"""
Scan the provided pdf file for any barcodes
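- Returns a PDF filepath and a list of
- (page_number, barcode_text) tuples
+ Returns a DocumentBarcodeInfo holding the PDF filepath and the
+ list of Barcode objects (page number and value) that were detected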
"""
- def _pikepdf_barcode_scan(pdf_filepath: str):
+ def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = []
with Pdf.open(pdf_filepath) as pdf:
for page_num, page in enumerate(pdf.pages):
# raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image()
- barcodes_on_page = barcode_reader(pillow_img)
- detected_barcodes.extend(
- [(page_num, text) for text in barcodes_on_page],
- )
+ for barcode_value in barcode_reader(pillow_img):
+ detected_barcodes.append(Barcode(page_num, barcode_value))
+
return detected_barcodes
- def _pdf2image_barcode_scan(pdf_filepath: str):
+ def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = []
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
- barcodes_on_page = barcode_reader(page)
- detected_barcodes.extend(
- [(current_page_number, text) for text in barcodes_on_page],
- )
+ for barcode_value in barcode_reader(page):
+ detected_barcodes.append(
+ Barcode(current_page_number, barcode_value),
+ )
return detected_barcodes
pdf_filepath = None
f"Unsupported file format for barcode reader: {str(mime_type)}",
)
- return pdf_filepath, barcodes
+ return DocumentBarcodeInfo(pdf_filepath, barcodes)
-def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]:
+def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
"""
Search the parsed barcodes for separators
- and returns a list of pagenumbers, which
+ and returns a list of page numbers, which
separate the file into new files
"""
# filter all barcodes for the separator string
- separator_barcodes = list(
- filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes),
- )
# get the page numbers of the separating barcodes
- separator_page_numbers = [page for page, _ in separator_barcodes]
- return separator_page_numbers
+ return [bc.page for bc in barcodes if bc.is_separator]
-def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]:
+def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
"""
asn = None
- # only the barcode text is important here -> discard the page number
- barcodes = [text for _, text in barcodes]
# get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
asn_text = next(
- (x for x in barcodes if x.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)),
+ (x.value for x in barcodes if x.is_asn),
None,
)
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_barcodes_none_present(self):
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_scan_file_for_separating_barcodes3(self):
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_barcodes4(self):
self.BARCODE_SAMPLE_DIR,
"several-patcht-codes.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [2, 5])
def test_scan_file_for_separating_barcodes_upsidedown(self):
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle_reverse.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_barcodes_pillow_transcode_error(self):
with mock.patch("documents.barcodes.barcode_reader") as reader:
reader.return_value = list()
- _, _ = barcodes.scan_file_for_barcodes(
+ _ = barcodes.scan_file_for_barcodes(
str(device_n_pdf.name),
)
self.BARCODE_SAMPLE_DIR,
"barcode-fax-image.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_qr_barcodes(self):
self.BARCODE_SAMPLE_DIR,
"patch-code-t-qr.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_wrong_qr_barcodes(self):
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_separate_pages(self):
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(test_file, pdf_file)
+ self.assertEqual(test_file, doc_barcode_info.pdf_path)
self.assertTrue(len(separator_page_numbers) > 0)
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
- Scanning handle the exception without exception
"""
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
separator_page_numbers = barcodes.get_separating_barcodes(
- parsed_barcodes,
+ doc_barcode_info.barcodes,
)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_scan_file_for_asn_barcode(self):
self.BARCODE_SAMPLE_DIR,
"barcode-39-asn-123.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
- asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+ asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertEqual(asn, 123)
def test_scan_file_for_asn_not_existing(self):
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
- asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+ asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertEqual(asn, None)
def test_scan_file_for_asn_barcode_invalid(self):
self.BARCODE_SAMPLE_DIR,
"barcode-39-asn-invalid.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
- asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+ asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertEqual(asn, None)
@override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
self.BARCODE_SAMPLE_DIR,
"barcode-39-asn-custom-prefix.pdf",
)
- pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes(
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
- asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+ asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
- self.assertEqual(pdf_file, test_file)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertEqual(asn, 123)