Defaults to false.
-PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
- Enables the legacy method of detecting barcodes. By default, images are
- extracted directly from the PDF structure for barcode detection. If this
- configuration value is set, images of the whole PDF page will be used instead.
-
- This is a slower and more memory intensive process, but may be required for
- certain files, depending on how it is produced and how images are encoded.
-
- Defaults to false.
-
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
Whether TIFF image files should be scanned for barcodes.
import magic
from django.conf import settings
from pdf2image import convert_from_path
-from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from pikepdf import PdfImage
-from pikepdf.models.image import HifiPrintImageNotTranscodableError
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
images[0].save(newpath)
else:
images[0].save(newpath, save_all=True, append_images=images[1:])
- except OSError as e:
+ except OSError as e: # pragma: no cover
logger.warning(
f"Could not save the file as pdf. Error: {str(e)}",
)
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
+ # This type is known to have issues:
+ # https://github.com/pikepdf/pikepdf/issues/401
if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images",
)
# Not all images can be transcoded to a PIL image, which
- # is what pyzbar expects to receive
+ # is what pyzbar expects to receive, so this may
+ # raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image()
detected_barcodes = barcode_reader(pillow_img)
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
- # Chose the scanner
- if settings.CONSUMER_USE_LEGACY_DETECTION:
- logger.debug("Using pdf2image for barcodes")
- scanner_function = _pdf2image_barcode_scan
- else:
- logger.debug("Using pikepdf for barcodes")
- scanner_function = _pikepdf_barcode_scan
-
- # Run the scanner
+ # Always try pikepdf first: it usually works, and it is faster and
+ # uses less memory
try:
- scanner_function(pdf_filepath)
- # Neither method can handle password protected PDFs without it being
- # provided. Log it and continue
- except (PasswordError, PDFPageCountError) as e:
+ _pikepdf_barcode_scan(pdf_filepath)
+ # Password protected files can't be checked
+ except PasswordError as e:
logger.warning(
- f"File is likely password protected, not splitting: {e}",
+ f"File is likely password protected, not checking for barcodes: {e}",
)
- # Handle pikepdf related image decoding issues with a fallback
- except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
+ # Any other failure (such as a pikepdf image decoding issue) falls back to
+ # page by page conversion to images in a temporary directory
+ except Exception as e:
logger.warning(
f"Falling back to pdf2image because: {e}",
)
try:
+ # Clear the list in case some processing worked
separator_page_numbers = []
_pdf2image_barcode_scan(pdf_filepath)
# This file is really borked, allow the consumption to continue
logger.warning(
f"Exception during barcode scanning: {e}",
)
- # We're not sure what happened, but allow the consumption to continue
- except Exception as e: # pragma: no cover
- logger.warning(
- f"Exception during barcode scanning: {e}",
- )
else:
logger.warning(
self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2))
- @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
- def test_barcode_splitter_legacy_fallback(self):
- """
- GIVEN:
- - File containing barcode
- - Legacy method of detection is enabled
- WHEN:
- - File is scanned for barcodes
- THEN:
- - Barcodes are properly detected
- """
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle.pdf",
- )
- tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
-
- pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
- test_file,
- )
-
- self.assertEqual(test_file, pdf_file)
- self.assertTrue(len(separator_page_numbers) > 0)
-
- document_list = barcodes.separate_pages(test_file, separator_page_numbers)
- self.assertTrue(document_list)
- for document in document_list:
- barcodes.save_to_dir(document, target_dir=tempdir)
-
- target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
- target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
-
- self.assertTrue(os.path.isfile(target_file1))
- self.assertTrue(os.path.isfile(target_file2))
-
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
test_file = os.path.join(
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
- def test_scan_file_for_separating_barcodes_password_pikepdf(self):
+ def test_scan_file_for_separating_barcodes_password(self):
"""
GIVEN:
- Password protected PDF
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
-
- @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
- def test_scan_file_for_separating_barcodes_password_pdf2image(self):
- """
- GIVEN:
- - Password protected PDF
- - pdf2image based scanning
- WHEN:
- - File is scanned for barcode
- THEN:
- - Scanning handle the exception without exception
- """
- test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
- pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
- test_file,
- )
-
- self.assertEqual(pdf_file, test_file)
- self.assertListEqual(separator_page_numbers, [])
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
)
-CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
- "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
- "NO",
-)
-
CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_BARCODE_STRING",
"PATCHT",