git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Refactor: performance and storage optimization of barcode scanning (#7646)
author: Lukas Metzger <1814751+loewexy@users.noreply.github.com>
Sat, 7 Sep 2024 23:11:36 +0000 (01:11 +0200)
committer: GitHub <noreply@github.com>
Sat, 7 Sep 2024 23:11:36 +0000 (16:11 -0700)
---------

Co-authored-by: Lukas Metzger <1814751+loewexy@users.noreply.github.com>
Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
docs/configuration.md
src/documents/barcodes.py
src/paperless/settings.py

index 7172afcb3c513556eedab3669addb21d2a83876b..d8ec27d2c8339c733bc2944805f6f03b682e57f6 100644 (file)
@@ -1289,6 +1289,15 @@ combination with PAPERLESS_CONSUMER_BARCODE_UPSCALE bigger than 1.0.
 
     Defaults to "300"
 
+#### [`PAPERLESS_CONSUMER_BARCODE_MAX_PAGES=<int>`](#PAPERLESS_CONSUMER_BARCODE_MAX_PAGES) {#PAPERLESS_CONSUMER_BARCODE_MAX_PAGES}
+
+: Because barcode detection is a computationally intensive operation, this setting
+limits barcode detection to the first N pages of each document. If your scanner has
+a limit on the number of pages it can scan at once, it is sensible to use that limit
+as the value here.
+
+    Defaults to "0", allowing all pages to be checked for barcodes.
+
 #### [`PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=<bool>`](#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE) {#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE}
 
 : Enables the detection of barcodes in the scanned document and
index 2e290a61b6805089030d90129d940ecc28bad52a..97177cbf637e48da089bc1429f18d57c8ed6aeb2 100644 (file)
@@ -7,8 +7,8 @@ from typing import Optional
 
 from django.conf import settings
 from pdf2image import convert_from_path
-from pdf2image.exceptions import PDFPageCountError
 from pikepdf import Page
+from pikepdf import PasswordError
 from pikepdf import Pdf
 from PIL import Image
 
@@ -231,13 +231,41 @@ class BarcodePlugin(ConsumeTaskPlugin):
             logger.debug("Scanning for barcodes using ZXING")
 
         try:
-            pages_from_path = convert_from_path(
-                self.pdf_file,
-                dpi=settings.CONSUMER_BARCODE_DPI,
-                output_folder=self.temp_dir.name,
+            # Read number of pages from pdf
+            with Pdf.open(self.pdf_file) as pdf:
+                num_of_pages = len(pdf.pages)
+            logger.debug(f"PDF has {num_of_pages} pages")
+
+            # Get limit from configuration
+            barcode_max_pages = (
+                num_of_pages
+                if settings.CONSUMER_BARCODE_MAX_PAGES == 0
+                else settings.CONSUMER_BARCODE_MAX_PAGES
             )
 
-            for current_page_number, page in enumerate(pages_from_path):
+            if barcode_max_pages < num_of_pages:  # pragma: no cover
+                logger.debug(
+                    f"Barcodes detection will be limited to the first {barcode_max_pages} pages",
+                )
+
+            # Loop over each page to be scanned
+            for current_page_number in range(min(num_of_pages, barcode_max_pages)):
+                logger.debug(f"Processing page {current_page_number}")
+
+                # Convert page to image
+                page = convert_from_path(
+                    self.pdf_file,
+                    dpi=settings.CONSUMER_BARCODE_DPI,
+                    output_folder=self.temp_dir.name,
+                    first_page=current_page_number + 1,
+                    last_page=current_page_number + 1,
+                )[0]
+
+                # Remember filename, since it is lost by upscaling
+                page_filepath = Path(page.filename)
+                logger.debug(f"Image is at {page_filepath}")
+
+                # Upscale image if configured
                 factor = settings.CONSUMER_BARCODE_UPSCALE
                 if factor > 1.0:
                     logger.debug(
@@ -248,14 +276,18 @@ class BarcodePlugin(ConsumeTaskPlugin):
                         (int(round(x * factor)), (int(round(y * factor)))),
                     )
 
+                # Detect barcodes
                 for barcode_value in reader(page):
                     self.barcodes.append(
                         Barcode(current_page_number, barcode_value),
                     )
 
+                # Delete temporary image file
+                page_filepath.unlink()
+
         # Password protected files can't be checked
         # This is the exception raised for those
-        except PDFPageCountError as e:
+        except PasswordError as e:
             logger.warning(
                 f"File is likely password protected, not checking for barcodes: {e}",
             )
index ee6110732f13cb3284ac18c57eb4f114401f48f5..ebe64ba9ec1f2ab7342a31608f935e03e55b0173 100644 (file)
@@ -925,6 +925,11 @@ CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
 
 CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
 
+CONSUMER_BARCODE_MAX_PAGES: Final[int] = __get_int(
+    "PAPERLESS_CONSUMER_BARCODE_MAX_PAGES",
+    0,
+)
+
 CONSUMER_ENABLE_TAG_BARCODE: Final[bool] = __get_boolean(
     "PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE",
 )