]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Refactor file consumption task to allow beginnings of a plugin system (#5367)
authorTrenton H <797416+stumpylog@users.noreply.github.com>
Sat, 13 Jan 2024 16:11:14 +0000 (08:11 -0800)
committerGitHub <noreply@github.com>
Sat, 13 Jan 2024 16:11:14 +0000 (16:11 +0000)
src/documents/barcodes.py
src/documents/consumer.py
src/documents/double_sided.py
src/documents/plugins/__init__.py [new file with mode: 0644]
src/documents/plugins/base.py [new file with mode: 0644]
src/documents/plugins/helpers.py [new file with mode: 0644]
src/documents/tasks.py
src/documents/tests/test_barcodes.py
src/documents/tests/test_double_sided.py
src/documents/tests/test_workflows.py
src/documents/tests/utils.py

index 5a2c3381ae5b1e3d604c63bb258266dfeb24498d..606451f84396a3d052c4cc9d449b9bf8ccfefcf4 100644 (file)
@@ -3,7 +3,6 @@ import re
 import tempfile
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Final
 from typing import Optional
 
 from django.conf import settings
@@ -15,8 +14,9 @@ from PIL import Image
 
 from documents.converters import convert_from_tiff_to_pdf
 from documents.data_models import ConsumableDocument
-from documents.data_models import DocumentMetadataOverrides
-from documents.data_models import DocumentSource
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import StopConsumeTaskError
+from documents.plugins.helpers import ProgressStatusOptions
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 
@@ -26,7 +26,7 @@ logger = logging.getLogger("paperless.barcodes")
 @dataclass(frozen=True)
 class Barcode:
     """
-    Holds the information about a single barcode and its location
+    Holds the information about a single barcode and its location in a document
     """
 
     page: int
@@ -49,77 +49,111 @@ class Barcode:
         return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)
 
 
-class BarcodeReader:
-    def __init__(self, filepath: Path, mime_type: str) -> None:
-        self.file: Final[Path] = filepath
-        self.mime: Final[str] = mime_type
-        self.pdf_file: Path = self.file
-        self.barcodes: list[Barcode] = []
-        self._tiff_conversion_done = False
-        self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
+class BarcodePlugin(ConsumeTaskPlugin):
+    NAME: str = "BarcodePlugin"
 
+    @property
+    def able_to_run(self) -> bool:
+        """
+        Able to run if:
+          - ASN from barcode detection is enabled or
+          - Barcode support is enabled and the mime type is supported
+        """
         if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
-            self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"}
+            supported_mimes = {"application/pdf", "image/tiff"}
         else:
-            self.SUPPORTED_FILE_MIMES = {"application/pdf"}
+            supported_mimes = {"application/pdf"}
 
-    def __enter__(self):
-        if self.supported_mime_type:
-            self.temp_dir = tempfile.TemporaryDirectory(prefix="paperless-barcodes")
-        return self
+        return (
+            settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES
+        ) and self.input_doc.mime_type in supported_mimes
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if self.temp_dir is not None:
-            self.temp_dir.cleanup()
-            self.temp_dir = None
+    def setup(self):
+        self.temp_dir = tempfile.TemporaryDirectory(
+            dir=self.base_tmp_dir,
+            prefix="barcode",
+        )
+        self.pdf_file = self.input_doc.original_file
+        self._tiff_conversion_done = False
+        self.barcodes: list[Barcode] = []
 
-    @property
-    def supported_mime_type(self) -> bool:
-        """
-        Return True if the given mime type is supported for barcodes, false otherwise
-        """
-        return self.mime in self.SUPPORTED_FILE_MIMES
+    def run(self) -> Optional[str]:
+        # Maybe do the conversion of TIFF to PDF
+        self.convert_from_tiff_to_pdf()
 
-    @property
-    def asn(self) -> Optional[int]:
-        """
-        Search the parsed barcodes for any ASNs.
-        The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
-        is considered the ASN to be used.
-        Returns the detected ASN (or None)
-        """
-        asn = None
+        # Locate any barcodes in the files
+        self.detect()
 
-        if not self.supported_mime_type:
-            return None
+        # Update/overwrite an ASN if possible
+        located_asn = self.asn
+        if located_asn is not None:
+            logger.info(f"Found ASN in barcode: {located_asn}")
+            self.metadata.asn = located_asn
 
-        # Ensure the barcodes have been read
-        self.detect()
+        separator_pages = self.get_separation_pages()
+        if not separator_pages:
+            return "No pages to split on!"
 
-        # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
-        asn_text = next(
-            (x.value for x in self.barcodes if x.is_asn),
-            None,
-        )
+        # We have pages to split against
 
-        if asn_text:
-            logger.debug(f"Found ASN Barcode: {asn_text}")
-            # remove the prefix and remove whitespace
-            asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
+        # Note this does NOT use the base_temp_dir, as that will be removed
+        tmp_dir = Path(
+            tempfile.mkdtemp(
+                dir=settings.SCRATCH_DIR,
+                prefix="paperless-barcode-split-",
+            ),
+        ).resolve()
 
-            # remove non-numeric parts of the remaining string
-            asn_text = re.sub(r"\D", "", asn_text)
+        from documents import tasks
 
-            # now, try parsing the ASN number
-            try:
-                asn = int(asn_text)
-            except ValueError as e:
-                logger.warning(f"Failed to parse ASN number because: {e}")
+        # Create the split document tasks
+        for new_document in self.separate_pages(separator_pages):
+            copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
 
-        return asn
+            task = tasks.consume_file.delay(
+                ConsumableDocument(
+                    # Same source, for templates
+                    source=self.input_doc.source,
+                    mailrule_id=self.input_doc.mailrule_id,
+                    # Can't use same folder or the consume might grab it again
+                    original_file=(tmp_dir / new_document.name).resolve(),
+                ),
+                # All the same metadata
+                self.metadata,
+            )
+            logger.info(f"Created new task {task.id} for {new_document.name}")
+
+        # This file is now two or more files
+        self.input_doc.original_file.unlink()
+
+        msg = "Barcode splitting complete!"
+
+        # Update the progress to complete
+        self.status_mgr.send_progress(ProgressStatusOptions.SUCCESS, msg, 100, 100)
+
+        # Request the consume task stops
+        raise StopConsumeTaskError(msg)
+
+    def cleanup(self) -> None:
+        self.temp_dir.cleanup()
+
+    def convert_from_tiff_to_pdf(self):
+        """
+        May convert a TIFF image into a PDF, if the input is a TIFF and
+        the TIFF has not been made into a PDF
+        """
+        # Nothing to do, pdf_file is already assigned correctly
+        if self.input_doc.mime_type != "image/tiff" or self._tiff_conversion_done:
+            return
+
+        self.pdf_file = convert_from_tiff_to_pdf(
+            self.input_doc.original_file,
+            Path(self.temp_dir.name),
+        )
+        self._tiff_conversion_done = True
 
     @staticmethod
-    def read_barcodes_zxing(image: Image) -> list[str]:
+    def read_barcodes_zxing(image: Image.Image) -> list[str]:
         barcodes = []
 
         import zxingcpp
@@ -135,7 +169,7 @@ class BarcodeReader:
         return barcodes
 
     @staticmethod
-    def read_barcodes_pyzbar(image: Image) -> list[str]:
+    def read_barcodes_pyzbar(image: Image.Image) -> list[str]:
         barcodes = []
 
         from pyzbar import pyzbar
@@ -154,18 +188,6 @@ class BarcodeReader:
 
         return barcodes
 
-    def convert_from_tiff_to_pdf(self):
-        """
-        May convert a TIFF image into a PDF, if the input is a TIFF and
-        the TIFF has not been made into a PDF
-        """
-        # Nothing to do, pdf_file is already assigned correctly
-        if self.mime != "image/tiff" or self._tiff_conversion_done:
-            return
-
-        self._tiff_conversion_done = True
-        self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
-
     def detect(self) -> None:
         """
         Scan all pages of the PDF as images, updating barcodes and the pages
@@ -218,10 +240,45 @@ class BarcodeReader:
         # This file is really borked, allow the consumption to continue
         # but it may fail further on
         except Exception as e:  # pragma: no cover
-            logger.exception(
+            logger.warning(
                 f"Exception during barcode scanning: {e}",
             )
 
+    @property
+    def asn(self) -> Optional[int]:
+        """
+        Search the parsed barcodes for any ASNs.
+        The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
+        is considered the ASN to be used.
+        Returns the detected ASN (or None)
+        """
+        asn = None
+
+        # Ensure the barcodes have been read
+        self.detect()
+
+        # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
+        asn_text = next(
+            (x.value for x in self.barcodes if x.is_asn),
+            None,
+        )
+
+        if asn_text:
+            logger.debug(f"Found ASN Barcode: {asn_text}")
+            # remove the prefix and remove whitespace
+            asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
+
+            # remove non-numeric parts of the remaining string
+            asn_text = re.sub(r"\D", "", asn_text)
+
+            # now, try parsing the ASN number
+            try:
+                asn = int(asn_text)
+            except ValueError as e:
+                logger.warning(f"Failed to parse ASN number because: {e}")
+
+        return asn
+
     def get_separation_pages(self) -> dict[int, bool]:
         """
         Search the parsed barcodes for separators and returns a dict of page
@@ -251,7 +308,7 @@ class BarcodeReader:
         """
 
         document_paths = []
-        fname = self.file.stem
+        fname = self.input_doc.original_file.stem
         with Pdf.open(self.pdf_file) as input_pdf:
             # Start with an empty document
             current_document: list[Page] = []
@@ -292,58 +349,8 @@ class BarcodeReader:
                 with open(savepath, "wb") as out:
                     dst.save(out)
 
-                copy_basic_file_stats(self.file, savepath)
+                copy_basic_file_stats(self.input_doc.original_file, savepath)
 
                 document_paths.append(savepath)
 
             return document_paths
-
-    def separate(
-        self,
-        source: DocumentSource,
-        overrides: DocumentMetadataOverrides,
-    ) -> bool:
-        """
-        Separates the document, based on barcodes and configuration, creating new
-        documents as required in the appropriate location.
-
-        Returns True if a split happened, False otherwise
-        """
-        # Do nothing
-        if not self.supported_mime_type:
-            logger.warning(f"Unsupported file format for barcode reader: {self.mime}")
-            return False
-
-        # Does nothing unless needed
-        self.convert_from_tiff_to_pdf()
-
-        # Actually read the codes, if any
-        self.detect()
-
-        separator_pages = self.get_separation_pages()
-
-        # Also do nothing
-        if not separator_pages:
-            logger.warning("No pages to split on!")
-            return False
-
-        tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve()
-
-        from documents import tasks
-
-        # Create the split document tasks
-        for new_document in self.separate_pages(separator_pages):
-            copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
-
-            tasks.consume_file.delay(
-                ConsumableDocument(
-                    # Same source, for templates
-                    source=source,
-                    # Can't use same folder or the consume might grab it again
-                    original_file=(tmp_dir / new_document.name).resolve(),
-                ),
-                # All the same metadata
-                overrides,
-            )
-        logger.info("Barcode splitting complete!")
-        return True
index 06e9f68fc4ab0adaddaf69715e12865d2ccf21e7..01b25edeaf9af0c7a6c92ed2385484f37a4d4f01 100644 (file)
@@ -21,7 +21,6 @@ from filelock import FileLock
 from rest_framework.reverse import reverse
 
 from documents.classifier import load_classifier
-from documents.data_models import ConsumableDocument
 from documents.data_models import DocumentMetadataOverrides
 from documents.file_handling import create_source_path_directory
 from documents.file_handling import generate_unique_filename
@@ -42,12 +41,83 @@ from documents.parsers import ParseError
 from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import parse_date
 from documents.permissions import set_permissions_for_object
+from documents.plugins.base import AlwaysRunPluginMixin
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import NoCleanupPluginMixin
+from documents.plugins.base import NoSetupPluginMixin
 from documents.signals import document_consumption_finished
 from documents.signals import document_consumption_started
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 
 
+class WorkflowTriggerPlugin(
+    NoCleanupPluginMixin,
+    NoSetupPluginMixin,
+    AlwaysRunPluginMixin,
+    ConsumeTaskPlugin,
+):
+    NAME: str = "WorkflowTriggerPlugin"
+
+    def run(self) -> Optional[str]:
+        """
+        Get overrides from matching workflows
+        """
+        overrides = DocumentMetadataOverrides()
+        for workflow in Workflow.objects.filter(enabled=True).order_by("order"):
+            template_overrides = DocumentMetadataOverrides()
+
+            if document_matches_workflow(
+                self.input_doc,
+                workflow,
+                WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
+            ):
+                for action in workflow.actions.all():
+                    if action.assign_title is not None:
+                        template_overrides.title = action.assign_title
+                    if action.assign_tags is not None:
+                        template_overrides.tag_ids = [
+                            tag.pk for tag in action.assign_tags.all()
+                        ]
+                    if action.assign_correspondent is not None:
+                        template_overrides.correspondent_id = (
+                            action.assign_correspondent.pk
+                        )
+                    if action.assign_document_type is not None:
+                        template_overrides.document_type_id = (
+                            action.assign_document_type.pk
+                        )
+                    if action.assign_storage_path is not None:
+                        template_overrides.storage_path_id = (
+                            action.assign_storage_path.pk
+                        )
+                    if action.assign_owner is not None:
+                        template_overrides.owner_id = action.assign_owner.pk
+                    if action.assign_view_users is not None:
+                        template_overrides.view_users = [
+                            user.pk for user in action.assign_view_users.all()
+                        ]
+                    if action.assign_view_groups is not None:
+                        template_overrides.view_groups = [
+                            group.pk for group in action.assign_view_groups.all()
+                        ]
+                    if action.assign_change_users is not None:
+                        template_overrides.change_users = [
+                            user.pk for user in action.assign_change_users.all()
+                        ]
+                    if action.assign_change_groups is not None:
+                        template_overrides.change_groups = [
+                            group.pk for group in action.assign_change_groups.all()
+                        ]
+                    if action.assign_custom_fields is not None:
+                        template_overrides.custom_field_ids = [
+                            field.pk for field in action.assign_custom_fields.all()
+                        ]
+
+                    overrides.update(template_overrides)
+        self.metadata.update(overrides)
+
+
 class ConsumerError(Exception):
     pass
 
@@ -602,70 +672,6 @@ class Consumer(LoggingMixin):
 
         return document
 
-    def get_workflow_overrides(
-        self,
-        input_doc: ConsumableDocument,
-    ) -> DocumentMetadataOverrides:
-        """
-        Get overrides from matching workflows
-        """
-        overrides = DocumentMetadataOverrides()
-        for workflow in Workflow.objects.filter(enabled=True).order_by("order"):
-            template_overrides = DocumentMetadataOverrides()
-
-            if document_matches_workflow(
-                input_doc,
-                workflow,
-                WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
-            ):
-                for action in workflow.actions.all():
-                    self.log.info(
-                        f"Applying overrides in {action} from {workflow}",
-                    )
-                    if action.assign_title is not None:
-                        template_overrides.title = action.assign_title
-                    if action.assign_tags is not None:
-                        template_overrides.tag_ids = [
-                            tag.pk for tag in action.assign_tags.all()
-                        ]
-                    if action.assign_correspondent is not None:
-                        template_overrides.correspondent_id = (
-                            action.assign_correspondent.pk
-                        )
-                    if action.assign_document_type is not None:
-                        template_overrides.document_type_id = (
-                            action.assign_document_type.pk
-                        )
-                    if action.assign_storage_path is not None:
-                        template_overrides.storage_path_id = (
-                            action.assign_storage_path.pk
-                        )
-                    if action.assign_owner is not None:
-                        template_overrides.owner_id = action.assign_owner.pk
-                    if action.assign_view_users is not None:
-                        template_overrides.view_users = [
-                            user.pk for user in action.assign_view_users.all()
-                        ]
-                    if action.assign_view_groups is not None:
-                        template_overrides.view_groups = [
-                            group.pk for group in action.assign_view_groups.all()
-                        ]
-                    if action.assign_change_users is not None:
-                        template_overrides.change_users = [
-                            user.pk for user in action.assign_change_users.all()
-                        ]
-                    if action.assign_change_groups is not None:
-                        template_overrides.change_groups = [
-                            group.pk for group in action.assign_change_groups.all()
-                        ]
-                    if action.assign_custom_fields is not None:
-                        template_overrides.custom_field_ids = [
-                            field.pk for field in action.assign_custom_fields.all()
-                        ]
-
-                    overrides.update(template_overrides)
-        return overrides
-
     def _parse_title_placeholders(self, title: str) -> str:
         local_added = timezone.localtime(timezone.now())
 
index 5acde159730fe1758fdec253fe9fe2f20d164b7a..bfe66f4feee77d6251029534e90331862b95df93 100644 (file)
@@ -3,127 +3,145 @@ import logging
 import os
 import shutil
 from pathlib import Path
+from typing import Final
+from typing import Optional
 
 from django.conf import settings
 from pikepdf import Pdf
 
 from documents.consumer import ConsumerError
 from documents.converters import convert_from_tiff_to_pdf
-from documents.data_models import ConsumableDocument
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import NoCleanupPluginMixin
+from documents.plugins.base import NoSetupPluginMixin
+from documents.plugins.base import StopConsumeTaskError
 
 logger = logging.getLogger("paperless.double_sided")
 
 # Hardcoded for now, could be made a configurable setting if needed
-TIMEOUT_MINUTES = 30
+TIMEOUT_MINUTES: Final[int] = 30
+TIMEOUT_SECONDS: Final[int] = TIMEOUT_MINUTES * 60
 
 # Used by test cases
 STAGING_FILE_NAME = "double-sided-staging.pdf"
 
 
-def collate(input_doc: ConsumableDocument) -> str:
-    """
-    Tries to collate pages from 2 single sided scans of a double sided
-    document.
-
-    When called with a file, it checks whether or not a staging file
-    exists, if not, the current file is turned into that staging file
-    containing the odd numbered pages.
-
-    If a staging file exists, and it is not too old, the current file is
-    considered to be the second part (the even numbered pages) and it will
-    collate the pages of both, the pages of the second file will be added
-    in reverse order, since the ADF will have scanned the pages from bottom
-    to top.
-
-    Returns a status message on success, or raises a ConsumerError
-    in case of failure.
-    """
-
-    # Make sure scratch dir exists, Consumer might not have run yet
-    settings.SCRATCH_DIR.mkdir(exist_ok=True)
-
-    if input_doc.mime_type == "application/pdf":
-        pdf_file = input_doc.original_file
-    elif (
-        input_doc.mime_type == "image/tiff"
-        and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
-    ):
-        pdf_file = convert_from_tiff_to_pdf(
-            input_doc.original_file,
-            settings.SCRATCH_DIR,
+class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin):
+    NAME: str = "CollatePlugin"
+
+    @property
+    def able_to_run(self) -> bool:
+        return (
+            settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED
+            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
+            in self.input_doc.original_file.parts
         )
-        input_doc.original_file.unlink()
-    else:
-        raise ConsumerError("Unsupported file type for collation of double-sided scans")
-
-    staging = settings.SCRATCH_DIR / STAGING_FILE_NAME
-
-    valid_staging_exists = False
-    if staging.exists():
-        stats = os.stat(str(staging))
-        # if the file is older than the timeout, we don't consider
-        # it valid
-        if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60:
-            logger.warning("Outdated double sided staging file exists, deleting it")
-            os.unlink(str(staging))
+
+    def run(self) -> Optional[str]:
+        """
+        Tries to collate pages from 2 single sided scans of a double sided
+        document.
+
+        When called with a file, it checks whether or not a staging file
+        exists, if not, the current file is turned into that staging file
+        containing the odd numbered pages.
+
+        If a staging file exists, and it is not too old, the current file is
+        considered to be the second part (the even numbered pages) and it will
+        collate the pages of both, the pages of the second file will be added
+        in reverse order, since the ADF will have scanned the pages from bottom
+        to top.
+
+        Returns a status message on success, or raises a ConsumerError
+        in case of failure.
+        """
+
+        if self.input_doc.mime_type == "application/pdf":
+            pdf_file = self.input_doc.original_file
+        elif (
+            self.input_doc.mime_type == "image/tiff"
+            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
+        ):
+            pdf_file = convert_from_tiff_to_pdf(
+                self.input_doc.original_file,
+                self.base_tmp_dir,
+            )
+            self.input_doc.original_file.unlink()
         else:
-            valid_staging_exists = True
-
-    if valid_staging_exists:
-        try:
-            # Collate pages from second PDF in reverse order
-            with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
-                pdf2.pages.reverse()
-                try:
-                    for i, page in enumerate(pdf2.pages):
-                        pdf1.pages.insert(2 * i + 1, page)
-                except IndexError:
-                    raise ConsumerError(
-                        "This second file (even numbered pages) contains more "
-                        "pages than the first/odd numbered one. This means the "
-                        "two uploaded files don't belong to the same double-"
-                        "sided scan. Please retry, starting with the odd "
-                        "numbered pages again.",
+            raise ConsumerError(
+                "Unsupported file type for collation of double-sided scans",
+            )
+
+        staging: Path = settings.SCRATCH_DIR / STAGING_FILE_NAME
+
+        valid_staging_exists = False
+        if staging.exists():
+            stats = staging.stat()
+            # if the file is older than the timeout, we don't consider
+            # it valid
+            if (dt.datetime.now().timestamp() - stats.st_mtime) > TIMEOUT_SECONDS:
+                logger.warning("Outdated double sided staging file exists, deleting it")
+                staging.unlink()
+            else:
+                valid_staging_exists = True
+
+        if valid_staging_exists:
+            try:
+                # Collate pages from second PDF in reverse order
+                with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
+                    pdf2.pages.reverse()
+                    try:
+                        for i, page in enumerate(pdf2.pages):
+                            pdf1.pages.insert(2 * i + 1, page)
+                    except IndexError:
+                        raise ConsumerError(
+                            "This second file (even numbered pages) contains more "
+                            "pages than the first/odd numbered one. This means the "
+                            "two uploaded files don't belong to the same double-"
+                            "sided scan. Please retry, starting with the odd "
+                            "numbered pages again.",
+                        )
+                    # Merged file has the same path, but without the
+                    # double-sided subdir. Therefore, it is also in the
+                    # consumption dir and will be picked up for processing
+                    old_file = self.input_doc.original_file
+                    new_file = Path(
+                        *(
+                            part
+                            for part in old_file.with_name(
+                                f"{old_file.stem}-collated.pdf",
+                            ).parts
+                            if part
+                            != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
+                        ),
                     )
-                # Merged file has the same path, but without the
-                # double-sided subdir. Therefore, it is also in the
-                # consumption dir and will be picked up for processing
-                old_file = input_doc.original_file
-                new_file = Path(
-                    *(
-                        part
-                        for part in old_file.with_name(
-                            f"{old_file.stem}-collated.pdf",
-                        ).parts
-                        if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
-                    ),
+                    # If the user didn't create the subdirs yet, do it for them
+                    new_file.parent.mkdir(parents=True, exist_ok=True)
+                    pdf1.save(new_file)
+                logger.info("Collated documents into new file %s", new_file)
+                raise StopConsumeTaskError(
+                    "Success. Even numbered pages of double sided scan collated "
+                    "with odd pages",
                 )
-                # If the user didn't create the subdirs yet, do it for them
-                new_file.parent.mkdir(parents=True, exist_ok=True)
-                pdf1.save(new_file)
-            logger.info("Collated documents into new file %s", new_file)
-            return (
-                "Success. Even numbered pages of double sided scan collated "
-                "with odd pages"
+            finally:
+                # Delete staging and recently uploaded file no matter what.
+                # If any error occurs, the user needs to be able to restart
+                # the process from scratch; after all, the staging file
+                # with the odd numbered pages might be the culprit
+                pdf_file.unlink()
+                staging.unlink()
+
+        else:
+            shutil.move(pdf_file, staging)
+            # update access to modification time so we know if the file
+            # is outdated when another file gets uploaded
+            timestamp = dt.datetime.now().timestamp()
+            os.utime(staging, (timestamp, timestamp))
+            logger.info(
+                "Got scan with odd numbered pages of double-sided scan, moved it to %s",
+                staging,
+            )
+            raise StopConsumeTaskError(
+                "Received odd numbered pages of double sided scan, waiting up to "
+                f"{TIMEOUT_MINUTES} minutes for even numbered pages",
             )
-        finally:
-            # Delete staging and recently uploaded file no matter what.
-            # If any error occurs, the user needs to be able to restart
-            # the process from scratch; after all, the staging file
-            # with the odd numbered pages might be the culprit
-            pdf_file.unlink()
-            staging.unlink()
-
-    else:
-        shutil.move(pdf_file, staging)
-        # update access to modification time so we know if the file
-        # is outdated when another file gets uploaded
-        os.utime(staging, (dt.datetime.now().timestamp(),) * 2)
-        logger.info(
-            "Got scan with odd numbered pages of double-sided scan, moved it to %s",
-            staging,
-        )
-        return (
-            "Received odd numbered pages of double sided scan, waiting up to "
-            f"{TIMEOUT_MINUTES} minutes for even numbered pages"
-        )
diff --git a/src/documents/plugins/__init__.py b/src/documents/plugins/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/documents/plugins/base.py b/src/documents/plugins/base.py
new file mode 100644 (file)
index 0000000..aec4887
--- /dev/null
@@ -0,0 +1,131 @@
+import abc
+from pathlib import Path
+from typing import Final
+from typing import Optional
+
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
+from documents.plugins.helpers import ProgressManager
+
+
+class StopConsumeTaskError(Exception):
+    """
+    A plugin setup or run may raise this to exit the asynchronous consume task.
+
+    Most likely, this means it has created one or more new tasks to execute instead,
+    such as when a barcode has been used to create new documents
+    """
+
+    def __init__(self, message: str) -> None:
+        self.message = message
+        super().__init__(message)
+
+
+class ConsumeTaskPlugin(abc.ABC):
+    """
+    Defines the interface for a plugin for the document consume task
+    Meanings as per RFC2119 (https://datatracker.ietf.org/doc/html/rfc2119)
+
+    Plugin Implementation
+
+    The plugin SHALL implement property able_to_run and methods setup, run and cleanup.
+    The plugin property able_to_run SHALL return True if the plugin is able to run, given the conditions, settings and document information.
+    The plugin property able_to_run MAY be hardcoded to return True.
+    The plugin setup SHOULD perform any resource creation or additional initialization needed to run the document.
+    The plugin setup MAY be a non-operation.
+    The plugin cleanup SHOULD perform resource cleanup, including in the event of an error.
+    The plugin cleanup MAY be a non-operation.
+    The plugin run SHALL perform any operations against the document or system state required for the plugin.
+    The plugin run MAY update the document metadata.
+    The plugin run MAY return an informational message.
+    The plugin run MAY raise StopConsumeTaskError to cease any further operations against the document.
+
+    Plugin Manager Implementation
+
+    The plugin manager SHALL provide the plugin with the input document, document metadata, progress manager and a created temporary directory.
+    The plugin manager SHALL execute the plugin setup, run and cleanup, in that order IF the plugin property able_to_run is True.
+    The plugin manager SHOULD log the return message of executing a plugin's run.
+    The plugin manager SHALL always execute the plugin cleanup, IF the plugin property able_to_run is True.
+    The plugin manager SHALL cease calling plugins and exit the task IF a plugin raises StopConsumeTaskError.
+    The plugin manager SHOULD return the StopConsumeTaskError message IF a plugin raises StopConsumeTaskError.
+    """
+
+    NAME: str = "ConsumeTaskPlugin"
+
+    def __init__(
+        self,
+        input_doc: ConsumableDocument,
+        metadata: DocumentMetadataOverrides,
+        status_mgr: ProgressManager,
+        base_tmp_dir: Path,
+        task_id: str,
+    ) -> None:
+        super().__init__()
+        self.input_doc = input_doc
+        self.metadata = metadata
+        self.base_tmp_dir: Final = base_tmp_dir
+        self.status_mgr = status_mgr
+        self.task_id: Final = task_id
+
+    @abc.abstractproperty
+    def able_to_run(self) -> bool:
+        """
+        Return True if the conditions are met for the plugin to run, False otherwise
+
+        If False, setup(), run() and cleanup() will not be called
+        """
+
+    @abc.abstractmethod
+    def setup(self) -> None:
+        """
+        Allows the plugin to perform any additional setup it may need, such as creating
+        a temporary directory, copying a file somewhere, etc.
+
+        Executed before run()
+
+        In general, this should be the "light" work, not the bulk of processing
+        """
+
+    @abc.abstractmethod
+    def run(self) -> Optional[str]:
+        """
+        The bulk of plugin processing, this does whatever action the plugin is for.
+
+        Executed after setup() and before cleanup()
+        """
+
+    @abc.abstractmethod
+    def cleanup(self) -> None:
+        """
+        Allows the plugin to execute any cleanup it may require
+
+        Executed after run(), even in the case of error
+        """
+
+
+class AlwaysRunPluginMixin(ConsumeTaskPlugin):
+    """
+    A plugin which is always able to run
+    """
+
+    @property
+    def able_to_run(self) -> bool:
+        return True
+
+
+class NoSetupPluginMixin(ConsumeTaskPlugin):
+    """
+    A plugin which requires no setup
+    """
+
+    def setup(self) -> None:
+        pass
+
+
+class NoCleanupPluginMixin(ConsumeTaskPlugin):
+    """
+    A plugin which needs to clean up no files
+    """
+
+    def cleanup(self) -> None:
+        pass
diff --git a/src/documents/plugins/helpers.py b/src/documents/plugins/helpers.py
new file mode 100644 (file)
index 0000000..92fe125
--- /dev/null
@@ -0,0 +1,82 @@
+import enum
+from typing import TYPE_CHECKING
+from typing import Optional
+from typing import Union
+
+from asgiref.sync import async_to_sync
+from channels.layers import get_channel_layer
+from channels_redis.pubsub import RedisPubSubChannelLayer
+
+
+class ProgressStatusOptions(str, enum.Enum):
+    STARTED = "STARTED"
+    WORKING = "WORKING"
+    SUCCESS = "SUCCESS"
+    FAILED = "FAILED"
+
+
+class ProgressManager:
+    """
+    Handles sending of progress information via the channel layer, with proper management
+    of the open/close of the layer to ensure messages go out and everything is cleaned up
+    """
+
+    def __init__(self, filename: str, task_id: Optional[str] = None) -> None:
+        self.filename = filename
+        self._channel: Optional[RedisPubSubChannelLayer] = None
+        self.task_id = task_id
+
+    def __enter__(self):
+        self.open()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def open(self) -> None:
+        """
+        If not already opened, gets the default channel layer
+        opened and ready to send messages
+        """
+        if self._channel is None:
+            self._channel = get_channel_layer()
+
+    def close(self) -> None:
+        """
+        If it was opened, flushes the channel layer
+        """
+        if self._channel is not None:
+            async_to_sync(self._channel.flush)
+            self._channel = None
+
+    def send_progress(
+        self,
+        status: ProgressStatusOptions,
+        message: str,
+        current_progress: int,
+        max_progress: int,
+        extra_args: Optional[dict[str, Union[str, int]]] = None,
+    ) -> None:
+        # Ensure the layer is open
+        self.open()
+
+        # Just for IDEs
+        if TYPE_CHECKING:
+            assert self._channel is not None
+
+        payload = {
+            "type": "status_update",
+            "data": {
+                "filename": self.filename,
+                "task_id": self.task_id,
+                "current_progress": current_progress,
+                "max_progress": max_progress,
+                "status": status,
+                "message": message,
+            },
+        }
+        if extra_args is not None:
+            payload["data"].update(extra_args)
+
+        # Construct and send the update
+        async_to_sync(self._channel.group_send)("status_updates", payload)
index abb9cd39db119459ff9c456f9a516733c9e02560..a83c2e6cdd65a207537477dedd87a39bfdf2b361 100644 (file)
@@ -2,30 +2,30 @@ import hashlib
 import logging
 import shutil
 import uuid
+from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import Optional
 
 import tqdm
-from asgiref.sync import async_to_sync
 from celery import Task
 from celery import shared_task
-from channels.layers import get_channel_layer
 from django.conf import settings
 from django.db import transaction
 from django.db.models.signals import post_save
 from filelock import FileLock
-from redis.exceptions import ConnectionError
 from whoosh.writing import AsyncWriter
 
 from documents import index
 from documents import sanity_checker
-from documents.barcodes import BarcodeReader
+from documents.barcodes import BarcodePlugin
 from documents.classifier import DocumentClassifier
 from documents.classifier import load_classifier
 from documents.consumer import Consumer
 from documents.consumer import ConsumerError
+from documents.consumer import WorkflowTriggerPlugin
 from documents.data_models import ConsumableDocument
 from documents.data_models import DocumentMetadataOverrides
-from documents.double_sided import collate
+from documents.double_sided import CollatePlugin
 from documents.file_handling import create_source_path_directory
 from documents.file_handling import generate_unique_filename
 from documents.models import Correspondent
@@ -35,6 +35,10 @@ from documents.models import StoragePath
 from documents.models import Tag
 from documents.parsers import DocumentParser
 from documents.parsers import get_parser_class_for_mime_type
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import ProgressManager
+from documents.plugins.base import StopConsumeTaskError
+from documents.plugins.helpers import ProgressStatusOptions
 from documents.sanity_checker import SanityCheckFailedException
 from documents.signals import document_updated
 
@@ -102,70 +106,60 @@ def consume_file(
     input_doc: ConsumableDocument,
     overrides: Optional[DocumentMetadataOverrides] = None,
 ):
-    def send_progress(status="SUCCESS", message="finished"):
-        payload = {
-            "filename": overrides.filename or input_doc.original_file.name,
-            "task_id": None,
-            "current_progress": 100,
-            "max_progress": 100,
-            "status": status,
-            "message": message,
-        }
-        try:
-            async_to_sync(get_channel_layer().group_send)(
-                "status_updates",
-                {"type": "status_update", "data": payload},
-            )
-        except ConnectionError as e:
-            logger.warning(f"ConnectionError on status send: {e!s}")
-
     # Default no overrides
     if overrides is None:
         overrides = DocumentMetadataOverrides()
 
-    # Handle collation of double-sided documents scanned in two parts
-    if settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED and (
-        settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
-        in input_doc.original_file.parts
-    ):
-        try:
-            msg = collate(input_doc)
-            send_progress(message=msg)
-            return msg
-        except ConsumerError as e:
-            send_progress(status="FAILURE", message=e.args[0])
-            raise e
-
-    # read all barcodes in the current document
-    if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
-        with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
-            if settings.CONSUMER_ENABLE_BARCODES and reader.separate(
-                input_doc.source,
+    plugins: list[type[ConsumeTaskPlugin]] = [
+        CollatePlugin,
+        BarcodePlugin,
+        WorkflowTriggerPlugin,
+    ]
+
+    with ProgressManager(
+        overrides.filename or input_doc.original_file.name,
+        self.request.id,
+    ) as status_mgr, TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
+        tmp_dir = Path(tmp_dir)
+        for plugin_class in plugins:
+            plugin_name = plugin_class.NAME
+
+            plugin = plugin_class(
+                input_doc,
                 overrides,
-            ):
-                # notify the sender, otherwise the progress bar
-                # in the UI stays stuck
-                send_progress()
-                # consuming stops here, since the original document with
-                # the barcodes has been split and will be consumed separately
-                input_doc.original_file.unlink()
-                return "File successfully split"
-
-            # try reading the ASN from barcode
-            if (
-                settings.CONSUMER_ENABLE_ASN_BARCODE
-                and (located_asn := reader.asn) is not None
-            ):
-                # Note this will take precedence over an API provided ASN
-                # But it's from a physical barcode, so that's good
-                overrides.asn = located_asn
-                logger.info(f"Found ASN in barcode: {overrides.asn}")
-
-    template_overrides = Consumer().get_workflow_overrides(
-        input_doc=input_doc,
-    )
+                status_mgr,
+                tmp_dir,
+                self.request.id,
+            )
+
+            if not plugin.able_to_run:
+                logger.debug(f"Skipping plugin {plugin_name}")
+                continue
+
+            try:
+                logger.debug(f"Executing plugin {plugin_name}")
+                plugin.setup()
+
+                msg = plugin.run()
+
+                if msg is not None:
+                    logger.info(f"{plugin_name} completed with: {msg}")
+                else:
+                    logger.info(f"{plugin_name} completed with no message")
+
+                overrides = plugin.metadata
+
+            except StopConsumeTaskError as e:
+                logger.info(f"{plugin_name} requested task exit: {e.message}")
+                return e.message
+
+            except Exception as e:
+                logger.exception(f"{plugin_name} failed: {e}")
+                status_mgr.send_progress(ProgressStatusOptions.FAILED, f"{e}", 100, 100)
+                raise
 
-    overrides.update(template_overrides)
+            finally:
+                plugin.cleanup()
 
     # continue with consumption if no barcode was found
     document = Consumer().try_consume_file(
index e4d8ccc57aaeb6f6cd0f06324207397609b66a93..4552a2b77fb0a5004ca485f39acd782f06674283 100644 (file)
@@ -1,4 +1,7 @@
 import shutil
+from collections.abc import Generator
+from contextlib import contextmanager
+from pathlib import Path
 from unittest import mock
 
 import pytest
@@ -7,14 +10,13 @@ from django.test import TestCase
 from django.test import override_settings
 
 from documents import tasks
-from documents.barcodes import BarcodeReader
-from documents.consumer import ConsumerError
+from documents.barcodes import BarcodePlugin
 from documents.data_models import ConsumableDocument
 from documents.data_models import DocumentMetadataOverrides
 from documents.data_models import DocumentSource
-from documents.models import Document
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import DocumentConsumeDelayMixin
+from documents.tests.utils import DummyProgressManager
 from documents.tests.utils import FileSystemAssertsMixin
 from documents.tests.utils import SampleDirMixin
 
@@ -26,8 +28,29 @@ except ImportError:
     HAS_ZXING_LIB = False
 
 
+class GetReaderPluginMixin:
+    @contextmanager
+    def get_reader(self, filepath: Path) -> Generator[BarcodePlugin, None, None]:
+        reader = BarcodePlugin(
+            ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath),
+            DocumentMetadataOverrides(),
+            DummyProgressManager(filepath.name, None),
+            self.dirs.scratch_dir,
+            "task-id",
+        )
+        reader.setup()
+        yield reader
+        reader.cleanup()
+
+
 @override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
-class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, TestCase):
+class TestBarcode(
+    DirectoriesMixin,
+    FileSystemAssertsMixin,
+    SampleDirMixin,
+    GetReaderPluginMixin,
+    TestCase,
+):
     def test_scan_file_for_separating_barcodes(self):
         """
         GIVEN:
@@ -39,7 +62,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -60,7 +83,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
 
-        with BarcodeReader(test_file, "image/tiff") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -80,7 +103,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
 
-        with BarcodeReader(test_file, "image/tiff") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -97,7 +120,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
             - No pages to split on
         """
         test_file = self.SAMPLE_DIR / "simple.pdf"
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -115,7 +138,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -133,7 +156,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "several-patcht-codes.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -158,7 +181,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         ]:
             test_file = self.BARCODE_SAMPLE_DIR / test_file
 
-            with BarcodeReader(test_file, "application/pdf") as reader:
+            with self.get_reader(test_file) as reader:
                 reader.detect()
                 separator_page_numbers = reader.get_separation_pages()
 
@@ -177,7 +200,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-unreadable.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -195,7 +218,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-fax-image.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -214,7 +237,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-qr.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -234,7 +257,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -255,7 +278,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-custom.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -276,7 +299,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-128-custom.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -296,7 +319,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -315,7 +338,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "many-qr-codes.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -334,7 +357,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.SAMPLE_DIR / "password-is-test.pdf"
         with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
-            with BarcodeReader(test_file, "application/pdf") as reader:
+            with self.get_reader(test_file) as reader:
                 reader.detect()
                 warning = cm.output[0]
                 expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
@@ -356,7 +379,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             documents = reader.separate_pages({1: False})
 
             self.assertEqual(reader.pdf_file, test_file)
@@ -373,7 +396,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-double.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             documents = reader.separate_pages({1: False, 2: False})
 
             self.assertEqual(len(documents), 2)
@@ -385,32 +408,18 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         WHEN:
             - No separation pages are provided
         THEN:
-            - No new documents are produced
-            - A warning is logged
+            - Nothing happens
         """
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
-            with BarcodeReader(test_file, "application/pdf") as reader:
-                self.assertFalse(
-                    reader.separate(
-                        DocumentSource.ApiUpload,
-                        DocumentMetadataOverrides(),
-                    ),
-                )
-                self.assertEqual(
-                    cm.output,
-                    [
-                        "WARNING:paperless.barcodes:No pages to split on!",
-                    ],
-                )
+        with self.get_reader(test_file) as reader:
+            self.assertEqual("No pages to split on!", reader.run())
 
     @override_settings(
         CONSUMER_ENABLE_BARCODES=True,
         CONSUMER_BARCODE_TIFF_SUPPORT=True,
     )
-    @mock.patch("documents.consumer.Consumer.try_consume_file")
-    def test_consume_barcode_unsupported_jpg_file(self, m):
+    def test_consume_barcode_unsupported_jpg_file(self):
         """
         GIVEN:
             - JPEG image as input
@@ -422,35 +431,8 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.SAMPLE_DIR / "simple.jpg"
 
-        dst = settings.SCRATCH_DIR / "simple.jpg"
-        shutil.copy(test_file, dst)
-
-        with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
-            self.assertIn(
-                "Success",
-                tasks.consume_file(
-                    ConsumableDocument(
-                        source=DocumentSource.ConsumeFolder,
-                        original_file=dst,
-                    ),
-                    None,
-                ),
-            )
-
-        self.assertListEqual(
-            cm.output,
-            [
-                "WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg",
-            ],
-        )
-        m.assert_called_once()
-
-        args, kwargs = m.call_args
-        self.assertIsNone(kwargs["override_filename"])
-        self.assertIsNone(kwargs["override_title"])
-        self.assertIsNone(kwargs["override_correspondent_id"])
-        self.assertIsNone(kwargs["override_document_type_id"])
-        self.assertIsNone(kwargs["override_tag_ids"])
+        with self.get_reader(test_file) as reader:
+            self.assertFalse(reader.able_to_run)
 
     @override_settings(
         CONSUMER_ENABLE_BARCODES=True,
@@ -467,7 +449,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-2.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -504,7 +486,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
         """
         test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-1.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             separator_page_numbers = reader.get_separation_pages()
 
@@ -550,7 +532,7 @@ class TestBarcodeNewConsume(
 
         overrides = DocumentMetadataOverrides(tag_ids=[1, 2, 9])
 
-        with mock.patch("documents.tasks.async_to_sync") as progress_mocker:
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             self.assertEqual(
                 tasks.consume_file(
                     ConsumableDocument(
@@ -559,10 +541,8 @@ class TestBarcodeNewConsume(
                     ),
                     overrides,
                 ),
-                "File successfully split",
+                "Barcode splitting complete!",
             )
-            # We let the consumer know progress is done
-            progress_mocker.assert_called_once()
             # 2 new document consume tasks created
             self.assertEqual(self.consume_file_mock.call_count, 2)
 
@@ -580,7 +560,20 @@ class TestBarcodeNewConsume(
                 self.assertEqual(overrides, new_doc_overrides)
 
 
-class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
+class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, TestCase):
+    @contextmanager
+    def get_reader(self, filepath: Path) -> BarcodePlugin:
+        reader = BarcodePlugin(
+            ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath),
+            DocumentMetadataOverrides(),
+            DummyProgressManager(filepath.name, None),
+            self.dirs.scratch_dir,
+            "task-id",
+        )
+        reader.setup()
+        yield reader
+        reader.cleanup()
+
     @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
     def test_scan_file_for_asn_custom_prefix(self):
         """
@@ -594,7 +587,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
             - The ASN integer value is correct
         """
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             asn = reader.asn
 
             self.assertEqual(reader.pdf_file, test_file)
@@ -613,7 +606,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
         """
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             asn = reader.asn
 
             self.assertEqual(reader.pdf_file, test_file)
@@ -630,55 +623,12 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
         """
         test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             asn = reader.asn
 
             self.assertEqual(reader.pdf_file, test_file)
             self.assertEqual(asn, None)
 
-    @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
-    def test_scan_file_for_asn_already_exists(self):
-        """
-        GIVEN:
-            - PDF with an ASN barcode
-            - ASN value already exists
-        WHEN:
-            - File is scanned for barcodes
-        THEN:
-            - ASN is retrieved from the document
-            - Consumption fails
-        """
-
-        Document.objects.create(
-            title="WOW",
-            content="the content",
-            archive_serial_number=123,
-            checksum="456",
-            mime_type="application/pdf",
-        )
-
-        test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf"
-
-        dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf"
-        shutil.copy(test_file, dst)
-
-        with mock.patch("documents.consumer.Consumer._send_progress"):
-            with self.assertRaises(ConsumerError) as cm, self.assertLogs(
-                "paperless.consumer",
-                level="ERROR",
-            ) as logs_cm:
-                tasks.consume_file(
-                    ConsumableDocument(
-                        source=DocumentSource.ConsumeFolder,
-                        original_file=dst,
-                    ),
-                    None,
-                )
-            self.assertIn("Not consuming barcode-39-asn-123.pdf", str(cm.exception))
-            error_str = logs_cm.output[0]
-            expected_str = "ERROR:paperless.consumer:Not consuming barcode-39-asn-123.pdf: Given ASN already exists!"
-            self.assertEqual(expected_str, error_str)
-
     def test_scan_file_for_asn_barcode_invalid(self):
         """
         GIVEN:
@@ -692,7 +642,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
         """
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-invalid.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             asn = reader.asn
 
             self.assertEqual(reader.pdf_file, test_file)
@@ -718,7 +668,9 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
         dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf"
         shutil.copy(test_file, dst)
 
-        with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call:
+        with mock.patch(
+            "documents.consumer.Consumer.try_consume_file",
+        ) as mocked_consumer:
             tasks.consume_file(
                 ConsumableDocument(
                     source=DocumentSource.ConsumeFolder,
@@ -726,40 +678,11 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
                 ),
                 None,
             )
-
-            args, kwargs = mocked_call.call_args
+            mocked_consumer.assert_called_once()
+            args, kwargs = mocked_consumer.call_args
 
             self.assertEqual(kwargs["override_asn"], 123)
 
-    @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
-    def test_asn_too_large(self):
-        """
-        GIVEN:
-            - ASN from barcode enabled
-            - Barcode contains too large an ASN value
-        WHEN:
-            - ASN from barcode checked for correctness
-        THEN:
-            - Exception is raised regarding size limits
-        """
-        src = self.BARCODE_SAMPLE_DIR / "barcode-128-asn-too-large.pdf"
-
-        dst = self.dirs.scratch_dir / "barcode-128-asn-too-large.pdf"
-        shutil.copy(src, dst)
-
-        input_doc = ConsumableDocument(
-            source=DocumentSource.ConsumeFolder,
-            original_file=dst,
-        )
-
-        with mock.patch("documents.consumer.Consumer._send_progress"):
-            self.assertRaisesMessage(
-                ConsumerError,
-                "Given ASN 4294967296 is out of range [0, 4,294,967,295]",
-                tasks.consume_file,
-                input_doc,
-            )
-
     @override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
     def test_scan_file_for_qrcode_without_upscale(self):
         """
@@ -774,7 +697,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
 
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-asn-000123-upscale-dpi.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             self.assertEqual(len(reader.barcodes), 0)
 
@@ -796,7 +719,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
 
         test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-asn-000123-upscale-dpi.pdf"
 
-        with BarcodeReader(test_file, "application/pdf") as reader:
+        with self.get_reader(test_file) as reader:
             reader.detect()
             self.assertEqual(len(reader.barcodes), 1)
             self.assertEqual(reader.asn, 123)
index 88cbe7d87dd3020916741223182a2c21fc5a87f9..c665944910d04dbd736e163c555c8243f0614af3 100644 (file)
@@ -17,6 +17,7 @@ from documents.data_models import DocumentSource
 from documents.double_sided import STAGING_FILE_NAME
 from documents.double_sided import TIMEOUT_MINUTES
 from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import DummyProgressManager
 from documents.tests.utils import FileSystemAssertsMixin
 
 
@@ -42,9 +43,10 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         dst = self.dirs.double_sided_dir / dstname
         dst.parent.mkdir(parents=True, exist_ok=True)
         shutil.copy(src, dst)
-        with mock.patch("documents.tasks.async_to_sync"), mock.patch(
-            "documents.consumer.async_to_sync",
-        ):
+        with mock.patch(
+            "documents.tasks.ProgressManager",
+            DummyProgressManager,
+        ), mock.patch("documents.consumer.async_to_sync"):
             msg = tasks.consume_file(
                 ConsumableDocument(
                     source=DocumentSource.ConsumeFolder,
@@ -211,7 +213,7 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         """
         msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf")
         self.assertIsNotFile(self.staging_file)
-        self.assertRegex(msg, "Success. New document .* created")
+        self.assertRegex(msg, r"Success. New document id \d+ created")
 
     def test_subdirectory_upload(self):
         """
@@ -250,4 +252,4 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         """
         msg = self.consume_file("simple.pdf")
         self.assertIsNotFile(self.staging_file)
-        self.assertRegex(msg, "Success. New document .* created")
+        self.assertRegex(msg, r"Success. New document id \d+ created")
index b688eecc9af8556320ca5fdec626c7708025d0b0..e92a00682a4c6cdec3bde1859e204c66803f6b4f 100644 (file)
@@ -24,6 +24,7 @@ from documents.models import WorkflowAction
 from documents.models import WorkflowTrigger
 from documents.signals import document_consumption_finished
 from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import DummyProgressManager
 from documents.tests.utils import FileSystemAssertsMixin
 from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule
@@ -126,7 +127,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="INFO") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
@@ -203,7 +204,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
         w.save()
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="INFO") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
@@ -294,7 +295,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="INFO") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
@@ -356,7 +357,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="DEBUG") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
@@ -407,7 +408,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="DEBUG") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
@@ -468,7 +469,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="DEBUG") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
@@ -529,7 +530,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="DEBUG") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
@@ -591,7 +592,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="DEBUG") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
@@ -686,7 +687,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
 
         test_file = self.SAMPLE_DIR / "simple.pdf"
 
-        with mock.patch("documents.tasks.async_to_sync"):
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
             with self.assertLogs("paperless.matching", level="INFO") as cm:
                 tasks.consume_file(
                     ConsumableDocument(
index 0b6d8fcad54918b2b6a2d9f7dec919e2c7679817..4c3305d1329d31ffd830c0e4fcc764897898f75a 100644 (file)
@@ -9,6 +9,7 @@ from os import PathLike
 from pathlib import Path
 from typing import Any
 from typing import Callable
+from typing import Optional
 from typing import Union
 from unittest import mock
 
@@ -23,6 +24,7 @@ from django.test import override_settings
 from documents.data_models import ConsumableDocument
 from documents.data_models import DocumentMetadataOverrides
 from documents.parsers import ParseError
+from documents.plugins.helpers import ProgressStatusOptions
 
 
 def setup_directories():
@@ -146,6 +148,11 @@ def util_call_with_backoff(
 
 
 class DirectoriesMixin:
+    """
+    Creates and overrides settings for all folders and paths, then ensures
+    they are cleaned up on exit
+    """
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.dirs = None
@@ -160,6 +167,10 @@ class DirectoriesMixin:
 
 
 class FileSystemAssertsMixin:
+    """
+    Utilities for checks various state information of the file system
+    """
+
     def assertIsFile(self, path: Union[PathLike, str]):
         self.assertTrue(Path(path).resolve().is_file(), f"File does not exist: {path}")
 
@@ -188,6 +199,11 @@ class FileSystemAssertsMixin:
 
 
 class ConsumerProgressMixin:
+    """
+    Mocks the Consumer _send_progress, preventing attempts to connect to Redis
+    and allowing access to its calls for verification
+    """
+
     def setUp(self) -> None:
         self.send_progress_patcher = mock.patch(
             "documents.consumer.Consumer._send_progress",
@@ -310,3 +326,59 @@ class SampleDirMixin:
     SAMPLE_DIR = Path(__file__).parent / "samples"
 
     BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"
+
+
+class DummyProgressManager:
+    """
+    A dummy handler for progress management that doesn't actually try to
+    connect to Redis.  Payloads are stored for test assertions if needed.
+
+    Use it with
+      mock.patch("documents.tasks.ProgressManager", DummyProgressManager)
+    """
+
+    def __init__(self, filename: str, task_id: Optional[str] = None) -> None:
+        self.filename = filename
+        self.task_id = task_id
+        print("hello world")
+        self.payloads = []
+
+    def __enter__(self):
+        self.open()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def open(self) -> None:
+        pass
+
+    def close(self) -> None:
+        pass
+
+    def send_progress(
+        self,
+        status: ProgressStatusOptions,
+        message: str,
+        current_progress: int,
+        max_progress: int,
+        extra_args: Optional[dict[str, Union[str, int]]] = None,
+    ) -> None:
+        # Ensure the layer is open
+        self.open()
+
+        payload = {
+            "type": "status_update",
+            "data": {
+                "filename": self.filename,
+                "task_id": self.task_id,
+                "current_progress": current_progress,
+                "max_progress": max_progress,
+                "status": status,
+                "message": message,
+            },
+        }
+        if extra_args is not None:
+            payload["data"].update(extra_args)
+
+        self.payloads.append(payload)