import tempfile
from dataclasses import dataclass
from pathlib import Path
-from typing import Final
from typing import Optional
from django.conf import settings
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
-from documents.data_models import DocumentMetadataOverrides
-from documents.data_models import DocumentSource
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import StopConsumeTaskError
+from documents.plugins.helpers import ProgressStatusOptions
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
@dataclass(frozen=True)
class Barcode:
"""
- Holds the information about a single barcode and its location
+ Holds the information about a single barcode and its location in a document
"""
page: int
return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)
-class BarcodeReader:
- def __init__(self, filepath: Path, mime_type: str) -> None:
- self.file: Final[Path] = filepath
- self.mime: Final[str] = mime_type
- self.pdf_file: Path = self.file
- self.barcodes: list[Barcode] = []
- self._tiff_conversion_done = False
- self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
+class BarcodePlugin(ConsumeTaskPlugin):
+ NAME: str = "BarcodePlugin"
+ @property
+ def able_to_run(self) -> bool:
+ """
+ Able to run if:
+ - ASN from barcode detection is enabled or
+ - Barcode support is enabled and the mime type is supported
+ """
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
- self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"}
+ supported_mimes = {"application/pdf", "image/tiff"}
else:
- self.SUPPORTED_FILE_MIMES = {"application/pdf"}
+ supported_mimes = {"application/pdf"}
- def __enter__(self):
- if self.supported_mime_type:
- self.temp_dir = tempfile.TemporaryDirectory(prefix="paperless-barcodes")
- return self
+ return (
+ settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES
+ ) and self.input_doc.mime_type in supported_mimes
- def __exit__(self, exc_type, exc_val, exc_tb):
- if self.temp_dir is not None:
- self.temp_dir.cleanup()
- self.temp_dir = None
+ def setup(self):
+ self.temp_dir = tempfile.TemporaryDirectory(
+ dir=self.base_tmp_dir,
+ prefix="barcode",
+ )
+ self.pdf_file = self.input_doc.original_file
+ self._tiff_conversion_done = False
+ self.barcodes: list[Barcode] = []
- @property
- def supported_mime_type(self) -> bool:
- """
- Return True if the given mime type is supported for barcodes, false otherwise
- """
- return self.mime in self.SUPPORTED_FILE_MIMES
+ def run(self) -> Optional[str]:
+ # Maybe do the conversion of TIFF to PDF
+ self.convert_from_tiff_to_pdf()
- @property
- def asn(self) -> Optional[int]:
- """
- Search the parsed barcodes for any ASNs.
- The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
- is considered the ASN to be used.
- Returns the detected ASN (or None)
- """
- asn = None
+ # Locate any barcodes in the files
+ self.detect()
- if not self.supported_mime_type:
- return None
+ # Update/overwrite an ASN if possible
+ located_asn = self.asn
+ if located_asn is not None:
+ logger.info(f"Found ASN in barcode: {located_asn}")
+ self.metadata.asn = located_asn
- # Ensure the barcodes have been read
- self.detect()
+ separator_pages = self.get_separation_pages()
+ if not separator_pages:
+ return "No pages to split on!"
- # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
- asn_text = next(
- (x.value for x in self.barcodes if x.is_asn),
- None,
- )
+ # We have pages to split against
- if asn_text:
- logger.debug(f"Found ASN Barcode: {asn_text}")
- # remove the prefix and remove whitespace
- asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
+ # Note this does NOT use the base_temp_dir, as that will be removed
+ tmp_dir = Path(
+ tempfile.mkdtemp(
+ dir=settings.SCRATCH_DIR,
+ prefix="paperless-barcode-split-",
+ ),
+ ).resolve()
- # remove non-numeric parts of the remaining string
- asn_text = re.sub(r"\D", "", asn_text)
+ from documents import tasks
- # now, try parsing the ASN number
- try:
- asn = int(asn_text)
- except ValueError as e:
- logger.warning(f"Failed to parse ASN number because: {e}")
+ # Create the split document tasks
+ for new_document in self.separate_pages(separator_pages):
+ copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
- return asn
+ task = tasks.consume_file.delay(
+ ConsumableDocument(
+ # Same source, for templates
+ source=self.input_doc.source,
+ mailrule_id=self.input_doc.mailrule_id,
+ # Can't use same folder or the consume might grab it again
+ original_file=(tmp_dir / new_document.name).resolve(),
+ ),
+ # All the same metadata
+ self.metadata,
+ )
+ logger.info(f"Created new task {task.id} for {new_document.name}")
+
+ # This file is now two or more files
+ self.input_doc.original_file.unlink()
+
+ msg = "Barcode splitting complete!"
+
+ # Update the progress to complete
+ self.status_mgr.send_progress(ProgressStatusOptions.SUCCESS, msg, 100, 100)
+
+ # Request the consume task stops
+ raise StopConsumeTaskError(msg)
+
+ def cleanup(self) -> None:
+ self.temp_dir.cleanup()
+
+ def convert_from_tiff_to_pdf(self):
+ """
+ May convert a TIFF image into a PDF, if the input is a TIFF and
+ the TIFF has not been made into a PDF
+ """
+ # Nothing to do, pdf_file is already assigned correctly
+ if self.input_doc.mime_type != "image/tiff" or self._tiff_conversion_done:
+ return
+
+ self.pdf_file = convert_from_tiff_to_pdf(
+ self.input_doc.original_file,
+ Path(self.temp_dir.name),
+ )
+ self._tiff_conversion_done = True
@staticmethod
- def read_barcodes_zxing(image: Image) -> list[str]:
+ def read_barcodes_zxing(image: Image.Image) -> list[str]:
barcodes = []
import zxingcpp
return barcodes
@staticmethod
- def read_barcodes_pyzbar(image: Image) -> list[str]:
+ def read_barcodes_pyzbar(image: Image.Image) -> list[str]:
barcodes = []
from pyzbar import pyzbar
return barcodes
- def convert_from_tiff_to_pdf(self):
- """
- May convert a TIFF image into a PDF, if the input is a TIFF and
- the TIFF has not been made into a PDF
- """
- # Nothing to do, pdf_file is already assigned correctly
- if self.mime != "image/tiff" or self._tiff_conversion_done:
- return
-
- self._tiff_conversion_done = True
- self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
-
def detect(self) -> None:
"""
Scan all pages of the PDF as images, updating barcodes and the pages
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
- logger.exception(
+ logger.warning(
f"Exception during barcode scanning: {e}",
)
+ @property
+ def asn(self) -> Optional[int]:
+ """
+ Search the parsed barcodes for any ASNs.
+ The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
+ is considered the ASN to be used.
+ Returns the detected ASN (or None)
+ """
+ asn = None
+
+ # Ensure the barcodes have been read
+ self.detect()
+
+ # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
+ asn_text = next(
+ (x.value for x in self.barcodes if x.is_asn),
+ None,
+ )
+
+ if asn_text:
+ logger.debug(f"Found ASN Barcode: {asn_text}")
+ # remove the prefix and remove whitespace
+ asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
+
+ # remove non-numeric parts of the remaining string
+ asn_text = re.sub(r"\D", "", asn_text)
+
+ # now, try parsing the ASN number
+ try:
+ asn = int(asn_text)
+ except ValueError as e:
+ logger.warning(f"Failed to parse ASN number because: {e}")
+
+ return asn
+
def get_separation_pages(self) -> dict[int, bool]:
"""
Search the parsed barcodes for separators and returns a dict of page
"""
document_paths = []
- fname = self.file.stem
+ fname = self.input_doc.original_file.stem
with Pdf.open(self.pdf_file) as input_pdf:
# Start with an empty document
current_document: list[Page] = []
with open(savepath, "wb") as out:
dst.save(out)
- copy_basic_file_stats(self.file, savepath)
+ copy_basic_file_stats(self.input_doc.original_file, savepath)
document_paths.append(savepath)
return document_paths
-
- def separate(
- self,
- source: DocumentSource,
- overrides: DocumentMetadataOverrides,
- ) -> bool:
- """
- Separates the document, based on barcodes and configuration, creating new
- documents as required in the appropriate location.
-
- Returns True if a split happened, False otherwise
- """
- # Do nothing
- if not self.supported_mime_type:
- logger.warning(f"Unsupported file format for barcode reader: {self.mime}")
- return False
-
- # Does nothing unless needed
- self.convert_from_tiff_to_pdf()
-
- # Actually read the codes, if any
- self.detect()
-
- separator_pages = self.get_separation_pages()
-
- # Also do nothing
- if not separator_pages:
- logger.warning("No pages to split on!")
- return False
-
- tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve()
-
- from documents import tasks
-
- # Create the split document tasks
- for new_document in self.separate_pages(separator_pages):
- copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
-
- tasks.consume_file.delay(
- ConsumableDocument(
- # Same source, for templates
- source=source,
- # Can't use same folder or the consume might grab it again
- original_file=(tmp_dir / new_document.name).resolve(),
- ),
- # All the same metadata
- overrides,
- )
- logger.info("Barcode splitting complete!")
- return True
from rest_framework.reverse import reverse
from documents.classifier import load_classifier
-from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date
from documents.permissions import set_permissions_for_object
+from documents.plugins.base import AlwaysRunPluginMixin
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import NoCleanupPluginMixin
+from documents.plugins.base import NoSetupPluginMixin
from documents.signals import document_consumption_finished
from documents.signals import document_consumption_started
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
class WorkflowTriggerPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
    AlwaysRunPluginMixin,
    ConsumeTaskPlugin,
):
    NAME: str = "WorkflowTriggerPlugin"

    def run(self) -> Optional[str]:
        """
        Collect metadata overrides from all enabled workflows which match the
        incoming document, then apply them to the document metadata.

        Workflows are evaluated in their configured "order"; each matching
        workflow's overrides are merged via DocumentMetadataOverrides.update().
        """
        overrides = DocumentMetadataOverrides()
        for workflow in Workflow.objects.filter(enabled=True).order_by("order"):
            if not document_matches_workflow(
                self.input_doc,
                workflow,
                WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
            ):
                continue
            # Accumulate all of this workflow's actions before merging, so a
            # later action of the same workflow overwrites an earlier one
            template_overrides = DocumentMetadataOverrides()
            for action in workflow.actions.all():
                self._apply_action(action, template_overrides)
            overrides.update(template_overrides)
        self.metadata.update(overrides)

    @staticmethod
    def _apply_action(action, overrides: DocumentMetadataOverrides) -> None:
        """
        Copy each configured assignment of a single workflow action onto the
        given overrides, skipping unset (None) fields.
        """
        if action.assign_title is not None:
            overrides.title = action.assign_title
        if action.assign_tags is not None:
            overrides.tag_ids = [tag.pk for tag in action.assign_tags.all()]
        if action.assign_correspondent is not None:
            overrides.correspondent_id = action.assign_correspondent.pk
        if action.assign_document_type is not None:
            overrides.document_type_id = action.assign_document_type.pk
        if action.assign_storage_path is not None:
            overrides.storage_path_id = action.assign_storage_path.pk
        if action.assign_owner is not None:
            overrides.owner_id = action.assign_owner.pk
        if action.assign_view_users is not None:
            overrides.view_users = [
                user.pk for user in action.assign_view_users.all()
            ]
        if action.assign_view_groups is not None:
            overrides.view_groups = [
                group.pk for group in action.assign_view_groups.all()
            ]
        if action.assign_change_users is not None:
            overrides.change_users = [
                user.pk for user in action.assign_change_users.all()
            ]
        if action.assign_change_groups is not None:
            overrides.change_groups = [
                group.pk for group in action.assign_change_groups.all()
            ]
        if action.assign_custom_fields is not None:
            overrides.custom_field_ids = [
                field.pk for field in action.assign_custom_fields.all()
            ]
+
+
class ConsumerError(Exception):
    """
    Signals that a document could not be consumed and processing must stop.
    """
return document
- def get_workflow_overrides(
- self,
- input_doc: ConsumableDocument,
- ) -> DocumentMetadataOverrides:
- """
- Get overrides from matching workflows
- """
- overrides = DocumentMetadataOverrides()
- for workflow in Workflow.objects.filter(enabled=True).order_by("order"):
- template_overrides = DocumentMetadataOverrides()
-
- if document_matches_workflow(
- input_doc,
- workflow,
- WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
- ):
- for action in workflow.actions.all():
- self.log.info(
- f"Applying overrides in {action} from {workflow}",
- )
- if action.assign_title is not None:
- template_overrides.title = action.assign_title
- if action.assign_tags is not None:
- template_overrides.tag_ids = [
- tag.pk for tag in action.assign_tags.all()
- ]
- if action.assign_correspondent is not None:
- template_overrides.correspondent_id = (
- action.assign_correspondent.pk
- )
- if action.assign_document_type is not None:
- template_overrides.document_type_id = (
- action.assign_document_type.pk
- )
- if action.assign_storage_path is not None:
- template_overrides.storage_path_id = (
- action.assign_storage_path.pk
- )
- if action.assign_owner is not None:
- template_overrides.owner_id = action.assign_owner.pk
- if action.assign_view_users is not None:
- template_overrides.view_users = [
- user.pk for user in action.assign_view_users.all()
- ]
- if action.assign_view_groups is not None:
- template_overrides.view_groups = [
- group.pk for group in action.assign_view_groups.all()
- ]
- if action.assign_change_users is not None:
- template_overrides.change_users = [
- user.pk for user in action.assign_change_users.all()
- ]
- if action.assign_change_groups is not None:
- template_overrides.change_groups = [
- group.pk for group in action.assign_change_groups.all()
- ]
- if action.assign_custom_fields is not None:
- template_overrides.custom_field_ids = [
- field.pk for field in action.assign_custom_fields.all()
- ]
-
- overrides.update(template_overrides)
- return overrides
-
def _parse_title_placeholders(self, title: str) -> str:
local_added = timezone.localtime(timezone.now())
import os
import shutil
from pathlib import Path
+from typing import Final
+from typing import Optional
from django.conf import settings
from pikepdf import Pdf
from documents.consumer import ConsumerError
from documents.converters import convert_from_tiff_to_pdf
-from documents.data_models import ConsumableDocument
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import NoCleanupPluginMixin
+from documents.plugins.base import NoSetupPluginMixin
+from documents.plugins.base import StopConsumeTaskError
logger = logging.getLogger("paperless.double_sided")
# Hardcoded for now, could be made a configurable setting if needed
-TIMEOUT_MINUTES = 30
+TIMEOUT_MINUTES: Final[int] = 30
+TIMEOUT_SECONDS: Final[int] = TIMEOUT_MINUTES * 60
# Used by test cases
STAGING_FILE_NAME = "double-sided-staging.pdf"
-def collate(input_doc: ConsumableDocument) -> str:
- """
- Tries to collate pages from 2 single sided scans of a double sided
- document.
-
- When called with a file, it checks whether or not a staging file
- exists, if not, the current file is turned into that staging file
- containing the odd numbered pages.
-
- If a staging file exists, and it is not too old, the current file is
- considered to be the second part (the even numbered pages) and it will
- collate the pages of both, the pages of the second file will be added
- in reverse order, since the ADF will have scanned the pages from bottom
- to top.
-
- Returns a status message on success, or raises a ConsumerError
- in case of failure.
- """
-
- # Make sure scratch dir exists, Consumer might not have run yet
- settings.SCRATCH_DIR.mkdir(exist_ok=True)
-
- if input_doc.mime_type == "application/pdf":
- pdf_file = input_doc.original_file
- elif (
- input_doc.mime_type == "image/tiff"
- and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
- ):
- pdf_file = convert_from_tiff_to_pdf(
- input_doc.original_file,
- settings.SCRATCH_DIR,
class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin):
    """
    Collates pages from 2 single sided scans of a double sided document.
    """

    NAME: str = "CollatePlugin"

    @property
    def able_to_run(self) -> bool:
        # Only runs when collation is enabled and the file arrived via the
        # configured double-sided subdirectory of the consumption folder
        return (
            settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED
            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
            in self.input_doc.original_file.parts
        )

    def run(self) -> Optional[str]:
        """
        Tries to collate pages from 2 single sided scans of a double sided
        document.

        When called with a file, it checks whether or not a staging file
        exists, if not, the current file is turned into that staging file
        containing the odd numbered pages.

        If a staging file exists, and it is not too old, the current file is
        considered to be the second part (the even numbered pages) and it will
        collate the pages of both, the pages of the second file will be added
        in reverse order, since the ADF will have scanned the pages from bottom
        to top.

        Raises StopConsumeTaskError with a status message on success, or
        ConsumerError in case of failure.
        """

        pdf_file = self._get_pdf()

        staging: Path = settings.SCRATCH_DIR / STAGING_FILE_NAME

        if self._valid_staging_exists(staging):
            try:
                # Raises StopConsumeTaskError when the merge succeeds
                self._collate_with_staging(staging, pdf_file)
            finally:
                # Delete staging and recently uploaded file no matter what.
                # If any error occurs, the user needs to be able to restart
                # the process from scratch; after all, the staging file
                # with the odd numbered pages might be the culprit
                pdf_file.unlink()
                staging.unlink()
        else:
            self._move_to_staging(staging, pdf_file)

    def _get_pdf(self) -> Path:
        """
        Return the uploaded document as a PDF, converting a TIFF when that is
        enabled.  Raises ConsumerError for any other file type.
        """
        if self.input_doc.mime_type == "application/pdf":
            return self.input_doc.original_file
        if (
            self.input_doc.mime_type == "image/tiff"
            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
        ):
            pdf_file = convert_from_tiff_to_pdf(
                self.input_doc.original_file,
                self.base_tmp_dir,
            )
            # The original TIFF is no longer needed once converted
            self.input_doc.original_file.unlink()
            return pdf_file
        raise ConsumerError(
            "Unsupported file type for collation of double-sided scans",
        )

    @staticmethod
    def _valid_staging_exists(staging: Path) -> bool:
        """
        True if a staging file exists and is younger than the timeout;
        an outdated staging file is deleted.
        """
        if not staging.exists():
            return False
        stats = staging.stat()
        # if the file is older than the timeout, we don't consider
        # it valid
        if (dt.datetime.now().timestamp() - stats.st_mtime) > TIMEOUT_SECONDS:
            logger.warning("Outdated double sided staging file exists, deleting it")
            staging.unlink()
            return False
        return True

    def _collate_with_staging(self, staging: Path, pdf_file: Path) -> None:
        """
        Interleave the even numbered pages (pdf_file, scanned in reverse) into
        the staged odd numbered pages and write the merged PDF back into the
        consumption directory.  Raises StopConsumeTaskError on success.
        """
        # Collate pages from second PDF in reverse order
        with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
            pdf2.pages.reverse()
            try:
                for i, page in enumerate(pdf2.pages):
                    pdf1.pages.insert(2 * i + 1, page)
            except IndexError:
                raise ConsumerError(
                    "This second file (even numbered pages) contains more "
                    "pages than the first/odd numbered one. This means the "
                    "two uploaded files don't belong to the same double-"
                    "sided scan. Please retry, starting with the odd "
                    "numbered pages again.",
                )
            # Merged file has the same path, but without the
            # double-sided subdir. Therefore, it is also in the
            # consumption dir and will be picked up for processing
            old_file = self.input_doc.original_file
            new_file = Path(
                *(
                    part
                    for part in old_file.with_name(
                        f"{old_file.stem}-collated.pdf",
                    ).parts
                    if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
                ),
            )
            # If the user didn't create the subdirs yet, do it for them
            new_file.parent.mkdir(parents=True, exist_ok=True)
            pdf1.save(new_file)
            logger.info("Collated documents into new file %s", new_file)
            raise StopConsumeTaskError(
                "Success. Even numbered pages of double sided scan collated "
                "with odd pages",
            )

    @staticmethod
    def _move_to_staging(staging: Path, pdf_file: Path) -> None:
        """
        Store the odd numbered pages as the staging file and wait for the
        second upload.  Always raises StopConsumeTaskError.
        """
        shutil.move(pdf_file, staging)
        # update access to modification time so we know if the file
        # is outdated when another file gets uploaded
        timestamp = dt.datetime.now().timestamp()
        os.utime(staging, (timestamp, timestamp))
        logger.info(
            "Got scan with odd numbered pages of double-sided scan, moved it to %s",
            staging,
        )
        raise StopConsumeTaskError(
            "Received odd numbered pages of double sided scan, waiting up to "
            f"{TIMEOUT_MINUTES} minutes for even numbered pages",
        )
- finally:
- # Delete staging and recently uploaded file no matter what.
- # If any error occurs, the user needs to be able to restart
- # the process from scratch; after all, the staging file
- # with the odd numbered pages might be the culprit
- pdf_file.unlink()
- staging.unlink()
-
- else:
- shutil.move(pdf_file, staging)
- # update access to modification time so we know if the file
- # is outdated when another file gets uploaded
- os.utime(staging, (dt.datetime.now().timestamp(),) * 2)
- logger.info(
- "Got scan with odd numbered pages of double-sided scan, moved it to %s",
- staging,
- )
- return (
- "Received odd numbered pages of double sided scan, waiting up to "
- f"{TIMEOUT_MINUTES} minutes for even numbered pages"
- )
--- /dev/null
+import abc
+from pathlib import Path
+from typing import Final
+from typing import Optional
+
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
+from documents.plugins.helpers import ProgressManager
+
+
class StopConsumeTaskError(Exception):
    """
    Raised by a plugin's setup or run to exit the asynchronous consume task.

    Typically this means the plugin has scheduled one or more replacement
    tasks, for example after a barcode split created new documents.
    """

    def __init__(self, message: str) -> None:
        super().__init__(message)
        # Kept as an attribute so callers can read the message directly
        self.message = message
+
+
class ConsumeTaskPlugin(abc.ABC):
    """
    Defines the interface for a plugin for the document consume task
    Meanings as per RFC2119 (https://datatracker.ietf.org/doc/html/rfc2119)

    Plugin Implementation

    The plugin SHALL implement property able_to_run and methods setup, run and cleanup.
    The plugin property able_to_run SHALL return True if the plugin is able to run, given the conditions, settings and document information.
    The plugin property able_to_run MAY be hardcoded to return True.
    The plugin setup SHOULD perform any resource creation or additional initialization needed to run the document.
    The plugin setup MAY be a non-operation.
    The plugin cleanup SHOULD perform resource cleanup, including in the event of an error.
    The plugin cleanup MAY be a non-operation.
    The plugin run SHALL perform any operations against the document or system state required for the plugin.
    The plugin run MAY update the document metadata.
    The plugin run MAY return an informational message.
    The plugin run MAY raise StopConsumeTaskError to cease any further operations against the document.

    Plugin Manager Implementation

    The plugin manager SHALL provide the plugin with the input document, document metadata, progress manager and a created temporary directory.
    The plugin manager SHALL execute the plugin setup, run and cleanup, in that order IF the plugin property able_to_run is True.
    The plugin manager SHOULD log the return message of executing a plugin's run.
    The plugin manager SHALL always execute the plugin cleanup, IF the plugin property able_to_run is True.
    The plugin manager SHALL cease calling plugins and exit the task IF a plugin raises StopConsumeTaskError.
    The plugin manager SHOULD return the StopConsumeTaskError message IF a plugin raises StopConsumeTaskError.
    """

    NAME: str = "ConsumeTaskPlugin"

    def __init__(
        self,
        input_doc: ConsumableDocument,
        metadata: DocumentMetadataOverrides,
        status_mgr: ProgressManager,
        base_tmp_dir: Path,
        task_id: str,
    ) -> None:
        super().__init__()
        # The document being consumed and the metadata overrides to apply to it
        self.input_doc = input_doc
        self.metadata = metadata
        # Directory under which plugins may create their own temporary files
        self.base_tmp_dir: Final = base_tmp_dir
        # Used to report progress updates to the UI
        self.status_mgr = status_mgr
        self.task_id: Final = task_id

    # abc.abstractproperty is deprecated since Python 3.3; stacking
    # @property over @abc.abstractmethod is the supported equivalent
    @property
    @abc.abstractmethod
    def able_to_run(self) -> bool:
        """
        Return True if the conditions are met for the plugin to run, False otherwise

        If False, setup(), run() and cleanup() will not be called
        """

    @abc.abstractmethod
    def setup(self) -> None:
        """
        Allows the plugin to perform any additional setup it may need, such as creating
        a temporary directory, copying a file somewhere, etc.

        Executed before run()

        In general, this should be the "light" work, not the bulk of processing
        """

    @abc.abstractmethod
    def run(self) -> Optional[str]:
        """
        The bulk of plugin processing, this does whatever action the plugin is for.

        Executed after setup() and before cleanup()
        """

    @abc.abstractmethod
    def cleanup(self) -> None:
        """
        Allows the plugin to execute any cleanup it may require

        Executed after run(), even in the case of error
        """
+
+
class AlwaysRunPluginMixin(ConsumeTaskPlugin):
    """
    Mixin for plugins whose run condition is unconditionally satisfied.
    """

    @property
    def able_to_run(self) -> bool:
        # No preconditions: this plugin runs for every document
        return True
+
+
class NoSetupPluginMixin(ConsumeTaskPlugin):
    """
    Mixin for plugins that need no preparation before run().
    """

    def setup(self) -> None:
        # Intentionally a no-op
        pass
+
+
class NoCleanupPluginMixin(ConsumeTaskPlugin):
    """
    Mixin for plugins that leave nothing behind to clean up.
    """

    def cleanup(self) -> None:
        # Intentionally a no-op
        pass
--- /dev/null
+import enum
+from typing import TYPE_CHECKING
+from typing import Optional
+from typing import Union
+
+from asgiref.sync import async_to_sync
+from channels.layers import get_channel_layer
+from channels_redis.pubsub import RedisPubSubChannelLayer
+
+
class ProgressStatusOptions(str, enum.Enum):
    """
    States reported with progress updates; subclassing str lets members be
    sent over the channel layer as plain strings.
    """

    STARTED = "STARTED"
    WORKING = "WORKING"
    SUCCESS = "SUCCESS"
    FAILED = "FAILED"
+
+
class ProgressManager:
    """
    Handles sending of progress information via the channel layer, with proper management
    of the open/close of the layer to ensure messages go out and everything is cleaned up
    """

    def __init__(self, filename: str, task_id: Optional[str] = None) -> None:
        self.filename = filename
        # Lazily opened channel layer; None until open() is called
        self._channel: Optional[RedisPubSubChannelLayer] = None
        self.task_id = task_id

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def open(self) -> None:
        """
        If not already opened, gets the default channel layer
        opened and ready to send messages
        """
        if self._channel is None:
            self._channel = get_channel_layer()

    def close(self) -> None:
        """
        If it was opened, flushes the channel layer
        """
        if self._channel is not None:
            # async_to_sync() only builds the sync wrapper; it must be called
            # for the flush coroutine to actually execute
            async_to_sync(self._channel.flush)()
            self._channel = None

    def send_progress(
        self,
        status: ProgressStatusOptions,
        message: str,
        current_progress: int,
        max_progress: int,
        extra_args: Optional[dict[str, Union[str, int]]] = None,
    ) -> None:
        """
        Send a single progress update to the "status_updates" group.

        extra_args, when given, is merged into the payload's data dict and may
        overwrite the standard fields.
        """
        # Ensure the layer is open
        self.open()

        # Just for IDEs
        if TYPE_CHECKING:
            assert self._channel is not None

        payload = {
            "type": "status_update",
            "data": {
                "filename": self.filename,
                "task_id": self.task_id,
                "current_progress": current_progress,
                "max_progress": max_progress,
                "status": status,
                "message": message,
            },
        }
        if extra_args is not None:
            payload["data"].update(extra_args)

        # Construct and send the update
        async_to_sync(self._channel.group_send)("status_updates", payload)
import logging
import shutil
import uuid
+from pathlib import Path
+from tempfile import TemporaryDirectory
from typing import Optional
import tqdm
-from asgiref.sync import async_to_sync
from celery import Task
from celery import shared_task
-from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
from django.db.models.signals import post_save
from filelock import FileLock
-from redis.exceptions import ConnectionError
from whoosh.writing import AsyncWriter
from documents import index
from documents import sanity_checker
-from documents.barcodes import BarcodeReader
+from documents.barcodes import BarcodePlugin
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
+from documents.consumer import WorkflowTriggerPlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
-from documents.double_sided import collate
+from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import ProgressManager
+from documents.plugins.base import StopConsumeTaskError
+from documents.plugins.helpers import ProgressStatusOptions
from documents.sanity_checker import SanityCheckFailedException
from documents.signals import document_updated
input_doc: ConsumableDocument,
overrides: Optional[DocumentMetadataOverrides] = None,
):
- def send_progress(status="SUCCESS", message="finished"):
- payload = {
- "filename": overrides.filename or input_doc.original_file.name,
- "task_id": None,
- "current_progress": 100,
- "max_progress": 100,
- "status": status,
- "message": message,
- }
- try:
- async_to_sync(get_channel_layer().group_send)(
- "status_updates",
- {"type": "status_update", "data": payload},
- )
- except ConnectionError as e:
- logger.warning(f"ConnectionError on status send: {e!s}")
-
# Default no overrides
if overrides is None:
overrides = DocumentMetadataOverrides()
- # Handle collation of double-sided documents scanned in two parts
- if settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED and (
- settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
- in input_doc.original_file.parts
- ):
- try:
- msg = collate(input_doc)
- send_progress(message=msg)
- return msg
- except ConsumerError as e:
- send_progress(status="FAILURE", message=e.args[0])
- raise e
-
- # read all barcodes in the current document
- if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
- with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
- if settings.CONSUMER_ENABLE_BARCODES and reader.separate(
- input_doc.source,
+ plugins: list[type[ConsumeTaskPlugin]] = [
+ CollatePlugin,
+ BarcodePlugin,
+ WorkflowTriggerPlugin,
+ ]
+
+ with ProgressManager(
+ overrides.filename or input_doc.original_file.name,
+ self.request.id,
+ ) as status_mgr, TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
+ tmp_dir = Path(tmp_dir)
+ for plugin_class in plugins:
+ plugin_name = plugin_class.NAME
+
+ plugin = plugin_class(
+ input_doc,
overrides,
- ):
- # notify the sender, otherwise the progress bar
- # in the UI stays stuck
- send_progress()
- # consuming stops here, since the original document with
- # the barcodes has been split and will be consumed separately
- input_doc.original_file.unlink()
- return "File successfully split"
-
- # try reading the ASN from barcode
- if (
- settings.CONSUMER_ENABLE_ASN_BARCODE
- and (located_asn := reader.asn) is not None
- ):
- # Note this will take precedence over an API provided ASN
- # But it's from a physical barcode, so that's good
- overrides.asn = located_asn
- logger.info(f"Found ASN in barcode: {overrides.asn}")
-
- template_overrides = Consumer().get_workflow_overrides(
- input_doc=input_doc,
- )
+ status_mgr,
+ tmp_dir,
+ self.request.id,
+ )
+
+ if not plugin.able_to_run:
+ logger.debug(f"Skipping plugin {plugin_name}")
+ continue
+
+ try:
+ logger.debug(f"Executing plugin {plugin_name}")
+ plugin.setup()
+
+ msg = plugin.run()
+
+ if msg is not None:
+ logger.info(f"{plugin_name} completed with: {msg}")
+ else:
+ logger.info(f"{plugin_name} completed with no message")
+
+ overrides = plugin.metadata
+
+ except StopConsumeTaskError as e:
+ logger.info(f"{plugin_name} requested task exit: {e.message}")
+ return e.message
+
+ except Exception as e:
+ logger.exception(f"{plugin_name} failed: {e}")
+ status_mgr.send_progress(ProgressStatusOptions.FAILED, f"{e}", 100, 100)
+ raise
- overrides.update(template_overrides)
+ finally:
+ plugin.cleanup()
# continue with consumption if no barcode was found
document = Consumer().try_consume_file(
import shutil
+from collections.abc import Generator
+from contextlib import contextmanager
+from pathlib import Path
from unittest import mock
import pytest
from django.test import override_settings
from documents import tasks
-from documents.barcodes import BarcodeReader
-from documents.consumer import ConsumerError
+from documents.barcodes import BarcodePlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
-from documents.models import Document
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DocumentConsumeDelayMixin
+from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
HAS_ZXING_LIB = False
+class GetReaderPluginMixin:
+ """
+ Test mixin providing a context manager that builds a BarcodePlugin for the
+ given file (ConsumeFolder source, dummy progress manager, scratch dir),
+ runs its setup() before yielding and its cleanup() on exit
+ """
+
+ @contextmanager
+ def get_reader(self, filepath: Path) -> Generator[BarcodePlugin, None, None]:
+ reader = BarcodePlugin(
+ ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath),
+ DocumentMetadataOverrides(),
+ DummyProgressManager(filepath.name, None),
+ self.dirs.scratch_dir,
+ "task-id",
+ )
+ reader.setup()
+ yield reader
+ reader.cleanup()
+
+
@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
-class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, TestCase):
+class TestBarcode(
+ DirectoriesMixin,
+ FileSystemAssertsMixin,
+ SampleDirMixin,
+ GetReaderPluginMixin,
+ TestCase,
+):
def test_scan_file_for_separating_barcodes(self):
"""
GIVEN:
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
- with BarcodeReader(test_file, "image/tiff") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
- with BarcodeReader(test_file, "image/tiff") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
- No pages to split on
"""
test_file = self.SAMPLE_DIR / "simple.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "several-patcht-codes.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
]:
test_file = self.BARCODE_SAMPLE_DIR / test_file
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-unreadable.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-fax-image.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-qr.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-custom.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-128-custom.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "many-qr-codes.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.SAMPLE_DIR / "password-is-test.pdf"
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
warning = cm.output[0]
expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
documents = reader.separate_pages({1: False})
self.assertEqual(reader.pdf_file, test_file)
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-double.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
documents = reader.separate_pages({1: False, 2: False})
self.assertEqual(len(documents), 2)
WHEN:
- No separation pages are provided
THEN:
- - No new documents are produced
- - A warning is logged
+ - Nothing happens
"""
test_file = self.SAMPLE_DIR / "simple.pdf"
- with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
- with BarcodeReader(test_file, "application/pdf") as reader:
- self.assertFalse(
- reader.separate(
- DocumentSource.ApiUpload,
- DocumentMetadataOverrides(),
- ),
- )
- self.assertEqual(
- cm.output,
- [
- "WARNING:paperless.barcodes:No pages to split on!",
- ],
- )
+ with self.get_reader(test_file) as reader:
+ self.assertEqual("No pages to split on!", reader.run())
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
- @mock.patch("documents.consumer.Consumer.try_consume_file")
- def test_consume_barcode_unsupported_jpg_file(self, m):
+ def test_consume_barcode_unsupported_jpg_file(self):
"""
GIVEN:
- JPEG image as input
"""
test_file = self.SAMPLE_DIR / "simple.jpg"
- dst = settings.SCRATCH_DIR / "simple.jpg"
- shutil.copy(test_file, dst)
-
- with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
- self.assertIn(
- "Success",
- tasks.consume_file(
- ConsumableDocument(
- source=DocumentSource.ConsumeFolder,
- original_file=dst,
- ),
- None,
- ),
- )
-
- self.assertListEqual(
- cm.output,
- [
- "WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg",
- ],
- )
- m.assert_called_once()
-
- args, kwargs = m.call_args
- self.assertIsNone(kwargs["override_filename"])
- self.assertIsNone(kwargs["override_title"])
- self.assertIsNone(kwargs["override_correspondent_id"])
- self.assertIsNone(kwargs["override_document_type_id"])
- self.assertIsNone(kwargs["override_tag_ids"])
+ with self.get_reader(test_file) as reader:
+ self.assertFalse(reader.able_to_run)
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
"""
test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-2.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
"""
test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-1.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
overrides = DocumentMetadataOverrides(tag_ids=[1, 2, 9])
- with mock.patch("documents.tasks.async_to_sync") as progress_mocker:
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
),
overrides,
),
- "File successfully split",
+ "Barcode splitting complete!",
)
- # We let the consumer know progress is done
- progress_mocker.assert_called_once()
# 2 new document consume tasks created
self.assertEqual(self.consume_file_mock.call_count, 2)
self.assertEqual(overrides, new_doc_overrides)
-class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
+class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, TestCase):
+ # NOTE(review): this duplicates GetReaderPluginMixin.get_reader, which this
+ # class already inherits (see class bases) — consider dropping the override
+ @contextmanager
+ def get_reader(self, filepath: Path) -> Generator[BarcodePlugin, None, None]:
+ """
+ Yield a BarcodePlugin for filepath with setup() applied; cleanup() runs
+ when the context exits
+ """
+ reader = BarcodePlugin(
+ ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath),
+ DocumentMetadataOverrides(),
+ DummyProgressManager(filepath.name, None),
+ self.dirs.scratch_dir,
+ "task-id",
+ )
+ reader.setup()
+ yield reader
+ reader.cleanup()
+
@override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
def test_scan_file_for_asn_custom_prefix(self):
"""
- The ASN integer value is correct
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
asn = reader.asn
self.assertEqual(reader.pdf_file, test_file)
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
asn = reader.asn
self.assertEqual(reader.pdf_file, test_file)
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
asn = reader.asn
self.assertEqual(reader.pdf_file, test_file)
self.assertEqual(asn, None)
- @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
- def test_scan_file_for_asn_already_exists(self):
- """
- GIVEN:
- - PDF with an ASN barcode
- - ASN value already exists
- WHEN:
- - File is scanned for barcodes
- THEN:
- - ASN is retrieved from the document
- - Consumption fails
- """
-
- Document.objects.create(
- title="WOW",
- content="the content",
- archive_serial_number=123,
- checksum="456",
- mime_type="application/pdf",
- )
-
- test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf"
-
- dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf"
- shutil.copy(test_file, dst)
-
- with mock.patch("documents.consumer.Consumer._send_progress"):
- with self.assertRaises(ConsumerError) as cm, self.assertLogs(
- "paperless.consumer",
- level="ERROR",
- ) as logs_cm:
- tasks.consume_file(
- ConsumableDocument(
- source=DocumentSource.ConsumeFolder,
- original_file=dst,
- ),
- None,
- )
- self.assertIn("Not consuming barcode-39-asn-123.pdf", str(cm.exception))
- error_str = logs_cm.output[0]
- expected_str = "ERROR:paperless.consumer:Not consuming barcode-39-asn-123.pdf: Given ASN already exists!"
- self.assertEqual(expected_str, error_str)
-
def test_scan_file_for_asn_barcode_invalid(self):
"""
GIVEN:
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-invalid.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
asn = reader.asn
self.assertEqual(reader.pdf_file, test_file)
dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf"
shutil.copy(test_file, dst)
- with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call:
+ with mock.patch(
+ "documents.consumer.Consumer.try_consume_file",
+ ) as mocked_consumer:
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
),
None,
)
-
- args, kwargs = mocked_call.call_args
+ mocked_consumer.assert_called_once()
+ args, kwargs = mocked_consumer.call_args
self.assertEqual(kwargs["override_asn"], 123)
- @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
- def test_asn_too_large(self):
- """
- GIVEN:
- - ASN from barcode enabled
- - Barcode contains too large an ASN value
- WHEN:
- - ASN from barcode checked for correctness
- THEN:
- - Exception is raised regarding size limits
- """
- src = self.BARCODE_SAMPLE_DIR / "barcode-128-asn-too-large.pdf"
-
- dst = self.dirs.scratch_dir / "barcode-128-asn-too-large.pdf"
- shutil.copy(src, dst)
-
- input_doc = ConsumableDocument(
- source=DocumentSource.ConsumeFolder,
- original_file=dst,
- )
-
- with mock.patch("documents.consumer.Consumer._send_progress"):
- self.assertRaisesMessage(
- ConsumerError,
- "Given ASN 4294967296 is out of range [0, 4,294,967,295]",
- tasks.consume_file,
- input_doc,
- )
-
@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
def test_scan_file_for_qrcode_without_upscale(self):
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-asn-000123-upscale-dpi.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
self.assertEqual(len(reader.barcodes), 0)
test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-asn-000123-upscale-dpi.pdf"
- with BarcodeReader(test_file, "application/pdf") as reader:
+ with self.get_reader(test_file) as reader:
reader.detect()
self.assertEqual(len(reader.barcodes), 1)
self.assertEqual(reader.asn, 123)
from documents.double_sided import STAGING_FILE_NAME
from documents.double_sided import TIMEOUT_MINUTES
from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
dst = self.dirs.double_sided_dir / dstname
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(src, dst)
- with mock.patch("documents.tasks.async_to_sync"), mock.patch(
- "documents.consumer.async_to_sync",
- ):
+ with mock.patch(
+ "documents.tasks.ProgressManager",
+ DummyProgressManager,
+ ), mock.patch("documents.consumer.async_to_sync"):
msg = tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
"""
msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf")
self.assertIsNotFile(self.staging_file)
- self.assertRegex(msg, "Success. New document .* created")
+ self.assertRegex(msg, r"Success. New document id \d+ created")
def test_subdirectory_upload(self):
"""
"""
msg = self.consume_file("simple.pdf")
self.assertIsNotFile(self.staging_file)
- self.assertRegex(msg, "Success. New document .* created")
+ self.assertRegex(msg, r"Success. New document id \d+ created")
from documents.models import WorkflowTrigger
from documents.signals import document_consumption_finished
from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="INFO") as cm:
tasks.consume_file(
ConsumableDocument(
w.save()
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="INFO") as cm:
tasks.consume_file(
ConsumableDocument(
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="INFO") as cm:
tasks.consume_file(
ConsumableDocument(
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
test_file = self.SAMPLE_DIR / "simple.pdf"
- with mock.patch("documents.tasks.async_to_sync"):
+ with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="INFO") as cm:
tasks.consume_file(
ConsumableDocument(
from pathlib import Path
from typing import Any
from typing import Callable
+from typing import Optional
from typing import Union
from unittest import mock
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.parsers import ParseError
+from documents.plugins.helpers import ProgressStatusOptions
def setup_directories():
class DirectoriesMixin:
+ """
+ Creates and overrides settings for all folders and paths, then ensures
+ they are cleaned up on exit
+ """
+
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.dirs = None
class FileSystemAssertsMixin:
+ """
+ Utilities for checking various state information of the file system
+ """
+
def assertIsFile(self, path: Union[PathLike, str]):
self.assertTrue(Path(path).resolve().is_file(), f"File does not exist: {path}")
class ConsumerProgressMixin:
+ """
+ Mocks the Consumer _send_progress, preventing attempts to connect to Redis
+ and allowing access to its calls for verification
+ """
+
def setUp(self) -> None:
self.send_progress_patcher = mock.patch(
"documents.consumer.Consumer._send_progress",
SAMPLE_DIR = Path(__file__).parent / "samples"
BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"
+
+
+class DummyProgressManager:
+ """
+ A dummy handler for progress management that doesn't actually try to
+ connect to Redis. Payloads are stored for test assertions if needed.
+
+ Use it with
+ mock.patch("documents.tasks.ProgressManager", DummyProgressManager)
+ """
+
+ def __init__(self, filename: str, task_id: Optional[str] = None) -> None:
+ self.filename = filename
+ self.task_id = task_id
+ # Every payload send_progress() would have published is kept here so
+ # tests can assert on the sequence of status updates
+ self.payloads: list[dict] = []
+
+ def __enter__(self):
+ self.open()
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.close()
+
+ def open(self) -> None:
+ # No-op: the real manager opens a channel layer connection here
+ pass
+
+ def close(self) -> None:
+ # No-op: the real manager closes its connection here
+ pass
+
+ def send_progress(
+ self,
+ status: ProgressStatusOptions,
+ message: str,
+ current_progress: int,
+ max_progress: int,
+ extra_args: Optional[dict[str, Union[str, int]]] = None,
+ ) -> None:
+ # Ensure the layer is open, mirroring the real manager's behavior
+ self.open()
+
+ payload = {
+ "type": "status_update",
+ "data": {
+ "filename": self.filename,
+ "task_id": self.task_id,
+ "current_progress": current_progress,
+ "max_progress": max_progress,
+ "status": status,
+ "message": message,
+ },
+ }
+ if extra_args is not None:
+ payload["data"].update(extra_args)
+
+ # Record instead of publishing so tests stay offline
+ self.payloads.append(payload)