Performance: pre-filter document list in scheduled workflow checks (#10031)

author shamoon <4887959+shamoon@users.noreply.github.com>

Tue, 3 Jun 2025 21:47:29 +0000 (14:47 -0700)

committer GitHub <noreply@github.com>

Tue, 3 Jun 2025 21:47:29 +0000 (21:47 +0000)
author shamoon <4887959+shamoon@users.noreply.github.com>
Tue, 3 Jun 2025 21:47:29 +0000 (14:47 -0700)
committer GitHub <noreply@github.com>
Tue, 3 Jun 2025 21:47:29 +0000 (21:47 +0000)
diff --git a/src/documents/matching.py b/src/documents/matching.py

index ab3866518e31fbf72e154a34e6e8296f85c43bcc..15a8ec443cedaa79bd24e81a969c804709c7bde4 100644 (file)
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -3,6 +3,7 @@ from __future__ import annotations
  import logging
  import re
  from fnmatch import fnmatch
+from fnmatch import translate as fnmatch_translate
  from typing import TYPE_CHECKING
  
  from documents.data_models import ConsumableDocument
@@ -18,6 +19,8 @@ from documents.models import WorkflowTrigger
  from documents.permissions import get_objects_for_user_owner_aware
  
  if TYPE_CHECKING:
+    from django.db.models import QuerySet
+
      from documents.classifier import DocumentClassifier
  
  logger = logging.getLogger("paperless.matching")
@@ -389,6 +392,40 @@ def existing_document_matches_workflow(
      return (trigger_matched, reason)
  
  
+def prefilter_documents_by_workflowtrigger(
+    documents: QuerySet[Document],
+    trigger: WorkflowTrigger,
+) -> QuerySet[Document]:
+    """
+    Pre-filter documents by the trigger's tag, correspondent, document type and
+    filename filters so scheduled workflows need not check every document. Runs
+    before e.g. document_matches_workflow in run_workflows; unset filters are skipped.
+    """
+
+    if trigger.filter_has_tags.all().count() > 0:
+        documents = documents.filter(
+            tags__in=trigger.filter_has_tags.all(),  # has at least one of these tags
+        ).distinct()  # one row per document even if several tags match
+
+    if trigger.filter_has_correspondent is not None:
+        documents = documents.filter(
+            correspondent=trigger.filter_has_correspondent,
+        )
+
+    if trigger.filter_has_document_type is not None:
+        documents = documents.filter(
+            document_type=trigger.filter_has_document_type,
+        )
+
+    if trigger.filter_filename is not None and len(trigger.filter_filename) > 0:
+        # deliberately loose, case-insensitive regex; the exact fnmatch check runs later
+        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
+        regex = f"(?i){regex}"  # inline flag: match case-insensitively
+        documents = documents.filter(original_filename__regex=regex)
+
+    return documents
+
+
  def document_matches_workflow(
      document: ConsumableDocument | Document,
      workflow: Workflow,
diff --git a/src/documents/tasks.py b/src/documents/tasks.py

index 1d4b33ff3689366f53c9d4741fc9344a78ba24bc..2ab5ab1cb6b4c01aeebde05032a823da1683a0eb 100644 (file)
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -33,6 +33,7 @@ from documents.data_models import DocumentMetadataOverrides
  from documents.double_sided import CollatePlugin
  from documents.file_handling import create_source_path_directory
  from documents.file_handling import generate_unique_filename
+from documents.matching import prefilter_documents_by_workflowtrigger
  from documents.models import Correspondent
  from documents.models import CustomFieldInstance
  from documents.models import Document
@@ -473,6 +474,12 @@ def check_scheduled_workflows():
  
                          documents = Document.objects.filter(id__in=matched_ids)
  
+                if documents.count() > 0:
+                    documents = prefilter_documents_by_workflowtrigger(
+                        documents,
+                        trigger,
+                    )
+
                  if documents.count() > 0:
                      logger.debug(
                          f"Found {documents.count()} documents for trigger {trigger}",
diff --git a/src/documents/tests/test_workflows.py b/src/documents/tests/test_workflows.py

index 51ea2be2399785933c387293e40cd47ac0695ab2..b577eeeb4ee41767ef83999ed83f793fd428f143 100644 (file)
--- a/src/documents/tests/test_workflows.py
+++ b/src/documents/tests/test_workflows.py
@@ -25,6 +25,7 @@ from documents import tasks
  from documents.data_models import ConsumableDocument
  from documents.data_models import DocumentSource
  from documents.matching import document_matches_workflow
+from documents.matching import prefilter_documents_by_workflowtrigger
  from documents.models import Correspondent
  from documents.models import CustomField
  from documents.models import CustomFieldInstance
@@ -1711,6 +1712,55 @@ class TestWorkflows(
          doc2.refresh_from_db()
          self.assertIsNone(doc2.owner)  # has not triggered yet
  
+    def test_workflow_scheduled_filters_queryset(self):
+        """
+        GIVEN:
+            - Scheduled trigger with filename, type, correspondent and tag filters
+        WHEN:
+            - Prefilter runs over 10 documents, only 5 of which have the document type
+        THEN:
+            - Only the 5 documents matching every filter remain in the queryset
+        """
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED,
+            schedule_offset_days=-7,
+            schedule_date_field=WorkflowTrigger.ScheduleDateField.CREATED,
+            filter_filename="*sample*",
+            filter_has_document_type=self.dt,
+            filter_has_correspondent=self.c,
+        )
+        trigger.filter_has_tags.set([self.t1])
+        trigger.save()
+        action = WorkflowAction.objects.create(
+            assign_owner=self.user2,
+        )
+        w = Workflow.objects.create(
+            name="Workflow 1",
+            order=0,
+        )
+        w.triggers.add(trigger)
+        w.actions.add(action)
+        w.save()
+
+        # 10 docs matching all filters, except odd-numbered ones have no document type
+        for i in range(10):
+            doc = Document.objects.create(
+                title=f"sample test {i}",
+                checksum=f"checksum{i}",
+                correspondent=self.c,
+                original_filename=f"sample_{i}.pdf",
+                document_type=self.dt if i % 2 == 0 else None,
+            )
+            doc.tags.set([self.t1])
+            doc.save()
+
+        documents = Document.objects.all()
+        filtered_docs = prefilter_documents_by_workflowtrigger(
+            documents,
+            trigger,
+        )
+        self.assertEqual(filtered_docs.count(), 5)  # the even-numbered documents
+
      def test_workflow_enabled_disabled(self):
          trigger = WorkflowTrigger.objects.create(
              type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Tue, 3 Jun 2025 21:47:29 +0000 (14:47 -0700)
committer	GitHub <noreply@github.com>
	Tue, 3 Jun 2025 21:47:29 +0000 (21:47 +0000)
src/documents/matching.py		patch | blob | blame | history
src/documents/tasks.py		patch | blob | blame | history
src/documents/tests/test_workflows.py		patch | blob | blame | history