]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Add to handler, matching, retagger
authorshamoon <4887959+shamoon@users.noreply.github.com>
Fri, 13 Dec 2024 21:54:20 +0000 (13:54 -0800)
committershamoon <4887959+shamoon@users.noreply.github.com>
Fri, 21 Mar 2025 04:54:05 +0000 (21:54 -0700)
docs/administration.md
src/documents/apps.py
src/documents/management/commands/document_retagger.py
src/documents/matching.py
src/documents/signals/handlers.py

index 8e646b3261aab7fdb1dcc5dc02cde84e69be3f9a..63333dee92c53b4808c4c07122e2c115534cc536 100644 (file)
@@ -372,17 +372,19 @@ currently-imported docs. This problem is common enough that there are
 tools for it.
 
 ```
-document_retagger [-h] [-c] [-T] [-t] [-i] [--id-range] [--use-first] [-f]
+document_retagger [-h] [-c] [-T] [-t] [-s] [-cf] [-i] [--id-range] [--use-first] [-f] [--suggest]
 
 optional arguments:
 -c, --correspondent
 -T, --tags
 -t, --document_type
 -s, --storage_path
+-cf, --custom_fields
 -i, --inbox-only
 --id-range
 --use-first
 -f, --overwrite
+--suggest
 ```
 
 Run this after changing or adding matching rules. It'll loop over all
@@ -408,6 +410,8 @@ to override this behavior and just use the first correspondent or type
 it finds. This option does not apply to tags, since any amount of tags
 can be applied to a document.
 
+If you want to suggest changes but not apply them, specify `--suggest`.
+
 Finally, `-f` specifies that you wish to overwrite already assigned
 correspondents, types and/or tags. The default behavior is to not assign
 correspondents and types to documents that have this data already
index f3b798c0b5bd21689010418c229f4a19a545b037..812c5d2a4254c135500fd071e6212bcc4d5f2853 100644 (file)
@@ -15,6 +15,7 @@ class DocumentsConfig(AppConfig):
         from documents.signals.handlers import run_workflows_added
         from documents.signals.handlers import run_workflows_updated
         from documents.signals.handlers import set_correspondent
+        from documents.signals.handlers import set_custom_fields
         from documents.signals.handlers import set_document_type
         from documents.signals.handlers import set_storage_path
         from documents.signals.handlers import set_tags
@@ -24,6 +25,7 @@ class DocumentsConfig(AppConfig):
         document_consumption_finished.connect(set_document_type)
         document_consumption_finished.connect(set_tags)
         document_consumption_finished.connect(set_storage_path)
+        document_consumption_finished.connect(set_custom_fields)
         document_consumption_finished.connect(add_to_index)
         document_consumption_finished.connect(run_workflows_added)
         document_updated.connect(run_workflows_updated)
index 10bb54b711a3507782b090e9d65eec172047f88d..7f73366d4753a2d2377ab0b04725d2a86d8deb47 100644 (file)
@@ -7,6 +7,7 @@ from documents.classifier import load_classifier
 from documents.management.commands.mixins import ProgressBarMixin
 from documents.models import Document
 from documents.signals.handlers import set_correspondent
+from documents.signals.handlers import set_custom_fields
 from documents.signals.handlers import set_document_type
 from documents.signals.handlers import set_storage_path
 from documents.signals.handlers import set_tags
@@ -17,9 +18,9 @@ logger = logging.getLogger("paperless.management.retagger")
 class Command(ProgressBarMixin, BaseCommand):
     help = (
         "Using the current classification model, assigns correspondents, tags "
-        "and document types to all documents, effectively allowing you to "
-        "back-tag all previously indexed documents with metadata created (or "
-        "modified) after their initial import."
+        "document types, storage paths and custom fields to all documents, effectively "
+        "allowing you to back-tag all previously indexed documents with metadata created "
+        "(or modified) after their initial import."
     )
 
     def add_arguments(self, parser):
@@ -27,6 +28,12 @@ class Command(ProgressBarMixin, BaseCommand):
         parser.add_argument("-T", "--tags", default=False, action="store_true")
         parser.add_argument("-t", "--document_type", default=False, action="store_true")
         parser.add_argument("-s", "--storage_path", default=False, action="store_true")
+        parser.add_argument(
+            "-cf",
+            "--custom_fields",
+            default=False,
+            action="store_true",
+        )
         parser.add_argument("-i", "--inbox-only", default=False, action="store_true")
         parser.add_argument(
             "--use-first",
@@ -134,3 +141,16 @@ class Command(ProgressBarMixin, BaseCommand):
                     stdout=self.stdout,
                     style_func=self.style,
                 )
+
+            if options["custom_fields"]:
+                set_custom_fields(
+                    sender=None,
+                    document=document,
+                    classifier=classifier,
+                    replace=options["overwrite"],
+                    use_first=options["use_first"],
+                    suggest=options["suggest"],
+                    base_url=options["base_url"],
+                    stdout=self.stdout,
+                    style_func=self.style,
+                )
index ab3866518e31fbf72e154a34e6e8296f85c43bcc..08cb5da770ba64619bcbf83f9eb35f7cc1bc9706 100644 (file)
@@ -132,6 +132,25 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
     )
 
 
+def match_custom_fields(document: Document, classifier: DocumentClassifier, user=None):
+    predicted_custom_field_ids = (
+        classifier.predict_custom_fields(document.content) if classifier else []
+    )
+
+    fields = CustomField.objects.all()
+
+    return list(
+        filter(
+            lambda o: matches(o, document)
+            or (
+                o.matching_algorithm == MatchingModel.MATCH_AUTO
+                and o.pk in predicted_custom_field_ids
+            ),
+            fields,
+        ),
+    )
+
+
 def matches(matching_model: MatchingModel, document: Document):
     search_kwargs = {}
 
index 40773537553693d0c5b734802d1a4008ef5cdd32..da54f456e81f90e350ddd113449fe2f70183bf94 100644 (file)
@@ -318,6 +318,67 @@ def set_storage_path(
             document.save(update_fields=("storage_path",))
 
 
+def set_custom_fields(
+    document: Document,
+    logging_group=None,
+    classifier: DocumentClassifier | None = None,
+    replace=False,
+    suggest=False,
+    base_url=None,
+    stdout=None,
+    style_func=None,
+    **kwargs,
+):
+    if replace:
+        CustomFieldInstance.objects.filter(document=document).exclude(
+            Q(field__match="") & ~Q(field__matching_algorithm=CustomField.MATCH_AUTO),
+        ).delete()
+
+    current_fields = set([instance.field for instance in document.custom_fields.all()])
+
+    matched_fields = matching.match_custom_fields(document, classifier)
+
+    relevant_fields = set(matched_fields) - current_fields
+
+    if suggest:
+        extra_fields = current_fields - set(matched_fields)
+        extra_fields = [
+            f for f in extra_fields if f.matching_algorithm == MatchingModel.MATCH_AUTO
+        ]
+        if not relevant_fields and not extra_fields:
+            return
+        doc_str = style_func.SUCCESS(str(document))
+        if base_url:
+            stdout.write(doc_str)
+            stdout.write(f"{base_url}/documents/{document.pk}")
+        else:
+            stdout.write(doc_str + style_func.SUCCESS(f" [{document.pk}]"))
+        if relevant_fields:
+            stdout.write(
+                "Suggest custom fields: "
+                + ", ".join([f.name for f in relevant_fields]),
+            )
+        if extra_fields:
+            stdout.write(
+                "Extra custom fields: " + ", ".join([f.name for f in extra_fields]),
+            )
+    else:
+        if not relevant_fields:
+            return
+
+        message = 'Assigning custom fields "{}" to "{}"'
+        logger.info(
+            message.format(", ".join([f.name for f in relevant_fields]), document),
+            extra={"group": logging_group},
+        )
+
+        for field in relevant_fields:
+            CustomFieldInstance.objects.create(
+                field=field,
+                document=document,
+            )
+
+
 # see empty_trash in documents/tasks.py for signal handling
 def cleanup_document_deletion(sender, instance, **kwargs):
     with FileLock(settings.MEDIA_LOCK):