git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Feature: Allow deletion of documents via the fuzzy matching command (#4957)
author: Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 18 Dec 2023 02:37:38 +0000 (18:37 -0800)
committer: GitHub <noreply@github.com>
Mon, 18 Dec 2023 02:37:38 +0000 (18:37 -0800)
* Adds a new flag allowing deletion of one document of a matched pair whose similarity is over the match ratio

* Documents the new command option

docs/administration.md
src/documents/management/commands/document_fuzzy_match.py
src/documents/tests/test_management_fuzzy.py

index 808d6afaf11a111ef3efc6ca2f120097c8d7d540..cf8a24294e50ccd0b584d80528a834d7430f436a 100644 (file)
@@ -607,3 +607,10 @@ document_fuzzy_match [--ratio] [--processes N]
 | ----------- | -------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
 | --ratio     | No       | 85.0                | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
 | --processes | No       | 1/4 of system cores | Number of processes to use for matching. Setting 1 disables multiple processes                                                 |
+| --delete    | No       | False               | If provided, one document of a matched pair above the ratio will be deleted.                                                   |
+
+!!! warning
+
+    If you provide the `--delete` option, it is highly recommended to have a backup.
+    While every effort has been made to ensure proper operation, there is always a
+    chance that a file you want to keep will be deleted.
index 597a9d2c131d3805f448af521e8bc563c2349d4a..9e01ff1b06cb850a4d86b51d3390e1fa4389b23d 100644 (file)
@@ -53,6 +53,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
             type=float,
             help="Ratio to consider documents a match",
         )
+        parser.add_argument(
+            "--delete",
+            default=False,
+            action="store_true",
+            help="If set, one document of matches above the ratio WILL BE DELETED",
+        )
         self.add_argument_progress_bar_mixin(parser)
         self.add_argument_processes_mixin(parser)
 
@@ -63,6 +69,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
         self.handle_processes_mixin(**options)
         self.handle_progress_bar_mixin(**options)
 
+        if options["delete"]:
+            self.stdout.write(
+                self.style.WARNING(
+                    "The command is configured to delete documents.  Use with caution",
+                ),
+            )
+
         opt_ratio = options["ratio"]
         checked_pairs: set[tuple[int, int]] = set()
         work_pkgs: list[_WorkPackage] = []
@@ -81,15 +94,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
                     continue
                 # Skip matching which have already been matched together
                 # doc 1 to doc 2 is the same as doc 2 to doc 1
-                if (first_doc.pk, second_doc.pk) in checked_pairs or (
-                    second_doc.pk,
-                    first_doc.pk,
-                ) in checked_pairs:
+                doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
+                doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
+                if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
                     continue
-                checked_pairs.update(
-                    [(first_doc.pk, second_doc.pk), (second_doc.pk, first_doc.pk)],
-                )
-
+                checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
+                # Actually something useful to work on now
                 work_pkgs.append(_WorkPackage(first_doc, second_doc))
 
         # Don't spin up a pool of 1 process
@@ -109,6 +119,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
 
         # Check results
         messages = []
+        maybe_delete_ids = []
         for result in sorted(results):
             if result.ratio >= opt_ratio:
                 messages.append(
@@ -117,6 +128,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
                         f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",
                     ),
                 )
+                maybe_delete_ids.append(result.doc_two_pk)
 
         if len(messages) == 0:
             messages.append(
@@ -125,3 +137,10 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
         self.stdout.writelines(
             messages,
         )
+        if options["delete"]:
+            self.stdout.write(
+                self.style.NOTICE(
+                    f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
+                ),
+            )
+            Document.objects.filter(pk__in=maybe_delete_ids).delete()
index abbf3c921e80a0502f91801dee66ff00326f5897..c215c43cac400df3e4c419627e68d760be284163 100644 (file)
@@ -157,3 +157,55 @@ class TestFuzzyMatchCommand(TestCase):
         self.assertRegex(lines[0], self.MSG_REGEX)
         self.assertRegex(lines[1], self.MSG_REGEX)
         self.assertRegex(lines[2], self.MSG_REGEX)
+
+    def test_document_deletion(self):
+        """
+        GIVEN:
+            - 3 documents exist
+            - Document 1 to document 3 has a similarity over 85.0
+        WHEN:
+            - Command is called with the --delete option
+        THEN:
+            - User is warned about the deletion flag
+            - Document 3 is deleted
+            - Documents 1 and 2 remain
+        """
+        # Content similarity is 86.667
+        Document.objects.create(
+            checksum="BEEFCAFE",
+            title="A",
+            content="first document scanned by bob",
+            mime_type="application/pdf",
+            filename="test.pdf",
+        )
+        Document.objects.create(
+            checksum="DEADBEAF",
+            title="A",
+            content="second document scanned by alice",
+            mime_type="application/pdf",
+            filename="other_test.pdf",
+        )
+        Document.objects.create(
+            checksum="CATTLE",
+            title="A",
+            content="first document scanned by pete",
+            mime_type="application/pdf",
+            filename="final_test.pdf",
+        )
+
+        self.assertEqual(Document.objects.count(), 3)
+
+        stdout, _ = self.call_command("--delete")
+        print(stdout)
+        lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]
+        self.assertEqual(len(lines), 3)
+        self.assertEqual(
+            lines[0],
+            "The command is configured to delete documents.  Use with caution",
+        )
+        self.assertRegex(lines[1], self.MSG_REGEX)
+        self.assertEqual(lines[2], "Deleting 1 documents based on ratio matches")
+
+        self.assertEqual(Document.objects.count(), 2)
+        self.assertIsNotNone(Document.objects.get(pk=1))
+        self.assertIsNotNone(Document.objects.get(pk=2))