]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Implements a new command for fuzzy matching document content and reporting potential...
authorTrenton Holmes <797416+stumpylog@users.noreply.github.com>
Sun, 10 Sep 2023 23:32:10 +0000 (16:32 -0700)
committerTrenton H <797416+stumpylog@users.noreply.github.com>
Tue, 12 Sep 2023 15:17:12 +0000 (08:17 -0700)
docker/install_management_commands.sh
docs/administration.md
src/documents/management/commands/document_fuzzy_match.py [new file with mode: 0644]
src/documents/tests/test_management_fuzzy.py [new file with mode: 0644]

index e5c8b30a04f13d6737c145b65f245fd7c71bd457..38604af9d20f51d855e0044e5aeda95318b6266f 100755 (executable)
@@ -13,6 +13,7 @@ for command in decrypt_documents \
        document_retagger \
        document_thumbnails \
        document_sanity_checker \
+       document_fuzzy_match \
        manage_superuser;
 do
        echo "installing $command..."
index 2003edec9a7390ba1542311f88c45cdb1e6bf0e0..7ecdb76a6674cd706c9d6b6632307c8893413d55 100644 (file)
@@ -572,3 +572,20 @@ it here)
 ```
 decrypt_documents [--passphrase SECR3TP4SSPHRA$E]
 ```
+
+### Detecting duplicates {#fuzzy_duplicate}
+
+Paperless already catches and prevents the upload of exactly matching documents.
+However, a new scan of an existing document may not produce an exact bit-for-bit
+duplicate. Its content should still be identical or very close, which allows detection.
+
+This tool does a fuzzy match over document content, looking for
+those which look close according to a given ratio.
+
+```
+document_fuzzy_match [--ratio RATIO]
+```
+
+Optional arguments:
+
+- `--ratio` - a number between 0 and 100 (default 85), setting how similar two
+  documents must be for them to be reported. Higher numbers mean more similarity.
diff --git a/src/documents/management/commands/document_fuzzy_match.py b/src/documents/management/commands/document_fuzzy_match.py
new file mode 100644 (file)
index 0000000..f33e2d0
--- /dev/null
@@ -0,0 +1,63 @@
+from typing import Final
+
+import rapidfuzz
+from django.core.management import BaseCommand
+from django.core.management import CommandError
+
+from documents.models import Document
+
+
+class Command(BaseCommand):
+    """
+    Management command which reports pairs of documents whose text content
+    is nearly identical according to a fuzzy string comparison.
+    """
+
+    help = "Searches for documents where the content almost matches"
+
+    def add_arguments(self, parser):
+        """Register the optional ``--ratio`` threshold (float, default 85.0)."""
+        parser.add_argument(
+            "--ratio",
+            default=85.0,
+            type=float,
+            help="Ratio to consider documents a match",
+        )
+
+    def handle(self, *args, **options):
+        """
+        Compare every unordered pair of documents and write one line to
+        stdout for each pair whose similarity is at least ``--ratio``.
+
+        Pairs are reported in ascending id order, always as
+        (lower id, higher id), and each pair is reported at most once.
+
+        Raises:
+            CommandError: if the ratio is not a number from 0 to 100.
+        """
+        RATIO_MIN: Final[float] = 0.0
+        RATIO_MAX: Final[float] = 100.0
+
+        opt_ratio = options["ratio"]
+
+        # Ratio is a float from 0.0 to 100.0.  Written as a positive range
+        # check so that NaN (which compares False against everything) is
+        # rejected instead of silently matching nothing.
+        if not (RATIO_MIN <= opt_ratio <= RATIO_MAX):
+            raise CommandError("The ratio must be between 0 and 100")
+
+        # Normalize each document's content (lower case, whitespace, etc)
+        # exactly once, rather than once per comparison
+        candidates = []
+        for doc in Document.objects.all().order_by("id"):
+            normalized = rapidfuzz.utils.default_process(doc.content)
+            if not normalized:
+                # Nothing to compare.  NOTE(review): assumes content-less
+                # documents would otherwise be reported as matching each
+                # other - confirm against rapidfuzz's empty-string score
+                continue
+            candidates.append((doc.pk, normalized))
+
+        # doc 1 to doc 2 is the same as doc 2 to doc 1, so only compare a
+        # document against the documents which follow it
+        for index, (first_pk, first_string) in enumerate(candidates):
+            for second_pk, second_string in candidates[index + 1 :]:
+                # Basic matching ratio
+                match = rapidfuzz.fuzz.ratio(first_string, second_string)
+
+                if match >= opt_ratio:
+                    self.stdout.write(
+                        self.style.NOTICE(
+                            f"Document {first_pk} fuzzy match"
+                            f" to {second_pk} (confidence {match:.3f})",
+                        ),
+                    )
diff --git a/src/documents/tests/test_management_fuzzy.py b/src/documents/tests/test_management_fuzzy.py
new file mode 100644 (file)
index 0000000..71b04b5
--- /dev/null
@@ -0,0 +1,97 @@
+from io import StringIO
+
+from django.core.management import CommandError
+from django.core.management import call_command
+from django.test import TestCase
+
+from documents.models import Document
+
+
+class TestFuzzyMatchCommand(TestCase):
+    """Tests for the ``document_fuzzy_match`` management command."""
+
+    def call_command(self, *args, **kwargs):
+        """Run the command and return its captured (stdout, stderr) text."""
+        stdout = StringIO()
+        stderr = StringIO()
+        call_command(
+            "document_fuzzy_match",
+            *args,
+            stdout=stdout,
+            stderr=stderr,
+            **kwargs,
+        )
+        return stdout.getvalue(), stderr.getvalue()
+
+    @staticmethod
+    def _create_document(checksum, content, filename):
+        """Create and return a PDF document with the given content."""
+        return Document.objects.create(
+            checksum=checksum,
+            title="A",
+            content=content,
+            mime_type="application/pdf",
+            filename=filename,
+        )
+
+    @staticmethod
+    def _expected_line(first_doc, second_doc, confidence):
+        """
+        Build the expected report line from the documents' real primary
+        keys, so the tests do not depend on the database sequence state.
+        """
+        return (
+            f"Document {first_doc.pk} fuzzy match"
+            f" to {second_doc.pk} (confidence {confidence})"
+        )
+
+    def test_invalid_ratio_lower_limit(self):
+        """A ratio below 0 is rejected."""
+        with self.assertRaises(CommandError):
+            self.call_command("--ratio", "-1")
+
+    def test_invalid_ratio_upper_limit(self):
+        """A ratio above 100 is rejected."""
+        with self.assertRaises(CommandError):
+            self.call_command("--ratio", "101")
+
+    def test_no_matches(self):
+        """Documents below the default ratio produce no output."""
+        # Content similarity is 82.35
+        self._create_document("BEEFCAFE", "first document", "test.pdf")
+        self._create_document("DEADBEAF", "other first document", "other_test.pdf")
+        stdout, _ = self.call_command()
+        self.assertEqual(stdout, "")
+
+    def test_with_matches(self):
+        """A single pair above the default ratio is reported once."""
+        # Content similarity is 86.667
+        doc_1 = self._create_document(
+            "BEEFCAFE",
+            "first document scanned by bob",
+            "test.pdf",
+        )
+        doc_2 = self._create_document(
+            "DEADBEAF",
+            "first document scanned by alice",
+            "other_test.pdf",
+        )
+        stdout, _ = self.call_command()
+        self.assertEqual(stdout, self._expected_line(doc_1, doc_2, "86.667") + "\n")
+
+    def test_with_3_matches(self):
+        """Three mutually similar documents yield three distinct pairs."""
+        doc_1 = self._create_document(
+            "BEEFCAFE",
+            "first document scanned by bob",
+            "test.pdf",
+        )
+        doc_2 = self._create_document(
+            "DEADBEAF",
+            "first document scanned by alice",
+            "other_test.pdf",
+        )
+        doc_3 = self._create_document(
+            "CATTLE",
+            "first document scanned by pete",
+            "final_test.pdf",
+        )
+        stdout, _ = self.call_command()
+        lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]
+        self.assertEqual(len(lines), 3)
+        self.assertEqual(lines[0], self._expected_line(doc_1, doc_2, "86.667"))
+        self.assertEqual(lines[1], self._expected_line(doc_1, doc_3, "88.136"))
+        self.assertEqual(lines[2], self._expected_line(doc_2, doc_3, "88.525"))