--- /dev/null
+from itertools import combinations
+from typing import Final
+
+import rapidfuzz
+from django.core.management import BaseCommand
+from django.core.management import CommandError
+
+from documents.models import Document
+
+
+class Command(BaseCommand):
+ help = "Manages the document index."
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ "--ratio",
+ default=85.0,
+ type=float,
+ help="Ratio to consider documents a match",
+ )
+
+ def handle(self, *args, **options):
+ RATIO_MIN: Final[float] = 0.0
+ RATIO_MAX: Final[float] = 100.0
+
+ opt_ratio = options["ratio"]
+ match_pairs = set()
+
+ # Ratio is a float from 0.0 to 100.0
+ if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
+ raise CommandError("The ratio must be between 0 and 100")
+
+ all_docs = Document.objects.all().order_by("id")
+
+ for first_doc in all_docs:
+ for second_doc in all_docs:
+ if first_doc.pk == second_doc.pk:
+ continue
+
+ # Normalize the string some, lower case, whitespace, etc
+ first_string = rapidfuzz.utils.default_process(first_doc.content)
+ second_string = rapidfuzz.utils.default_process(second_doc.content)
+
+ # Basic matching ratio
+ match = rapidfuzz.fuzz.ratio(first_string, second_string)
+
+ if match >= opt_ratio:
+ # Skip matching which have already been matched together
+ # doc 1 to doc 2 is the same as doc 2 to doc 1
+ if (first_doc.pk, second_doc.pk) in match_pairs or (
+ second_doc.pk,
+ first_doc.pk,
+ ) in match_pairs:
+ continue
+ else:
+ match_pairs.add((first_doc.pk, second_doc.pk))
+ match_pairs.add((second_doc.pk, first_doc.pk))
+
+ self.stdout.write(
+ self.style.NOTICE(
+ f"Document {first_doc.pk} fuzzy match"
+ f" to {second_doc.pk} (confidence {match:.3f})",
+ ),
+ )
--- /dev/null
+from io import StringIO
+
+from django.core.management import CommandError
+from django.core.management import call_command
+from django.test import TestCase
+
+from documents.models import Document
+
+
+class TestFuzzyMatchCommand(TestCase):
+ def call_command(self, *args, **kwargs):
+ stdout = StringIO()
+ stderr = StringIO()
+ call_command(
+ "document_fuzzy_match",
+ *args,
+ stdout=stdout,
+ stderr=stderr,
+ **kwargs,
+ )
+ return stdout.getvalue(), stderr.getvalue()
+
+ def test_invalid_ratio_lower_limit(self):
+ with self.assertRaises(CommandError):
+ self.call_command("--ratio", "-1")
+
+ def test_invalid_ratio_upper_limit(self):
+ with self.assertRaises(CommandError):
+ self.call_command("--ratio", "101")
+
+ def test_no_matches(self):
+ # Content similarity is 82.35
+ Document.objects.create(
+ checksum="BEEFCAFE",
+ title="A",
+ content="first document",
+ mime_type="application/pdf",
+ filename="test.pdf",
+ )
+ Document.objects.create(
+ checksum="DEADBEAF",
+ title="A",
+ content="other first document",
+ mime_type="application/pdf",
+ filename="other_test.pdf",
+ )
+ stdout, _ = self.call_command()
+ self.assertEqual(stdout, "")
+
+ def test_with_matches(self):
+ # Content similarity is 86.667
+ Document.objects.create(
+ checksum="BEEFCAFE",
+ title="A",
+ content="first document scanned by bob",
+ mime_type="application/pdf",
+ filename="test.pdf",
+ )
+ Document.objects.create(
+ checksum="DEADBEAF",
+ title="A",
+ content="first document scanned by alice",
+ mime_type="application/pdf",
+ filename="other_test.pdf",
+ )
+ stdout, _ = self.call_command()
+ self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
+
+ def test_with_3_matches(self):
+ # Content similarity is 86.667
+ Document.objects.create(
+ checksum="BEEFCAFE",
+ title="A",
+ content="first document scanned by bob",
+ mime_type="application/pdf",
+ filename="test.pdf",
+ )
+ Document.objects.create(
+ checksum="DEADBEAF",
+ title="A",
+ content="first document scanned by alice",
+ mime_type="application/pdf",
+ filename="other_test.pdf",
+ )
+ Document.objects.create(
+ checksum="CATTLE",
+ title="A",
+ content="first document scanned by pete",
+ mime_type="application/pdf",
+ filename="final_test.pdf",
+ )
+ stdout, _ = self.call_command()
+ lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]
+ self.assertEqual(len(lines), 3)
+ self.assertEqual(lines[0], "Document 1 fuzzy match to 2 (confidence 86.667)")
+ self.assertEqual(lines[1], "Document 1 fuzzy match to 3 (confidence 88.136)")
+ self.assertEqual(lines[2], "Document 2 fuzzy match to 3 (confidence 88.525)")