Cleans up the docs, adds validation of the process count, includes the test descriptions

author Trenton H <797416+stumpylog@users.noreply.github.com>

Tue, 12 Sep 2023 00:10:09 +0000 (17:10 -0700)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Tue, 12 Sep 2023 15:17:12 +0000 (08:17 -0700)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 12 Sep 2023 00:10:09 +0000 (17:10 -0700)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 12 Sep 2023 15:17:12 +0000 (08:17 -0700)
diff --git a/docs/administration.md b/docs/administration.md

index 75cca5997b66cc0a51cf0009a6e9f826d57429e6..6e657447a297cc07fc08cf1e87298b343aa0843e 100644 (file)
--- a/docs/administration.md
+++ b/docs/administration.md
@@ -582,8 +582,11 @@ duplicate. But the content should be exact or close, allowing detection.
  This tool does a fuzzy match over document content, looking for
  those which look close according to a given ratio.
  
+At this time, other metadata (such as correspondent or type) is not
+taken into account by the detection.
+
  ```
-document_fuzzy_match [--ratio]
+document_fuzzy_match [--ratio] [--processes N]
  ```
  
  | Option      | Required | Default | Description                                                                                                                    |
diff --git a/src/documents/management/commands/document_fuzzy_match.py b/src/documents/management/commands/document_fuzzy_match.py

index eb37b2bf433db6868af36669b7edb62a6f3e8691..26ce55a39b771dff90adb4b54d04ea5b5da98c4f 100644 (file)
--- a/src/documents/management/commands/document_fuzzy_match.py
+++ b/src/documents/management/commands/document_fuzzy_match.py
@@ -27,6 +27,10 @@ class _WorkResult:
  
  
  def _process_and_match(work: _WorkPackage) -> _WorkResult:
+    """
+    Does basic processing of document content, gets the basic ratio
+    and returns the result package
+    """
      # Normalize the string some, lower case, whitespace, etc
      first_string = rapidfuzz.utils.default_process(work.first_doc.content)
      second_string = rapidfuzz.utils.default_process(work.second_doc.content)
@@ -72,6 +76,9 @@ class Command(BaseCommand):
          if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
              raise CommandError("The ratio must be between 0 and 100")
  
+        if options["processes"] < 1:
+            raise CommandError("There must be at least 1 process")
+
          all_docs = Document.objects.all().order_by("id")
  
          # Build work packages for processing
diff --git a/src/documents/tests/test_management_fuzzy.py b/src/documents/tests/test_management_fuzzy.py

index 6b4520bc4b7ea2e16f5752275f1ddee8b80336ce..4a3d96c461e4bed6fcb2cebef2caf8b3818b53da 100644 (file)
--- a/src/documents/tests/test_management_fuzzy.py
+++ b/src/documents/tests/test_management_fuzzy.py
@@ -22,15 +22,54 @@ class TestFuzzyMatchCommand(TestCase):
          return stdout.getvalue(), stderr.getvalue()
  
      def test_invalid_ratio_lower_limit(self):
-        with self.assertRaises(CommandError):
+        """
+        GIVEN:
+            - Invalid ratio below lower limit
+        WHEN:
+            - Command is called
+        THEN:
+            - Error is raised indicating issue
+        """
+        with self.assertRaises(CommandError) as e:
              self.call_command("--ratio", "-1")
+            self.assertIn("The ratio must be between 0 and 100", str(e))
  
      def test_invalid_ratio_upper_limit(self):
-        with self.assertRaises(CommandError):
+        """
+        GIVEN:
+            - Invalid ratio above upper limit
+        WHEN:
+            - Command is called
+        THEN:
+            - Error is raised indicating issue
+        """
+        with self.assertRaises(CommandError) as e:
              self.call_command("--ratio", "101")
+            self.assertIn("The ratio must be between 0 and 100", str(e))
+
+    def test_invalid_process_count(self):
+        """
+        GIVEN:
+            - Invalid process count less than 1
+        WHEN:
+            - Command is called
+        THEN:
+            - Error is raised indicating issue
+        """
+        with self.assertRaises(CommandError) as e:
+            self.call_command("--processes", "0")
+            self.assertIn("There must be at least 1 process", str(e))
  
      def test_no_matches(self):
-        # Content similarity is 82.35
+        """
+        GIVEN:
+            - 2 documents exist
+            - Similarity between content is 82.32
+        WHEN:
+            - Command is called
+        THEN:
+            - No matches are found
+        """
          Document.objects.create(
              checksum="BEEFCAFE",
              title="A",
@@ -49,6 +88,16 @@ class TestFuzzyMatchCommand(TestCase):
          self.assertEqual(stdout, "No matches found\n")
  
      def test_with_matches(self):
+        """
+        GIVEN:
+            - 2 documents exist
+            - Similarity between content is 86.667
+        WHEN:
+            - Command is called
+        THEN:
+            - 1 match is returned from doc 1 to doc 2
+            - No match from doc 2 to doc 1 reported
+        """
          # Content similarity is 86.667
          Document.objects.create(
              checksum="BEEFCAFE",
@@ -68,6 +117,16 @@ class TestFuzzyMatchCommand(TestCase):
          self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
  
      def test_with_3_matches(self):
+        """
+        GIVEN:
+            - 3 documents exist
+            - All documents have similarity over 85.0
+        WHEN:
+            - Command is called
+        THEN:
+            - 3 matches are returned from each document to the others
+            - No duplication of matches returned
+        """
          # Content similarity is 86.667
          Document.objects.create(
              checksum="BEEFCAFE",
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Tue, 12 Sep 2023 00:10:09 +0000 (17:10 -0700)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Tue, 12 Sep 2023 15:17:12 +0000 (08:17 -0700)
docs/administration.md		patch \| blob \| blame \| history
src/documents/management/commands/document_fuzzy_match.py		patch \| blob \| blame \| history
src/documents/tests/test_management_fuzzy.py		patch \| blob \| blame \| history