]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Cleans up the docs, adds validation of the process count, include the test descriptions
authorTrenton H <797416+stumpylog@users.noreply.github.com>
Tue, 12 Sep 2023 00:10:09 +0000 (17:10 -0700)
committerTrenton H <797416+stumpylog@users.noreply.github.com>
Tue, 12 Sep 2023 15:17:12 +0000 (08:17 -0700)
docs/administration.md
src/documents/management/commands/document_fuzzy_match.py
src/documents/tests/test_management_fuzzy.py

index 75cca5997b66cc0a51cf0009a6e9f826d57429e6..6e657447a297cc07fc08cf1e87298b343aa0843e 100644 (file)
@@ -582,8 +582,11 @@ duplicate. But the content should be exact or close, allowing detection.
 This tool does a fuzzy match over document content, looking for
 those which look close according to a given ratio.
 
+At this time, other metadata (such as correspondent or type) is not
+taken into account by the detection.
+
 ```
-document_fuzzy_match [--ratio]
+document_fuzzy_match [--ratio] [--processes N]
 ```
 
 | Option      | Required | Default | Description                                                                                                                    |
index eb37b2bf433db6868af36669b7edb62a6f3e8691..26ce55a39b771dff90adb4b54d04ea5b5da98c4f 100644 (file)
@@ -27,6 +27,10 @@ class _WorkResult:
 
 
 def _process_and_match(work: _WorkPackage) -> _WorkResult:
+    """
+    Does basic processing of document content, gets the basic ratio
+    and returns the result package
+    """
     # Normalize the string some, lower case, whitespace, etc
     first_string = rapidfuzz.utils.default_process(work.first_doc.content)
     second_string = rapidfuzz.utils.default_process(work.second_doc.content)
@@ -72,6 +76,9 @@ class Command(BaseCommand):
         if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
             raise CommandError("The ratio must be between 0 and 100")
 
+        if options["processes"] < 1:
+            raise CommandError("There must be at least 1 process")
+
         all_docs = Document.objects.all().order_by("id")
 
         # Build work packages for processing
index 6b4520bc4b7ea2e16f5752275f1ddee8b80336ce..4a3d96c461e4bed6fcb2cebef2caf8b3818b53da 100644 (file)
@@ -22,15 +22,54 @@ class TestFuzzyMatchCommand(TestCase):
         return stdout.getvalue(), stderr.getvalue()
 
     def test_invalid_ratio_lower_limit(self):
-        with self.assertRaises(CommandError):
+        """
+        GIVEN:
+            - Invalid ratio below lower limit
+        WHEN:
+            - Command is called
+        THEN:
+            - Error is raised indicating issue
+        """
+        with self.assertRaises(CommandError) as e:
             self.call_command("--ratio", "-1")
+            self.assertIn("The ratio must be between 0 and 100", str(e))
 
     def test_invalid_ratio_upper_limit(self):
-        with self.assertRaises(CommandError):
+        """
+        GIVEN:
+            - Invalid ratio above upper limit
+        WHEN:
+            - Command is called
+        THEN:
+            - Error is raised indicating issue
+        """
+        with self.assertRaises(CommandError) as e:
             self.call_command("--ratio", "101")
+            self.assertIn("The ratio must be between 0 and 100", str(e))
+
+    def test_invalid_process_count(self):
+        """
+        GIVEN:
+            - Invalid process count of less than 1
+        WHEN:
+            - Command is called
+        THEN:
+            - Error is raised indicating issue
+        """
+        with self.assertRaises(CommandError) as e:
+            self.call_command("--processes", "0")
+            self.assertIn("There must be at least 1 process", str(e))
 
     def test_no_matches(self):
-        # Content similarity is 82.35
+        """
+        GIVEN:
+            - 2 documents exist
+            - Similarity between content is 82.32
+        WHEN:
+            - Command is called
+        THEN:
+            - No matches are found
+        """
         Document.objects.create(
             checksum="BEEFCAFE",
             title="A",
@@ -49,6 +88,16 @@ class TestFuzzyMatchCommand(TestCase):
         self.assertEqual(stdout, "No matches found\n")
 
     def test_with_matches(self):
+        """
+        GIVEN:
+            - 2 documents exist
+            - Similarity between content is 86.667
+        WHEN:
+            - Command is called
+        THEN:
+            - 1 match is returned from doc 1 to doc 2
+            - No match from doc 2 to doc 1 reported
+        """
         # Content similarity is 86.667
         Document.objects.create(
             checksum="BEEFCAFE",
@@ -68,6 +117,16 @@ class TestFuzzyMatchCommand(TestCase):
         self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
 
     def test_with_3_matches(self):
+        """
+        GIVEN:
+            - 3 documents exist
+            - All documents have similarity over 85.0
+        WHEN:
+            - Command is called
+        THEN:
+            - 3 matches are returned from each document to the others
+            - No duplication of matches returned
+        """
         # Content similarity is 86.667
         Document.objects.create(
             checksum="BEEFCAFE",