* Adds a new flag allowing deletion of one document of a matched pair whose similarity is at or above the match ratio
* Documents the new command option
| ----------- | -------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
| --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
| --processes | No | 1/4 of system cores | Number of processes to use for matching. Setting 1 disables multiple processes |
+| --delete | No | False | If provided, the second document of each matched pair at or above the ratio will be deleted. |
+
+!!! warning
+
+ If you provide the `--delete` option, it is highly recommended to have a backup first.
+ While every effort has been made to ensure proper operation, there is always a
+ chance that a file you want to keep will be deleted.
type=float,
help="Ratio to consider documents a match",
)
+ parser.add_argument(
+ "--delete",
+ default=False,
+ action="store_true",
+ help="If set, one document of matches above the ratio WILL BE DELETED",
+ )
self.add_argument_progress_bar_mixin(parser)
self.add_argument_processes_mixin(parser)
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
+ if options["delete"]:
+ self.stdout.write(
+ self.style.WARNING(
+ "The command is configured to delete documents. Use with caution",
+ ),
+ )
+
opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = []
continue
# Skip matching which have already been matched together
# doc 1 to doc 2 is the same as doc 2 to doc 1
- if (first_doc.pk, second_doc.pk) in checked_pairs or (
- second_doc.pk,
- first_doc.pk,
- ) in checked_pairs:
+ doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
+ doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
+ if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
continue
- checked_pairs.update(
- [(first_doc.pk, second_doc.pk), (second_doc.pk, first_doc.pk)],
- )
-
+ checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
+ # This pair has not been compared yet, so queue it as a work package
work_pkgs.append(_WorkPackage(first_doc, second_doc))
# Don't spin up a pool of 1 process
# Check results
messages = []
+ maybe_delete_ids = []
for result in sorted(results):
if result.ratio >= opt_ratio:
messages.append(
f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",
),
)
+ maybe_delete_ids.append(result.doc_two_pk)
if len(messages) == 0:
messages.append(
self.stdout.writelines(
messages,
)
+ if options["delete"]:
+ self.stdout.write(
+ self.style.NOTICE(
+ f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
+ ),
+ )
+ Document.objects.filter(pk__in=maybe_delete_ids).delete()
self.assertRegex(lines[0], self.MSG_REGEX)
self.assertRegex(lines[1], self.MSG_REGEX)
self.assertRegex(lines[2], self.MSG_REGEX)
+
+ def test_document_deletion(self):
+ """
+ GIVEN:
+ - 3 documents exist
+ - Document 1 to document 3 has a similarity over 85.0
+ WHEN:
+ - Command is called with the --delete option
+ THEN:
+ - User is warned about the deletion flag
+ - Document 3 is deleted
+ - Documents 1 and 2 remain
+ """
+ # Content similarity is 86.667
+ Document.objects.create(
+ checksum="BEEFCAFE",
+ title="A",
+ content="first document scanned by bob",
+ mime_type="application/pdf",
+ filename="test.pdf",
+ )
+ Document.objects.create(
+ checksum="DEADBEAF",
+ title="A",
+ content="second document scanned by alice",
+ mime_type="application/pdf",
+ filename="other_test.pdf",
+ )
+ Document.objects.create(
+ checksum="CATTLE",
+ title="A",
+ content="first document scanned by pete",
+ mime_type="application/pdf",
+ filename="final_test.pdf",
+ )
+
+ self.assertEqual(Document.objects.count(), 3)
+
+ stdout, _ = self.call_command("--delete")
+ print(stdout)
+ lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]
+ self.assertEqual(len(lines), 3)
+ self.assertEqual(
+ lines[0],
+ "The command is configured to delete documents. Use with caution",
+ )
+ self.assertRegex(lines[1], self.MSG_REGEX)
+ self.assertEqual(lines[2], "Deleting 1 documents based on ratio matches")
+
+ self.assertEqual(Document.objects.count(), 2)
+ self.assertIsNotNone(Document.objects.get(pk=1))
+ self.assertIsNotNone(Document.objects.get(pk=2))