Saves work on a new management command to re-OCR a file

author Trenton Holmes <holmes.trenton@gmail.com>

Tue, 31 May 2022 16:33:09 +0000 (09:33 -0700)

committer Michael Shamoon <4887959+shamoon@users.noreply.github.com>

Sat, 2 Jul 2022 15:39:09 +0000 (08:39 -0700)
author Trenton Holmes <holmes.trenton@gmail.com>
Tue, 31 May 2022 16:33:09 +0000 (09:33 -0700)
committer Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Sat, 2 Jul 2022 15:39:09 +0000 (08:39 -0700)
diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py

new file mode 100644 (file)

index 0000000..c356394
--- /dev/null
+++ b/src/documents/management/commands/document_redo_ocr.py
@@ -0,0 +1,69 @@
+import logging
+import shutil
+from pathlib import Path
+from typing import Type
+
+from django.core.exceptions import ObjectDoesNotExist
+from django.core.management.base import BaseCommand
+from documents.models import Document
+from documents.parsers import DocumentParser
+from documents.parsers import get_parser_class_for_mime_type
+from documents.parsers import ParseError
+
+
+class Command(BaseCommand):
+    """Management command that re-runs parsing (OCR) for selected documents."""
+
+    help = "Re-runs OCR/parsing for the given documents and stores the new text."
+
+    def add_arguments(self, parser):
+        """Register the positional list of document primary keys."""
+        parser.add_argument(
+            "documents",
+            nargs="+",
+            help="Document primary keys for re-processing OCR on",
+        )
+
+    def handle(self, *args, **options):
+        """Re-parse each requested document and save the text to `content`.
+
+        Unknown keys, documents without a parser and parse failures are
+        reported on stdout and skipped; the remaining documents still run.
+        """
+        # Raise the first root log handler's threshold so only errors show
+        logging.getLogger().handlers[0].level = logging.ERROR
+
+        all_docs = Document.objects.all()
+
+        # Parsed arguments arrive in `options`; `args` is only a tuple
+        for doc_pk in options["documents"]:
+            self.stdout.write(f"Parsing document {doc_pk}")
+            try:
+                doc: Document = all_docs.get(pk=doc_pk)
+            except (ObjectDoesNotExist, ValueError):
+                # ValueError: the key could not be converted for the lookup
+                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
+                continue
+
+            # Get the correct parser for this mime type
+            parser_class = get_parser_class_for_mime_type(doc.mime_type)
+            if parser_class is None:  # presumably None when unsupported; confirm
+                msg = f"No parser found for mime type {doc.mime_type}"
+                self.stdout.write(self.style.ERROR(msg))
+                continue
+            document_parser: DocumentParser = parser_class("redo-ocr")
+
+            # Work on a copy inside the parser's tempdir, not on the original
+            temp_file = (Path(document_parser.tempdir) / "new-ocr-file").resolve()
+            shutil.copy(doc.source_path, temp_file)
+
+            try:
+                document_parser.parse(str(temp_file), doc.mime_type)
+                doc.content = document_parser.get_text()
+                doc.save()
+            except ParseError as e:
+                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
+            finally:
+                # Remove the working copy if it was created
+                if temp_file.is_file():
+                    temp_file.unlink()
author	Trenton Holmes <holmes.trenton@gmail.com>
	Tue, 31 May 2022 16:33:09 +0000 (09:33 -0700)
committer	Michael Shamoon <4887959+shamoon@users.noreply.github.com>
	Sat, 2 Jul 2022 15:39:09 +0000 (08:39 -0700)