Extract redo ocr to task

author Michael Shamoon <4887959+shamoon@users.noreply.github.com>

Wed, 22 Jun 2022 12:53:13 +0000 (05:53 -0700)

committer Michael Shamoon <4887959+shamoon@users.noreply.github.com>

Sat, 2 Jul 2022 15:39:09 +0000 (08:39 -0700)
author Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 22 Jun 2022 12:53:13 +0000 (05:53 -0700)
committer Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Sat, 2 Jul 2022 15:39:09 +0000 (08:39 -0700)
diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py

index 3ead5a4851cf46333ab7449c11b091f13fda417d..1e44e6134928199fd8f02351ddd4da35240be2c5 100644 (file)
--- a/src/documents/management/commands/document_redo_ocr.py
+++ b/src/documents/management/commands/document_redo_ocr.py
@@ -1,14 +1,6 @@
-import shutil
-from pathlib import Path
-from typing import Type
-
  import tqdm
-from django.core.exceptions import ObjectDoesNotExist
  from django.core.management.base import BaseCommand
-from documents.models import Document
-from documents.parsers import DocumentParser
-from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import ParseError
+from documents.tasks import redo_ocr
  
  
  class Command(BaseCommand):
@@ -36,47 +28,8 @@ class Command(BaseCommand):
          )
  
      def handle(self, *args, **options):
-
-        all_docs = Document.objects.all()
-
-        for doc_pk in tqdm.tqdm(
+        doc_pks = tqdm.tqdm(
              options["documents"],
              disable=options["no_progress_bar"],
-        ):
-            try:
-                self.stdout.write(f"Parsing document {doc_pk}")
-                doc: Document = all_docs.get(pk=doc_pk)
-            except ObjectDoesNotExist:
-                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
-                continue
-
-            # Get the correct parser for this mime type
-            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
-                doc.mime_type,
-            )
-            document_parser: DocumentParser = parser_class(
-                "redo-ocr",
-            )
-
-            # Create a file path to copy the original file to for working on
-            temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
-
-            shutil.copy(doc.source_path, temp_file)
-
-            try:
-                self.stdout.write(
-                    f"Using {type(document_parser).__name__} for document",
-                )
-                # Try to re-parse the document into text
-                document_parser.parse(str(temp_file), doc.mime_type)
-
-                doc.content = document_parser.get_text()
-                doc.save()
-                self.stdout.write("Document OCR updated")
-
-            except ParseError as e:
-                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
-            finally:
-                # Remove the file path if it was created
-                if temp_file.exists() and temp_file.is_file():
-                    temp_file.unlink()
+        )
+        redo_ocr(doc_pks)
diff --git a/src/documents/tasks.py b/src/documents/tasks.py

index 4c57b2eeef8ccb8dec27353f8c72f08fa5f5ebde..1070471ba2cb403b13d8578d39d71561747d3782 100644 (file)
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -2,13 +2,16 @@ import logging
  import os
  import shutil
  import tempfile
+from pathlib import Path
  from typing import List  # for type hinting. Can be removed, if only Python >3.8 is used
+from typing import Type
  
  import magic
  import tqdm
  from asgiref.sync import async_to_sync
  from channels.layers import get_channel_layer
  from django.conf import settings
+from django.core.exceptions import ObjectDoesNotExist
  from django.db.models.signals import post_save
  from documents import index
  from documents import sanity_checker
@@ -21,6 +24,9 @@ from documents.models import Document
  from documents.models import DocumentType
  from documents.models import StoragePath
  from documents.models import Tag
+from documents.parsers import DocumentParser
+from documents.parsers import get_parser_class_for_mime_type
+from documents.parsers import ParseError
  from documents.sanity_checker import SanityCheckFailedException
  from pdf2image import convert_from_path
  from pikepdf import Pdf
@@ -359,3 +365,46 @@ def bulk_update_documents(document_ids):
      with AsyncWriter(ix) as writer:
          for doc in documents:
              index.update_document(writer, doc)
+
+
+def redo_ocr(document_ids):
+    """
+    Re-run the parser on each document in document_ids and save the new text.
+
+    Unknown ids and ParseErrors are logged and skipped; other errors propagate.
+    """
+    all_docs = Document.objects.all()
+
+    for doc_pk in document_ids:
+        logger.info(f"Parsing document {doc_pk}")
+        try:
+            doc: Document = all_docs.get(pk=doc_pk)
+        except ObjectDoesNotExist:
+            logger.error(f"Document {doc_pk} does not exist")
+            continue
+
+        # Get the correct parser for this mime type
+        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
+            doc.mime_type,
+        )
+        if parser_class is None:
+            # NOTE(review): presumably None means no parser is registered - confirm
+            logger.error(f"No parser found for mime type {doc.mime_type}")
+            continue
+        document_parser: DocumentParser = parser_class("redo-ocr")
+
+        # Work on a copy inside the parser's tempdir, never on the original file
+        temp_file = (Path(document_parser.tempdir) / "new-ocr-file").resolve()
+        try:
+            # Copy inside the try so a partial copy is removed by the finally below
+            shutil.copy(doc.source_path, temp_file)
+            logger.info(f"Using {type(document_parser).__name__} for document")
+            document_parser.parse(str(temp_file), doc.mime_type)
+            doc.content = document_parser.get_text()
+            doc.save()
+            logger.info("Document OCR updated")
+        except ParseError as e:
+            logger.error(f"Error parsing document: {e}")
+        finally:
+            if temp_file.exists() and temp_file.is_file():
+                temp_file.unlink()
author	Michael Shamoon <4887959+shamoon@users.noreply.github.com>
	Wed, 22 Jun 2022 12:53:13 +0000 (05:53 -0700)
committer	Michael Shamoon <4887959+shamoon@users.noreply.github.com>
	Sat, 2 Jul 2022 15:39:09 +0000 (08:39 -0700)
src/documents/management/commands/document_redo_ocr.py		patch | blob | blame | history
src/documents/tasks.py		patch | blob | blame | history