git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Extract redo ocr to task
author Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 22 Jun 2022 12:53:13 +0000 (05:53 -0700)
committer Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Sat, 2 Jul 2022 15:39:09 +0000 (08:39 -0700)
src/documents/management/commands/document_redo_ocr.py
src/documents/tasks.py

index 3ead5a4851cf46333ab7449c11b091f13fda417d..1e44e6134928199fd8f02351ddd4da35240be2c5 100644 (file)
@@ -1,14 +1,6 @@
-import shutil
-from pathlib import Path
-from typing import Type
-
 import tqdm
-from django.core.exceptions import ObjectDoesNotExist
 from django.core.management.base import BaseCommand
-from documents.models import Document
-from documents.parsers import DocumentParser
-from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import ParseError
+from documents.tasks import redo_ocr
 
 
 class Command(BaseCommand):
@@ -36,47 +28,8 @@ class Command(BaseCommand):
         )
 
     def handle(self, *args, **options):
-
-        all_docs = Document.objects.all()
-
-        for doc_pk in tqdm.tqdm(
+        # Wrap the requested document primary keys in a tqdm progress bar
+        # (disabled when the "no_progress_bar" option is set).  The wrapper
+        # is handed over as a plain iterable; redo_ocr loops over it once,
+        # so the bar advances as each document is processed.
+        doc_pks = tqdm.tqdm(
             options["documents"],
             disable=options["no_progress_bar"],
-        ):
-            try:
-                self.stdout.write(f"Parsing document {doc_pk}")
-                doc: Document = all_docs.get(pk=doc_pk)
-            except ObjectDoesNotExist:
-                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
-                continue
-
-            # Get the correct parser for this mime type
-            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
-                doc.mime_type,
-            )
-            document_parser: DocumentParser = parser_class(
-                "redo-ocr",
-            )
-
-            # Create a file path to copy the original file to for working on
-            temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
-
-            shutil.copy(doc.source_path, temp_file)
-
-            try:
-                self.stdout.write(
-                    f"Using {type(document_parser).__name__} for document",
-                )
-                # Try to re-parse the document into text
-                document_parser.parse(str(temp_file), doc.mime_type)
-
-                doc.content = document_parser.get_text()
-                doc.save()
-                self.stdout.write("Document OCR updated")
-
-            except ParseError as e:
-                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
-            finally:
-                # Remove the file path if it was created
-                if temp_file.exists() and temp_file.is_file():
-                    temp_file.unlink()
+        )
+        # All per-document work (lookup, parsing, saving, logging) now lives
+        # in the shared task function instead of this command.
+        redo_ocr(doc_pks)
index 4c57b2eeef8ccb8dec27353f8c72f08fa5f5ebde..1070471ba2cb403b13d8578d39d71561747d3782 100644 (file)
@@ -2,13 +2,16 @@ import logging
 import os
 import shutil
 import tempfile
+from pathlib import Path
 from typing import List  # for type hinting. Can be removed, if only Python >3.8 is used
+from typing import Type
 
 import magic
 import tqdm
 from asgiref.sync import async_to_sync
 from channels.layers import get_channel_layer
 from django.conf import settings
+from django.core.exceptions import ObjectDoesNotExist
 from django.db.models.signals import post_save
 from documents import index
 from documents import sanity_checker
@@ -21,6 +24,9 @@ from documents.models import Document
 from documents.models import DocumentType
 from documents.models import StoragePath
 from documents.models import Tag
+from documents.parsers import DocumentParser
+from documents.parsers import get_parser_class_for_mime_type
+from documents.parsers import ParseError
 from documents.sanity_checker import SanityCheckFailedException
 from pdf2image import convert_from_path
 from pikepdf import Pdf
@@ -359,3 +365,46 @@ def bulk_update_documents(document_ids):
     with AsyncWriter(ix) as writer:
         for doc in documents:
             index.update_document(writer, doc)
+
+
+def redo_ocr(document_ids):
+    """
+    Re-run the document parser for each given document and store the new text.
+
+    For every primary key in ``document_ids`` the document's source file is
+    copied into the parser's temporary directory, parsed again, and the
+    extracted text is written to ``Document.content`` and saved.  Unknown
+    primary keys, documents without a usable parser, and ``ParseError``
+    failures are logged and skipped so the remaining documents are still
+    processed.
+
+    :param document_ids: iterable of document primary keys; it is iterated
+        exactly once, so a progress-bar wrapper such as ``tqdm`` may be
+        passed.
+    :return: None
+    """
+    all_docs = Document.objects.all()
+
+    for doc_pk in document_ids:
+        logger.info(f"Parsing document {doc_pk}")
+        try:
+            # Keep the try body limited to the lookup that can actually raise
+            doc: Document = all_docs.get(pk=doc_pk)
+        except ObjectDoesNotExist:
+            logger.error(f"Document {doc_pk} does not exist")
+            continue
+
+        # Get the correct parser for this mime type
+        parser_class = get_parser_class_for_mime_type(
+            doc.mime_type,
+        )
+        # NOTE(review): the lookup is assumed to return None when no parser
+        # is registered for the mime type - confirm against documents.parsers.
+        # Calling None would raise TypeError and abort the whole batch.
+        if parser_class is None:
+            logger.error(
+                f"No parser available for mime type {doc.mime_type} "
+                f"of document {doc_pk}",
+            )
+            continue
+
+        document_parser: DocumentParser = parser_class(
+            "redo-ocr",
+        )
+
+        # The parser works on a copy placed in its own temp dir, not on the
+        # document's source file.
+        # NOTE(review): only the copied file is removed below; the parser's
+        # temp dir itself is left to the parser - confirm whether an explicit
+        # parser cleanup call is needed here.
+        temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
+
+        try:
+            # Copy inside the try so that a failed or partial copy is still
+            # cleaned up by the finally clause; copy errors keep propagating.
+            shutil.copy(doc.source_path, temp_file)
+
+            logger.info(
+                f"Using {type(document_parser).__name__} for document",
+            )
+            # Try to re-parse the document into text
+            document_parser.parse(str(temp_file), doc.mime_type)
+
+            doc.content = document_parser.get_text()
+            doc.save()
+            logger.info("Document OCR updated")
+
+        except ParseError as e:
+            logger.error(f"Error parsing document: {e}")
+        finally:
+            # Remove the working copy if it was created
+            if temp_file.exists() and temp_file.is_file():
+                temp_file.unlink()