Implements a better re-do of OCR by making the document archiver function common... (1451/head)
author    Trenton Holmes <holmes.trenton@gmail.com>
          Mon, 22 Aug 2022 01:20:59 +0000 (18:20 -0700)
committer Trenton Holmes <holmes.trenton@gmail.com>
          Mon, 22 Aug 2022 01:20:59 +0000 (18:20 -0700)
src/documents/bulk_edit.py
src/documents/management/commands/document_archiver.py
src/documents/management/commands/document_redo_ocr.py [deleted file]
src/documents/tasks.py
src/documents/tests/test_management.py

diff --git a/src/documents/bulk_edit.py b/src/documents/bulk_edit.py
index babd5f3b4da59b4f83e08d9a029275f4d144b1b9..0cf0daf3e25a7b4ec5d654142d5337c8aec6b7b5 100644
--- a/src/documents/bulk_edit.py
+++ b/src/documents/bulk_edit.py
@@ -122,6 +122,10 @@ def delete(doc_ids):
 
 def redo_ocr(doc_ids):
 
-    async_task("documents.tasks.redo_ocr", document_ids=doc_ids)
+    for document_id in doc_ids:
+        async_task(
+            "documents.tasks.update_document_archive_file",
+            document_id=document_id,
+        )
 
     return "OK"
diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py
index c51f1baeb468cc770448161b4e15a35fc40e77bf..fa78a1963cdcd35e990f8416f5a1130beb7c4311 100644
--- a/src/documents/management/commands/document_archiver.py
+++ b/src/documents/management/commands/document_archiver.py
@@ -1,85 +1,18 @@
-import hashlib
 import logging
 import multiprocessing
 import os
-import shutil
-import uuid
 
 import tqdm
 from django import db
 from django.conf import settings
 from django.core.management.base import BaseCommand
-from django.db import transaction
 from documents.models import Document
-from filelock import FileLock
-
-from ... import index
-from ...file_handling import create_source_path_directory
-from ...file_handling import generate_unique_filename
-from ...parsers import get_parser_class_for_mime_type
+from documents.tasks import update_document_archive_file
 
 
 logger = logging.getLogger("paperless.management.archiver")
 
 
-def handle_document(document_id):
-    document = Document.objects.get(id=document_id)
-
-    mime_type = document.mime_type
-
-    parser_class = get_parser_class_for_mime_type(mime_type)
-
-    if not parser_class:
-        logger.error(
-            f"No parser found for mime type {mime_type}, cannot "
-            f"archive document {document} (ID: {document_id})",
-        )
-        return
-
-    parser = parser_class(logging_group=uuid.uuid4())
-
-    try:
-        parser.parse(document.source_path, mime_type, document.get_public_filename())
-
-        thumbnail = parser.get_thumbnail(
-            document.source_path,
-            mime_type,
-            document.get_public_filename(),
-        )
-
-        if parser.get_archive_path():
-            with transaction.atomic():
-                with open(parser.get_archive_path(), "rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
-                # I'm going to save first so that in case the file move
-                # fails, the database is rolled back.
-                # We also don't use save() since that triggers the filehandling
-                # logic, and we don't want that yet (file not yet in place)
-                document.archive_filename = generate_unique_filename(
-                    document,
-                    archive_filename=True,
-                )
-                Document.objects.filter(pk=document.pk).update(
-                    archive_checksum=checksum,
-                    content=parser.get_text(),
-                    archive_filename=document.archive_filename,
-                )
-                with FileLock(settings.MEDIA_LOCK):
-                    create_source_path_directory(document.archive_path)
-                    shutil.move(parser.get_archive_path(), document.archive_path)
-                    shutil.move(thumbnail, document.thumbnail_path)
-
-            with index.open_index_writer() as writer:
-                index.update_document(writer, document)
-
-    except Exception:
-        logger.exception(
-            f"Error while parsing document {document} " f"(ID: {document_id})",
-        )
-    finally:
-        parser.cleanup()
-
-
 class Command(BaseCommand):
 
     help = """
@@ -146,7 +79,7 @@ class Command(BaseCommand):
             with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
                 list(
                     tqdm.tqdm(
-                        pool.imap_unordered(handle_document, document_ids),
+                        pool.imap_unordered(update_document_archive_file, document_ids),
                         total=len(document_ids),
                         disable=options["no_progress_bar"],
                     ),
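
The command-side change works because update_document_archive_file is now a plain module-level function, which multiprocessing can pickle; a condensed sketch of the pool pattern above, with a stand-in worker and placeholder IDs:

    # Sketch: imap_unordered needs a picklable, module-level callable
    # and yields results in completion order, not submission order.
    import multiprocessing

    import tqdm

    def work(document_id):
        # stand-in for update_document_archive_file
        pass

    if __name__ == "__main__":
        document_ids = [1, 2, 3]  # placeholder primary keys
        with multiprocessing.Pool(processes=4) as pool:
            # list() drains the iterator so tqdm renders progress
            # as each worker finishes a document.
            list(
                tqdm.tqdm(
                    pool.imap_unordered(work, document_ids),
                    total=len(document_ids),
                ),
            )
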
diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py
deleted file mode 100644
index 1e44e61..0000000
--- a/src/documents/management/commands/document_redo_ocr.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import tqdm
-from django.core.management.base import BaseCommand
-from documents.tasks import redo_ocr
-
-
-class Command(BaseCommand):
-
-    help = """
-        This will rename all documents to match the latest filename format.
-    """.replace(
-        "    ",
-        "",
-    )
-
-    def add_arguments(self, parser):
-
-        parser.add_argument(
-            "--no-progress-bar",
-            default=False,
-            action="store_true",
-            help="If set, the progress bar will not be shown",
-        )
-
-        parser.add_argument(
-            "documents",
-            nargs="+",
-            help="Document primary keys for re-processing OCR on",
-        )
-
-    def handle(self, *args, **options):
-        doc_pks = tqdm.tqdm(
-            options["documents"],
-            disable=options["no_progress_bar"],
-        )
-        redo_ocr(doc_pks)
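
With the standalone command removed, its behaviour maps onto the shared task; a hedged sketch of the synchronous equivalent, with hypothetical primary keys:

    # Sketch: redo OCR for a few documents by calling the shared task
    # directly (synchronously), as the archiver command now does.
    from documents.tasks import update_document_archive_file

    for pk in [42, 43]:  # hypothetical document primary keys
        update_document_archive_file(pk)
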
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 35404587d43c041012ebbc0695c20e8fffd90b64..b1793e760d703aa02858bb5f09c0ba6ad3c01587 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -1,6 +1,8 @@
+import hashlib
 import logging
 import os
 import shutil
+import uuid
 from pathlib import Path
 from typing import Type
 
@@ -8,7 +10,7 @@ import tqdm
 from asgiref.sync import async_to_sync
 from channels.layers import get_channel_layer
 from django.conf import settings
-from django.core.exceptions import ObjectDoesNotExist
+from django.db import transaction
 from django.db.models.signals import post_save
 from documents import barcodes
 from documents import index
@@ -17,6 +19,8 @@ from documents.classifier import DocumentClassifier
 from documents.classifier import load_classifier
 from documents.consumer import Consumer
 from documents.consumer import ConsumerError
+from documents.file_handling import create_source_path_directory
+from documents.file_handling import generate_unique_filename
 from documents.models import Correspondent
 from documents.models import Document
 from documents.models import DocumentType
@@ -24,8 +28,8 @@ from documents.models import StoragePath
 from documents.models import Tag
 from documents.parsers import DocumentParser
 from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import ParseError
 from documents.sanity_checker import SanityCheckFailedException
+from filelock import FileLock
 from whoosh.writing import AsyncWriter
 
 
@@ -213,44 +217,62 @@ def bulk_update_documents(document_ids):
             index.update_document(writer, doc)
 
 
-def redo_ocr(document_ids):
-    all_docs = Document.objects.all()
+def update_document_archive_file(document_id):
+    """
+    Re-creates the archive file of a document, including new OCR content and thumbnail
+    """
+    document = Document.objects.get(id=document_id)
 
-    for doc_pk in document_ids:
-        try:
-            logger.info(f"Parsing document {doc_pk}")
-            doc: Document = all_docs.get(pk=doc_pk)
-        except ObjectDoesNotExist:
-            logger.error(f"Document {doc_pk} does not exist")
-            continue
+    mime_type = document.mime_type
 
-        # Get the correct parser for this mime type
-        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
-            doc.mime_type,
+    parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
+
+    if not parser_class:
+        logger.error(
+            f"No parser found for mime type {mime_type}, cannot "
+            f"archive document {document} (ID: {document_id})",
         )
-        document_parser: DocumentParser = parser_class(
-            "redo-ocr",
+        return
+
+    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
+
+    try:
+        parser.parse(document.source_path, mime_type, document.get_public_filename())
+
+        thumbnail = parser.get_thumbnail(
+            document.source_path,
+            mime_type,
+            document.get_public_filename(),
         )
 
-        # Create a file path to copy the original file to for working on
-        temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
+        if parser.get_archive_path():
+            with transaction.atomic():
+                with open(parser.get_archive_path(), "rb") as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+                # I'm going to save first so that in case the file move
+                # fails, the database is rolled back.
+                # We also don't use save() since that triggers the filehandling
+                # logic, and we don't want that yet (file not yet in place)
+                document.archive_filename = generate_unique_filename(
+                    document,
+                    archive_filename=True,
+                )
+                Document.objects.filter(pk=document.pk).update(
+                    archive_checksum=checksum,
+                    content=parser.get_text(),
+                    archive_filename=document.archive_filename,
+                )
+                with FileLock(settings.MEDIA_LOCK):
+                    create_source_path_directory(document.archive_path)
+                    shutil.move(parser.get_archive_path(), document.archive_path)
+                    shutil.move(thumbnail, document.thumbnail_path)
 
-        shutil.copy(doc.source_path, temp_file)
+            with index.open_index_writer() as writer:
+                index.update_document(writer, document)
 
-        try:
-            logger.info(
-                f"Using {type(document_parser).__name__} for document",
-            )
-            # Try to re-parse the document into text
-            document_parser.parse(str(temp_file), doc.mime_type)
-
-            doc.content = document_parser.get_text()
-            doc.save()
-            logger.info("Document OCR updated")
-
-        except ParseError as e:
-            logger.error(f"Error parsing document: {e}")
-        finally:
-            # Remove the file path if it was created
-            if temp_file.exists() and temp_file.is_file():
-                temp_file.unlink()
+    except Exception:
+        logger.exception(
+            f"Error while parsing document {document} " f"(ID: {document_id})",
+        )
+    finally:
+        parser.cleanup()
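
The ordering inside the task is the load-bearing part: the database row is updated first, and the file moves happen under the media lock within the same transaction, so a failed move rolls the row back. A minimal sketch of that pattern, using the same helpers as the diff; the document object and temp path here are hypothetical:

    # Sketch of the DB-first, move-second commit pattern from the task.
    # MEDIA_LOCK is paperless's media lock setting; `document` and
    # `new_archive_path` are hypothetical.
    import hashlib
    import shutil

    from django.conf import settings
    from django.db import transaction
    from documents.models import Document
    from filelock import FileLock

    def commit_archive(document, new_archive_path, new_content):
        with open(new_archive_path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        with transaction.atomic():
            # Row first: if the move below raises, the transaction
            # rolls back and the document record stays consistent.
            Document.objects.filter(pk=document.pk).update(
                archive_checksum=checksum,
                content=new_content,
            )
            with FileLock(settings.MEDIA_LOCK):
                shutil.move(new_archive_path, document.archive_path)
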
diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py
index 76a5459b52f63cbbcf4e03fc463f9a5898593313..fe217676b182799edd0e19318081e6535ee9e4ac 100644
--- a/src/documents/tests/test_management.py
+++ b/src/documents/tests/test_management.py
@@ -10,8 +10,8 @@ from django.core.management import call_command
 from django.test import override_settings
 from django.test import TestCase
 from documents.file_handling import generate_filename
-from documents.management.commands.document_archiver import handle_document
 from documents.models import Document
+from documents.tasks import update_document_archive_file
 from documents.tests.utils import DirectoriesMixin
 
 
@@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
             os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
         )
 
-        handle_document(doc.pk)
+        update_document_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
         doc.save()
         shutil.copy(sample_file, doc.source_path)
 
-        handle_document(doc.pk)
+        update_document_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, TestCase):
             os.path.join(self.dirs.originals_dir, f"document_01.pdf"),
         )
 
-        handle_document(doc2.pk)
-        handle_document(doc1.pk)
+        update_document_archive_file(doc2.pk)
+        update_document_archive_file(doc1.pk)
 
         doc1 = Document.objects.get(id=doc1.id)
         doc2 = Document.objects.get(id=doc2.id)