def redo_ocr(doc_ids):
- async_task("documents.tasks.redo_ocr", document_ids=doc_ids)
+ for document_id in doc_ids:
+ async_task(
+ "documents.tasks.update_document_archive_file",
+ document_id=document_id,
+ )
return "OK"
-import hashlib
import logging
import multiprocessing
import os
-import shutil
-import uuid
import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
-from django.db import transaction
from documents.models import Document
-from filelock import FileLock
-
-from ... import index
-from ...file_handling import create_source_path_directory
-from ...file_handling import generate_unique_filename
-from ...parsers import get_parser_class_for_mime_type
+from documents.tasks import update_document_archive_file
logger = logging.getLogger("paperless.management.archiver")
-def handle_document(document_id):
- document = Document.objects.get(id=document_id)
-
- mime_type = document.mime_type
-
- parser_class = get_parser_class_for_mime_type(mime_type)
-
- if not parser_class:
- logger.error(
- f"No parser found for mime type {mime_type}, cannot "
- f"archive document {document} (ID: {document_id})",
- )
- return
-
- parser = parser_class(logging_group=uuid.uuid4())
-
- try:
- parser.parse(document.source_path, mime_type, document.get_public_filename())
-
- thumbnail = parser.get_thumbnail(
- document.source_path,
- mime_type,
- document.get_public_filename(),
- )
-
- if parser.get_archive_path():
- with transaction.atomic():
- with open(parser.get_archive_path(), "rb") as f:
- checksum = hashlib.md5(f.read()).hexdigest()
- # I'm going to save first so that in case the file move
- # fails, the database is rolled back.
- # We also don't use save() since that triggers the filehandling
- # logic, and we don't want that yet (file not yet in place)
- document.archive_filename = generate_unique_filename(
- document,
- archive_filename=True,
- )
- Document.objects.filter(pk=document.pk).update(
- archive_checksum=checksum,
- content=parser.get_text(),
- archive_filename=document.archive_filename,
- )
- with FileLock(settings.MEDIA_LOCK):
- create_source_path_directory(document.archive_path)
- shutil.move(parser.get_archive_path(), document.archive_path)
- shutil.move(thumbnail, document.thumbnail_path)
-
- with index.open_index_writer() as writer:
- index.update_document(writer, document)
-
- except Exception:
- logger.exception(
- f"Error while parsing document {document} " f"(ID: {document_id})",
- )
- finally:
- parser.cleanup()
-
-
class Command(BaseCommand):
help = """
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
list(
tqdm.tqdm(
- pool.imap_unordered(handle_document, document_ids),
+ pool.imap_unordered(update_document_archive_file, document_ids),
total=len(document_ids),
disable=options["no_progress_bar"],
),
+++ /dev/null
-import tqdm
-from django.core.management.base import BaseCommand
-from documents.tasks import redo_ocr
-
-
-class Command(BaseCommand):
-
- help = """
- This will rename all documents to match the latest filename format.
- """.replace(
- " ",
- "",
- )
-
- def add_arguments(self, parser):
-
- parser.add_argument(
- "--no-progress-bar",
- default=False,
- action="store_true",
- help="If set, the progress bar will not be shown",
- )
-
- parser.add_argument(
- "documents",
- nargs="+",
- help="Document primary keys for re-processing OCR on",
- )
-
- def handle(self, *args, **options):
- doc_pks = tqdm.tqdm(
- options["documents"],
- disable=options["no_progress_bar"],
- )
- redo_ocr(doc_pks)
+import hashlib
import logging
import os
import shutil
+import uuid
from pathlib import Path
from typing import Type
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
-from django.core.exceptions import ObjectDoesNotExist
+from django.db import transaction
from django.db.models.signals import post_save
from documents import barcodes
from documents import index
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
+from documents.file_handling import create_source_path_directory
+from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import ParseError
from documents.sanity_checker import SanityCheckFailedException
+from filelock import FileLock
from whoosh.writing import AsyncWriter
index.update_document(writer, doc)
-def redo_ocr(document_ids):
- all_docs = Document.objects.all()
+def update_document_archive_file(document_id):
+ """
+ Re-creates the archive file of a document, including new OCR content and thumbnail
+ """
+ document = Document.objects.get(id=document_id)
- for doc_pk in document_ids:
- try:
- logger.info(f"Parsing document {doc_pk}")
- doc: Document = all_docs.get(pk=doc_pk)
- except ObjectDoesNotExist:
- logger.error(f"Document {doc_pk} does not exist")
- continue
+ mime_type = document.mime_type
- # Get the correct parser for this mime type
- parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
- doc.mime_type,
+ parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
+
+ if not parser_class:
+ logger.error(
+ f"No parser found for mime type {mime_type}, cannot "
+ f"archive document {document} (ID: {document_id})",
)
- document_parser: DocumentParser = parser_class(
- "redo-ocr",
+ return
+
+ parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
+
+ try:
+ parser.parse(document.source_path, mime_type, document.get_public_filename())
+
+ thumbnail = parser.get_thumbnail(
+ document.source_path,
+ mime_type,
+ document.get_public_filename(),
)
- # Create a file path to copy the original file to for working on
- temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
+ if parser.get_archive_path():
+ with transaction.atomic():
+ with open(parser.get_archive_path(), "rb") as f:
+ checksum = hashlib.md5(f.read()).hexdigest()
+ # I'm going to save first so that in case the file move
+ # fails, the database is rolled back.
+ # We also don't use save() since that triggers the filehandling
+ # logic, and we don't want that yet (file not yet in place)
+ document.archive_filename = generate_unique_filename(
+ document,
+ archive_filename=True,
+ )
+ Document.objects.filter(pk=document.pk).update(
+ archive_checksum=checksum,
+ content=parser.get_text(),
+ archive_filename=document.archive_filename,
+ )
+ with FileLock(settings.MEDIA_LOCK):
+ create_source_path_directory(document.archive_path)
+ shutil.move(parser.get_archive_path(), document.archive_path)
+ shutil.move(thumbnail, document.thumbnail_path)
- shutil.copy(doc.source_path, temp_file)
+ with index.open_index_writer() as writer:
+ index.update_document(writer, document)
- try:
- logger.info(
- f"Using {type(document_parser).__name__} for document",
- )
- # Try to re-parse the document into text
- document_parser.parse(str(temp_file), doc.mime_type)
-
- doc.content = document_parser.get_text()
- doc.save()
- logger.info("Document OCR updated")
-
- except ParseError as e:
- logger.error(f"Error parsing document: {e}")
- finally:
- # Remove the file path if it was created
- if temp_file.exists() and temp_file.is_file():
- temp_file.unlink()
+ except Exception:
+ logger.exception(
+ f"Error while parsing document {document} " f"(ID: {document_id})",
+ )
+ finally:
+ parser.cleanup()
from django.test import override_settings
from django.test import TestCase
from documents.file_handling import generate_filename
-from documents.management.commands.document_archiver import handle_document
from documents.models import Document
+from documents.tasks import update_document_archive_file
from documents.tests.utils import DirectoriesMixin
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
)
- handle_document(doc.pk)
+ update_document_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
doc.save()
shutil.copy(sample_file, doc.source_path)
- handle_document(doc.pk)
+ update_document_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
os.path.join(self.dirs.originals_dir, f"document_01.pdf"),
)
- handle_document(doc2.pk)
- handle_document(doc1.pk)
+ update_document_archive_file(doc2.pk)
+ update_document_archive_file(doc1.pk)
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)