from documents.permissions import set_permissions_for_object
from documents.tasks import bulk_update_documents
from documents.tasks import consume_file
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
def reprocess(doc_ids: list[int]) -> Literal["OK"]:
for document_id in doc_ids:
- update_document_archive_file.delay(
+ update_document_content_maybe_archive_file.delay(
document_id=document_id,
)
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
doc.save()
rotate_tasks.append(
- update_document_archive_file.s(
+ update_document_content_maybe_archive_file.s(
document_id=doc.id,
),
)
if doc.page_count is not None:
doc.page_count = doc.page_count - len(pages)
doc.save()
- update_document_archive_file.delay(document_id=doc.id)
+ update_document_content_maybe_archive_file.delay(document_id=doc.id)
logger.info(f"Deleted pages {pages} from document {doc.id}")
except Exception as e:
logger.exception(f"Error deleting pages from document {doc.id}: {e}")
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
logger = logging.getLogger("paperless.management.archiver")
if self.process_count == 1:
for doc_id in document_ids:
- update_document_archive_file(doc_id)
+ update_document_content_maybe_archive_file(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(
- update_document_archive_file,
+ update_document_content_maybe_archive_file,
document_ids,
),
total=len(document_ids),
@shared_task
-def update_document_archive_file(document_id):
+def update_document_content_maybe_archive_file(document_id):
"""
- Re-creates the archive file of a document, including new OCR content and thumbnail
+ Re-creates the OCR content and thumbnail for a document, and the archive
+ file if one exists.
"""
document = Document.objects.get(id=document_id)
document.get_public_filename(),
)
- if parser.get_archive_path():
- with transaction.atomic():
+ with transaction.atomic():
+ oldDocument = Document.objects.get(pk=document.pk)
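+ # Keep a snapshot of the document so the audit log can record old vs. new values.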
+ if parser.get_archive_path():
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
document,
archive_filename=True,
)
- oldDocument = Document.objects.get(pk=document.pk)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
],
},
additional_data={
- "reason": "Update document archive file",
+ "reason": "Update document content",
},
action=LogEntry.Action.UPDATE,
)
+ else:
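+ # No archive file was produced; only the extracted text content is updated.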
+ Document.objects.filter(pk=document.pk).update(
+ content=parser.get_text(),
+ )
- with FileLock(settings.MEDIA_LOCK):
+ if settings.AUDIT_LOG_ENABLED:
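+ # Record the old and new text content in the audit log.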
+ LogEntry.objects.log_create(
+ instance=oldDocument,
+ changes={
+ "content": [oldDocument.content, parser.get_text()],
+ },
+ additional_data={
+ "reason": "Update document content",
+ },
+ action=LogEntry.Action.UPDATE,
+ )
+
+ with FileLock(settings.MEDIA_LOCK):
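+ # Move the newly generated files into place while holding the media lock.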
+ if parser.get_archive_path():
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
- shutil.move(thumbnail, document.thumbnail_path)
+ shutil.move(thumbnail, document.thumbnail_path)
- document.refresh_from_db()
- logger.info(
- f"Updating index for document {document_id} ({document.archive_checksum})",
- )
- with index.open_index_writer() as writer:
- index.update_document(writer, document)
+ document.refresh_from_db()
+ logger.info(
+ f"Updating index for document {document_id} ({document.archive_checksum})",
+ )
+ with index.open_index_writer() as writer:
+ index.update_document(writer, document)
- clear_document_caches(document.pk)
+ clear_document_caches(document.pk)
except Exception:
logger.exception(
mock_consume_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.si")
- @mock.patch("documents.tasks.update_document_archive_file.s")
+ @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("celery.chord.delay")
def test_rotate(self, mock_chord, mock_update_document, mock_update_documents):
"""
self.assertEqual(result, "OK")
@mock.patch("documents.tasks.bulk_update_documents.si")
- @mock.patch("documents.tasks.update_document_archive_file.s")
+ @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("pikepdf.Pdf.save")
def test_rotate_with_error(
self,
mock_update_archive_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.si")
- @mock.patch("documents.tasks.update_document_archive_file.s")
+ @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("celery.chord.delay")
def test_rotate_non_pdf(
self,
mock_chord.assert_called_once()
self.assertEqual(result, "OK")
- @mock.patch("documents.tasks.update_document_archive_file.delay")
+ @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_delete_pages(self, mock_pdf_save, mock_update_archive_file):
"""
self.doc2.refresh_from_db()
self.assertEqual(self.doc2.page_count, expected_page_count)
- @mock.patch("documents.tasks.update_document_archive_file.delay")
+ @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file):
"""
from documents.file_handling import generate_filename
from documents.models import Document
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
)
- update_document_archive_file(doc.pk)
+ update_document_content_maybe_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
doc.save()
shutil.copy(sample_file, doc.source_path)
- update_document_archive_file(doc.pk)
+ update_document_content_maybe_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
os.path.join(self.dirs.originals_dir, "document_01.pdf"),
)
- update_document_archive_file(doc2.pk)
- update_document_archive_file(doc1.pk)
+ update_document_content_maybe_archive_file(doc2.pk)
+ update_document_content_maybe_archive_file(doc1.pk)
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)
import os
+import shutil
from datetime import timedelta
+from pathlib import Path
from unittest import mock
from django.conf import settings
tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 0)
+
+
+class TestUpdateContent(DirectoriesMixin, TestCase):
+ def test_update_content_maybe_archive_file(self):
+ """
+ GIVEN:
+ - Existing document with archive file
+ WHEN:
+ - Update content task is called
+ THEN:
+ - Document is reprocessed, content and checksum are updated
+ """
+ sample1 = self.dirs.scratch_dir / "sample.pdf"
+ shutil.copy(
+ Path(__file__).parent
+ / "samples"
+ / "documents"
+ / "originals"
+ / "0000001.pdf",
+ sample1,
+ )
+ sample1_archive = self.dirs.archive_dir / "sample_archive.pdf"
+ shutil.copy(
+ Path(__file__).parent
+ / "samples"
+ / "documents"
+ / "originals"
+ / "0000001.pdf",
+ sample1_archive,
+ )
+ doc = Document.objects.create(
+ title="test",
+ content="my document",
+ checksum="wow",
+ archive_checksum="wow",
+ filename=sample1,
+ mime_type="application/pdf",
+ archive_filename=sample1_archive,
+ )
+
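+ # Run the task synchronously; content and archive checksum should both change.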
+ tasks.update_document_content_maybe_archive_file(doc.pk)
+ self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "my document")
+ self.assertNotEqual(Document.objects.get(pk=doc.pk).archive_checksum, "wow")
+
+ def test_update_content_maybe_archive_file_no_archive(self):
+ """
+ GIVEN:
+ - Existing document without archive file
+ WHEN:
+ - Update content task is called
+ THEN:
+ - Document is reprocessed, content is updated
+ """
+ sample1 = self.dirs.scratch_dir / "sample.pdf"
+ shutil.copy(
+ Path(__file__).parent
+ / "samples"
+ / "documents"
+ / "originals"
+ / "0000001.pdf",
+ sample1,
+ )
+ doc = Document.objects.create(
+ title="test",
+ content="my document",
+ checksum="wow",
+ filename=sample1,
+ mime_type="application/pdf",
+ )
+
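+ # Run the task synchronously; only the content is re-extracted.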
+ tasks.update_document_content_maybe_archive_file(doc.pk)
+ self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "my document")