git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Change: change update content to handle archive disabled (#8315)
author: shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 20 Nov 2024 20:01:13 +0000 (12:01 -0800)
committer: GitHub <noreply@github.com>
Wed, 20 Nov 2024 20:01:13 +0000 (20:01 +0000)
src/documents/bulk_edit.py
src/documents/management/commands/document_archiver.py
src/documents/tasks.py
src/documents/tests/test_bulk_edit.py
src/documents/tests/test_management.py
src/documents/tests/test_tasks.py

index 4d03769c8e924575136f76e0b516aef970ed467f..83be5eea9626eb36bee27fe74874e0dbd78ef11e 100644 (file)
@@ -24,7 +24,7 @@ from documents.models import StoragePath
 from documents.permissions import set_permissions_for_object
 from documents.tasks import bulk_update_documents
 from documents.tasks import consume_file
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
 
 logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
 
@@ -191,7 +191,7 @@ def delete(doc_ids: list[int]) -> Literal["OK"]:
 
 def reprocess(doc_ids: list[int]) -> Literal["OK"]:
     for document_id in doc_ids:
-        update_document_archive_file.delay(
+        update_document_content_maybe_archive_file.delay(
             document_id=document_id,
         )
 
@@ -245,7 +245,7 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
                 doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
                 doc.save()
                 rotate_tasks.append(
-                    update_document_archive_file.s(
+                    update_document_content_maybe_archive_file.s(
                         document_id=doc.id,
                     ),
                 )
@@ -423,7 +423,7 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
             if doc.page_count is not None:
                 doc.page_count = doc.page_count - len(pages)
             doc.save()
-            update_document_archive_file.delay(document_id=doc.id)
+            update_document_content_maybe_archive_file.delay(document_id=doc.id)
             logger.info(f"Deleted pages {pages} from document {doc.id}")
     except Exception as e:
         logger.exception(f"Error deleting pages from document {doc.id}: {e}")
index 12677dd7973039d1bd9f9a63b559588218b01949..1aa52117a3a9571ca01f6410c64f15fc6739e738 100644 (file)
@@ -9,7 +9,7 @@ from django.core.management.base import BaseCommand
 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
 from documents.models import Document
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
 
 logger = logging.getLogger("paperless.management.archiver")
 
@@ -77,13 +77,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
 
             if self.process_count == 1:
                 for doc_id in document_ids:
-                    update_document_archive_file(doc_id)
+                    update_document_content_maybe_archive_file(doc_id)
             else:  # pragma: no cover
                 with multiprocessing.Pool(self.process_count) as pool:
                     list(
                         tqdm.tqdm(
                             pool.imap_unordered(
-                                update_document_archive_file,
+                                update_document_content_maybe_archive_file,
                                 document_ids,
                             ),
                             total=len(document_ids),
index 8f5ee51bc62252d32907cba7b5130c585461b5ec..e04cdb34e28f810df116ba624f501555abc901d0 100644 (file)
@@ -206,9 +206,10 @@ def bulk_update_documents(document_ids):
 
 
 @shared_task
-def update_document_archive_file(document_id):
+def update_document_content_maybe_archive_file(document_id):
     """
-    Re-creates the archive file of a document, including new OCR content and thumbnail
+    Re-creates OCR content and thumbnail for a document, and archive file if
+    it exists.
     """
     document = Document.objects.get(id=document_id)
 
@@ -234,8 +235,9 @@ def update_document_archive_file(document_id):
             document.get_public_filename(),
         )
 
-        if parser.get_archive_path():
-            with transaction.atomic():
+        with transaction.atomic():
+            oldDocument = Document.objects.get(pk=document.pk)
+            if parser.get_archive_path():
                 with open(parser.get_archive_path(), "rb") as f:
                     checksum = hashlib.md5(f.read()).hexdigest()
                 # I'm going to save first so that in case the file move
@@ -246,7 +248,6 @@ def update_document_archive_file(document_id):
                     document,
                     archive_filename=True,
                 )
-                oldDocument = Document.objects.get(pk=document.pk)
                 Document.objects.filter(pk=document.pk).update(
                     archive_checksum=checksum,
                     content=parser.get_text(),
@@ -268,24 +269,41 @@ def update_document_archive_file(document_id):
                             ],
                         },
                         additional_data={
-                            "reason": "Update document archive file",
+                            "reason": "Update document content",
                         },
                         action=LogEntry.Action.UPDATE,
                     )
+            else:
+                Document.objects.filter(pk=document.pk).update(
+                    content=parser.get_text(),
+                )
 
-                with FileLock(settings.MEDIA_LOCK):
+                if settings.AUDIT_LOG_ENABLED:
+                    LogEntry.objects.log_create(
+                        instance=oldDocument,
+                        changes={
+                            "content": [oldDocument.content, parser.get_text()],
+                        },
+                        additional_data={
+                            "reason": "Update document content",
+                        },
+                        action=LogEntry.Action.UPDATE,
+                    )
+
+            with FileLock(settings.MEDIA_LOCK):
+                if parser.get_archive_path():
                     create_source_path_directory(document.archive_path)
                     shutil.move(parser.get_archive_path(), document.archive_path)
-                    shutil.move(thumbnail, document.thumbnail_path)
+                shutil.move(thumbnail, document.thumbnail_path)
 
-            document.refresh_from_db()
-            logger.info(
-                f"Updating index for document {document_id} ({document.archive_checksum})",
-            )
-            with index.open_index_writer() as writer:
-                index.update_document(writer, document)
+        document.refresh_from_db()
+        logger.info(
+            f"Updating index for document {document_id} ({document.archive_checksum})",
+        )
+        with index.open_index_writer() as writer:
+            index.update_document(writer, document)
 
-            clear_document_caches(document.pk)
+        clear_document_caches(document.pk)
 
     except Exception:
         logger.exception(
index c6e846a772e60779da9430e76f9d2d95eebb70e0..bb5ebf04d27ded0c58d6b60a6ded4a0cdf18eeac 100644 (file)
@@ -607,7 +607,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         mock_consume_file.assert_not_called()
 
     @mock.patch("documents.tasks.bulk_update_documents.si")
-    @mock.patch("documents.tasks.update_document_archive_file.s")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
     @mock.patch("celery.chord.delay")
     def test_rotate(self, mock_chord, mock_update_document, mock_update_documents):
         """
@@ -626,7 +626,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         self.assertEqual(result, "OK")
 
     @mock.patch("documents.tasks.bulk_update_documents.si")
-    @mock.patch("documents.tasks.update_document_archive_file.s")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
     @mock.patch("pikepdf.Pdf.save")
     def test_rotate_with_error(
         self,
@@ -654,7 +654,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
             mock_update_archive_file.assert_not_called()
 
     @mock.patch("documents.tasks.bulk_update_documents.si")
-    @mock.patch("documents.tasks.update_document_archive_file.s")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
     @mock.patch("celery.chord.delay")
     def test_rotate_non_pdf(
         self,
@@ -680,7 +680,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
             mock_chord.assert_called_once()
             self.assertEqual(result, "OK")
 
-    @mock.patch("documents.tasks.update_document_archive_file.delay")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
     @mock.patch("pikepdf.Pdf.save")
     def test_delete_pages(self, mock_pdf_save, mock_update_archive_file):
         """
@@ -705,7 +705,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         self.doc2.refresh_from_db()
         self.assertEqual(self.doc2.page_count, expected_page_count)
 
-    @mock.patch("documents.tasks.update_document_archive_file.delay")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
     @mock.patch("pikepdf.Pdf.save")
     def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file):
         """
index 76a0a2c743a7291beedadffa324ea474ba4f1135..5340035e7274f7a1939a88192e67187ee1ed4f7e 100644 (file)
@@ -13,7 +13,7 @@ from django.test import override_settings
 
 from documents.file_handling import generate_filename
 from documents.models import Document
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import FileSystemAssertsMixin
 
@@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
         )
 
-        update_document_archive_file(doc.pk)
+        update_document_content_maybe_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         doc.save()
         shutil.copy(sample_file, doc.source_path)
 
-        update_document_archive_file(doc.pk)
+        update_document_content_maybe_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             os.path.join(self.dirs.originals_dir, "document_01.pdf"),
         )
 
-        update_document_archive_file(doc2.pk)
-        update_document_archive_file(doc1.pk)
+        update_document_content_maybe_archive_file(doc2.pk)
+        update_document_content_maybe_archive_file(doc1.pk)
 
         doc1 = Document.objects.get(id=doc1.id)
         doc2 = Document.objects.get(id=doc2.id)
index a1d21b5327f63d1e81cb6fd4c6aa78b9ef8c0a35..0f9f8511f01debcdebe1952ed345b002e2745800 100644 (file)
@@ -1,5 +1,7 @@
 import os
+import shutil
 from datetime import timedelta
+from pathlib import Path
 from unittest import mock
 
 from django.conf import settings
@@ -184,3 +186,75 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
 
         tasks.empty_trash()
         self.assertEqual(Document.global_objects.count(), 0)
+
+
+class TestUpdateContent(DirectoriesMixin, TestCase):
+    def test_update_content_maybe_archive_file(self):
+        """
+        GIVEN:
+            - Existing document with archive file
+        WHEN:
+            - Update content task is called
+        THEN:
+            - Document is reprocessed, content and checksum are updated
+        """
+        sample1 = self.dirs.scratch_dir / "sample.pdf"
+        shutil.copy(
+            Path(__file__).parent
+            / "samples"
+            / "documents"
+            / "originals"
+            / "0000001.pdf",
+            sample1,
+        )
+        sample1_archive = self.dirs.archive_dir / "sample_archive.pdf"
+        shutil.copy(
+            Path(__file__).parent
+            / "samples"
+            / "documents"
+            / "originals"
+            / "0000001.pdf",
+            sample1_archive,
+        )
+        doc = Document.objects.create(
+            title="test",
+            content="my document",
+            checksum="wow",
+            archive_checksum="wow",
+            filename=sample1,
+            mime_type="application/pdf",
+            archive_filename=sample1_archive,
+        )
+
+        tasks.update_document_content_maybe_archive_file(doc.pk)
+        self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")
+        self.assertNotEqual(Document.objects.get(pk=doc.pk).archive_checksum, "wow")
+
+    def test_update_content_maybe_archive_file_no_archive(self):
+        """
+        GIVEN:
+            - Existing document without archive file
+        WHEN:
+            - Update content task is called
+        THEN:
+            - Document is reprocessed, content is updated
+        """
+        sample1 = self.dirs.scratch_dir / "sample.pdf"
+        shutil.copy(
+            Path(__file__).parent
+            / "samples"
+            / "documents"
+            / "originals"
+            / "0000001.pdf",
+            sample1,
+        )
+        doc = Document.objects.create(
+            title="test",
+            content="my document",
+            checksum="wow",
+            filename=sample1,
+            mime_type="application/pdf",
+        )
+
+        tasks.update_document_content_maybe_archive_file(doc.pk)
+        self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")