git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Change: change update content to handle archive disabled (#8315)
author: shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 20 Nov 2024 20:01:13 +0000 (12:01 -0800)
committer: GitHub <noreply@github.com>
Wed, 20 Nov 2024 20:01:13 +0000 (20:01 +0000)
src/documents/bulk_edit.py
src/documents/management/commands/document_archiver.py
src/documents/tasks.py
src/documents/tests/test_bulk_edit.py
src/documents/tests/test_management.py
src/documents/tests/test_tasks.py

index 4d03769c8e924575136f76e0b516aef970ed467f..83be5eea9626eb36bee27fe74874e0dbd78ef11e 100644 (file)
@@ -24,7 +24,7 @@ from documents.models import StoragePath
 from documents.permissions import set_permissions_for_object
 from documents.tasks import bulk_update_documents
 from documents.tasks import consume_file
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
 
 logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
 
@@ -191,7 +191,7 @@ def delete(doc_ids: list[int]) -> Literal["OK"]:
 
 def reprocess(doc_ids: list[int]) -> Literal["OK"]:
     for document_id in doc_ids:
-        update_document_archive_file.delay(
+        update_document_content_maybe_archive_file.delay(
             document_id=document_id,
         )
 
@@ -245,7 +245,7 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
                 doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
                 doc.save()
                 rotate_tasks.append(
-                    update_document_archive_file.s(
+                    update_document_content_maybe_archive_file.s(
                         document_id=doc.id,
                     ),
                 )
@@ -423,7 +423,7 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
             if doc.page_count is not None:
                 doc.page_count = doc.page_count - len(pages)
             doc.save()
-            update_document_archive_file.delay(document_id=doc.id)
+            update_document_content_maybe_archive_file.delay(document_id=doc.id)
             logger.info(f"Deleted pages {pages} from document {doc.id}")
     except Exception as e:
         logger.exception(f"Error deleting pages from document {doc.id}: {e}")
index 12677dd7973039d1bd9f9a63b559588218b01949..1aa52117a3a9571ca01f6410c64f15fc6739e738 100644 (file)
@@ -9,7 +9,7 @@ from django.core.management.base import BaseCommand
 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
 from documents.models import Document
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
 
 logger = logging.getLogger("paperless.management.archiver")
 
@@ -77,13 +77,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
 
             if self.process_count == 1:
                 for doc_id in document_ids:
-                    update_document_archive_file(doc_id)
+                    update_document_content_maybe_archive_file(doc_id)
             else:  # pragma: no cover
                 with multiprocessing.Pool(self.process_count) as pool:
                     list(
                         tqdm.tqdm(
                             pool.imap_unordered(
-                                update_document_archive_file,
+                                update_document_content_maybe_archive_file,
                                 document_ids,
                             ),
                             total=len(document_ids),
index 8f5ee51bc62252d32907cba7b5130c585461b5ec..e04cdb34e28f810df116ba624f501555abc901d0 100644 (file)
@@ -206,9 +206,10 @@ def bulk_update_documents(document_ids):
 
 
 @shared_task
-def update_document_archive_file(document_id):
+def update_document_content_maybe_archive_file(document_id):
     """
-    Re-creates the archive file of a document, including new OCR content and thumbnail
+    Re-creates OCR content and thumbnail for a document, and archive file if
+    it exists.
     """
     document = Document.objects.get(id=document_id)
 
@@ -234,8 +235,9 @@ def update_document_archive_file(document_id):
             document.get_public_filename(),
         )
 
-        if parser.get_archive_path():
-            with transaction.atomic():
+        with transaction.atomic():
+            oldDocument = Document.objects.get(pk=document.pk)
+            if parser.get_archive_path():
                 with open(parser.get_archive_path(), "rb") as f:
                     checksum = hashlib.md5(f.read()).hexdigest()
                 # I'm going to save first so that in case the file move
@@ -246,7 +248,6 @@ def update_document_archive_file(document_id):
                     document,
                     archive_filename=True,
                 )
-                oldDocument = Document.objects.get(pk=document.pk)
                 Document.objects.filter(pk=document.pk).update(
                     archive_checksum=checksum,
                     content=parser.get_text(),
@@ -268,24 +269,41 @@ def update_document_archive_file(document_id):
                             ],
                         },
                         additional_data={
-                            "reason": "Update document archive file",
+                            "reason": "Update document content",
                         },
                         action=LogEntry.Action.UPDATE,
                     )
+            else:
+                Document.objects.filter(pk=document.pk).update(
+                    content=parser.get_text(),
+                )
 
-                with FileLock(settings.MEDIA_LOCK):
+                if settings.AUDIT_LOG_ENABLED:
+                    LogEntry.objects.log_create(
+                        instance=oldDocument,
+                        changes={
+                            "content": [oldDocument.content, parser.get_text()],
+                        },
+                        additional_data={
+                            "reason": "Update document content",
+                        },
+                        action=LogEntry.Action.UPDATE,
+                    )
+
+            with FileLock(settings.MEDIA_LOCK):
+                if parser.get_archive_path():
                     create_source_path_directory(document.archive_path)
                     shutil.move(parser.get_archive_path(), document.archive_path)
-                    shutil.move(thumbnail, document.thumbnail_path)
+                shutil.move(thumbnail, document.thumbnail_path)
 
-            document.refresh_from_db()
-            logger.info(
-                f"Updating index for document {document_id} ({document.archive_checksum})",
-            )
-            with index.open_index_writer() as writer:
-                index.update_document(writer, document)
+        document.refresh_from_db()
+        logger.info(
+            f"Updating index for document {document_id} ({document.archive_checksum})",
+        )
+        with index.open_index_writer() as writer:
+            index.update_document(writer, document)
 
-            clear_document_caches(document.pk)
+        clear_document_caches(document.pk)
 
     except Exception:
         logger.exception(
index c6e846a772e60779da9430e76f9d2d95eebb70e0..bb5ebf04d27ded0c58d6b60a6ded4a0cdf18eeac 100644 (file)
@@ -607,7 +607,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         mock_consume_file.assert_not_called()
 
     @mock.patch("documents.tasks.bulk_update_documents.si")
-    @mock.patch("documents.tasks.update_document_archive_file.s")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
     @mock.patch("celery.chord.delay")
     def test_rotate(self, mock_chord, mock_update_document, mock_update_documents):
         """
@@ -626,7 +626,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         self.assertEqual(result, "OK")
 
     @mock.patch("documents.tasks.bulk_update_documents.si")
-    @mock.patch("documents.tasks.update_document_archive_file.s")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
     @mock.patch("pikepdf.Pdf.save")
     def test_rotate_with_error(
         self,
@@ -654,7 +654,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
             mock_update_archive_file.assert_not_called()
 
     @mock.patch("documents.tasks.bulk_update_documents.si")
-    @mock.patch("documents.tasks.update_document_archive_file.s")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
     @mock.patch("celery.chord.delay")
     def test_rotate_non_pdf(
         self,
@@ -680,7 +680,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
             mock_chord.assert_called_once()
             self.assertEqual(result, "OK")
 
-    @mock.patch("documents.tasks.update_document_archive_file.delay")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
     @mock.patch("pikepdf.Pdf.save")
     def test_delete_pages(self, mock_pdf_save, mock_update_archive_file):
         """
@@ -705,7 +705,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         self.doc2.refresh_from_db()
         self.assertEqual(self.doc2.page_count, expected_page_count)
 
-    @mock.patch("documents.tasks.update_document_archive_file.delay")
+    @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
     @mock.patch("pikepdf.Pdf.save")
     def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file):
         """
index 76a0a2c743a7291beedadffa324ea474ba4f1135..5340035e7274f7a1939a88192e67187ee1ed4f7e 100644 (file)
@@ -13,7 +13,7 @@ from django.test import override_settings
 
 from documents.file_handling import generate_filename
 from documents.models import Document
-from documents.tasks import update_document_archive_file
+from documents.tasks import update_document_content_maybe_archive_file
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import FileSystemAssertsMixin
 
@@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
         )
 
-        update_document_archive_file(doc.pk)
+        update_document_content_maybe_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         doc.save()
         shutil.copy(sample_file, doc.source_path)
 
-        update_document_archive_file(doc.pk)
+        update_document_content_maybe_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             os.path.join(self.dirs.originals_dir, "document_01.pdf"),
         )
 
-        update_document_archive_file(doc2.pk)
-        update_document_archive_file(doc1.pk)
+        update_document_content_maybe_archive_file(doc2.pk)
+        update_document_content_maybe_archive_file(doc1.pk)
 
         doc1 = Document.objects.get(id=doc1.id)
         doc2 = Document.objects.get(id=doc2.id)
index a1d21b5327f63d1e81cb6fd4c6aa78b9ef8c0a35..0f9f8511f01debcdebe1952ed345b002e2745800 100644 (file)
@@ -1,5 +1,7 @@
 import os
+import shutil
 from datetime import timedelta
+from pathlib import Path
 from unittest import mock
 
 from django.conf import settings
@@ -184,3 +186,75 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
 
         tasks.empty_trash()
         self.assertEqual(Document.global_objects.count(), 0)
+
+
+class TestUpdateContent(DirectoriesMixin, TestCase):
+    def test_update_content_maybe_archive_file(self):
+        """
+        GIVEN:
+            - Existing document with archive file
+        WHEN:
+            - Update content task is called
+        THEN:
+            - Document is reprocessed, content and checksum are updated
+        """
+        sample1 = self.dirs.scratch_dir / "sample.pdf"
+        shutil.copy(
+            Path(__file__).parent
+            / "samples"
+            / "documents"
+            / "originals"
+            / "0000001.pdf",
+            sample1,
+        )
+        sample1_archive = self.dirs.archive_dir / "sample_archive.pdf"
+        shutil.copy(
+            Path(__file__).parent
+            / "samples"
+            / "documents"
+            / "originals"
+            / "0000001.pdf",
+            sample1_archive,
+        )
+        doc = Document.objects.create(
+            title="test",
+            content="my document",
+            checksum="wow",
+            archive_checksum="wow",
+            filename=sample1,
+            mime_type="application/pdf",
+            archive_filename=sample1_archive,
+        )
+
+        tasks.update_document_content_maybe_archive_file(doc.pk)
+        self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")
+        self.assertNotEqual(Document.objects.get(pk=doc.pk).archive_checksum, "wow")
+
+    def test_update_content_maybe_archive_file_no_archive(self):
+        """
+        GIVEN:
+            - Existing document without archive file
+        WHEN:
+            - Update content task is called
+        THEN:
+            - Document is reprocessed, content is updated
+        """
+        sample1 = self.dirs.scratch_dir / "sample.pdf"
+        shutil.copy(
+            Path(__file__).parent
+            / "samples"
+            / "documents"
+            / "originals"
+            / "0000001.pdf",
+            sample1,
+        )
+        doc = Document.objects.create(
+            title="test",
+            content="my document",
+            checksum="wow",
+            filename=sample1,
+            mime_type="application/pdf",
+        )
+
+        tasks.update_document_content_maybe_archive_file(doc.pk)
+        self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")