more testing of the migration

author jonaswinkler <jonas.winkler@jpwinkler.de>

Wed, 10 Feb 2021 15:58:55 +0000 (16:58 +0100)

committer jonaswinkler <jonas.winkler@jpwinkler.de>

Wed, 10 Feb 2021 15:58:55 +0000 (16:58 +0100)
author jonaswinkler <jonas.winkler@jpwinkler.de>
Wed, 10 Feb 2021 15:58:55 +0000 (16:58 +0100)
committer jonaswinkler <jonas.winkler@jpwinkler.de>
Wed, 10 Feb 2021 15:58:55 +0000 (16:58 +0100)
diff --git a/src/documents/migrations/1012_fix_archive_files.py b/src/documents/migrations/1012_fix_archive_files.py

index 4fa258b8a03ec04ad5ceaae29940f7a6725f7e8f..5f5064396b1b8ba976f4fda59d0840e6a40843b4 100644 (file)
--- a/src/documents/migrations/1012_fix_archive_files.py
+++ b/src/documents/migrations/1012_fix_archive_files.py
@@ -160,7 +160,12 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
  ###############################################################################
  
  
-def create_archive_version(doc, retry_count=4):
+def parse_wrapper(parser, path, mime_type, file_name):
+    # this is here so that I can mock this out for testing.
+    parser.parse(path, mime_type, file_name)
+
+
+def create_archive_version(doc, retry_count=3):
      from documents.parsers import get_parser_class_for_mime_type, \
          DocumentParser, \
          ParseError
@@ -172,8 +177,8 @@ def create_archive_version(doc, retry_count=4):
      for try_num in range(retry_count):
          parser: DocumentParser = parser_class(None, None)
          try:
-            parser.parse(source_path(doc), doc.mime_type,
-                         os.path.basename(doc.filename))
+            parse_wrapper(parser, source_path(doc), doc.mime_type,
+                          os.path.basename(doc.filename))
              doc.content = parser.get_text()
  
              if parser.get_archive_path() and os.path.isfile(
@@ -225,25 +230,28 @@ def move_old_to_new_locations(apps, schema_editor):
      for doc in Document.objects.filter(archive_checksum__isnull=False):
          old_path = archive_path_old(doc)
  
-        if not os.path.isfile(old_path):
-            raise ValueError(
-                f"Archived document ID:{doc.id} does not exist at: "
-                f"{old_path}")
-
          if old_path in old_archive_path_to_id:
              affected_document_ids.add(doc.id)
              affected_document_ids.add(old_archive_path_to_id[old_path])
          else:
              old_archive_path_to_id[old_path] = doc.id
  
-    # check that we can regenerate these archive versions
+    # check that archive files of all unaffected documents are in place
+    for doc in Document.objects.filter(archive_checksum__isnull=False):
+        old_path = archive_path_old(doc)
+        if doc.id not in affected_document_ids and not os.path.isfile(old_path):
+            raise ValueError(
+                f"Archived document ID:{doc.id} does not exist at: "
+                f"{old_path}")
+
+    # check that we can regenerate affected archive versions
      for doc_id in affected_document_ids:
          from documents.parsers import get_parser_class_for_mime_type
  
          doc = Document.objects.get(id=doc_id)
          parser_class = get_parser_class_for_mime_type(doc.mime_type)
          if not parser_class:
-            raise Exception(
+            raise ValueError(
                  f"Document ID:{doc.id} has an invalid archived document, "
                  f"but no parsers are available. Cannot migrate.")
  
@@ -253,6 +261,9 @@ def move_old_to_new_locations(apps, schema_editor):
              old_path = archive_path_old(doc)
              # remove affected archive versions
              if os.path.isfile(old_path):
+                logger.debug(
+                    f"Removing {old_path}"
+                )
                  os.unlink(old_path)
          else:
              # Set archive path for unaffected files
@@ -267,8 +278,6 @@ def move_old_to_new_locations(apps, schema_editor):
          create_archive_version(doc)
  
  
-
-
  def move_new_to_old_locations(apps, schema_editor):
      Document = apps.get_model("documents", "Document")
  
diff --git a/src/documents/tests/test_migration_archive_files.py b/src/documents/tests/test_migration_archive_files.py

index 637aaeee0a2d4c8682f0e7c7c0ff7f31c296fbfa..b3f2e916c52b2acd207e6fdb715fc2dcbc71da26 100644 (file)
--- a/src/documents/tests/test_migration_archive_files.py
+++ b/src/documents/tests/test_migration_archive_files.py
@@ -2,10 +2,12 @@ import hashlib
  import os
  import shutil
  from pathlib import Path
+from unittest import mock
  
  from django.conf import settings
  from django.test import override_settings
  
+from documents.parsers import ParseError
  from documents.tests.utils import DirectoriesMixin, TestMigrations
  
  
@@ -169,6 +171,11 @@ class TestMigrateArchiveFilesWithFilenameFormat(TestMigrateArchiveFiles):
          self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
  
  
+def fake_parse_wrapper(parser, path, mime_type, file_name):
+    parser.archive_path = None
+    parser.text = "the text"
+
+
  @override_settings(PAPERLESS_FILENAME_FORMAT="")
  class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
  
@@ -185,6 +192,73 @@ class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
  
          self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration)
  
+    def test_parser_missing(self):
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = make_test_document(Document, "document", "invalid/typesss768", simple_png, "document.png", simple_pdf)
+        doc2 = make_test_document(Document, "document", "invalid/typesss768", simple_jpg, "document.jpg", simple_pdf)
+
+        self.assertRaisesMessage(ValueError, "no parsers are available", self.performMigration)
+
+    @mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
+    def test_parser_error(self, m):
+        m.side_effect = ParseError()
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
+        doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
+
+        self.assertIsNotNone(doc1.archive_checksum)
+        self.assertIsNotNone(doc2.archive_checksum)
+
+        with self.assertLogs() as capture:
+            self.performMigration()
+
+        self.assertEqual(m.call_count, 6)
+
+        self.assertEqual(
+            len(list(filter(lambda log: "Parse error, will try again in 5 seconds" in log, capture.output))),
+            4)
+
+        self.assertEqual(
+            len(list(filter(lambda log: "Unable to regenerate archive document for ID:" in log, capture.output))),
+            2)
+
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = Document.objects.get(id=doc1.id)
+        doc2 = Document.objects.get(id=doc2.id)
+
+        self.assertIsNone(doc1.archive_checksum)
+        self.assertIsNone(doc2.archive_checksum)
+        self.assertIsNone(doc1.archive_filename)
+        self.assertIsNone(doc2.archive_filename)
+
+    @mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
+    def test_parser_no_archive(self, m):
+        m.side_effect = fake_parse_wrapper
+
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
+        doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
+
+        with self.assertLogs() as capture:
+            self.performMigration()
+
+        self.assertEqual(
+            len(list(filter(lambda log: "Parser did not return an archive document for document" in log, capture.output))),
+            2)
+
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = Document.objects.get(id=doc1.id)
+        doc2 = Document.objects.get(id=doc2.id)
+
+        self.assertIsNone(doc1.archive_checksum)
+        self.assertIsNone(doc2.archive_checksum)
+        self.assertIsNone(doc1.archive_filename)
+        self.assertIsNone(doc2.archive_filename)
  
  @override_settings(PAPERLESS_FILENAME_FORMAT="")
  class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):
author	jonaswinkler <jonas.winkler@jpwinkler.de>
	Wed, 10 Feb 2021 15:58:55 +0000 (16:58 +0100)
committer	jonaswinkler <jonas.winkler@jpwinkler.de>
	Wed, 10 Feb 2021 15:58:55 +0000 (16:58 +0100)
src/documents/migrations/1012_fix_archive_files.py		patch \| blob \| blame \| history
src/documents/tests/test_migration_archive_files.py		patch \| blob \| blame \| history