git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
more testing of the migration
author: jonaswinkler <jonas.winkler@jpwinkler.de>
Wed, 10 Feb 2021 15:58:55 +0000 (16:58 +0100)
committer: jonaswinkler <jonas.winkler@jpwinkler.de>
Wed, 10 Feb 2021 15:58:55 +0000 (16:58 +0100)
src/documents/migrations/1012_fix_archive_files.py
src/documents/tests/test_migration_archive_files.py

index 4fa258b8a03ec04ad5ceaae29940f7a6725f7e8f..5f5064396b1b8ba976f4fda59d0840e6a40843b4 100644 (file)
@@ -160,7 +160,12 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
 ###############################################################################
 
 
-def create_archive_version(doc, retry_count=4):
+def parse_wrapper(parser, path, mime_type, file_name):
+    # this is here so that I can mock this out for testing.
+    parser.parse(path, mime_type, file_name)
+
+
+def create_archive_version(doc, retry_count=3):
     from documents.parsers import get_parser_class_for_mime_type, \
         DocumentParser, \
         ParseError
@@ -172,8 +177,8 @@ def create_archive_version(doc, retry_count=4):
     for try_num in range(retry_count):
         parser: DocumentParser = parser_class(None, None)
         try:
-            parser.parse(source_path(doc), doc.mime_type,
-                         os.path.basename(doc.filename))
+            parse_wrapper(parser, source_path(doc), doc.mime_type,
+                          os.path.basename(doc.filename))
             doc.content = parser.get_text()
 
             if parser.get_archive_path() and os.path.isfile(
@@ -225,25 +230,28 @@ def move_old_to_new_locations(apps, schema_editor):
     for doc in Document.objects.filter(archive_checksum__isnull=False):
         old_path = archive_path_old(doc)
 
-        if not os.path.isfile(old_path):
-            raise ValueError(
-                f"Archived document ID:{doc.id} does not exist at: "
-                f"{old_path}")
-
         if old_path in old_archive_path_to_id:
             affected_document_ids.add(doc.id)
             affected_document_ids.add(old_archive_path_to_id[old_path])
         else:
             old_archive_path_to_id[old_path] = doc.id
 
-    # check that we can regenerate these archive versions
+    # check that archive files of all unaffected documents are in place
+    for doc in Document.objects.filter(archive_checksum__isnull=False):
+        old_path = archive_path_old(doc)
+        if doc.id not in affected_document_ids and not os.path.isfile(old_path):
+            raise ValueError(
+                f"Archived document ID:{doc.id} does not exist at: "
+                f"{old_path}")
+
+    # check that we can regenerate affected archive versions
     for doc_id in affected_document_ids:
         from documents.parsers import get_parser_class_for_mime_type
 
         doc = Document.objects.get(id=doc_id)
         parser_class = get_parser_class_for_mime_type(doc.mime_type)
         if not parser_class:
-            raise Exception(
+            raise ValueError(
                 f"Document ID:{doc.id} has an invalid archived document, "
                 f"but no parsers are available. Cannot migrate.")
 
@@ -253,6 +261,9 @@ def move_old_to_new_locations(apps, schema_editor):
             old_path = archive_path_old(doc)
             # remove affected archive versions
             if os.path.isfile(old_path):
+                logger.debug(
+                    f"Removing {old_path}"
+                )
                 os.unlink(old_path)
         else:
             # Set archive path for unaffected files
@@ -267,8 +278,6 @@ def move_old_to_new_locations(apps, schema_editor):
         create_archive_version(doc)
 
 
-
-
 def move_new_to_old_locations(apps, schema_editor):
     Document = apps.get_model("documents", "Document")
 
index 637aaeee0a2d4c8682f0e7c7c0ff7f31c296fbfa..b3f2e916c52b2acd207e6fdb715fc2dcbc71da26 100644 (file)
@@ -2,10 +2,12 @@ import hashlib
 import os
 import shutil
 from pathlib import Path
+from unittest import mock
 
 from django.conf import settings
 from django.test import override_settings
 
+from documents.parsers import ParseError
 from documents.tests.utils import DirectoriesMixin, TestMigrations
 
 
@@ -169,6 +171,11 @@ class TestMigrateArchiveFilesWithFilenameFormat(TestMigrateArchiveFiles):
         self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
 
 
+def fake_parse_wrapper(parser, path, mime_type, file_name):
+    parser.archive_path = None
+    parser.text = "the text"
+
+
 @override_settings(PAPERLESS_FILENAME_FORMAT="")
 class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
 
@@ -185,6 +192,73 @@ class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
 
         self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration)
 
+    def test_parser_missing(self):
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = make_test_document(Document, "document", "invalid/typesss768", simple_png, "document.png", simple_pdf)
+        doc2 = make_test_document(Document, "document", "invalid/typesss768", simple_jpg, "document.jpg", simple_pdf)
+
+        self.assertRaisesMessage(ValueError, "no parsers are available", self.performMigration)
+
+    @mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
+    def test_parser_error(self, m):
+        m.side_effect = ParseError()
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
+        doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
+
+        self.assertIsNotNone(doc1.archive_checksum)
+        self.assertIsNotNone(doc2.archive_checksum)
+
+        with self.assertLogs() as capture:
+            self.performMigration()
+
+        self.assertEqual(m.call_count, 6)
+
+        self.assertEqual(
+            len(list(filter(lambda log: "Parse error, will try again in 5 seconds" in log, capture.output))),
+            4)
+
+        self.assertEqual(
+            len(list(filter(lambda log: "Unable to regenerate archive document for ID:" in log, capture.output))),
+            2)
+
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = Document.objects.get(id=doc1.id)
+        doc2 = Document.objects.get(id=doc2.id)
+
+        self.assertIsNone(doc1.archive_checksum)
+        self.assertIsNone(doc2.archive_checksum)
+        self.assertIsNone(doc1.archive_filename)
+        self.assertIsNone(doc2.archive_filename)
+
+    @mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
+    def test_parser_no_archive(self, m):
+        m.side_effect = fake_parse_wrapper
+
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
+        doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
+
+        with self.assertLogs() as capture:
+            self.performMigration()
+
+        self.assertEqual(
+            len(list(filter(lambda log: "Parser did not return an archive document for document" in log, capture.output))),
+            2)
+
+        Document = self.apps.get_model("documents", "Document")
+
+        doc1 = Document.objects.get(id=doc1.id)
+        doc2 = Document.objects.get(id=doc2.id)
+
+        self.assertIsNone(doc1.archive_checksum)
+        self.assertIsNone(doc2.archive_checksum)
+        self.assertIsNone(doc1.archive_filename)
+        self.assertIsNone(doc2.archive_filename)
 
 @override_settings(PAPERLESS_FILENAME_FORMAT="")
 class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):