###############################################################################
-def create_archive_version(doc, retry_count=4):
+def parse_wrapper(parser, path, mime_type, file_name):  # forwards all arguments to parser.parse(); nothing is returned
+ # Indirection around parser.parse(): create_archive_version() calls this module-level function so that tests can mock parsing out.
+ parser.parse(path, mime_type, file_name)
+
+
+def create_archive_version(doc, retry_count=3):
from documents.parsers import get_parser_class_for_mime_type, \
DocumentParser, \
ParseError
for try_num in range(retry_count):
parser: DocumentParser = parser_class(None, None)
try:
- parser.parse(source_path(doc), doc.mime_type,
- os.path.basename(doc.filename))
+ parse_wrapper(parser, source_path(doc), doc.mime_type,
+ os.path.basename(doc.filename))
doc.content = parser.get_text()
if parser.get_archive_path() and os.path.isfile(
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
- if not os.path.isfile(old_path):
- raise ValueError(
- f"Archived document ID:{doc.id} does not exist at: "
- f"{old_path}")
-
if old_path in old_archive_path_to_id:
affected_document_ids.add(doc.id)
affected_document_ids.add(old_archive_path_to_id[old_path])
else:
old_archive_path_to_id[old_path] = doc.id
- # check that we can regenerate these archive versions
+ # check that archive files of all unaffected documents are in place
+ for doc in Document.objects.filter(archive_checksum__isnull=False):
+ old_path = archive_path_old(doc)
+ if doc.id not in affected_document_ids and not os.path.isfile(old_path):
+ raise ValueError(
+ f"Archived document ID:{doc.id} does not exist at: "
+ f"{old_path}")
+
+ # check that we can regenerate affected archive versions
for doc_id in affected_document_ids:
from documents.parsers import get_parser_class_for_mime_type
doc = Document.objects.get(id=doc_id)
parser_class = get_parser_class_for_mime_type(doc.mime_type)
if not parser_class:
- raise Exception(
+ raise ValueError(
f"Document ID:{doc.id} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.")
old_path = archive_path_old(doc)
# remove affected archive versions
if os.path.isfile(old_path):
+ logger.debug(
+ f"Removing {old_path}"
+ )
os.unlink(old_path)
else:
# Set archive path for unaffected files
create_archive_version(doc)
-
-
def move_new_to_old_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
import os
import shutil
from pathlib import Path
+from unittest import mock
from django.conf import settings
from django.test import override_settings
+from documents.parsers import ParseError
from documents.tests.utils import DirectoriesMixin, TestMigrations
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
+def fake_parse_wrapper(parser, path, mime_type, file_name):  # stand-in used as the side_effect of the mocked parse_wrapper; path, mime_type and file_name are ignored
+ parser.archive_path = None  # leave the parser without an archive file
+ parser.text = "the text"  # fixed text set on the parser in place of real parsing
+
+
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration)
+ def test_parser_missing(self):  # the migration must abort with ValueError when no parser is available for a document's mime type
+ Document = self.apps.get_model("documents", "Document")
+ # Two documents created with the same "document" argument and a made-up mime type; presumably this makes their old archive paths clash so the migration needs a parser for them - TODO confirm against make_test_document.
+ doc1 = make_test_document(Document, "document", "invalid/typesss768", simple_png, "document.png", simple_pdf)
+ doc2 = make_test_document(Document, "document", "invalid/typesss768", simple_jpg, "document.jpg", simple_pdf)
+
+ self.assertRaisesMessage(ValueError, "no parsers are available", self.performMigration)
+
+ @mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
+ def test_parser_error(self, m):  # m is the mock that replaces parse_wrapper in the migration module
+ m.side_effect = ParseError()  # every parse attempt raises ParseError
+ Document = self.apps.get_model("documents", "Document")
+ # NOTE(review): presumably both documents map to the same old archive path, so the migration tries to regenerate both - confirm against make_test_document.
+ doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
+ doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
+ # Before the migration both documents carry an archive checksum.
+ self.assertIsNotNone(doc1.archive_checksum)
+ self.assertIsNotNone(doc2.archive_checksum)
+
+ with self.assertLogs() as capture:
+ self.performMigration()
+ # 6 calls: presumably 2 documents x 3 attempts each (create_archive_version defaults to retry_count=3) - confirm.
+ self.assertEqual(m.call_count, 6)
+ # 4 retry messages are expected in total, and 2 final "Unable to regenerate" messages (one per document).
+ self.assertEqual(
+ len(list(filter(lambda log: "Parse error, will try again in 5 seconds" in log, capture.output))),
+ 4)
+
+ self.assertEqual(
+ len(list(filter(lambda log: "Unable to regenerate archive document for ID:" in log, capture.output))),
+ 2)
+ # Look the model up again and re-fetch both rows by id, then check that their archive fields are None.
+ Document = self.apps.get_model("documents", "Document")
+
+ doc1 = Document.objects.get(id=doc1.id)
+ doc2 = Document.objects.get(id=doc2.id)
+
+ self.assertIsNone(doc1.archive_checksum)
+ self.assertIsNone(doc2.archive_checksum)
+ self.assertIsNone(doc1.archive_filename)
+ self.assertIsNone(doc2.archive_filename)
+
+ @mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
+ def test_parser_no_archive(self, m):  # m is the mock that replaces parse_wrapper in the migration module
+ m.side_effect = fake_parse_wrapper  # parsing "succeeds" but sets archive_path to None on the parser
+
+ Document = self.apps.get_model("documents", "Document")
+ # NOTE(review): presumably both documents map to the same old archive path, so the migration tries to regenerate both - confirm against make_test_document.
+ doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
+ doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
+ # Expect exactly two captured log entries containing "Parser did not return an archive document for document".
+ with self.assertLogs() as capture:
+ self.performMigration()
+
+ self.assertEqual(
+ len(list(filter(lambda log: "Parser did not return an archive document for document" in log, capture.output))),
+ 2)
+ # Look the model up again and re-fetch both rows by id, then check that their archive fields are None.
+ Document = self.apps.get_model("documents", "Document")
+
+ doc1 = Document.objects.get(id=doc1.id)
+ doc2 = Document.objects.get(id=doc2.id)
+
+ self.assertIsNone(doc1.archive_checksum)
+ self.assertIsNone(doc2.archive_checksum)
+ self.assertIsNone(doc1.archive_filename)
+ self.assertIsNone(doc2.archive_filename)
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):