import os
import re
+import shutil
import tempfile
from pathlib import Path
from unittest import mock
class TestClassifier(DirectoriesMixin, TestCase):
+
# Path to the sample model: data/model.pickle next to this test module.
# Tests copy this file into self.dirs.data_dir before using it.
+ SAMPLE_MODEL_FILE = os.path.join(os.path.dirname(__file__), "data", "model.pickle")
+
def setUp(self):
# Run the parent setUp first, then construct the classifier under test.
super().setUp()
self.classifier = DocumentClassifier()
# Training and saving are not done here: the two calls below are disabled,
# and test_load_and_classify calls train() and save() itself.
# self.classifier.train()
# self.classifier.save()
- @override_settings(
- MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"),
- )
def test_load_and_classify(self):
# Generate test data, train and save to the model file
# This ensures the model file sklearn version matches
# and eliminates a warning
#
# The sample model is first copied to classification_model.pickle inside
# self.dirs.data_dir; this replaces the removed MODEL_FILE override that
# pointed directly at data/model.pickle.
# NOTE(review): presumably this keeps save() below from writing to the
# sample file itself — confirm where DocumentClassifier.save() writes.
+ shutil.copy(
+ self.SAMPLE_MODEL_FILE,
+ os.path.join(self.dirs.data_dir, "classification_model.pickle"),
+ )
self.generate_test_data()
self.classifier.train()
self.classifier.save()
# NOTE(review): new_classifier is created in lines not shown in this excerpt.
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])
- @override_settings(
- MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"),
- )
@mock.patch("documents.classifier.pickle.load")
def test_load_corrupt_file(self, patched_pickle_load):
"""
THEN:
- The ClassifierModelCorruptError is raised
"""
# Copy the sample model to classification_model.pickle inside
# self.dirs.data_dir; this replaces the removed MODEL_FILE override.
# NOTE(review): presumably a model file must exist on disk for the load to
# reach the patched pickle.load — confirm against DocumentClassifier.load.
+ shutil.copy(
+ self.SAMPLE_MODEL_FILE,
+ os.path.join(self.dirs.data_dir, "classification_model.pickle"),
+ )
# First load is the schema version
# The second pickle.load call raises OSError: mock raises any exception
# instance it finds in a side_effect sequence.
patched_pickle_load.side_effect = [DocumentClassifier.FORMAT_VERSION, OSError()]
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
- parser.parse(
- os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha.tiff"),
- "image/tiff",
- )
- self.assertTrue(os.path.isfile(parser.archive_path))
- self.assertContainsStrings(
- parser.get_text().lower(),
- ["page 1", "page 2", "page 3"],
- )
# The parser is now given a temporary copy of the sample TIFF rather than
# the path under SAMPLE_FILES.
# NOTE(review): presumably so the parser never touches the sample file
# itself — confirm.
+ sample_file = os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha.tiff")
# NOTE(review): shutil.copy reopens the NamedTemporaryFile by name while it is
# still open. The tempfile docs describe this as platform-dependent; with the
# default arguments it does not work on Windows.
# The temporary name carries no .tiff suffix; "image/tiff" is passed explicitly.
+ with tempfile.NamedTemporaryFile() as tmp_file:
+ shutil.copy(sample_file, tmp_file.name)
+ parser.parse(
+ tmp_file.name,
+ "image/tiff",
+ )
# An archive file must exist and the text must contain all three page markers.
+ self.assertTrue(os.path.isfile(parser.archive_path))
+ self.assertContainsStrings(
+ parser.get_text().lower(),
+ ["page 1", "page 2", "page 3"],
+ )
def test_multi_page_tiff_alpha_srgb(self):
"""
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
- parser.parse(
- os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha-rgb.tiff"),
- "image/tiff",
- )
- self.assertTrue(os.path.isfile(parser.archive_path))
- self.assertContainsStrings(
- parser.get_text().lower(),
- ["page 1", "page 2", "page 3"],
# The parser is now given a temporary copy of the sample TIFF rather than
# the path under SAMPLE_FILES.
# NOTE(review): presumably so the parser never touches the sample file
# itself — confirm.
+ sample_file = os.path.join(
+ self.SAMPLE_FILES,
+ "multi-page-images-alpha-rgb.tiff",
)
# NOTE(review): shutil.copy reopens the NamedTemporaryFile by name while it is
# still open. The tempfile docs describe this as platform-dependent; with the
# default arguments it does not work on Windows.
# The temporary name carries no .tiff suffix; "image/tiff" is passed explicitly.
+ with tempfile.NamedTemporaryFile() as tmp_file:
+ shutil.copy(sample_file, tmp_file.name)
+ parser.parse(
+ tmp_file.name,
+ "image/tiff",
+ )
# An archive file must exist and the text must contain all three page markers.
+ self.assertTrue(os.path.isfile(parser.archive_path))
+ self.assertContainsStrings(
+ parser.get_text().lower(),
+ ["page 1", "page 2", "page 3"],
+ )
def test_ocrmypdf_parameters(self):
parser = RasterisedDocumentParser(None)