class TestParser(DirectoriesMixin, TestCase):
+
+ SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
+
def assertContainsStrings(self, content, strings):
# Asserts that all strings appear in content, in the given order.
indices = []
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
- text_cases = [
- ("simple string", "simple string"),
- ("simple newline\n testing string", "simple newline\ntesting string"),
- ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце"),
- ]
-
def test_post_process_text(self):
- for source, result in self.text_cases:
+
+ text_cases = [
+ ("simple string", "simple string"),
+ ("simple newline\n testing string", "simple newline\ntesting string"),
+ (
+ "utf-8 строка с пробелами в конце ",
+ "utf-8 строка с пробелами в конце",
+ ),
+ ]
+
+ for source, result in text_cases:
actual_result = post_process_text(source)
self.assertEqual(
result,
),
)
- SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
-
def test_get_text_from_pdf(self):
parser = RasterisedDocumentParser(uuid.uuid4())
text = parser.extract_text(
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
+ @override_settings(OCR_MODE="redo")
+ def test_single_page_mixed(self):
+ """
+ GIVEN:
+ - File with some text contained in images and some in text layer
+ - Text and images are mixed on the same page
+ - OCR mode set to redo
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from images is extracted
+ - Full content of the file is parsed (not just the image text)
+ - An archive file is created with the OCRd text and the original text
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNotNone(parser.archive_path)  # an archive path must be set after parsing
+ self.assertTrue(os.path.isfile(parser.archive_path))  # ...and the file must exist on disk
+ self.assertContainsStrings(  # all three sentences must be present, in this order
+ parser.get_text().lower(),  # lower-cased so the comparison ignores case
+ [
+ "this is some normal text, present on page 1 of the document.",
+ "this is some text, but in an image, also on page 1.",
+ "this is further text on page 1.",
+ ],
+ )
+
+ with open(os.path.join(parser.tempdir, "sidecar.txt")) as f:  # sidecar.txt is read from the parser's temp dir
+ sidecar = f.read().lower()
+
+ self.assertIn("this is some text, but in an image, also on page 1.", sidecar)  # the image sentence must be in the sidecar
+ self.assertNotIn(  # the "normal text" sentence must be absent; presumably the sidecar holds only OCR output - confirm
+ "this is some normal text, present on page 1 of the document.",
+ sidecar,
+ )
+
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self):
"""