Adds a test to cover this edge case

author Trenton H <797416+stumpylog@users.noreply.github.com>

Mon, 21 Nov 2022 22:56:14 +0000 (14:56 -0800)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Tue, 22 Nov 2022 15:22:41 +0000 (07:22 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 21 Nov 2022 22:56:14 +0000 (14:56 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 22 Nov 2022 15:22:41 +0000 (07:22 -0800)
diff --git a/src/paperless_tesseract/tests/samples/single-page-mixed.pdf b/src/paperless_tesseract/tests/samples/single-page-mixed.pdf

new file mode 100644 (file)

index 0000000..2281fd3

Binary files /dev/null and b/src/paperless_tesseract/tests/samples/single-page-mixed.pdf differ
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py

index 858cc7701327726cc0a9d55508097d24221d8e54..67c1ad85996aa59ef3b9f6ade1d620f8ee18897a 100644 (file)
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -37,6 +37,9 @@ class FakeImageFile(ContextManager):
  
  
  class TestParser(DirectoriesMixin, TestCase):
+
+    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
+
      def assertContainsStrings(self, content, strings):
          # Asserts that all strings appear in content, in the given order.
          indices = []
@@ -47,14 +50,18 @@ class TestParser(DirectoriesMixin, TestCase):
                  self.fail(f"'{s}' is not in '{content}'")
          self.assertListEqual(indices, sorted(indices))
  
-    text_cases = [
-        ("simple     string", "simple string"),
-        ("simple    newline\n   testing string", "simple newline\ntesting string"),
-        ("utf-8   строка с пробелами в конце  ", "utf-8 строка с пробелами в конце"),
-    ]
-
      def test_post_process_text(self):
-        for source, result in self.text_cases:
+
+        text_cases = [
+            ("simple     string", "simple string"),
+            ("simple    newline\n   testing string", "simple newline\ntesting string"),
+            (
+                "utf-8   строка с пробелами в конце  ",
+                "utf-8 строка с пробелами в конце",
+            ),
+        ]
+
+        for source, result in text_cases:
              actual_result = post_process_text(source)
              self.assertEqual(
                  result,
@@ -66,8 +73,6 @@ class TestParser(DirectoriesMixin, TestCase):
                  ),
              )
  
-    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
-
      def test_get_text_from_pdf(self):
          parser = RasterisedDocumentParser(uuid.uuid4())
          text = parser.extract_text(
@@ -461,6 +466,45 @@ class TestParser(DirectoriesMixin, TestCase):
  
          self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
  
+    @override_settings(OCR_MODE="redo")
+    def test_single_page_mixed(self):
+        """
+        GIVEN:
+            - File with some text contained in images and some in text layer
+            - Text and images are mixed on the same page
+            - OCR mode set to redo
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - Full content of the file is parsed (not just the image text)
+            - An archive file is created with the OCRd text and the original text
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            [
+                "this is some normal text, present on page 1 of the document.",
+                "this is some text, but in an image, also on page 1.",
+                "this is further text on page 1.",
+            ],
+        )
+
+        with open(os.path.join(parser.tempdir, "sidecar.txt")) as f:
+            sidecar = f.read().lower()
+
+        self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
+        self.assertNotIn(
+            "this is some normal text, present on page 1 of the document.",
+            sidecar,
+        )
+
      @override_settings(OCR_MODE="skip_noarchive")
      def test_multi_page_mixed_no_archive(self):
          """
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Mon, 21 Nov 2022 22:56:14 +0000 (14:56 -0800)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Tue, 22 Nov 2022 15:22:41 +0000 (07:22 -0800)
src/paperless_tesseract/tests/samples/single-page-mixed.pdf	[new file with mode: 0644]	patch | blob
src/paperless_tesseract/tests/test_parser.py		patch | blob | blame | history