git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Adds testing coverage of multipage TIFF with alpha, without and with alpha/sRGB
author Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 3 Jan 2023 16:21:23 +0000 (08:21 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 3 Jan 2023 17:56:19 +0000 (09:56 -0800)
src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff [new file with mode: 0644]
src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff [new file with mode: 0644]
src/paperless_tesseract/tests/samples/multi-page-images.tiff [new file with mode: 0644]
src/paperless_tesseract/tests/test_parser.py

diff --git a/src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff b/src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff
new file mode 100644 (file)
index 0000000..3111b7c
Binary files /dev/null and b/src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff differ
diff --git a/src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff b/src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff
new file mode 100644 (file)
index 0000000..c612cf4
Binary files /dev/null and b/src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff differ
diff --git a/src/paperless_tesseract/tests/samples/multi-page-images.tiff b/src/paperless_tesseract/tests/samples/multi-page-images.tiff
new file mode 100644 (file)
index 0000000..0e62b60
Binary files /dev/null and b/src/paperless_tesseract/tests/samples/multi-page-images.tiff differ
index 4d6890653d8d6e38ab1a4a3b905f04d5908cd823..28af8dec18dbf03cbc35289b3bec47e7b40cf256 100644 (file)
@@ -542,6 +542,69 @@ class TestParser(DirectoriesMixin, TestCase):
             ],
         )
 
+    def test_multi_page_tiff(self):
+        """
+        GIVEN:
+            - Multi-page TIFF image
+        WHEN:
+            - Image is parsed
+        THEN:
+            - Archive file exists and text from all pages is extracted
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images.tiff"),
+            "image/tiff",
+        )
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    def test_multi_page_tiff_alpha(self):
+        """
+        GIVEN:
+            - Multi-page TIFF image
+            - Image includes an alpha channel
+        WHEN:
+            - Image is parsed
+        THEN:
+            - Archive file exists and text from all pages is extracted
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha.tiff"),
+            "image/tiff",
+        )
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    def test_multi_page_tiff_alpha_srgb(self):
+        """
+        GIVEN:
+            - Multi-page TIFF image
+            - Image includes an alpha channel
+            - Image uses the sRGB colorspace
+        WHEN:
+            - Image is parsed
+        THEN:
+            - Archive file exists and text from all pages is extracted
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha-rgb.tiff"),
+            "image/tiff",
+        )
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
     def test_ocrmypdf_parameters(self):
         parser = RasterisedDocumentParser(None)
         params = parser.construct_ocrmypdf_parameters(