From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 3 Jan 2023 16:21:23 +0000 (-0800) Subject: Adds testing coverage of multipage TIFF with alpha, without and with alpha/sRGB X-Git-Tag: v1.12.0-beta.rc0~79 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0fd51e35e1345f8ec381cc511d473079812c2a43;p=thirdparty%2Fpaperless-ngx.git Adds testing coverage of multipage TIFF with alpha, without and with alpha/sRGB --- diff --git a/src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff b/src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff new file mode 100644 index 0000000000..3111b7c4cc Binary files /dev/null and b/src/paperless_tesseract/tests/samples/multi-page-images-alpha-rgb.tiff differ diff --git a/src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff b/src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff new file mode 100644 index 0000000000..c612cf4bae Binary files /dev/null and b/src/paperless_tesseract/tests/samples/multi-page-images-alpha.tiff differ diff --git a/src/paperless_tesseract/tests/samples/multi-page-images.tiff b/src/paperless_tesseract/tests/samples/multi-page-images.tiff new file mode 100644 index 0000000000..0e62b60106 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/multi-page-images.tiff differ diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 4d6890653d..28af8dec18 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -542,6 +542,69 @@ class TestParser(DirectoriesMixin, TestCase): ], ) + def test_multi_page_tiff(self): + """ + GIVEN: + - Multi-page TIFF image + WHEN: + - Image is parsed + THEN: + - Text from all pages extracted + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-images.tiff"), + "image/tiff", + ) + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + def test_multi_page_tiff_alpha(self): + """ + GIVEN: + - Multi-page TIFF image + - Image include an alpha channel + WHEN: + - Image is parsed + THEN: + - Text from all pages extracted + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha.tiff"), + "image/tiff", + ) + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + def test_multi_page_tiff_alpha_srgb(self): + """ + GIVEN: + - Multi-page TIFF image + - Image include an alpha channel + - Image is srgb colorspace + WHEN: + - Image is parsed + THEN: + - Text from all pages extracted + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha-rgb.tiff"), + "image/tiff", + ) + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + def test_ocrmypdf_parameters(self): parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters(