From: Dennis Brakhane Date: Tue, 11 Jul 2023 14:41:31 +0000 (+0200) Subject: Don't consider better OCR as failing X-Git-Tag: v1.17.0~1^2~52^2 X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F3783%2Fhead;p=thirdparty%2Fpaperless-ngx.git Don't consider better OCR as failing Tesseract 5.3.0 does a better job at OCR and correctly reads "a webp" instead of "awebp". This is good, so we don't want the test to fail. --- diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 7850ad4ef8..8b3de5615c 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -861,8 +861,9 @@ class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(None) parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp") self.assertIsFile(parser.archive_path) - # OCR consistent mangles this space, oh well - self.assertIn( - "this is awebp document, created 11/14/2022.", + # Older tesseracts consistently mangle the space between "a webp", + # tesseract 5.3.0 seems to do a better job, so we're accepting both + self.assertRegex( parser.get_text().lower(), + r"this is a ?webp document, created 11/14/2022.", )