git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Allows parsing of WebP format images
author Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 14 Nov 2022 23:38:35 +0000 (15:38 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 28 Nov 2022 17:35:54 +0000 (09:35 -0800)
src/paperless_tesseract/parsers.py
src/paperless_tesseract/signals.py
src/paperless_tesseract/tests/samples/document.webp [new file with mode: 0755]
src/paperless_tesseract/tests/test_parser.py

index aa3ad64fa227e8998b749a75b0bf29cf1e26cce8..bde2ad25e196fdbd88aaa5ce78d309d35161c80d 100644 (file)
@@ -66,6 +66,7 @@ class RasterisedDocumentParser(DocumentParser):
             "image/tiff",
             "image/bmp",
             "image/gif",
+            "image/webp",
         ]
 
     def has_alpha(self, image):
index 85f2cab9fec12bbdcee3dba12dfb896a84c2a45a..c4fd1e039583740a0b9e9c1930b95ad88ae5a117 100644 (file)
@@ -15,5 +15,6 @@ def tesseract_consumer_declaration(sender, **kwargs):
             "image/tiff": ".tif",
             "image/gif": ".gif",
             "image/bmp": ".bmp",
+            "image/webp": ".webp",
         },
     }
diff --git a/src/paperless_tesseract/tests/samples/document.webp b/src/paperless_tesseract/tests/samples/document.webp
new file mode 100755 (executable)
index 0000000..c19ba29
Binary files /dev/null and b/src/paperless_tesseract/tests/samples/document.webp differ
index 67c1ad85996aa59ef3b9f6ade1d620f8ee18897a..a0550bde93d4531d36e848930048e7967ae6c783 100644 (file)
@@ -597,23 +597,34 @@ class TestParserFileTypes(DirectoriesMixin, TestCase):
         parser = RasterisedDocumentParser(None)
         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
         self.assertTrue(os.path.isfile(parser.archive_path))
-        self.assertTrue("this is a test document" in parser.get_text().lower())
+        self.assertIn("this is a test document", parser.get_text().lower())
 
     def test_jpg(self):
         parser = RasterisedDocumentParser(None)
         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
         self.assertTrue(os.path.isfile(parser.archive_path))
-        self.assertTrue("this is a test document" in parser.get_text().lower())
+        self.assertIn("this is a test document", parser.get_text().lower())
 
     @override_settings(OCR_IMAGE_DPI=200)
     def test_gif(self):
         parser = RasterisedDocumentParser(None)
         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
         self.assertTrue(os.path.isfile(parser.archive_path))
-        self.assertTrue("this is a test document" in parser.get_text().lower())
+        self.assertIn("this is a test document", parser.get_text().lower())
 
     def test_tiff(self):
         parser = RasterisedDocumentParser(None)
         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
         self.assertTrue(os.path.isfile(parser.archive_path))
-        self.assertTrue("this is a test document" in parser.get_text().lower())
+        self.assertIn("this is a test document", parser.get_text().lower())
+
+    @override_settings(OCR_IMAGE_DPI=72)
+    def test_webp(self):
+        parser = RasterisedDocumentParser(None)
+        parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        # OCR consistently mangles this space, oh well
+        self.assertIn(
+            "this is awebp document, created 11/14/2022.",
+            parser.get_text().lower(),
+        )