In the case of an RTL language being extracted via pdfminer.six, fall back to forced...

author Trenton H <797416+stumpylog@users.noreply.github.com>

Tue, 29 Nov 2022 21:19:16 +0000 (13:19 -0800)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Fri, 30 Dec 2022 00:02:02 +0000 (16:02 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 29 Nov 2022 21:19:16 +0000 (13:19 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Fri, 30 Dec 2022 00:02:02 +0000 (16:02 -0800)
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index bde2ad25e196fdbd88aaa5ce78d309d35161c80d..4cc9b8e5f7547978e3700f194a7154010bd937ef 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -13,6 +13,10 @@ class NoTextFoundException(Exception):
      pass
  
  
+class RtlLanguageException(Exception):
+    pass
+
+
  class RasterisedDocumentParser(DocumentParser):
      """
      This parser uses Tesseract to try and get some text out of a rasterised
@@ -125,7 +129,26 @@ class RasterisedDocumentParser(DocumentParser):
              stripped = post_process_text(pdfminer_extract_text(pdf_file))
  
              self.log("debug", f"Extracted text from PDF file {pdf_file}")
+
+            # pdfminer.six does not handle RTL text.
+            # As a hack, for some languages, return no text to force
+            # OCRmyPDF/Tesseract to handle the document correctly instead
+            from langdetect import detect
+
+            lang = detect(stripped)
+
+            self.log("debug", f"Detected language {lang}")
+
+            if lang in {
+                "ar",  # Arabic
+                "he",  # Hebrew
+                "fa",  # Persian
+            }:
+                raise RtlLanguageException()
              return stripped
+        except RtlLanguageException:
+            self.log("warning", f"Detected RTL language {lang}")
+            return None
          except Exception:
              # TODO catch all for various issues with PDFminer.six.
              #  If PDFminer fails, fall back to OCR.
@@ -305,7 +328,7 @@ class RasterisedDocumentParser(DocumentParser):
              )
              if original_has_text:
                  self.text = text_original
-        except (NoTextFoundException, InputFileError) as e:
+        except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
              self.log(
                  "warning",
                  f"Encountered an error while running OCR: {str(e)}. "
diff --git a/src/paperless_tesseract/tests/samples/rtl-test.pdf b/src/paperless_tesseract/tests/samples/rtl-test.pdf

new file mode 100755 (executable)

index 0000000..daa666f

Binary files /dev/null and b/src/paperless_tesseract/tests/samples/rtl-test.pdf differ
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py

index a0550bde93d4531d36e848930048e7967ae6c783..4d6890653d8d6e38ab1a4a3b905f04d5908cd823 100644 (file)
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -588,6 +588,39 @@ class TestParser(DirectoriesMixin, TestCase):
              params = parser.construct_ocrmypdf_parameters("", "", "", "")
              self.assertNotIn("deskew", params)
  
+    def test_rtl_language_detection(self):
+        """
+        GIVEN:
+            - File with text in an RTL language
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from the document is extracted
+        """
+        parser = RasterisedDocumentParser(None)
+        with mock.patch.object(
+            parser,
+            "construct_ocrmypdf_parameters",
+            wraps=parser.construct_ocrmypdf_parameters,
+        ) as wrapped:
+
+            parser.parse(
+                os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
+                "application/pdf",
+            )
+
+            # There isn't a good way to check that RTL text is actually returned correctly,
+            #  as that would require tesseract-ocr-ara to be installed for everyone running
+            #  the test suite.  This test does provide coverage, though, and attempts to
+            #  ensure the forced OCR happens
+            self.assertIsNotNone(parser.get_text())
+
+            self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
+            # Check the last call kwargs
+            self.assertTrue(
+                parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
+            )
+
  
  class TestParserFileTypes(DirectoriesMixin, TestCase):
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Tue, 29 Nov 2022 21:19:16 +0000 (13:19 -0800)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Fri, 30 Dec 2022 00:02:02 +0000 (16:02 -0800)
src/paperless_tesseract/parsers.py		patch \| blob \| blame \| history
src/paperless_tesseract/tests/samples/rtl-test.pdf	[new file with mode: 0755]	patch \| blob
src/paperless_tesseract/tests/test_parser.py		patch \| blob \| blame \| history