pass
+class RtlLanguageException(Exception):  # raised when the detected language is RTL (ar/he/fa); caught to return no text
+ pass
+
+
class RasterisedDocumentParser(DocumentParser):
"""
This parser uses Tesseract to try and get some text out of a rasterised
stripped = post_process_text(pdfminer_extract_text(pdf_file))
self.log("debug", f"Extracted text from PDF file {pdf_file}")
+
+ # pdfminer.six does not handle RTL text.
+ # As a hack, return no text for some RTL languages, which forces
+ # OCRmyPDF/Tesseract to handle the document instead
+ from langdetect import detect
+
+ lang = detect(stripped)
+
+ self.log("debug", f"Detected language {lang}")
+
+ if lang in {
+ "ar", # Arabic
+ "he", # Hebrew,
+ "fa", # Persian
+ }:
+ raise RtlLanguageException()
return stripped
+ except RtlLanguageException:
+ self.log("warning", f"Detected RTL language {lang}")
+ return None
except Exception:
# TODO catch all for various issues with PDFminer.six.
# If PDFminer fails, fall back to OCR.
)
if original_has_text:
self.text = text_original
- except (NoTextFoundException, InputFileError) as e:
+ except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
self.log(
"warning",
f"Encountered an error while running OCR: {str(e)}. "
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
+ def test_rtl_language_detection(self):
+ """
+ GIVEN:
+ - File with text in an RTL language
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from the document is extracted
+ """
+ parser = RasterisedDocumentParser(None)
+ with mock.patch.object(
+ parser,
+ "construct_ocrmypdf_parameters",
+ wraps=parser.construct_ocrmypdf_parameters,
+ ) as wrapped:
+
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
+ "application/pdf",
+ )
+
+ # There isn't a good way to verify that the RTL text is returned correctly,
+ # as that would require tesseract-ocr-ara to be installed for everyone running
+ # the test suite. This test does provide coverage, though, and attempts to
+ # ensure that the forced OCR happens
+ self.assertIsNotNone(parser.get_text())
+
+ self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
+ # Check the last call kwargs
+ self.assertTrue(
+ parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
+ )
+
class TestParserFileTypes(DirectoriesMixin, TestCase):