self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
- self.archive_path = archive_path
+ # Only record the archive path if archiving isn't being skipped
+ if settings.OCR_MODE != "skip_noarchive":
+ self.archive_path = archive_path
+
self.text = self.extract_text(sidecar_file, archive_path)
if not self.text:
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_withtext(self):
+ """
+ GIVEN:
+ - File with existing text layer
+ - OCR mode set to skip_noarchive
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from the existing text layer is extracted
+ - No archive file is created
+ """
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_notext(self):
+ """
+ GIVEN:
+ - File with text contained in images but no text layer
+ - OCR mode set to skip_noarchive
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from images is extracted
+ - No archive file is created
+ """
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
- self.assertTrue(os.path.isfile(parser.archive_path))
+
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
+ self.assertIsNone(parser.archive_path)
+
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
parser = RasterisedDocumentParser(None)
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self):
+ """
+ GIVEN:
+ - File with some text contained in images and some in text layer
+ - OCR mode set to skip_noarchive
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from images is extracted
+ - No archive file is created
+ """
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),