an archived version of your documents when it finds any text in
them. This is useful if you don't want to have two
almost-identical versions of your digital documents in the media
- folder. This is the fastest option.
+ folder.
+
+ - `skip_neverarchive`: Like `skip`, but paperless will never
+ create an archive version of your documents. This is the fastest option.
- `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
)
- if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
+ if settings.OCR_MODE not in {
+ "force",
+ "skip",
+ "redo",
+ "skip_noarchive",
+ "skip_neverarchive",
+ }:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
if settings.OCR_MODE == "force" or safe_fallback:
ocrmypdf_args["force_ocr"] = True
- elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
+ elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
ocrmypdf_args["skip_text"] = True
elif settings.OCR_MODE == "redo":
ocrmypdf_args["redo_ocr"] = True
# If the original has text, and the user doesn't want an archive,
# we're done here
- if settings.OCR_MODE == "skip_noarchive" and original_has_text:
+ if (
+ settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
+ and original_has_text
+ ):
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
- self.archive_path = archive_path
+ # Only record the archive path if archiving isn't being skipped
+ if settings.OCR_MODE != "skip_neverarchive":
+ self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
self.assertIsNotNone(parser.archive_path)
+ @override_settings(OCR_MODE="skip_neverarchive")
+ def test_skip_neverarchive_withtext(self):
+ """
+ GIVEN:
+ - File with existing text layer
+ - OCR mode set to skip_neverarchive
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from the existing text layer is extracted
+ - No archive file is created
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNone(parser.archive_path)
+ self.assertContainsStrings(
+ parser.get_text().lower(),
+ ["page 1", "page 2", "page 3"],
+ )
+
+ @override_settings(OCR_MODE="skip_neverarchive")
+ def test_skip_neverarchive_notext(self):
+ """
+ GIVEN:
+ - File with text contained in images but no text layer
+ - OCR mode set to skip_neverarchive
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from images is extracted
+ - No archive file is created
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNone(parser.archive_path)
+ self.assertContainsStrings(
+ parser.get_text().lower(),
+ ["page 1", "page 2", "page 3"],
+ )
+
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
"""