an archived version of your documents when it finds any text in
them. This is useful if you don't want to have two
almost-identical versions of your digital documents in the media
- folder.
-
- - `skip_neverarchive`: In addition to skip, paperless will never
- create an archive version of your documents. This is the fastest option.
+ folder. This is the fastest option.
- `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
)
- if settings.OCR_MODE not in {
- "force",
- "skip",
- "redo",
- "skip_noarchive",
- "skip_neverarchive",
- }:
+ if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
if settings.OCR_MODE == "force" or safe_fallback:
ocrmypdf_args["force_ocr"] = True
- elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
+ elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
ocrmypdf_args["skip_text"] = True
elif settings.OCR_MODE == "redo":
ocrmypdf_args["redo_ocr"] = True
# If the original has text, and the user doesn't want an archive,
# we're done here
- if (
- settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
- and original_has_text
- ):
+ if settings.OCR_MODE == "skip_noarchive" and original_has_text:
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
- # Only create archive file if archiving isn't being skipped
- if settings.OCR_MODE != "skip_neverarchive":
- self.archive_path = archive_path
+ self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
self.assertIsNotNone(parser.archive_path)
- @override_settings(OCR_MODE="skip_neverarchive")
- def test_skip_neverarchive_withtext(self):
- """
- GIVEN:
- - File with existing text layer
- - OCR mode set to skip_neverarchive
- WHEN:
- - Document is parsed
- THEN:
- - Text from images is extracted
- - No archive file is created
- """
- parser = RasterisedDocumentParser(None)
- parser.parse(
- os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
- "application/pdf",
- )
- self.assertIsNone(parser.archive_path)
- self.assertContainsStrings(
- parser.get_text().lower(),
- ["page 1", "page 2", "page 3"],
- )
-
- @override_settings(OCR_MODE="skip_neverarchive")
- def test_skip_neverarchive_notext(self):
- """
- GIVEN:
- - File with text contained in images but no text layer
- - OCR mode set to skip_neverarchive
- WHEN:
- - Document is parsed
- THEN:
- - Text from images is extracted
- - No archive file is created
- """
- parser = RasterisedDocumentParser(None)
- parser.parse(
- os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
- "application/pdf",
- )
- self.assertIsNone(parser.archive_path)
- self.assertContainsStrings(
- parser.get_text().lower(),
- ["page 1", "page 2", "page 3"],
- )
-
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
"""