- `skip`: Paperless skips all pages and will perform ocr only on
pages where no text is present. This is the safest option.
- - `skip_noarchive`: In addition to skip, paperless won't create
- an archived version of your documents when it finds any text in
- them. This is useful if you don't want to have two
- almost-identical versions of your digital documents in the media
- folder. This is the fastest option.
-
- `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This
will be useful for documents from scanners that already
Read more about this in the [OCRmyPDF
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
+`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`
+
+: Specifies when paperless skips creating an archived version of a
+document. This is useful if you don't want to keep two almost-identical
+versions of the same document in the media folder.
+
+ - `never`: Never skip creating an archived version.
+ - `with_text`: Skip creating an archived version for documents
+ that already contain an embedded text layer.
+ - `always`: Always skip creating an archived version.
+
+ The default is `never`.
+
`PAPERLESS_OCR_CLEAN=<mode>`
: Tells paperless to use `unpaper` to clean any input document before
["page 1", "page 2", "page 3"],
)
- @override_settings(OOCR_MODE="skip")
+ @override_settings(OCR_MODE="skip")
def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(
self.assertIsNotNone(parser.archive_path)
+ @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
+ def test_skip_archive_never_withtext(self):
+ """
+ GIVEN:
+ - File with existing text layer
+ - OCR_SKIP_ARCHIVE_FILE set to never
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from text layer is extracted
+ - Archive file is created
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNotNone(parser.archive_path)  # "never": an archive is expected even though the input has a text layer
+ self.assertContainsStrings(
+ parser.get_text().lower(),  # lower-cased so the match against the page markers is case-insensitive
+ ["page 1", "page 2", "page 3"],
+ )
+
+ @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
+ def test_skip_archive_never_withimages(self):
+ """
+ GIVEN:
+ - File with text contained in images but no text layer
+ - OCR_SKIP_ARCHIVE_FILE set to never
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from images is extracted
+ - Archive file is created
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNotNone(parser.archive_path)  # "never": an archive is expected for image-only input
+ self.assertContainsStrings(
+ parser.get_text().lower(),  # lower-cased so the match against the page markers is case-insensitive
+ ["page 1", "page 2", "page 3"],
+ )
+
+ @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
+ def test_skip_archive_withtext_withtext(self):
+ """
+ GIVEN:
+ - File with existing text layer
+ - OCR_SKIP_ARCHIVE_FILE set to with_text
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from text layer is extracted
+ - No archive file is created
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNone(parser.archive_path)  # "with_text": the input already has a text layer, so no archive is expected
+ self.assertContainsStrings(
+ parser.get_text().lower(),  # text must still be available although no archive was produced
+ ["page 1", "page 2", "page 3"],
+ )
+
+ @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
+ def test_skip_archive_withtext_withimages(self):
+ """
+ GIVEN:
+ - File with text contained in images but no text layer
+ - OCR_SKIP_ARCHIVE_FILE set to with_text
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from images is extracted
+ - Archive file is created
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNotNone(parser.archive_path)  # "with_text" does not apply: the input has no text layer, so an archive is expected
+ self.assertContainsStrings(
+ parser.get_text().lower(),  # lower-cased so the match against the page markers is case-insensitive
+ ["page 1", "page 2", "page 3"],
+ )
+
+ @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
+ def test_skip_archive_always_withtext(self):
+ """
+ GIVEN:
+ - File with existing text layer
+ - OCR_SKIP_ARCHIVE_FILE set to always
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from text layer is extracted
+ - No archive file is created
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNone(parser.archive_path)  # "always": no archive is expected for input with a text layer
+ self.assertContainsStrings(
+ parser.get_text().lower(),  # text must still be available although no archive was produced
+ ["page 1", "page 2", "page 3"],
+ )
+
+ @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
+ def test_skip_archive_always_withimages(self):
+ """
+ GIVEN:
+ - File with text contained in images but no text layer
+ - OCR_SKIP_ARCHIVE_FILE set to always
+ WHEN:
+ - Document is parsed
+ THEN:
+ - Text from images is extracted
+ - No archive file is created
+ """
+ parser = RasterisedDocumentParser(None)
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+ "application/pdf",
+ )
+ self.assertIsNone(parser.archive_path)  # "always": no archive is expected even for image-only input
+ self.assertContainsStrings(
+ parser.get_text().lower(),  # text must still be available although no archive was produced
+ ["page 1", "page 2", "page 3"],
+ )
+
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
"""