Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

author Brandon Rothweiler <brandonrothweiler@gmail.com>

Fri, 24 Feb 2023 03:42:57 +0000 (22:42 -0500)

committer Brandon Rothweiler <brandonrothweiler@gmail.com>

Fri, 24 Feb 2023 03:42:57 +0000 (22:42 -0500)
author Brandon Rothweiler <brandonrothweiler@gmail.com>
Fri, 24 Feb 2023 03:42:57 +0000 (22:42 -0500)
committer Brandon Rothweiler <brandonrothweiler@gmail.com>
Fri, 24 Feb 2023 03:42:57 +0000 (22:42 -0500)
diff --git a/docs/configuration.md b/docs/configuration.md

index 6c233c2e60c498cbcc7858e13da929bf57415cda..d3b391f1a32c58de21e994e85bf0eeb391740400 100644 (file)
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -415,12 +415,6 @@ modes are available:
      -   `skip`: Paperless skips all pages and will perform ocr only on
          pages where no text is present. This is the safest option.
  
-    -   `skip_noarchive`: In addition to skip, paperless won't create
-        an archived version of your documents when it finds any text in
-        them. This is useful if you don't want to have two
-        almost-identical versions of your digital documents in the media
-        folder. This is the fastest option.
-
      -   `redo`: Paperless will OCR all pages of your documents and
          attempt to replace any existing text layers with new text. This
          will be useful for documents from scanners that already
@@ -443,6 +437,19 @@ modes are available:
      Read more about this in the [OCRmyPDF
      documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
  
+`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`
+
+: Specify when you would like paperless to skip creating an archived
+version of your documents. This is useful if you don't want to have two
+almost-identical versions of your documents in the media folder.
+
+    -   `never`: Never skip creating an archived version.
+    -   `with_text`: Skip creating an archived version for documents
+        that already have embedded text.
+    -   `always`: Always skip creating an archived version.
+
+    The default is `never`.
+
  `PAPERLESS_OCR_CLEAN=<mode>`
  
  : Tells paperless to use `unpaper` to clean any input document before
diff --git a/docs/setup.md b/docs/setup.md

index 7eaaf69f739e5df4deae8c51a12c85133b5f8f41..425448ff668682a345baa6a19ffcfc1a06046733 100644 (file)
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -818,9 +818,10 @@ performance immensely:
    other tasks).
  - Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider
    OCR'ing your documents before feeding them into paperless. Some
-  scanners are able to do this! You might want to even specify
-  `skip_noarchive` to skip archive file generation for already ocr'ed
-  documents entirely.
+  scanners are able to do this!
+- Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive
+  file generation for documents that already contain embedded text, or
+  `always` to skip it for all documents.
  - If you want to perform OCR on the device, consider using
    `PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
    less memory at the expense of slightly worse OCR results.
diff --git a/docs/usage.md b/docs/usage.md

index e162e6e3a9e1f5c5486239439958124f1f5390aa..14adef26bdbd52ac52ae1d5007b03f152d170547 100644 (file)
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -60,8 +60,8 @@ following operations on your documents:
  
      This process can be configured to fit your needs. If you don't want
      paperless to create archived versions for digital documents, you can
-    configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`.
-    Please read the
+    configure that by setting
+    `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
      [relevant section in the documentation](/configuration#ocr).
  
  !!! note
diff --git a/paperless.conf.example b/paperless.conf.example

index 524d9264ea03a5d378767031782c09f36adc45b1..6bd70697e35397d91c866b927f1230ec6b3e037d 100644 (file)
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -42,6 +42,7 @@
  
  #PAPERLESS_OCR_LANGUAGE=eng
  #PAPERLESS_OCR_MODE=skip
+#PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never
  #PAPERLESS_OCR_OUTPUT_TYPE=pdfa
  #PAPERLESS_OCR_PAGES=1
  #PAPERLESS_OCR_IMAGE_DPI=300
diff --git a/src/paperless/checks.py b/src/paperless/checks.py

index 845ff2d0bada8f33e46801d8f3f702623d35c738..8988798a08bc4806514ac079ffe20243e2c7e1bb 100644 (file)
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -130,6 +130,23 @@ def settings_values_check(app_configs, **kwargs):
          if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
              msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
  
+        if settings.OCR_MODE == "skip_noarchive":
+            msgs.append(
+                Warning(
+                    'OCR output mode "skip_noarchive" is deprecated and will be '
+                    "removed in a future version. Please use "
+                    "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
+                ),
+            )
+
+        if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
+            msgs.append(
+                Error(
+                    "OCR_SKIP_ARCHIVE_FILE setting "
+                    f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
+                ),
+            )
+
          if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
              msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
          return msgs
diff --git a/src/paperless/settings.py b/src/paperless/settings.py

index 41f08f3e2562a89b92e5b53c91f8f7a502dcda79..44e843a9c8c93383d0b6e9941167e4fa8f6e1ae7 100644 (file)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
  # skip. redo, force
  OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
  
+OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
+
  OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
  
  OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index 4227583f8de086a6b3e361ffdd38cb4a6ec8a1fc..bbb25feb95debfc86639d4de7099485403a80ee3 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):
  
          # If the original has text, and the user doesn't want an archive,
          # we're done here
-        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
+        skip_archive_for_text = (
+            settings.OCR_MODE == "skip_noarchive"
+            or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
+        )
+        if skip_archive_for_text and original_has_text:
              self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
              self.text = text_original
              return
@@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser):
              self.log("debug", f"Calling OCRmyPDF with args: {args}")
              ocrmypdf.ocr(**args)
  
-            self.archive_path = archive_path
+            if settings.OCR_SKIP_ARCHIVE_FILE != "always":
+                self.archive_path = archive_path
  
              self.text = self.extract_text(sidecar_file, archive_path)
  
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py

index 94b72a0ee80ff4aed875aa9c025c7d7c26b29d79..5cbbc4d5523ada2117c8f37796fecd00ced4bdd7 100644 (file)
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
              ["page 1", "page 2", "page 3"],
          )
  
-    @override_settings(OOCR_MODE="skip")
+    @override_settings(OCR_MODE="skip")
      def test_multi_page_analog_pages_skip(self):
          parser = RasterisedDocumentParser(None)
          parser.parse(
@@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
  
          self.assertIsNotNone(parser.archive_path)
  
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
+    def test_skip_archive_never_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to never
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
+    def test_skip_archive_never_withimages(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to never
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
+    def test_skip_archive_withtext_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to with_text
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
+    def test_skip_archive_withtext_withimages(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to with_text
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
+    def test_skip_archive_always_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to always
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
+    def test_skip_archive_always_withimages(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to always
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
      @override_settings(OCR_MODE="skip")
      def test_multi_page_mixed(self):
          """
author	Brandon Rothweiler <brandonrothweiler@gmail.com>
	Fri, 24 Feb 2023 03:42:57 +0000 (22:42 -0500)
committer	Brandon Rothweiler <brandonrothweiler@gmail.com>
	Fri, 24 Feb 2023 03:42:57 +0000 (22:42 -0500)
docs/configuration.md		patch \| blob \| blame \| history
docs/setup.md		patch \| blob \| blame \| history
docs/usage.md		patch \| blob \| blame \| history
paperless.conf.example		patch \| blob \| blame \| history
src/paperless/checks.py		patch \| blob \| blame \| history
src/paperless/settings.py		patch \| blob \| blame \| history
src/paperless_tesseract/parsers.py		patch \| blob \| blame \| history
src/paperless_tesseract/tests/test_parser.py		patch \| blob \| blame \| history