Revert "Merge pull request #2732 from bdr99/skip_neverarchive"

author Brandon Rothweiler <brandonrothweiler@gmail.com>

Fri, 24 Feb 2023 02:26:53 +0000 (21:26 -0500)

committer Brandon Rothweiler <brandonrothweiler@gmail.com>

Fri, 24 Feb 2023 02:26:53 +0000 (21:26 -0500)
author Brandon Rothweiler <brandonrothweiler@gmail.com>
Fri, 24 Feb 2023 02:26:53 +0000 (21:26 -0500)
committer Brandon Rothweiler <brandonrothweiler@gmail.com>
Fri, 24 Feb 2023 02:26:53 +0000 (21:26 -0500)
diff --git a/docs/configuration.md b/docs/configuration.md

index 5cf0022f307a17cf7ea6569260e97f93d7440a5e..6c233c2e60c498cbcc7858e13da929bf57415cda 100644 (file)
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -419,10 +419,7 @@ modes are available:
          an archived version of your documents when it finds any text in
          them. This is useful if you don't want to have two
          almost-identical versions of your digital documents in the media
-        folder.
-
-    -   `skip_neverarchive`: In addition to skip, paperless will never
-        create an archive version of your documents. This is the fastest option.
+        folder. This is the fastest option.
  
      -   `redo`: Paperless will OCR all pages of your documents and
          attempt to replace any existing text layers with new text. This
diff --git a/src/paperless/checks.py b/src/paperless/checks.py

index 53972bc21f032faede87ab0ca16477d5e8a29dbb..845ff2d0bada8f33e46801d8f3f702623d35c738 100644 (file)
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -127,13 +127,7 @@ def settings_values_check(app_configs, **kwargs):
                  Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
              )
  
-        if settings.OCR_MODE not in {
-            "force",
-            "skip",
-            "redo",
-            "skip_noarchive",
-            "skip_neverarchive",
-        }:
+        if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
              msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
  
          if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index 3a91e33901d888fe48d4f606e3767c74e674ab59..4227583f8de086a6b3e361ffdd38cb4a6ec8a1fc 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser):
  
          if settings.OCR_MODE == "force" or safe_fallback:
              ocrmypdf_args["force_ocr"] = True
-        elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
+        elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
              ocrmypdf_args["skip_text"] = True
          elif settings.OCR_MODE == "redo":
              ocrmypdf_args["redo_ocr"] = True
@@ -294,10 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
  
          # If the original has text, and the user doesn't want an archive,
          # we're done here
-        if (
-            settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
-            and original_has_text
-        ):
+        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
              self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
              self.text = text_original
              return
@@ -323,9 +320,7 @@ class RasterisedDocumentParser(DocumentParser):
              self.log("debug", f"Calling OCRmyPDF with args: {args}")
              ocrmypdf.ocr(**args)
  
-            # Only create archive file if archiving isn't being skipped
-            if settings.OCR_MODE != "skip_neverarchive":
-                self.archive_path = archive_path
+            self.archive_path = archive_path
  
              self.text = self.extract_text(sidecar_file, archive_path)
  
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py

index de0c3ce380ac9c1445e154ea67eb9a8043043c82..94b72a0ee80ff4aed875aa9c025c7d7c26b29d79 100644 (file)
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -438,52 +438,6 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
  
          self.assertIsNotNone(parser.archive_path)
  
-    @override_settings(OCR_MODE="skip_neverarchive")
-    def test_skip_neverarchive_withtext(self):
-        """
-        GIVEN:
-            - File with existing text layer
-            - OCR mode set to skip_neverarchive
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - No archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
-    @override_settings(OCR_MODE="skip_neverarchive")
-    def test_skip_neverarchive_notext(self):
-        """
-        GIVEN:
-            - File with text contained in images but no text layer
-            - OCR mode set to skip_neverarchive
-        WHEN:
-            - Document is parsed
-        THEN:
-            - Text from images is extracted
-            - No archive file is created
-        """
-        parser = RasterisedDocumentParser(None)
-        parser.parse(
-            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
-            "application/pdf",
-        )
-        self.assertIsNone(parser.archive_path)
-        self.assertContainsStrings(
-            parser.get_text().lower(),
-            ["page 1", "page 2", "page 3"],
-        )
-
      @override_settings(OCR_MODE="skip")
      def test_multi_page_mixed(self):
          """
author	Brandon Rothweiler <brandonrothweiler@gmail.com>
	Fri, 24 Feb 2023 02:26:53 +0000 (21:26 -0500)
committer	Brandon Rothweiler <brandonrothweiler@gmail.com>
	Fri, 24 Feb 2023 02:26:53 +0000 (21:26 -0500)
docs/configuration.md		patch | blob | blame | history
src/paperless/checks.py		patch | blob | blame | history
src/paperless_tesseract/parsers.py		patch | blob | blame | history
src/paperless_tesseract/tests/test_parser.py		patch | blob | blame | history